Skip to content
Snippets Groups Projects
Commit 539ae24b authored by Tom Bray
Browse files

primary recommendation system done

parent e1602073
Branches
No related tags found
1 merge request: !12 Search algo
Pipeline #42525 passed
from pymongo import MongoClient from pymongo import MongoClient
import pandas as pd import pandas as pd
#import ast import ast
from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import cosine_similarity
#import seaborn as sns import seaborn as sns
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
...@@ -16,9 +16,8 @@ def dbToDf(): ...@@ -16,9 +16,8 @@ def dbToDf():
client = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true") client = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
db = client.group3 db = client.group3
collection = db.movies_populated collection = db.movies_populated
cursor = collection.find() cursor = collection.find({},{"_id":1, "original_title": 1, "genre": 1, "id":1, "overview":1, "popularity":1, "vote_count":1, "release_date":1, "cast": {"name":1, "order":1}})
df=pd.DataFrame(list(cursor)) df=pd.DataFrame(list(cursor))
return df return df
def preFiltering(df,percent=15): def preFiltering(df,percent=15):
...@@ -49,7 +48,7 @@ def similarity(df): ...@@ -49,7 +48,7 @@ def similarity(df):
''' '''
tf_idf = TfidfVectorizer(stop_words='english') tf_idf = TfidfVectorizer(stop_words='english')
tf_idf_matrix = tf_idf.fit_transform(df['overview']); tf_idf_matrix = tf_idf.fit_transform(df['overview']);
print(tf_idf_matrix)
# calculating cosine similarity between movies # calculating cosine similarity between movies
cosine_similarity_matrix = cosine_similarity(tf_idf_matrix, tf_idf_matrix) cosine_similarity_matrix = cosine_similarity(tf_idf_matrix, tf_idf_matrix)
...@@ -83,7 +82,6 @@ def recommendations_on_overview( original_title, df, number_of_recommendations): ...@@ -83,7 +82,6 @@ def recommendations_on_overview( original_title, df, number_of_recommendations):
#calculates similarity scores of all movies #calculates similarity scores of all movies
calculated_sim = similarity(df) calculated_sim = similarity(df)
similarity_scores = list(enumerate(calculated_sim[index])) similarity_scores = list(enumerate(calculated_sim[index]))
similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True) similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
...@@ -94,4 +92,4 @@ def recommendations_on_overview( original_title, df, number_of_recommendations): ...@@ -94,4 +92,4 @@ def recommendations_on_overview( original_title, df, number_of_recommendations):
df = dbToDf() df = dbToDf()
print(recommendations_on_overview( 'Batman', df, 9)) print(recommendations_on_overview('Avatar', df, 9))
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment