'''
Content-based movie recommendations.

Movies are loaded from MongoDB into a pandas DataFrame, filtered down to the
most-voted ones, and ranked by the cosine similarity of the TF-IDF vectors of
their overviews.
'''
import os

from pymongo import MongoClient
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt

# SECURITY: this URI embeds a username and password that are committed to
# source control. It is kept only so existing behaviour does not change;
# rotate the credentials and supply the URI through the MONGODB_URI
# environment variable (or the `uri` argument of dbToDf) instead.
_LEGACY_MONGO_URI = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"


def dbToDf(uri=None):
    '''
    Load the `movies_populated` collection of the `group3` database into a
    pandas DataFrame.

    uri: MongoDB connection string. When omitted, the MONGODB_URI environment
         variable is used, falling back to the legacy built-in URI.

    Returns a DataFrame with one row per document of the collection.
    '''
    if uri is None:
        uri = os.environ.get("MONGODB_URI", _LEGACY_MONGO_URI)

    client = MongoClient(uri)
    try:
        # Materialise the cursor before closing the connection.
        documents = list(client.group3.movies_populated.find())
    finally:
        client.close()

    return pd.DataFrame(documents)


def preFiltering(df, percent=15):
    '''
    Keep only the movies that have enough votes to be evaluated.

    Rows without a `vote_count` are dropped, then only rows whose
    `vote_count` is strictly greater than the (100 - percent)-th percentile
    are kept.

    df: DataFrame with a `vote_count` column.
    percent: share (0-100) of the most-voted movies used as the cut-off.

    Returns a new DataFrame; the input is not modified.
    '''
    voted = df[df['vote_count'].notna()]
    if voted.empty:
        # No percentile can be computed on an empty column.
        return voted.copy()

    min_votes = np.percentile(voted['vote_count'].values, 100 - percent)

    # Filter first, then copy: only the surviving rows are duplicated.
    return voted.loc[voted['vote_count'] > min_votes].copy()


def process_text(text):
    '''
    Normalise a text before computing its TF-IDF vector: runs of whitespace
    are collapsed to a single space and the text is lowercased.
    '''
    return ' '.join(text.split()).lower()


def similarity(df, index=None):
    '''
    Compute the cosine similarity between the TF-IDF vectors of the movie
    overviews (English stop words removed).

    df: DataFrame with an `overview` column containing only strings.
    index: optional row position. When given, only the similarities of that
           row against every row are computed, which avoids building the
           full n x n matrix.

    Returns the n x n similarity matrix when `index` is None, otherwise a
    1-D array of length n.
    '''
    tf_idf = TfidfVectorizer(stop_words='english')
    tf_idf_matrix = tf_idf.fit_transform(df['overview'])

    if index is None:
        return cosine_similarity(tf_idf_matrix, tf_idf_matrix)

    position = int(index)
    # A one-row slice keeps the operand two-dimensional.
    row = tf_idf_matrix[position:position + 1]
    return cosine_similarity(row, tf_idf_matrix)[0]


def index_from_title(df, title):
    '''
    Return the index label of the first movie whose `original_title` equals
    `title`.

    Raises IndexError when no movie has that title.
    '''
    matches = df.index.values[(df['original_title'] == title).values]
    if len(matches) == 0:
        raise IndexError(f"no movie with original_title {title!r}")
    return matches[0]


def title_from_index(df, index):
    '''
    Return the `original_title` of the first movie whose index label equals
    `index`.

    Raises IndexError when no row has that index label.
    '''
    titles = df['original_title'].values[df.index == index]
    if len(titles) == 0:
        raise IndexError(f"no movie with index {index!r}")
    return titles[0]


def recommendations_on_overview(original_title, df, number_of_recommendations):
    '''
    Recommend the movies whose overview is the most similar to the overview
    of `original_title`.

    original_title: title of the reference movie; it must survive the vote
                    pre-filtering and have an overview.
    df: DataFrame with `original_title`, `overview` and `vote_count` columns.
    number_of_recommendations: maximum number of titles to return; negative
                               values are treated as 0.

    Returns a Series of titles ordered from most to least similar.
    Raises IndexError when the reference movie is not among the candidates.
    '''
    # Prefilter the dataframe.
    df = preFiltering(df)

    # Remove rows with a missing overview. reset_index returns a new frame
    # whose labels equal row positions, so the label found below can be used
    # directly as a position in the similarity scores.
    df = df[df['overview'].notna()].reset_index(drop=True)

    # Process the text of all overviews.
    df['overview'] = df['overview'].map(process_text)

    index = index_from_title(df, original_title)

    # Similarity of the reference movie against every candidate.
    scores = similarity(df, index)

    # Descending score; the stable sort keeps ties in ascending row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the reference movie explicitly: it is not guaranteed to rank first
    # (another movie may have an identical overview, or its own overview may
    # contain only stop words).
    ranked = ranked[ranked != index]

    count = max(int(number_of_recommendations), 0)

    return df['original_title'].iloc[ranked[:count]]


if __name__ == "__main__":
    # Run the demo only when executed as a script, so that importing this
    # module does not query the database.
    df = dbToDf()

    print(recommendations_on_overview('Batman', df, 9))