import os

import numpy as np
import pandas as pd
import seaborn as sns
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Parameters ###
# Number of times each feature is repeated in the bag-of-words text, which is
# how a feature gets more weight in the count vectors. Set every weight to 1
# for an unweighted comparison.
w_genres = 10
w_keywords = 17
w_actor = 15
w_director = 15
w_release_date = 8

# NOTE(review): credentials are embedded in the default URI. Prefer supplying
# the connection string through the MONGO_URI environment variable and
# rotating the password that is committed here.
MONGO_URI = os.environ.get(
    "MONGO_URI",
    "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true",
)


def movieDbToDf():
    """Load the `movies_populated` collection into a pandas DataFrame.

    Only the fields used by the recommender are projected. The client is
    closed once the cursor has been fully read.
    """
    projection = {
        "_id": 1,
        "id": 1,
        "original_title": 1,
        "genre_ids": 1,
        "overview": 1,
        "vote_count": 1,
        "release_date": 1,
        "main_actor": 1,
        "director": 1,
        "keywords": 1,
    }
    with MongoClient(MONGO_URI) as client:
        cursor = client.group3.movies_populated.find({}, projection)
        return pd.DataFrame(list(cursor))


def preFiltering(df, percent=90):
    """Drop movies that do not have enough votes to be evaluated.

    Rows with a missing `vote_count` are removed, then only rows whose
    `vote_count` is strictly greater than the (100 - percent)-th percentile
    are kept. A new DataFrame is returned; `df` is left untouched.
    """
    voted = df[df['vote_count'].notna()]
    if voted.empty:
        # np.percentile is undefined on an empty array.
        return voted.copy(deep=True)
    min_votes = np.percentile(voted['vote_count'].values, 100 - percent)
    return voted.loc[voted['vote_count'] > min_votes].copy(deep=True)


def process_text(text):
    """Normalise a text before vectorisation.

    Runs of whitespace are collapsed to a single space and the text is
    lower-cased.
    """
    return ' '.join(text.split()).lower()


def dfToVectMatrix(df):
    """Return the token-count matrix of the `features` column of `df`."""
    vect = CountVectorizer(stop_words='english')
    return vect.fit_transform(df['features'])


def similarity(df):
    """Return the pairwise cosine-similarity matrix between all movies."""
    vect_matrix = dfToVectMatrix(df)
    return cosine_similarity(vect_matrix, vect_matrix)


def index_from_title(df, title):
    """Return the index label of the first movie titled `title`.

    Raises IndexError when no movie has that title.
    """
    return df[df['original_title'] == title].index.values[0]


def title_from_index(df, index):
    """Return the title of the movie whose index label is `index`.

    Raises IndexError when the label is absent.
    """
    return df.loc[df.index == index, 'original_title'].values[0]


def id_from_index(df, index):
    """Return the `_id` of the movie whose index label is `index`.

    Raises IndexError when the label is absent.
    """
    return df.loc[df.index == index, '_id'].values[0]


def index_from_id(df, id):
    """Return the index label of the first movie whose `_id` equals `id`.

    Raises IndexError when no movie has that id.
    """
    return df[df['_id'] == id].index.values[0]


def _first_position(column, value):
    """Return the 0-based row position of the first entry equal to `value`.

    Positions (not index labels) are what the rows of the vector matrix are
    aligned with. Raises IndexError when `value` is absent.
    """
    return int(np.flatnonzero((column == value).to_numpy())[0])


def _add_features(df):
    """Build the normalised `features` text column on `df`, in place."""
    df['features'] = df.apply(formatingFeatures, axis=1).map(process_text)


def recommendations(original_title, df, number_of_recommendations):
    """Return the titles of the movies most similar to `original_title`.

    A `features` column is added to `df` in place. The queried movie itself
    is excluded from the result. Raises IndexError when the title is unknown.
    """
    position = _first_position(df['original_title'], original_title)

    _add_features(df)
    vect_matrix = dfToVectMatrix(df)

    # Only the row of the queried movie is needed, not the full N x N matrix.
    scores = cosine_similarity(vect_matrix[position], vect_matrix)[0]

    # Stable sort on the negated scores: descending similarity, ties kept in
    # row order.
    ranked = np.argsort(-scores, kind="stable").tolist()
    wanted = max(number_of_recommendations, 0)
    picks = [p for p in ranked if p != position][:wanted]
    return df['original_title'].iloc[picks]


def _join_as_tokens(values):
    """Join the items of a list-like field into one space-separated string.

    A missing field (None, or the float NaN pandas uses for absent values)
    yields an empty string.
    """
    if values is None or isinstance(values, float):
        return ''
    return ' '.join(str(value) for value in values)


def formatingFeatures(df_row):
    """Build the weighted bag-of-words text describing one movie row.

    Each feature is repeated as many times as its module-level weight
    (`w_genres`, `w_keywords`, `w_actor`, `w_director`, `w_release_date`).
    """
    weighted_parts = [
        (_join_as_tokens(df_row['genre_ids']), w_genres),
        (_join_as_tokens(df_row['keywords']), w_keywords),
        (str(df_row['main_actor']), w_actor),
        (str(df_row['director']), w_director),
        (str(df_row['release_date']), w_release_date),
    ]
    return ' '.join(' '.join([text] * weight) for text, weight in weighted_parts)


def userDbToDf():
    """Load the `users` collection into a pandas DataFrame.

    Only `_id`, `liked_movies` and `update` are projected. The client is
    closed once the cursor has been fully read.
    """
    projection = {"_id": 1, "liked_movies": 1, "update": 1}
    with MongoClient(MONGO_URI) as client:
        cursor = client.group3.users.find({}, projection)
        return pd.DataFrame(list(cursor))


def user_profile(user_index, moviesdf, usersdf, vectMatrix,
                 number_of_recommendations=100):
    """Return the row positions of the movies recommended to one user.

    The user profile is the mean of the vectors of the movies the user liked;
    movies are ranked by cosine similarity to that profile. Movies the user
    already liked are excluded from the result.

    Args:
        user_index: positional row of the user in `usersdf`.
        moviesdf: movies DataFrame whose rows are aligned with `vectMatrix`.
        usersdf: users DataFrame with a `liked_movies` column.
        vectMatrix: one feature vector per row of `moviesdf`.
        number_of_recommendations: maximum number of positions returned.

    Returns:
        A list of row positions into `moviesdf`, best match first, or None
        when the user has no liked movies.

    Raises:
        IndexError: when a liked movie id is not present in `moviesdf`.
    """
    liked_ids = usersdf['liked_movies'].iloc[user_index]
    # Users without the field come back from pandas as NaN (a float).
    if liked_ids is None or isinstance(liked_ids, float) or len(liked_ids) == 0:
        return None

    liked_positions = [
        _first_position(moviesdf['_id'], movie_id) for movie_id in liked_ids
    ]

    # mean() over sparse rows yields an np.matrix, which scikit-learn rejects;
    # normalise to a (1, n_features) ndarray for sparse and dense input alike.
    profile = np.asarray(vectMatrix[liked_positions].mean(axis=0)).reshape(1, -1)

    scores = cosine_similarity(profile, vectMatrix)[0]
    ranked = np.argsort(-scores, kind="stable").tolist()

    already_liked = set(liked_positions)
    wanted = max(number_of_recommendations, 0)
    return [p for p in ranked if p not in already_liked][:wanted]


def loadRecDB():
    """Return a handle on the `recommendations` collection.

    The underlying client is intentionally left open: the returned collection
    needs it for as long as the caller keeps using it.
    """
    client = MongoClient(MONGO_URI)
    return client.group3['recommendations']


def updateDB():
    """Recompute and upsert the recommended movies of every user.

    One document per user is written to the `recommendations` collection,
    keyed by `user_id`; users with no liked movies get an empty list.
    """
    moviesdf = movieDbToDf()
    usersdf = userDbToDf()
    if moviesdf.empty or usersdf.empty:
        # Nothing to vectorise or nobody to recommend to.
        return
    recdb = loadRecDB()

    _add_features(moviesdf)
    vect_matrix = dfToVectMatrix(moviesdf)

    for position, user_id in enumerate(usersdf['_id']):
        rec_positions = user_profile(position, moviesdf, usersdf, vect_matrix)
        if rec_positions is None:
            recommended_movies = []
        else:
            recommended_movies = [
                int(movie_id) for movie_id in moviesdf['id'].iloc[rec_positions]
            ]

        document = {"user_id": user_id, "recommended_movies": recommended_movies}
        recdb.update_one({"user_id": user_id}, [{"$set": document}], upsert=True)


# NOTE(review): this runs on import as well as on direct execution; consider
# guarding it with `if __name__ == "__main__":` once no importer relies on it.
updateDB()