"""Content-based movie recommendations stored per user in MongoDB.

Each movie is turned into a weighted bag-of-words "features" string (genres,
keywords, main actor, director, release date).  A user's profile is the mean
of the feature vectors of the movies they liked; the movies whose vectors are
most cosine-similar to that profile are written to the ``recommendations``
collection.
"""
# NOTE(review): reconstructed from the `algo/adreco.py` hunk of a multi-file
# patch; the partial hunks for algo/recommendation.py and
# algo/search_engine.py are not reproduced here.

import functools
import math
import os

import numpy as np
import pandas as pd

### Parameters ###

# Number of times each feature's text is repeated in the features string, so
# that it weighs proportionally more in the count vector.  Set them all to 1
# for an unweighted baseline.
w_genres = 10
w_keywords = 17
w_actor = 15
w_director = 15
w_release_date = 8

# SECURITY(review): this credential is committed in source.  It is kept only
# as a backward-compatible fallback; set ADRECO_MONGODB_URI and rotate it.
_DEFAULT_MONGODB_URI = (
    "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"
)


@functools.lru_cache(maxsize=1)
def _get_database():
    """Return the ``group3`` database, creating a single shared client."""
    # Imported here so the pure helpers stay importable without pymongo.
    from pymongo import MongoClient

    uri = os.environ.get("ADRECO_MONGODB_URI", _DEFAULT_MONGODB_URI)
    return MongoClient(uri).group3


def _is_missing(value):
    """Return True for ``None`` and float NaN (how pandas fills absent fields)."""
    return value is None or (isinstance(value, float) and math.isnan(value))


def _first_position(df, column, value):
    """Return the 0-based row position of the first row with df[column] == value.

    Raises IndexError when no row matches.
    """
    hits = np.flatnonzero((df[column] == value).to_numpy())
    if hits.size == 0:
        raise IndexError(f"no row with {column} == {value!r}")
    return int(hits[0])


def _add_features(df):
    """Add (in place) the normalised ``features`` column used for vectorising."""
    df['features'] = df.apply(
        lambda row: process_text(formatingFeatures(row)), axis=1
    )
    return df


def movieDbToDf():
    """Load the ``movies_populated`` collection into a pandas DataFrame."""
    collection = _get_database().movies_populated

    # projection on useful data
    projection = {
        "_id": 1, "id": 1, "original_title": 1, "genre_ids": 1, "overview": 1,
        "vote_count": 1, "release_date": 1, "main_actor": 1, "director": 1,
        "keywords": 1,
    }
    return pd.DataFrame(list(collection.find({}, projection)))


def preFiltering(df, percent=90):
    """Keep only the movies with enough votes to be evaluated.

    Rows without a ``vote_count`` are dropped, then only rows whose
    ``vote_count`` is strictly above the ``100 - percent`` percentile are
    kept.  Returns a new DataFrame (an empty one if no row has a vote count).
    """
    rated = df[df['vote_count'].notna()]
    if rated.empty:
        # np.percentile is undefined on an empty array.
        return rated.copy(deep=True)

    min_votes = np.percentile(rated['vote_count'].to_numpy(), 100 - percent)
    return rated.loc[rated['vote_count'] > min_votes].copy(deep=True)


def process_text(text):
    """Normalise a text: collapse runs of whitespace and lowercase it."""
    return ' '.join(text.split()).lower()


def dfToVectMatrix(df):
    """Return the token-count matrix of the ``features`` column of *df*."""
    from sklearn.feature_extraction.text import CountVectorizer

    return CountVectorizer(stop_words='english').fit_transform(df['features'])


def similarity(df):
    """Return the pairwise cosine-similarity matrix between all movies of *df*."""
    from sklearn.metrics.pairwise import cosine_similarity

    vect_matrix = dfToVectMatrix(df)
    return cosine_similarity(vect_matrix, vect_matrix)


def index_from_title(df, title):
    """Return the index label of the first movie titled *title* (IndexError if none)."""
    return df.index[(df['original_title'] == title).to_numpy()][0]


def title_from_index(df, index):
    """Return the title of the movie whose index label is *index*."""
    return df.loc[df.index == index, 'original_title'].values[0]


def id_from_index(df, index):
    """Return the ``_id`` of the movie whose index label is *index*."""
    return df.loc[df.index == index, '_id'].values[0]


def index_from_id(df, id):
    """Return the index label of the first movie whose ``_id`` is *id*."""
    return df.index[(df['_id'] == id).to_numpy()][0]


def recommendations(original_title, df, number_of_recommendations):
    """Return the titles of the movies most similar to *original_title*.

    Adds/overwrites the ``features`` column of *df* in place.  The result is a
    Series of at most *number_of_recommendations* titles, most similar first,
    never containing the query movie itself.  Raises IndexError if no movie
    has that title.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    _add_features(df)

    # Row *position*, not index label: the vector matrix is positional, and
    # the two differ as soon as df has been filtered.
    position = _first_position(df, 'original_title', original_title)

    vect_matrix = dfToVectMatrix(df)
    # Only the query row is needed, not the full N x N matrix.
    scores = cosine_similarity(vect_matrix[position], vect_matrix)[0]

    # Stable descending sort; drop the query movie explicitly instead of
    # assuming it ranks first (ties or an empty feature row break that).
    ranked = [
        i for i in np.argsort(-scores, kind="stable").tolist() if i != position
    ]
    return df['original_title'].iloc[ranked[:number_of_recommendations]]


def formatingFeatures(df_row):
    """Build the weighted features string of one movie row.

    Genres and keywords are space-joined, then every feature is repeated
    according to its ``w_*`` weight.  Missing values (None / NaN) contribute
    an empty string rather than raising or adding a spurious "nan" token.
    """
    def joined(values):
        return '' if _is_missing(values) else ' '.join(str(v) for v in values)

    def scalar(value):
        return '' if _is_missing(value) else str(value)

    weighted = (
        (joined(df_row['genre_ids']), w_genres),
        (joined(df_row['keywords']), w_keywords),
        (scalar(df_row['main_actor']), w_actor),
        (scalar(df_row['director']), w_director),
        (scalar(df_row['release_date']), w_release_date),
    )
    return ' '.join(' '.join([text] * weight) for text, weight in weighted)


def userDbToDf():
    """Load the ``users`` collection into a pandas DataFrame."""
    collection = _get_database().users

    # projection on useful data
    cursor = collection.find({}, {"_id": 1, "liked_movies": 1, "update": 1})
    return pd.DataFrame(list(cursor))


def user_profile(user_index, moviesdf, usersdf, vectMatrix,
                 number_of_recommendations=100):
    """Return the row positions of the movies recommended to one user.

    The user's profile is the mean of the vectors of their liked movies;
    movies are ranked by cosine similarity to it.

    user_index -- row position of the user in *usersdf*
    moviesdf -- movies DataFrame whose rows align with *vectMatrix*
    usersdf -- users DataFrame with a ``liked_movies`` column of movie ``_id``s
    vectMatrix -- count matrix with one row per movie
    number_of_recommendations -- maximum number of positions returned

    With no liked movie, the first movies of the catalogue are returned.
    Raises IndexError if a liked id is not in *moviesdf*.
    """
    liked_ids = usersdf['liked_movies'].iloc[user_index]
    if _is_missing(liked_ids):
        liked_ids = []
    positions = [_first_position(moviesdf, '_id', movie_id) for movie_id in liked_ids]

    if not positions:
        # Clamp so the caller's .iloc never goes past the catalogue size.
        return list(range(min(number_of_recommendations, vectMatrix.shape[0])))

    from sklearn.metrics.pairwise import cosine_similarity

    profile = vectMatrix[positions[0]]
    for position in positions[1:]:
        profile = profile + vectMatrix[position]
    profile = profile / len(positions)

    scores = cosine_similarity(profile, vectMatrix)[0]
    ranked = np.argsort(-scores, kind="stable").tolist()

    # NOTE(review): the best match is skipped, as in the movie-to-movie
    # ranking where it is the query itself; here it is presumably one of the
    # liked movies, and the other liked movies are not excluded -- confirm
    # the intended behaviour.
    return ranked[1:number_of_recommendations + 1]


def loadRecDB():
    """Return the ``recommendations`` collection."""
    return _get_database()['recommendations']


def updateDB():
    """Recompute and upsert the recommended movie ids of every user."""
    moviesdf = movieDbToDf()
    usersdf = userDbToDf()
    recdb = loadRecDB()

    _add_features(moviesdf)
    vect_matrix = dfToVectMatrix(moviesdf)

    for position in range(len(usersdf)):
        user_id = usersdf['_id'].iloc[position]
        rec_positions = user_profile(position, moviesdf, usersdf, vect_matrix)
        record = {
            "user_id": user_id,
            "recommended_movies": [
                int(movie_id) for movie_id in moviesdf['id'].iloc[rec_positions]
            ],
        }
        recdb.update_one({"user_id": user_id}, [{"$set": record}], upsert=True)


if __name__ == "__main__":
    updateDB()