Tom Bray · c715efef
--- a/algo/adreco.py 0 → 100644

+ 246

− 0

View file @ c715efef

Open in Web IDE
+++ b/algo/adreco.py 0 → 100644

+ 246

− 0

View file @ c715efef

Open in Web IDE
import os

import numpy as np
import pandas as pd
import seaborn as sns
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
+
### Parameters ###

# Feature weights: formatingFeatures repeats the text of each feature this
# many times inside the combined "features" string, so a larger weight makes
# that feature count for more in the term-count vectors that are compared.
# Setting every weight to 1 gives an unweighted baseline.
w_genres = 10
w_keywords = 17
w_actor = 15
w_director = 15
w_release_date = 8
+
+
def movieDbToDf():
    '''
    Load the movie collection from MongoDB into a pandas DataFrame.

    Only the fields used by the recommender are fetched: _id, id,
    original_title, genre_ids, overview, vote_count, release_date,
    main_actor, director and keywords.

    The connection string is taken from the ADRECO_MONGO_URI environment
    variable when it is set; otherwise the historical hard-coded URI is used.

    Returns:
        pandas.DataFrame built from the documents of
        ``group3.movies_populated`` (one row per document).
    '''
    # NOTE(review): the fallback URI embeds credentials in source control.
    # Rotate them and drop the fallback once ADRECO_MONGO_URI is configured.
    uri = os.environ.get(
        "ADRECO_MONGO_URI",
        "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true",
    )

    # projection on useful data
    projection = {
        "_id": 1,
        "id": 1,
        "original_title": 1,
        "genre_ids": 1,
        "overview": 1,
        "vote_count": 1,
        "release_date": 1,
        "main_actor": 1,
        "director": 1,
        "keywords": 1,
    }

    # The cursor is fully read inside the "with" block, so the client can be
    # closed on exit instead of leaking a connection on every call.
    with MongoClient(uri) as client:
        cursor = client.group3.movies_populated.find({}, projection)
        return pd.DataFrame(list(cursor))
+
def preFiltering(df,percent=90):
    '''
    Remove the movies that do not have enough votes to be evaluated.

    Rows whose vote_count is missing are dropped first. Among the remaining
    rows, only those whose vote_count is strictly greater than the
    (100 - percent)-th percentile of the vote counts are kept.

    Args:
        df: DataFrame with a 'vote_count' column. It is not modified.
        percent: the cut-off is the (100 - percent)-th percentile (default 90).

    Returns:
        A new DataFrame (deep copy) holding the retained rows. It is empty
        when no row has a vote count.
    '''
    rated = df[df['vote_count'].notna()]
    if rated.empty:
        # np.percentile cannot be evaluated on an empty array.
        return rated.copy(deep=True)

    min_votes = np.percentile(rated['vote_count'].values, 100 - percent)
    return rated.loc[rated['vote_count'] > min_votes].copy(deep=True)
+
def process_text(text):
    '''
    Normalise a text before it is vectorised.

    Every run of whitespace is collapsed to a single space (leading and
    trailing whitespace disappears) and the result is lower-cased.
    '''
    words = text.split()
    return ' '.join(words).lower()
+
def dfToVectMatrix(df):
    """
    Build the term-count matrix of the 'features' column of a dataframe.

    A CountVectorizer configured with English stop words is fitted on the
    column, and the matrix it produces for that same column is returned.
    """
    vectorizer = CountVectorizer(stop_words='english')
    return vectorizer.fit_transform(df['features'])
+
def similarity(df):
    '''
    Compute the pairwise cosine similarity between the movies of df.

    The similarity is based on the term-count vectors of the 'features'
    column, and the full movie-by-movie similarity matrix is returned.
    '''
    counts = dfToVectMatrix(df)
    return cosine_similarity(counts, counts)
+
def index_from_title(df,title):
    '''
    Return the index label of the first movie whose original_title is title.

    An IndexError is raised when no row matches.
    '''
    same_title = df['original_title'] == title
    matches = df.loc[same_title]
    return matches.index.values[0]
+
def title_from_index(df,index):
    '''
    Return the original_title of the first row whose index label is index.

    An IndexError is raised when no row carries that label.
    '''
    rows = df.loc[df.index == index]
    return rows['original_title'].values[0]
+
def id_from_index(df,index):
    '''
    Return the _id of the first row whose index label is index.

    An IndexError is raised when no row carries that label.
    '''
    rows = df.loc[df.index == index]
    return rows['_id'].values[0]
+
def index_from_id(df,id):
    '''
    Return the index label of the first movie whose _id equals id.

    An IndexError is raised when no row matches.
    '''
    same_id = df['_id'] == id
    matches = df.loc[same_id]
    return matches.index.values[0]
+
+
def recommendations(original_title, df, number_of_recommendations):
    '''
    Return the titles of the movies most similar to a given movie.

    A weighted "features" text is built for every row (formatingFeatures
    followed by process_text), turned into term-count vectors, and the rows
    are ranked by cosine similarity with the row of the requested movie.

    Args:
        original_title: title of the reference movie; the first row with
            this original_title is used.
        df: movie DataFrame holding 'original_title' and the columns read by
            formatingFeatures. It is not modified.
        number_of_recommendations: maximum number of titles to return.

    Returns:
        pandas.Series of original_title values, most similar first. The
        reference movie itself is never part of the result.

    Raises:
        IndexError: when no movie of df has the requested title.
    '''
    # Work with positions rather than index labels: rows of the count matrix
    # and iloc are positional, and labels only coincide with positions on a
    # default RangeIndex (e.g. not after preFiltering).
    positions = np.flatnonzero((df['original_title'] == original_title).values)
    if positions.size == 0:
        raise IndexError('no movie titled %r in the dataframe' % (original_title,))
    query = positions[0]

    # Build the features on a copy so the caller's dataframe is left intact.
    features = df.apply(lambda row: process_text(formatingFeatures(row)), axis=1)
    vect_matrix = dfToVectMatrix(df.assign(features=features))

    # Only the similarities of the reference movie are needed, not the full
    # movie-by-movie matrix.
    scores = cosine_similarity(vect_matrix[query], vect_matrix)[0]

    # A stable sort keeps equally similar movies in dataframe order. The
    # reference movie is removed explicitly instead of assuming it ranks first.
    ranking = np.argsort(-scores, kind='stable')
    ranking = ranking[ranking != query][:number_of_recommendations]

    return df['original_title'].iloc[ranking]
+
def formatingFeatures(df_row):
    '''
    Build the weighted "features" text of one movie row.

    The genre ids and the keywords are each joined into one space-separated
    text; main_actor, director and release_date are converted with str().
    Each of the five texts is then repeated according to its weight
    (w_genres, w_keywords, w_actor, w_director, w_release_date) and
    everything is joined with single spaces.

    Args:
        df_row: a row (or mapping) providing 'genre_ids', 'keywords',
            'main_actor', 'director' and 'release_date'.

    Returns:
        The weighted features string.
    '''
    def as_text(values):
        # A non-iterable value (e.g. the NaN/None of a field missing from the
        # document) contributes no tokens instead of raising TypeError.
        try:
            items = list(values)
        except TypeError:
            return ''
        return ' '.join(str(item) for item in items)

    weighted_parts = (
        (as_text(df_row['genre_ids']), w_genres),
        (as_text(df_row['keywords']), w_keywords),
        (str(df_row['main_actor']), w_actor),
        (str(df_row['director']), w_director),
        (str(df_row['release_date']), w_release_date),
    )

    return ' '.join(' '.join([text] * weight) for text, weight in weighted_parts)
+
def userDbToDf():
    '''
    Load the user collection from MongoDB into a pandas DataFrame.

    Only the _id, liked_movies and update fields are fetched.

    The connection string is taken from the ADRECO_MONGO_URI environment
    variable when it is set; otherwise the historical hard-coded URI is used.

    Returns:
        pandas.DataFrame built from the documents of ``group3.users``
        (one row per document).
    '''
    # NOTE(review): the fallback URI embeds credentials in source control.
    # Rotate them and drop the fallback once ADRECO_MONGO_URI is configured.
    uri = os.environ.get(
        "ADRECO_MONGO_URI",
        "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true",
    )

    # projection on useful data
    projection = {"_id": 1, "liked_movies": 1, "update": 1}

    # The cursor is fully read inside the "with" block, so the client can be
    # closed on exit instead of leaking a connection on every call.
    with MongoClient(uri) as client:
        cursor = client.group3.users.find({}, projection)
        return pd.DataFrame(list(cursor))
+
def user_profile(user_index, moviesdf, usersdf, vectMatrix, number_of_recommendations=100):
    """
    Rank the movies for one user from the movies the user liked.

    The user vector is the mean of the rows of vectMatrix that belong to the
    liked movies. All movies are ranked by cosine similarity with that
    vector, and the best ones the user has not already liked are returned.

    Args:
        user_index: position (iloc) of the user in usersdf.
        moviesdf: movie DataFrame with an '_id' column; row k is expected to
            correspond to row k of vectMatrix.
        usersdf: user DataFrame with a 'liked_movies' column.
        vectMatrix: term-count matrix of the movies (sparse or dense).
        number_of_recommendations: maximum number of positions returned
            (default 100, the previously hard-coded value).

    Returns:
        A list of row positions into moviesdf, most similar first. When the
        user has no usable liked movie, the first positions of the catalogue
        are returned, never more than the catalogue holds.
    """
    liked_ids = usersdf['liked_movies'].iloc[user_index]
    try:
        liked_ids = list(liked_ids)
    except TypeError:
        # Non-iterable value, e.g. the NaN/None of a user without the field.
        liked_ids = []

    # Positions (not index labels) of the liked movies. Ids that are not in
    # the catalogue are skipped rather than aborting the whole run.
    liked_positions = []
    for movie_id in liked_ids:
        hits = np.flatnonzero((moviesdf['_id'] == movie_id).values)
        if hits.size:
            liked_positions.append(int(hits[0]))

    if not liked_positions:
        # Nothing to build a profile from: fall back to the first movies.
        return list(range(min(number_of_recommendations, vectMatrix.shape[0])))

    # Mean of the liked rows. np.asarray/reshape normalises both the
    # np.matrix produced by sparse input and the 1-D result of dense input.
    profile = np.asarray(vectMatrix[liked_positions].mean(axis=0)).reshape(1, -1)

    scores = cosine_similarity(profile, vectMatrix)[0]

    # A stable sort keeps equally similar movies in catalogue order. The
    # liked movies are excluded explicitly instead of blindly dropping
    # whichever movie happens to rank first.
    ranking = np.argsort(-scores, kind='stable')
    ranking = ranking[~np.isin(ranking, liked_positions)]

    return ranking[:number_of_recommendations].tolist()
+    
def loadRecDB():
    '''
    Return the MongoDB collection in which the recommendations are stored.

    The connection string is taken from the ADRECO_MONGO_URI environment
    variable when it is set; otherwise the historical hard-coded URI is used.

    Returns:
        The ``recommendations`` collection of the ``group3`` database. The
        client is deliberately left open: the returned collection needs it.
    '''
    # NOTE(review): the fallback URI embeds credentials in source control.
    # Rotate them and drop the fallback once ADRECO_MONGO_URI is configured.
    uri = os.environ.get(
        "ADRECO_MONGO_URI",
        "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true",
    )

    client = MongoClient(uri)
    return client.group3['recommendations']
+
def updateDB():
    '''
    Recompute the recommendations of every user and store them in MongoDB.

    The movies and the users are loaded, the weighted features of the movies
    are vectorised once, and for each user the positions returned by
    user_profile are translated into movie 'id' values. The result is
    upserted into the recommendations collection, keyed by user_id.
    '''
    # load the data and the target collection
    moviesdf = movieDbToDf()
    usersdf = userDbToDf()
    recdb = loadRecDB()

    # creates features column
    moviesdf['features'] = moviesdf.apply(
        lambda row: process_text(formatingFeatures(row)), axis=1
    )

    # term-count vectors of all movies, shared by every user
    vect_matrix = dfToVectMatrix(moviesdf)

    # user_profile reads the user with iloc, so iterate over positions and
    # read the matching _id with iloc as well.
    for position in range(len(usersdf)):
        user_id = usersdf['_id'].iloc[position]

        rec_indices = user_profile(position, moviesdf, usersdf, vect_matrix)

        recommended_movies = []
        if rec_indices is not None:
            recommended_movies = [
                int(movie_id) for movie_id in moviesdf['id'].iloc[rec_indices]
            ]

        record = {"user_id": user_id, "recommended_movies": recommended_movies}

        # update db (list form of the update, as before), creating the
        # document when the user has none yet
        recdb.update_one({"user_id": user_id}, [{"$set": record}], upsert=True)


# NOTE(review): this runs on import as well as on direct execution; consider
# an `if __name__ == "__main__":` guard if nothing relies on the import-time
# update.
updateDB()