update DB with rec

c3631a87 · Tom Bray · 539ae24b · c3631a87 · c3631a87 · c3631a87
Commit c3631a87 authored 3 years ago by Tom Bray
--- a/algo/adreco.py
+++ b/algo/adreco.py
+from pymongo import MongoClient
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import seaborn as sns
+import numpy as np
+### Parameters ###
+# Feature weights used by formatingFeatures: the text of each feature is
+# repeated this many times in the movie's 'features' string, so a larger
+# weight gives that feature more influence on the count-based cosine
+# similarity. Set every weight to 1 for an unweighted baseline.
+w_genres = 10
+w_keywords = 17
+w_actor = 15
+w_director = 15
+w_release_date = 8
+def movieDbToDf():
+    '''
+    Load the movies collection from MongoDB into a pandas DataFrame.
+
+    Only the fields used to build the recommendation features are fetched
+    (_id, id, original_title, genre_ids, overview, vote_count, release_date,
+    main_actor, director and keywords).
+
+    Returns:
+        A DataFrame with one row per document of group3.movies_populated.
+    '''
+    # NOTE(security): the credentials are hard-coded in the connection string;
+    # they should be moved out of the source (configuration / environment).
+    uri = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"
+    #projection on useful data
+    projection = {"_id":1, "id":1, "original_title":1, "genre_ids": 1,  "overview":1, "vote_count":1, "release_date":1, "main_actor":1, "director":1, "keywords":1}
+    # The context manager closes the connection once the cursor has been fully
+    # read, instead of leaking one open client per call.
+    with MongoClient(uri) as client:
+        cursor = client.group3.movies_populated.find({}, projection)
+        return pd.DataFrame(list(cursor))
+def preFiltering(df,percent=90):
+    '''
+    Keep only the movies that have enough votes to be evaluated.
+
+    Rows without a vote_count are dropped first; the remaining rows are kept
+    only when their vote_count is strictly greater than the
+    (100 - percent)-th percentile of the vote counts.
+
+    Returns a new DataFrame; the input frame is left untouched.
+    '''
+    voted = df[df['vote_count'].notna()]
+    threshold = np.percentile(voted['vote_count'].values, 100-percent)
+    enough_votes = voted['vote_count'] > threshold
+    return voted.loc[enough_votes].copy(deep=True)
+def process_text(text):
+    '''
+    Normalise a text before it is vectorised: every run of whitespace is
+    collapsed to a single space (leading/trailing whitespace is removed) and
+    the result is lowercased.
+    '''
+    words = text.split()
+    return ' '.join(words).lower()
+def dfToVectMatrix(df):
+    """
+    Vectorise the 'features' column of a dataframe.
+
+    A CountVectorizer (English stop words removed) is fitted on df['features']
+    and the resulting document-term count matrix is returned.
+    """
+    vectorizer = CountVectorizer(stop_words='english')
+    return vectorizer.fit_transform(df['features'])
+def similarity(df):
+    '''
+    Compute the pairwise cosine similarity between all movies of df, based on
+    the count vectors of their 'features' column.
+    '''
+    counts = dfToVectMatrix(df)
+    return cosine_similarity(counts, counts)
+def index_from_title(df,title):
+    '''
+    Return the index label of the first movie whose original_title equals
+    title. Raises IndexError when no movie matches.
+    '''
+    matching = df.loc[df['original_title'] == title]
+    return matching.index.values[0]
+def title_from_index(df,index):
+    '''
+    Return the original_title of the movie stored under the given index
+    label. Raises IndexError when the label is absent.
+    '''
+    titles = df.loc[df.index == index, 'original_title']
+    return titles.values[0]
+def id_from_index(df,index):
+    '''
+    Return the _id of the movie stored under the given index label.
+    Raises IndexError when the label is absent.
+    '''
+    ids = df.loc[df.index == index, '_id']
+    return ids.values[0]
+def index_from_id(df,id):
+    '''
+    Return the index label of the first movie whose _id equals id.
+    Raises IndexError when no movie has that _id.
+    '''
+    # The former debug print looked up the movie titled 'Uncharted' on every
+    # call and crashed whenever that title (or column) was missing; removed.
+    return df[df['_id']==id].index.values[0]
+def recommendations(original_title, df, number_of_recommendations):
+    '''
+    Return the titles of the movies most similar to original_title.
+
+    Parameters:
+        original_title: title of the reference movie (matched against the
+            'original_title' column; the first match is used).
+        df: movies dataframe. Side effect: its 'features' column is created
+            or overwritten in place.
+        number_of_recommendations: maximum number of titles to return.
+
+    Returns:
+        A Series of original_title values, most similar first.
+
+    Raises:
+        IndexError: when no movie has the requested title.
+    '''
+    #creates features column
+    df['features']=df.apply(formatingFeatures,axis=1).map(process_text)
+    # Row *position* of the reference movie. The vectorised matrix is indexed
+    # by position, so the index label (which differs from the position as
+    # soon as the frame has been filtered) must not be used here.
+    matches = np.flatnonzero((df['original_title']==original_title).values)
+    if matches.size == 0:
+        raise IndexError("no movie titled %r" % (original_title,))
+    position = int(matches[0])
+    #calculates similarity scores of the reference movie against all movies
+    # (one row only; the full N x N matrix is not needed)
+    vect_matrix=dfToVectMatrix(df)
+    scores = cosine_similarity(vect_matrix[position], vect_matrix)[0]
+    # Stable descending order. The reference movie is removed explicitly
+    # instead of assuming it is ranked first: a movie with identical features
+    # would tie with it and could otherwise push it into the results.
+    order = np.argsort(-scores, kind='stable')
+    order = order[order != position][:number_of_recommendations]
+    return df['original_title'].iloc[order]
+def formatingFeatures(df_row):
+    '''
+    Build the weighted feature text of one movie row.
+
+    Genre ids and keywords are each joined into a space-separated string;
+    then genres, keywords, main actor, director and release date are each
+    repeated according to their w_* weight and concatenated with spaces.
+    '''
+    def repeated(text, weight):
+        # `weight` copies of `text`, separated by single spaces
+        return ' '.join([text]*weight)
+    genres = ' '.join(str(genre) for genre in df_row['genre_ids'])
+    keywords = ' '.join(str(keyword) for keyword in df_row['keywords'])
+    weighted_parts = [
+        repeated(genres, w_genres),
+        repeated(keywords, w_keywords),
+        repeated(str(df_row['main_actor']), w_actor),
+        repeated(str(df_row['director']), w_director),
+        repeated(str(df_row['release_date']), w_release_date),
+    ]
+    return ' '.join(weighted_parts)
+def userDbToDf():
+    '''
+    Load the users collection from MongoDB into a pandas DataFrame.
+
+    Only the _id, liked_movies and update fields are fetched.
+
+    Returns:
+        A DataFrame with one row per document of group3.users.
+    '''
+    # NOTE(security): the credentials are hard-coded in the connection string;
+    # they should be moved out of the source (configuration / environment).
+    uri = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"
+    #projection on useful data
+    projection = {"_id":1, "liked_movies": 1, "update":1}
+    # The context manager closes the connection once the cursor has been fully
+    # read, instead of leaking one open client per call.
+    with MongoClient(uri) as client:
+        cursor = client.group3.users.find({}, projection)
+        return pd.DataFrame(list(cursor))
+def user_profile( user_index, moviesdf, usersdf, vectMatrix, number_of_recommendations=100 ):
+    '''
+    Rank the movies against the average feature vector of a user's liked movies.
+
+    Parameters:
+        user_index: row position of the user in usersdf (used with .iloc).
+        moviesdf: movies dataframe; liked movie ids are resolved against its
+            '_id' column through index_from_id.
+        usersdf: users dataframe with a 'liked_movies' column.
+        vectMatrix: feature matrix of the movies. Its rows are addressed with
+            the index labels of moviesdf, so moviesdf must keep an index whose
+            labels equal the row positions.
+        number_of_recommendations: maximum number of row indices returned
+            (defaults to 100, the value previously hard-coded).
+
+    Returns:
+        A list of row indices into vectMatrix, most similar first, or None
+        when the user has no liked movie.
+
+    Raises:
+        IndexError: when a liked movie id is not present in moviesdf.
+    '''
+    liked_ids = usersdf['liked_movies'].iloc[user_index]
+    liked_rows = [index_from_id(moviesdf, movie_id) for movie_id in liked_ids]
+    if not liked_rows:
+        return None
+    # User profile = mean of the feature vectors of the liked movies.
+    profile = vectMatrix[liked_rows[0]]
+    for row in liked_rows[1:]:
+        profile = profile + vectMatrix[row]
+    profile = profile/len(liked_rows)
+    scores = cosine_similarity(profile, vectMatrix)[0]
+    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
+    # NOTE(review): the best-ranked entry is skipped, as before. With a single
+    # liked movie that entry is the movie itself; with several liked movies it
+    # is simply the top match -- confirm this is intended.
+    return [row for row, _ in ranked[1:(number_of_recommendations+1)]]
+def loadRecDB():
+    '''
+    Open a connection to MongoDB and return the 'recommendations' collection
+    of the group3 database. The client is left open so that the returned
+    collection stays usable.
+    '''
+    #load DB
+    connection = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
+    return connection.group3['recommendations']
+def updateDB():
+    '''
+    Recompute the recommended movies of every user and upsert them into the
+    'recommendations' collection.
+
+    For each user a document {"user_id": ..., "recommended_movies": [...]} is
+    written; the list holds the integer 'id' of the recommended movies and is
+    empty when the user has no liked movie.
+    '''
+    #loadDB
+    moviesdf = movieDbToDf()
+    usersdf = userDbToDf()
+    recdb = loadRecDB()
+    #creates features column
+    moviesdf['features']=moviesdf.apply(formatingFeatures,axis=1).map(process_text)
+    #vectorises the features of all movies once, shared by every user
+    vect_matrix=dfToVectMatrix(moviesdf)
+    # user_profile addresses users by row position (.iloc), so iterate with
+    # positions rather than index labels.
+    for position, user_id in enumerate(usersdf['_id']):
+        rec_indices = user_profile(position, moviesdf, usersdf, vect_matrix)
+        if rec_indices is None:
+            recommended_movies = []
+        else:
+            recommended_movies = [int(movie_id) for movie_id in moviesdf['id'].iloc[rec_indices]]
+        document = {"user_id": user_id, "recommended_movies": recommended_movies}
+        #update db:
+        recdb.update_one({ "user_id": user_id }, [{"$set": document}], upsert=True)
+# Runs the refresh whenever this module is executed or imported.
+updateDB()
--- a/algo/recommendation.py
+++ b/algo/recommendation.py
 from pymongo import MongoClient
 import pandas as pd
-import ast
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import seaborn as sns
 import numpy as np
-import matplotlib.pyplot as plt
 def dbToDf():
    '''
    This function convert a DataBase from mongoDB into a pandas DataFrame
    '''
+    #load DB
    client = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
    db = client.group3
    collection = db.movies_populated
-    cursor = collection.find({},{"_id":1, "original_title": 1, "genre": 1, "id":1, "overview":1, "popularity":1, "vote_count":1, "release_date":1, "cast": {"name":1, "order":1}})
+    #projection on useful data
+    cursor = collection.find({},{"_id":1, "original_title": 1, "genre": 1, "id":1, "overview":1, "vote_count":1})
    df=pd.DataFrame(list(cursor))
    return df
 def preFiltering(df,percent=15):
@@ -48,7 +49,7 @@ def similarity(df):
    '''
    tf_idf = TfidfVectorizer(stop_words='english')
    tf_idf_matrix = tf_idf.fit_transform(df['overview']);
-    print(tf_idf_matrix)
    # calculating cosine similarity between movies
    cosine_similarity_matrix = cosine_similarity(tf_idf_matrix, tf_idf_matrix)

--- a/algo/search_engine.py
+++ b/algo/search_engine.py
-from doctest import DocFileSuite
 from pymongo import MongoClient
 import pandas as pd
-from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
-import matplotlib.pyplot as plt
 def dbToDf():
@@ -15,21 +11,11 @@ def dbToDf():
    client = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
    db = client.group3
    collection = db.movies_populated
-    cursor = collection.find({},{'_id':1, "title":1, "overview":1, "vote_count":1})
+    cursor = collection.find({},{'_id':1, "title":1, "vote_count":1})
    df=pd.DataFrame(list(cursor))
    return df
-def preFiltering(df,percent=15):
-    '''
-    This function removes movies who do not have enough votes to be evaluated
-    ''' 
-    df = df[df['vote_count'].notna()]
-    min_votes = np.percentile(df['vote_count'].values, 100-percent)
-    newdf = df.copy(deep=True).loc[df['vote_count'] > min_votes]
-    return newdf
 def process_text(text):
    '''
    This function transform a text before calculating the tf-idf
@@ -54,18 +40,6 @@ def similarity(df,category='title'):
    return cosine_similarity_matrix
-def index_from_title(df,title):
-    '''
-    return the index of a movie from its title
-    '''
-    return df[df['title']==title].index.values[0]
-def title_from_index(df,index):
-    '''
-    return the title of a movie from its index
-    '''
-    return df[df.index==index].title.values[0]
 def search_engine( query, df, number_of_recommendations):
    #process text of all titles
@@ -83,10 +57,9 @@ def search_engine( query, df, number_of_recommendations):
    similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    recommendations_indices = [t[0] for t in similarity_scores_sorted[1:(number_of_recommendations+1)]]
-    print(recommendations_indices)
    return df['title'].iloc[recommendations_indices]
 df = dbToDf()
-print(search_engine('sword', df, 9))
+print(search_engine('sword', df, 5))