From c3631a87c2f64fcfd4a050c5e596bdb93ecadbfa Mon Sep 17 00:00:00 2001
From: Tom Bray <tom.bray@student-cs.fr>
Date: Fri, 10 Jun 2022 10:38:27 +0200
Subject: [PATCH] update DB with rec

---
 algo/adreco.py         | 240 +++++++++++++++++++++++++++++++++++++++++
 algo/recommendation.py |  11 +-
 algo/search_engine.py  |  31 +-----
 3 files changed, 248 insertions(+), 34 deletions(-)
 create mode 100644 algo/adreco.py

diff --git a/algo/adreco.py b/algo/adreco.py
new file mode 100644
index 0000000..66711f8
--- /dev/null
+++ b/algo/adreco.py
@@ -0,0 +1,240 @@
+from pymongo import MongoClient
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import seaborn as sns
+import numpy as np
+
+### Parameters ###
+# Weights: formatingFeatures repeats each feature's text this many times.
+w_genres = 10
+w_keywords = 17
+w_actor = 15
+w_director = 15
+w_release_date = 8
+
+#w_genres = 1
+#w_keywords = 1
+#w_actor = 1
+#w_director = 1
+#w_release_date = 1
+
+
+def movieDbToDf():
+    '''
+    Load the movies_populated collection of the group3 database into a pandas DataFrame.
+    NOTE(review): credentials are hard-coded in the URI below - move them out of the source.
+    '''
+    #projection on useful data
+    fields = {"_id":1, "id":1, "original_title":1, "genre_ids": 1,  "overview":1, "vote_count":1, "release_date":1, "main_actor":1, "director":1, "keywords":1}
+
+    #the with-block closes the connection once every document has been read
+    with MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true") as client:
+        cursor = client.group3.movies_populated.find({}, fields)
+        df=pd.DataFrame(list(cursor))
+
+    return df
+
+def preFiltering(df,percent=90):
+    '''
+    Keep the movies whose vote_count is strictly above the (100-percent)th percentile.
+    Rows without a vote_count are dropped first; a new DataFrame is returned.
+    '''
+    rated = df.loc[df['vote_count'].notna()]
+    threshold = np.percentile(rated['vote_count'].to_numpy(), 100-percent)
+    enough_votes = rated['vote_count'] > threshold
+    return rated.loc[enough_votes].copy(deep=True)
+
+def process_text(text):
+    '''
+    Normalise a text before it is vectorised.
+
+    Leading and trailing whitespace is removed, every inner run of
+    whitespace becomes one space, then the result is lower-cased.
+    '''
+    words = text.split()
+    single_spaced = ' '.join(words)
+
+    return single_spaced.lower()
+
+def dfToVectMatrix(df):
+    """
+    Vectorise the 'features' column of df with a CountVectorizer.
+
+    English stop words are ignored; the fitted vectorizer is discarded and
+    only the matrix returned by fit_transform is given back.
+    """
+    counter = CountVectorizer(stop_words='english')
+    return counter.fit_transform(df['features'])
+
+def similarity(df):
+    '''
+    Return the pairwise cosine similarity between all the rows of df.
+
+    The rows are compared through the matrix dfToVectMatrix builds from
+    their 'features' column.
+    '''
+    features_matrix = dfToVectMatrix(df)
+
+    return cosine_similarity(features_matrix, features_matrix)
+
+def index_from_title(df,title):
+    '''Return the index label of the first movie whose original_title is title.'''
+    same_title = df['original_title'] == title
+    matches = df.loc[same_title]
+    return matches.index.values[0]
+
+def title_from_index(df,index):
+    '''Return the original_title of the first movie whose index label is index.'''
+    at_index = df.index == index
+    titles = df.loc[at_index, 'original_title']
+    return titles.values[0]
+
+def id_from_index(df,index):
+    '''Return the _id of the first movie whose index label is index.'''
+    at_index = df.index == index
+    ids = df.loc[at_index, '_id']
+    return ids.values[0]
+
+def index_from_id(df,id):
+    '''
+    Return the index label of the first movie whose '_id' equals id.
+    Raises IndexError when no row matches.
+    '''
+    return df[df['_id']==id].index.values[0]
+
+
+def recommendations(original_title, df, number_of_recommendations):
+    '''
+    Return the titles of the movies most similar to original_title.
+
+    A 'features' column is added to df (the caller's DataFrame is modified),
+    then the movies are ranked by cosine similarity of their features to
+    those of the requested movie. At most number_of_recommendations titles
+    are returned, best first, as a pandas Series; the requested movie
+    itself is left out. Raises IndexError when the title is unknown.
+    '''
+    #creates features column
+    df['features']=df.apply(formatingFeatures,axis=1)
+    df['features']=df['features'].map(process_text)
+    index= index_from_title(df,original_title)
+
+    #only the row of the requested movie is needed, not the full NxN matrix
+    vect_matrix=dfToVectMatrix(df)
+    scores = cosine_similarity(vect_matrix[index], vect_matrix)[0]
+
+    #stable sort by decreasing score; drop the movie itself explicitly since
+    #a movie with identical features could otherwise take its place
+    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
+    recommendations_indices = [i for i in ranked if i != index][:number_of_recommendations]
+    return df['original_title'].iloc[recommendations_indices]
+
+def formatingFeatures(df_row):
+    '''
+    Build the weighted feature text of one movie row.
+
+    Genre ids and keywords are each joined with spaces; main actor,
+    director and release date are converted with str(). Every piece is
+    repeated w_<feature> times (module-level weights) and all the pieces
+    are joined with single spaces.
+    '''
+    genres = ' '.join(str(genre) for genre in df_row['genre_ids'])
+    keywords = ' '.join(str(keyword) for keyword in df_row['keywords'])
+
+    weighted_pieces = [
+        (genres, w_genres),
+        (keywords, w_keywords),
+        (str(df_row['main_actor']), w_actor),
+        (str(df_row['director']), w_director),
+        (str(df_row['release_date']), w_release_date),
+    ]
+    return ' '.join(' '.join([piece]*weight) for piece, weight in weighted_pieces)
+
+def userDbToDf():
+    '''
+    Load the users collection of the group3 database into a pandas DataFrame.
+    NOTE(review): credentials are hard-coded in the URI below - move them out of the source.
+    '''
+    #projection on useful data
+    fields = {"_id":1, "liked_movies": 1, "update":1}
+
+    #the with-block closes the connection once every document has been read
+    with MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true") as client:
+        cursor = client.group3.users.find({}, fields)
+        df=pd.DataFrame(list(cursor))
+
+    return df
+
+def user_profile( user_index, moviesdf, usersdf, vectMatrix ):
+    '''
+    Return up to 100 row positions of vectMatrix recommended to one user.
+
+    The user is represented by the mean of the vectMatrix rows of the movies
+    listed in usersdf['liked_movies'] at position user_index. Every movie is
+    ranked by cosine similarity to that mean, best first; movies the user
+    already liked are left out. Returns None when nothing is liked.
+    '''
+    moviesID=usersdf['liked_movies'].iloc[user_index]
+    moviesindex=[index_from_id(moviesdf,ID) for ID in moviesID]
+    if not moviesindex:
+        return None
+
+    #mean vector of the liked movies
+    vectuser=vectMatrix[moviesindex[0]]
+    for i in moviesindex[1:]:
+        vectuser = vectuser + vectMatrix[i]
+    vectuser=vectuser/len(moviesindex)
+
+    #rank every movie, then skip the liked ones instead of blindly dropping the top hit
+    calculated_sim = cosine_similarity(vectuser, vectMatrix)[0]
+    ranked = sorted(range(len(calculated_sim)), key=lambda i: calculated_sim[i], reverse=True)
+    liked = set(moviesindex)
+    return [i for i in ranked if i not in liked][:100]
+    
+def loadRecDB():
+    '''
+    Open a connection to MongoDB and return the recommendations
+    collection of the group3 database (the client is left open).
+    '''
+    mongo = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
+    return mongo.group3['recommendations']
+
+def updateDB():
+    '''
+    Recompute the recommendations of every user and upsert them in MongoDB.
+
+    One document {"user_id", "recommended_movies"} is written per user to
+    the collection returned by loadRecDB; a user for whom user_profile
+    returns None gets an empty recommended_movies list.
+    '''
+    #loadDB
+    moviesdf = movieDbToDf()
+    usersdf = userDbToDf()
+    recdb= loadRecDB()
+
+    #creates features column
+    moviesdf['features']=moviesdf.apply(formatingFeatures,axis=1)
+    moviesdf['features']=moviesdf['features'].map(process_text)
+
+    #vectorize the features once, the matrix is shared by all users
+    vect_matrix=dfToVectMatrix(moviesdf)
+
+    for i in usersdf.index:
+        user_id = usersdf['_id'][i]
+
+        #row positions of the recommended movies, or None
+        rec_indices=user_profile( i, moviesdf, usersdf, vect_matrix)
+
+        #translate the row positions into the 'id' values of the movies
+        recommended_movies=[]
+        if rec_indices is not None:
+            recommended_movies=[int(movie_id) for movie_id in moviesdf['id'].iloc[rec_indices]]
+
+        #update db: pipeline-style update with a single $set stage; upsert
+        #inserts the document when the user has none yet
+        rec_doc = {"user_id": user_id, "recommended_movies": recommended_movies}
+        recdb.update_one({ "user_id": user_id }, [{"$set": rec_doc}], upsert=True)
+            
+
+updateDB()  # NOTE(review): top-level call, runs the whole refresh on import as well as when executed
diff --git a/algo/recommendation.py b/algo/recommendation.py
index 735e40f..727cb08 100644
--- a/algo/recommendation.py
+++ b/algo/recommendation.py
@@ -1,23 +1,24 @@
 from pymongo import MongoClient
 import pandas as pd
-import ast
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import seaborn as sns
 import numpy as np
-import matplotlib.pyplot as plt
-
 
 def dbToDf():
     '''
     This function convert a DataBase from mongoDB into a pandas DataFrame
     '''
+    #load DB
     client = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
     db = client.group3
     collection = db.movies_populated
-    cursor = collection.find({},{"_id":1, "original_title": 1, "genre": 1, "id":1, "overview":1, "popularity":1, "vote_count":1, "release_date":1, "cast": {"name":1, "order":1}})
+
+    #projection on useful data
+    cursor = collection.find({},{"_id":1, "original_title": 1, "genre": 1, "id":1, "overview":1, "vote_count":1})
     df=pd.DataFrame(list(cursor))
+
     return df
 
 def preFiltering(df,percent=15):
@@ -48,7 +49,7 @@ def similarity(df):
     '''
     tf_idf = TfidfVectorizer(stop_words='english')
     tf_idf_matrix = tf_idf.fit_transform(df['overview']);
-    print(tf_idf_matrix)
+
     # calculating cosine similarity between movies
     cosine_similarity_matrix = cosine_similarity(tf_idf_matrix, tf_idf_matrix)
 
diff --git a/algo/search_engine.py b/algo/search_engine.py
index 578de91..ffdabcd 100644
--- a/algo/search_engine.py
+++ b/algo/search_engine.py
@@ -1,11 +1,7 @@
-from doctest import DocFileSuite
 from pymongo import MongoClient
 import pandas as pd
-from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
-import matplotlib.pyplot as plt
 
 
 def dbToDf():
@@ -15,21 +11,11 @@ def dbToDf():
     client = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
     db = client.group3
     collection = db.movies_populated
-    cursor = collection.find({},{'_id':1, "title":1, "overview":1, "vote_count":1})
+    cursor = collection.find({},{'_id':1, "title":1, "vote_count":1})
     df=pd.DataFrame(list(cursor))
 
     return df
 
-def preFiltering(df,percent=15):
-    '''
-    This function removes movies who do not have enough votes to be evaluated
-    ''' 
-    df = df[df['vote_count'].notna()]
-    min_votes = np.percentile(df['vote_count'].values, 100-percent)
-    newdf = df.copy(deep=True).loc[df['vote_count'] > min_votes]
-
-    return newdf
-
 def process_text(text):
     '''
     This function transform a text before calculating the tf-idf
@@ -54,18 +40,6 @@ def similarity(df,category='title'):
 
     return cosine_similarity_matrix
 
-def index_from_title(df,title):
-    '''
-    return the index of a movie from its title
-    '''
-    return df[df['title']==title].index.values[0]
-
-def title_from_index(df,index):
-    '''
-    return the title of a movie from its index
-    '''
-    return df[df.index==index].title.values[0]
-
 def search_engine( query, df, number_of_recommendations):
     
     #process text of all titles
@@ -83,10 +57,9 @@ def search_engine( query, df, number_of_recommendations):
     similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
     
     recommendations_indices = [t[0] for t in similarity_scores_sorted[1:(number_of_recommendations+1)]]
-    print(recommendations_indices)
     
     return df['title'].iloc[recommendations_indices]
 
 df = dbToDf()
 
-print(search_engine('sword', df, 9))
+print(search_engine('sword', df, 5))
-- 
GitLab