Skip to content
Snippets Groups Projects
Commit c3631a87 authored by Tom Bray's avatar Tom Bray
Browse files

update DB with rec

parent 539ae24b
No related branches found
No related tags found
1 merge request: !19 Search algo
Pipeline #42628 passed
from pymongo import MongoClient
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import numpy as np
### Parameters ###
# Feature weights. formatingFeatures() repeats each field's text this many
# times when it builds the 'features' string, so a field with a larger weight
# contributes more occurrences of its tokens to the count vectors.
w_genres = 10
w_keywords = 17
w_actor = 15
w_director = 15
w_release_date = 8
# Uniform-weight alternative (every field repeated once), kept disabled:
#w_genres = 1
#w_keywords = 1
#w_actor = 1
#w_director = 1
#w_release_date = 1
def movieDbToDf():
    '''
    Load the ``movies_populated`` collection of the ``group3`` MongoDB
    database into a pandas DataFrame.

    Only the fields listed in the projection below are fetched.

    Returns:
        pandas.DataFrame built from the list of fetched documents
        (one row per document).
    '''
    # NOTE(review): this connection string embeds a user name and password in
    # source control. It is kept unchanged so the function behaves as before,
    # but it should be moved to configuration and the password rotated.
    uri = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"
    # projection on useful data
    projection = {
        "_id": 1,
        "id": 1,
        "original_title": 1,
        "genre_ids": 1,
        "overview": 1,
        "vote_count": 1,
        "release_date": 1,
        "main_actor": 1,
        "director": 1,
        "keywords": 1,
    }
    # The context manager closes the client (and its sockets) as soon as the
    # cursor has been read completely; previously the client was never closed.
    with MongoClient(uri) as client:
        documents = list(client.group3.movies_populated.find({}, projection))
    return pd.DataFrame(documents)
def preFiltering(df,percent=90):
    '''
    Keep only the movies that have enough votes to be evaluated.

    Rows whose ``vote_count`` is missing are dropped first. The cut-off is the
    (100 - percent)-th percentile of the remaining vote counts, and only rows
    strictly above that cut-off are kept. The result is an independent copy.
    '''
    rated = df[df['vote_count'].notna()]
    threshold = np.percentile(rated['vote_count'].values, 100 - percent)
    enough_votes = rated['vote_count'] > threshold
    return rated.loc[enough_votes].copy(deep=True)
def process_text(text):
    '''
    Normalize a text before it is vectorized.

    Every run of whitespace is collapsed into a single space (leading and
    trailing whitespace disappears) and the result is lowercased.
    '''
    words = text.split()
    return ' '.join(words).lower()
def dfToVectMatrix(df):
    '''
    Build the token-count matrix of the ``features`` column of *df*.

    A fresh CountVectorizer (English stop words removed) is fitted on the
    column, and the matrix returned by its ``fit_transform`` is handed back.
    '''
    return CountVectorizer(stop_words='english').fit_transform(df['features'])
def similarity(df):
    '''
    Compute the pairwise cosine similarity between all movies of *df*,
    based on the count vectors of their ``features`` column.
    '''
    counts = dfToVectMatrix(df)
    return cosine_similarity(counts, counts)
def index_from_title(df,title):
    '''
    Return the index label of the first row whose ``original_title`` equals
    *title*. Raises IndexError when no row matches.
    '''
    matching_rows = df.loc[df['original_title'] == title]
    return matching_rows.index.values[0]
def title_from_index(df,index):
    '''
    Return the ``original_title`` of the first row whose index label equals
    *index*. Raises IndexError when no row matches.
    '''
    titles = df.loc[df.index == index, 'original_title']
    return titles.values[0]
def id_from_index(df,index):
    '''
    Return the ``_id`` of the first row whose index label equals *index*.
    Raises IndexError when no row matches.
    '''
    ids = df.loc[df.index == index, '_id']
    return ids.values[0]
def index_from_id(df,id):
    '''
    Return the index label of the first row whose ``_id`` equals *id*.

    Args:
        df: DataFrame with an ``_id`` column.
        id: value to look for in that column (the name shadows the builtin
            but is kept so keyword callers keep working).

    Raises:
        IndexError: when no row has this ``_id``.
    '''
    # A leftover debug print used to look up the hard-coded title 'Uncharted'
    # here. It raised IndexError (or KeyError) whenever that title or the
    # 'original_title' column was absent, so unrelated lookups failed. It has
    # been removed.
    matching_rows = df[df['_id'] == id]
    return matching_rows.index.values[0]
def recommendations(original_title, df, number_of_recommendations):
    '''
    Return the titles of the movies most similar to *original_title*.

    Similarity is the cosine similarity between the count vectors of the
    weighted ``features`` text built by formatingFeatures().

    Args:
        original_title: title of the reference movie (first match is used).
        df: movies DataFrame; it is not modified.
        number_of_recommendations: maximum number of titles to return.

    Returns:
        pandas.Series of ``original_title`` values, most similar first,
        never containing the reference movie itself.

    Raises:
        IndexError: when no movie has this title.
    '''
    # Work on a copy so the caller's DataFrame does not silently gain a
    # 'features' column.
    work = df.copy()
    work['features'] = work.apply(formatingFeatures, axis=1).map(process_text)

    # Use the row *position* of the reference movie: the similarity scores are
    # positional, so an index label would be wrong for a filtered or
    # re-indexed DataFrame.
    hits = np.flatnonzero((work['original_title'] == original_title).to_numpy())
    if hits.size == 0:
        raise IndexError("no movie titled %r" % (original_title,))
    position = int(hits[0])

    # Only one row of the similarity matrix is needed, so compare the
    # reference vector against all movies instead of building the N x N matrix.
    vect_matrix = dfToVectMatrix(work)
    scores = cosine_similarity(vect_matrix[position], vect_matrix)[0]

    # Stable descending order (ties keep row order). The reference movie is
    # excluded explicitly rather than assuming it always ranks first.
    ranked = [int(i) for i in np.argsort(-scores, kind='stable') if int(i) != position]
    return df['original_title'].iloc[ranked[:number_of_recommendations]]
def _is_missing(value):
    '''Return True for None or a float NaN (presumably how an absent document field shows up in the DataFrame).'''
    return value is None or (isinstance(value, float) and value != value)


def _joined_items(values):
    '''Return the items of a list-like field as one space-separated string ('' when the field is missing).'''
    if _is_missing(values):
        return ''
    return ' '.join(str(item) for item in values)


def _scalar_text(value):
    '''Return str(value), or '' when the field is missing.'''
    return '' if _is_missing(value) else str(value)


def _repeated(text, weight):
    '''Return *text* repeated *weight* times, separated by single spaces.'''
    return ' '.join([text] * weight)


def formatingFeatures(df_row):
    '''
    Build the weighted ``features`` text of one movie row.

    Each field is turned into text and repeated according to its module-level
    weight (w_genres, w_keywords, w_actor, w_director, w_release_date); the
    five weighted parts are then joined with single spaces.

    Args:
        df_row: one row of the movies DataFrame, with the keys ``genre_ids``,
            ``keywords``, ``main_actor``, ``director`` and ``release_date``.

    Returns:
        str. For rows where every field is present, the text is identical to
        what the previous implementation produced.

    Missing fields (None / NaN) contribute no tokens. Previously a missing
    ``genre_ids`` or ``keywords`` raised TypeError, and a missing scalar field
    was turned into the literal token 'nan' or 'None', which made all such
    movies look alike.
    '''
    parts = [
        _repeated(_joined_items(df_row['genre_ids']), w_genres),
        _repeated(_joined_items(df_row['keywords']), w_keywords),
        _repeated(_scalar_text(df_row['main_actor']), w_actor),
        _repeated(_scalar_text(df_row['director']), w_director),
        _repeated(_scalar_text(df_row['release_date']), w_release_date),
    ]
    return ' '.join(parts)
def userDbToDf():
    '''
    Load the ``users`` collection of the ``group3`` MongoDB database into a
    pandas DataFrame.

    Only the ``_id``, ``liked_movies`` and ``update`` fields are fetched.

    Returns:
        pandas.DataFrame built from the list of fetched documents
        (one row per document).
    '''
    # NOTE(review): this connection string embeds a user name and password in
    # source control. It is kept unchanged so the function behaves as before,
    # but it should be moved to configuration and the password rotated.
    uri = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"
    # projection on useful data
    projection = {"_id": 1, "liked_movies": 1, "update": 1}
    # The context manager closes the client (and its sockets) as soon as the
    # cursor has been read completely; previously the client was never closed.
    with MongoClient(uri) as client:
        documents = list(client.group3.users.find({}, projection))
    return pd.DataFrame(documents)
def user_profile( user_index, moviesdf, usersdf, vectMatrix, number_of_recommendations=100 ):
    '''
    Recommend movies for one user from the average of the movies they liked.

    The count vectors of the liked movies are averaged into a profile vector,
    every movie is scored by cosine similarity to that profile, and the best
    scoring movies the user has not already liked are returned.

    Args:
        user_index: row position of the user in *usersdf* (used with .iloc).
        moviesdf: movies DataFrame. NOTE(review): the labels returned by
            index_from_id() are used as row numbers of *vectMatrix*, which
            assumes moviesdf keeps its default 0..n-1 index - confirm.
        usersdf: users DataFrame with a ``liked_movies`` column.
        vectMatrix: count matrix with one row per movie of *moviesdf*.
        number_of_recommendations: maximum number of results (default 100,
            the value that used to be hard-coded).

    Returns:
        list of int row positions, best match first, or None when the user
        has no liked movies.
    '''
    liked_ids = usersdf['liked_movies'].iloc[user_index]
    # A user without the field shows up as None/NaN rather than a list.
    if liked_ids is None or isinstance(liked_ids, float):
        return None

    liked_positions = [index_from_id(moviesdf, movie_id) for movie_id in liked_ids]
    if not liked_positions:
        return None

    # Average the vectors of the liked movies.
    profile = vectMatrix[liked_positions[0]]
    for position in liked_positions[1:]:
        profile = profile + vectMatrix[position]
    profile = profile / len(liked_positions)

    scores = cosine_similarity(profile, vectMatrix)[0]

    # The previous code dropped the single best match (a slice copied from the
    # one-movie case), which both lost a good recommendation and could return
    # movies the user already liked. Exclude the liked movies explicitly.
    already_liked = {int(position) for position in liked_positions}
    ranked = (int(i) for i in np.argsort(-scores, kind='stable'))
    candidates = [i for i in ranked if i not in already_liked]
    return candidates[:number_of_recommendations]
def loadRecDB():
    '''
    Connect to MongoDB and return the ``recommendations`` collection of the
    ``group3`` database.

    The client is intentionally left open: the returned collection needs it.
    '''
    mongo = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
    return mongo.group3['recommendations']
def updateDB():
    '''
    Recompute the recommendations of every user and store them in MongoDB.

    Loads the movies and users, builds the weighted ``features`` text and its
    count matrix once, then upserts one document per user into the
    ``recommendations`` collection with the keys ``user_id`` and
    ``recommended_movies`` (a list of movie ``id`` values, empty when the user
    has no liked movies).
    '''
    # load the data and the target collection
    moviesdf = movieDbToDf()
    usersdf = userDbToDf()
    recdb = loadRecDB()

    # creates features column
    moviesdf['features'] = moviesdf.apply(formatingFeatures, axis=1)
    moviesdf['features'] = moviesdf.apply(lambda x: process_text(x.features), axis=1)

    # count vectors of all movies, shared by every user
    vect_matrix = dfToVectMatrix(moviesdf)

    # user_profile() addresses users by row position (.iloc), so enumerate the
    # rows instead of passing index labels.
    for position, user_id in enumerate(usersdf['_id']):
        record = {"user_id": user_id}

        rec_indices = user_profile(position, moviesdf, usersdf, vect_matrix)
        if rec_indices is not None:
            recommended_movies = [int(movie_id) for movie_id in moviesdf['id'].iloc[rec_indices]]
        else:
            recommended_movies = []
        record['recommended_movies'] = recommended_movies

        # update db: pipeline-style $set, creating the document when the user
        # has none yet
        recdb.update_one({"user_id": record["user_id"]}, [{"$set": record}], upsert=True)
# Module-level call: the full recommendation refresh runs as soon as this
# file is executed (there is no __main__ guard, so importing it runs it too).
updateDB()
from pymongo import MongoClient from pymongo import MongoClient
import pandas as pd import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns import seaborn as sns
import numpy as np import numpy as np
import matplotlib.pyplot as plt
def dbToDf(): def dbToDf():
''' '''
This function convert a DataBase from mongoDB into a pandas DataFrame This function convert a DataBase from mongoDB into a pandas DataFrame
''' '''
#load DB
client = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true") client = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
db = client.group3 db = client.group3
collection = db.movies_populated collection = db.movies_populated
cursor = collection.find({},{"_id":1, "original_title": 1, "genre": 1, "id":1, "overview":1, "popularity":1, "vote_count":1, "release_date":1, "cast": {"name":1, "order":1}})
#projection on useful data
cursor = collection.find({},{"_id":1, "original_title": 1, "genre": 1, "id":1, "overview":1, "vote_count":1})
df=pd.DataFrame(list(cursor)) df=pd.DataFrame(list(cursor))
return df return df
def preFiltering(df,percent=15): def preFiltering(df,percent=15):
...@@ -48,7 +49,7 @@ def similarity(df): ...@@ -48,7 +49,7 @@ def similarity(df):
''' '''
tf_idf = TfidfVectorizer(stop_words='english') tf_idf = TfidfVectorizer(stop_words='english')
tf_idf_matrix = tf_idf.fit_transform(df['overview']); tf_idf_matrix = tf_idf.fit_transform(df['overview']);
print(tf_idf_matrix)
# calculating cosine similarity between movies # calculating cosine similarity between movies
cosine_similarity_matrix = cosine_similarity(tf_idf_matrix, tf_idf_matrix) cosine_similarity_matrix = cosine_similarity(tf_idf_matrix, tf_idf_matrix)
......
from doctest import DocFileSuite
from pymongo import MongoClient from pymongo import MongoClient
import pandas as pd import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
def dbToDf(): def dbToDf():
...@@ -15,21 +11,11 @@ def dbToDf(): ...@@ -15,21 +11,11 @@ def dbToDf():
client = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true") client = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
db = client.group3 db = client.group3
collection = db.movies_populated collection = db.movies_populated
cursor = collection.find({},{'_id':1, "title":1, "overview":1, "vote_count":1}) cursor = collection.find({},{'_id':1, "title":1, "vote_count":1})
df=pd.DataFrame(list(cursor)) df=pd.DataFrame(list(cursor))
return df return df
def preFiltering(df,percent=15):
    '''
    Keep only the movies that have enough votes to be evaluated.

    Rows whose ``vote_count`` is missing are dropped first. The cut-off is the
    (100 - percent)-th percentile of the remaining vote counts, and only rows
    strictly above that cut-off are kept. The result is an independent copy.
    '''
    rated = df[df['vote_count'].notna()]
    threshold = np.percentile(rated['vote_count'].values, 100 - percent)
    enough_votes = rated['vote_count'] > threshold
    return rated.loc[enough_votes].copy(deep=True)
def process_text(text): def process_text(text):
''' '''
This function transform a text before calculating the tf-idf This function transform a text before calculating the tf-idf
...@@ -54,18 +40,6 @@ def similarity(df,category='title'): ...@@ -54,18 +40,6 @@ def similarity(df,category='title'):
return cosine_similarity_matrix return cosine_similarity_matrix
def index_from_title(df,title):
    '''
    Return the index label of the first row whose ``title`` equals *title*.
    Raises IndexError when no row matches.
    '''
    matching_rows = df.loc[df['title'] == title]
    return matching_rows.index.values[0]
def title_from_index(df,index):
    '''
    Return the ``title`` of the first row whose index label equals *index*.
    Raises IndexError when no row matches.
    '''
    titles = df.loc[df.index == index, 'title']
    return titles.values[0]
def search_engine( query, df, number_of_recommendations): def search_engine( query, df, number_of_recommendations):
#process text of all titles #process text of all titles
...@@ -83,10 +57,9 @@ def search_engine( query, df, number_of_recommendations): ...@@ -83,10 +57,9 @@ def search_engine( query, df, number_of_recommendations):
similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True) similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
recommendations_indices = [t[0] for t in similarity_scores_sorted[1:(number_of_recommendations+1)]] recommendations_indices = [t[0] for t in similarity_scores_sorted[1:(number_of_recommendations+1)]]
print(recommendations_indices)
return df['title'].iloc[recommendations_indices] return df['title'].iloc[recommendations_indices]
df = dbToDf() df = dbToDf()
print(search_engine('sword', df, 9)) print(search_engine('sword', df, 5))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment