Skip to content
Snippets Groups Projects
Commit 36465458 authored by Tom Bray's avatar Tom Bray
Browse files

Merge branch 'search-algo' into 'master'

Search algo

See merge request !19
parents 055daeda c715efef
Branches
No related tags found
1 merge request!19Search algo
Pipeline #42631 passed
from pymongo import MongoClient
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import numpy as np
### Parameters ###
# Feature weights: formatingFeatures repeats each field's text this many
# times when it builds the 'features' string, so a larger weight makes that
# field contribute more tokens to the vectorized representation.
w_genres = 10
w_keywords = 17
w_actor = 15
w_director = 15
w_release_date = 8
# Uniform-weight alternative (every field repeated once), currently disabled:
#w_genres = 1
#w_keywords = 1
#w_actor = 1
#w_director = 1
#w_release_date = 1
def movieDbToDf():
    """
    Load the ``movies_populated`` MongoDB collection into a pandas DataFrame.

    Only the fields used by the recommender are projected: ``_id``, ``id``,
    ``original_title``, ``genre_ids``, ``overview``, ``vote_count``,
    ``release_date``, ``main_actor``, ``director`` and ``keywords``.

    Returns:
        pandas.DataFrame: one row per document returned by the query.
    """
    # NOTE(review): the connection string embeds credentials in source code;
    # consider loading it from configuration or the environment instead.
    mongo_uri = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"
    # projection on useful data
    projection = {"_id":1, "id":1, "original_title":1, "genre_ids": 1, "overview":1, "vote_count":1, "release_date":1, "main_actor":1, "director":1, "keywords":1}
    # The context manager closes the client (and its connection pool) once the
    # data has been read, instead of leaking one connection per call.
    with MongoClient(mongo_uri) as client:
        collection = client.group3.movies_populated
        # Materialise the cursor while the connection is still open.
        documents = list(collection.find({}, projection))
    return pd.DataFrame(documents)
def preFiltering(df, percent=90):
    """
    Remove movies that do not have enough votes to be evaluated.

    Rows whose ``vote_count`` is missing are dropped first. The threshold is
    the ``100 - percent`` percentile of the remaining vote counts, and only
    rows with a vote count strictly above that threshold are kept.

    Args:
        df: DataFrame with a ``vote_count`` column. It is not modified.
        percent: percentage used to derive the percentile threshold
            (``np.percentile(..., 100 - percent)``).

    Returns:
        pandas.DataFrame: an independent copy holding the retained rows. It is
        empty when no row has a usable ``vote_count``.
    """
    rated = df[df['vote_count'].notna()]
    # No usable vote counts: there is nothing to take a percentile of.
    if rated.empty:
        return rated.copy(deep=True)
    min_votes = np.percentile(rated['vote_count'].values, 100 - percent)
    # Filter first and copy only the surviving rows, rather than deep-copying
    # the whole frame just to discard most of it.
    return rated.loc[rated['vote_count'] > min_votes].copy(deep=True)
def process_text(text):
    """
    Normalise a text before it is vectorized.

    Every run of whitespace is collapsed into a single space (leading and
    trailing whitespace disappears) and the result is lowercased.
    """
    tokens = text.split()
    return ' '.join(tokens).lower()
def dfToVectMatrix(df):
    """
    Build the token-count matrix of the ``features`` column of a dataframe.

    A fresh CountVectorizer (English stop words removed) is fitted on the
    column, and the matrix it produces is returned.
    """
    feature_texts = df['features']
    return CountVectorizer(stop_words='english').fit_transform(feature_texts)
def similarity(df):
    """
    Compute the pairwise cosine similarity between all movies of a dataframe.

    The similarity is based on the count matrix that dfToVectMatrix builds
    from the ``features`` column.
    """
    counts = dfToVectMatrix(df)
    return cosine_similarity(counts, counts)
def index_from_title(df, title):
    """
    Return the index label of the first movie whose ``original_title``
    equals ``title``. Raises IndexError when no movie matches.
    """
    is_match = df['original_title'] == title
    matching_rows = df.loc[is_match]
    return matching_rows.index.values[0]
def title_from_index(df, index):
    """
    Return the ``original_title`` of the movie stored under index label
    ``index``. Raises IndexError when the label is absent.
    """
    titles = df.loc[df.index == index, 'original_title']
    return titles.values[0]
def id_from_index(df, index):
    """
    Return the ``_id`` of the movie stored under index label ``index``.
    Raises IndexError when the label is absent.
    """
    ids = df.loc[df.index == index, '_id']
    return ids.values[0]
def index_from_id(df, id):
    """
    Return the index label of the first movie whose ``_id`` equals ``id``.
    Raises IndexError when no movie matches.
    """
    is_match = df['_id'] == id
    matching_rows = df.loc[is_match]
    return matching_rows.index.values[0]
def recommendations(original_title, df, number_of_recommendations):
    """
    Recommend the movies most similar to the one titled ``original_title``.

    The ``features`` column of ``df`` is (re)built in place from
    formatingFeatures and process_text, vectorized with dfToVectMatrix, and
    every movie is scored by cosine similarity against the query movie.

    Args:
        original_title: exact ``original_title`` of the query movie.
        df: movies DataFrame. A ``features`` column is added or overwritten.
        number_of_recommendations: maximum number of titles to return.

    Returns:
        pandas.Series: ``original_title`` values of the best matches, most
        similar first. The query movie itself is never included.

    Raises:
        IndexError: if no movie has the requested title.
    """
    #prefilter the dataframe
    #df=preFiltering(df)
    #creates features column
    features = df.apply(formatingFeatures, axis=1)
    df['features'] = features.map(process_text)
    # Locate the query movie by row position. The similarity scores and
    # ``iloc`` below are positional, so an index label would pick the wrong
    # row whenever df does not carry a default 0..n-1 index.
    matches = np.flatnonzero((df['original_title'] == original_title).values)
    if matches.size == 0:
        raise IndexError("no movie titled {!r}".format(original_title))
    position = int(matches[0])
    #calculates similarity scores of the query movie against all movies
    # (one row only; the full N x N matrix is never needed)
    vect_matrix = dfToVectMatrix(df)
    scores = cosine_similarity(vect_matrix[position], vect_matrix)[0]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    # Drop the query movie explicitly: with tied scores it is not guaranteed
    # to be ranked first, so slicing off rank 0 could drop the wrong movie.
    recommendations_indices = [
        row for row, _ in ranked if row != position
    ][:number_of_recommendations]
    return df['original_title'].iloc[recommendations_indices]
def formatingFeatures(df_row):
    """
    Build the weighted ``features`` text of one movie row.

    The genre ids, keywords, main actor, director and release date are each
    turned into text and repeated ``w_genres``, ``w_keywords``, ``w_actor``,
    ``w_director`` and ``w_release_date`` times respectively, then joined
    with single spaces.

    Args:
        df_row: mapping-like row exposing ``genre_ids``, ``keywords``,
            ``main_actor``, ``director`` and ``release_date``.

    Returns:
        str: the concatenated, weighted feature text.
    """
    def _joined(values):
        # A missing list field (None / NaN) is not iterable; treat it as
        # "no items" instead of crashing the whole row-wise apply.
        try:
            return ' '.join(str(value) for value in values)
        except TypeError:
            return ''

    weighted_fields = (
        (_joined(df_row['genre_ids']), w_genres),
        (_joined(df_row['keywords']), w_keywords),
        (str(df_row['main_actor']), w_actor),
        (str(df_row['director']), w_director),
        (str(df_row['release_date']), w_release_date),
    )
    # Repeat each field's text by its weight, then join the fields in order.
    return ' '.join(' '.join([text] * weight) for text, weight in weighted_fields)
def userDbToDf():
    """
    Load the ``users`` MongoDB collection into a pandas DataFrame.

    Only the ``_id``, ``liked_movies`` and ``update`` fields are projected.

    Returns:
        pandas.DataFrame: one row per user document returned by the query.
    """
    # NOTE(review): the connection string embeds credentials in source code;
    # consider loading it from configuration or the environment instead.
    mongo_uri = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"
    # projection on useful data
    projection = {"_id":1, "liked_movies": 1, "update":1}
    # The context manager closes the client once the data has been read,
    # instead of leaking one connection per call.
    with MongoClient(mongo_uri) as client:
        collection = client.group3.users
        # Materialise the cursor while the connection is still open.
        documents = list(collection.find({}, projection))
    return pd.DataFrame(documents)
def user_profile(user_index, moviesdf, usersdf, vectMatrix, number_of_recommendations=100):
    """
    Rank movies for one user from the movies that user liked.

    The user's profile vector is the average of the ``vectMatrix`` rows of
    the liked movies. Every movie is scored by cosine similarity against that
    profile.

    Args:
        user_index: positional row of the user in ``usersdf``.
        moviesdf: movies DataFrame with an ``_id`` column.
        usersdf: users DataFrame with a ``liked_movies`` column.
        vectMatrix: feature matrix with one row per row of ``moviesdf``.
        number_of_recommendations: maximum number of rows to return
            (defaults to 100, the value that used to be hard-coded).

    Returns:
        list[int]: row positions of recommended movies, best first. Movies
        the user already liked are excluded. When the user has no liked
        movies, the first rows of the catalogue are returned as a fallback.
    """
    #fetch movies ID and index from the liked_movies
    liked_ids = usersdf['liked_movies'].iloc[user_index]
    # A user document without the field surfaces as None/NaN, not a list.
    if not isinstance(liked_ids, (list, tuple)):
        liked_ids = []
    # NOTE(review): index_from_id returns index labels, which are used below
    # as row positions of vectMatrix; this presumes moviesdf keeps its
    # default 0..n-1 index.
    liked_rows = [index_from_id(moviesdf, movie_id) for movie_id in liked_ids]
    if not liked_rows:
        # Never return positions beyond the end of the catalogue.
        return list(range(min(number_of_recommendations, vectMatrix.shape[0])))
    #creates the vector of the user (mean of the liked movies' vectors)
    vectuser = vectMatrix[liked_rows[0]]
    for row in liked_rows[1:]:
        vectuser = vectuser + vectMatrix[row]
    vectuser = vectuser / len(liked_rows)
    #calculates the user similarity
    scores = cosine_similarity(vectuser, vectMatrix)[0]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    #lists recommendations index of the movies, ordered by weights
    # Exclude what the user already liked instead of blindly dropping rank 0,
    # which discarded the best match and kept the other liked movies.
    already_liked = set(liked_rows)
    return [row for row, _ in ranked if row not in already_liked][:number_of_recommendations]
def loadRecDB():
    """
    Connect to the ``group3`` MongoDB database and return a handle on its
    ``recommendations`` collection.
    """
    mongo = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
    return mongo.group3['recommendations']
def updateDB():
    """
    Recompute recommendations for every user and store them in MongoDB.

    Movies and users are loaded, the movies' ``features`` column is built and
    vectorized once, and then one document per user is upserted into the
    recommendations collection. Each document holds ``user_id`` and the list
    of recommended movie ``id`` values.
    """
    #loadDB
    moviesdf = movieDbToDf()
    usersdf = userDbToDf()
    recdb = loadRecDB()
    #creates features column
    moviesdf['features'] = moviesdf.apply(formatingFeatures, axis=1)
    moviesdf['features'] = moviesdf.apply(lambda x: process_text(x.features), axis=1)
    #vectorizes the features of all movies once, shared by every user
    vect_matrix = dfToVectMatrix(moviesdf)
    # user_profile addresses users by row position, so iterate positions
    # explicitly rather than index labels.
    for position, user_id in enumerate(usersdf['_id']):
        record = {"user_id": user_id}
        recommended_movies = []
        #fetch recommended movies positions
        rec_indices = user_profile(position, moviesdf, usersdf, vect_matrix)
        if rec_indices is not None:
            recommended_movies = [int(movie_id) for movie_id in moviesdf['id'].iloc[rec_indices]]
        record['recommended_movies'] = recommended_movies
        #update db: pipeline-style $set, creating the document if it is absent
        recdb.update_one({"user_id": record["user_id"]}, [{"$set": record}], upsert=True)
# Module-level call: the whole recommendation refresh runs whenever this file
# is executed or imported.
updateDB()
from pymongo import MongoClient
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
def dbToDf():
    """
    Load the ``movies_populated`` MongoDB collection into a pandas DataFrame.

    Only the ``_id``, ``original_title``, ``genre``, ``id``, ``overview`` and
    ``vote_count`` fields are projected.

    Returns:
        pandas.DataFrame: one row per document returned by the query.
    """
    # NOTE(review): the connection string embeds credentials in source code;
    # consider loading it from configuration or the environment instead.
    mongo_uri = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"
    #projection on useful data
    # (an earlier, wider query was assigned to the cursor and immediately
    # overwritten without being read; only the effective one is kept)
    projection = {"_id":1, "original_title": 1, "genre": 1, "id":1, "overview":1, "vote_count":1}
    # The context manager closes the client once the data has been read.
    with MongoClient(mongo_uri) as client:
        collection = client.group3.movies_populated
        # Materialise the cursor while the connection is still open.
        documents = list(collection.find({}, projection))
    return pd.DataFrame(documents)
def preFiltering(df,percent=15):
......@@ -48,7 +49,7 @@ def similarity(df):
'''
tf_idf = TfidfVectorizer(stop_words='english')
tf_idf_matrix = tf_idf.fit_transform(df['overview']);
print(tf_idf_matrix)
# calculating cosine similarity between movies
cosine_similarity_matrix = cosine_similarity(tf_idf_matrix, tf_idf_matrix)
......
from doctest import DocFileSuite
from pymongo import MongoClient
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
def dbToDf():
......@@ -15,21 +11,11 @@ def dbToDf():
client = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
db = client.group3
collection = db.movies_populated
cursor = collection.find({},{'_id':1, "title":1, "overview":1, "vote_count":1})
cursor = collection.find({},{'_id':1, "title":1, "vote_count":1})
df=pd.DataFrame(list(cursor))
return df
def preFiltering(df, percent=15):
    """
    Remove movies that do not have enough votes to be evaluated.

    Rows whose ``vote_count`` is missing are dropped first. The threshold is
    the ``100 - percent`` percentile of the remaining vote counts, and only
    rows with a vote count strictly above that threshold are kept.

    Args:
        df: DataFrame with a ``vote_count`` column. It is not modified.
        percent: percentage used to derive the percentile threshold
            (``np.percentile(..., 100 - percent)``).

    Returns:
        pandas.DataFrame: an independent copy holding the retained rows. It is
        empty when no row has a usable ``vote_count``.
    """
    rated = df[df['vote_count'].notna()]
    # No usable vote counts: there is nothing to take a percentile of.
    if rated.empty:
        return rated.copy(deep=True)
    min_votes = np.percentile(rated['vote_count'].values, 100 - percent)
    # Filter first and copy only the surviving rows, rather than deep-copying
    # the whole frame just to discard most of it.
    return rated.loc[rated['vote_count'] > min_votes].copy(deep=True)
def process_text(text):
'''
This function transform a text before calculating the tf-idf
......@@ -54,18 +40,6 @@ def similarity(df,category='title'):
return cosine_similarity_matrix
def index_from_title(df, title):
    """
    Return the index label of the first movie whose ``title`` column equals
    ``title``. Raises IndexError when no movie matches.
    """
    is_match = df['title'] == title
    matching_rows = df.loc[is_match]
    return matching_rows.index.values[0]
def title_from_index(df, index):
    """
    Return the ``title`` of the movie stored under index label ``index``.
    Raises IndexError when the label is absent.
    """
    titles = df.loc[df.index == index, 'title']
    return titles.values[0]
def search_engine( query, df, number_of_recommendations):
#process text of all titles
......@@ -83,10 +57,9 @@ def search_engine( query, df, number_of_recommendations):
similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
recommendations_indices = [t[0] for t in similarity_scores_sorted[1:(number_of_recommendations+1)]]
print(recommendations_indices)
return df['title'].iloc[recommendations_indices]
# Load the movies and print the search results for the query 'sword'.
df = dbToDf()
# NOTE(review): the same query is issued twice with different result counts
# (9, then 5) — possibly the old and new lines of a diff; confirm which call
# is intended.
print(search_engine('sword', df, 9))
print(search_engine('sword', df, 5))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment