Skip to content
Snippets Groups Projects
Commit a745aa1a authored by Tom Bray's avatar Tom Bray
Browse files

first recommendation algorithm based on overview

parent 9de50d69
No related branches found
No related tags found
1 merge request: !12 "Search algo"
Pipeline #42496 passed
from pymongo import MongoClient
import pandas as pd
#import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
def dbToDf(uri=None):
    '''
    Load the ``movies_populated`` collection of the ``group3`` MongoDB
    database into a pandas DataFrame.

    Parameters
    ----------
    uri : str, optional
        MongoDB connection string. When omitted, the connection string
        hard-coded below is used, so existing ``dbToDf()`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per document returned by ``collection.find()``.
    '''
    # SECURITY NOTE(review): the default below embeds a user name and password
    # in the source. Prefer passing ``uri`` from configuration and rotating
    # this password.
    if uri is None:
        uri = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"
    client = MongoClient(uri)
    try:
        collection = client.group3.movies_populated
        # The cursor is lazy: materialise it into a list while the client is
        # still open, then build the DataFrame from that list.
        return pd.DataFrame(list(collection.find()))
    finally:
        # Always release the connection, even if the query fails.
        client.close()
def preFiltering(df, percent=15):
    '''
    Keep only the most-voted movies.

    Rows without a ``vote_count`` are discarded, then only the rows whose
    ``vote_count`` is strictly greater than the ``100 - percent`` percentile
    of the remaining vote counts are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Movies table with a ``vote_count`` column.
    percent : number, optional
        Size, in percent, of the top slice to keep (default 15).

    Returns
    -------
    pandas.DataFrame
        A new DataFrame (the input is left untouched) that keeps the
        original index labels of the selected rows.
    '''
    voted = df[df['vote_count'].notna()]
    threshold = np.percentile(voted['vote_count'].values, 100 - percent)
    popular_enough = voted['vote_count'] > threshold
    return voted.loc[popular_enough].copy(deep=True)
def process_text(text):
    '''
    Normalise a text before computing its tf-idf representation.

    Every run of whitespace is collapsed into a single space (leading and
    trailing whitespace disappears as a consequence) and the result is
    converted to lowercase.
    '''
    return ' '.join(text.split()).lower()
def similarity(df):
    '''
    Compute the pairwise similarity between the movies of ``df``.

    The ``overview`` column is vectorised with a tf-idf model that ignores
    English stop words, and the cosine similarity between every pair of
    resulting vectors is returned.

    Returns
    -------
    The square matrix produced by ``cosine_similarity``; entry ``[i, j]`` is
    the similarity between the overviews at positions ``i`` and ``j``.
    '''
    vectorizer = TfidfVectorizer(stop_words='english')
    overview_vectors = vectorizer.fit_transform(df['overview'])
    # Compare the overview vectors against themselves: every movie with
    # every movie.
    return cosine_similarity(overview_vectors, overview_vectors)
def index_from_title(df, title):
    '''
    Return the index label of the first movie whose ``original_title``
    equals ``title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Movies table with an ``original_title`` column.
    title : str
        Exact title to look up.

    Raises
    ------
    IndexError
        If no row of ``df`` has this title. The exception type is the one
        the bare ``values[0]`` lookup already produced; only the message
        is now explicit about what is missing.
    '''
    matches = df[df['original_title'] == title].index.values
    if len(matches) == 0:
        raise IndexError(
            'no movie with original_title {!r} in the DataFrame'.format(title))
    return matches[0]
def title_from_index(df, index):
    '''
    Return the ``original_title`` of the first row whose index label equals
    ``index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Movies table with an ``original_title`` column.
    index
        Index label to look up.

    Raises
    ------
    IndexError
        If no row of ``df`` carries this index label. The exception type is
        the one the bare ``values[0]`` lookup already produced; only the
        message is now explicit about what is missing.
    '''
    titles = df.loc[df.index == index, 'original_title'].values
    if len(titles) == 0:
        raise IndexError(
            'no movie with index {!r} in the DataFrame'.format(index))
    return titles[0]
def recommendations_on_overview(original_title, df, number_of_recommendations):
    '''
    Recommend the movies whose overview is most similar to the overview of
    ``original_title``.

    Parameters
    ----------
    original_title : str
        Exact ``original_title`` of the movie to start from.
    df : pandas.DataFrame
        Movies table with ``vote_count``, ``overview`` and
        ``original_title`` columns. It is not modified.
    number_of_recommendations : int
        Maximum number of titles to return; negative values are treated
        as zero.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar. The queried movie itself
        is never part of the result.

    Raises
    ------
    IndexError
        Propagated from ``index_from_title`` when ``original_title`` is not
        present once the vote-count and missing-overview filters have been
        applied.
    '''
    # Keep only sufficiently voted movies that have an overview. A
    # non-inplace reset_index yields a fresh frame with positions 0..n-1,
    # so the column assignment below never writes into a slice of another
    # frame.
    candidates = preFiltering(df)
    candidates = candidates[candidates['overview'].notna()].reset_index(drop=True)

    # Normalise every overview (column-wise map instead of a row-wise apply).
    candidates['overview'] = candidates['overview'].map(process_text)

    # After the reset, index labels are positions, so this is also the row
    # of the queried movie in the similarity matrix.
    query_position = index_from_title(candidates, original_title)
    scores = similarity(candidates)[query_position]

    # Stable descending sort: ties keep their original order.
    ranking = np.argsort(-scores, kind='stable')

    # Drop the queried movie explicitly instead of assuming it ranks first:
    # a movie with an identical overview, or an overview made only of stop
    # words (all-zero vector), would otherwise let the query recommend itself.
    ranking = ranking[ranking != query_position]

    wanted = max(number_of_recommendations, 0)
    return candidates['original_title'].iloc[ranking[:wanted]]
# Script entry: load every movie from the database, then print the nine
# movies whose overview is closest to the overview of 'Batman'.
df = dbToDf()
print(
    recommendations_on_overview(
        original_title='Batman',
        df=df,
        number_of_recommendations=9,
    )
)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment