Skip to content
Snippets Groups Projects
Commit f701b26f authored by Tom Bray
Browse files

Merge branch 'search-algo' into 'master'

Search algo

See merge request !12
parents 203be8f8 539ae24b
Branches
No related tags found
1 merge request!12Search algo
Pipeline #42561 passed
from pymongo import MongoClient
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
def dbToDf(uri=None):
    '''
    Load the ``movies_populated`` collection of the ``group3`` MongoDB
    database into a pandas DataFrame.

    Only the fields needed by the recommender are requested: ``_id``,
    ``original_title``, ``genre``, ``id``, ``overview``, ``popularity``,
    ``vote_count``, ``release_date`` and, for each ``cast`` entry, its
    ``name`` and ``order``.

    Parameters
    ----------
    uri : str, optional
        MongoDB connection string. When omitted, the ``MONGODB_URI``
        environment variable is used, and failing that the historical
        project URI.

    Returns
    -------
    pandas.DataFrame
        One row per document returned by the query.
    '''
    import os

    if uri is None:
        # NOTE(security): the fallback keeps existing callers working, but it
        # embeds credentials in source control. Prefer MONGODB_URI and rotate
        # this password.
        uri = os.environ.get(
            "MONGODB_URI",
            "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true",
        )

    projection = {
        "_id": 1,
        "original_title": 1,
        "genre": 1,
        "id": 1,
        "overview": 1,
        "popularity": 1,
        "vote_count": 1,
        "release_date": 1,
        "cast": {"name": 1, "order": 1},
    }

    # The context manager closes the client's connections once the documents
    # have been read; the cursor must therefore be exhausted inside the block.
    with MongoClient(uri) as client:
        collection = client.group3.movies_populated
        documents = list(collection.find({}, projection))
    return pd.DataFrame(documents)
def preFiltering(df, percent=15):
    '''
    Keep only the movies whose vote count is high enough to be meaningful.

    Rows without a ``vote_count`` are discarded, then only the rows whose
    ``vote_count`` is strictly greater than the ``100 - percent`` percentile
    of the remaining vote counts are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Movies; must contain a ``vote_count`` column. It is not modified.
    percent : float, default 15
        Approximate share (in percent) of the most-voted movies to keep.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows, with their original index
        labels.
    '''
    voted = df[df['vote_count'].notna()]
    if voted.empty:
        # A percentile of an empty sample is undefined; nothing can qualify.
        return voted.copy(deep=True)

    min_votes = np.percentile(voted['vote_count'].values, 100 - percent)
    # Filter first, then copy: only the surviving rows get duplicated.
    return voted.loc[voted['vote_count'] > min_votes].copy(deep=True)
def process_text(text):
    '''
    Normalise a text before it is fed to the tf-idf vectoriser.

    Every run of whitespace (spaces, tabs, newlines) becomes one single
    space, leading and trailing whitespace disappears, and the result is
    converted to lowercase.
    '''
    # split() without arguments cuts on any whitespace run, so joining the
    # pieces back with one space collapses the runs
    collapsed = ' '.join(text.split())
    return collapsed.lower()
def similarity(df, category='overview'):
    '''
    Compute the pairwise cosine similarity between movies from the TF-IDF
    representation of one of their text columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Movies; ``df[category]`` must contain the texts to compare.
    category : str, default 'overview'
        Name of the text column to vectorise.

    Returns
    -------
    Square similarity matrix in which entry ``[i, j]`` is the cosine
    similarity between the texts at positions ``i`` and ``j`` of
    ``df[category]``.
    '''
    # English stop words are dropped so that very common words do not
    # contribute to the similarity.
    tf_idf = TfidfVectorizer(stop_words='english')
    tf_idf_matrix = tf_idf.fit_transform(df[category])
    # Comparing the matrix with itself yields every movie-vs-movie score.
    return cosine_similarity(tf_idf_matrix, tf_idf_matrix)
def index_from_title(df, title):
    '''
    Return the index label of the first movie whose ``original_title``
    equals ``title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Movies; must contain an ``original_title`` column.
    title : str
        Exact title to look up (the comparison is case-sensitive).

    Raises
    ------
    IndexError
        If no row has this title.
    '''
    matching_labels = df.index[(df['original_title'] == title).to_numpy()].values
    if len(matching_labels) == 0:
        # Same exception type as before, but with an actionable message.
        raise IndexError(
            "no movie with original_title {!r}".format(title)
        )
    return matching_labels[0]
def title_from_index(df, index):
    '''
    Return the ``original_title`` of the first row whose index label equals
    ``index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Movies; must contain an ``original_title`` column.
    index
        Index label of the movie to look up.

    Raises
    ------
    IndexError
        If no row carries this index label.
    '''
    matching_titles = df.loc[df.index == index, 'original_title'].values
    if len(matching_titles) == 0:
        # Same exception type as before, but with an actionable message.
        raise IndexError("no movie with index {!r}".format(index))
    return matching_titles[0]
def recommendations_on_overview(original_title, df, number_of_recommendations):
    '''
    Recommend the movies whose overview is the most similar to the overview
    of ``original_title``.

    The frame is first reduced with ``preFiltering`` and rows without an
    overview are dropped; overviews are normalised with ``process_text`` and
    compared with ``similarity``.

    Parameters
    ----------
    original_title : str
        Exact ``original_title`` of the reference movie. It must survive the
        filtering steps, otherwise ``index_from_title`` raises.
    df : pandas.DataFrame
        Movies with ``original_title``, ``overview`` and ``vote_count``
        columns. It is not modified.
    number_of_recommendations : int
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Titles of the recommended movies, most similar first. The index is
        the position of each movie in the filtered frame.
    '''
    # prefilter the dataframe
    df = preFiltering(df)
    # Remove rows with a missing overview. reset_index returns a new frame,
    # so labels equal positions and the assignment below does not write into
    # a slice of the caller's data.
    df = df[df['overview'].notna()].reset_index(drop=True)
    # process text of all overviews
    df['overview'] = df['overview'].map(process_text)
    index = index_from_title(df, original_title)
    # similarity scores of the reference movie against all movies
    scores = similarity(df)[index]
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    # Exclude the reference movie by index, not by rank: with duplicated
    # overviews (or an overview made only of stop words) it is not
    # guaranteed to come first.
    recommendations_indices = [i for i in ranked if i != index]
    recommendations_indices = recommendations_indices[:max(number_of_recommendations, 0)]
    return df['original_title'].iloc[recommendations_indices]
# Demo run: load every movie from the database, then print the nine titles
# whose overview is closest to the one of 'Avatar'.
df = dbToDf()
print(
    recommendations_on_overview(
        original_title='Avatar',
        df=df,
        number_of_recommendations=9,
    )
)
from doctest import DocFileSuite
from pymongo import MongoClient
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
def dbToDf(uri=None):
    '''
    Load the ``movies_populated`` collection of the ``group3`` MongoDB
    database into a pandas DataFrame.

    Only the fields used by the search engine are requested: ``_id``,
    ``title``, ``overview`` and ``vote_count``.

    Parameters
    ----------
    uri : str, optional
        MongoDB connection string. When omitted, the ``MONGODB_URI``
        environment variable is used, and failing that the historical
        project URI.

    Returns
    -------
    pandas.DataFrame
        One row per document returned by the query.
    '''
    import os

    if uri is None:
        # NOTE(security): the fallback keeps existing callers working, but it
        # embeds credentials in source control. Prefer MONGODB_URI and rotate
        # this password.
        uri = os.environ.get(
            "MONGODB_URI",
            "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true",
        )

    projection = {'_id': 1, "title": 1, "overview": 1, "vote_count": 1}

    # Exhaust the cursor inside the block: leaving it closes the client's
    # connections.
    with MongoClient(uri) as client:
        documents = list(client.group3.movies_populated.find({}, projection))
    return pd.DataFrame(documents)
def preFiltering(df, percent=15):
    '''
    Remove the movies that do not have enough votes to be evaluated.

    Rows whose ``vote_count`` is missing are dropped first. The threshold is
    the ``100 - percent`` percentile of the remaining vote counts, and only
    rows strictly above it are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Movies; must contain a ``vote_count`` column. It is left untouched.
    percent : float, default 15
        Approximate share (in percent) of the most-voted movies to keep.

    Returns
    -------
    pandas.DataFrame
        A deep copy of the rows that pass the threshold; index labels are
        preserved.
    '''
    with_votes = df[df['vote_count'].notna()]
    if with_votes.empty:
        # No vote counts at all: the percentile cannot be computed and no
        # movie can pass the threshold.
        return with_votes.copy(deep=True)

    min_votes = np.percentile(with_votes['vote_count'].values, 100 - percent)
    enough_votes = with_votes['vote_count'] > min_votes
    # Copy after filtering so only the kept rows are duplicated.
    return with_votes.loc[enough_votes].copy(deep=True)
def process_text(text):
    '''
    Normalise a text before calculating the tf-idf.

    Runs of whitespace are collapsed to a single space (leading and trailing
    whitespace is removed) and the text is lower-cased.

    Missing values (``None`` or NaN, as pandas produces for absent fields)
    yield an empty string, and any other non-string value is converted with
    ``str`` first, so the function can be applied to a whole column.
    '''
    # NaN is the only float that differs from itself
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # replace every whitespace run with one space, then lowercase
    return ' '.join(text.split()).lower()
def similarity(df,category='title'):
    '''
    Build the movie-to-movie similarity matrix for one text column.

    The texts of ``df[category]`` are turned into TF-IDF vectors (English
    stop words are ignored) and the cosine similarity of every pair of
    vectors is returned.
    '''
    vectorizer = TfidfVectorizer(stop_words='english')
    weights = vectorizer.fit_transform(df[category])
    # comparing the matrix with itself gives the score of every pair
    return cosine_similarity(weights, weights)
def index_from_title(df, title):
    '''
    Return the index label of the first movie whose ``title`` equals
    ``title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Movies; must contain a ``title`` column.
    title : str
        Exact title to look up (the comparison is case-sensitive).

    Raises
    ------
    IndexError
        If no row has this title.
    '''
    is_match = (df['title'] == title).to_numpy()
    labels = df.index[is_match].values
    if len(labels) == 0:
        # Keep raising IndexError, but say which title was not found.
        raise IndexError("no movie with title {!r}".format(title))
    return labels[0]
def title_from_index(df, index):
    '''
    Return the ``title`` of the first row whose index label equals ``index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Movies; must contain a ``title`` column.
    index
        Index label of the movie to look up.

    Raises
    ------
    IndexError
        If no row carries this index label.
    '''
    titles = df.loc[df.index == index, 'title'].values
    if len(titles) == 0:
        # Keep raising IndexError, but say which label was not found.
        raise IndexError("no movie with index {!r}".format(index))
    return titles[0]
def search_engine(query, df, number_of_recommendations):
    '''
    Return the movie titles that best match a free-text query.

    The query is vectorised together with all titles (TF-IDF, English stop
    words removed) and the titles are ranked by cosine similarity to the
    query.

    Parameters
    ----------
    query : str
        Text typed by the user.
    df : pandas.DataFrame
        Movies; must contain a ``title`` column. It is not modified.
    number_of_recommendations : int
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Matching rows of ``df['title']``, best match first. Titles with a
        zero score can appear when fewer than
        ``number_of_recommendations`` titles share a word with the query.
    '''
    titles = df['title']
    if titles.empty or number_of_recommendations <= 0:
        return titles.iloc[[]]

    # Missing titles become empty documents so that every row keeps its
    # position; the query is appended as the last document.
    corpus = [process_text(t) for t in titles.fillna('').astype(str)]
    corpus.append(process_text(query))

    tf_idf_matrix = TfidfVectorizer(stop_words='english').fit_transform(corpus)
    # Only the query row is needed, so compare it against the titles instead
    # of building the full (n+1) x (n+1) similarity matrix.
    scores = cosine_similarity(tf_idf_matrix[-1], tf_idf_matrix[:-1]).ravel()

    # The query is not part of `scores`, so the ranking starts at the very
    # first entry. A stable sort keeps tied titles in their original order.
    best = np.argsort(-scores, kind='stable')[:number_of_recommendations]
    return titles.iloc[best]
# Demo run: load every movie from the database, then print the nine titles
# returned by the search engine for the query 'sword'.
df = dbToDf()
print(
    search_engine(
        query='sword',
        df=df,
        number_of_recommendations=9,
    )
)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment