Merge branch 'search-algo' into 'master'

Search algo See merge request !12

Merge branch 'search-algo' into 'master'
f701b26f · Tom Bray · 203be8f8 · 539ae24b · f701b26f · f701b26f
Commit f701b26f authored 3 years ago by Tom Bray
--- a/algo/recommendation.py
+++ b/algo/recommendation.py
+from pymongo import MongoClient
+import pandas as pd
+import ast
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import seaborn as sns
+import numpy as np
+import matplotlib.pyplot as plt
+def dbToDf(uri=None):
+    '''
+    Load the ``movies_populated`` collection of the ``group3`` database
+    into a pandas DataFrame.
+
+    uri: optional MongoDB connection string. When omitted, the
+         ``MONGODB_URI`` environment variable is used; only if that is
+         unset does the function fall back to the legacy built-in URI.
+
+    Returns a DataFrame with one row per document, restricted to the
+    projected fields (_id, original_title, genre, id, overview, popularity,
+    vote_count, release_date and the name/order of each cast entry).
+    '''
+    import os
+
+    # NOTE(review): the fallback URI embeds credentials in source control.
+    # It is kept only so existing callers keep working; rotate the password
+    # and provide the URI through MONGODB_URI (or the ``uri`` argument).
+    if uri is None:
+        uri = os.environ.get(
+            "MONGODB_URI",
+            "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true",
+        )
+    projection = {
+        "_id": 1,
+        "original_title": 1,
+        "genre": 1,
+        "id": 1,
+        "overview": 1,
+        "popularity": 1,
+        "vote_count": 1,
+        "release_date": 1,
+        "cast": {"name": 1, "order": 1},
+    }
+    # The context manager closes the client once the cursor has been fully
+    # read; the previous version never closed the connection.
+    with MongoClient(uri) as client:
+        collection = client.group3.movies_populated
+        documents = list(collection.find({}, projection))
+    return pd.DataFrame(documents)
+def preFiltering(df,percent=15):
+    '''
+    Keep only the movies whose vote count is strictly greater than the
+    (100 - percent)th percentile of the known vote counts.
+    Rows without a vote count are discarded; the input frame is left as is.
+    '''
+    has_votes = df['vote_count'].notna()
+    rated = df[has_votes]
+    # vote count a movie has to exceed to be kept
+    threshold = np.percentile(rated['vote_count'].values, 100 - percent)
+    popular_enough = rated['vote_count'] > threshold
+    return rated.loc[popular_enough].copy(deep=True)
+def process_text(text):
+    '''
+    Normalise a text before the tf-idf computation: every run of
+    whitespace becomes a single space (leading/trailing whitespace is
+    dropped) and the result is lowercased.
+    '''
+    words = text.split()
+    single_spaced = ' '.join(words)
+    return single_spaced.lower()
+def similarity(df):
+    '''
+    Compute the pairwise similarity between the movies of ``df``.
+
+    The ``overview`` column is vectorised with tf-idf (English stop words
+    removed) and the cosine similarity between every pair of rows is
+    returned as a square matrix, in the row order of ``df``.
+    '''
+    tf_idf = TfidfVectorizer(stop_words='english')
+    tf_idf_matrix = tf_idf.fit_transform(df['overview'])
+    # calculating cosine similarity between movies
+    # (the debug print of the whole tf-idf matrix was removed: it flooded
+    # stdout on every call and the search_engine variant never printed it)
+    cosine_similarity_matrix = cosine_similarity(tf_idf_matrix, tf_idf_matrix)
+    return cosine_similarity_matrix
+def index_from_title(df,title):
+    '''
+    Give the index label of the first row whose ``original_title`` equals
+    ``title``. Raises IndexError when no row matches.
+    '''
+    is_match = df['original_title'] == title
+    matching_labels = df.index[is_match]
+    return matching_labels.values[0]
+def title_from_index(df,index):
+    '''
+    Give the ``original_title`` of the first row whose index label equals
+    ``index``. Raises IndexError when no row matches.
+    '''
+    is_match = df.index == index
+    titles = df.loc[is_match, 'original_title']
+    return titles.values[0]
+def recommendations_on_overview( original_title, df, number_of_recommendations):
+    '''
+    Recommend the movies whose overview is the most similar to the
+    overview of a given movie.
+
+    original_title: exact ``original_title`` of the reference movie.
+    df: DataFrame with at least the ``original_title``, ``overview`` and
+        ``vote_count`` columns; it is not modified.
+    number_of_recommendations: maximum number of titles to return.
+
+    Returns a Series of titles, most similar first. The reference movie
+    itself is never part of the result.
+    Raises IndexError when the title is absent from the data or was
+    removed by the vote-count / missing-overview filtering.
+    '''
+    #prefilter the dataframe
+    df = preFiltering(df)
+    # removing rows with missing overview; the fresh 0..n-1 index makes the
+    # labels coincide with the row positions of the similarity matrix
+    df = df[df['overview'].notna()].reset_index(drop=True)
+    #process text of all overviews (assign builds a new frame, so nothing
+    # is written onto a filtered slice of the caller's data)
+    df = df.assign(overview=df['overview'].map(process_text))
+    index = index_from_title(df, original_title)
+    #calculates similarity scores of all movies
+    calculated_sim = similarity(df)
+    scores = calculated_sim[index]
+    # sorted() is stable, so equally scored movies keep their original order
+    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
+    # Exclude the reference movie by position instead of assuming it is
+    # ranked first: a movie with an identical overview (or an overview made
+    # only of stop words) could otherwise push the reference movie into the
+    # recommendations.
+    recommendations_indices = [i for i in ranked if i != index][:number_of_recommendations]
+    return df['original_title'].iloc[recommendations_indices]
+# Script part: load the movie collection and print the nine movies whose
+# overview is closest to the one of 'Avatar'.
+df = dbToDf()
+_avatar_matches = recommendations_on_overview('Avatar', df, 9)
+print(_avatar_matches)
--- a/algo/search_engine.py
+++ b/algo/search_engine.py
+from doctest import DocFileSuite
+from pymongo import MongoClient
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import matplotlib.pyplot as plt
+def dbToDf(uri=None):
+    '''
+    Load the ``movies_populated`` collection of the ``group3`` database
+    into a pandas DataFrame.
+
+    uri: optional MongoDB connection string. When omitted, the
+         ``MONGODB_URI`` environment variable is used; only if that is
+         unset does the function fall back to the legacy built-in URI.
+
+    Returns a DataFrame with one row per document, restricted to the
+    projected fields (_id, title, overview, vote_count).
+    '''
+    import os
+
+    # NOTE(review): the fallback URI embeds credentials in source control.
+    # It is kept only so existing callers keep working; rotate the password
+    # and provide the URI through MONGODB_URI (or the ``uri`` argument).
+    if uri is None:
+        uri = os.environ.get(
+            "MONGODB_URI",
+            "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true",
+        )
+    projection = {'_id': 1, "title": 1, "overview": 1, "vote_count": 1}
+    # The context manager closes the client once the cursor has been fully
+    # read; the previous version never closed the connection.
+    with MongoClient(uri) as client:
+        collection = client.group3.movies_populated
+        documents = list(collection.find({}, projection))
+    return pd.DataFrame(documents)
+def preFiltering(df,percent=15):
+    '''
+    Keep only the movies whose vote count is strictly greater than the
+    (100 - percent)th percentile of the known vote counts.
+    Rows without a vote count are discarded; the input frame is left as is.
+    '''
+    has_votes = df['vote_count'].notna()
+    rated = df[has_votes]
+    # vote count a movie has to exceed to be kept
+    threshold = np.percentile(rated['vote_count'].values, 100 - percent)
+    popular_enough = rated['vote_count'] > threshold
+    return rated.loc[popular_enough].copy(deep=True)
+def process_text(text):
+    '''
+    Normalise a text before the tf-idf computation: every run of
+    whitespace becomes a single space (leading/trailing whitespace is
+    dropped) and the result is lowercased.
+    '''
+    words = text.split()
+    single_spaced = ' '.join(words)
+    return single_spaced.lower()
+def similarity(df,category='title'):
+    '''
+    Compute the pairwise similarity between the rows of ``df``.
+
+    The ``category`` column is vectorised with tf-idf (English stop words
+    removed) and the cosine similarity between every pair of rows is
+    returned as a square matrix, in the row order of ``df``.
+    '''
+    texts = df[category]
+    vectorizer = TfidfVectorizer(stop_words='english')
+    weights = vectorizer.fit_transform(texts)
+    # every row compared against every row
+    return cosine_similarity(weights, weights)
+def index_from_title(df,title):
+    '''
+    Give the index label of the first row whose ``title`` equals the
+    requested title. Raises IndexError when no row matches.
+    '''
+    is_match = df['title'] == title
+    matching_labels = df.index[is_match]
+    return matching_labels.values[0]
+def title_from_index(df,index):
+    '''
+    Give the ``title`` of the first row whose index label equals
+    ``index``. Raises IndexError when no row matches.
+    '''
+    is_match = df.index == index
+    titles = df.loc[is_match, 'title']
+    return titles.values[0]
+def search_engine( query, df, number_of_recommendations):
+    '''
+    Rank the movie titles of ``df`` by tf-idf cosine similarity to a
+    free-text query.
+
+    query: text typed by the user.
+    df: DataFrame with a ``title`` column; it is not modified.
+    number_of_recommendations: maximum number of titles to return.
+
+    Returns a Series with the best matching titles, most similar first.
+    Titles that share no word with the query score 0 and are only used to
+    fill the result when there are not enough real matches.
+    '''
+    #process text of all titles; missing titles become empty strings so
+    # they keep their row position and simply score 0
+    corpus = [process_text(title) for title in df['title'].fillna('')]
+    # the query goes in as the last document, so the last row of the
+    # similarity matrix holds its score against every title
+    corpus.append(process_text(query))
+    #calculates similarity scores of all movies
+    calculated_sim = similarity(pd.DataFrame({'title': corpus}), 'title')
+    # drop the final entry: the query compared with itself
+    query_scores = calculated_sim[-1][:-1]
+    # sorted() is stable, so equally scored titles keep their original order
+    ranked = sorted(range(len(query_scores)), key=lambda i: query_scores[i], reverse=True)
+    # The query was already removed above, so the ranking starts at 0; the
+    # previous [1:n+1] slice silently discarded the best match.
+    recommendations_indices = ranked[:number_of_recommendations]
+    return df['title'].iloc[recommendations_indices]
+# Script part: load the movie collection and print the nine titles that
+# best match the query 'sword'.
+df = dbToDf()
+_sword_matches = search_engine('sword', df, 9)
+print(_sword_matches)