"""Title-based movie search: TF-IDF vectors + cosine similarity over a MongoDB collection."""
import os

import pandas as pd
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): these credentials are committed to source control. They are
# kept only so existing deployments keep working; rotate them and supply the
# connection string through the MONGODB_URI environment variable instead.
DEFAULT_MONGO_URI = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"


def dbToDf():
    """Load the movie collection from MongoDB into a pandas DataFrame.

    The connection string is read from the ``MONGODB_URI`` environment
    variable, falling back to ``DEFAULT_MONGO_URI``. Only the ``_id``,
    ``title`` and ``vote_count`` fields of ``group3.movies_populated`` are
    fetched.

    Returns:
        pandas.DataFrame: one row per document returned by the query.
    """
    uri = os.environ.get("MONGODB_URI", DEFAULT_MONGO_URI)
    projection = {"_id": 1, "title": 1, "vote_count": 1}
    # The context manager closes the client (and its connection pool) even if
    # the query fails; the cursor is fully materialized before that happens.
    with MongoClient(uri) as client:
        collection = client.group3.movies_populated
        records = list(collection.find({}, projection))
    return pd.DataFrame(records)


def process_text(text):
    """Normalize a text before computing its TF-IDF vector.

    Runs of whitespace are collapsed to a single space (leading and trailing
    whitespace is dropped as a side effect of ``str.split``) and the result
    is lowercased.

    Args:
        text (str): raw text.

    Returns:
        str: the normalized text.
    """
    return " ".join(text.split()).lower()


def similarity(df, category="title"):
    """Compute the pairwise cosine similarity between the rows of ``df``.

    Each value of ``df[category]`` is turned into a TF-IDF vector (English
    stop words removed) and every vector is compared with every other one.

    Args:
        df (pandas.DataFrame): frame holding the text column.
        category (str): name of the text column to vectorize.

    Returns:
        numpy.ndarray: square matrix with one row and one column per row of
        ``df``. Its size grows quadratically with ``len(df)``.
    """
    tf_idf = TfidfVectorizer(stop_words="english")
    tf_idf_matrix = tf_idf.fit_transform(df[category])
    return cosine_similarity(tf_idf_matrix, tf_idf_matrix)


def search_engine(query, df, number_of_recommendations):
    """Return the titles of ``df`` that are most similar to ``query``.

    Titles and query are normalized with ``process_text`` and vectorized
    together with TF-IDF (English stop words removed); titles are then ranked
    by cosine similarity to the query, best match first. Ties keep the row
    order of ``df``.

    Args:
        query (str): free-text search string.
        df (pandas.DataFrame): frame with a ``title`` column.
        number_of_recommendations (int): maximum number of titles to return;
            values below 1 yield an empty result.

    Returns:
        pandas.Series: up to ``number_of_recommendations`` entries of
        ``df['title']``, ordered from most to least similar.

    Raises:
        ValueError: raised by scikit-learn when neither the titles nor the
            query contain any term other than stop words.
    """
    if df.empty:
        # Nothing to rank (also covers a frame built from an empty collection,
        # which has no 'title' column at all).
        return pd.Series([], dtype=object, name="title")

    # Missing titles become empty documents: they stay in place so positions
    # still line up with df, and they can never match a query.
    titles = [
        "" if pd.isna(title) else process_text(str(title))
        for title in df["title"]
    ]
    # The query is vectorized together with the titles so that it shares
    # their vocabulary and IDF weights; it is the last row of the matrix.
    tf_idf_matrix = TfidfVectorizer(stop_words="english").fit_transform(
        titles + [process_text(query)]
    )
    # Compare the query against the titles only; there is no need for the
    # full title-by-title matrix, and the query never competes with itself.
    scores = cosine_similarity(tf_idf_matrix[-1], tf_idf_matrix[:-1])[0]

    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    limit = max(number_of_recommendations, 0)
    best_positions = [position for position, _ in ranked[:limit]]
    return df["title"].iloc[best_positions]


if __name__ == "__main__":
    df = dbToDf()
    print(search_engine("sword", df, 5))