Skip to content
Snippets Groups Projects
Select Git revision
  • 3ea68da8f28312fc825ce2e5d7807a22f2c31e34
  • master default
2 results

NotificationRepository.php

Blame
  • search_engine.py 2.75 KiB
    from doctest import DocFileSuite
    from pymongo import MongoClient
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np
    import matplotlib.pyplot as plt
    
    
    def dbToDf():
        '''
        Load the movie collection from MongoDB into a pandas DataFrame.

        Connects to the ``group3`` database, reads every document of the
        ``movies_populated`` collection and keeps only the ``_id``, ``title``,
        ``overview`` and ``vote_count`` fields.

        Returns:
            pandas.DataFrame built from the fetched documents.
        '''
        # NOTE(review): the credentials are hard-coded in the connection
        # string. They should be rotated and read from configuration or the
        # environment instead of living in source control.
        uri = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"
        projection = {'_id':1, "title":1, "overview":1, "vote_count":1}

        # The context manager closes the client even when the query fails;
        # previously the connection was never released.
        with MongoClient(uri) as client:
            collection = client.group3.movies_populated
            # Read the whole cursor while the connection is still open.
            documents = list(collection.find({}, projection))

        return pd.DataFrame(documents)
    
    def preFiltering(df,percent=15):
        '''
        Keep only the movies with the highest vote counts.

        Rows without a ``vote_count`` are discarded first. Of the remaining
        rows, only those whose ``vote_count`` is strictly greater than the
        ``100 - percent`` percentile of the vote counts are kept.

        A new DataFrame is returned; ``df`` itself is not modified.
        '''
        rated = df[df['vote_count'].notna()]

        # Vote-count threshold below which (or at which) a movie is dropped.
        threshold = np.percentile(rated['vote_count'].values, 100 - percent)
        is_popular = rated['vote_count'] > threshold

        return rated.copy(deep=True).loc[is_popular]
    
    def process_text(text):
        '''
        Normalise a text before the tf-idf computation.

        Every run of whitespace is collapsed into a single space (leading and
        trailing whitespace disappears) and the result is lowercased.
        '''
        # str.split() without arguments splits on any whitespace run
        words = text.split()

        return ' '.join(words).lower()
    
    def similarity(df,category='title'):
        '''
        Compute the pairwise cosine similarity of the texts in one column.

        The column ``df[category]`` is vectorised with tf-idf (English stop
        words removed) and every text is compared with every other one.
        Returns the resulting similarity matrix, one row and one column per
        entry of ``df[category]``.
        '''
        vectorizer = TfidfVectorizer(stop_words='english')
        weights = vectorizer.fit_transform(df[category])

        # each text against each text, including itself
        return cosine_similarity(weights, weights)
    
    def index_from_title(df,title):
        '''
        Look up the index label of the first movie whose title equals
        ``title`` exactly. Raises IndexError when no movie matches.
        '''
        same_title = df['title'] == title
        matching_rows = df[same_title]
        return matching_rows.index.values[0]
    
    def title_from_index(df,index):
        '''
        Look up the title of the first movie whose index label equals
        ``index``. Raises IndexError when the label is not present.
        '''
        has_label = df.index == index
        matching_rows = df[has_label]
        return matching_rows['title'].values[0]
    
    def search_engine( query, df, number_of_recommendations):
        '''
        Return the movie titles that are most similar to a free-text query.

        The query is appended to the normalised list of titles, the tf-idf
        cosine similarities are computed over that corpus, and the titles
        with the highest similarity to the query are returned, best first.

        Parameters:
            query: search text; normalised with process_text like the titles.
            df: DataFrame with a 'title' column. It is not modified.
            number_of_recommendations: maximum number of titles to return.

        Returns:
            The selected rows of ``df['title']`` as a pandas Series, ordered
            by decreasing similarity to the query.
        '''
        # Missing titles become empty strings so that process_text does not
        # fail on NaN; such rows simply score 0 against any query.
        titles = [process_text(t) for t in df['title'].fillna('').astype(str)]

        # Build a fresh frame with the query as the LAST document instead of
        # writing into a slice of df (which triggered SettingWithCopyWarning).
        corpus = pd.DataFrame({'title': titles + [process_text(query)]})

        # calculates similarity scores of all movies
        calculated_sim = similarity(corpus, 'title')

        # The last row compares the query with every document; its final
        # entry is the query against itself and is discarded.
        query_scores = calculated_sim[-1][:-1]

        # Stable sort on the negated scores: descending order, ties keep the
        # original row order.
        ranking = np.argsort(-query_scores, kind='stable')

        # The query's own entry is already gone, so the best match sits at
        # position 0. The previous slice [1:n+1] silently dropped it.
        best = ranking[:max(number_of_recommendations, 0)]

        # Positions in the corpus line up with row positions in df.
        return df['title'].iloc[best]
    
    # Load the movie data from the database when the module is executed.
    df = dbToDf()

    # Example search: print the 9 titles returned for the query 'sword'.
    print(search_engine(query='sword', df=df, number_of_recommendations=9))