Skip to content
Snippets Groups Projects
Select Git revision
  • 97b9de1d541819d09e472ce8e8662ee34dfd2146
  • main default
  • tp3
  • tp2
  • tp1
  • tp3-correction
  • tp2-correction
  • tp1-correction
  • admins
9 results

test_expression.py

Blame
  • Forked from an inaccessible project.
    adreco.py 6.81 KiB
    from pymongo import MongoClient
    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import seaborn as sns
    import numpy as np
    
    ### Parameters ###
    
    # Feature weights: formatingFeatures repeats the text of each field this
    # many times, so a larger weight makes that field count more in the
    # token-count vectors used for the cosine similarity.
    w_genres = 10
    w_keywords = 17
    w_actor = 15
    w_director = 15
    w_release_date = 8
    
    #w_genres = 1
    #w_keywords = 1
    #w_actor = 1
    #w_director = 1
    #w_release_date = 1
    
    
    def movieDbToDf():
        '''
        Load the movie collection from MongoDB into a pandas DataFrame.

        Only the fields listed in the projection are fetched from the
        ``movies_populated`` collection of the ``group3`` database.

        Returns:
            A pandas DataFrame with one row per fetched document.
        '''
        # NOTE(review): the connection string embeds credentials in source
        # control; consider moving it to configuration or the environment.
        mongo_uri = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"

        #projection on useful data
        projection = {"_id":1, "id":1, "original_title":1, "genre_ids": 1,  "overview":1, "vote_count":1, "release_date":1, "main_actor":1, "director":1, "keywords":1}

        # The cursor is fully materialised inside the ``with`` block so the
        # client can be closed afterwards instead of leaking the connection.
        with MongoClient(mongo_uri) as client:
            documents = list(client.group3.movies_populated.find({}, projection))

        return pd.DataFrame(documents)
    
    def preFiltering(df,percent=90):
        '''
        Keep only the movies that have enough votes to be evaluated.

        Rows without a vote count are discarded first. The threshold is the
        (100 - percent)-th percentile of the remaining vote counts, and only
        rows strictly above that threshold are returned, as a deep copy.
        '''
        voted = df[df['vote_count'].notna()]
        threshold = np.percentile(voted['vote_count'].values, 100 - percent)
        has_enough_votes = voted['vote_count'] > threshold

        return voted.loc[has_enough_votes].copy(deep=True)
    
    def process_text(text):
        '''
        Normalise a text before it is vectorised: every run of whitespace is
        collapsed into a single space (leading/trailing whitespace removed),
        then the result is lowercased.
        '''
        words = text.split()
        single_spaced = ' '.join(words)
        return single_spaced.lower()
    
    def dfToVectMatrix(df):
        """
        Build the token-count matrix of the 'features' column of a dataframe.

        The vectorizer is created with the English stop-word list and fitted
        on the column before transforming it.
        """
        return CountVectorizer(stop_words='english').fit_transform(df['features'])
    
    def similarity(df):
        '''
        Compute the pairwise cosine similarity between all the movies of a
        dataframe, based on the count vectors of their 'features' column.
        '''
        counts = dfToVectMatrix(df)
        return cosine_similarity(counts, counts)
    
    def index_from_title(df,title):
        '''
        Look up the index label of the first movie whose 'original_title'
        equals the given title.
        '''
        same_title = df['original_title'] == title
        matching_rows = df.loc[same_title]
        return matching_rows.index.values[0]
    
    def title_from_index(df,index):
        '''
        Look up the 'original_title' of the first row whose index label
        equals the given index.
        '''
        titles = df.loc[df.index == index, 'original_title']
        return titles.values[0]
    
    def id_from_index(df,index):
        '''
        Look up the '_id' of the first row whose index label equals the
        given index.
        '''
        ids = df.loc[df.index == index, '_id']
        return ids.values[0]
    
    def index_from_id(df,id):
        '''
        Return the index label of the first movie whose '_id' equals the
        given id.

        Raises:
            IndexError: if no row has this '_id'.
        '''
        # A leftover debug print that looked up the title 'Uncharted' used to
        # run here; it made the function fail whenever that title (or the
        # 'original_title' column) was absent, so it has been removed.
        return df[df['_id']==id].index.values[0]
    
    
    def recommendations(original_title, df, number_of_recommendations):
        '''
        Return the titles of the movies most similar to a given movie.

        Args:
            original_title: title of the reference movie.
            df: movies dataframe. A 'features' column is added to it (or
                overwritten) as a side effect.
            number_of_recommendations: maximum number of titles to return.

        Returns:
            The 'original_title' values of the most similar movies, ordered by
            decreasing cosine similarity, the reference movie excluded.

        Raises:
            IndexError: if no movie has the requested title.
        '''
        # Work with the row *position*: the similarity row and the final
        # ``iloc`` are positional, so an index label would be wrong as soon as
        # the dataframe does not carry a default 0..n-1 index.
        title_matches = np.flatnonzero((df['original_title'] == original_title).to_numpy())
        if title_matches.size == 0:
            raise IndexError(f"no movie titled {original_title!r}")
        position = int(title_matches[0])

        #creates features column
        df['features'] = df.apply(formatingFeatures, axis=1)
        df['features'] = df['features'].map(process_text)

        # Only the row of the reference movie is needed, not the full
        # movie-by-movie similarity matrix.
        vect_matrix = dfToVectMatrix(df)
        scores = cosine_similarity(vect_matrix[position:position + 1], vect_matrix)[0]

        # Stable descending order keeps ties in their original row order.
        ranked = np.argsort(-scores, kind='stable')

        # Drop the reference movie explicitly instead of assuming it is ranked
        # first (another movie with identical features could tie with it).
        ranked = ranked[ranked != position][:number_of_recommendations]

        return df['original_title'].iloc[ranked]
    
    def formatingFeatures(df_row):
        '''
        Build the weighted feature string of one movie row.

        The genre ids, keywords, main actor, director and release date are
        converted to text, and each text is repeated according to its
        module-level weight (w_genres, w_keywords, w_actor, w_director,
        w_release_date). The repeated texts are joined with single spaces.

        Args:
            df_row: a row exposing 'genre_ids', 'keywords', 'main_actor',
                'director' and 'release_date'.

        Returns:
            The weighted feature string.
        '''
        def joined(values):
            # NOTE(review): a field missing from the document presumably
            # arrives as None or NaN (a float); neither is iterable, so it is
            # treated as empty instead of raising TypeError.
            if values is None or isinstance(values, float):
                return ''
            return ' '.join(str(value) for value in values)

        # Scalar fields are stringified as they are (a NaN becomes 'nan').
        weighted_parts = (
            (joined(df_row['genre_ids']), w_genres),
            (joined(df_row['keywords']), w_keywords),
            (str(df_row['main_actor']), w_actor),
            (str(df_row['director']), w_director),
            (str(df_row['release_date']), w_release_date),
        )

        return ' '.join(' '.join([text] * weight) for text, weight in weighted_parts)
    
    def userDbToDf():
        '''
        Load the user collection from MongoDB into a pandas DataFrame.

        Only the '_id', 'liked_movies' and 'update' fields are fetched from
        the ``users`` collection of the ``group3`` database.

        Returns:
            A pandas DataFrame with one row per fetched document.
        '''
        # NOTE(review): the connection string embeds credentials in source
        # control; consider moving it to configuration or the environment.
        mongo_uri = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"

        #projection on useful data
        projection = {"_id":1, "liked_movies": 1, "update":1}

        # The cursor is fully materialised inside the ``with`` block so the
        # client can be closed afterwards instead of leaking the connection.
        with MongoClient(mongo_uri) as client:
            documents = list(client.group3.users.find({}, projection))

        return pd.DataFrame(documents)
    
    def user_profile( user_index, moviesdf, usersdf, vectMatrix, number_of_recommendations=100 ):
        '''
        Recommend movies to a user from the movies they liked.

        The user profile is the mean of the feature vectors of the liked
        movies; every movie is ranked by cosine similarity to that profile.

        Args:
            user_index: position of the user in usersdf.
            moviesdf: movies dataframe; its rows are expected to line up with
                the rows of vectMatrix.
            usersdf: users dataframe with a 'liked_movies' column holding
                movie '_id' values.
            vectMatrix: feature matrix of the movies.
            number_of_recommendations: maximum number of results.

        Returns:
            A list of row indices into vectMatrix / moviesdf, ordered by
            decreasing similarity, or None when the user has no liked movie.
        '''
        liked_ids = usersdf['liked_movies'].iloc[user_index]

        # NOTE(review): a user document without 'liked_movies' presumably
        # yields None or NaN here; treat it like an empty list.
        if liked_ids is None or isinstance(liked_ids, float):
            return None

        liked_rows = [int(index_from_id(moviesdf, movie_id)) for movie_id in liked_ids]
        if not liked_rows:
            return None

        # Mean of the liked rows, forced to a plain (1, n_features) array.
        profile = np.asarray(vectMatrix[liked_rows].mean(axis=0)).reshape(1, -1)
        scores = cosine_similarity(profile, vectMatrix)[0]

        # Stable descending order keeps ties in their original row order.
        ranked = np.argsort(-scores, kind='stable').tolist()

        # Exclude the movies the user already liked explicitly, rather than
        # blindly dropping whichever movie happens to be ranked first.
        already_liked = set(liked_rows)
        candidates = [row for row in ranked if row not in already_liked]

        return candidates[:number_of_recommendations]
        
    def loadRecDB():
        '''
        Open a connection to MongoDB and return the 'recommendations'
        collection of the group3 database.

        The client is left open because the returned collection uses it.
        '''
        mongo_client = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
        return mongo_client.group3['recommendations']
    
    def updateDB():
        '''
        Recompute the recommendations of every user and store them.

        Movies and users are loaded from MongoDB, the movie feature matrix is
        built once, and for each user the recommended movie ids are upserted
        into the 'recommendations' collection under the user's id. A user
        without liked movies gets an empty recommendation list.
        '''
        #loadDB
        moviesdf = movieDbToDf()
        usersdf = userDbToDf()
        recdb = loadRecDB()

        #creates features column
        moviesdf['features'] = moviesdf.apply(formatingFeatures, axis=1)
        moviesdf['features'] = moviesdf.apply(lambda x: process_text(x.features), axis=1)

        #feature matrix shared by all the users
        vect_matrix = dfToVectMatrix(moviesdf)

        # user_profile addresses users by position, so iterate positionally.
        for position, user_id in enumerate(usersdf['_id']):
            rec_indices = user_profile(position, moviesdf, usersdf, vect_matrix)

            if rec_indices is not None:
                recommended_movies = [int(movie_id) for movie_id in moviesdf['id'].iloc[rec_indices]]
            else:
                recommended_movies = []

            record = {"user_id": user_id, "recommended_movies": recommended_movies}

            #update db (pipeline-style update, creates the document if missing)
            recdb.update_one({ "user_id": user_id }, [{"$set": record}], upsert=True)
                
    
    # Module-level call: the whole recommendation refresh runs as soon as this
    # file is executed or imported.
    updateDB()