# adreco.py

from pymongo import MongoClient
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import numpy as np

### Parameters ###

# Repetition weights used by formatingFeatures: the text of each feature is
# repeated this many times inside the 'features' string, so a larger weight
# makes that feature count more in the count-based cosine similarity.
w_genres = 10
w_keywords = 17
w_actor = 15
w_director = 15
w_release_date = 8

# Alternative uniform weighting (every feature repeated once), kept disabled:
#w_genres = 1
#w_keywords = 1
#w_actor = 1
#w_director = 1
#w_release_date = 1


def movieDbToDf():
    '''
    Load the movie collection from MongoDB into a pandas DataFrame.

    Reads every document of ``group3.movies_populated``, keeping only the
    fields listed in the projection below.

    Returns:
        pandas.DataFrame built from the list of fetched documents.
    '''
    # NOTE(review): the connection string embeds credentials in source code;
    # consider loading it from configuration instead.
    uri = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"

    #projection on useful data
    projection = {"_id":1, "id":1, "original_title":1, "genre_ids": 1,  "overview":1, "vote_count":1, "release_date":1, "main_actor":1, "director":1, "keywords":1}

    # The client is used as a context manager so its connections are released
    # once the cursor has been fully read into memory.
    with MongoClient(uri) as client:
        collection = client.group3.movies_populated
        documents = list(collection.find({}, projection))

    return pd.DataFrame(documents)

def preFiltering(df,percent=90):
    '''
    Keep only the movies whose vote count exceeds a percentile threshold.

    Rows with a missing 'vote_count' are dropped first. The threshold is the
    (100 - percent)-th percentile of the remaining vote counts, and only rows
    with a strictly greater 'vote_count' are returned (as a new DataFrame).
    '''
    rated = df[df['vote_count'].notna()]
    threshold = np.percentile(rated['vote_count'].values, 100 - percent)
    has_enough_votes = rated['vote_count'] > threshold

    return rated.copy(deep=True).loc[has_enough_votes]

def process_text(text):
    '''
    Normalise a text before it is vectorized.

    Every run of whitespace is collapsed into a single space (leading and
    trailing whitespace is removed) and the result is lowercased.
    '''
    words = text.split()
    single_spaced = ' '.join(words)
    return single_spaced.lower()

def dfToVectMatrix(df):
    """
    Vectorize the 'features' column of a dataframe.

    A CountVectorizer configured with English stop words is fitted on
    df['features'] and the resulting document-term matrix is returned.
    """
    return CountVectorizer(stop_words='english').fit_transform(df['features'])

def similarity(df):
    '''
    Compute the pairwise cosine similarity between all movies.

    The movies are vectorized with dfToVectMatrix (from their 'features'
    column) and the matrix is compared against itself.
    '''
    movie_vectors = dfToVectMatrix(df)
    return cosine_similarity(movie_vectors, movie_vectors)

def index_from_title(df,title):
    '''
    Return the index label of the first movie whose 'original_title' equals
    the given title (IndexError if there is none).
    '''
    title_matches = df['original_title'] == title
    matching_rows = df.loc[title_matches]
    return matching_rows.index.values[0]

def title_from_index(df,index):
    '''
    Return the 'original_title' of the first row whose index label equals
    the given index (IndexError if there is none).
    '''
    matching_rows = df.loc[df.index == index]
    titles = matching_rows.original_title.values
    return titles[0]

def id_from_index(df,index):
    '''
    Return the '_id' of the first row whose index label equals the given
    index (IndexError if there is none).
    '''
    matching_rows = df.loc[df.index == index]
    ids = matching_rows._id.values
    return ids[0]

def index_from_id(df,id):
    '''
    Return the index label of the first movie whose '_id' equals the given id.

    Raises:
        IndexError: if no row of df has this '_id'.
    '''
    # The former debug print looked up the title 'Uncharted' on every call and
    # raised whenever that column or movie was absent; it has been removed.
    return df[df['_id']==id].index.values[0]


def recommendations(original_title, df, number_of_recommendations):
    '''
    Return the titles of the movies most similar to a given movie.

    A 'features' column is (re)computed on df itself, so the DataFrame passed
    by the caller is modified in place. Similarity is the cosine similarity
    between the count vectors of the 'features' strings.

    Args:
        original_title: title of the reference movie ('original_title' column).
        df: movies DataFrame with the columns read by formatingFeatures.
        number_of_recommendations: maximum number of titles to return.

    Returns:
        pandas.Series of titles, most similar first, never containing the
        reference movie itself.

    Raises:
        IndexError: if no movie has the given title.
    '''
    #prefilter the dataframe
    #df=preFiltering(df)

    #creates features column
    df['features'] = df.apply(formatingFeatures, axis=1)
    df['features'] = df['features'].apply(process_text)

    # Row *position* of the reference movie: the vectorized matrix is indexed
    # by position, which differs from the index label when df was filtered.
    matches = np.flatnonzero((df['original_title'] == original_title).to_numpy())
    if matches.size == 0:
        raise IndexError("no movie titled %r" % (original_title,))
    position = int(matches[0])

    vect_matrix = dfToVectMatrix(df)

    # Only the reference movie's row is needed, not the full N x N matrix.
    scores = cosine_similarity(vect_matrix[position], vect_matrix)[0]

    # Stable descending sort; the reference movie is excluded explicitly
    # instead of assuming it is the first entry (ties can break that).
    ranked = [int(i) for i in np.argsort(-scores, kind='stable') if int(i) != position]

    return df['original_title'].iloc[ranked[:number_of_recommendations]]

def _feature_tokens(value):
    '''Return the items of value as strings, or [] when value is not iterable (None, NaN).'''
    try:
        return [str(item) for item in value]
    except TypeError:
        return []


def formatingFeatures(df_row):
    '''
    Build the weighted 'features' string of one movie row.

    Genres, keywords, main actor, director and release date are each turned
    into text and repeated w_genres, w_keywords, w_actor, w_director and
    w_release_date times respectively, then joined with single spaces.

    Args:
        df_row: mapping-like row providing 'genre_ids', 'keywords',
            'main_actor', 'director' and 'release_date'.

    Returns:
        The concatenated feature string.
    '''
    # A missing 'genre_ids' / 'keywords' value (None or NaN) is treated as an
    # empty list instead of raising a TypeError.
    genres = ' '.join(_feature_tokens(df_row['genre_ids']))
    keywords = ' '.join(_feature_tokens(df_row['keywords']))

    weighted_parts = [
        (genres, w_genres),
        (keywords, w_keywords),
        (str(df_row['main_actor']), w_actor),
        (str(df_row['director']), w_director),
        (str(df_row['release_date']), w_release_date),
    ]

    return ' '.join(' '.join([text] * weight) for text, weight in weighted_parts)

def userDbToDf():
    '''
    Load the user collection from MongoDB into a pandas DataFrame.

    Reads every document of ``group3.users``, keeping only the '_id',
    'liked_movies' and 'update' fields.

    Returns:
        pandas.DataFrame built from the list of fetched documents.
    '''
    # NOTE(review): the connection string embeds credentials in source code;
    # consider loading it from configuration instead.
    uri = "mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true"

    #projection on useful data
    projection = {"_id":1, "liked_movies": 1, "update":1}

    # The client is used as a context manager so its connections are released
    # once the cursor has been fully read into memory.
    with MongoClient(uri) as client:
        collection = client.group3.users
        documents = list(collection.find({}, projection))

    return pd.DataFrame(documents)

def user_profile(user_index, moviesdf, usersdf, vectMatrix, number_of_recommendations=100):
    '''
    Recommend movies to a user from the movies they liked.

    The vectors of the liked movies are summed into a profile vector, which is
    compared by cosine similarity with every movie of vectMatrix.

    Args:
        user_index: row position of the user in usersdf.
        moviesdf: movies DataFrame; its '_id' column is matched against the
            user's 'liked_movies' entries.
        usersdf: users DataFrame with a 'liked_movies' column.
        vectMatrix: vectorized movie features, one row per row of moviesdf
            (same order).
        number_of_recommendations: maximum number of positions returned.

    Returns:
        A list of row positions into moviesdf / vectMatrix, most similar
        first, excluding the movies the user already liked; or None when the
        user has no liked movie that exists in moviesdf.
    '''
    liked_ids = usersdf['liked_movies'].iloc[user_index]
    try:
        liked_ids = list(liked_ids)
    except TypeError:
        # The field is missing for this user (None / NaN).
        return None
    if not liked_ids:
        return None

    # Map every movie '_id' to its row position (first occurrence wins);
    # vectMatrix is addressed by position, not by index label.
    position_of = {}
    for position, movie_id in enumerate(moviesdf['_id']):
        position_of.setdefault(movie_id, position)

    # Liked movies that are no longer in the catalogue are ignored.
    liked_positions = [position_of[movie_id] for movie_id in liked_ids if movie_id in position_of]
    if not liked_positions:
        return None

    # Cosine similarity is scale invariant, so the sum of the liked vectors
    # ranks movies exactly like their average would.
    profile = np.asarray(vectMatrix[liked_positions].sum(axis=0)).reshape(1, -1)
    scores = cosine_similarity(profile, vectMatrix)[0]

    # Stable descending sort; already liked movies are filtered out explicitly
    # rather than blindly dropping the best match.
    already_liked = set(liked_positions)
    ranked = [int(p) for p in np.argsort(-scores, kind='stable') if int(p) not in already_liked]

    return ranked[:number_of_recommendations]

def loadRecDB():
    '''
    Open a MongoDB connection and return the 'recommendations' collection of
    the group3 database.
    '''
    mongo = MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true")
    return mongo.group3['recommendations']

def updateDB():
    '''
    Recompute and store the movie recommendations of every user.

    Loads the movies and users from MongoDB, vectorizes the movie features,
    asks user_profile for each user's recommendations and upserts one document
    per user ({"user_id", "recommended_movies"}) into the 'recommendations'
    collection. Users without recommendations get an empty list.
    '''
    #loadDB
    moviesdf = movieDbToDf()
    usersdf = userDbToDf()
    recdb = loadRecDB()

    #creates features column
    moviesdf['features'] = moviesdf.apply(formatingFeatures, axis=1)
    moviesdf['features'] = moviesdf['features'].apply(process_text)

    #vectorizes the features of all movies
    vect_matrix = dfToVectMatrix(moviesdf)

    # Users are addressed by row position, which is what user_profile expects.
    for position in range(len(usersdf)):
        user_id = usersdf['_id'].iloc[position]

        rec_indices = user_profile(position, moviesdf, usersdf, vect_matrix)

        recommended_movies = []
        if rec_indices is not None:
            # rec_indices are row positions; read the ids back by position
            # too, keeping the ranking order.
            recommended_movies = [int(movie_id) for movie_id in moviesdf['id'].iloc[rec_indices]]

        record = {"user_id": user_id, "recommended_movies": recommended_movies}

        #update db:
        recdb.update_one({ "user_id": user_id }, [{"$set": record}], upsert=True)


# Refreshes the stored recommendations of every user. The call is not guarded
# by `if __name__ == "__main__":`, so it also runs when this module is imported.
updateDB()