from pymongo import MongoClient
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def dbToDf():
    '''
    Load the movies collection from MongoDB into a pandas DataFrame.

    Only the "_id", "title" and "vote_count" fields of every document in
    the "movies_populated" collection of the "group3" database are fetched.

    Returns:
        A DataFrame with one row per document returned by the query.
    '''
    # SECURITY(review): the connection string embeds credentials in the
    # source file; consider loading it from configuration or the environment.
    # The client is used as a context manager so that its connections are
    # closed once the documents have been read.
    with MongoClient("mongodb://group3:GJF6cQqM4RLxBfNb@cs2022.lmichelin.fr:27017/group3?ssl=true") as client:
        collection = client.group3.movies_populated
        cursor = collection.find({}, {'_id': 1, "title": 1, "vote_count": 1})
        # The cursor must be fully consumed before the client is closed.
        documents = list(cursor)

    return pd.DataFrame(documents)

def process_text(text):
    '''
    Normalise a text before it is fed to the tf-idf computation.

    Runs of whitespace are collapsed into a single space (leading and
    trailing whitespace is dropped) and the result is lowercased.
    '''
    # split() without arguments cuts on any whitespace run, so re-joining
    # the pieces with one space squeezes the text
    squeezed = ' '.join(text.split())
    return squeezed.lower()

def similarity(df, category='title'):
    '''
    Compute the pairwise cosine similarity of the texts in one column.

    The texts in df[category] are turned into tf-idf vectors (English stop
    words are ignored) and the resulting matrix is compared with itself.
    The value returned is the output of cosine_similarity for those vectors.
    '''
    vectorizer = TfidfVectorizer(stop_words='english')
    weights = vectorizer.fit_transform(df[category])

    # every row of the tf-idf matrix against every row of the same matrix
    return cosine_similarity(weights, weights)

def search_engine( query, df, number_of_recommendations):
    '''
    Return the movie titles that are most similar to a text query.

    The query is appended to the list of movie titles, every text is
    normalised with process_text, and the tf-idf cosine similarity between
    the query and each title is obtained from similarity. Titles are then
    ranked from the most to the least similar.

    Parameters:
        query: the text to search for.
        df: DataFrame with a 'title' column; it is not modified.
        number_of_recommendations: maximum number of titles to return
            (negative values are treated as 0).

    Returns:
        The entries of df['title'] for the best matches, best match first.
    '''
    # Build the corpus as a plain list so the query is always the last
    # entry, whatever index labels df carries, and so that df itself is
    # never written to. Missing titles become empty strings because
    # process_text only works on str values.
    corpus = df['title'].fillna('').astype(str).tolist()
    corpus.append(query)

    # process text of all titles (and of the query)
    processed = pd.DataFrame({'title': [process_text(text) for text in corpus]})

    # the last row of the matrix holds the query's similarity to every text
    query_scores = similarity(processed, 'title')[-1]

    # drop the final score: it is the query compared with itself
    ranked = sorted(enumerate(query_scores[:-1]), key=lambda pair: pair[1], reverse=True)

    # The query has already been removed, so the ranking starts at the best
    # match; slicing from position 0 keeps it in the results.
    top_n = max(number_of_recommendations, 0)
    recommendations_indices = [position for position, _ in ranked[:top_n]]

    return df['title'].iloc[recommendations_indices]

# Fetch the movie data from the database; this runs whenever the file is
# executed or imported.
df = dbToDf()

# Example search: print the result of looking up 'sword' with 5 recommendations.
print(search_engine(query='sword', df=df, number_of_recommendations=5))