Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
# NLP Tutorials - Part 3: Document Similarity
import re
import nltk
import numpy as np
import unicodedata
import pandas as pd

# Fetch the NLTK resources this script relies on: the English stop-word list
# and the WordNet data behind WordNetLemmatizer. 'punkt' is presumably for
# word_tokenize, which is imported but not called in the visible code.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# Load the movie metadata from a CSV file in the working directory.
data = pd.read_csv('tmdb_5000_movies.csv')

# Replace every null with an empty string so the text-processing step below
# is always handed a str rather than NaN.
data.fillna('', inplace=True)

# Text-processing resources shared by process_text().
# A frozenset gives O(1) membership tests; the stop-word check runs once per
# token across the whole corpus, so a list lookup would be needlessly slow.
STOPWORDS = frozenset(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def process_text(text):
    """Normalize a raw text string into space-separated, lemmatized tokens.

    The steps, in order:
      1. NFKD-normalize and drop every character that cannot be encoded as
         ASCII (this strips accents and other non-ASCII symbols).
      2. Remove everything that is not an ASCII letter or whitespace.
      3. Lower-case the text.
      4. Drop tokens found in the module-level ``STOPWORDS`` collection.
      5. Lemmatize each remaining token with the module-level ``lemmatizer``.

    Args:
        text: The string to clean. Must be a ``str``;
            ``unicodedata.normalize`` raises ``TypeError`` otherwise.

    Returns:
        The cleaned tokens joined by single spaces (``""`` when nothing is
        left after filtering).
    """
    # NFKD decomposes accented characters into base letter + combining mark;
    # encoding to ASCII with 'ignore' then discards the marks.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    # Stop-word filtering happens before lemmatization, so a token is only
    # compared against STOPWORDS in its original (un-lemmatized) form.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in STOPWORDS
    ]
    return " ".join(tokens)
# Clean the 'overview' column and keep the result alongside the raw text.
# (The source column is read from `data`; no other dataframe exists here.)
data['processed_overview'] = data['overview'].map(process_text)

# Keep only the four columns needed for the similarity experiments.
data = data[['title', 'overview', 'processed_overview', 'tagline']]

# Plain list of cleaned overviews, in row order, to feed the vectorizers.
data_list = data['processed_overview'].to_list()
# Bag-of-words features via CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer
# Float min_df / max_df are document-frequency proportions; 0.0 and 1.0
# therefore leave the vocabulary unfiltered.
count_vect = CountVectorizer(min_df=0.0, max_df=1.0)
count_vect_matrix = count_vect.fit_transform(data_list)
# Output - (4803, 20449)  (presumably the shape of count_vect_matrix)

# Pairwise cosine similarity between every pair of count vectors.
from sklearn.metrics.pairwise import cosine_similarity
count_doc_sim = cosine_similarity(count_vect_matrix)

# Wrap the similarity matrix in a dataframe so rows are easy to pull out.
count_doc_sim_df = pd.DataFrame(data=count_doc_sim)
# Worked example: look up the five most similar movies by hand.

# Titles in row order, so a title's list position is its row in the matrix.
movies = data['title'].to_list()
movie_idx = movies.index("Captain America: Civil War")

# Similarity scores of the chosen movie against every movie.
movie_similarities = count_doc_sim_df.iloc[movie_idx].values

# Sort descending (argsort on the negated scores) and skip position 0, which
# is expected to be the movie itself, leaving the top five neighbours.
similar_movie_idxs = np.argsort(-movie_similarities)[1:6]

# Map the row indices back to movie titles.
similar_movies = [movies[i] for i in similar_movie_idxs]
# Given a similarity matrix and a movie title, return the titles of the most
# similar movies (five by default).
def get_similar_document(movie_title, similarity_matrix, titles=None, top_n=5):
    """Return the titles most similar to ``movie_title``.

    Args:
        movie_title: Title to look up; must be present in ``titles``.
        similarity_matrix: Square matrix (DataFrame or array-like) in which
            row ``i`` holds the similarity of item ``i`` to every item, in
            the same order as ``titles``. Rows are selected by position.
        titles: Sequence of titles aligned with the matrix rows. Defaults to
            the module-level ``movies`` list.
        top_n: Maximum number of similar titles to return.

    Returns:
        A list of up to ``top_n`` titles, most similar first. The queried
        title itself is never included.

    Raises:
        ValueError: If ``movie_title`` is not in ``titles``.
    """
    if titles is None:
        titles = movies
    index = titles.index(movie_title)

    scores = np.asarray(similarity_matrix)[index]
    # Stable sort on the negated scores gives a deterministic descending order.
    order = np.argsort(-scores, kind='stable')
    # Drop the query row explicitly instead of assuming it is ranked first:
    # with ties (duplicate texts, or an all-zero row for an empty text) the
    # query is not guaranteed to sit at position 0.
    neighbours = [i for i in order if i != index][:top_n]
    return [titles[i] for i in neighbours]
# With the helper in place, a title and a similarity matrix are all that is
# needed to fetch the similar documents.
get_similar_document(
    "Captain America: Civil War",
    similarity_matrix=count_doc_sim_df,
)
# Moving on to the TF-IDF vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
# Use unigrams and bigrams; an integer min_df=2 ignores terms that occur in
# fewer than two documents.
tf_idf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
tfidf_matrix = tf_idf.fit_transform(data_list)

# Pairwise cosine similarity between the TF-IDF vectors, wrapped in a
# dataframe in the same way as the count-based matrix.
from sklearn.metrics.pairwise import cosine_similarity
tf_doc_sim = cosine_similarity(tfidf_matrix)
tf_doc_sim_df = pd.DataFrame(tf_doc_sim)

# Same lookup helper as before, now driven by the TF-IDF similarities.
get_similar_document("Captain America: Civil War", tf_doc_sim_df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment