Skip to content

Instantly share code, notes, and snippets.

View prateekjoshi565's full-sized avatar
🎯
Focusing

Prateek Joshi prateekjoshi565

🎯
Focusing
View GitHub Profile
@prateekjoshi565
prateekjoshi565 / genre_labels_visual.py
Created April 21, 2019 12:08
genre_labels_visual
# Plot the 50 genres with the largest "Count" values as a bar chart,
# with counts on the x-axis and genre names on the y-axis.
g = all_genres_df.nlargest(columns="Count", n=50)
plt.figure(figsize=(12, 15))
ax = sns.barplot(data=g, x="Count", y="Genre")
# The y-axis carries the genre names, so it must not be labelled "Count".
ax.set(xlabel="Count", ylabel="Genre")
plt.show()
@prateekjoshi565
prateekjoshi565 / genre_text_cleaning.py
Created April 21, 2019 12:11
genre_text_cleaning
# function for text cleaning
def clean_text(text):
# remove apostrophes (in a regular Python string "\'" is just a plain apostrophe)
text = re.sub("\'", "", text)
# remove everything except alphabets
text = re.sub("[^a-zA-Z]"," ",text)
# collapse runs of whitespace into single spaces and trim both ends
text = ' '.join(text.split())
# convert text to lowercase
text = text.lower()
@prateekjoshi565
prateekjoshi565 / genre_words_visual.py
Last active April 21, 2019 12:15
genre_words_visual
def freq_words(x, terms = 30):
all_words = ' '.join([text for text in x])
all_words = all_words.split()
fdist = nltk.FreqDist(all_words)
words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})
# select the `terms` most frequent words (30 by default)
d = words_df.nlargest(columns="count", n = terms)
# visualize words and frequencies
@prateekjoshi565
prateekjoshi565 / genre_stopwords_remove.py
Created April 21, 2019 12:17
genre_stopwords_remove
from nltk.corpus import stopwords
# Keep the English stop-word list as a set so membership tests are fast.
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return ``text`` with all stop words removed.

    The input is split on whitespace, every token that appears in the
    module-level ``stop_words`` collection is dropped, and the surviving
    tokens are re-joined with single spaces. Matching is exact, so the
    comparison is case-sensitive.

    Args:
        text: The string to filter.

    Returns:
        The filtered string; empty if every token was a stop word.
    """
    return ' '.join(w for w in text.split() if w not in stop_words)
# Strip stop words from every plot in place; the function is passed
# directly because wrapping it in a lambda adds nothing.
movies_new['clean_plot'] = movies_new['clean_plot'].apply(remove_stopwords)
from sklearn.preprocessing import MultiLabelBinarizer
# Build the binarizer and learn the set of genre labels in one step;
# fit() hands back the fitted estimator itself.
multilabel_binarizer = MultiLabelBinarizer().fit(movies_new['genre_new'])

# Encode the target variable with the fitted binarizer.
y = multilabel_binarizer.transform(movies_new['genre_new'])
@prateekjoshi565
prateekjoshi565 / genre_traintest_split.py
Created April 21, 2019 12:20
genre_traintest_split
# Hold out 20% of the cleaned plots (with their labels) for validation;
# the fixed random_state keeps the split reproducible between runs.
xtrain, xval, ytrain, yval = train_test_split(
    movies_new['clean_plot'],
    y,
    test_size=0.2,
    random_state=9,
)
def infer_tags(q):
    """Predict genre tags for a single raw plot text.

    The text goes through the same preparation steps in order:
    ``clean_text``, ``remove_stopwords``, then ``tfidf_vectorizer.transform``
    on a one-element list. The resulting features are passed to
    ``clf.predict`` and the prediction is mapped back to labels with
    ``multilabel_binarizer.inverse_transform``.

    Args:
        q: The plot text to classify.

    Returns:
        Whatever ``multilabel_binarizer.inverse_transform`` yields for the
        single-row prediction (presumably a one-element list holding a
        tuple of genre labels -- confirm against the binarizer in use).
    """
    q = clean_text(q)
    q = remove_stopwords(q)
    # The vectorizer is given a list so the single text is one document.
    q_vec = tfidf_vectorizer.transform([q])
    q_pred = clf.predict(q_vec)
    return multilabel_binarizer.inverse_transform(q_pred)
# Show predicted versus actual genres for five randomly drawn validation
# plots. Each draw is independent, so the same movie may appear twice.
for i in range(5):
    # Index label of one randomly sampled validation row.
    k = xval.sample(1).index[0]
    # Two separate statements instead of a throwaway tuple of print() calls;
    # the printed output is unchanged.
    print("Movie: ", movies_new['movie_name'][k], "\nPredicted genre: ", infer_tags(xval[k]))
    print("Actual genre: ", movies_new['genre_new'][k], "\n")
@prateekjoshi565
prateekjoshi565 / w2v_rcm_libs.py
Last active July 28, 2019 18:07
word2vec for recommendation
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
%matplotlib inline
import warnings

# Silence every warning from this point on.
# NOTE(review): this blanket filter also hides deprecation notices from the
# libraries imported above -- narrow it if those matter.
warnings.filterwarnings('ignore')
# Load the retail data from the Excel file; the relative path is resolved
# against the current working directory.
df = pd.read_excel('Online Retail.xlsx')
# Preview the first rows (only displayed when this is the last expression
# of a notebook cell; in a plain script the result is discarded).
df.head()