Skip to content

Instantly share code, notes, and snippets.

View prateekjoshi565's full-sized avatar
🎯
Focusing

Prateek Joshi prateekjoshi565

🎯
Focusing
View GitHub Profile
@prateekjoshi565
prateekjoshi565 / genre_merge_data_labels.py
Created April 21, 2019 12:01
genre_merge_data_labels
# cast the join key in 'meta' to str before merging
# NOTE(review): presumably movies['movie_id'] already holds strings — confirm
meta['movie_id'] = meta['movie_id'].astype(str)

# bring the title and genre columns from 'meta' onto the rows of 'movies',
# matching on 'movie_id'
movies = pd.merge(
    left=movies,
    right=meta[['movie_id', 'movie_name', 'genre']],
    on='movie_id',
)
movies.head()
@prateekjoshi565
prateekjoshi565 / genre_extract_genres.py
Created April 21, 2019 12:05
genre_extract_genres
# parse each row's JSON-encoded genre mapping and keep only its values
# (the keys of the mapping are discarded)
genres = [list(json.loads(raw).values()) for raw in movies['genre']]

# add to 'movies' dataframe
movies['genre_new'] = genres

# flatten the per-movie lists into one list of tags; without this step
# 'all_genres' is read below before it has been assigned
all_genres = [tag for tags in genres for tag in tags]

# count how often each genre tag occurs
all_genres = nltk.FreqDist(all_genres)

# create dataframe
all_genres_df = pd.DataFrame({
    'Genre': list(all_genres.keys()),
    'Count': list(all_genres.values()),
})
@prateekjoshi565
prateekjoshi565 / genre_labels_visual.py
Created April 21, 2019 12:08
genre_labels_visual
# plot the 50 most frequent genres as horizontal bars
g = all_genres_df.nlargest(columns="Count", n=50)
plt.figure(figsize=(12, 15))
ax = sns.barplot(data=g, x="Count", y="Genre")
# the y axis carries the genre names (counts run along x), so it must be
# labelled 'Genre', not 'Count'
ax.set(ylabel='Genre')
plt.show()
@prateekjoshi565
prateekjoshi565 / genre_text_cleaning.py
Created April 21, 2019 12:11
genre_text_cleaning
# function for text cleaning
def clean_text(text):
    """Normalize ``text`` to lowercase ASCII letters separated by single spaces.

    Apostrophes are deleted outright (so "don't" becomes "dont"); every
    other character outside a-z/A-Z is treated as a word separator.

    Args:
        text: the string to clean.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    # remove apostrophes ("\'" in a normal string literal is just "'")
    text = re.sub("'", "", text)
    # replace everything except alphabets with a space
    text = re.sub("[^a-zA-Z]", " ", text)
    # collapse runs of whitespace and trim both ends
    text = ' '.join(text.split())
    # convert text to lowercase
    text = text.lower()
    # callers use the result, so the cleaned string must be returned
    return text
@prateekjoshi565
prateekjoshi565 / genre_words_visual.py
Last active April 21, 2019 12:15
genre_words_visual
def freq_words(x, terms=30):
    """Tabulate the most frequent whitespace-separated words in ``x``.

    Args:
        x: an iterable of strings; they are joined with spaces and then
            split on whitespace to obtain the words.
        terms: how many of the most frequent words to select (default 30).
    """
    all_words = ' '.join(x).split()
    fdist = nltk.FreqDist(all_words)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values()),
    })
    # selecting the `terms` most frequent words
    d = words_df.nlargest(columns="count", n=terms)
    # visualize words and frequencies
    # NOTE(review): the plotting code that should consume `d` is not present
    # in this excerpt — restore it from the full gist before using this.
@prateekjoshi565
prateekjoshi565 / genre_stopwords_remove.py
Created April 21, 2019 12:17
genre_stopwords_remove
from nltk.corpus import stopwords

# NLTK's English stopword list, held as a set for fast membership tests
stop_words = set(stopwords.words('english'))


# function to remove stopwords
def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in
    ``stop_words`` dropped; the surviving tokens are joined by single spaces.
    """
    return ' '.join(word for word in text.split() if word not in stop_words)
# strip stopwords from every cleaned plot
movies_new['clean_plot'] = movies_new['clean_plot'].apply(remove_stopwords)

from sklearn.preprocessing import MultiLabelBinarizer

# fit on the per-movie genre lists; fit() returns the fitted binarizer itself
multilabel_binarizer = MultiLabelBinarizer().fit(movies_new['genre_new'])

# transform target variable
y = multilabel_binarizer.transform(movies_new['genre_new'])
@prateekjoshi565
prateekjoshi565 / genre_traintest_split.py
Created April 21, 2019 12:20
genre_traintest_split
# hold out 20% of the cleaned plots (with their label rows) for validation;
# random_state is pinned so the split is repeatable
xtrain, xval, ytrain, yval = train_test_split(
    movies_new['clean_plot'],
    y,
    test_size=0.2,
    random_state=9,
)
def infer_tags(q):
    """Predict tags for the raw text ``q``.

    The text goes through ``clean_text`` and ``remove_stopwords``, is
    vectorized with ``tfidf_vectorizer``, classified by ``clf``, and the
    prediction is mapped back to labels via ``multilabel_binarizer``.

    Returns:
        Whatever ``multilabel_binarizer.inverse_transform`` yields for the
        single-document prediction.
    """
    prepared = remove_stopwords(clean_text(q))
    # the vectorizer is given a one-element list, i.e. a single document
    features = tfidf_vectorizer.transform([prepared])
    prediction = clf.predict(features)
    return multilabel_binarizer.inverse_transform(prediction)