Skip to content

Instantly share code, notes, and snippets.

View prateekjoshi565's full-sized avatar
🎯
Focusing

Prateek Joshi prateekjoshi565

🎯
Focusing
View GitHub Profile
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Split the feature matrix and the 'label' column into training and
# validation parts: 20% goes to validation, and the seed is fixed so the
# split is reproducible.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    elmo_train_new,
    train['label'],
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier (library defaults) on the training part.
lreg = LogisticRegression()
lreg.fit(xtrain, ytrain)

# Build the submission table and write it to CSV without the index column.
# NOTE(review): preds_test is not assigned anywhere in the visible code --
# presumably it holds lreg's predictions for the test features; confirm it is
# computed before this point.
sub = pd.DataFrame({'id': test['id'], 'label': preds_test})
sub.to_csv("sub_lreg.csv", index=False)
@prateekjoshi565
prateekjoshi565 / text_normalization_elmo.py
Last active March 24, 2019 06:16
text normalization elmo
# Load spaCy's language model with the parser and NER components disabled;
# only token lemmas are read below.
nlp = spacy.load('en', disable=['parser', 'ner'])


def lemmatization(texts):
    """Return the lemmatized form of every item in *texts*.

    Each item is passed through the module-level ``nlp`` pipeline and the
    lemmas of its tokens are joined with single spaces. The result is a list
    with one string per input item, in input order.
    """
    return [
        ' '.join(token.lemma_ for token in nlp(text))
        for text in texts
    ]
@prateekjoshi565
prateekjoshi565 / genre_import_libraries.py
Created April 21, 2019 11:52
genre_import_libraries
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
# Load every row of the tab-separated plot summaries file into ``plots``
# (one list of fields per record, as produced by csv.reader).
#
# newline='' hands line-ending handling to the csv module, as its
# documentation requires; without it, newlines embedded in quoted fields can
# be misinterpreted.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm the file's encoding and pass it explicitly if it is known.
with open("plot_summaries.txt", 'r', newline='') as f:
    reader = csv.reader(f, dialect='excel-tab')
    # tqdm only wraps the iterator to show progress while rows are consumed.
    plots = list(tqdm(reader))
@prateekjoshi565
prateekjoshi565 / genre_split_id_plot.py
Created April 21, 2019 12:00
genre_split_id_plot
# Column accumulators: the first field of each row is the movie id, the
# second is its plot summary.
movie_id, plot = [], []

# A single pass over the parsed rows (with a progress bar) fills both columns.
for record in tqdm(plots):
    movie_id.append(record[0])
    plot.append(record[1])

# Assemble the two columns into a dataframe.
movies = pd.DataFrame({
    'movie_id': movie_id,
    'plot': plot,
})
@prateekjoshi565
prateekjoshi565 / genre_merge_data_labels.py
Created April 21, 2019 12:01
genre_merge_data_labels
# Cast the join key in ``meta`` to str before merging on it.
# presumably movies['movie_id'] holds strings as well -- verify against the
# code that builds ``movies``.
meta['movie_id'] = meta['movie_id'].astype(str)

# Attach the movie name and genre columns from ``meta`` to ``movies`` by
# joining on 'movie_id' (pandas' default inner join).
movies = movies.merge(
    meta[['movie_id', 'movie_name', 'genre']],
    on='movie_id',
)
movies.head()
@prateekjoshi565
prateekjoshi565 / genre_extract_genres.py
Created April 21, 2019 12:05
genre_extract_genres
# Each 'genre' cell is a JSON string; decode it and keep only the values of
# the resulting mapping, giving one list per row.
genres = [list(json.loads(raw).values()) for raw in movies['genre']]

# Store the decoded lists as a new column of the 'movies' dataframe.
movies['genre_new'] = genres
# ``genres`` holds one list of genre values per row of ``movies``, so it is
# flattened into a single list of tags before counting. FreqDist then maps
# each distinct tag to the number of times it occurs.
all_genres = nltk.FreqDist([tag for tags in genres for tag in tags])

# Build a two-column dataframe from the frequency distribution: the tag and
# its occurrence count.
all_genres_df = pd.DataFrame({
    'Genre': list(all_genres.keys()),
    'Count': list(all_genres.values()),
})