Version1OmAuthorNLP
# coding: utf-8

# In[1]:
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

from fastai.learner import *
import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling
from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *
import dill as pickle
import pandas as pd
import numpy as np
# In[2]:
PATH = 'data/spooky-author-identification/'
get_ipython().run_line_magic('ls', '{PATH}')
# In[3]:
def save_data(df, file_train):
    # Join every row's 'text' column into one space-separated string,
    # write it out as a plain-text corpus file, and return it.
    # (str.join avoids the quadratic cost of concatenating in a loop.)
    train_data = ' '.join(df['text'])
    file_train.write(train_data)
    file_train.close()
    return train_data
# In[4]:
file_train = open(f'{PATH}trainData.txt', 'w')

# In[5]:
df_train = pd.read_csv(f'{PATH}train.csv')

# In[6]:
train_data = save_data(df_train, file_train)
# In[7]:
df_test = pd.read_csv(f'{PATH}test.csv')

# In[8]:
file_test = open(f'{PATH}testData.txt', 'w')

# In[9]:
test_data = save_data(df_test, file_test)
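
# In[ ]:
# Illustrative peek at the raw data (this cell is not in the original gist):
# the Kaggle Spooky Author train.csv has columns id, text and author.
df_train.head()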
# In[ ]:
' '.join(spacy_tok(train_data))
# In[11]:
TEXT = data.Field(lower=True, tokenize=spacy_tok)

# In[12]:
TRN_PATH = 'trainData.txt'
VAL_PATH = 'testData.txt'
TRN = f'{PATH}trainData.txt'
VAL = f'{PATH}testData.txt'
# In[13]:
VAL_PATH

# In[14]:
bs = 2     # batch size
bptt = 70  # backprop-through-time: tokens per sequence in each mini-batch

# In[15]:
FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)

# In[16]:
md = LanguageModelData(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=10)
# In[17]:
pickle.dump(TEXT, open(f'{PATH}models/TEXT.pkl', 'wb'))

# In[18]:
# batches per epoch, vocab size, docs in the train set, tokens in the first doc
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)
# In[19]:
TEXT.vocab.itos[:12]

# In[20]:
# 'stoi': 'string to int'
TEXT.vocab.stoi['the']
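
# In[ ]:
# Out-of-vocabulary check (illustrative cell, not in the original gist):
# torchtext's stoi is a defaultdict, so unseen strings map to index 0,
# the '<unk>' token, which absorbs all tokens below min_freq=10.
TEXT.vocab.stoi['some_word_not_in_the_vocab']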
# In[21]:
md.trn_ds[0].text[:12]

# In[22]:
TEXT.numericalize([md.trn_ds[0].text[:12]])
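
# In[ ]:
# Round-trip check (illustrative cell, not in the original gist): map the
# ids from numericalize back to tokens with itos; tokens below min_freq
# come back as '<unk>'.
ids = TEXT.numericalize([md.trn_ds[0].text[:12]])
' '.join(TEXT.vocab.itos[int(i)] for i in to_np(ids).squeeze())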
# In[ ]:
next(iter(md.trn_dl))

# In[24]:
em_sz = 200  # size of each embedding vector
nh = 50      # number of hidden activations per layer
nl = 3       # number of layers
# In[25]:
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

# In[26]:
learner = md.get_model(opt_fn, em_sz, nh, nl,
                       dropouti=0.05, dropout=0.05, wdrop=0.1,
                       dropoute=0.02, dropouth=0.05)
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)  # activation/temporal-activation regularization
learner.clip = 0.3  # gradient clipping
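
# In[ ]:
# Illustrative check (this cell is not in the original gist): opt_fn is a
# factory; fastai calls it with the model's parameters, so it is equivalent
# to optim.Adam(params, betas=(0.7, 0.99)). The lowered beta1 (0.7 vs the
# default 0.9) shortens Adam's momentum memory, which tends to suit RNNs.
opt_demo = opt_fn(learner.model.parameters())
type(opt_demo), opt_demo.defaults['betas']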
# In[ ]:
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=1)
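
# In[ ]:
# Note on the schedule above (added comment, not in the original gist):
# with cycle_len=1 and cycle_mult=1 this runs 4 cosine-annealed cycles of
# one epoch each (SGDR-style restarts), i.e. 4 epochs at a peak lr of 3e-3.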
# In[ ]:
learner.save_encoder('adam1_enc')

# In[ ]:
learner.load_encoder('adam1_enc')

# In[ ]:
TEXT = pickle.load(open(f'{PATH}models/TEXT.pkl', 'rb'))

# In[ ]:
IMDB_LABEL = data.Field(sequential=False)
# In[ ]:
m = learner.model
ss = """. So, it wasn't quite was I was expecting, but I really liked it anyway! The best"""
s = [spacy_tok(ss)]
t = TEXT.numericalize(s)
' '.join(s[0])
# In[ ]:
# Set batch size to 1
m[0].bs = 1
# Turn off dropout
m.eval()
# Reset hidden state
m.reset()
# Get predictions from model
res, *_ = m(t)
# Put the batch size back to what it was
m[0].bs = bs
# In[ ]:
nexts = torch.topk(res[-1], 10)[1]
[TEXT.vocab.itos[o] for o in to_np(nexts)]
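
# In[ ]:
# A hedged alternative to the greedy decoding in the next cell (this cell
# is not in the original gist): sample the next token from the softmax
# distribution instead of always taking the argmax. Assumes F
# (torch.nn.functional) is in scope via fastai's star imports.
probs = F.softmax(res[-1], dim=0)
idx = int(torch.multinomial(probs.data, 1)[0])
TEXT.vocab.itos[idx]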
# In[ ]:
print(ss, "\n")
for i in range(50):
    # Take the most likely next token, falling back to the runner-up
    # when the top prediction is index 0 ('<unk>')
    n = res[-1].topk(2)[1]
    n = n[1] if n.data[0] == 0 else n[0]
    print(TEXT.vocab.itos[n.data[0]], end=' ')
    # Feed the chosen token back in to predict the one after it
    res, *_ = m(n[0].unsqueeze(0))
print('...')
# In[22]:
TEXT = pickle.load(open(f'{PATH}models/TEXT.pkl', 'rb'))

# In[ ]:
TEXT.vocab.itos
# In[18]:
import spacy
spacy_en = spacy.load('en')

def tokenizer(text):  # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=150)
LABEL = data.Field(sequential=False, use_vocab=False)
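
# In[ ]:
# Quick check of the tokenizer defined above (illustrative cell, not in the
# original gist):
tokenizer("It was a dark and stormy night.")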
# In[19]:
train, val, test = data.TabularDataset.splits(
    PATH, train='train.csv',
    validation='test.csv', test='test.csv', format='csv',
    fields=[('Text', TEXT), ('Label', LABEL)])
# In[27]:
splits = data.TabularDataset.splits(
    PATH, train='train.csv',
    validation='test.csv', test='test.csv', format='csv',
    fields=[('Text', TEXT)])
# In[28]:
TEXT = pickle.load(open(f'{PATH}models/TEXT.pkl', 'rb'))

# In[29]:
LABEL = data.Field(sequential=False, use_vocab=True)

# In[30]:
splits = data.TabularDataset.splits(
    PATH, train='train.csv',
    validation='test.csv', test='test.csv', format='csv',
    fields=[('text', TEXT), ('label', LABEL)])
# In[23]:
TEXT.init_token

# In[24]:
# splits holds three datasets (train/validation/test), so unpack all three
trn_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(splits, batch_size=bs)
# In[31]:
train_iter, val_iter, test_iter = torchtext.data.Iterator.splits(
    (train, val, test), sort_key=lambda x: len(x.Text),
    batch_sizes=(32, 256, 256))
# In[32]:
def from_splits(path, splits, bs, text_name='text', label_name='label'):
    # Standalone adaptation of TextData.from_splits: build iterators over
    # the splits and wrap them in fastai data loaders.
    text_fld = splits[0].fields[text_name]
    print(text_fld)
    label_fld = splits[0].fields[label_name]
    print(label_fld)
    label_fld.build_vocab(splits[0])
    print(splits[0])
    # trn_iter, val_iter = torchtext.data.BucketIterator.splits(splits, batch_size=bs)
    # Note: this reuses the global train/val/test datasets defined above.
    trn_iter, val_iter, test_iter = torchtext.data.Iterator.splits(
        (train, val, test), sort_key=lambda x: len(x.Text), batch_sizes=(32, 256, 256))
    trn_dl = TextDataLoader(trn_iter, text_name, label_name)
    val_dl = TextDataLoader(val_iter, text_name, label_name)
    obj = TextData.from_dls(path, trn_dl, val_dl)
    obj.bs = bs
    obj.pad_idx = text_fld.vocab.stoi[text_fld.pad_token]  # padding token id
    obj.nt = len(text_fld.vocab)   # vocab size
    obj.c = len(label_fld.vocab)   # number of classes
    return obj

# In[33]:
md2 = from_splits(PATH, splits, bs, text_name='text', label_name='label')
# In[34]:
md2.c

# In[35]:
md2.nt

# In[84]:
get_ipython().run_line_magic('pinfo2', 'TextData.from_dls')

# In[72]:
get_ipython().run_line_magic('pinfo2', 'TextData.from_splits')
# In[36]:
md3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=em_sz, n_hid=nh, n_layers=nl,  # 1500: max sequence length
                    dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3)

# In[37]:
md3.summary

# In[40]:
md3.fit(1e-1, 1, metrics=[accuracy])