# Version1OmAuthorNLP (gist by @alchemistsrivastava, November 26, 2017)
# coding: utf-8
# In[1]:
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
from fastai.learner import *
import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling
from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *
import dill as pickle
import pandas as pd
import numpy as np
# In[2]:
PATH='data/spooky-author-identification/'
get_ipython().run_line_magic('ls', '{PATH}')
# In[3]:
def save_data(df, file_train):
    """Concatenate every row of df['text'] into one space-separated string,
    write it to file_train, close the file, and return the string."""
    trainData = ""
    for idx, row in df.iterrows():
        data = row['text']
        if trainData == "":
            trainData = data
        else:
            trainData = trainData + " " + data
    file_train.write(trainData)
    file_train.close()
    return trainData
# In[4]:
file_train = open(f'{PATH}trainData.txt', 'w')
# In[5]:
df_train = pd.read_csv(f'{PATH}train.csv')
# In[6]:
train_data = save_data(df_train, file_train)
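# In[ ]:
# Equivalence check (illustrative, not in the original notebook): the loop in
# save_data amounts to a single join over the 'text' column, assuming every
# row is a non-empty string.
assert train_data == ' '.join(df_train['text'])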
# In[7]:
df_test = pd.read_csv(f'{PATH}test.csv')
# In[8]:
file_test = open(f'{PATH}testData.txt', 'w')
# In[9]:
test_data = save_data(df_test, file_test)
# In[ ]:
# Eyeball spaCy's tokenization of the training corpus
' '.join(spacy_tok(train_data))
# In[11]:
TEXT = data.Field(lower=True, tokenize=spacy_tok)
# In[12]:
TRN_PATH = 'trainData.txt'
VAL_PATH = 'testData.txt'
TRN = f'{PATH}trainData.txt'
VAL = f'{PATH}testData.txt'
# In[13]:
VAL_PATH
# In[14]:
bs = 2     # batch size
bptt = 70  # backprop-through-time: tokens per training chunk
# In[15]:
FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
# In[16]:
md = LanguageModelData(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=10)  # words seen fewer than 10 times map to <unk>
# In[17]:
pickle.dump(TEXT, open(f'{PATH}models/TEXT.pkl','wb'))
# In[18]:
# (batches per epoch, vocab size, dataset length, tokens in the corpus)
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)
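# In[ ]:
# Rough cross-check (illustrative, assuming fastai's usual LM batching): the
# token stream is laid out as bs columns and read about bptt rows at a time,
# so len(md.trn_dl) should be on the order of this estimate (fastai jitters
# the per-batch sequence length, so it will not match exactly).
len(md.trn_ds[0].text) // (bs * bptt)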
# In[19]:
TEXT.vocab.itos[:12]
# In[20]:
# 'stoi': 'string to int'
TEXT.vocab.stoi['the']
# In[21]:
md.trn_ds[0].text[:12]
# In[22]:
TEXT.numericalize([md.trn_ds[0].text[:12]])
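# In[ ]:
# Round-trip check (illustrative, not in the original): mapping the ids back
# through itos recovers the tokens above, up to <unk> substitutions for words
# below min_freq (to_np comes in via the fastai imports).
ids = TEXT.numericalize([md.trn_ds[0].text[:12]])
[TEXT.vocab.itos[o] for o in to_np(ids).flatten()]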
# In[ ]:
next(iter(md.trn_dl))
# In[24]:
em_sz = 200 # size of each embedding vector
nh = 50 # number of hidden activations per layer
nl = 3 # number of layers
# In[25]:
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))
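# In[ ]:
# Spelled out (illustrative): opt_fn is just Adam with its betas pre-bound,
# so calling it builds an ordinary optimizer; p is a throwaway parameter.
import torch
from torch.autograd import Variable
p = Variable(torch.zeros(1), requires_grad=True)
type(opt_fn([p], lr=1e-3))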
# In[26]:
learner = md.get_model(opt_fn, em_sz, nh, nl,
                       dropouti=0.05, dropout=0.05, wdrop=0.1,
                       dropoute=0.02, dropouth=0.05)
learner.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)  # activation (alpha) and temporal-activation (beta) regularization
learner.clip = 0.3  # clip gradients to stabilize training
# In[ ]:
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=1)
# In[ ]:
learner.save_encoder('adam1_enc')
# In[ ]:
learner.load_encoder('adam1_enc')
# In[ ]:
TEXT = pickle.load(open(f'{PATH}models/TEXT.pkl','rb'))
# In[ ]:
IMDB_LABEL = data.Field(sequential=False)  # leftover name from the IMDB lesson; unused below
# In[ ]:
m = learner.model
ss = """. So, it wasn't quite what I was expecting, but I really liked it anyway! The best"""
s = [spacy_tok(ss)]        # tokenize the seed text
t = TEXT.numericalize(s)   # map tokens to vocab ids
' '.join(s[0])
# In[ ]:
# Set batch size to 1
m[0].bs=1
# Turn off dropout
m.eval()
# Reset hidden state
m.reset()
# Get predictions from model
res,*_ = m(t)
# Put the batch size back to what it was
m[0].bs=bs
# In[ ]:
nexts = torch.topk(res[-1], 10)[1]
[TEXT.vocab.itos[o] for o in to_np(nexts)]
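# In[ ]:
# Same idea with probabilities attached (illustrative sketch, not in the
# original; F is torch.nn.functional, pulled in by the fastai imports).
probs = F.softmax(res[-1])
top_p, top_i = probs.topk(10)
list(zip([TEXT.vocab.itos[o] for o in to_np(top_i)], to_np(top_p)))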
# In[ ]:
print(ss,"\n")
for i in range(50):
n=res[-1].topk(2)[1]
n = n[1] if n.data[0]==0 else n[0]
print(TEXT.vocab.itos[n.data[0]], end=' ')
res,*_ = m(n[0].unsqueeze(0))
print('...')
# In[22]:
TEXT = pickle.load(open(f'{PATH}models/TEXT.pkl','rb'))
# In[ ]:
TEXT.vocab.itos
# In[18]:
import spacy
spacy_en = spacy.load('en')

def tokenizer(text):  # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=150)
LABEL = data.Field(sequential=False, use_vocab=False)
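# In[ ]:
# Quick sanity check (illustrative, not in the original): spaCy splits
# punctuation and contractions into separate tokens.
tokenizer("It wasn't quite what I was expecting.")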
# In[19]:
train, val, test = data.TabularDataset.splits(
    PATH, train='train.csv',
    validation='test.csv', test='test.csv', format='csv',
    fields=[('Text', TEXT), ('Label', LABEL)])
# In[27]:
splits = data.TabularDataset.splits(
    PATH, train='train.csv',
    validation='test.csv', test='test.csv', format='csv',
    fields=[('Text', TEXT)])
# In[28]:
TEXT = pickle.load(open(f'{PATH}models/TEXT.pkl','rb'))
# In[29]:
LABEL = data.Field(sequential=False, use_vocab=True)
# In[30]:
splits = data.TabularDataset.splits(
    PATH, train='train.csv',
    validation='test.csv', test='test.csv', format='csv',
    fields=[('text', TEXT), ('label', LABEL)])
# In[23]:
TEXT.init_token
# In[24]:
trn_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(splits, batch_size=bs)  # splits holds three datasets, so unpack three iterators
# In[31]:
train_iter, val_iter, test_iter = torchtext.data.Iterator.splits(
    (train, val, test), sort_key=lambda x: len(x.Text),
    batch_sizes=(32, 256, 256))
# In[32]:
def from_splits(cls, path, splits, bs, text_name='text', label_name='label'):
    """Appears adapted from fastai's TextData.from_splits, inlined so the
    Iterator.splits call above can be swapped in. `cls` is kept for parity
    with the classmethod signature but is not used. Note that the iterators
    are built from the global train/val/test datasets, not from `splits`."""
    text_fld = splits[0].fields[text_name]
    print(text_fld)
    label_fld = splits[0].fields[label_name]
    print(label_fld)
    label_fld.build_vocab(splits[0])
    print(splits[0])
    # trn_iter, val_iter = torchtext.data.BucketIterator.splits(splits, batch_size=bs)
    trn_iter, val_iter, test_iter = torchtext.data.Iterator.splits(
        (train, val, test), sort_key=lambda x: len(x.Text), batch_sizes=(32, 256, 256))
    trn_dl = TextDataLoader(trn_iter, text_name, label_name)
    val_dl = TextDataLoader(val_iter, text_name, label_name)
    obj = TextData.from_dls(path, trn_dl, val_dl)
    obj.bs = bs
    obj.pad_idx = text_fld.vocab.stoi[text_fld.pad_token]
    obj.nt = len(text_fld.vocab)
    obj.c = len(label_fld.vocab)
    return obj
# In[33]:
md2 = from_splits(TEXT, PATH, splits, bs, text_name='text', label_name='label')  # TEXT fills the unused cls slot
# In[34]:
md2.c
# In[35]:
md2.nt
# In[84]:
get_ipython().run_line_magic('pinfo2', 'TextData.from_dls')
# In[72]:
get_ipython().run_line_magic('pinfo2', 'TextData.from_splits')
# In[36]:
md3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=em_sz, n_hid=nh, n_layers=nl,  # 1500: max sequence length (max_sl)
                    dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3)
# In[37]:
md3.summary()
# In[40]:
md3.fit(1e01, 1, metrics=[accuracy])