This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# build NMT model
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, n):
    """Build an LSTM encoder-decoder model for sequence-to-sequence translation.

    Args:
        in_vocab: Source vocabulary size (input dimension of the embedding).
        out_vocab: Target vocabulary size (width of the softmax output).
        in_timesteps: Length of each input sequence.
        out_timesteps: Number of output time steps; the encoder's final
            output is repeated this many times to feed the decoder.
        n: Embedding dimension and number of units in both LSTM layers.

    Returns:
        The assembled, uncompiled ``Sequential`` model.
    """
    model = Sequential()
    # mask_zero=True treats index 0 as padding so later layers skip it.
    model.add(Embedding(in_vocab, n, input_length=in_timesteps,
                        mask_zero=True))
    # Encoder: collapse the input sequence into a single vector.
    model.add(LSTM(n))
    # Repeat that vector once per output time step.
    model.add(RepeatVector(out_timesteps))
    # Decoder: emit one hidden state per output time step.
    model.add(LSTM(n, return_sequences=True))
    # Per-time-step probability distribution over the target vocabulary.
    model.add(Dense(out_vocab, activation='softmax'))
    return model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# model compilation (with 512 hidden units)
model = build_model(deu_vocab_size, eng_vocab_size, deu_length, eng_length, 512)
# `lr` is the deprecated argument name and is rejected by current Keras
# optimizers; `learning_rate` is the supported spelling.
rms = optimizers.RMSprop(learning_rate=0.001)
# The sparse loss takes integer word indices as targets, not one-hot vectors.
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
filename = 'model.h1.24_jan_19'
# set checkpoint
# save_best_only=True with mode='min' overwrites the saved model only when the
# monitored validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
# train model | |
history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1), |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
model = load_model('model.h1.24_jan_19')
# Sequential.predict_classes() was removed from Keras; for a softmax output
# the equivalent is the argmax of predict() over the last (vocabulary) axis.
preds = model.predict(
    testX.reshape((testX.shape[0], testX.shape[1]))
).argmax(axis=-1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_word(n, tokenizer):
    """Return the word mapped to index *n* in ``tokenizer.word_index``.

    Args:
        n: Integer index to look up.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first word whose index equals *n*, or ``None`` when no word has
        that index.
    """
    # word_index maps word -> index, so the reverse lookup is a linear scan.
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
preds_text = [] | |
for i in preds: | |
temp = [] | |
for j in range(len(i)): | |
t = get_word(i[j], eng_tokenizer) | |
if j > 0: | |
if (t==get_word(i[j-1],eng_tokenizer))or(t== None): | |
temp.append('') | |
else: | |
temp.append(t) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import spacy | |
from tqdm import tqdm | |
import re | |
import time | |
import pickle | |
# Show up to 200 characters per column when printing DataFrames.
pd.set_option('display.max_colwidth', 200)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# remove URLs from train and test
# The vectorised .str.replace runs the pattern over the whole column and
# propagates missing values instead of raising inside a per-row lambda.
# regex=True is explicit because the pandas default changed to False in 2.0.
for frame in (train, test):
    frame['clean_tweet'] = frame['tweet'].str.replace(r'http\S+', '', regex=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# remove punctuation marks
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'
# Build the deletion table once; the previous version rebuilt
# set(punctuation) for every character of every tweet.
punctuation_table = str.maketrans('', '', punctuation)
for frame in (train, test):
    # strip the listed punctuation, then convert text to lowercase
    frame['clean_tweet'] = (
        frame['clean_tweet'].str.translate(punctuation_table).str.lower()
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import spaCy's language model
# The 'en' shortcut link no longer exists in spaCy 3; load the installed
# package by its full name. The parser and NER components are disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# function to lemmatize text
def lemmatization(texts):
    """Lemmatize each text and return the results as space-joined strings.

    Args:
        texts: Iterable of strings to process with the module-level ``nlp``
            pipeline.

    Returns:
        A list with one string per input text, made of the lemmas of its
        tokens joined by single spaces.
    """
    # nlp.pipe streams the texts through the pipeline in batches, which is
    # faster than calling nlp() once per text and yields the same docs.
    return [
        ' '.join(token.lemma_ for token in doc)
        for doc in nlp.pipe(texts)
    ]