# Prateek Joshi (prateekjoshi565)

## model_architecture.py
# build NMT model
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, n):
    """Build the (uncompiled) sequence-to-sequence NMT network.

    Layers, in order: an ``Embedding`` with ``in_vocab`` entries of width
    ``n`` over inputs of length ``in_timesteps`` (``mask_zero=True``), an
    ``LSTM(n)``, a ``RepeatVector(out_timesteps)``, an
    ``LSTM(n, return_sequences=True)`` and a softmax ``Dense(out_vocab)``.

    Args:
        in_vocab: Size of the input vocabulary.
        out_vocab: Size of the output vocabulary (width of the softmax).
        in_timesteps: Length of the input sequences.
        out_timesteps: Number of times the encoder output is repeated.
        n: Embedding width and number of units in both LSTM layers.

    Returns:
        The assembled ``Sequential`` model; it is not compiled here.
    """
    encoder = [
        Embedding(in_vocab, n, input_length=in_timesteps, mask_zero=True),
        LSTM(n),
    ]
    decoder = [
        RepeatVector(out_timesteps),
        LSTM(n, return_sequences=True),
        Dense(out_vocab, activation='softmax'),
    ]

    network = Sequential()
    for layer in encoder + decoder:
        network.add(layer)
    return network

## compile_model.py
# Instantiate the network with 512 units; the deu_* values describe the
# input side and the eng_* values the output side.
model = build_model(
    in_vocab=deu_vocab_size,
    out_vocab=eng_vocab_size,
    in_timesteps=deu_length,
    out_timesteps=eng_length,
    n=512,
)

# Compile with RMSprop and the sparse variant of categorical cross-entropy.
rms = optimizers.RMSprop(lr=0.001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=rms)

## train_model.py
# Path handed to the checkpoint callback.
filename = 'model.h1.24_jan_19'

# Checkpoint callback: watches 'val_loss' in 'min' mode with
# save_best_only enabled, and reports verbosely.
checkpoint = ModelCheckpoint(
    filepath=filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)


# train model
history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),

## predict.py
# Reload the saved model from disk.
model = load_model('model.h1.24_jan_19')

# Predict classes for the test inputs, reshaped to their first two
# dimensions (shape[0], shape[1]).
preds = model.predict_classes(
    testX.reshape(testX.shape[0], testX.shape[1])
)

## integer_word_mapping.py
def get_word(n, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``n``.

    The mapping is scanned in iteration order and the first match wins.

    Args:
        n: Index to look up.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The matching word, or ``None`` when no word has index ``n``.
    """
    matches = (word for word, idx in tokenizer.word_index.items() if idx == n)
    return next(matches, None)

## prepare_predictions.py
# Turn every row of predicted word ids into one space-joined sentence.
preds_text = []
for sentence_ids in preds:
    words = []
    previous = None
    for position, word_id in enumerate(sentence_ids):
        word = get_word(word_id, eng_tokenizer)
        # Ids without a vocabulary entry, and immediate repeats of the
        # preceding word, are emitted as empty strings. The first position
        # has no predecessor, so only the "unknown id" rule applies to it.
        if word is None or (position > 0 and word == previous):
            words.append('')
        else:
            words.append(word)
        # Remember this lookup so the next step does not repeat it.
        previous = word
    # Store the finished sentence; without this preds_text stays empty.
    preds_text.append(' '.join(words))

## import_libraries_elmo.py
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
# Let pandas display up to 200 characters per column.
pd.options.display.max_colwidth = 200

## remove_url_elmo.py
# Strip URLs (an 'http' prefix followed by any run of non-whitespace)
# from the raw tweets of train and test, storing the result in 'clean_tweet'.
for _frame in (train, test):
    _frame['clean_tweet'] = _frame['tweet'].apply(
        lambda text: re.sub(r'http\S+', '', text)
    )

## text_preprocessing_elmo.py
# Punctuation marks to remove. Apostrophe, comma and period are not in
# this list, so they are left in the text.
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'

# Build the deletion table once instead of re-creating set(punctuation)
# for every character of every tweet.
_punctuation_table = str.maketrans('', '', punctuation)

train['clean_tweet'] = train['clean_tweet'].apply(lambda x: x.translate(_punctuation_table))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: x.translate(_punctuation_table))

# convert text to lowercase
train['clean_tweet'] = train['clean_tweet'].str.lower()
test['clean_tweet'] = test['clean_tweet'].str.lower()

## text_normalization_elmo.py
# import spaCy's language model
# The 'parser' and 'ner' components are disabled on load; the code below
# only reads token lemmas from this pipeline.
# NOTE(review): 'en' is a shortcut model name - confirm the installed spaCy
# version still resolves it.
nlp = spacy.load('en', disable=['parser', 'ner'])

# function to lemmatize text
def lemmatization(texts):
    """Lemmatize every string in ``texts`` with the module-level ``nlp``.

    Each text is passed through ``nlp`` and the ``lemma_`` of its tokens
    are joined with single spaces.

    Args:
        texts: Iterable of strings to process.

    Returns:
        A list with one lemmatized string per input text, in input order.
    """
    return [' '.join(tok.lemma_ for tok in nlp(text)) for text in texts]
	# build NMT model
	def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, n):
		"""Build the (uncompiled) sequence-to-sequence NMT network.

		Stacks an ``Embedding`` (``in_vocab`` entries of width ``n``, input
		length ``in_timesteps``, ``mask_zero=True``), an ``LSTM(n)``, a
		``RepeatVector(out_timesteps)``, an ``LSTM(n, return_sequences=True)``
		and a softmax ``Dense(out_vocab)``.

		Args:
			in_vocab: Size of the input vocabulary.
			out_vocab: Size of the output vocabulary.
			in_timesteps: Length of the input sequences.
			out_timesteps: Number of times the encoder output is repeated.
			n: Embedding width and number of units in both LSTM layers.

		Returns:
			The assembled ``Sequential`` model; it is not compiled here.
		"""
		model = Sequential()
		model.add(Embedding(in_vocab, n, input_length=in_timesteps,
				mask_zero=True))
		model.add(LSTM(n))
		model.add(RepeatVector(out_timesteps))
		model.add(LSTM(n, return_sequences=True))
		model.add(Dense(out_vocab, activation='softmax'))
		return model
	# Instantiate the network with 512 units; the deu_* values describe the
	# input side and the eng_* values the output side.
	model = build_model(
		in_vocab=deu_vocab_size,
		out_vocab=eng_vocab_size,
		in_timesteps=deu_length,
		out_timesteps=eng_length,
		n=512,
	)

	# Compile with RMSprop and the sparse variant of categorical cross-entropy.
	rms = optimizers.RMSprop(lr=0.001)
	model.compile(loss='sparse_categorical_crossentropy', optimizer=rms)
	# Path handed to the checkpoint callback.
	filename = 'model.h1.24_jan_19'

	# Checkpoint callback: watches 'val_loss' in 'min' mode with
	# save_best_only enabled, and reports verbosely.
	checkpoint = ModelCheckpoint(
		filepath=filename,
		monitor='val_loss',
		mode='min',
		save_best_only=True,
		verbose=1,
	)


	# train model
	history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
	# Reload the saved model from disk.
	model = load_model('model.h1.24_jan_19')

	# Predict classes for the test inputs, reshaped to their first two
	# dimensions (shape[0], shape[1]).
	preds = model.predict_classes(
		testX.reshape(testX.shape[0], testX.shape[1])
	)
	def get_word(n, tokenizer):
		"""Return the word whose index in ``tokenizer.word_index`` equals ``n``.

		The mapping is scanned in iteration order and the first match wins.

		Args:
			n: Index to look up.
			tokenizer: Object exposing a ``word_index`` mapping of word -> index.

		Returns:
			The matching word, or ``None`` when no word has index ``n``.
		"""
		matches = (word for word, idx in tokenizer.word_index.items() if idx == n)
		return next(matches, None)
	# Turn every row of predicted word ids into one space-joined sentence.
	preds_text = []
	for sentence_ids in preds:
		words = []
		previous = None
		for position, word_id in enumerate(sentence_ids):
			word = get_word(word_id, eng_tokenizer)
			# Ids without a vocabulary entry, and immediate repeats of the
			# preceding word, are emitted as empty strings. The first position
			# has no predecessor, so only the "unknown id" rule applies to it.
			if word is None or (position > 0 and word == previous):
				words.append('')
			else:
				words.append(word)
			# Remember this lookup so the next step does not repeat it.
			previous = word
		# Store the finished sentence; without this preds_text stays empty.
		preds_text.append(' '.join(words))
	import pandas as pd
	import numpy as np
	import spacy
	from tqdm import tqdm
	import re
	import time
	import pickle
	# Let pandas display up to 200 characters per column.
	pd.options.display.max_colwidth = 200
	# Strip URLs (an 'http' prefix followed by any run of non-whitespace)
	# from the raw tweets of train and test, storing the result in 'clean_tweet'.
	for _frame in (train, test):
		_frame['clean_tweet'] = _frame['tweet'].apply(
			lambda text: re.sub(r'http\S+', '', text)
		)
	# Punctuation marks to remove. Apostrophe, comma and period are not in
	# this list, so they are left in the text. The stray backslash before
	# '|' was an invalid escape sequence; the set of characters is unchanged
	# because the backslash itself is already listed via '\\'.
	punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'

	# Build the deletion table once instead of re-creating set(punctuation)
	# for every character of every tweet.
	_punctuation_table = str.maketrans('', '', punctuation)

	train['clean_tweet'] = train['clean_tweet'].apply(lambda x: x.translate(_punctuation_table))
	test['clean_tweet'] = test['clean_tweet'].apply(lambda x: x.translate(_punctuation_table))

	# convert text to lowercase
	train['clean_tweet'] = train['clean_tweet'].str.lower()
	test['clean_tweet'] = test['clean_tweet'].str.lower()
	# import spaCy's language model
	# The 'parser' and 'ner' components are disabled on load.
	# NOTE(review): 'en' is a shortcut model name - confirm the installed spaCy
	# version still resolves it.
	nlp = spacy.load('en', disable=['parser', 'ner'])

	# function to lemmatize text
	def lemmatization(texts):
		"""Lemmatize every string in ``texts`` with the module-level ``nlp``.

		Each text is passed through ``nlp`` and the ``lemma_`` of its tokens
		are joined with single spaces.

		Args:
			texts: Iterable of strings to process.

		Returns:
			A list with one lemmatized string per input text, in input order.
		"""
		output = []
		for text in texts:
			lemmas = [token.lemma_ for token in nlp(text)]
			output.append(' '.join(lemmas))
		return output