Prateek Joshi (prateekjoshi565)
import_libraries_MT.py (created Feb 6, 2019)
Import libraries for machine translation
import string
import re
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
func_read_text.py (last active Feb 6, 2019)
Function to read a raw text file
# function to read raw text file
def read_text(filename):
    # open the file
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    file.close()
    return text
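A usage sketch for this helper; the filename is an assumption (a tab-separated German-English sentence-pairs file, commonly distributed as deu.txt):

# assumed usage: load a tab-separated German-English pairs file;
# the filename "deu.txt" is an assumption, not from the gists
data = read_text("deu.txt")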
func_split_text.py
# split the text into lines, then each line into a tab-separated sentence pair
def to_lines(text):
    sents = text.strip().split('\n')
    sents = [i.split('\t') for i in sents]
    return sents
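The later gists index deu_eng as a NumPy array, so a conversion step is implied between this helper and the preprocessing below; a minimal sketch, assuming the data variable from the usage sketch above:

# assumed glue: split the raw text into pairs and convert to a NumPy array
deu_eng = array(to_lines(data))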
text_preprocessing.py
# remove punctuation
deu_eng[:,0] = [s.translate(str.maketrans('', '', string.punctuation)) for s in deu_eng[:,0]]
deu_eng[:,1] = [s.translate(str.maketrans('', '', string.punctuation)) for s in deu_eng[:,1]]

# convert text to lowercase
for i in range(len(deu_eng)):
    deu_eng[i,0] = deu_eng[i,0].lower()
    deu_eng[i,1] = deu_eng[i,1].lower()
sequence_length.py
# empty lists
eng_l = []
deu_l = []

# populate the lists with sentence lengths
for i in deu_eng[:,0]:
    eng_l.append(len(i.split()))

for i in deu_eng[:,1]:
    deu_l.append(len(i.split()))
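To see why a fixed padding length of 8 is reasonable, these lengths can be summarized; a sketch using the pandas import from the first gist (this inspection step itself is an assumption):

# assumed inspection step: summarize sentence lengths before fixing padding
length_df = pd.DataFrame({'eng': eng_l, 'deu': deu_l})
print(length_df.describe())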
sequence_prep.py
# function to build a tokenizer
def tokenization(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# prepare english tokenizer
eng_tokenizer = tokenization(deu_eng[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = 8  # fixed maximum English sequence length
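The data-preparation gist below references deu_tokenizer and deu_length, which this gist does not define; a matching German tokenizer, mirroring the English one above (the length of 8 is an assumption chosen to match eng_length):

# prepare german tokenizer (assumed, mirroring the English preparation)
deu_tokenizer = tokenization(deu_eng[:, 1])
deu_vocab_size = len(deu_tokenizer.word_index) + 1
deu_length = 8  # assumed to match eng_length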
encode_sequence.py
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    seq = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    seq = pad_sequences(seq, maxlen=length, padding='post')
    return seq
split_data.py
from sklearn.model_selection import train_test_split

# split data into train and test sets
train, test = train_test_split(deu_eng, test_size=0.2, random_state=12)
data_prep.py
# prepare training data (column 1 = German source, column 0 = English target)
trainX = encode_sequences(deu_tokenizer, deu_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])

# prepare test data
testX = encode_sequences(deu_tokenizer, deu_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
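A quick sanity check on the encoded arrays (a sketch; the exact row counts depend on the split):

# each encoded array should have shape (num_pairs, fixed_length)
print(trainX.shape, trainY.shape)  # e.g. (N, 8) and (N, 8)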
model_architecture.py
# build NMT model
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, n):
    model = Sequential()
    # encoder: embed the source sequence and compress it to a fixed-size vector
    model.add(Embedding(in_vocab, n, input_length=in_timesteps, mask_zero=True))
    model.add(LSTM(n))
    # decoder: repeat the encoding once per output timestep and generate tokens
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(n, return_sequences=True))
    model.add(Dense(out_vocab, activation='softmax'))
    return model
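The ModelCheckpoint import in the first gist suggests the model is trained with checkpointing; a minimal compile-and-fit sketch, in which the optimizer, unit count, epochs, batch size, and checkpoint filename are all assumptions rather than values taken from the gists:

# assumed training setup, not part of the original gists
model = build_model(deu_vocab_size, eng_vocab_size, deu_length, eng_length, 512)

# sparse_categorical_crossentropy lets the integer-encoded targets be used
# directly, reshaped to (samples, timesteps, 1)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

checkpoint = ModelCheckpoint('nmt_model.h5', monitor='val_loss',
                             save_best_only=True, mode='min', verbose=1)
history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
                    epochs=30, batch_size=512,
                    validation_split=0.2, callbacks=[checkpoint])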