Prateek Joshi (prateekjoshi565)
@prateekjoshi565
prateekjoshi565 / NMT.py
Created February 2, 2019 07:09
Neural Machine Translation using Keras
import re
import string
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
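These gists predate TensorFlow 2, so they import from the standalone keras package. A sketch of the equivalent imports under TF's bundled Keras, as an assumption for newer environments (the keras.preprocessing text and sequence modules have since been deprecated, so these paths may warn or move again):

# TensorFlow 2 equivalents of the standalone-Keras imports above
# (an assumption for newer environments, not part of the original gists)
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (Dense, LSTM, Embedding, Bidirectional,
                                     RepeatVector, TimeDistributed)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint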
@prateekjoshi565
prateekjoshi565 / import_libraries_MT.py
Created February 6, 2019 09:21
import libraries for Machine Translation
import string
import re
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
@prateekjoshi565
prateekjoshi565 / func_read_text.py
Last active May 23, 2023 14:33
function to read raw text file
# function to read raw text file
def read_text(filename):
    # open the file
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    file.close()
    return text
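Typical usage, assuming the tab-separated English-German sentence-pairs file this tutorial series works from (the filename deu.txt is an assumption; point it at your copy of the dataset):

# read the raw parallel corpus into one string (filename is an assumption)
data = read_text("deu.txt")
print(data[:100])  # quick sanity check on the raw contents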
@prateekjoshi565
prateekjoshi565 / func_split_text.py
Created February 6, 2019 09:26
split text into sentences
# split text into sentences
def to_lines(text):
    sents = text.strip().split('\n')
    sents = [i.split('\t') for i in sents]
    return sents
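The preprocessing step that follows slices deu_eng with NumPy-style indexing (deu_eng[:,0]), so the nested list returned by to_lines has to become a NumPy array first; a sketch using the array already imported from numpy (the 50,000-pair cap is an assumption to keep training time reasonable):

# split raw text into [english, german] pairs and convert to a NumPy array
# so that column slicing like deu_eng[:,0] works below
deu_eng = array(to_lines(data))
deu_eng = deu_eng[:50000, :]  # optional cap on dataset size (assumption)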
@prateekjoshi565
prateekjoshi565 / text_preprocessing.py
Created February 6, 2019 09:36
text preprocessing
# Remove punctuation
deu_eng[:,0] = [s.translate(str.maketrans('', '', string.punctuation)) for s in deu_eng[:,0]]
deu_eng[:,1] = [s.translate(str.maketrans('', '', string.punctuation)) for s in deu_eng[:,1]]
# convert text to lowercase
for i in range(len(deu_eng)):
    deu_eng[i,0] = deu_eng[i,0].lower()
    deu_eng[i,1] = deu_eng[i,1].lower()
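What the translation table does, on one sentence: str.maketrans('', '', string.punctuation) maps every punctuation character to None, so translate() deletes them all in a single pass.

# demonstrate the punctuation-stripping and lowercasing steps
sample = "Hi, there!"
print(sample.translate(str.maketrans('', '', string.punctuation)).lower())
# -> hi there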
@prateekjoshi565
prateekjoshi565 / sequence_length.py
Last active February 6, 2019 09:39
Text preprocessing 2
# empty lists
eng_l = []
deu_l = []
# populate the lists with sentence lengths
for i in deu_eng[:,0]:
    eng_l.append(len(i.split()))
for i in deu_eng[:,1]:
    deu_l.append(len(i.split()))
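These lists exist to justify a fixed padding length for each language; one way to inspect them, using the pandas import from above (the histogram call is optional and needs matplotlib):

# summarise sentence lengths to choose a maximum sequence length
length_df = pd.DataFrame({'eng': eng_l, 'deu': deu_l})
print(length_df.describe())  # the max / upper percentiles motivate the cut-off
length_df.hist(bins=30)      # optional visual check (requires matplotlib)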
@prateekjoshi565
prateekjoshi565 / sequence_prep.py
Created February 6, 2019 09:44
Sequence preparation
# function to build a tokenizer
def tokenization(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
# prepare english tokenizer
eng_tokenizer = tokenization(deu_eng[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = 8
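The data-preparation gist below references deu_tokenizer and deu_length, which never appear in these snippets; the natural mirror of the English block above fills the gap (setting deu_length to 8 as well is an assumption, matching eng_length):

# prepare German tokenizer, mirroring the English one above
deu_tokenizer = tokenization(deu_eng[:, 1])
deu_vocab_size = len(deu_tokenizer.word_index) + 1
deu_length = 8  # assumed equal to eng_length; tune from the length stats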
@prateekjoshi565
prateekjoshi565 / encode_sequence.py
Created February 6, 2019 09:45
Encode Sequences
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    seq = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    seq = pad_sequences(seq, maxlen=length, padding='post')
    return seq
from sklearn.model_selection import train_test_split
# split data into train and test set
train, test = train_test_split(deu_eng, test_size=0.2, random_state=12)
@prateekjoshi565
prateekjoshi565 / data_prep.py
Last active February 6, 2019 09:50
Data preparation
# prepare training data
trainX = encode_sequences(deu_tokenizer, deu_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
# prepare test data
testX = encode_sequences(deu_tokenizer, deu_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
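The layers imported at the top (Embedding, LSTM, RepeatVector, Dense) and the ModelCheckpoint callback point at the encoder-decoder model these arrays feed. A minimal sketch of that architecture; the unit count, optimizer, loss, epoch count and checkpoint filename are assumptions, not taken from the gists:

# minimal seq2seq sketch assembled from the layers imported above
# (hyperparameters below are illustrative assumptions)
def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    model = Sequential()
    model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
    model.add(LSTM(units))                         # encoder: sentence -> fixed vector
    model.add(RepeatVector(out_timesteps))         # repeat it once per output step
    model.add(LSTM(units, return_sequences=True))  # decoder
    model.add(Dense(out_vocab, activation='softmax'))
    return model

model = define_model(deu_vocab_size, eng_vocab_size, deu_length, eng_length, 512)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

# sparse categorical targets need a trailing axis of size 1
checkpoint = ModelCheckpoint('nmt_model.h5', monitor='val_loss', save_best_only=True)
model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
          epochs=30, batch_size=512, validation_split=0.2,
          callbacks=[checkpoint])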