This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import string | |
from numpy import array, argmax, random, take | |
import pandas as pd | |
from keras.models import Sequential | |
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed | |
from keras.preprocessing.text import Tokenizer | |
from keras.callbacks import ModelCheckpoint | |
from keras.preprocessing.sequence import pad_sequences | |
from keras.models import load_model |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
import re | |
from numpy import array, argmax, random, take | |
import pandas as pd | |
from keras.models import Sequential | |
from keras.layers import Dense, LSTM, Embedding, RepeatVector | |
from keras.preprocessing.text import Tokenizer | |
from keras.callbacks import ModelCheckpoint | |
from keras.preprocessing.sequence import pad_sequences | |
from keras.models import load_model |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# function to read a raw text file
def read_text(filename):
    """Read the entire contents of a UTF-8 text file.

    Parameters
    ----------
    filename : str
        Path to the file to read.

    Returns
    -------
    str
        The full file contents.
    """
    # 'with' guarantees the handle is closed even if read() raises,
    # unlike the original open()/close() pair.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# split raw text into tab-separated sentence pairs
def to_lines(text):
    """Split *text* into lines, then split each line on tabs.

    Returns a list of lists: one inner list of fields per input line.
    """
    return [line.split('\t') for line in text.strip().split('\n')]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove punctuation and lowercase both columns of the sentence-pair array.
# Build the translation table ONCE — the original rebuilt it per sentence.
_punct_table = str.maketrans('', '', string.punctuation)
# Column 0 = English, column 1 = German; stripping punctuation and then
# lowercasing in one pass gives the same result as the two separate passes.
deu_eng[:, 0] = [s.translate(_punct_table).lower() for s in deu_eng[:, 0]]
deu_eng[:, 1] = [s.translate(_punct_table).lower() for s in deu_eng[:, 1]]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Token counts per sentence, used to inspect the length distribution.
# Comprehensions replace the original append loops (same values, same order).
eng_l = [len(s.split()) for s in deu_eng[:, 0]]  # English lengths (column 0)
deu_l = [len(s.split()) for s in deu_eng[:, 1]]  # German lengths (column 1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# helper that builds a fitted tokenizer
def tokenization(lines):
    """Fit a Keras ``Tokenizer`` on *lines* and return it."""
    tok = Tokenizer()
    tok.fit_on_texts(lines)
    return tok
# Fit the English-side tokenizer on column 0 of the sentence pairs.
eng_tokenizer = tokenization(deu_eng[:, 0])
# +1 because Keras reserves index 0 for padding.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
# Fixed English sequence length used when padding below.
eng_length = 8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# turn raw sentences into fixed-length integer sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode *lines* with *tokenizer*, then post-pad with zeros
    to exactly *length* timesteps."""
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentence pairs for evaluation; the fixed seed
# makes the split reproducible across runs.
train, test = train_test_split(deu_eng, test_size=0.2, random_state=12)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Encode/pad both sides of each split: column 1 (German) is the model
# input, column 0 (English) is the target.
# NOTE(review): deu_tokenizer and deu_length are defined elsewhere in the
# file (not visible in this chunk) — presumably mirroring the English setup.
trainX = encode_sequences(deu_tokenizer, deu_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])

# Validation data, encoded the same way.
testX = encode_sequences(deu_tokenizer, deu_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
OlderNewer