pi19404/lstm3.py
Created January 31, 2018
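# Intent classification on the ATIS dataset: a Keras Sequential model with a
# GloVe-initialised Embedding layer, an LSTM encoder and a softmax output.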
import os

import numpy as np
import keras
from keras.models import Sequential, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
def save_model(model, filename):
    """Serialize the model architecture to JSON and its weights to a separate file."""
    model_json = model.to_json()
    with open(filename + '.model', 'w') as json_file:
        json_file.write(model_json)
    model.save_weights(filename + ".weights")
def load_model(filename):
    """Rebuild a model from its JSON architecture file and saved weights."""
    with open(filename + '.model', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(filename + ".weights")
    return loaded_model
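# processCSVData below expects ATIS-style lines, one example per line, e.g.
# (illustrative, simplified slot names):
#   BOS flights from boston to denver EOS O O B-fromloc O B-toloc atis_flight
# The utterance sits between BOS and EOS, slot tags follow EOS, and the last
# token on the line is the intent label.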
def processCSVData(filename):
    """Parse ATIS-style lines of the form 'BOS <words> EOS <slot tags> <intent>'."""
    output = []
    labels = []
    with open(filename, 'r') as f:
        for rowx in f:
            r = {}
            words = rowx.split()
            intent_name = words[-1]
            words = words[:-1]
            text = ""
            tags = ""
            state = -1
            for w in words:
                if w == "BOS":        # start of the utterance text
                    state = 0
                    continue
                if w == "EOS":        # slot tags follow the EOS marker
                    state = 1
                    continue
                if state == 0:
                    text = text + " " + w
                elif state == 1:
                    tags = tags + " " + w
            r['text'] = text
            r['tags'] = tags
            r['intent'] = intent_name
            labels.append(intent_name)
            output.append(r)
    labels = np.unique(labels)
    return output, labels
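# getEmbeddingLayer below reads pretrained 100-dimensional GloVe vectors;
# download glove.6B.100d.txt (https://nlp.stanford.edu/projects/glove/)
# into the data/ directory before running.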
def getEmbeddingLayer(EMBEDDING_DIM):
    """Load GloVe vectors and build a Keras Embedding layer seeded with them.

    Uses the module-level MAX_NB_WORDS and MAX_SEQUENCE_LENGTH constants.
    """
    embeddings_index = {}
    count = 0
    words = []
    with open(os.path.join('data/', 'glove.6B.100d.txt')) as f:
        for line in f:
            values = line.split()
            word = values[0]
            words.append(word)
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
            count = count + 1
    # Fit the tokenizer on the GloVe vocabulary so every known word gets an index.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(words)
    word_index = tokenizer.word_index
    print("total GloVe words:", count, "tokenizer vocabulary:", len(word_index))
    # Rows of the matrix are GloVe vectors; words without a vector stay all-zeros.
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    # trainable=True lets the GloVe weights be fine-tuned during training;
    # pass trainable=False instead to keep the pretrained vectors frozen.
    embedding_layer = Embedding(input_dim=len(word_index) + 1,
                                output_dim=EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    return tokenizer, embedding_layer
def create_embedded_model(embedding_layer, num_classes, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM):
    """Stack the pretrained embedding, an LSTM encoder and a softmax intent classifier."""
    model = Sequential()
    model.add(embedding_layer)
    # The embedding layer already fixes the input shape; return_sequences=False
    # makes the LSTM emit only its final hidden state for the classifier.
    model.add(LSTM(128, return_sequences=False))
    model.add(Dense(num_classes, activation='softmax'))
    print(model.summary())
    return model
output,lables=processCSVData("atis.txt")
#print output[0],lables
print "number of samples",len(output)
print "number of intent",len(lables)
EMBEDDING_DIM = 100
MAX_NB_WORDS = 100000
MAX_SEQUENCE_LENGTH = 50
text=[]
labels=[]
for x in output:
text.append(x['text'])
labels.append(x['intent'])
num_classes=np.unique(labels)
print('Found %s texts.' % len(text))
print "number of classes",len(num_classes)
tokenizer,embedding_layer=getEmbeddingLayer(EMBEDDING_DIM)
sequences = tokenizer.texts_to_sequences(text)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
uniques, ids = np.unique(labels, return_inverse=True)
y_train = keras.utils.to_categorical(ids, len(uniques))
print "Training data"
print data.shape,y_train.shape
model=create_embedded_model(embedding_layer,y_train.shape[1],MAX_SEQUENCE_LENGTH,EMBEDDING_DIM)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(data, y_train, epochs=20, batch_size=100, verbose=1)
save_model(model, "/tmp/model6")