import os
import numpy as np
import keras
from keras.models import Sequential, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
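
# Intent classification on ATIS-style data: each query is mapped to
# pre-trained GloVe word vectors and classified with a single LSTM layer.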
def save_model(model, filename):
    # Serialize the architecture as JSON and the weights separately.
    model_json = model.to_json()
    with open(filename + '.model', 'w') as json_file:
        json_file.write(model_json)
    model.save_weights(filename + '.weights')

def load_model(filename):
    # Rebuild the architecture from JSON, then restore the weights.
    with open(filename + '.model', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(filename + '.weights')
    return loaded_model
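
# Example round trip (illustrative; the path is arbitrary):
#   save_model(model, '/tmp/intent_model')
#   restored = load_model('/tmp/intent_model')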
def processCSVData(filename):
    # Each line is expected to look like:
    #   BOS <query words...> EOS <slot tags...> <intent>
    # The words between BOS and EOS form the query text; the final token
    # is the intent label. Note that the EOS marker itself ends up at the
    # start of the tag string (the state flips before the append check).
    output = []
    labels = []
    with open(filename, 'r') as f:
        for row in f:
            r = {}
            words = row.split()
            intent_name = words[-1]
            words = words[:-1]
            text = ''
            tags = ''
            state = -1  # -1: before BOS, 0: inside the query, 1: in the tag section
            for w in words:
                if w == 'EOS':
                    state = 1
                if state == 0:
                    text = text + ' ' + w
                if state == 1:
                    tags = tags + ' ' + w
                if w == 'BOS':
                    state = 0
            r['text'] = text
            r['tags'] = tags
            r['intent'] = intent_name
            labels.append(intent_name)
            output.append(r)
    labels = np.unique(labels)
    return output, labels
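
# A hypothetical input line, inferred from the parser above (the tag/token
# layout is illustrative, not copied from the real file):
#   BOS show flights from boston EOS O O O B-fromloc.city_name atis_flight
# -> text: ' show flights from boston', intent: 'atis_flight'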
def getEmbeddingLayer(EMBEDDING_DIM):
    # Parse the pre-trained GloVe vectors: one word followed by its
    # EMBEDDING_DIM coefficients per line.
    embeddings_index = {}
    words = []
    count = 0
    with open(os.path.join('data/', 'glove.6B.100d.txt')) as f:
        for line in f:
            values = line.split()
            word = values[0]
            words.append(word)
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
            count = count + 1
    # Fit the tokenizer on the GloVe vocabulary itself, so every GloVe word
    # gets an index. MAX_NB_WORDS and MAX_SEQUENCE_LENGTH are module-level
    # globals defined before this function is called.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(words)
    word_index = tokenizer.word_index
    print('total word embeddings:', count, len(word_index))
    # Row i holds the vector for the word with index i; words not found in
    # the embedding index stay all-zeros.
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    embedding_layer = Embedding(input_dim=len(word_index) + 1,
                                output_dim=EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    return tokenizer, embedding_layer
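
# Design note: trainable=True lets the GloVe weights be fine-tuned during
# training; trainable=False would freeze them instead, which is often safer
# on small datasets.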
def create_embedded_model(embedding_layer, num_classes, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM):
    # Embedding -> single LSTM (final state only) -> softmax over intents.
    model = Sequential()
    model.add(embedding_layer)
    # return_sequences=False: only the last hidden state feeds the classifier.
    model.add(LSTM(128, return_sequences=False,
                   input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)))
    model.add(Dense(num_classes, activation='softmax'))
    print(model.summary())
    return model
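
# Shape sanity check (illustrative; assumes the globals defined below):
#   m = create_embedded_model(embedding_layer, 5, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
#   m.predict(np.zeros((1, MAX_SEQUENCE_LENGTH)))  # -> probabilities, shape (1, 5)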
output, intents = processCSVData('atis.txt')
# print(output[0], intents)
print('number of samples', len(output))
print('number of intents', len(intents))

EMBEDDING_DIM = 100
MAX_NB_WORDS = 100000
MAX_SEQUENCE_LENGTH = 50

text = []
labels = []
for x in output:
    text.append(x['text'])
    labels.append(x['intent'])
unique_labels = np.unique(labels)
print('Found %s texts.' % len(text))
print('number of classes', len(unique_labels))

tokenizer, embedding_layer = getEmbeddingLayer(EMBEDDING_DIM)
# Convert each query to a sequence of word indices, padded to a fixed length.
sequences = tokenizer.texts_to_sequences(text)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# One-hot encode the intent labels.
uniques, ids = np.unique(labels, return_inverse=True)
y_train = to_categorical(ids, len(uniques))
print('Training data')
print(data.shape, y_train.shape)

model = create_embedded_model(embedding_layer, y_train.shape[1], MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(data, y_train, epochs=20, batch_size=100, verbose=1)
save_model(model, '/tmp/model6')
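
# Inference sketch (hypothetical query; reuses the tokenizer and label
# array built above):
#   loaded = load_model('/tmp/model6')
#   seq = tokenizer.texts_to_sequences(['show me morning flights to denver'])
#   padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
#   print(uniques[np.argmax(loaded.predict(padded))])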