import os
from collections import Counter

import numpy as np
import spacy
from nltk.tokenize import word_tokenize

import keras
from keras.datasets import imdb
from keras.models import Sequential, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
def pad_vec_sequences(sequences, maxlen=40):
    """Left-pad (or truncate) a list of token-vector sequences to a fixed length."""
    new_sequences = []
    for seq in sequences:
        orig_len, vec_len = np.shape(seq)
        new = np.zeros((maxlen, vec_len))
        if orig_len < maxlen:
            # shorter sequences are left-padded with zero vectors
            new[maxlen - orig_len:, :] = seq
        else:
            # longer sequences are truncated to the first maxlen vectors
            new[:, :] = seq[:maxlen]
        new_sequences.append(new)
    return np.array(new_sequences)
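

# Minimal usage sketch (not part of the original gist): pads two random
# token-vector sequences of different lengths into one 3-D batch tensor.
# The 300-dim width is an assumed embedding size, matching model3() below.
def _demo_pad_vec_sequences():
    seqs = [np.random.rand(5, 300), np.random.rand(60, 300)]
    batch = pad_vec_sequences(seqs, maxlen=40)
    print(batch.shape)  # expected: (2, 40, 300)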
def save_model(model, filename):
    """Serialize a Keras model architecture (JSON) and its weights to disk."""
    model_json = model.to_json()
    with open(filename + '.model', "w") as json_file:
        json_file.write(model_json)
    model.save_weights(filename + ".weights")


def load_model(filename):
    """Rebuild a model from the JSON/weights files written by save_model()."""
    with open(filename + '.model', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(filename + ".weights")
    return loaded_model
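

# Minimal usage sketch (not part of the original gist): round-trips a tiny
# model through save_model()/load_model(); "/tmp/example_model" is a
# hypothetical path used only for illustration.
def _demo_save_load():
    m = Sequential()
    m.add(Dense(2, input_shape=(4,), activation='softmax'))
    m.compile(loss='categorical_crossentropy', optimizer='adam')
    save_model(m, "/tmp/example_model")
    restored = load_model("/tmp/example_model")
    restored.summary()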
def tokenize(line):
    """Split a line into word tokens with NLTK."""
    return word_tokenize(line)


def build_dict(f):
    """Count token frequencies over every line of a text file."""
    dic = Counter()
    with open(f) as fh:
        for sentence in fh:
            dic.update(tokenize(sentence))
    return dic
# LSTM network using pre-trained GloVe word vector embeddings as the
# initialization of the Embedding layer.
def model4():
    labels = []
    text = []

    # read positive samples
    f = "data/tpos.txt"
    print("reading positive samples")
    with open(f) as fh:
        for sentence in fh:
            try:
                # replace non-ASCII characters with spaces
                sentence1 = ''.join([i if ord(i) < 128 else ' ' for i in sentence])
                labels.append(1)
                text.append(sentence1)
            except Exception:
                print("pskip data row", sentence)
                continue

    # read negative samples
    f = "data/tneg.txt"
    print("reading negative samples")
    with open(f) as fh:
        for sentence in fh:
            try:
                sentence1 = ''.join([i if ord(i) < 128 else ' ' for i in sentence])
                labels.append(0)
                text.append(sentence1)
            except Exception:
                print("nskip data row", sentence)
                continue

    # fit a word-index tokenizer on the raw text
    MAX_NB_WORDS = 100
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    word_index = tokenizer.word_index

    MAX_SEQUENCE_LENGTH = 50
    EMBEDDING_DIM = 100

    # load the pre-trained GloVe vectors
    embeddings_index = {}
    f = open(os.path.join('data/', 'glove.6B.100d.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings_index))

    # build the embedding matrix; words missing from the GloVe index stay all-zeros
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(input_dim=len(word_index) + 1,
                                output_dim=EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)

    num_classes = np.max(labels) + 1
    y_train = keras.utils.to_categorical(labels, num_classes)
    sequences = tokenizer.texts_to_sequences(text)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print("Training data: ")
    print(data.shape, y_train.shape)

    model = Sequential()
    model.add(embedding_layer)
    model.add(LSTM(128, return_sequences=False))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(data, y_train, epochs=20, batch_size=100, verbose=1)
    save_model(model, "/tmp/model4")
    # no held-out test split is built in this script, so evaluate on the training data
    model.evaluate(data, y_train)
# LSTM network trained directly on spaCy word-vector sequences
# (no trainable Embedding layer).
def model3():
    # 'en' is the legacy spaCy model name; newer spaCy versions use e.g. 'en_core_web_md'
    nlp1 = spacy.load('en')

    train_data = []
    Xtrain = []
    labels = []

    # read positive samples
    f = "data/tpos.txt"
    print("reading positive samples")
    with open(f) as fh:
        for sentence in fh:
            row = {}
            t = []
            try:
                # replace non-ASCII characters with spaces, then collect the
                # spaCy vector of every token in the sentence
                sentence1 = ''.join([i if ord(i) < 128 else ' ' for i in sentence])
                doc = nlp1(sentence1)
                for token in doc:
                    t.append(token.vector)
                row['text'] = sentence
                row['feature'] = t
                row['label'] = 1
                Xtrain.append(t)
                labels.append(1)
                train_data.append(row)
            except Exception:
                print("skip data row", sentence)
                continue

    # read negative samples
    f = "data/tneg.txt"
    print("reading negative samples")
    with open(f) as fh:
        for sentence in fh:
            row = {}
            t = []
            try:
                sentence1 = ''.join([i if ord(i) < 128 else ' ' for i in sentence])
                doc = nlp1(sentence1)
                for token in doc:
                    t.append(token.vector)
                row['text'] = sentence
                row['feature'] = t
                row['label'] = 0
                Xtrain.append(t)
                labels.append(0)
                train_data.append(row)
            except Exception:
                print("skip data row", sentence)
                continue

    # pad every vector sequence to a fixed length and one-hot encode the labels
    max_words = 50
    num_classes = np.max(labels) + 1
    X_train = pad_vec_sequences(Xtrain, maxlen=max_words)
    y_train = keras.utils.to_categorical(labels, num_classes)
    print("Training data: ")
    print(X_train.shape, y_train.shape)

    # input width is taken from the data so it matches the spaCy model's vector size
    model = Sequential()
    model.add(LSTM(128, return_sequences=False, input_shape=(max_words, X_train.shape[2])))
    model.add(Dense(num_classes, activation='softmax'))
    print(model.summary())
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=20, batch_size=100, verbose=1)
    save_model(model, "/tmp/model3")
    # no held-out test split is built in this script, so evaluate on the training data
    model.evaluate(X_train, y_train)
    print(train_data)
# LSTM on the IMDB word-index sequences, with a trainable Embedding layer
def model2():
    (X_train, y_train), (X_test, y_test) = imdb.load_data()
    print("Training data: ")
    print(X_train.shape, y_train.shape)
    print("Testing data: ")
    print(X_test.shape, y_test.shape)

    num_classes = np.max(y_train) + 1
    print(num_classes, 'classes')
    # +1 because Embedding's input_dim must exceed the largest word index
    vocab_size = np.max(np.hstack(X_train)) + 1
    print(vocab_size, 'vocab size')

    # pad/truncate every review to a fixed number of word indices
    max_words = 50
    X_train = sequence.pad_sequences(X_train, maxlen=max_words, padding="post", truncating="post")
    X_test = sequence.pad_sequences(X_test, maxlen=max_words, padding="post", truncating="post")

    print('Convert class vector to binary class matrix '
          '(for use with categorical_crossentropy)')
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)

    print("input tensor")
    print(X_train.shape, y_train.shape)

    # word indices -> 300-dim learned embeddings -> LSTM -> softmax classifier
    model = Sequential()
    model.add(Embedding(vocab_size, 300, input_length=max_words))
    model.add(LSTM(128, return_sequences=False))
    model.add(Dense(num_classes, activation='softmax'))
    print(model.summary())
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=1000, batch_size=100, verbose=1)
    save_model(model, "/tmp/model2")
    model.evaluate(X_test, y_test)
# LSTM fed the raw padded word indices directly (no Embedding layer):
# each time step is a single scalar index
def model1():
    (X_train, y_train), (X_test, y_test) = imdb.load_data()
    print("Training data: ")
    print(X_train.shape, y_train.shape)
    print("Testing data: ")
    print(X_test.shape, y_test.shape)

    num_classes = np.max(y_train) + 1
    print(num_classes, 'classes')
    vocab_size = np.max(np.hstack(X_train))
    print(vocab_size, 'vocab size')

    max_words = 50
    X_train = sequence.pad_sequences(X_train, maxlen=max_words, padding="post", truncating="post")
    X_test = sequence.pad_sequences(X_test, maxlen=max_words, padding="post", truncating="post")

    print('Convert class vector to binary class matrix '
          '(for use with categorical_crossentropy)')
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)

    # the LSTM expects 3-D input, so reshape to (samples, time steps, 1 feature)
    X_train = np.reshape(X_train, (X_train.shape[0], max_words, 1))
    X_test = np.reshape(X_test, (X_test.shape[0], max_words, 1))
    print("input tensor")
    print(X_train.shape, y_train.shape)

    model = Sequential()
    model.add(LSTM(128, return_sequences=False, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dense(num_classes, activation='softmax'))
    print(model.summary())
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=1000, batch_size=100, verbose=1)
    save_model(model, "/tmp/model1")
    model.evaluate(X_test, y_test)
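

# Minimal inference sketch (not part of the original gist): reload the model
# saved by model4() and score one dummy padded sequence. Real text would need
# to be indexed with a tokenizer fitted on the same corpus, which this script
# does not persist, so the all-zeros input here is purely illustrative.
def _demo_model4_inference():
    loaded = load_model("/tmp/model4")
    dummy = np.zeros((1, 50), dtype='int32')  # MAX_SEQUENCE_LENGTH = 50
    print(loaded.predict(dummy))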
if __name__ == "__main__":
    model4()