# LSTM sentiment-classification experiments in Keras: IMDB word indices,
# spaCy word vectors, and GloVe-initialized embedding layers.
import os

import numpy as np
import keras
import spacy
from collections import Counter
from nltk.tokenize import word_tokenize

from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, model_from_json
from keras.layers import Dense, Embedding, LSTM
def pad_vec_sequences(sequences, maxlen=40):
    """Pre-pad (or truncate) a list of variable-length word-vector sequences
    to a fixed length, returning an array of shape (n, maxlen, vec_len)."""
    new_sequences = []
    for seq in sequences:
        orig_len, vec_len = np.shape(seq)
        new = np.zeros((maxlen, vec_len))
        if orig_len < maxlen:
            # shorter sequences: zero-pad at the front, keep the vectors at the end
            new[maxlen - orig_len:, :] = seq
        else:
            # longer sequences: keep only the first maxlen vectors
            new[:, :] = seq[:maxlen]
        new_sequences.append(new)
    return np.array(new_sequences)
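
# Illustrative usage of pad_vec_sequences (not part of the original gist): two
# "sentences" of 3 and 5 token vectors, each 4-dimensional, padded/truncated
# to maxlen=4.
#   >>> seqs = [np.random.rand(3, 4), np.random.rand(5, 4)]
#   >>> pad_vec_sequences(seqs, maxlen=4).shape
#   (2, 4, 4)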
def save_model(model, filename):
    # serialize the architecture to JSON and the weights to a separate file
    model_json = model.to_json()
    with open(filename + '.model', "w") as json_file:
        json_file.write(model_json)
    model.save_weights(filename + ".weights")

def load_model(filename):
    # rebuild the architecture from JSON, then restore the weights
    with open(filename + '.model', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(filename + ".weights")
    return loaded_model
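
# Round-trip sketch (illustrative, not part of the original gist): a model saved
# with save_model() can be restored with load_model() and used for prediction
# without recompiling. 'some_padded_inputs' is a hypothetical placeholder.
#   >>> save_model(model, "/tmp/example")       # writes /tmp/example.model + .weights
#   >>> restored = load_model("/tmp/example")
#   >>> restored.predict(some_padded_inputs)    # same outputs as the original model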
def build_dict(f, grams):
    # count token occurrences over a file with one sentence per line
    # (the 'grams' argument is kept from the original signature, but n-grams
    # are not actually built here)
    dic = Counter()
    for sentence in open(f):
        dic.update(tokenize(sentence))
    return dic

def tokenize(line):
    # simple word-level tokenization via NLTK
    return word_tokenize(line)
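
# Illustrative usage (not part of the original gist): count word frequencies in
# the positive-review file and inspect the most common tokens.
#   >>> counts = build_dict("data/tpos.txt", grams=1)
#   >>> counts.most_common(5)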
# LSTM network using GloVe word-vector embeddings as the initialization of the
# embedding layer.
def model4():
    labels = []
    text = []

    # read positive samples, stripping non-ASCII characters
    f = "data/tpos.txt"
    print("reading positive samples")
    for sentence in open(f):
        try:
            sentence1 = ''.join([i if ord(i) < 128 else ' ' for i in sentence])
            labels.append(1)
            text.append(sentence1.strip())
        except Exception:
            print("pskip data row", sentence)
            continue

    # read negative samples
    f = "data/tneg.txt"
    print("reading negative samples")
    for sentence in open(f):
        try:
            sentence1 = ''.join([i if ord(i) < 128 else ' ' for i in sentence])
            labels.append(0)
            text.append(sentence1.strip())
        except Exception:
            print("nskip data row", sentence)
            continue

    # fit a word-index tokenizer on the corpus
    MAX_NB_WORDS = 100
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    word_index = tokenizer.word_index

    MAX_SEQUENCE_LENGTH = 50
    EMBEDDING_DIM = 100

    # load pre-trained GloVe vectors into a word -> vector lookup
    embeddings_index = {}
    f = open(os.path.join('data/', 'glove.6B.100d.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings_index))

    # build the embedding matrix; words not found in the GloVe index stay all-zeros
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(input_dim=len(word_index) + 1,
                                output_dim=EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)

    num_classes = np.max(labels) + 1
    y_train = keras.utils.to_categorical(labels, num_classes)
    sequences = tokenizer.texts_to_sequences(text)
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print("Training data: ")
    print(data.shape, y_train.shape)

    model = Sequential()
    model.add(embedding_layer)
    model.add(LSTM(128, return_sequences=False))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(data, y_train, epochs=20, batch_size=100, verbose=1)
    save_model(model, "/tmp/model4")
    # no held-out split is created here, so evaluate on the training data
    model.evaluate(data, y_train)
# LSTM network fed directly with spaCy word vectors (no embedding layer).
def model3():
    nlp1 = spacy.load('en')
    train_data = []
    Xtrain = []
    labels = []

    # read positive samples and convert each token to its spaCy vector
    f = "data/tpos.txt"
    print("reading positive samples")
    for sentence in open(f):
        row = {}
        t = []
        try:
            sentence1 = ''.join([i if ord(i) < 128 else ' ' for i in sentence])
            doc = nlp1(sentence1)
            for token in doc:
                t.append(token.vector)
            row['text'] = sentence
            row['feature'] = t
            row['label'] = 1
            Xtrain.append(t)
            labels.append(1)
            train_data.append(row)
        except Exception:
            print("skip data row", sentence)
            continue

    # read negative samples
    f = "data/tneg.txt"
    print("reading negative samples")
    for sentence in open(f):
        row = {}
        t = []
        try:
            sentence1 = ''.join([i if ord(i) < 128 else ' ' for i in sentence])
            doc = nlp1(sentence1)
            for token in doc:
                t.append(token.vector)
            row['text'] = sentence
            row['feature'] = t
            row['label'] = 0
            Xtrain.append(t)
            labels.append(0)
            train_data.append(row)
        except Exception:
            print("skip data row", sentence)
            continue

    # pad every sentence to a fixed number of word vectors
    max_words = 50
    num_classes = np.max(labels) + 1
    X_train = pad_vec_sequences(Xtrain, maxlen=max_words)
    y_train = keras.utils.to_categorical(labels, num_classes)
    print("Training data: ")
    print(X_train.shape, y_train.shape)

    # token vectors are assumed to be 300-dimensional, matching input_shape below
    model = Sequential()
    model.add(LSTM(128, return_sequences=False, input_shape=(max_words, 300)))
    model.add(Dense(num_classes, activation='softmax'))
    print(model.summary())
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=20, batch_size=100, verbose=1)
    save_model(model, "/tmp/model3")
    # no held-out split is created here, so evaluate on the training data
    model.evaluate(X_train, y_train)
    # debug dump of the collected rows
    print(train_data)
# LSTM with a trainable embedding layer on the IMDB word-index data.
def model2():
    (X_train, y_train), (X_test, y_test) = imdb.load_data()
    print("Training data: ")
    print(X_train.shape, y_train.shape)
    print("Testing data: ")
    print(X_test.shape, y_test.shape)

    num_classes = np.max(y_train) + 1
    print(num_classes, 'classes')
    # +1 because word indices run from 0 up to the maximum index inclusive
    vocab_size = np.max(np.hstack(X_train)) + 1
    print(vocab_size, 'vocab size')

    # pad/truncate every review to a fixed length
    max_words = 50
    X_train = sequence.pad_sequences(X_train, maxlen=max_words, padding="post", truncating="post")
    X_test = sequence.pad_sequences(X_test, maxlen=max_words, padding="post", truncating="post")

    print('Convert class vector to binary class matrix '
          '(for use with categorical_crossentropy)')
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)
    print("input tensor")
    print(X_train.shape, y_train.shape)

    model = Sequential()
    model.add(Embedding(vocab_size, 300, input_length=max_words))
    model.add(LSTM(128, return_sequences=False))
    model.add(Dense(num_classes, activation='softmax'))
    print(model.summary())
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=1000, batch_size=100, verbose=1)
    save_model(model, "/tmp/model2")
    model.evaluate(X_test, y_test)
# LSTM with no embedding layer: the raw padded word indices are fed to the LSTM
# as a sequence of scalar values (one feature per timestep).
def model1():
    (X_train, y_train), (X_test, y_test) = imdb.load_data()
    print("Training data: ")
    print(X_train.shape, y_train.shape)
    print("Testing data: ")
    print(X_test.shape, y_test.shape)

    num_classes = np.max(y_train) + 1
    print(num_classes, 'classes')
    vocab_size = np.max(np.hstack(X_train)) + 1
    print(vocab_size, 'vocab size')

    max_words = 50
    X_train = sequence.pad_sequences(X_train, maxlen=max_words, padding="post", truncating="post")
    X_test = sequence.pad_sequences(X_test, maxlen=max_words, padding="post", truncating="post")

    print('Convert class vector to binary class matrix '
          '(for use with categorical_crossentropy)')
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)

    # the LSTM expects 3D input (samples, timesteps, features), so add a
    # trailing feature dimension of size 1
    X_train = np.reshape(X_train, (X_train.shape[0], max_words, 1))
    X_test = np.reshape(X_test, (X_test.shape[0], max_words, 1))
    print("input tensor")
    print(X_train.shape, y_train.shape)

    model = Sequential()
    model.add(LSTM(128, return_sequences=False, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dense(num_classes, activation='softmax'))
    print(model.summary())
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=1000, batch_size=100, verbose=1)
    save_model(model, "/tmp/model1")
    model.evaluate(X_test, y_test)


model4()
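
# Illustrative inference sketch (not part of the original gist). It assumes
# model4() has already been run (so '/tmp/model4' exists) and that a Tokenizer
# has been re-fitted on the same 'text' corpus with the same settings, since
# the fitted tokenizer is not persisted above.
#   >>> clf = load_model("/tmp/model4")
#   >>> seqs = tokenizer.texts_to_sequences(["what a wonderful movie"])
#   >>> padded = pad_sequences(seqs, maxlen=50)   # must match MAX_SEQUENCE_LENGTH
#   >>> clf.predict(padded)                       # class probabilities, positive = index 1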