from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers import Embedding
from keras.layers import AveragePooling1D
from keras.datasets import imdb

def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))

def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of lists (sequences) by appending n-gram values.

    Example: adding bi-grams
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    Example: adding tri-grams
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        # Iterate n-gram sizes in the outer loop so that, for each size,
        # every window over the sequence is checked; windows that include
        # already-appended n-gram ids can never match, because token_indice
        # keys are built only from original word indices.
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences

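# Quick sanity check of add_ngram (illustrative addition, not part of the
# training pipeline): with ngram_range=2, every adjacent pair present in
# token_indice is appended to the sequence.
assert add_ngram([[1, 3, 4, 5]], {(1, 3): 1337, (4, 5): 2017}, 2) == [[1, 3, 4, 5, 1337, 2017]]
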
# Set parameters:
# ngram_range = 2 will add bi-gram features
ngram_range = 1
max_features = 20000
maxlen = 400
batch_size = 32
embedding_dims = 50
nb_epoch = 5

print('Loading data...')
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int)))

if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create a set of unique n-grams from the training set.
    ngram_set = set()
    for input_list in X_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping each n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    # Augment X_train and X_test with n-gram features.
    X_train = add_ngram(X_train, token_indice, ngram_range)
    X_test = add_ngram(X_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int)))

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Build model...')
model = Sequential()

# We start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions.
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

# We add an AveragePooling1D, which will average the embeddings
# of all words in the document.
model.add(AveragePooling1D(pool_length=model.output_shape[1]))

# We flatten the output of the AveragePooling1D layer.
model.add(Flatten())

# We project onto a single-unit output layer and squash it with a sigmoid:
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
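
# Optional: print the layer stack and parameter counts to verify the
# Embedding -> AveragePooling1D -> Flatten -> Dense wiring.
model.summary()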

model.fit(X_train, y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
          validation_data=(X_test, y_test))
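
# Report a final held-out score. evaluate() returns [loss, accuracy] here
# because the model was compiled with metrics=['accuracy'].
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test loss:', score)
print('Test accuracy:', acc)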

from yhat import Yhat, YhatModel


class KerasTest(YhatModel):
    REQUIREMENTS = [
        "Keras==1.1.1",
        "numpy==1.11.2"
    ]

    def execute(self, data):
        data = np.array([data['x']])
        return {"prob": model.predict_proba(data)[0].tolist()}


# Smoke-test the model locally before deploying.
xtest = {"x": X_train[0].tolist()}
kt = KerasTest()
kt.execute(xtest)

# Fill in your Yhat username, API key, and server URL here.
yh = Yhat(USERNAME, APIKEY, URL)
yh.deploy("KerasTest", KerasTest, globals(), sure=True)
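
# Once deployed, the model can be scored remotely. predict() is assumed here
# from the yhat client's API: it posts the payload to the deployed
# "KerasTest" endpoint and returns the JSON response.
print(yh.predict("KerasTest", xtest))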