For use on https://austingwalters.com
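The snippet below calls create_ngram_set and add_ngram without defining them. The code follows Keras' imdb_fasttext.py example, so a minimal sketch of both helpers in that example's style is given here; treat it as an assumed implementation rather than the gist author's exact version.

def create_ngram_set(input_list, ngram_value=2):
    """Extract the set of n-grams of length ngram_value from a list of token ids."""
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))

def add_ngram(sequences, token_indice, ngram_range=2):
    """Append the integer ids of known n-grams (up to ngram_range) to each sequence."""
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences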
import numpy as np

import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D
from keras.preprocessing import sequence

# Assumes x_train/x_test (lists of word-index sequences) and
# y_train/y_test (integer sentence-type labels) were prepared earlier.

# Hyperparameters: vocabulary size, batch size, padded sequence length,
# training epochs, and the largest n-gram to add as a feature
max_words, batch_size, maxlen, epochs, ngram_range = 10000, 32, 500, 5, 2
embedding_dims = 50  # not set in the original snippet; 50 matches the Keras fastText example

# Determine the number of categories + default (i.e. sentence types)
num_classes = np.max(y_train) + 1

# Vectorize the output sentence type classifications to Keras-readable (one-hot) format
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

if ngram_range > 1:
    # Create the set of unique n-grams from the training set
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping each n-gram token to a unique integer.
    # Integer values are greater than max_words in order
    # to avoid collisions with existing features.
    start_index = max_words + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_words becomes the highest integer that can be found in the dataset
    max_words = np.max(list(indice_token.keys())) + 1

    # Augment x_train and x_test with n-gram features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)

# Pad the input vectors to ensure a consistent length
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

model = Sequential()

# Embedding (input) layer: maps each of the max_words token ids to an embedding_dims vector
model.add(Embedding(max_words, embedding_dims, input_length=maxlen))

# Average pooling layer: averages the embeddings across the sequence
model.add(GlobalAveragePooling1D())

# Output layer: one softmax unit per class (num_classes)
model.add(Dense(num_classes, activation='softmax'))

# Specify the loss function, optimization method, and reported metric
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

# Fit (train) the model using the training data (80% of the dataset)
model.fit(x_train, y_train, batch_size=batch_size,
          epochs=epochs, validation_data=(x_test, y_test))

# Evaluate the trained model using the test data (20% of the dataset)
score = model.evaluate(x_test, y_test, batch_size=batch_size)
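Since the model is compiled with metrics=['accuracy'], score holds the test loss and accuracy in that order; a purely illustrative way to report them:

print('Test loss:', score[0])
print('Test accuracy:', score[1])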