Last active
January 3, 2019 22:02
-
-
Save lettergram/6a343d4530d4eebe1ce029b45c3c0a14 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hyperparameters for this preprocessing section.
#   max_words   - base vocabulary bound; n-gram ids are allocated above it
#   maxlen      - length every input sequence is padded/truncated to
#   ngram_range - highest n-gram order to add (values <= 1 add no n-grams)
#   batch_size, epochs - not used in this section; presumably consumed by the
#                        training code elsewhere in the file
max_words, batch_size, maxlen, epochs, ngram_range = 10000, 32, 500, 5, 2

# Determine the number of categories + default (i.e. sentence types).
# Labels are assumed to be non-negative integers, so the class count is the
# largest label seen in the training set plus one.
num_classes = np.max(y_train) + 1

# Vectorize the output sentence type classifications to Keras readable format
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

if ngram_range > 1:
    # Create set of unique n-grams (orders 2..ngram_range) from the training
    # set only.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            ngram_set.update(create_ngram_set(input_list, ngram_value=i))

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_words in order
    # to avoid collision with existing features.
    start_index = max_words + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_words becomes one past the highest integer that could be found in
    # the dataset. Guarded because np.max raises ValueError on an empty
    # sequence, which happens when no n-grams were collected (for example,
    # every training sequence is shorter than two tokens); in that case the
    # vocabulary bound is left unchanged.
    if indice_token:
        max_words = np.max(list(indice_token)) + 1

    # Augmenting x_train and x_test with n-grams features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)

# Pad the input vectors to ensure a consistent length
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment