Skip to content

Instantly share code, notes, and snippets.

@lettergram
Last active January 3, 2019 22:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lettergram/6a343d4530d4eebe1ce029b45c3c0a14 to your computer and use it in GitHub Desktop.
Save lettergram/6a343d4530d4eebe1ce029b45c3c0a14 to your computer and use it in GitHub Desktop.
# Hyper-parameters: vocabulary size, batch size, padded sequence length,
# number of training epochs, and the largest n-gram order to add as features.
max_words, batch_size, maxlen, epochs, ngram_range = 10000, 32, 500, 5, 2

# Determine the number of categories + default (i.e. sentence types)
num_classes = np.max(y_train) + 1

# Vectorize the output sentence type classifications to Keras readable format
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

if ngram_range > 1:
    # Create set of unique n-grams (n = 2..ngram_range) from the training set
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            ngram_set.update(create_ngram_set(input_list, ngram_value=i))

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_words in order
    # to avoid collision with existing features.
    start_index = max_words + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    # Reverse mapping (integer -> n-gram token).
    indice_token = {index: token for token, index in token_indice.items()}

    # max_words becomes one past the highest integer that can now appear in
    # the dataset. Indices were assigned contiguously from start_index, so
    # that bound is start_index + number of n-grams; computing it this way
    # avoids scanning every key and avoids np.max() failing on an empty
    # sequence when no n-grams were found.
    if token_indice:
        max_words = start_index + len(token_indice)

    # Augmenting x_train and x_test with n-grams features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)

# Pad the input vectors to ensure a consistent length
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment