For use on https://austingwalters.com
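The snippet below calls create_ngram_set and add_ngram without defining them. The code follows Keras' imdb_fasttext.py example, so a minimal sketch of both helpers in that example's style is given here; treat it as an assumed implementation rather than the gist author's exact version.

def create_ngram_set(input_list, ngram_value=2):
    """Extract the set of n-grams of length ngram_value from a list of token ids."""
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))

def add_ngram(sequences, token_indice, ngram_range=2):
    """Append the integer ids of known n-grams (up to ngram_range) to each sequence."""
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences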
import numpy as np

import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D
from keras.preprocessing import sequence

# Assumes x_train/x_test (lists of word-index sequences) and
# y_train/y_test (integer sentence-type labels) were prepared earlier.

# Hyperparameters: vocabulary size, batch size, padded sequence length,
# training epochs, and the largest n-gram to add as a feature
max_words, batch_size, maxlen, epochs, ngram_range = 10000, 32, 500, 5, 2
embedding_dims = 50  # not set in the original snippet; 50 matches the Keras fastText example

# Determine the number of categories + default (i.e. sentence types)
num_classes = np.max(y_train) + 1

# Vectorize the output sentence type classifications to Keras-readable (one-hot) format
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

if ngram_range > 1:
    # Create the set of unique n-grams from the training set
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping each n-gram token to a unique integer.
    # Integer values are greater than max_words in order
    # to avoid collisions with existing features.
    start_index = max_words + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_words becomes the highest integer that can be found in the dataset
    max_words = np.max(list(indice_token.keys())) + 1

    # Augment x_train and x_test with n-gram features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)

# Pad the input vectors to ensure a consistent length
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

model = Sequential()

# Embedding (input) layer: maps each of the max_words token ids to an embedding_dims vector
model.add(Embedding(max_words, embedding_dims, input_length=maxlen))

# Average pooling layer: averages the embeddings across the sequence
model.add(GlobalAveragePooling1D())

# Output layer: one softmax unit per class (num_classes)
model.add(Dense(num_classes, activation='softmax'))

# Specify the loss function, optimization method, and reported metric
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

# Fit (train) the model using the training data (80% of the dataset)
model.fit(x_train, y_train, batch_size=batch_size,
          epochs=epochs, validation_data=(x_test, y_test))

# Evaluate the trained model using the test data (20% of the dataset)
score = model.evaluate(x_test, y_test, batch_size=batch_size)
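Since the model is compiled with metrics=['accuracy'], score holds the test loss and accuracy in that order; a purely illustrative way to report them:

print('Test loss:', score[0])
print('Test accuracy:', score[1])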