Last active
January 3, 2019 22:17
-
-
Save lettergram/ea30e48f7bb49728b3cf3ca72bd2d448 to your computer and use it in GitHub Desktop.
Supplemental material for https://austingwalters.com/fasttext-for-sentence-classification/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Taken from: https://github.com/keras-team/keras
Based on Joulin et al.'s paper:
Bag of Tricks for Efficient Text Classification
https://arxiv.org/abs/1607.01759
""" | |
def create_ngram_set(input_list, ngram_value=2):
    """
    Collect the distinct n-grams that occur in a sequence of tokens.

    Each n-gram is a tuple of ``ngram_value`` consecutive items of
    ``input_list``. An input shorter than ``ngram_value`` yields an empty set.

    >>> sorted(create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2))
    [(1, 4), (4, 1), (4, 9), (9, 4)]
    """
    # One left-shifted copy of the input per position inside the n-gram:
    # the k-th copy starts at offset k.
    shifted_views = [input_list[offset:] for offset in range(ngram_value)]
    # zip stops at the shortest copy, so only complete windows are emitted.
    return set(zip(*shifted_views))
def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment each sequence by appending the ids of the n-grams it contains.

    For every n from 2 up to ``ngram_range`` (inclusive), each window of n
    consecutive tokens of the original sequence is looked up in
    ``token_indice``; when the window is present, its mapped value is appended
    to a copy of the sequence. The input sequences are not modified.

    Args:
        sequences: Iterable of token sequences (e.g. lists of integers).
        token_indice: Mapping from n-gram tuples to the value to append.
        ngram_range: Largest n-gram size to look up; values below 2 add
            nothing.

    Returns:
        A new list holding one augmented list per input sequence.

    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        # Work on a copy so the caller's sequence is left untouched.
        new_list = list(input_list)
        for ngram_value in range(2, ngram_range + 1):
            # Slide over the ORIGINAL tokens only. Sliding over new_list would
            # let windows for n >= 3 run into the n-gram ids appended by
            # earlier passes and match n-grams that never occur in the input.
            for i in range(len(input_list) - ngram_value + 1):
                ngram = tuple(input_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment