Skip to content

Instantly share code, notes, and snippets.

@lettergram
Last active January 3, 2019 22:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lettergram/ea30e48f7bb49728b3cf3ca72bd2d448 to your computer and use it in GitHub Desktop.
Save lettergram/ea30e48f7bb49728b3cf3ca72bd2d448 to your computer and use it in GitHub Desktop.
"""
Taken from: https://github.com/keras-team/keras
Based on Joulin et al.'s paper:
Bag of Tricks for Efficient Text Classification
https://arxiv.org/abs/1607.01759
"""
def create_ngram_set(input_list, ngram_value=2):
    """
    Extract the set of distinct n-grams from a sequence of integers.

    An n-gram is a tuple of `ngram_value` consecutive items of
    `input_list`. Duplicates collapse because the result is a set. A
    sequence shorter than `ngram_value` yields an empty set, and
    `ngram_value=1` yields 1-tuples of the distinct items.

    Args:
        input_list: sliceable sequence (e.g. a list) of hashable items.
        ngram_value: number of consecutive items per n-gram.

    Returns:
        A set of tuples, each of length `ngram_value`.

    >>> sorted(create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2))
    [(1, 4), (4, 1), (4, 9), (9, 4)]
    >>> sorted(create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3))
    [(1, 4, 9), (4, 1, 4), (4, 9, 4), (9, 4, 1)]
    """
    # Zip the sequence with copies of itself shifted by 1, 2, ... items:
    # the k-th zipped tuple is the window starting at position k. zip()
    # stops at the shortest (most shifted) copy, so no partial windows.
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))
def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment each sequence by appending the ids of the n-grams it contains.

    For every n from 2 up to `ngram_range` (inclusive), each window of n
    consecutive tokens of the original sequence is looked up in
    `token_indice`; when present, the mapped id is appended to a copy of
    that sequence. The input sequences are not modified.

    Only the original tokens are scanned: ids appended while processing
    smaller n-grams are never treated as tokens when forming larger ones.

    Args:
        sequences: iterable of token sequences (e.g. list of lists of ints).
        token_indice: mapping from n-gram tuple to the id to append.
        ngram_range: largest n-gram size to look up; values below 2 leave
            every sequence unchanged.

    Returns:
        A new list containing one augmented list per input sequence.

    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        # Keep the original tokens separate from the growing output so the
        # sliding window never runs over ids appended in an earlier pass.
        tokens = list(input_list)
        new_list = list(tokens)
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(tokens) - ngram_value + 1):
                ngram = tuple(tokens[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment