Skip to content

Instantly share code, notes, and snippets.

@DivineGod
Last active September 4, 2015 22:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DivineGod/395058 to your computer and use it in GitHub Desktop.
Save DivineGod/395058 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import re
# Compiled once at import time. ``split_regex(s)`` cuts ``s`` at every single
# non-word character, so runs of punctuation/whitespace yield empty strings
# that callers have to discard themselves.
_non_word_char = re.compile(r'\W')
split_regex = _non_word_char.split
def ngram_word(n, text):
    """Return the word n-grams of ``text`` as a list of strings.

    The text is lower-cased and broken into words, where a word is a
    maximal run of regex word characters (letters, digits and underscore);
    everything else acts as a separator. Each n-gram is ``n`` consecutive
    words joined by a single space, listed in order of appearance.

    Args:
        n: Number of words per n-gram; must be at least 1.
        text: The string to extract n-grams from.

    Returns:
        A list of n-gram strings. The list is empty when ``text`` holds
        fewer than ``n`` words.

    Raises:
        ValueError: If ``n`` is smaller than 1.

    >>> ngram_word(3, 'Hello, world! rejoice and be happy')
    ['hello world rejoice', 'world rejoice and', 'rejoice and be', 'and be happy']
    """
    # Without this guard n == 0 yields a list of empty strings and a negative
    # n yields arbitrary slices of the word list.
    if n < 1:
        raise ValueError("n must be at least 1")
    # Collecting runs of word characters equals splitting on every non-word
    # character and then dropping the empty fragments.
    words = re.findall(r'\w+', text.lower())
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]
def ngram_letter(n, text):
    """Return the character n-grams of ``text`` as a list of strings.

    The text is lower-cased and every non-word character (anything that is
    not a letter, digit or underscore) is removed, so n-grams run across
    the original word boundaries. Each n-gram is ``n`` consecutive
    remaining characters, listed in order of appearance.

    Args:
        n: Number of characters per n-gram; must be at least 1.
        text: The string to extract n-grams from.

    Returns:
        A list of n-gram strings. The list is empty when fewer than ``n``
        characters remain after stripping.

    Raises:
        ValueError: If ``n`` is smaller than 1.

    >>> ngram_letter(3, 'Hello, world!')
    ['hel', 'ell', 'llo', 'low', 'owo', 'wor', 'orl', 'rld']
    """
    # Without this guard n == 0 yields a list of empty strings and a negative
    # n yields arbitrary slices of the stripped text.
    if n < 1:
        raise ValueError("n must be at least 1")
    letters = re.sub(r'\W', '', text.lower())
    # Slicing a str already gives a str, so no extra conversion is needed.
    return [letters[i:i + n] for i in range(len(letters) - n + 1)]
if __name__ == "__main__":
    # Executing the file directly runs the doctest examples embedded in the
    # docstrings of this module.
    from doctest import testmod
    testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment