Skip to content

Instantly share code, notes, and snippets.

@ipha
Created March 22, 2016 01:23
Show Gist options
  • Save ipha/9eedbf7f5e3af0f7bbaf to your computer and use it in GitHub Desktop.
Save ipha/9eedbf7f5e3af0f7bbaf to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from collections import Counter, deque

from nltk.corpus import brown
# Characters that mark a token as punctuation when the token consists of
# nothing else.
PUNCTUATION = "!\"#$%'()*+,-./:;<=>?@[\\]^_`{|}~"
# Smallest count an n-gram needs in order to be reported (inclusive).
MIN_OCCURANCES = 20
# Number of consecutive words that make up one n-gram.
NGRAM_LENGTH = 3


def count_ngrams(tokens, n=NGRAM_LENGTH):
    """Count every run of ``n`` consecutive tokens, compared case-insensitively.

    Args:
        tokens: Iterable of string tokens, consumed once in order.
        n: Length of each n-gram; must be at least 1.

    Returns:
        A ``Counter`` mapping each n-gram (a tuple of ``n`` lower-cased
        tokens) to the number of times it occurs. Keys appear in the order
        in which each n-gram was first seen.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    counts = Counter()
    # A bounded deque drops the oldest token automatically on append.
    window = deque(maxlen=n)
    for token in tokens:
        window.append(token.lower())
        # Only count full windows; the first n - 1 tokens cannot form an
        # n-gram on their own, so no padded entries are ever recorded.
        if len(window) == n:
            counts[tuple(window)] += 1
    return counts


def frequent_ngrams(counts, min_count=MIN_OCCURANCES, punctuation=PUNCTUATION):
    """Select the n-grams that are frequent and free of punctuation tokens.

    Args:
        counts: Mapping of n-gram tuple to occurrence count.
        min_count: Minimum count (inclusive) an n-gram needs to be kept.
        punctuation: Characters regarded as punctuation. A token made up
            solely of these characters (e.g. ``","`` or ``"--"``) disqualifies
            the n-gram; an empty token is disqualifying as well.

    Returns:
        A list of ``(count, ngram)`` tuples sorted by ascending count.
        N-grams with equal counts keep the iteration order of ``counts``.
    """
    punctuation_chars = frozenset(punctuation)

    def is_punctuation(token):
        # Checking every character (instead of a substring test against the
        # whole punctuation string) also catches multi-character tokens such
        # as "--" or "``". all() over an empty token is True, so empty
        # tokens are rejected too.
        return all(char in punctuation_chars for char in token)

    selected = [
        (count, ngram)
        for ngram, count in counts.items()
        if count >= min_count
        and not any(is_punctuation(token) for token in ngram)
    ]
    # Sort on the count alone; list.sort is stable, so ties stay in
    # first-seen order.
    selected.sort(key=lambda entry: entry[0])
    return selected


def main():
    """Print the frequent n-grams of the Brown corpus, least frequent first."""
    counts = count_ngrams(brown.words())
    for result in frequent_ngrams(counts):
        print(result)


# Guarded so the helpers can be imported without loading the corpus.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment