ipha/ngram.py

## ngram.py
#!/usr/bin/env python3

from nltk.corpus import brown

# Characters that mark a token as punctuation.
# NOTE(review): '&' is absent here, unlike string.punctuation - confirm intent.
PUNCTUATION = "!\"#$%'()*+,-./:;<=>?@[\\]^_`{|}~"
MIN_OCCURANCES = 20
NGRAM_LENGTH = 3


def is_punctuation(token):
    """Return True if *token* is empty or consists only of PUNCTUATION chars.

    Checking character by character (rather than testing whether the whole
    token is a substring of PUNCTUATION) also catches multi-character
    punctuation tokens such as "--", "``" and "''".
    """
    return all(char in PUNCTUATION for char in token)


def count_ngrams(tokens, length=NGRAM_LENGTH):
    """Count every run of *length* consecutive tokens, lower-cased.

    Args:
        tokens: iterable of strings.
        length: number of tokens per n-gram; must be at least 1.

    Returns:
        A dict mapping each n-gram (a tuple of *length* strings) to the
        number of times it occurs. Empty when fewer than *length* tokens
        are supplied.

    Raises:
        ValueError: if *length* is smaller than 1.
    """
    if length < 1:
        raise ValueError("length must be at least 1")
    counts = {}
    window = ()
    for token in tokens:
        # Slide the window: append the new token, keep the last *length*.
        window = (window + (token.lower(),))[-length:]
        # Only complete windows are counted, so no padding entries appear.
        if len(window) == length:
            counts[window] = counts.get(window, 0) + 1
    return counts


def frequent_ngrams(counts, min_occurrences=MIN_OCCURANCES):
    """Select the frequent, punctuation-free n-grams from *counts*.

    Args:
        counts: dict mapping n-gram tuples to occurrence counts.
        min_occurrences: smallest count an n-gram needs to be kept.

    Returns:
        A list of (count, ngram) pairs sorted by ascending count. N-grams
        with equal counts keep the iteration order of *counts*.
    """
    kept = [
        (count, ngram)
        for ngram, count in counts.items()
        if count >= min_occurrences
        and not any(is_punctuation(token) for token in ngram)
    ]
    kept.sort(key=lambda pair: pair[0])
    return kept


def main():
    """Print the frequent punctuation-free n-grams of the Brown corpus."""
    for result in frequent_ngrams(count_ngrams(brown.words())):
        print(result)


if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	from nltk.corpus import brown

	# Characters that mark a token as punctuation.
	# NOTE(review): '&' is absent here, unlike string.punctuation - confirm intent.
	PUNCTUATION = "!\"#$%'()*+,-./:;<=>?@[\\]^_`{|}~"
	MIN_OCCURANCES = 20
	NGRAM_LENGTH = 3


	def is_punctuation(token):
	    """Return True if *token* is empty or consists only of PUNCTUATION chars.

	    Checking character by character (rather than testing whether the whole
	    token is a substring of PUNCTUATION) also catches multi-character
	    punctuation tokens such as "--", "``" and "''".
	    """
	    return all(char in PUNCTUATION for char in token)


	def count_ngrams(tokens, length=NGRAM_LENGTH):
	    """Count every run of *length* consecutive tokens, lower-cased.

	    Args:
	        tokens: iterable of strings.
	        length: number of tokens per n-gram; must be at least 1.

	    Returns:
	        A dict mapping each n-gram (a tuple of *length* strings) to the
	        number of times it occurs. Empty when fewer than *length* tokens
	        are supplied.

	    Raises:
	        ValueError: if *length* is smaller than 1.
	    """
	    if length < 1:
	        raise ValueError("length must be at least 1")
	    counts = {}
	    window = ()
	    for token in tokens:
	        # Slide the window: append the new token, keep the last *length*.
	        window = (window + (token.lower(),))[-length:]
	        # Only complete windows are counted, so no padding entries appear.
	        if len(window) == length:
	            counts[window] = counts.get(window, 0) + 1
	    return counts


	def frequent_ngrams(counts, min_occurrences=MIN_OCCURANCES):
	    """Select the frequent, punctuation-free n-grams from *counts*.

	    Args:
	        counts: dict mapping n-gram tuples to occurrence counts.
	        min_occurrences: smallest count an n-gram needs to be kept.

	    Returns:
	        A list of (count, ngram) pairs sorted by ascending count. N-grams
	        with equal counts keep the iteration order of *counts*.
	    """
	    kept = [
	        (count, ngram)
	        for ngram, count in counts.items()
	        if count >= min_occurrences
	        and not any(is_punctuation(token) for token in ngram)
	    ]
	    kept.sort(key=lambda pair: pair[0])
	    return kept


	def main():
	    """Print the frequent punctuation-free n-grams of the Brown corpus."""
	    for result in frequent_ngrams(count_ngrams(brown.words())):
	        print(result)


	if __name__ == "__main__":
	    main()