Skip to content

Instantly share code, notes, and snippets.

@lambda-fairy
Created December 28, 2018 07:23
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lambda-fairy/34a708182649e090f5aa5d8d952e8993 to your computer and use it in GitHub Desktop.
Save lambda-fairy/34a708182649e090f5aa5d8d952e8993 to your computer and use it in GitHub Desktop.
Report the most common phrases in a text file
#!/usr/bin/env python3
from collections import Counter
from itertools import islice
import re
import string
import sys
# Shortest phrase length, in words, that main() reports on.
MIN_PHRASE_LENGTH = 2
# Longest phrase length, in words, that main() reports on (inclusive).
MAX_PHRASE_LENGTH = 5
# Maximum number of phrases main() lists for each phrase length.
NUMBER_TO_REPORT = 10
def window(seq, n=2):
    """Yield each run of ``n`` consecutive items of ``seq`` as a tuple.

    Windows overlap and advance one item at a time, so
    ``window('abcd', 2)`` yields ``('a', 'b')``, ``('b', 'c')`` and
    ``('c', 'd')``.  Nothing is yielded when ``seq`` holds fewer than
    ``n`` items.

    Args:
        seq: Any iterable; it is iterated lazily, in a single pass.
        n: Window width, a positive integer (default 2).

    Yields:
        Tuples of exactly ``n`` items, in input order.

    Raises:
        ValueError: If ``n`` is less than 1.  Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    if n < 1:
        # A zero width would otherwise emit an empty tuple followed by
        # 1-tuples, none of which are windows of the requested size.
        raise ValueError(f'window size must be at least 1, got {n}')
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        # Slide forward by one: drop the oldest item, append the newest.
        result = result[1:] + (elem,)
        yield result
def main(argv):
    """Print the most frequent word phrases found in a text file.

    For every phrase length from ``MIN_PHRASE_LENGTH`` to
    ``MAX_PHRASE_LENGTH`` words (inclusive), counts each run of that many
    consecutive words in the file and prints up to ``NUMBER_TO_REPORT`` of
    the most common ones with their counts, followed by a blank line.

    Args:
        argv: Command-line argument list: the program name followed by
            exactly one path to the file to analyse.

    Raises:
        SystemExit: With a usage message when ``argv`` does not have
            exactly two entries.
    """
    if len(argv) != 2:
        command = argv[0]
        raise SystemExit(f'Usage: {command} FILE')
    filename = argv[1]
    # The context manager closes the file as soon as it has been read.
    # errors='replace' substitutes undecodable bytes instead of aborting.
    with open(filename, errors='replace') as f:
        text = f.read()
    # casefold() makes the comparison case-insensitive; a word is a maximal
    # run of \w characters, so punctuation and whitespace act as separators.
    words = re.findall(r'\w+', text.casefold())
    for n in range(MIN_PHRASE_LENGTH, MAX_PHRASE_LENGTH + 1):
        histogram = Counter(window(words, n))
        results = histogram.most_common(NUMBER_TO_REPORT)
        print(f'Top {NUMBER_TO_REPORT} phrases with {n} words:')
        for phrase, count in results:
            joined_phrase = ' '.join(phrase)
            print(f'- {joined_phrase} ({count})')
        # Blank line separating one phrase-length group from the next.
        print()
# Run the report only when this file is executed as a script, passing the
# full argument list (program name included) through to main().
if __name__ == '__main__':
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment