Skip to content

Instantly share code, notes, and snippets.

@cjoshmartin
Last active June 22, 2021 07:41
Show Gist options
  • Save cjoshmartin/a21a4f957d5032773f99525ad6c90dfa to your computer and use it in GitHub Desktop.
Save cjoshmartin/a21a4f957d5032773f99525ad6c90dfa to your computer and use it in GitHub Desktop.
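# Requires Python 3.8 or newer. Download this file and save it as main.py.
# Run it with "python main.py" from a directory that also contains the input
# files 'alice_in_wonderland.txt' and '1-1000.txt'.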
from pathlib import Path
import collections
import re
def top_n_words(input_file: str, common_words: str, n: int) -> None:
    """Print the ``n`` most frequent words of a text, ignoring common words.

    The text is split on whitespace and every token is lower-cased before
    counting. Punctuation is not stripped, so ``"cat"`` and ``"cat,"`` are
    counted as different words.

    Args:
        input_file: Path of the text file to analyse.
        common_words: Path of a file listing one word per line; words found
            in this list are excluded from the count.
        n: Maximum number of rows to print. Values of zero or less print
            only the table header.

    Returns:
        None. A two-column table (count, word) is written to stdout, ordered
        by descending count; words with equal counts keep the order in which
        they first appear in the text.

    Raises:
        OSError: If either file cannot be read (propagated from
            ``Path.read_text``).
    """
    text = Path(input_file).read_text()

    # splitlines() + strip() copes with CRLF files and stray padding; the
    # entries are lower-cased so they compare equal to the lower-cased tokens.
    excluded = {
        line.strip().lower()
        for line in Path(common_words).read_text().splitlines()
    }

    # str.split() with no argument treats any run of whitespace (spaces, tabs,
    # newlines) as one separator and never yields empty strings, so words on
    # adjacent lines are not glued together.
    tokens = (token.lower() for token in text.split())
    counts = collections.Counter(
        token for token in tokens if token not in excluded
    )

    print(f"{'Count':<6} {'Word':<6}")
    print(f"{'===':<6} {'===':<6}")
    # most_common() does the ranking by frequency that the table promises.
    for word, count in counts.most_common(n):
        print(f"{count:<6} {word:<6}")
# Script entry point: only runs when the file is executed directly, not when
# it is imported. Both paths are relative, so they are resolved against the
# current working directory; at most five rows are printed.
if __name__ == '__main__':
    top_n_words(
        input_file='alice_in_wonderland.txt',
        common_words='1-1000.txt',
        n=5,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment