Last active
August 28, 2021 03:29
-
-
Save adem/0b7ab474788b2cb59d5e5b06354c1efb to your computer and use it in GitHub Desktop.
Word frequency
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import sys
from collections import Counter
def get_word_frequency(words: list[str]) -> dict[str, int]:
    """
    Count how many times each word occurs in ``words``.

    Args:
        words: The words to count. Words are compared exactly as given
            (case-sensitive); any normalization is up to the caller.

    Returns:
        A plain ``dict`` mapping each distinct word to its number of
        occurrences. Keys appear in order of first occurrence. An empty
        input yields an empty dict.

    >>> words = ["foo", "bar", "foo", "baz", "baz", "baz", "bar"]
    >>> result = get_word_frequency(words)
    >>> result["foo"]
    2
    >>> result["bar"]
    2
    >>> result["baz"]
    3
    >>> get_word_frequency([])
    {}
    """
    # Counter replaces the hand-written tally loop. Convert back to a plain
    # dict so that looking up an unseen word still raises KeyError instead of
    # silently returning 0, as a Counter would.
    return dict(Counter(words))
if __name__ == "__main__":
    # Script entry point: count the words of the text file named on the
    # command line and write "word,count" rows to word_frequency.csv in the
    # current directory, most frequent word first.
    if len(sys.argv) != 2:
        # Usage errors go to stderr; sys.exit is used rather than the
        # site-provided exit(), which is not guaranteed to exist.
        print("Usage: python main.py <file>", file=sys.stderr)
        sys.exit(1)

    # Split by various characters or strings, such as:
    #   1. whitespace characters
    #   2. punctuation marks (and the U+200E left-to-right mark)
    #   3. blocks of strings surrounded in square brackets, denoting the
    #      speaker
    # A raw string keeps "\s" and "\[" from being treated as (invalid)
    # string escapes; the re module itself interprets "\u200e".
    separator = re.compile(r"[\s.,?()\u200e]|\[.*?\]")

    words = []
    with open(sys.argv[1], encoding="utf8") as source:
        for line in source:
            # Consecutive separators produce empty strings; drop them here
            # so they are never counted as a word.
            words.extend(
                token.lower() for token in separator.split(line) if token
            )

    result = get_word_frequency(words)

    with open("word_frequency.csv", "w", encoding="utf8") as output:
        # sorted() is stable, so words with equal counts keep the order in
        # which they first appeared in the input.
        by_count = sorted(
            result.items(), key=lambda item: item[1], reverse=True
        )
        for word, count in by_count:
            output.write(f"{word},{count}\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment