Skip to content

Instantly share code, notes, and snippets.

@adem
Last active August 28, 2021 03:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adem/0b7ab474788b2cb59d5e5b06354c1efb to your computer and use it in GitHub Desktop.
Save adem/0b7ab474788b2cb59d5e5b06354c1efb to your computer and use it in GitHub Desktop.
Word frequency
import re
import sys
from collections import Counter
def get_word_frequency(words: list[str]) -> dict[str, int]:
    """
    Given a list of words, returns a dictionary containing the words as keys
    and their frequency as values.

    Keys appear in the order in which each word is first encountered in
    ``words``. An empty input yields an empty dictionary.

    >>> words = ["foo", "bar", "foo", "baz", "baz", "baz", "bar"]
    >>> result = get_word_frequency(words)
    >>> result["foo"]
    2
    >>> result["bar"]
    2
    >>> result["baz"]
    3
    >>> get_word_frequency([])
    {}
    """
    # Counter does the tallying; convert back to a plain dict so callers keep
    # getting the exact type they always did (e.g. KeyError on missing words).
    return dict(Counter(words))
# Separators between words:
#   1. whitespace characters
#   2. the punctuation marks . , ? ( ) and the left-to-right mark (U+200E)
#   3. blocks of text surrounded by square brackets, denoting the speaker
# Written as a raw string so the backslashes reach the regex engine untouched
# instead of relying on Python passing unknown string escapes through.
_SEPARATOR_RE = re.compile(r"[\s.,?()\u200e]|\[.*?\]")


def _extract_words(line: str) -> list[str]:
    """Return the lowercased words of ``line``, split on the separators."""
    # Consecutive separators make re.split produce empty strings; drop them
    # here so they are never counted as words.
    return [word.lower() for word in _SEPARATOR_RE.split(line) if word]


def main() -> None:
    """Count the words of the file named on the command line.

    Reads the file given as the single command-line argument (UTF-8) and
    writes ``word_frequency.csv`` to the current directory, one
    ``word,count`` row per distinct word, highest count first.

    Prints a usage message and exits with status 1 unless exactly one
    argument is supplied.
    """
    if len(sys.argv) != 2:
        print("Usage: python main.py <file>")
        sys.exit(1)

    words = []
    with open(sys.argv[1], encoding="utf8") as source:
        for line in source:
            words.extend(_extract_words(line))

    frequencies = get_word_frequency(words)
    # sorted() is stable, so words with equal counts keep the relative order
    # in which get_word_frequency returned them.
    ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
    with open("word_frequency.csv", "w", encoding="utf8") as output:
        output.writelines(f"{word},{count}\n" for word, count in ranked)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment