Skip to content

Instantly share code, notes, and snippets.

@rilerez

rilerez/wordcount.py

Last active Jun 9, 2020
Embed
What would you like to do?
import re
from hash_map import HashMap
"""
This is the regular expression used to capture words. It could probably be endlessly
tweaked to catch more words, but this provides a standard we can test against, so don't
modify it for your assignment submission.
"""
rgx = re.compile("(\w[\w']*\w|\w)")
def hash_function_2(key):
"""
This is a hash function that can be used for the hashmap.
"""
var_hash = 0
var_index = 0
for i in key:
var_hash = var_hash + (var_index + 1) * ord(i)
var_index = var_index + 1
return var_hash
def top_words(source, number):
"""
Takes a plain text file and counts the number of occurrences of case insensitive words.
Returns the top `number` of words in a list of tuples of the form (word, count).
Args:
source: the file name containing the text
number: the number of top results to return (e.g. 5 would return the 5 most common words)
Returns:
A list of tuples of the form (word, count), sorted by most common word. (e.g. [("a", 23), ("the", 20), ("it", 10)])
"""
keys = set()
ht = HashMap(2500, hash_function_2)
# This block of code will read a file one word as a time and
# put the word in `w`. It should be left as starter code.
with open(source) as f:
for line in f:
words = rgx.findall(line)
for w in words:
# WRITTEN
w = w.lower()
if ht.contains_key(w):
assert(w in keys)
ht.put(w, ht.get(w))
else:
keys.add(w)
ht.put(w, 1)
sorted_words = sorted(keys, reverse=True,
key= lambda word: ht.get(word))
high_scores = sorted_words[:number]
print(top_words("alice.txt", 10)) # COMMENT THIS OUT WHEN SUBMITTING TO GRADESCOPE
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment