Skip to content

Instantly share code, notes, and snippets.

@MarksCode
Created March 8, 2016 06:07
Show Gist options
  • Save MarksCode/ce2da23d834e35655a20 to your computer and use it in GitHub Desktop.
Save MarksCode/ce2da23d834e35655a20 to your computer and use it in GitHub Desktop.
import string
######## Begin code which needs to be modified ##########
# Implementation 1
class MyChainDict(object):
def __init__(self):
self.dictSize = 10000
self.mydict = []
for i in range(self.dictSize):
self.mydict.append([])
return
def insert(self, key, value=None):
bucketNum = self.hashFunc(key)
doesContain = self.contains(key)
if not doesContain:
self.mydict[bucketNum].append([key, value])
return
for x in self.mydict[bucketNum]:
if (key in x):
x[1] = value
return
def contains(self, key):
bucketNum = self.hashFunc(key);
doesContain = False
for x in self.mydict[bucketNum]:
if (key in x):
doesContain = True
return (doesContain)
def value(self, key):
bucketNum = self.hashFunc(key);
doesContain = self.contains(key)
if not doesContain:
return False
else:
for x in self.mydict[bucketNum]:
if (key in x):
return x[1]
def delete(self, key):
bucketNum = self.hashFunc(key)
index = 0
for x in self.mydict[bucketNum]:
if (key == x[0]):
self.mydict[bucketNum].pop(index)
else:
index = index + 1
return
def get_key_values(self):
myArray = []
for x in self.mydict:
for y in x:
myArray.append((y[0], y[1]))
return myArray
def dump(self):
for i in self.mydict:
for x in i:
print (x)
def hashFunc(self, key):
return hash(key)%self.dictSize;
# Implementation 2
class MyOpenLinearDict(object):
def __init__(self):
self.dictSize = 500000
self.mydict = []
for i in range(self.dictSize):
self.mydict.append(["q", -1])
return
def insert(self, key, value=None):
iterations = 0
bucketNum = self.hashFunc(key)
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q" and iterations <= self.dictSize-1):
bucketNum = bucketNum + 1
iterations = iterations + 1
if (bucketNum > self.dictSize-1):
bucketNum = 0
self.mydict[bucketNum] = [key, value]
def contains(self, key):
iterations = 0
bucketNum = self.hashFunc(key)
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q" and iterations <= self.dictSize-1):
bucketNum = bucketNum + 1
iterations = iterations + 1
if (bucketNum > self.dictSize-1):
bucketNum = 0
if (self.mydict[bucketNum][0] == key):
return True
else:
return False
def value(self, key):
bucketNum = self.hashFunc(key)
iterations = 0
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q" and iterations <= self.dictSize-1):
bucketNum = bucketNum + 1
iterations = iterations + 1
if (bucketNum > self.dictSize-1):
bucketNum = 0
if (self.mydict[bucketNum][0] == key):
return self.mydict[bucketNum][1]
def delete(self, key):
bucketNum = self.hashFunc(key)
iterations = 0
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q" and iterations <= self.dictSize-1):
bucketNum = bucketNum + 1
iterations = iterations + 1
if (bucketNum > self.dictSize-1):
bucketNum = 0
if (self.mydict[bucketNum][0] == key):
self.mydict[bucketNum] = ["Deleted", -1]
return
def get_key_values(self):
myArray = []
for x in self.mydict:
if (x[0] != "q"):
myArray.append((x[0], x[1]))
return myArray
def dump(self):
for x in self.mydict:
print (x[0], x[1])
def hashFunc(self, key):
return hash(key)%self.dictSize
# Implementation 3
class MyOpenQuadDict(object):
def __init__(self):
self.dictSize = 500000
self.mydict = []
for i in range(self.dictSize):
self.mydict.append(["q", -1])
return
def insert(self, key, value=None):
iterations = 0
bucketNum = self.hashFunc(key, iterations)
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q"):
iterations = iterations + 1
bucketNum = self.hashFunc(key, iterations)
self.mydict[bucketNum] = [key, value]
def contains(self, key):
iterations = 0
bucketNum = self.hashFunc(key, iterations)
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q"):
iterations = iterations + 1
bucketNum = self.hashFunc(key, iterations)
if (self.mydict[bucketNum][0] == key):
return True
else:
return False
def value(self, key):
iterations = 0
bucketNum = self.hashFunc(key, iterations)
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q"):
iterations = iterations + 1
bucketNum = self.hashFunc(key, iterations)
if (self.mydict[bucketNum][0] == key):
return self.mydict[bucketNum][1]
def delete(self, key):
iterations = 0
bucketNum = self.hashFunc(key, iterations)
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q"):
iterations = iterations + 1
bucketNum = self.hashFunc(key, iterations)
if (self.mydict[bucketNum][0] == key):
self.mydict[bucketNum] = ["Deleted", -1]
return
def get_key_values(self):
myArray = []
for x in self.mydict:
if (x[0] != "q"):
myArray.append((x[0], x[1]))
return myArray
def dump(self):
for x in self.mydict:
if (x[0] != "q"):
print ((x[0], x[1]))
def hashFunc(self, key, iteration):
return (hash(key)+10*iteration+10*iteration^2)%self.dictSize
class MySet(object):
def __init__(self):
self.myset = MyOpenQuadDict()
return
def insert(self, key):
self.myset.insert(key)
return
def contains(self, key):
return self.myset.contains(key)
def dump(self):
self.myset.dump()
######## End code which needs to be modified ##########
# Store the set of stop words in a set object
stop_words_file = "stopwords.txt"
stop_words = MySet()
with open(stop_words_file) as f:
for l in f:
stop_words.insert(l.strip())
# Download file from https://snap.stanford.edu/data/finefoods.txt.gz
# Remember to gunzip before using
review_file = "foods.txt"
review_words = []
for i in range(5):
review_words.append(MyOpenQuadDict())
with open(review_file, encoding='LATIN-1') as f:
lines = f.readlines()
idx = 0
num_lines = len(lines) - 2
while idx < num_lines:
while not lines[idx].startswith("review/score"):
idx = idx + 1
# Jump through hoops to satisfy the Unicode encodings
l = lines[idx].encode('ascii', 'ignore').decode('ascii')
l = l.strip().split(":")[1].strip()
# Extract rating
rating = int(float(l))
while not lines[idx].startswith("review/text"):
idx = idx + 1
l = lines[idx].encode('ascii', 'ignore').decode('ascii').strip().lower()
text = l.split(":")[1].strip()
text = text.translate(str.maketrans("","", string.punctuation))
# Extract words in the text
words = text.split(" ")
# words = [x.strip(string.punctuation) for x in text.split(" ")]
for w in words:
if len(w) and not stop_words.contains(w):
if review_words[rating - 1].contains(w):
count = review_words[rating - 1].value(w) + 1
else:
count = 1
review_words[rating - 1].insert(w, count)
# Print out the top words for each of the five ratings
for i in range(5):
freq_words = sorted(review_words[i].get_key_values(), key=lambda tup: tup[1], reverse=True)
print(i+1, freq_words[0:20])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment