Last active
February 20, 2020 00:21
-
-
Save jarehec/57acac25f8b1103eac4c22e64912d127 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def word_count_engine(doc: str):
    """Count word occurrences in *doc* and rank them by frequency.

    Tokenization rules:
      * The text is lower-cased before scanning.
      * Only the ASCII letters ``a``-``z`` are part of a word.
      * Apostrophes are dropped without ending the word
        (``"you'll"`` is counted as ``"youll"``).
      * Every other character (spaces, digits, punctuation, ...) ends the
        current word.

    Args:
        doc: The document text to analyse.

    Returns:
        A list of ``[word, count]`` pairs where ``count`` is a decimal
        string. Pairs are ordered by count, highest first; words with the
        same count keep the order in which they first appeared in *doc*.
        An empty or letter-free document yields ``[]``.
    """
    counts = {}
    letters = []

    # The appended space acts as a sentinel separator so that a word ending
    # exactly at the end of the document is flushed like any other word,
    # instead of needing a separate (and easy to get wrong) end-of-input
    # check.
    for ch in doc.lower() + ' ':
        if 'a' <= ch <= 'z':
            letters.append(ch)
        elif ch == '\'':
            # Apostrophes are skipped and do not terminate the word.
            continue
        elif letters:
            token = ''.join(letters)
            counts[token] = counts.get(token, 0) + 1
            letters = []

    # dicts iterate in insertion order (first appearance) and sorted() is
    # stable even with reverse=True, so ties stay in first-appearance order.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return [[token, str(total)] for token, total in ranked]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment