Created
April 4, 2019 21:05
-
-
Save Laetus/453865d06776a5bcb810ea121687de03 to your computer and use it in GitHub Desktop.
Basic Bag of Words script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections
import logging

# Module-level logger; the dotted name places it below 'root.app' in the
# logging hierarchy, so handlers attached to those ancestors receive its records.
logger = logging.getLogger('root.app.bag_of_words')
# Let every record from DEBUG upwards pass this logger. Whether a record is
# actually written still depends on the handlers configured by the application.
logger.setLevel(logging.DEBUG)
# Characters stripped by default: punctuation, tab/newline and the ASCII digits.
_DEFAULT_STRIP_CHARS = '.[],\t\n()-"\'0123456789'


def clean_string(string, strip_chars=_DEFAULT_STRIP_CHARS):
    """Return *string* lower-cased with all unwanted characters removed.

    Args:
        string: The text to clean.
        strip_chars: Every character contained in this string is deleted
            from the input. Defaults to common punctuation, tab, newline and
            the digits 0-9.

    Returns:
        The cleaned, lower-cased text. Characters are deleted, not replaced
        by a space, so ``'well-known'`` becomes ``'wellknown'``.
    """
    # One translate() pass deletes all characters at once instead of running
    # a separate replace() over the whole string for every character.
    deletion_table = str.maketrans('', '', strip_chars)
    return string.translate(deletion_table).lower()
def bag_of_words(filename='2 - data.txt', encoding=None):
    """Count how often each word occurs in a text file.

    Every line is normalised with ``clean_string`` and then split on
    whitespace; empty tokens are ignored.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default.

    Returns:
        A ``collections.OrderedDict`` mapping each word to its number of
        occurrences, in order of first appearance.

    Raises:
        OSError: If the file cannot be opened.
    """
    bag = collections.OrderedDict()
    with open(filename, encoding=encoding) as f:
        # Iterate the file object directly so only one line is held in
        # memory at a time; line numbers start at 1 for the log output.
        for line_number, line in enumerate(f, start=1):
            logger.debug('processing line %d', line_number)
            # split() without an argument skips runs of whitespace, so no
            # empty tokens have to be filtered out afterwards.
            for word in clean_string(line).split():
                bag[word] = bag.get(word, 0) + 1
    logger.debug('finished processing file')
    return bag
def print_bag(bag):
    """Print every word with its count, most frequent first.

    Each entry is written to standard output as ``word (count)`` on its own
    line. Words with equal counts keep the order they have in *bag*.

    Args:
        bag: A mapping of word to count, or an iterable of
            ``(word, count)`` pairs.
    """
    # A single dict() call accepts both mappings and iterables of pairs; the
    # result has to be sorted by count anyway, so no ordered mapping is needed.
    ranked = sorted(dict(bag).items(), key=lambda item: item[1], reverse=True)
    for word, count in ranked:
        print('{} ({})'.format(word, count))
if __name__ == "__main__":
    # Without any handler the logging module only emits WARNING and above,
    # so the info/debug records of this script would be silently dropped.
    # basicConfig() attaches a stderr handler to the root logger; the word
    # listing on stdout is unaffected.
    logging.basicConfig()
    logger.info('starting application')
    bag = bag_of_words()
    print_bag(bag)
Author
Laetus
commented
Apr 4, 2019
- add tests
- make cleanup function more generic
- add functionality to treat words that are split across two lines the same way as other words
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment