Skip to content

Instantly share code, notes, and snippets.

@Laetus
Created April 4, 2019 21:05
Show Gist options
  • Save Laetus/453865d06776a5bcb810ea121687de03 to your computer and use it in GitHub Desktop.
Save Laetus/453865d06776a5bcb810ea121687de03 to your computer and use it in GitHub Desktop.
Basic Bag of Words script
import collections
import logging
# Module-level logger. Its level is set to DEBUG so that the logger itself
# does not filter out the debug messages emitted while a file is processed.
logger = logging.getLogger('root.app.bag_of_words')
logger.setLevel(logging.DEBUG)
# Translation table used by clean_string: tab and newline are turned into a
# space so the words on either side stay separate, while punctuation and the
# ASCII digits 0-9 are deleted.
_CLEAN_TABLE = str.maketrans('\t\n', '  ', '.[],()-"\'0123456789')


def clean_string(string):
    """Return *string* lower-cased with punctuation and digits removed.

    The characters ``. [ ] , ( ) - " '`` and the digits ``0``-``9`` are
    dropped. Tabs and newlines are replaced by a single space rather than
    deleted, so that ``"foo\\tbar"`` yields ``"foo bar"`` instead of fusing
    the two words into ``"foobar"``.

    Args:
        string: The text to normalise.

    Returns:
        The cleaned, lower-cased text.
    """
    # One pass over the string instead of one .replace() call per character.
    return string.translate(_CLEAN_TABLE).lower()
def bag_of_words(filename='2 - data.txt'):
    """Count how often each word occurs in a text file.

    Every line is normalised with ``clean_string`` and then split on
    whitespace; each resulting word is tallied.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``collections.OrderedDict`` mapping each word to its number of
        occurrences, keyed in order of first appearance.
    """
    bag = collections.OrderedDict()
    # NOTE: the file is opened with the platform's default text encoding.
    with open(filename) as f:
        # Stream the file line by line instead of loading it all up front.
        for line_number, line in enumerate(f, start=1):
            logger.debug('processing line %d', line_number)
            # split() without arguments collapses runs of whitespace and
            # never yields empty strings, so no empty-word check is needed.
            for word in clean_string(line).split():
                bag[word] = bag.get(word, 0) + 1
    logger.debug('finished processing file')
    return bag
def print_bag(bag):
    """Print every word with its count, most frequent first.

    One line of the form ``word (count)`` is written to standard output per
    entry. Entries with equal counts keep their input order, because the
    sort is stable.

    Args:
        bag: A mapping of word -> count, or anything else ``dict()``
            accepts (for example an iterable of ``(word, count)`` pairs).
    """
    # dict() normalises mappings and pair iterables alike; a single sorted()
    # call replaces the former OrderedDict -> dict -> list -> sort chain.
    entries = sorted(dict(bag).items(), key=lambda entry: entry[1], reverse=True)
    for word, count in entries:
        print('{} ({})'.format(word, count))
if __name__ == "__main__":
    # Script entry point: count the words of the default input file and
    # print them ordered by frequency.
    # NOTE(review): no logging handler is configured in the visible code, so
    # this INFO message may not be shown anywhere -- confirm.
    logger.info('starting application')
    print_bag(bag_of_words())
@Laetus
Copy link
Author

Laetus commented Apr 4, 2019

  • add tests
  • make cleanup function more generic
  • add functionality to handle words that are split across two lines the same way as other words

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment