Skip to content

Instantly share code, notes, and snippets.

@jarobins
Forked from anonymous/word_counter.py
Last active December 15, 2015 15:19
Show Gist options
  • Save jarobins/5281410 to your computer and use it in GitHub Desktop.
Save jarobins/5281410 to your computer and use it in GitHub Desktop.
from collections import Counter
from datetime import datetime
import io
import operator, string
# Word-frequency counter: reads a UTF-8 text file, strips ASCII punctuation
# from every whitespace-separated token, counts the occurrences of each
# resulting word and writes "word count" lines, most frequent first.

# Maps the code point of every ASCII punctuation character to None so that
# ``translate`` deletes it. An ordinal-keyed table works on text (unicode)
# strings, so non-ASCII words no longer have to be forced through ``str``.
table = dict.fromkeys(map(ord, string.punctuation))


def strip_punctuation(word):
    """Return the text string *word* with ASCII punctuation removed."""
    return word.translate(table)


def count_words(words):
    """Count occurrences of each punctuation-stripped word.

    *words* is an iterable of text strings. Returns a ``Counter`` mapping
    each stripped word to the number of times it occurs. Counting is
    case-sensitive, and a token made only of punctuation is counted under
    the empty string, exactly as the original script did.
    """
    # TODO: consider lower-casing words so that "The" and "the" are counted
    # together (noted 04APR2013).
    # A single pass over the words; the previous version called
    # list.count() once per unique word, which was quadratic.
    return Counter(strip_punctuation(word) for word in words)


def sort_by_count(counts):
    """Return ``(word, count)`` pairs ordered from most to least frequent."""
    return sorted(counts.items(), key=operator.itemgetter(1), reverse=True)


def main(input_path='gs2.txt', output_path='output.txt'):
    """Count the words in *input_path* and write the ranking to *output_path*.

    Both files are treated as UTF-8. Each output line is the word, a single
    space, the count and a newline. Start and finish timestamps are printed
    to standard output.
    """
    # The context manager guarantees the input file is closed.
    with io.open(input_path, encoding='utf-8') as txt_file:
        word_list = txt_file.read().split()

    print("Starting find at %s" % datetime.now())
    thelist = count_words(word_list)
    print("Finished find at %s" % datetime.now())

    sorted_items = sort_by_count(thelist)
    with io.open(output_path, 'w', encoding='utf-8') as d:
        for word, count in sorted_items:
            d.write(u'%s %d\n' % (word, count))


if __name__ == "__main__":
    main()
@jarobins
Copy link
Author

Ran against a text file of the Bible in about 5 minutes.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment