Skip to content

Instantly share code, notes, and snippets.

@ivangonekrazy
Created February 2, 2011 01:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ivangonekrazy/807062 to your computer and use it in GitHub Desktop.
Save ivangonekrazy/807062 to your computer and use it in GitHub Desktop.
Reports how many times each word longer than four characters is repeated in a given text file, most frequent first.
import re
from collections import defaultdict
# Words must be strictly longer than this many characters to be counted.
SHORT_WORD_LEN = 4
# Raw string so the backslash reaches the regex engine untouched.
WORD_REGEX = r'\w+'
TEXT_FILE_PATH = 'text.txt'


def count_repeated_words(text, min_len=SHORT_WORD_LEN, word_regex=WORD_REGEX):
    """Return (count, word) pairs for the repeated long words in *text*.

    The text is lower-cased before matching, so counting is
    case-insensitive. Words whose length is not strictly greater than
    *min_len* are ignored, as are words that occur only once.

    Args:
        text: The string to scan.
        min_len: Words must be longer than this many characters.
        word_regex: Pattern handed to ``re.findall`` to extract words.

    Returns:
        A list of ``(count, word)`` tuples sorted in descending order:
        highest count first, ties broken by reverse alphabetical word.
    """
    word_counter = defaultdict(int)
    for word in re.findall(word_regex, text.lower()):
        # cull out the words that aren't long enough
        if len(word) > min_len:
            word_counter[word] += 1

    # Drop words that have not been repeated. The count goes first in
    # each tuple so that a plain sort orders by frequency.
    repeated = [(count, word) for word, count in word_counter.items()
                if count > 1]
    return sorted(repeated, reverse=True)


def report_lines(counted_words):
    """Format ``(count, word)`` pairs as ``'  N: word'`` report lines.

    Args:
        counted_words: Iterable of ``(count, word)`` tuples.

    Returns:
        A list of strings with the count right-justified to width 3.
    """
    return ["%s: %s" % (str(count).rjust(3), word)
            for count, word in counted_words]


def main(path=TEXT_FILE_PATH):
    """Print the repeated-word report for the text file at *path*.

    Args:
        path: Location of the text file to read.

    Raises:
        IOError / OSError: If the file cannot be opened or read.
    """
    # The context manager closes the file even if reading fails.
    with open(path) as text_file:
        text = text_file.read()

    for line in report_lines(count_repeated_words(text)):
        # A single parenthesised argument is valid both as a Python 2
        # print statement and as a Python 3 print() call.
        print(line)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment