Skip to content

Instantly share code, notes, and snippets.

@johndavidback
Created August 13, 2014 14:35
Show Gist options
  • Save johndavidback/ea5ad1387f8c9320127f to your computer and use it in GitHub Desktop.
Save johndavidback/ea5ad1387f8c9320127f to your computer and use it in GitHub Desktop.
A quick text analysis that finds the most frequently repeated words in a block of text
# Usage:
# $ python analyze.py somefile.txt
# Easy breezy.
import operator
import string
import sys
from collections import Counter
# The 50 most common English words, taken from Wikipedia:
# http://en.wikipedia.org/wiki/Most_common_words_in_English
# Entries are lower case ('i', not 'I') because the text is lower-cased
# before it is filtered, so a capitalised entry could never match.
COMMON_WORDS = (
    'the be to of and a in that have i it for not on with he as you do at '
    'this but his by from they we say her she or an will my one all would '
    'there their what so up out if about who get which go me'
).split()

# Characters removed before the text is split into words. This is
# string.punctuation minus the apostrophe and the hyphen, so words such as
# "don't" and "well-known" stay in one piece.
_PUNCTUATION = frozenset('!"#$%&()*+,./:;<=>?@[\\]^_`{|}~')


def _count_words(text):
    """Return (word, count) pairs for the uncommon words in *text*.

    The text is lower-cased, stripped of the characters in _PUNCTUATION and
    split on whitespace; words listed in COMMON_WORDS are dropped.

    Args:
        text: The text to analyze, as a single string.

    Returns:
        A list of (word, count) tuples ordered by count, highest first.
        Words with equal counts are ordered alphabetically so the result is
        deterministic.
    """
    # Lower-case the stop list as well, so filtering is correct even if
    # COMMON_WORDS contains capitalised entries; a frozenset makes each
    # membership test O(1).
    common = frozenset(word.lower() for word in COMMON_WORDS)
    words = ''.join(ch.lower() for ch in text if ch not in _PUNCTUATION).split()
    counts = Counter(word for word in words if word not in common)
    return sorted(counts.items(), key=lambda item: (-item[1], item[0]))


def analyze(textfile=None):
    """Print each uncommon word of a text file with its number of occurrences.

    One "word count" pair is printed per line, most frequent word first.

    Args:
        textfile: Path of the file to analyze. When omitted, the first
            command-line argument (sys.argv[1]) is used.

    Raises:
        SystemExit: If no path is given and no command-line argument exists.
    """
    if textfile is None:
        if len(sys.argv) < 2:
            # Exit with a usage hint instead of a bare IndexError traceback.
            sys.exit('Usage: python analyze.py somefile.txt')
        textfile = sys.argv[1]

    with open(textfile) as f:
        text = f.read()

    for word, count in _count_words(text):
        # A single pre-formatted argument prints identically on Python 2 and 3.
        print('%s %s' % (word, count))
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    analyze()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment