Skip to content

Instantly share code, notes, and snippets.

@jacobbridges
Last active August 29, 2015 14:12
Show Gist options
  • Save jacobbridges/9782a68b6d90a9787d93 to your computer and use it in GitHub Desktop.
Save jacobbridges/9782a68b6d90a9787d93 to your computer and use it in GitHub Desktop.
Obama Speech Text Analysis
import re
from collections import Counter
SPEECH_TO_PROCESS = "Year End Press Conference -- (12-19-2014).txt"


def load_speech(path):
    """Return the contents of the file at *path* as ASCII-only text.

    The file is read as raw bytes and decoded as UTF-8. Characters with no
    ASCII equivalent (curly quotes, long dashes, accents, ...) are dropped,
    not transliterated.

    Raises:
        IOError/OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # Binary mode makes read() return bytes on both Python 2 and Python 3,
    # so the explicit decode below behaves the same on either interpreter.
    with open(path, "rb") as speech_file:
        raw = speech_file.read()
    ascii_only = raw.decode("utf8").encode("ascii", "ignore")
    # On Python 2 the encoded value already is the native str type; on
    # Python 3 it is bytes and must be decoded before str.replace / re use.
    if isinstance(ascii_only, str):
        return ascii_only
    return ascii_only.decode("ascii")


def tokenize(text):
    """Return the lower-cased words of *text* in order of appearance.

    Apostrophes are removed first so a contraction counts as a single word
    ("don't" -> "dont") instead of being split in two by the regex.
    """
    without_apostrophes = text.replace("'", '')
    return re.findall(r'\w+', without_apostrophes.lower())


if __name__ == "__main__":
    # Read speech text, discarding any pesky non-ASCII characters
    speech = load_speech(SPEECH_TO_PROCESS)
    # Get all words from document
    words = tokenize(speech)
    # Load word list into Counter object
    c = Counter(words)
    # Print 10 most common words in document; the parenthesised single
    # argument prints identically under Python 2 and Python 3.
    print(c.most_common(10))
###
# RESULT : [('the', 259), ('to', 241), ('and', 166), ('that', 158), ('of', 142), ('a', 115), ('in', 112), ('i', 109), ('is', 94), ('we', 90)]
###
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment