Skip to content

Instantly share code, notes, and snippets.

@jacobbridges
Last active August 29, 2015 14:12
Show Gist options
  • Save jacobbridges/f1be5eccf1a19bb8d662 to your computer and use it in GitHub Desktop.
Save jacobbridges/f1be5eccf1a19bb8d662 to your computer and use it in GitHub Desktop.
Obama Speech Text Analysis 2
import re
from collections import Counter
# Path of the speech transcript analysed when this file is run as a script.
SPEECH_TO_PROCESS = "Year End Press Conference -- (12-19-2014).txt"

# Articles, auxiliary verbs, pronouns and prepositions that are left out of
# the frequency tally.
IGNORE_WORDS = [
    "a", "an", "the", "and", "that", "is", "are", "were", "be", "being",
    "been", "have", "has", "had", "do", "does", "did", "not", "it", "they",
    "its", "aboard", "about", "above", "across", "after", "against", "along",
    "amid", "among", "anti", "around", "as", "at", "before", "behind",
    "below", "beneath", "beside", "besides", "between", "beyond", "but", "by",
    "concerning", "considering", "despite", "down", "during", "except",
    "excepting", "excluding", "following", "for", "from", "in", "inside",
    "into", "like", "minus", "near", "of", "off", "on", "onto", "opposite",
    "outside", "over", "past", "per", "plus", "regarding", "round", "save",
    "since", "than", "through", "to", "toward", "towards", "under",
    "underneath", "unlike", "until", "up", "upon", "versus", "via", "with",
    "within", "without",
]


def read_speech(path):
    """Return the contents of the file at *path* as an ASCII-only string.

    The file is decoded as UTF-8 and every character that has no ASCII
    representation is dropped. Works on both Python 2.7 and Python 3.

    Raises:
        IOError / OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # Read raw bytes so the explicit UTF-8 decode below behaves the same on
    # Python 2 and Python 3, regardless of the platform's default encoding.
    with open(path, "rb") as speech_file:
        raw = speech_file.read()

    # Strip any non-ASCII characters (curly quotes, dashes, ...).
    text = raw.decode("utf8").encode("ascii", "ignore")

    # On Python 3 the encode step yields bytes; turn it back into text.
    # On Python 2 ``bytes`` is ``str``, so the value is already a native string.
    if not isinstance(text, str):
        text = text.decode("ascii")
    return text


def count_words(text, ignore_words=None):
    """Count how often each word occurs in *text*.

    Apostrophes are removed first so contractions stay a single word
    ("don't" is counted as "dont"); the text is then lower-cased and split
    into ``\\w+`` runs. Words listed in *ignore_words* are not counted.

    Args:
        text: the text to analyse.
        ignore_words: iterable of lower-case words to skip. Defaults to
            ``IGNORE_WORDS``.

    Returns:
        A ``collections.Counter`` mapping each remaining word to its count.
    """
    if ignore_words is None:
        ignore_words = IGNORE_WORDS
    # A set gives O(1) membership tests instead of scanning the whole list.
    ignored = frozenset(ignore_words)

    # Remove all apostrophes from text (easier to handle contractions)
    text = text.replace("'", "")
    # Get all words from document with regex
    words = re.findall(r"\w+", text.lower())
    return Counter(word for word in words if word not in ignored)


def main():
    """Print the 10 most common non-ignored words of ``SPEECH_TO_PROCESS``."""
    speech = read_speech(SPEECH_TO_PROCESS)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(count_words(speech).most_common(10))


if __name__ == "__main__":
    main()

###
# RESULT : [('i', 109), ('we', 90), ('going', 57), ('think', 46), ('this', 45), ('some', 34), ('there', 33), ('our', 30), ('you', 29), ('what', 28)]
###
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment