jacobbridges/analyze.py

## analyze.py
import re
from collections import Counter

SPEECH_TO_PROCESS = "Year End Press Conference -- (12-19-2014).txt"

# Count how often each word occurs in the speech transcript and print the
# ten most common ones. Written to run unchanged on Python 2 and Python 3.

# Open in binary mode so the explicit UTF-8 decode below receives raw bytes
# on either interpreter (text mode on Python 3 would already return str).
with open(SPEECH_TO_PROCESS, "rb") as SPEECH_FILE:

	# Decode the bytes as UTF-8, then drop every character outside ASCII
	ascii_bytes = SPEECH_FILE.read().decode("utf8").encode("ascii", "ignore")

	# encode() returns bytes on Python 3 but the native str on Python 2;
	# convert only when needed so the text handling below works on both
	speech = ascii_bytes if isinstance(ascii_bytes, str) else ascii_bytes.decode("ascii")

	# Remove all apostrophes so a contraction counts as one word ("don't" -> "dont")
	speech = speech.replace("'", "")

	# Lower-case the text and split it into runs of word characters
	words = re.findall(r"\w+", speech.lower())

	# Tally how often each word occurs
	c = Counter(words)

	# Print the 10 most common words; the parenthesised form is valid both as
	# a Python 2 print statement and as a Python 3 function call
	print(c.most_common(10))

###
# RESULT : [('the', 259), ('to', 241), ('and', 166), ('that', 158), ('of', 142), ('a', 115), ('in', 112), ('i', 109), ('is', 94), ('we', 90)]
###
	import re
	from collections import Counter

	SPEECH_TO_PROCESS = "Year End Press Conference -- (12-19-2014).txt"

	# Count how often each word occurs in the speech transcript and print the
	# ten most common ones. Written to run unchanged on Python 2 and Python 3.

	# Open in binary mode so the explicit UTF-8 decode below receives raw bytes
	# on either interpreter (text mode on Python 3 would already return str).
	with open(SPEECH_TO_PROCESS, "rb") as SPEECH_FILE:

		# The statements below must be indented under the `with`: it needs a
		# body, and the file has to be read while it is still open.

		# Decode the bytes as UTF-8, then drop every character outside ASCII
		ascii_bytes = SPEECH_FILE.read().decode("utf8").encode("ascii", "ignore")

		# encode() returns bytes on Python 3 but the native str on Python 2;
		# convert only when needed so the text handling below works on both
		speech = ascii_bytes if isinstance(ascii_bytes, str) else ascii_bytes.decode("ascii")

		# Remove all apostrophes so a contraction counts as one word ("don't" -> "dont")
		speech = speech.replace("'", "")

		# Lower-case the text and split it into runs of word characters
		words = re.findall(r"\w+", speech.lower())

		# Tally how often each word occurs
		c = Counter(words)

		# Print the 10 most common words; the parenthesised form is valid both as
		# a Python 2 print statement and as a Python 3 function call
		print(c.most_common(10))

	###
	# RESULT : [('the', 259), ('to', 241), ('and', 166), ('that', 158), ('of', 142), ('a', 115), ('in', 112), ('i', 109), ('is', 94), ('we', 90)]
	###