# jacobbridges/analyze2.py

## analyze2.py
import re
from collections import Counter

SPEECH_TO_PROCESS = "Year End Press Conference -- (12-19-2014).txt"

# Common function words (articles, auxiliaries, prepositions, ...) that are
# excluded from the word tally.
IGNORE_WORDS = [
	"a", "an", "the", "and", "that", "is", "are", "were", "be", "being",
	"been", "have", "has", "had", "do", "does", "did", "not", "it", "they",
	"its", "aboard", "about", "above", "across", "after", "against",
	"along", "amid", "among", "anti", "around", "as", "at", "before",
	"behind", "below", "beneath", "beside", "besides", "between", "beyond",
	"but", "by", "concerning", "considering", "despite", "down", "during",
	"except", "excepting", "excluding", "following", "for", "from", "in",
	"inside", "into", "like", "minus", "near", "of", "off", "on", "onto",
	"opposite", "outside", "over", "past", "per", "plus", "regarding",
	"round", "save", "since", "than", "through", "to", "toward", "towards",
	"under", "underneath", "unlike", "until", "up", "upon", "versus", "via",
	"with", "within", "without",
]


def to_ascii(raw):
	"""Decode UTF-8 bytes to text, silently dropping non-ASCII characters.

	Raises UnicodeDecodeError if *raw* is not valid UTF-8.
	"""
	# str() keeps the result a native string on Python 2 (where the final
	# decode yields ``unicode``); on Python 3 it is a no-op.
	return str(raw.decode("utf8").encode("ascii", "ignore").decode("ascii"))


def count_words(text, ignore_words=None):
	"""Return a Counter of the lower-cased words found in *text*.

	Apostrophes are removed before splitting so that a contraction counts
	as a single word ("don't" -> "dont"). Words listed in *ignore_words*
	(default: IGNORE_WORDS) are left out of the result.
	"""
	if ignore_words is None:
		ignore_words = IGNORE_WORDS
	# A set gives constant-time membership tests instead of a list scan.
	ignored = frozenset(ignore_words)
	words = re.findall(r'\w+', text.replace("'", '').lower())
	return Counter(word for word in words if word not in ignored)


def main(path=SPEECH_TO_PROCESS):
	"""Print the 10 most common non-ignored words of the speech at *path*."""
	# Binary mode plus an explicit decode behaves the same on Python 2 and 3.
	with open(path, "rb") as speech_file:
		speech = to_ascii(speech_file.read())
	print(count_words(speech).most_common(10))


if __name__ == "__main__":
	main()

###
# RESULT : [('i', 109), ('we', 90), ('going', 57), ('think', 46), ('this', 45), ('some', 34), ('there', 33), ('our', 30), ('you', 29), ('what', 28)]
###
	import re
	from collections import Counter

	SPEECH_TO_PROCESS = "Year End Press Conference -- (12-19-2014).txt"

	# Common function words (articles, auxiliaries, prepositions, ...) that are
	# excluded from the word tally.
	IGNORE_WORDS = [
		"a", "an", "the", "and", "that", "is", "are", "were", "be", "being",
		"been", "have", "has", "had", "do", "does", "did", "not", "it", "they",
		"its", "aboard", "about", "above", "across", "after", "against",
		"along", "amid", "among", "anti", "around", "as", "at", "before",
		"behind", "below", "beneath", "beside", "besides", "between", "beyond",
		"but", "by", "concerning", "considering", "despite", "down", "during",
		"except", "excepting", "excluding", "following", "for", "from", "in",
		"inside", "into", "like", "minus", "near", "of", "off", "on", "onto",
		"opposite", "outside", "over", "past", "per", "plus", "regarding",
		"round", "save", "since", "than", "through", "to", "toward", "towards",
		"under", "underneath", "unlike", "until", "up", "upon", "versus", "via",
		"with", "within", "without",
	]


	def to_ascii(raw):
		"""Decode UTF-8 bytes to text, silently dropping non-ASCII characters.

		Raises UnicodeDecodeError if *raw* is not valid UTF-8.
		"""
		# str() keeps the result a native string on Python 2 (where the final
		# decode yields ``unicode``); on Python 3 it is a no-op.
		return str(raw.decode("utf8").encode("ascii", "ignore").decode("ascii"))


	def count_words(text, ignore_words=None):
		"""Return a Counter of the lower-cased words found in *text*.

		Apostrophes are removed before splitting so that a contraction counts
		as a single word ("don't" -> "dont"). Words listed in *ignore_words*
		(default: IGNORE_WORDS) are left out of the result.
		"""
		if ignore_words is None:
			ignore_words = IGNORE_WORDS
		# A set gives constant-time membership tests instead of a list scan.
		ignored = frozenset(ignore_words)
		words = re.findall(r'\w+', text.replace("'", '').lower())
		return Counter(word for word in words if word not in ignored)


	def main(path=SPEECH_TO_PROCESS):
		"""Print the 10 most common non-ignored words of the speech at *path*."""
		# Binary mode plus an explicit decode behaves the same on Python 2 and 3.
		with open(path, "rb") as speech_file:
			speech = to_ascii(speech_file.read())
		print(count_words(speech).most_common(10))


	if __name__ == "__main__":
		main()

	###
	# RESULT : [('i', 109), ('we', 90), ('going', 57), ('think', 46), ('this', 45), ('some', 34), ('there', 33), ('our', 30), ('you', 29), ('what', 28)]
	###