Skip to content

Instantly share code, notes, and snippets.

@drinks
Created April 24, 2012 20:36
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save drinks/2483508 to your computer and use it in GitHub Desktop.
Save drinks/2483508 to your computer and use it in GitHub Desktop.
Flesch-Kincaid grade level processing against Capitol Words API
#!/usr/bin/env python
from __future__ import division
from curses.ascii import isdigit
import json
import sys
import datetime
# import re
from django.utils.datastructures import SortedDict
from django.contrib.localflavor.us.us_states import STATE_CHOICES
from nltk import sent_tokenize, regexp_tokenize
from nltk.corpus import cmudict
from sunlight import capitolwords as cw
from sunlight import congress
# Syllable count assumed for any word that has no entry in DICT; the main
# script pads the per-word syllable list with this value.
SYLLABLE_AVG = 1.66

# Inclusive bounds of the range used when faceting by congress.
STARTING_CONGRESS = 104
CURRENT_CONGRESS = 112

# Value sent as ``per_page`` with every Capitol Words text request.
PER_PAGE = 1000

# CMU pronouncing dictionary as returned by ``cmudict.dict()``; nsyl() looks
# words up here by their lower-cased form.
DICT = cmudict.dict()

# Legislator records keyed by bioguide id.  Records whose bioguide id is
# empty are left out because they could not be looked up by that key.
LEGISLATORS = {}
for leg in congress.legislators():
    if not leg['bioguide_id']:
        continue
    LEGISLATORS[leg['bioguide_id']] = leg

# First element of every STATE_CHOICES entry
# (presumably the postal abbreviation -- confirm against Django's localflavor).
STATES = [state[0] for state in STATE_CHOICES]
# Token pattern adapted from "Natural Language Processing with Python".
#
# Every group is non-capturing on purpose: findall-based tokenizers return the
# captured groups instead of the whole match as soon as a pattern contains a
# capturing group, which would turn tokens into tuples / empty strings.
#
# The hyphen in the final character class is escaped so that it is a literal
# "-" rather than the accidental range ":" .. "_" (which matched "<=>@^\" and
# did not match "-").
TOKEN_REGEX = r'''(?xi)
      (?:H|S)\.\ ?(?:(?:J|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+  # Bills
    | (?:[A-Z]\.)+                  # Abbreviations (U.S.A., etc.)
    | [A-Z]+\&[A-Z]+                # Internal ampersands (AT&T, etc.)
    | (?:Mr\.|Dr\.|Mrs\.|Ms\.)      # Mr., Mrs., etc.
    | \d*\.\d+                      # Numbers with decimal points.
    | \d\d?:\d\d                    # Times.
    | \$?[,\.0-9]+\d                # Numbers with thousands separators, (incl currency).
    | [ap]\.m\.                     # a.m., p.m., A.M., P.M.
    | \w+(?:[-']\w+)*               # Words with optional internal hyphens.
    | \$?\d+(?:\.\d+)?%?            # Currency and percentages.
    | (?<=\b)\.\.\.(?=\b)           # Ellipses surrounded by word borders
    | [][.,;"'?():\-_`]             # Single punctuation marks, incl. [ and ]
    '''


def tokenize(term):
    """Split *term* into tokens using TOKEN_REGEX.

    Alternatives are tried in the order they appear in the pattern, so the
    more specific forms (bill numbers, abbreviations, honorifics, numbers)
    win over the generic word rule.

    Punctuation marks are returned as tokens of their own and are NOT
    stripped here; callers that only want words are expected to filter the
    result (the main script does so by token length).

    :param term: text to tokenize.
    :returns: list of token strings, in order of appearance.
    """
    return regexp_tokenize(term, TOKEN_REGEX)
def nsyl(word):
    """Return the number of syllables in *word*.

    The word is lower-cased and looked up in DICT; only the first listed
    pronunciation is used.  A syllable is counted for every phoneme whose
    last character is a digit (CMUdict appears to mark vowel phonemes with a
    trailing stress digit -- see the CMUdict documentation).

    :param word: the word to look up (any case).
    :returns: integer syllable count of the first pronunciation.
    :raises KeyError: if the lower-cased word has no entry in DICT.
    """
    # Only the first pronunciation is ever used, so do not bother counting
    # syllables for the alternatives.
    first_pronunciation = DICT[word.lower()][0]
    return sum(1 for phoneme in first_pronunciation if isdigit(phoneme[-1]))
def month_ranges(start_date, end_date):
    """Return (first_day, last_day) string pairs for each month in a span.

    Only the year and month parts of the ``YYYY-MM-DD`` arguments are used.
    Both end points are inclusive; an inverted span yields an empty list.
    """
    import calendar

    year, month = [int(part) for part in start_date.split('-')[:2]]
    last = tuple(int(part) for part in end_date.split('-')[:2])
    ranges = []
    # Tuple comparison orders by year first, then month.
    while (year, month) <= last:
        days_in_month = calendar.monthrange(year, month)[1]
        ranges.append(('%d-%d-01' % (year, month),
                       '%d-%d-%d' % (year, month, days_in_month)))
        month += 1
        if month == 13:
            year, month = year + 1, 1
    return ranges


def year_ranges(start_date, end_date):
    """Return (Jan 1, Dec 31) string pairs for each year in a span (inclusive)."""
    first_year = int(start_date.split('-')[0])
    last_year = int(end_date.split('-')[0])
    return [('%d-01-01' % year, '%d-12-31' % year)
            for year in range(first_year, last_year + 1)]


def build_facet_values(facets, start_date, end_date):
    """Map each query parameter to the list of values it should range over.

    The insertion order of the returned SortedDict determines the order in
    which combinations are visited (the last key varies fastest).
    """
    values = SortedDict()
    if 'bioguide' in facets:
        values['bioguide_id'] = list(LEGISLATORS.keys())
    if 'chamber' in facets:
        values['chamber'] = ['house', 'senate']
    if 'party' in facets:
        values['party'] = ['D', 'R', 'I']
    if 'state' in facets:
        values['state'] = STATES

    # The time-based facets are mutually exclusive; the first match wins.
    if 'month' in facets:
        values['dates'] = month_ranges(start_date, end_date)
    elif 'year' in facets:
        values['dates'] = year_ranges(start_date, end_date)
    elif 'date' in facets:
        raise NotImplementedError('Sorry, dates aren\'t available yet')
    elif 'congress' in facets:
        values['congress'] = list(range(STARTING_CONGRESS, CURRENT_CONGRESS + 1))

    # When faceting by month/year the date span is already encoded in
    # 'dates'; otherwise it is passed through as a single fixed filter.
    if not ('month' in facets or 'year' in facets):
        if start_date:
            values['start_date'] = [start_date]
        if end_date:
            values['end_date'] = [end_date]
    return values


def iter_calls(facet_values):
    """Yield one query dict per combination of facet values.

    Combinations are produced with the last key varying fastest.  A value
    under the ``'dates'`` key is a (start, end) pair and is expanded into
    separate ``start_date`` / ``end_date`` entries.
    """
    import itertools

    keys = list(facet_values.keys())
    for combo in itertools.product(*[facet_values[key] for key in keys]):
        call = {}
        for key, value in zip(keys, combo):
            if key == 'dates':
                call['start_date'] = value[0]
                call['end_date'] = value[1]
            else:
                call[key] = value
        yield call


def fetch_corpus(call):
    """Download every page of text matching *call* and return it as one string."""
    query = dict(call, phrase='*', per_page=PER_PAGE)
    if not query.get('bioguide_id'):
        # No specific legislator requested: match any non-empty bioguide id.
        query['bioguide_id'] = "['' TO *]"

    pieces = []
    page = 0
    while True:
        print('page %d...' % page)
        resp = cw.text(page=page, **query)
        if not resp:
            break
        page += 1
        for chunk in resp:
            try:
                pieces.append(' %s' % ' '.join(chunk['speaking']))
            except TypeError:
                pieces.append(' %s' % str(chunk['speaking']))
            except Exception:
                # Best effort: skip chunks that cannot be read at all.
                continue
    return ''.join(pieces)


def score_corpus(corpus):
    """Compute readability statistics for *corpus*.

    Returns a dict of counts plus the Flesch-Kincaid grade level and Flesch
    reading ease, or None when the text has no words or no sentences.
    """
    # Single-character tokens are punctuation noise, except the words a / I.
    words = [word for word in tokenize(corpus)
             if len(word) > 1 or word.lower() in ('a', 'i')]
    word_count = len(words)
    sentence_count = len(sent_tokenize(corpus))
    if not word_count or not sentence_count:
        return None

    syllables = []
    for word in words:
        try:
            syllables.append(nsyl(word))
        except KeyError:
            # Not in the pronouncing dictionary; padded with the average below.
            pass
    missing_syllables = word_count - len(syllables)
    syllables.extend([SYLLABLE_AVG] * missing_syllables)
    syllable_count = sum(syllables)

    words_per_sentence = word_count / sentence_count
    syllables_per_word = syllable_count / word_count
    return {
        'words': word_count,
        'syllables': syllable_count,
        'missed_count': missing_syllables,
        'missed_pct': missing_syllables / word_count,
        'sentences': sentence_count,
        'grade_level': (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59,
        'reading_ease': 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word),
    }


def annotate(record, call):
    """Copy the facet values that identify *call* into *record* (in place)."""
    bioguide_id = call.get('bioguide_id')
    if bioguide_id:
        leg = LEGISLATORS[bioguide_id]
        record.update(bioguide_id=bioguide_id,
                      party=leg['party'],
                      chamber=leg['chamber'],
                      state=leg['state'],
                      name='%s, %s' % (leg['lastname'], leg['firstname']))
    if call.get('start_date') and call.get('end_date'):
        record.update(start_date=call['start_date'], end_date=call['end_date'])
    # Explicit facet values override whatever came from the legislator record.
    for key in ('chamber', 'party', 'state', 'congress'):
        if call.get(key):
            record[key] = call[key]
    return record


def iter_records(facet_values):
    """Yield one result record per facet combination that has any text."""
    for call in iter_calls(facet_values):
        callsum = '-'.join('-'.join([key, str(val)]) for key, val in call.items())
        print(callsum)
        record = score_corpus(fetch_corpus(call))
        if record is None:
            continue
        yield annotate(record, call)


def write_json_array(out, records):
    """Stream *records* to *out* as a JSON array, flushing after each one.

    The separator is written only between two records, so the output stays
    valid JSON no matter how many records there are (including none).
    """
    out.write('[')
    first = True
    for record in records:
        if not first:
            out.write(",\n")
        out.write(json.dumps(record))
        # Flush per record so partial results survive an interrupted run.
        out.flush()
        first = False
    out.write(']')


def main(argv=None):
    """Command-line entry point: compute grade levels and write them as JSON.

    :param argv: argument list without the program name; defaults to
        ``sys.argv[1:]``.  The single positional argument is the output file.
    """
    from optparse import OptionParser

    parser = OptionParser(usage='%prog [options] <filename>')
    parser.add_option('--facet', dest='facets', action='append', type='choice', default=[],
                      choices=('month', 'year', 'chamber', 'congress', 'party', 'bioguide', 'state'),
                      help='Set the facet(s) to aggregate grade levels over')
    parser.add_option('--start-date', dest='start_date', default='1996-01-01',
                      help='How far back to limit the search, Required with month or year faceting.')
    parser.add_option('--end-date', dest='end_date', default=datetime.datetime.now().strftime('%Y-%m-%d'),
                      help='How far forward to limit the search, required with month or year faceting.')
    options, args = parser.parse_args(argv)
    if not args:
        parser.error('an output <filename> is required')

    facets = options.facets or ['bioguide']
    facet_values = build_facet_values(facets, options.start_date, options.end_date)

    try:
        out = open(args[0], 'w')
    except (IOError, OSError) as exc:
        parser.error(str(exc))
    with out:
        write_json_array(out, iter_records(facet_values))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment