#!/usr/bin/env python
# Flesch-Kincaid grade level processing against Capitol Words API
from __future__ import division | |
from curses.ascii import isdigit | |
import json | |
import sys | |
import datetime | |
# import re | |
from django.utils.datastructures import SortedDict | |
from django.contrib.localflavor.us.us_states import STATE_CHOICES | |
from nltk import sent_tokenize, regexp_tokenize | |
from nltk.corpus import cmudict | |
from sunlight import capitolwords as cw | |
from sunlight import congress | |
# Fallback syllables-per-word used to pad words missing from cmudict.
SYLLABLE_AVG = 1.66
# Range of Congresses to facet over (104th through 112th) when the
# 'congress' facet is selected.
STARTING_CONGRESS = 104
CURRENT_CONGRESS = 112
# Results requested per page from the Capitol Words API.
PER_PAGE = 1000
# CMU Pronouncing Dictionary: word -> list of pronunciations, each a list
# of phoneme strings (vowel phonemes end in a stress digit).
DICT = cmudict.dict()
# bioguide_id -> legislator record, fetched once from the Congress API.
LEGISLATORS = {}
for leg in congress.legislators():
    if leg['bioguide_id']:
        LEGISLATORS[leg['bioguide_id']] = leg
# Two-letter US state abbreviations.
STATES = [state[0] for state in STATE_CHOICES]
def tokenize(term):
    """Split raw speech text into word-level tokens.

    Adapted from Natural Language Processing with Python. Alternatives in
    the verbose regex are tried in order, so specific forms (bill numbers,
    abbreviations, times, currency) win over the generic word pattern.
    """
    # FIX: the final character class originally read [][.,;"'?():-_`],
    # where ":-_" is an unintended character range also covering
    # ;<=>?@A-Z[\]^.  The hyphen is moved to the end so ':', '_' and '-'
    # are matched literally; letters are still captured by the earlier
    # \w+ alternative, so word tokenization is unaffected.
    regex = r'''(?xi)
      (?:H|S)\.\ ?(?:(?:J|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+  # Bills
    | ([A-Z]\.)+                # Abbreviations (U.S.A., etc.)
    | ([A-Z]+\&[A-Z]+)          # Internal ampersands (AT&T, etc.)
    | (Mr\.|Dr\.|Mrs\.|Ms\.)    # Mr., Mrs., etc.
    | \d*\.\d+                  # Numbers with decimal points.
    | \d\d?:\d\d                # Times.
    | \$?[,\.0-9]+\d            # Numbers with thousands separators, (incl currency).
    | (((a|A)|(p|P))\.(m|M)\.)  # a.m., p.m., A.M., P.M.
    | \w+((-|')\w+)*            # Words with optional internal hyphens.
    | \$?\d+(\.\d+)?%?          # Currency and percentages.
    | (?<=\b)\.\.\.(?=\b)       # Ellipses surrounded by word borders
    | [][.,;"'?():_`-]          # Single punctuation characters.
    '''
    # Punctuation tokens are not stripped here; the caller filters tokens
    # by length instead (solr doesn't know about punctuation anyway).
    tokens = regexp_tokenize(term, regex)
    return tokens
def nsyl(word):
    """Count syllables in *word* using its first cmudict pronunciation.

    Syllables are the phonemes ending in a stress digit (cmudict vowels
    carry 0/1/2 stress markers, e.g. 'AH0').  Raises KeyError for words
    absent from the dictionary; the caller records those as misses.
    """
    # Use only the first listed pronunciation directly.  The original
    # computed a count for every pronunciation and discarded all but the
    # first, and used curses.ascii.isdigit where str.isdigit suffices.
    first_pronunciation = DICT[word.lower()][0]
    return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())
if __name__ == '__main__':
    from optparse import OptionParser
    parser = OptionParser()
    # Facets pick which dimensions (and their cross-product) the grade
    # levels are aggregated over.
    parser.add_option('--facet', dest='facets', action='append', type='choice', default=[],
                      choices=('month', 'year', 'chamber', 'congress', 'party', 'bioguide', 'state'),
                      help='Set the facet(s) to aggregate grade levels over')
    parser.add_option('--start-date', dest='start_date', default='1996-01-01',
                      help='How far back to limit the search, Required with month or year faceting.')
    parser.add_option('--end-date', dest='end_date', default=datetime.datetime.now().strftime('%Y-%m-%d'),
                      help='How far forward to limit the search, required with month or year faceting.')
    options, args = parser.parse_args()
    # Default to per-legislator aggregation when no facet was given.
    if not options.facets:
        options.facets = ['bioguide']
    # Both options have defaults, so these AttributeError fallbacks are
    # purely defensive.
    try:
        start_date = options.start_date
    except AttributeError:
        start_date = None
    try:
        end_date = options.end_date
    except AttributeError:
        end_date = None
    # kwargs maps each faceted API parameter to the list of values to
    # iterate; the query loop below walks the cross-product of all lists.
    kwargs = SortedDict()
    calls = []
    results = {}
    if 'bioguide' in options.facets:
        kwargs['bioguide_id'] = LEGISLATORS.keys()
    if 'chamber' in options.facets:
        kwargs['chamber'] = ['house', 'senate']
    if 'party' in options.facets:
        kwargs['party'] = ['D', 'R', 'I']
    if 'state' in options.facets:
        kwargs['state'] = STATES
    if 'month' in options.facets:
        import calendar
        cal = calendar.Calendar()
        # (year, month) pairs parsed from the YYYY-MM-DD option strings.
        start_month = (int(start_date.split('-')[0]), int(start_date.split('-')[1]))
        end_month = (int(end_date.split('-')[0]), int(end_date.split('-')[1]))
        current_month = list(start_month)
        starts = []
        ends = []
        # Build one (first-day, last-day) date pair per month in range.
        while current_month[0] < end_month[0] or current_month[1] <= end_month[1]:
            starts.append('%d-%d-01' % (current_month[0], current_month[1]))
            # itermonthdays yields 0 for out-of-month padding days, so
            # counting the positive entries gives the month's length.
            days_in_month = len([day for day in cal.itermonthdays(current_month[0], current_month[1]) if day > 0])
            ends.append('%d-%d-%d' % (current_month[0], current_month[1], days_in_month))
            current_month[1] += 1
            if current_month[1] == 13:
                current_month[0] += 1
                current_month[1] = 1
        kwargs['dates'] = zip(starts, ends)
    elif 'year' in options.facets:
        starts = []
        ends = []
        start_year = int(start_date.split('-')[0])
        end_year = int(end_date.split('-')[0])
        # Inclusive of the end year.
        for year in (range(start_year, end_year) + [end_year]):
            starts.append('%d-01-01' % year)
            ends.append('%d-12-31' % year)
        kwargs['dates'] = zip(starts, ends)
    elif 'date' in options.facets:
        raise NotImplementedError('Sorry, dates aren\'t available yet')
    elif 'congress' in options.facets:
        kwargs['congress'] = range(STARTING_CONGRESS, CURRENT_CONGRESS + 1)
    # Month/year/date faceting supplies per-call date windows, so drop the
    # global range limits to avoid conflicting constraints.
    if 'month' in options.facets or 'year' in options.facets or 'date' in options.facets:
        start_date = None
        end_date = None
    if start_date:
        kwargs['start_date'] = [start_date]
    if end_date:
        kwargs['end_date'] = [end_date]
    # cursor holds the current index into each kwargs value list and
    # boundary the maximum index; together they form an odometer over the
    # cross-product of facet values.
    cursor = SortedDict()
    boundary = SortedDict()
    for key in kwargs.keys():
        cursor[key] = 0
        boundary[key] = len(kwargs[key]) - 1
    # Increment order: rightmost (least-significant) key first.
    itercursor = cursor.keys()
    itercursor.reverse()
    try:
        # First positional argument is the output filename.
        filename = parser.parse_args()[1][0]
        file = open(filename, 'w+')
    except Exception, e:
        print e
        print 'usage: ./calculate_grade_levels.py [options] <filename>'
        sys.exit()
while cursor.values() <= boundary.values(): | |
page = 0 | |
corpus = '' | |
call = {} | |
for key, val in cursor.items(): | |
if key == 'dates': | |
call['start_date'] = kwargs[key][val][0] | |
call['end_date'] = kwargs[key][val][1] | |
else: | |
call[key] = kwargs[key][val] | |
callsum = '-'.join(['-'.join([key, str(val)]) for (key, val) in call.items()]) | |
print callsum | |
call.update(phrase='*', per_page=PER_PAGE) | |
if not call.get('bioguide_id'): | |
call.update(bioguide_id="['' TO *]") | |
while True: | |
print 'page %d...' % page | |
call.update(page=page) | |
resp = cw.text(**call) | |
if not len(resp): | |
break | |
page += 1 | |
for chunk in resp: | |
try: | |
corpus += ' %s' % ' '.join(chunk['speaking']) | |
except TypeError: | |
corpus += ' %s' % str(chunk['speaking']) | |
except: | |
pass | |
words = [word for word in tokenize(corpus) if (len(word) > 1) or (word.lower() in ['a', 'i'])] | |
sentences = sent_tokenize(corpus) | |
syllables = [] | |
misses = [] | |
for word in words: | |
try: | |
syllables.append(nsyl(word)) | |
except KeyError: | |
misses.append(word) | |
word_count = len(words) | |
sentence_count = len(sentences) | |
# pad syllable count out to word count | |
missing_syllables = word_count - len(syllables) | |
for i in range(missing_syllables): | |
syllables.append(SYLLABLE_AVG) | |
syllable_count = sum(syllables) | |
if word_count > 0 and sentence_count > 0: | |
results[callsum] = { | |
'words': word_count, | |
'syllables': syllable_count, | |
'missed_count': missing_syllables, | |
'missed_pct': missing_syllables / word_count, | |
'sentences': sentence_count, | |
'grade_level': (0.39 * (word_count / sentence_count)) + (11.8 * (syllable_count / word_count)) - 15.59, | |
'reading_ease': 206.835 - (1.015 * (word_count / sentence_count)) - (84.6 * (syllable_count / word_count)), | |
} | |
if call.get('bioguide_id') and not call['bioguide_id'].startswith('['): | |
leg = LEGISLATORS[call['bioguide_id']] | |
results[callsum].update(bioguide_id=call['bioguide_id'], | |
party=leg['party'], | |
chamber=leg['chamber'], | |
state=leg['state'], | |
name='%s, %s' % (leg['lastname'], leg['firstname'])) | |
if call.get('start_date') and call.get('end_date'): | |
results[callsum].update(start_date=call['start_date'], end_date=call['end_date']) | |
if call.get('chamber'): | |
results[callsum].update(chamber=call['chamber']) | |
if call.get('party'): | |
results[callsum].update(party=call['party']) | |
if call.get('congress'): | |
results[callsum].update(congress=call['congress']) | |
file.write(json.dumps(results[callsum])) | |
file.flush() | |
# break out of the loop if we're done | |
if cursor.values() == boundary.values(): | |
break | |
else: | |
file.write(",\n") | |
# otherwise, increment from right to left | |
for key in itercursor: | |
if cursor[key] < boundary[key]: | |
cursor[key] += 1 | |
break | |
else: | |
cursor[key] = 0 | |
file.write(']') | |
file.close() |