public
Created

Flesch-Kincaid grade level processing against Capitol Words API

  • Download Gist
calculate_grade_levels.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238
#!/usr/bin/env python
from __future__ import division
from curses.ascii import isdigit
import json
import sys
import datetime
# import re
 
from django.utils.datastructures import SortedDict
from django.contrib.localflavor.us.us_states import STATE_CHOICES
from nltk import sent_tokenize, regexp_tokenize
from nltk.corpus import cmudict
from sunlight import capitolwords as cw
from sunlight import congress
 
# Average syllables-per-word, used to pad words missing from cmudict.
SYLLABLE_AVG = 1.66
# Congress range covered by the Capitol Words corpus at time of writing.
STARTING_CONGRESS = 104
CURRENT_CONGRESS = 112
# Results requested per Capitol Words API page.
PER_PAGE = 1000
# CMU Pronouncing Dictionary: word -> list of pronunciations, each a
# list of phoneme strings (e.g. 'AH0' — trailing digit marks stress).
DICT = cmudict.dict()
# Map bioguide_id -> legislator record; presumably only currently
# serving legislators are returned — TODO confirm against the API.
LEGISLATORS = {}
for leg in congress.legislators():
    if leg['bioguide_id']:
        LEGISLATORS[leg['bioguide_id']] = leg
# Two-letter state/territory codes from Django's US localflavor.
STATES = [state[0] for state in STATE_CHOICES]
 
 
def tokenize(term):
    """Split *term* into word-level tokens.

    Uses a Capitol-Words-aware pattern (adapted from "Natural Language
    Processing with Python") that keeps bill numbers, abbreviations,
    times, currency amounts and hyphenated words intact as single
    tokens. Single-character punctuation tokens are not stripped here;
    callers filter on token length instead.
    """
    pattern = r'''(?xi)
(?:H|S)\.\ ?(?:(?:J|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+ # Bills
| ([A-Z]\.)+ # Abbreviations (U.S.A., etc.)
| ([A-Z]+\&[A-Z]+) # Internal ampersands (AT&T, etc.)
| (Mr\.|Dr\.|Mrs\.|Ms\.) # Mr., Mrs., etc.
| \d*\.\d+ # Numbers with decimal points.
| \d\d?:\d\d # Times.
| \$?[,\.0-9]+\d # Numbers with thousands separators, (incl currency).
| (((a|A)|(p|P))\.(m|M)\.) # a.m., p.m., A.M., P.M.
| \w+((-|')\w+)* # Words with optional internal hyphens.
| \$?\d+(\.\d+)?%? # Currency and percentages.
| (?<=\b)\.\.\.(?=\b) # Ellipses surrounded by word borders
| [][.,;"'?():-_`]
'''
    return regexp_tokenize(term, pattern)
 
 
def nsyl(word, pronunciations=None):
    """Return the syllable count of *word*.

    Syllables are counted as the number of phonemes carrying a stress
    digit (0/1/2) in the word's first listed pronunciation, matching
    the original behavior of taking entry ``[0]``.

    Args:
        word: word to look up; lowercased before the lookup.
        pronunciations: optional mapping of lowercase word -> list of
            phoneme-string lists (CMU dict format). Defaults to the
            module-level ``DICT``.

    Raises:
        KeyError: if *word* is not in the dictionary.
    """
    if pronunciations is None:
        pronunciations = DICT
    # Only the first pronunciation matters, so index it directly
    # instead of computing counts for every pronunciation and then
    # discarding all but the first (as the original did). str.isdigit
    # replaces curses.ascii.isdigit for the ASCII stress markers.
    first = pronunciations[word.lower()][0]
    return sum(1 for phoneme in first if phoneme[-1].isdigit())
 
 
if __name__ == '__main__':
    from optparse import OptionParser
    parser = OptionParser()

    # Each selected facet contributes one axis to the cartesian product
    # of Capitol Words API calls enumerated below.
    parser.add_option('--facet', dest='facets', action='append', type='choice', default=[],
                      choices=('month', 'year', 'chamber', 'congress', 'party', 'bioguide', 'state'),
                      help='Set the facet(s) to aggregate grade levels over')
    parser.add_option('--start-date', dest='start_date', default='1996-01-01',
                      help='How far back to limit the search, Required with month or year faceting.')
    parser.add_option('--end-date', dest='end_date', default=datetime.datetime.now().strftime('%Y-%m-%d'),
                      help='How far forward to limit the search, required with month or year faceting.')
    options, args = parser.parse_args()
    if not options.facets:
        # Default: aggregate per legislator.
        options.facets = ['bioguide']

    # optparse supplies defaults for both dates, so these guards are
    # defensive; the dates fall back to None if the attributes are missing.
    try:
        start_date = options.start_date
    except AttributeError:
        start_date = None
    try:
        end_date = options.end_date
    except AttributeError:
        end_date = None
    # kwargs maps facet name -> list of candidate values. Insertion order
    # matters: the cursor below walks it odometer-style (hence SortedDict,
    # pre-dating ordered builtin dicts in this Python 2 codebase).
    kwargs = SortedDict()
    calls = []
    results = {}

    if 'bioguide' in options.facets:
        kwargs['bioguide_id'] = LEGISLATORS.keys()
    if 'chamber' in options.facets:
        kwargs['chamber'] = ['house', 'senate']
    if 'party' in options.facets:
        kwargs['party'] = ['D', 'R', 'I']
    if 'state' in options.facets:
        kwargs['state'] = STATES
    if 'month' in options.facets:
        # Build one (first day, last day) date pair per calendar month
        # between start_date and end_date.
        import calendar
        cal = calendar.Calendar()
        start_month = (int(start_date.split('-')[0]), int(start_date.split('-')[1]))
        end_month = (int(end_date.split('-')[0]), int(end_date.split('-')[1]))
        current_month = list(start_month)  # mutable [year, month]
        starts = []
        ends = []
        # NOTE(review): year and month are compared independently here.
        # When end_month[1] == 12 the month test is always true, so the
        # loop never terminates once the year passes end_month[0]; a
        # tuple comparison (tuple(current_month) <= end_month) looks
        # like the intended condition — confirm before relying on
        # --facet month with a December end date.
        while current_month[0] < end_month[0] or current_month[1] <= end_month[1]:
            starts.append('%d-%d-01' % (current_month[0], current_month[1]))
            # itermonthdays yields 0 for padding days belonging to the
            # neighboring months, so counting positive entries gives the
            # month's length.
            days_in_month = len([day for day in cal.itermonthdays(current_month[0], current_month[1]) if day > 0])
            ends.append('%d-%d-%d' % (current_month[0], current_month[1], days_in_month))
            current_month[1] += 1
            if current_month[1] == 13:
                current_month[0] += 1
                current_month[1] = 1
        kwargs['dates'] = zip(starts, ends)
    elif 'year' in options.facets:
        # One (Jan 1, Dec 31) pair per year, end year inclusive.
        starts = []
        ends = []
        start_year = int(start_date.split('-')[0])
        end_year = int(end_date.split('-')[0])
        for year in (range(start_year, end_year) + [end_year]):
            starts.append('%d-01-01' % year)
            ends.append('%d-12-31' % year)
        kwargs['dates'] = zip(starts, ends)
    elif 'date' in options.facets:
        raise NotImplementedError('Sorry, dates aren\'t available yet')
    elif 'congress' in options.facets:
        kwargs['congress'] = range(STARTING_CONGRESS, CURRENT_CONGRESS + 1)
    # Date-style facets replace the global start/end date filters.
    if 'month' in options.facets or 'year' in options.facets or 'date' in options.facets:
        start_date = None
        end_date = None
    if start_date:
        kwargs['start_date'] = [start_date]
    if end_date:
        kwargs['end_date'] = [end_date]

    # cursor holds the current index into each facet's value list and
    # boundary the last valid index; together they enumerate the
    # cartesian product of facet values.
    cursor = SortedDict()
    boundary = SortedDict()
    for key in kwargs.keys():
        cursor[key] = 0
        boundary[key] = len(kwargs[key]) - 1
    # Odometer order: increment the rightmost (last-inserted) facet first.
    itercursor = cursor.keys()
    itercursor.reverse()

    try:
        # Positional arg (re-parsed here) is the output filename.
        filename = parser.parse_args()[1][0]
        file = open(filename, 'w+')
    except Exception, e:
        print e
        print 'usage: ./calculate_grade_levels.py [options] <filename>'
        sys.exit()

    # Output is a JSON array written incrementally, one object per
    # facet combination.
    file.write('[')

    # Py2 list comparison is lexicographic; the actual termination is
    # the equality check at the bottom of the loop body.
    while cursor.values() <= boundary.values():
        page = 0
        corpus = ''
        call = {}
        # Translate the cursor position into concrete API parameters.
        for key, val in cursor.items():
            if key == 'dates':
                call['start_date'] = kwargs[key][val][0]
                call['end_date'] = kwargs[key][val][1]
            else:
                call[key] = kwargs[key][val]
        # Human-readable key identifying this facet combination.
        callsum = '-'.join(['-'.join([key, str(val)]) for (key, val) in call.items()])
        print callsum

        call.update(phrase='*', per_page=PER_PAGE)
        if not call.get('bioguide_id'):
            # Solr range syntax: restrict to records that have any
            # bioguide_id at all.
            call.update(bioguide_id="['' TO *]")
        # Page through every result for this combination, concatenating
        # all spoken text into one corpus string.
        while True:
            print 'page %d...' % page
            call.update(page=page)
            resp = cw.text(**call)
            if not len(resp):
                break
            page += 1
            for chunk in resp:
                try:
                    # 'speaking' is normally a list of paragraphs.
                    corpus += ' %s' % ' '.join(chunk['speaking'])
                except TypeError:
                    # Fall back if it is a bare string (join would raise).
                    corpus += ' %s' % str(chunk['speaking'])
                except:
                    # Best-effort: skip malformed chunks entirely.
                    pass

        # Drop one-character tokens unless they are genuine one-letter
        # words ('a', 'I'); this discards stray punctuation tokens.
        words = [word for word in tokenize(corpus) if (len(word) > 1) or (word.lower() in ['a', 'i'])]
        sentences = sent_tokenize(corpus)
        syllables = []
        misses = []
        for word in words:
            try:
                syllables.append(nsyl(word))
            except KeyError:
                # Not in cmudict; padded with SYLLABLE_AVG below.
                misses.append(word)

        word_count = len(words)
        sentence_count = len(sentences)

        # pad syllable count out to word count
        missing_syllables = word_count - len(syllables)
        for i in range(missing_syllables):
            syllables.append(SYLLABLE_AVG)
        syllable_count = sum(syllables)
        if word_count > 0 and sentence_count > 0:
            results[callsum] = {
                'words': word_count,
                'syllables': syllable_count,
                'missed_count': missing_syllables,
                'missed_pct': missing_syllables / word_count,
                'sentences': sentence_count,
                # Flesch-Kincaid grade level and Flesch reading ease;
                # true division is active via __future__ at file top.
                'grade_level': (0.39 * (word_count / sentence_count)) + (11.8 * (syllable_count / word_count)) - 15.59,
                'reading_ease': 206.835 - (1.015 * (word_count / sentence_count)) - (84.6 * (syllable_count / word_count)),
            }

            # A real bioguide_id (not the "['' TO *]" wildcard) lets us
            # attach legislator metadata to the record.
            if call.get('bioguide_id') and not call['bioguide_id'].startswith('['):
                leg = LEGISLATORS[call['bioguide_id']]
                results[callsum].update(bioguide_id=call['bioguide_id'],
                                        party=leg['party'],
                                        chamber=leg['chamber'],
                                        state=leg['state'],
                                        name='%s, %s' % (leg['lastname'], leg['firstname']))
            if call.get('start_date') and call.get('end_date'):
                results[callsum].update(start_date=call['start_date'], end_date=call['end_date'])
            if call.get('chamber'):
                results[callsum].update(chamber=call['chamber'])
            if call.get('party'):
                results[callsum].update(party=call['party'])
            if call.get('congress'):
                results[callsum].update(congress=call['congress'])
            file.write(json.dumps(results[callsum]))
            file.flush()

        # break out of the loop if we're done
        if cursor.values() == boundary.values():
            break
        else:
            # NOTE(review): the separator is written even when the
            # combination above produced no result object, which can
            # leave consecutive commas in the JSON output — confirm.
            file.write(",\n")

        # otherwise, increment from right to left
        for key in itercursor:
            if cursor[key] < boundary[key]:
                cursor[key] += 1
                break
            else:
                # This facet wrapped around; carry into the next one left.
                cursor[key] = 0

    file.write(']')
    file.close()

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.