drinks/calculate_grade_levels.py

## calculate_grade_levels.py
#!/usr/bin/env python
from __future__ import division
from curses.ascii import isdigit
import json
import sys
import datetime
# import re

from django.utils.datastructures import SortedDict
from django.contrib.localflavor.us.us_states import STATE_CHOICES
from nltk import sent_tokenize, regexp_tokenize
from nltk.corpus import cmudict
from sunlight import capitolwords as cw
from sunlight import congress

SYLLABLE_AVG = 1.66
STARTING_CONGRESS = 104
CURRENT_CONGRESS = 112
PER_PAGE = 1000
DICT = cmudict.dict()
LEGISLATORS = {}
for leg in congress.legislators():
    if leg['bioguide_id']:
        LEGISLATORS[leg['bioguide_id']] = leg
STATES = [state[0] for state in STATE_CHOICES]


def tokenize(term):
    # Adapted From Natural Language Processing with Python
    regex = r'''(?xi)
    (?:H|S)\.\ ?(?:(?:J|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+ # Bills
  | ([A-Z]\.)+                                              # Abbreviations (U.S.A., etc.)
  | ([A-Z]+\&[A-Z]+)                                        # Internal ampersands (AT&T, etc.)
  | (Mr\.|Dr\.|Mrs\.|Ms\.)                                  # Mr., Mrs., etc.
  | \d*\.\d+                                                # Numbers with decimal points.
  | \d\d?:\d\d                                              # Times.
  | \$?[,\.0-9]+\d                                          # Numbers with thousands separators, (incl currency).
  | (((a|A)|(p|P))\.(m|M)\.)                                # a.m., p.m., A.M., P.M.
  | \w+((-|')\w+)*                                          # Words with optional internal hyphens.
  | \$?\d+(\.\d+)?%?                                        # Currency and percentages.
  | (?<=\b)\.\.\.(?=\b)                                     # Ellipses surrounded by word borders
  | [][.,;"'?():-_`]
    '''
    # Strip punctuation from this one; solr doesn't know about any of it
    tokens = regexp_tokenize(term, regex)
    # tokens = [re.sub(r'[.,?!]', '', token) for token in tokens]  # instead of this we just test word length
    return tokens


def nsyl(word):
    return [len(list(y for y in x if isdigit(y[-1]))) for x in DICT[word.lower()]][0]


if __name__ == '__main__':
    from optparse import OptionParser
    parser = OptionParser()

    parser.add_option('--facet', dest='facets', action='append', type='choice', default=[],
                      choices=('month', 'year', 'chamber', 'congress', 'party', 'bioguide', 'state'),
                      help='Set the facet(s) to aggregate grade levels over')
    parser.add_option('--start-date', dest='start_date', default='1996-01-01',
                      help='How far back to limit the search, Required with month or year faceting.')
    parser.add_option('--end-date', dest='end_date', default=datetime.datetime.now().strftime('%Y-%m-%d'),
                      help='How far forward to limit the search, required with month or year faceting.')
    options, args = parser.parse_args()
    if not options.facets:
        options.facets = ['bioguide']

    try:
        start_date = options.start_date
    except AttributeError:
        start_date = None
    try:
        end_date = options.end_date
    except AttributeError:
        end_date = None
    kwargs = SortedDict()
    calls = []
    results = {}

    if 'bioguide' in options.facets:
        kwargs['bioguide_id'] = LEGISLATORS.keys()
    if 'chamber' in options.facets:
        kwargs['chamber'] = ['house', 'senate']
    if 'party' in options.facets:
        kwargs['party'] = ['D', 'R', 'I']
    if 'state' in options.facets:
        kwargs['state'] = STATES
    if 'month' in options.facets:
        import calendar
        cal = calendar.Calendar()
        start_month = (int(start_date.split('-')[0]), int(start_date.split('-')[1]))
        end_month = (int(end_date.split('-')[0]), int(end_date.split('-')[1]))
        current_month = list(start_month)
        starts = []
        ends = []
        while current_month[0] < end_month[0] or current_month[1] <= end_month[1]:
            starts.append('%d-%d-01' % (current_month[0], current_month[1]))
            days_in_month = len([day for day in cal.itermonthdays(current_month[0], current_month[1]) if day > 0])
            ends.append('%d-%d-%d' % (current_month[0], current_month[1], days_in_month))
            current_month[1] += 1
            if current_month[1] == 13:
                current_month[0] += 1
                current_month[1] = 1
        kwargs['dates'] = zip(starts, ends)
    elif 'year' in options.facets:
        starts = []
        ends = []
        start_year = int(start_date.split('-')[0])
        end_year = int(end_date.split('-')[0])
        for year in (range(start_year, end_year) + [end_year]):
            starts.append('%d-01-01' % year)
            ends.append('%d-12-31' % year)
        kwargs['dates'] = zip(starts, ends)
    elif 'date' in options.facets:
        raise NotImplementedError('Sorry, dates aren\'t available yet')
    elif 'congress' in options.facets:
        kwargs['congress'] = range(STARTING_CONGRESS, CURRENT_CONGRESS + 1)
    if 'month' in options.facets or 'year' in options.facets or 'date' in options.facets:
        start_date = None
        end_date = None
    if start_date:
        kwargs['start_date'] = [start_date]
    if end_date:
        kwargs['end_date'] = [end_date]

    cursor = SortedDict()
    boundary = SortedDict()
    for key in kwargs.keys():
        cursor[key] = 0
        boundary[key] = len(kwargs[key]) - 1
    itercursor = cursor.keys()
    itercursor.reverse()

    try:
        filename = parser.parse_args()[1][0]
        file = open(filename, 'w+')
    except Exception, e:
        print e
        print 'usage: ./calculate_grade_levels.py [options] <filename>'
        sys.exit()

    file.write('[')

    while cursor.values() <= boundary.values():
        page = 0
        corpus = ''
        call = {}
        for key, val in cursor.items():
            if key == 'dates':
                call['start_date'] = kwargs[key][val][0]
                call['end_date'] = kwargs[key][val][1]
            else:
                call[key] = kwargs[key][val]
        callsum = '-'.join(['-'.join([key, str(val)]) for (key,  val) in call.items()])
        print callsum

        call.update(phrase='*', per_page=PER_PAGE)
        if not call.get('bioguide_id'):
            call.update(bioguide_id="['' TO *]")
        while True:
            print 'page %d...' % page
            call.update(page=page)
            resp = cw.text(**call)
            if not len(resp):
                break
            page += 1
            for chunk in resp:
                try:
                    corpus += ' %s' % ' '.join(chunk['speaking'])
                except TypeError:
                    corpus += ' %s' % str(chunk['speaking'])
                except:
                    pass

        words = [word for word in tokenize(corpus) if (len(word) > 1) or (word.lower() in ['a', 'i'])]
        sentences = sent_tokenize(corpus)
        syllables = []
        misses = []
        for word in words:
            try:
                syllables.append(nsyl(word))
            except KeyError:
                misses.append(word)

        word_count = len(words)
        sentence_count = len(sentences)

        # pad syllable count out to word count
        missing_syllables = word_count - len(syllables)
        for i in range(missing_syllables):
            syllables.append(SYLLABLE_AVG)
        syllable_count = sum(syllables)
        if word_count > 0 and sentence_count > 0:
            results[callsum] = {
                'words': word_count,
                'syllables': syllable_count,
                'missed_count': missing_syllables,
                'missed_pct': missing_syllables / word_count,
                'sentences': sentence_count,
                'grade_level': (0.39 * (word_count / sentence_count)) + (11.8 * (syllable_count / word_count)) - 15.59,
                'reading_ease': 206.835 - (1.015 * (word_count / sentence_count)) - (84.6 * (syllable_count / word_count)),
            }

            if call.get('bioguide_id') and not call['bioguide_id'].startswith('['):
                leg = LEGISLATORS[call['bioguide_id']]
                results[callsum].update(bioguide_id=call['bioguide_id'],
                                        party=leg['party'],
                                        chamber=leg['chamber'],
                                        state=leg['state'],
                                        name='%s, %s' % (leg['lastname'], leg['firstname']))
            if call.get('start_date') and call.get('end_date'):
                results[callsum].update(start_date=call['start_date'], end_date=call['end_date'])
            if call.get('chamber'):
                results[callsum].update(chamber=call['chamber'])
            if call.get('party'):
                results[callsum].update(party=call['party'])
            if call.get('congress'):
                results[callsum].update(congress=call['congress'])
            file.write(json.dumps(results[callsum]))
            file.flush()

        # break out of the loop if we're done
        if cursor.values() == boundary.values():
            break
        else:
            file.write(",\n")

        # otherwise, increment from right to left
        for key in itercursor:
            if cursor[key] < boundary[key]:
                cursor[key] += 1
                break
            else:
                cursor[key] = 0

    file.write(']')
    file.close()
	#!/usr/bin/env python
	from __future__ import division
	from curses.ascii import isdigit
	import json
	import sys
	import datetime
	# import re

	from django.utils.datastructures import SortedDict
	from django.contrib.localflavor.us.us_states import STATE_CHOICES
	from nltk import sent_tokenize, regexp_tokenize
	from nltk.corpus import cmudict
	from sunlight import capitolwords as cw
	from sunlight import congress

	SYLLABLE_AVG = 1.66
	STARTING_CONGRESS = 104
	CURRENT_CONGRESS = 112
	PER_PAGE = 1000
	DICT = cmudict.dict()
	LEGISLATORS = {}
	for leg in congress.legislators():
	if leg['bioguide_id']:
	LEGISLATORS[leg['bioguide_id']] = leg
	STATES = [state[0] for state in STATE_CHOICES]


	def tokenize(term):
	# Adapted From Natural Language Processing with Python
	regex = r'''(?xi)
	(?:H\|S)\.\ ?(?:(?:J\|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+ # Bills
	\| ([A-Z]\.)+ # Abbreviations (U.S.A., etc.)
	\| ([A-Z]+\&[A-Z]+) # Internal ampersands (AT&T, etc.)
	\| (Mr\.\|Dr\.\|Mrs\.\|Ms\.) # Mr., Mrs., etc.
	\| \d*\.\d+ # Numbers with decimal points.
	\| \d\d?:\d\d # Times.
	\| \$?[,\.0-9]+\d # Numbers with thousands separators, (incl currency).
	\| (((a\|A)\|(p\|P))\.(m\|M)\.) # a.m., p.m., A.M., P.M.
	\| \w+((-\|')\w+)* # Words with optional internal hyphens.
	\| \$?\d+(\.\d+)?%? # Currency and percentages.
	\| (?<=\b)\.\.\.(?=\b) # Ellipses surrounded by word borders
	\| [][.,;"'?():-_`]
	'''
	# Strip punctuation from this one; solr doesn't know about any of it
	tokens = regexp_tokenize(term, regex)
	# tokens = [re.sub(r'[.,?!]', '', token) for token in tokens] # instead of this we just test word length
	return tokens


	def nsyl(word):
	return [len(list(y for y in x if isdigit(y[-1]))) for x in DICT[word.lower()]][0]


	if __name__ == '__main__':
	from optparse import OptionParser
	parser = OptionParser()

	parser.add_option('--facet', dest='facets', action='append', type='choice', default=[],
	choices=('month', 'year', 'chamber', 'congress', 'party', 'bioguide', 'state'),
	help='Set the facet(s) to aggregate grade levels over')
	parser.add_option('--start-date', dest='start_date', default='1996-01-01',
	help='How far back to limit the search, Required with month or year faceting.')
	parser.add_option('--end-date', dest='end_date', default=datetime.datetime.now().strftime('%Y-%m-%d'),
	help='How far forward to limit the search, required with month or year faceting.')
	options, args = parser.parse_args()
	if not options.facets:
	options.facets = ['bioguide']

	try:
	start_date = options.start_date
	except AttributeError:
	start_date = None
	try:
	end_date = options.end_date
	except AttributeError:
	end_date = None
	kwargs = SortedDict()
	calls = []
	results = {}

	if 'bioguide' in options.facets:
	kwargs['bioguide_id'] = LEGISLATORS.keys()
	if 'chamber' in options.facets:
	kwargs['chamber'] = ['house', 'senate']
	if 'party' in options.facets:
	kwargs['party'] = ['D', 'R', 'I']
	if 'state' in options.facets:
	kwargs['state'] = STATES
	if 'month' in options.facets:
	import calendar
	cal = calendar.Calendar()
	start_month = (int(start_date.split('-')[0]), int(start_date.split('-')[1]))
	end_month = (int(end_date.split('-')[0]), int(end_date.split('-')[1]))
	current_month = list(start_month)
	starts = []
	ends = []
	while current_month[0] < end_month[0] or current_month[1] <= end_month[1]:
	starts.append('%d-%d-01' % (current_month[0], current_month[1]))
	days_in_month = len([day for day in cal.itermonthdays(current_month[0], current_month[1]) if day > 0])
	ends.append('%d-%d-%d' % (current_month[0], current_month[1], days_in_month))
	current_month[1] += 1
	if current_month[1] == 13:
	current_month[0] += 1
	current_month[1] = 1
	kwargs['dates'] = zip(starts, ends)
	elif 'year' in options.facets:
	starts = []
	ends = []
	start_year = int(start_date.split('-')[0])
	end_year = int(end_date.split('-')[0])
	for year in (range(start_year, end_year) + [end_year]):
	starts.append('%d-01-01' % year)
	ends.append('%d-12-31' % year)
	kwargs['dates'] = zip(starts, ends)
	elif 'date' in options.facets:
	raise NotImplementedError('Sorry, dates aren\'t available yet')
	elif 'congress' in options.facets:
	kwargs['congress'] = range(STARTING_CONGRESS, CURRENT_CONGRESS + 1)
	if 'month' in options.facets or 'year' in options.facets or 'date' in options.facets:
	start_date = None
	end_date = None
	if start_date:
	kwargs['start_date'] = [start_date]
	if end_date:
	kwargs['end_date'] = [end_date]

	cursor = SortedDict()
	boundary = SortedDict()
	for key in kwargs.keys():
	cursor[key] = 0
	boundary[key] = len(kwargs[key]) - 1
	itercursor = cursor.keys()
	itercursor.reverse()

	try:
	filename = parser.parse_args()[1][0]
	file = open(filename, 'w+')
	except Exception, e:
	print e
	print 'usage: ./calculate_grade_levels.py [options] <filename>'
	sys.exit()

	file.write('[')

	while cursor.values() <= boundary.values():
	page = 0
	corpus = ''
	call = {}
	for key, val in cursor.items():
	if key == 'dates':
	call['start_date'] = kwargs[key][val][0]
	call['end_date'] = kwargs[key][val][1]
	else:
	call[key] = kwargs[key][val]
	callsum = '-'.join(['-'.join([key, str(val)]) for (key, val) in call.items()])
	print callsum

	call.update(phrase='*', per_page=PER_PAGE)
	if not call.get('bioguide_id'):
	call.update(bioguide_id="['' TO *]")
	while True:
	print 'page %d...' % page
	call.update(page=page)
	resp = cw.text(**call)
	if not len(resp):
	break
	page += 1
	for chunk in resp:
	try:
	corpus += ' %s' % ' '.join(chunk['speaking'])
	except TypeError:
	corpus += ' %s' % str(chunk['speaking'])
	except:
	pass

	words = [word for word in tokenize(corpus) if (len(word) > 1) or (word.lower() in ['a', 'i'])]
	sentences = sent_tokenize(corpus)
	syllables = []
	misses = []
	for word in words:
	try:
	syllables.append(nsyl(word))
	except KeyError:
	misses.append(word)

	word_count = len(words)
	sentence_count = len(sentences)

	# pad syllable count out to word count
	missing_syllables = word_count - len(syllables)
	for i in range(missing_syllables):
	syllables.append(SYLLABLE_AVG)
	syllable_count = sum(syllables)
	if word_count > 0 and sentence_count > 0:
	results[callsum] = {
	'words': word_count,
	'syllables': syllable_count,
	'missed_count': missing_syllables,
	'missed_pct': missing_syllables / word_count,
	'sentences': sentence_count,
	'grade_level': (0.39 * (word_count / sentence_count)) + (11.8 * (syllable_count / word_count)) - 15.59,
	'reading_ease': 206.835 - (1.015 * (word_count / sentence_count)) - (84.6 * (syllable_count / word_count)),
	}

	if call.get('bioguide_id') and not call['bioguide_id'].startswith('['):
	leg = LEGISLATORS[call['bioguide_id']]
	results[callsum].update(bioguide_id=call['bioguide_id'],
	party=leg['party'],
	chamber=leg['chamber'],
	state=leg['state'],
	name='%s, %s' % (leg['lastname'], leg['firstname']))
	if call.get('start_date') and call.get('end_date'):
	results[callsum].update(start_date=call['start_date'], end_date=call['end_date'])
	if call.get('chamber'):
	results[callsum].update(chamber=call['chamber'])
	if call.get('party'):
	results[callsum].update(party=call['party'])
	if call.get('congress'):
	results[callsum].update(congress=call['congress'])
	file.write(json.dumps(results[callsum]))
	file.flush()

	# break out of the loop if we're done
	if cursor.values() == boundary.values():
	break
	else:
	file.write(",\n")

	# otherwise, increment from right to left
	for key in itercursor:
	if cursor[key] < boundary[key]:
	cursor[key] += 1
	break
	else:
	cursor[key] = 0

	file.write(']')
	file.close()