# Parse appropriations from Cato XML
import csv
import json
import numbers
import os
import re

from bs4 import BeautifulSoup
# Format numbers
def commafy(x):
    """Return the integer *x* as a decimal string with comma thousands separators.

    For example, 1234567 becomes '1,234,567' and -1000 becomes '-1,000'.

    Raises:
        TypeError: if *x* is not an integer (bools are rejected as well).
    """
    # bool is a subclass of int; the original exact-type check rejected it,
    # so it is still rejected here.
    if isinstance(x, bool) or not isinstance(x, numbers.Integral):
        raise TypeError("Parameter must be an integer.")
    if x < 0:
        # Format the magnitude, then put the sign back in front.
        return '-' + commafy(-x)
    result = ''
    # Peel off three digits at a time from the right, zero-padding each group.
    while x >= 1000:
        x, r = divmod(x, 1000)
        result = ",%03d%s" % (r, result)
    return "%d%s" % (x, result)
# Use GovTrack's bioguide spreadsheet to get the state and formal name of
# lawmakers. We need two lookup tables because a sponsor can be referenced by
# either of two kinds of id (the 'bioguide' column or the 'lis' column).
# The CSV comes from https://github.com/unitedstates/congress-legislators
# (a simple Python script converts current_legislators from YAML to CSV).
bioguide = {}      # 'bioguide' column value -> CSV row
bioguide_lis = {}  # 'lis' column value -> CSV row
with open('bioguide.csv', 'r') as bioguide_file:
    fbioguide = csv.DictReader(bioguide_file)
    # Build both tables in a single pass over the file.
    for line in fbioguide:
        # Skip blank ids (presumably legislators who have no id of that kind)
        # so that an empty sponsor id can never match an arbitrary row.
        if line['bioguide']:
            bioguide[line['bioguide']] = line
        if line['lis']:
            bioguide_lis[line['lis']] = line
def removeNonAscii(s):
    """Return *s* with every character whose code point is 128 or above removed."""
    return "".join(ch for ch in s if ord(ch) < 128)
def process_sponsor(x):
    """Extract sponsor details from a parsed bill document.

    Args:
        x: an object with a ``find`` method that returns the ``sponsor``
            element or None (a BeautifulSoup document in this script).

    Returns:
        A tuple ``(sponsor_name, sponsor_id, govtrack, state)``.
        All four are None when the document has no ``sponsor`` element.
        When the sponsor id is found in ``bioguide`` or ``bioguide_lis``, the
        name is rebuilt from the CSV row as
        ``"<Term_type>. <official_full> (<P>-<state>)"`` with non-ASCII
        characters removed; otherwise the name is the element's text and
        ``govtrack``/``state`` are empty strings.
    """
    sponsor = x.find('sponsor')
    if sponsor is None:
        return (None, None, None, None)

    sponsor_name = sponsor.string
    # .get() so a sponsor element without a name-id attribute does not raise.
    sponsor_id = sponsor.get('name-id')

    # The id may be of either kind, so try both lookup tables.
    b = None
    if sponsor_id in bioguide:
        b = bioguide[sponsor_id]
    elif sponsor_id in bioguide_lis:
        b = bioguide_lis[sponsor_id]

    if b:
        govtrack = b['govtrack']
        state = b['state']
        # [:1] instead of [0] so an empty party value does not raise IndexError.
        party_initial = b['party'][:1].upper()
        sponsor_name = (b['term_type'].title() + '. ' + b['official_full']
                        + ' (' + party_initial + '-' + state + ')')
        sponsor_name = removeNonAscii(sponsor_name)
    else:
        govtrack = ''
        state = ''
    return (sponsor_name, sponsor_id, govtrack, state)
# Prepare the output directory, then fetch and unpack a fresh copy of the
# Cato archive into dl/. Options are placed before operands so the commands
# also work with non-GNU rm/chmod.
os.system('mkdir -p output')  # -p: do not complain when output/ already exists
os.system('rm -rf dl')
os.system('mkdir dl')
os.system('wget http://deepbills.cato.org/download -O dl/dl.zip')
os.system('unzip dl/dl.zip -d dl')
os.system('chmod -R 755 dl')  # weird permissions issue with Cato ZIP file
# Walk every bill file in dl/bills, total its appropriations, and collect one
# summary dict per bill that carries money into billlist.
billlist = []
reg = re.compile(r'(\d+)(\w+\d+)')
dir_files = os.listdir('dl/bills')
known_files = set(dir_files)  # constant-time membership tests inside the loop

# Introduced-version suffix -> suffixes of later versions that supersede it.
superseded_by = {
    'ih.xml': ('eh.xml', 'rh.xml'),
    'is.xml': ('es.xml', 'rs.xml'),
}
# Passed-version suffix -> (status label, earlier-version suffixes to try, in order).
earlier_versions = {
    'eh.xml': ('Passed the House', ('ih.xml', 'rh.xml')),
    'es.xml': ('Passed the Senate', ('is.xml', 'rs.xml')),
}

for f in dir_files:
    # Split the filename into the congress and bill number to generate a link to govtrack.
    m = reg.match(f.replace('.xml', ''))
    if m is None:
        # Not a bill file name we understand; skip it rather than crash.
        continue
    (congress, billnum) = m.groups()
    suffix = f[-6:]  # version code plus extension, e.g. 'ih.xml'

    # If there's another file representing a later version of the bill, skip this one.
    if any(congress + billnum + later in known_files
           for later in superseded_by.get(suffix, ())):
        continue

    with open('dl/bills/' + f, 'r') as bill_file:
        s = removeNonAscii(bill_file.read())
    x = BeautifulSoup(s, "xml")

    # Title: prefer <title>, then fall back to <short-title>, then <official-title>.
    title_tag = x.find('title')
    title = title_tag.string if title_tag is not None else None
    if not title:
        for fallback in ('short-title', 'official-title'):
            fallback_tag = x.find(fallback)
            if fallback_tag is not None:
                title = fallback_tag.string
                break

    a3 = []             # there can be many appropriations in a bill; this is a list of them
    amount = 0          # the running tally of total cost
    indefinite = False  # does "such sums as necessary" appear anywhere? if so, that changes our dollar amount
    for a in x.findAll('funds-and-year'):
        print(a)
        if a.string:
            a3.append(a.string)
        raw_amount = a.get('amount')
        if raw_amount:
            if raw_amount == 'indefinite':
                indefinite = True
            else:
                try:
                    amount += int(raw_amount)
                except ValueError:
                    print('%s amount couldnt be parsed as int' % a)

    date_tag = x.find('date')
    introdate = date_tag.string if date_tag is not None else None

    # Collect every action date; the latest one is the bill's most recent activity.
    lastaction = []
    for a in x.findAll('action-date'):
        action_date = a.get('date')
        if action_date is not None:
            lastaction.append(action_date)
    # Special case for getting the date from passed bills, e.g.
    # ="20130730" chamber="House">Passed the House of Representatives July 30, 2013.</attestation-date>
    attestation = x.find('attestation-date')
    if attestation is not None:
        attestation_date = attestation.get('date')
        if attestation_date and len(attestation_date) == 8:
            lastaction.append(attestation_date)
    lastaction.sort()
    if lastaction:
        latest = lastaction[-1]
        # Reformat YYYYMMDD as YYYY-MM-DD.
        actiondate = latest[:4] + '-' + latest[4:6] + '-' + latest[6:]
    else:
        actiondate = None
    actionall = ';'.join(lastaction)

    # Status comes from the resolution stage if present, otherwise the bill stage.
    # TypeError: the element is absent (None); KeyError: the attribute is absent.
    status = None
    try:
        status = x.resolution['resolution-stage']
    except (TypeError, KeyError):
        try:
            status = x.bill['bill-stage'].replace('-', ' ')
        except (TypeError, KeyError):
            pass

    purpose = x.find('property', {'name': 'purpose'})
    if purpose:
        purpose = purpose.string

    (sponsor_name, sponsor_id, govtrack, state) = process_sponsor(x)
    if not sponsor_name and suffix in earlier_versions:
        # Passed bills don't include sponsors, so open the file holding an
        # earlier version of the same bill to grab the sponsor from it.
        status, candidate_suffixes = earlier_versions[suffix]
        earlier_version = next(
            (congress + billnum + candidate
             for candidate in candidate_suffixes
             if congress + billnum + candidate in known_files),
            None)
        if earlier_version:
            with open('dl/bills/' + earlier_version, 'r') as earlier_file:
                s_earlier = removeNonAscii(earlier_file.read())
            x_earlier = BeautifulSoup(s_earlier, "xml")
            (sponsor_name, sponsor_id, govtrack, state) = process_sponsor(x_earlier)

    # Only list bills with money.
    if amount > 0 or indefinite:
        descrip = '; '.join(a3)
        if indefinite:
            if amount == 0:
                display_amount = 'such sums as necessary'
            else:
                display_amount = '$' + commafy(amount) + ' + such sums as necessary'
        else:
            display_amount = '$' + commafy(amount)
        d = {
            'id': f,
            'title': title,
            'funds-and-year': descrip,
            'amount': amount,
            'indefinite': indefinite,
            'display_amount': display_amount,
            'sponsorid': sponsor_id,
            'sponsorname': sponsor_name,
            'status': status,
            'actiondate': actiondate,
            'introdate': introdate,
            'actionall': actionall,
            'purpose': purpose,
            'congress': congress,
            'billnum': billnum,
            'govtrack': govtrack,
            'state': state,
        }
        billlist.append(d)
# Order them by how recently anything has happened with them. Bills with no
# action date (None) sort last; "or ''" keeps the key comparable with strings.
billlist.sort(key=lambda k: k['actiondate'] or '', reverse=True)

# Write JSONP payloads: the five most recent bills, then the full list.
with open('output/short.json', 'w') as fshort:
    fshort.write('catoshort(' + json.dumps(billlist[:5]) + ');')
with open('output/long.json', 'w') as flong:
    flong.write('cato(' + json.dumps(billlist) + ');')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment