Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Parse appropriations from Cato XML
import os
import json
import re
import csv
from bs4 import BeautifulSoup
#format numbers
def commafy(x):
    """Format an integer with thousands separators, e.g. 1234567 -> '1,234,567'.

    Negative values keep their sign in front: -1234 -> '-1,234'.

    Raises:
        TypeError: if x is not a plain integer (floats, strings and bools
            are rejected, as in the original exact-type check).
    """
    # type(1 << 64) is `long` on Python 2 and `int` on Python 3, so this
    # accepts exactly the integer types without needing the `0L` literal.
    if type(x) not in (type(0), type(1 << 64)):
        raise TypeError("Parameter must be an integer.")
    if x < 0:
        # Recurse on the magnitude (the original called an undefined name here).
        return '-' + commafy(-x)
    result = ''
    # Peel off three digits at a time from the right.
    while x >= 1000:
        x, r = divmod(x, 1000)
        result = ",%03d%s" % (r, result)
    return "%d%s" % (x, result)
#use govtrack's bioguide spreadsheet to get state and formal name of lawmakers. we need two lookup tables because they could use 1 of 2 kinds of ids
#the csv comes from https://github.com/unitedstates/congress-legislators
#i have a simple python script that converts current_legislators from yaml to csv
# Two lookup tables over the same rows of bioguide.csv: one keyed by the
# 'bioguide' column and one keyed by the 'lis' column. The file is read once
# and closed as soon as both tables are filled.
bioguide = {}
bioguide_lis = {}
with open('bioguide.csv', 'r') as bioguide_file:
    fbioguide = csv.DictReader(bioguide_file)
    for line in fbioguide:
        bioguide[line['bioguide']] = line
        bioguide_lis[line['lis']] = line
def removeNonAscii(s):
    """Return s with every character whose ordinal is 128 or higher dropped."""
    return "".join(ch for ch in s if ord(ch) < 128)
def process_sponsor(x):
    """Extract the sponsor of a parsed bill.

    Returns a tuple (sponsor_name, sponsor_id, govtrack, state). All four are
    None when the document has no <sponsor> element. When the sponsor's
    'name-id' is found in either lookup table, the name is rebuilt from that
    row as "<Term_type>. <official_full> (<P>-<state>)" with non-ASCII
    characters removed; otherwise the name is the element's own text and
    govtrack/state are empty strings.
    """
    sponsor_tag = x.find('sponsor')
    if not sponsor_tag:
        return (None, None, None, None)

    sponsor_name = sponsor_tag.string
    sponsor_id = sponsor_tag['name-id']

    # The id may be of either kind, so try the bioguide-keyed table first
    # and the lis-keyed table second.
    record = None
    for table in (bioguide, bioguide_lis):
        if sponsor_id in table:
            record = table[sponsor_id]
            break

    if not record:
        return (sponsor_name, sponsor_id, '', '')

    govtrack = record['govtrack']
    state = record['state']
    formal_name = ''.join([
        record['term_type'].title(), '. ',
        record['official_full'],
        ' (', record['party'][0].upper(), '-', state, ')',
    ])
    return (removeNonAscii(formal_name), sponsor_id, govtrack, state)
# Prepare the working directories, then fetch and unpack the Cato archive.
# Each command is handed to the shell in order; exit codes are not checked.
for shell_command in (
    'mkdir output',
    'rm dl -rf',
    'mkdir dl',
    'wget http://deepbills.cato.org/download -O dl/dl.zip',
    'unzip dl/dl.zip -d dl',
    'chmod 755 -R dl',  # weird permissions issue with Cato ZIP file
):
    os.system(shell_command)
# One dict per bill that carries an appropriation; filled by the loop below.
billlist = []

# Splits a filename (minus ".xml") into a leading run of digits (the congress)
# and the bill number; the remaining characters are the version suffix that
# the endswith() checks below look at.
reg = re.compile(r'(\d+)(\w+\d+)')
dir_files = os.listdir('dl/bills')
dir_file_set = set(dir_files)  # constant-time lookups for sibling versions

# version suffix -> suffixes of other versions of the same bill whose
# presence means this file should be skipped
_SUPERSEDED_BY = {
    'ih.xml': ('eh.xml', 'rh.xml'),
    'is.xml': ('es.xml', 'rs.xml'),
}

# (passed-version suffix, status label, earlier-version suffixes to read the
# sponsor from, in order of preference)
_PASSED_VERSIONS = (
    ('eh.xml', 'Passed the House', ('ih.xml', 'rh.xml')),
    ('es.xml', 'Passed the Senate', ('is.xml', 'rs.xml')),
)


def _load_bill(filename):
    """Parse one file from dl/bills as XML after dropping non-ASCII characters."""
    with open('dl/bills/' + filename, 'r') as bill_file:
        return BeautifulSoup(removeNonAscii(bill_file.read()), "xml")


def _first_present(prefix, suffixes, available):
    """Return the first prefix+suffix name contained in available, else None."""
    for suffix in suffixes:
        candidate = prefix + suffix
        if candidate in available:
            return candidate
    return None


for f in dir_files:
    # split the filename into the congress and bill number to generate link to govtrack
    m = reg.match(f.replace('.xml', ''))
    if m is None:
        # Not named like a bill file; skip it instead of crashing on None.
        continue
    (congress, billnum) = m.groups()
    prefix = congress + billnum

    # if there's another file representing the passed version of the bill, skip this one
    superseded = any(
        f.endswith(suffix) and _first_present(prefix, later, dir_file_set)
        for suffix, later in _SUPERSEDED_BY.items()
    )
    if superseded:
        continue

    x = _load_bill(f)

    # Title: prefer <title>, then <short-title>, then <official-title>.
    title_tag = x.find('title')
    title = title_tag.string if title_tag is not None else None
    if not title:
        for fallback_name in ('short-title', 'official-title'):
            fallback_tag = x.find(fallback_name)
            if fallback_tag is not None:
                title = fallback_tag.string
                break

    a3 = []  # there can be many appropriations in a bill; this is a list of them
    amount = 0  # the running tally of total cost
    indefinite = False  # does "such sums as necessary" appear anywhere? if so, that changes our dollar amount
    for a in x.findAll('funds-and-year'):
        # Single-argument print(...) prints the same text on Python 2 and 3.
        print(a)
        if a.string:
            a3.append(a.string)
        raw_amount = a.get('amount')
        if raw_amount:
            if raw_amount == 'indefinite':
                indefinite = True
            else:
                try:
                    amount += int(raw_amount)
                except (TypeError, ValueError):
                    print('%s amount couldnt be parsed as int' % (a,))

    date_tag = x.find('date')
    introdate = date_tag.string if date_tag is not None else None

    # Collect every dated action; <action-date> elements without a 'date'
    # attribute are ignored.
    lastaction = []
    for action in x.findAll('action-date'):
        action_date = action.get('date')
        if action_date is not None:
            lastaction.append(action_date)

    # special case for getting date from passed bills
    # ="20130730" chamber="House">Passed the House of Representatives July 30, 2013.</attestation-date>
    attestation = x.find('attestation-date')
    if attestation is not None:
        attested = attestation.get('date')
        if attested and len(attested) == 8:
            lastaction.append(attested)

    lastaction.sort()
    if lastaction:
        latest = lastaction[-1]
        # YYYYMMDD -> YYYY-MM-DD
        actiondate = latest[:4] + '-' + latest[4:6] + '-' + latest[6:]
    else:
        actiondate = None
    actionall = ';'.join(lastaction)

    # Stage comes from the <resolution> element if it has one, otherwise from
    # <bill> (with dashes turned into spaces). TypeError covers a missing
    # element (None is not subscriptable), KeyError a missing attribute.
    status = None
    try:
        status = x.resolution['resolution-stage']
    except (TypeError, KeyError):
        try:
            status = x.bill['bill-stage'].replace('-', ' ')
        except (TypeError, KeyError):
            status = None

    purpose = x.find('property', {'name': 'purpose'})
    if purpose is not None:
        purpose = purpose.string

    (sponsor_name, sponsor_id, govtrack, state) = process_sponsor(x)
    if not sponsor_name:
        # passed bills dont include sponsors, so open up the file showing its earlier version to grab it
        for passed_suffix, passed_status, earlier_suffixes in _PASSED_VERSIONS:
            if not f.endswith(passed_suffix):
                continue
            status = passed_status
            earlier_version = _first_present(prefix, earlier_suffixes, dir_file_set)
            if earlier_version:
                x_earlier = _load_bill(earlier_version)
                (sponsor_name, sponsor_id, govtrack, state) = process_sponsor(x_earlier)
            break

    # only list bills with money
    if amount > 0 or indefinite:
        descrip = '; '.join(a3)
        if indefinite:
            if amount == 0:
                display_amount = 'such sums as necessary'
            else:
                display_amount = '$' + commafy(amount) + ' + such sums as necessary'
        else:
            display_amount = '$' + commafy(amount)
        d = {
            'id': f,
            'title': title,
            'funds-and-year': descrip,
            'amount': amount,
            'indefinite': indefinite,
            'display_amount': display_amount,
            'sponsorid': sponsor_id,
            'sponsorname': sponsor_name,
            'status': status,
            'actiondate': actiondate,
            'introdate': introdate,
            'actionall': actionall,
            'purpose': purpose,
            'congress': congress,
            'billnum': billnum,
            'govtrack': govtrack,
            'state': state,
        }
        billlist.append(d)
# Most recently acted-on bills first.
billlist.sort(key=lambda bill: bill['actiondate'], reverse=True)

# Write two JSONP files: the five most recent bills, then the full list.
for out_path, opening, bills in (
    ('output/short.json', 'catoshort(', billlist[:5]),
    ('output/long.json', 'cato(', billlist),
):
    with open(out_path, 'w') as out_file:
        out_file.write(opening + json.dumps(bills) + ');')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment