Skip to content

Instantly share code, notes, and snippets.

@lukerosiak
Last active January 2, 2016 21:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lukerosiak/8362479 to your computer and use it in GitHub Desktop.
Save lukerosiak/8362479 to your computer and use it in GitHub Desktop.
Parse appropriations from Cato XML
import os
import json
import re
import csv
from bs4 import BeautifulSoup
#format numbers
def commafy(x):
    """Format an integer with comma thousands separators.

    x: an int (or, on Python 2, a long). Any other type raises TypeError.

    Returns a string such as '1,234,567'. Negative values get a leading '-'.
    """
    try:
        integer_types = (int, long)  # Python 2 has a separate long type
    except NameError:
        integer_types = (int,)
    # Exact type match, so bool and other int subclasses are rejected.
    if type(x) not in integer_types:
        raise TypeError("Parameter must be an integer.")
    if x < 0:
        # Format the magnitude with this same function and prepend the sign.
        return '-' + commafy(-x)
    result = ''
    while x >= 1000:
        # Peel off the lowest three digits and zero-pad them.
        x, r = divmod(x, 1000)
        result = ",%03d%s" % (r, result)
    return "%d%s" % (x, result)
# Use GovTrack's bioguide spreadsheet to get the state and formal name of
# lawmakers. Two lookup tables are needed because a sponsor could be
# identified by either of two kinds of id ('bioguide' or 'lis' column).
# The CSV comes from https://github.com/unitedstates/congress-legislators;
# a simple Python script converts current_legislators from YAML to CSV.
bioguide = {}      # rows keyed by the 'bioguide' column
bioguide_lis = {}  # the same rows keyed by the 'lis' column
# Read the file once and fill both tables in a single pass; the with-block
# guarantees the handle is closed even if a row is missing a column.
with open('bioguide.csv', 'r') as bioguide_file:
    fbioguide = csv.DictReader(bioguide_file)
    for line in fbioguide:
        bioguide[line['bioguide']] = line
        bioguide_lis[line['lis']] = line
def removeNonAscii(s):
    """Return s with every character whose ordinal is 128 or above removed."""
    ascii_only = [ch for ch in s if ord(ch) < 128]
    return "".join(ascii_only)
def process_sponsor(x):
    """Extract sponsor details from a parsed bill document.

    x: a parsed document whose find('sponsor') returns the sponsor element
       (or None when the document has no sponsor).

    Returns a (sponsor_name, sponsor_id, govtrack, state) tuple. All four are
    None when there is no sponsor element. When the sponsor id is present in
    the bioguide or bioguide_lis table, the name is rebuilt as
    'Term. Official Name (P-ST)' with non-ASCII characters removed; otherwise
    the name is the element's own text and govtrack/state are ''.
    """
    sponsor = x.find('sponsor')  # look the element up once
    if sponsor is None:
        return (None, None, None, None)
    sponsor_name = sponsor.string
    # .get() so a sponsor element without a name-id attribute yields None
    # instead of raising KeyError and aborting the whole run.
    sponsor_id = sponsor.get('name-id')
    b = None
    if sponsor_id in bioguide:
        b = bioguide[sponsor_id]
    elif sponsor_id in bioguide_lis:
        b = bioguide_lis[sponsor_id]
    if b:
        govtrack = b['govtrack']
        state = b['state']
        # [:1] rather than [0]: an empty party field gives '' instead of
        # an IndexError.
        party_initial = b['party'][:1].upper()
        sponsor_name = (b['term_type'].title() + '. ' + b['official_full']
                        + ' (' + party_initial + '-' + state + ')')
        sponsor_name = removeNonAscii(sponsor_name)
    else:
        govtrack = ''
        state = ''
    return (sponsor_name, sponsor_id, govtrack, state)
# Prepare the working directories and fetch the Cato bill archive. The
# commands run through the shell in this order and their exit statuses are
# discarded.
for shell_command in (
    'mkdir output',
    'rm dl -rf',
    'mkdir dl',
    'wget http://deepbills.cato.org/download -O dl/dl.zip',
    'unzip dl/dl.zip -d dl',
    'chmod 755 -R dl',  # weird permissions issue with Cato ZIP file
):
    os.system(shell_command)
def _load_bill_xml(filename):
    """Read dl/bills/filename, drop non-ASCII characters and parse it as XML."""
    # The with-block closes the handle as soon as the text has been read.
    with open('dl/bills/' + filename, 'r') as bill_file:
        text = removeNonAscii(bill_file.read())
    return BeautifulSoup(text, "xml")


def _first_existing(candidates, available):
    """Return the first name in candidates that is in available, else None."""
    for name in candidates:
        if name in available:
            return name
    return None


billlist = []
# Captures the leading digits (congress) and the following run that ends in
# digits (bill number); whatever trails that is the version suffix.
reg = re.compile(r'(\d+)(\w+\d+)')
dir_files = os.listdir('dl/bills')
dir_file_set = set(dir_files)  # constant-time membership tests in the loop
for f in dir_files:
    # Split the filename into the congress and bill number to generate a
    # link to govtrack.
    m = reg.match(f.replace('.xml', ''))
    if not m:
        # Not named like a bill file; skip it rather than crash on m.groups().
        continue
    link = m.groups()
    (congress, billnum) = link
    prefix = congress + billnum

    # If there's another file representing a later (passed or reported)
    # version of the bill, skip this introduced version.
    if f.endswith('ih.xml'):
        later_versions = (prefix + 'eh.xml', prefix + 'rh.xml')
    elif f.endswith('is.xml'):
        later_versions = (prefix + 'es.xml', prefix + 'rs.xml')
    else:
        later_versions = ()
    if _first_existing(later_versions, dir_file_set):
        continue

    x = _load_bill_xml(f)

    # Prefer the title element's text; fall back to short-title, then
    # official-title, when it is missing or has no single string.
    title = x.find('title')
    if title is not None:
        title = title.string
    if not title:
        fallback = x.find('short-title') or x.find('official-title')
        if fallback:
            title = fallback.string

    a3 = []  # there can be many appropriations in a bill; this is a list of them
    amount = 0  # the running tally of total cost
    indefinite = False  # does "such sums as necessary" appear anywhere? if so, that changes our dollar amount
    for a in x.findAll('funds-and-year'):
        print(a)
        if a.string:
            a3.append(a.string)
        if a.get('amount'):
            if a['amount'] == 'indefinite':
                indefinite = True
            else:
                try:
                    amount += int(a['amount'])
                except (TypeError, ValueError):
                    print('%s amount couldnt be parsed as int' % a)

    date_tag = x.find('date')
    introdate = date_tag.string if date_tag is not None else None

    # Collect every action date present, plus the attestation date below.
    lastaction = []
    for a in x.findAll('action-date'):
        action_day = a.get('date')
        if action_day is not None:
            lastaction.append(action_day)
    attestation = x.find('attestation-date')
    if attestation is not None:
        # Special case for getting the date from passed bills, e.g.
        # date="20130730" chamber="House" on the attestation-date element.
        attested_on = attestation.get('date')
        if attested_on and len(attested_on) == 8:
            lastaction.append(attested_on)
    lastaction.sort()
    if lastaction:
        # Latest date, reformatted from YYYYMMDD to YYYY-MM-DD.
        actiondate = lastaction[-1]
        actiondate = actiondate[:4] + '-' + actiondate[4:6] + '-' + actiondate[6:]
    else:
        actiondate = None
    actionall = ';'.join(lastaction)

    # Stage comes from the resolution element if usable, else the bill element.
    status = None
    try:
        status = x.resolution['resolution-stage']
    except (AttributeError, KeyError, TypeError):
        try:
            status = x.bill['bill-stage']
            status = status.replace('-', ' ')
        except (AttributeError, KeyError, TypeError):
            pass

    purpose = x.find('property', {'name': 'purpose'})
    if purpose:
        purpose = purpose.string

    (sponsor_name, sponsor_id, govtrack, state) = process_sponsor(x)
    if not sponsor_name and f.endswith(('es.xml', 'eh.xml')):
        # Passed bills don't include sponsors, so open up the file showing
        # the earlier version to grab it.
        if f.endswith('eh.xml'):
            status = 'Passed the House'
            candidates = (prefix + 'ih.xml', prefix + 'rh.xml')
        else:
            # Must be es.xml here. The original tested 'is.xml', which can
            # never be true inside this branch, so Senate-passed bills got
            # neither this status nor a sponsor.
            status = 'Passed the Senate'
            candidates = (prefix + 'is.xml', prefix + 'rs.xml')
        earlier_version = _first_existing(candidates, dir_file_set)
        if earlier_version:
            x_earlier = _load_bill_xml(earlier_version)
            (sponsor_name, sponsor_id, govtrack, state) = process_sponsor(x_earlier)

    # Only list bills with money.
    if amount > 0 or indefinite:
        descrip = '; '.join(a3)
        if not indefinite:
            display_amount = '$' + commafy(amount)
        elif amount == 0:
            display_amount = 'such sums as necessary'
        else:
            display_amount = '$' + commafy(amount) + ' + such sums as necessary'
        d = {'id': f, 'title': title, 'funds-and-year': descrip, 'amount': amount, 'indefinite': indefinite, 'display_amount': display_amount, 'sponsorid': sponsor_id, 'sponsorname': sponsor_name, 'status': status, 'actiondate': actiondate, 'introdate': introdate, 'actionall': actionall, 'purpose': purpose, 'congress': congress, 'billnum': billnum, 'govtrack': govtrack, 'state': state}
        billlist.append(d)
# Order the bills by how recently anything has happened with them, newest
# first. Bills with no action date carry None; keying those as '' keeps them
# at the end and avoids comparing None with str, which Python 3 rejects.
billlist.sort(key=lambda k: k['actiondate'] or '', reverse=True)
# Each output file holds the JSON list wrapped in a function call:
# catoshort(...) for the five most recent bills, cato(...) for all of them.
# The with-blocks close the files even if serialization or a write fails.
with open('output/short.json', 'w') as fshort:
    fshort.write('catoshort(' + json.dumps(billlist[:5]) + ');')
with open('output/long.json', 'w') as flong:
    flong.write('cato(' + json.dumps(billlist) + ');')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment