Skip to content

Instantly share code, notes, and snippets.

@sunlightlabs
Created December 2, 2009 20:32
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save sunlightlabs/247540 to your computer and use it in GitHub Desktop.
Save sunlightlabs/247540 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
This script was used to convert the 2009 Q3 House Disbursement PDF into detail and summary CSV files.
Source PDF: http://disbursements.house.gov/
Resulting Data: http://www.sunlightfoundation.com/projects/2009/expenditures/
It was originally authored by Luke Rosiak with improvements by James Turk for Sunlight Labs and is released into the public domain.
Disclaimer: It was written quickly under deadline and likely contains a few bugs - patches welcome
It expects a file "members-only.txt" created as the result of the following two operations:
pdftk cat 241 2780 output members-only.pdf
pdftotext -layout members-only.pdf
'''
import csv, re, sys
# Lines that START with any of these tokens are treated as known noise and are
# silently discarded instead of being logged as unparsed.
# Raw string: ``\s`` is a regex escape, not a Python string escape.
# NOTE(review): "REPRESENATION" is kept exactly as in the original pattern;
# confirm against the pdftotext output whether that spelling ever matches.
BAD_LINE_RE = re.compile(r'^(Frm|Fmt|Sfmt|Jkt|VerDate|VOUCHER|OFFICIAL\sEXPENSES|MEMBERS\sREPRESENATION\sALLOW|PO|APPS06|PsN:|09:47|M:)')


def known_bad(line):
    """Return True when *line* is empty or recognised noise.

    A line counts as known-bad when it is empty (or None), when it starts
    with one of the prefixes in ``BAD_LINE_RE``, or when it contains the
    substring ``dkrause`` anywhere.

    The result is always a real ``bool``; the regex match object is not
    leaked to the caller.
    """
    return (not line) or bool(BAD_LINE_RE.match(line)) or 'dkrause' in line
# Expense categories, spelled exactly as they are matched in the input text.
# A line equal to one of these switches the current category; a summary row
# is kept only when its label (after stripping) is one of these.
_CATS = ['FRANKED MAIL', 'PERSONNEL COMPENSATION', 'PERSONNEL BENEFITS', 'TRAVEL', 'RENT, COMMUNICATION, UTILITIES', 'PRINTING AND REPRODUCTION', 'OTHER SERVICES', 'SUPPLIES AND MATERIALS', 'EQUIPMENT']

# A line starting with any of these opens a new member section.
_MEMBER_PREFIXES = ("2008 ", "2009 ", "2007 ", "FISCAL YEAR ")

# Detail line carrying a date, transaction code and record id up front.
_REGULAR_RE = re.compile(r"""(\d{2}-\d{2})\s+ # date
    ([0-9A-Z]{2})\s+ # transaction code
    ([0-9A-Z]+)\s+ # record id
    (.*?) # recipient
    (\d{2}/{1}\d{2}/{1}\d{2})\s+ # date-start
    (\d{2}/{1}\d{2}/{1}\d{2}) # date-end
    (.*?)\s+ # description
    (-?[0-9,]+\.\d{2}) # amount
    """, re.VERBOSE)

# Detail line without the leading date / transaction code / record id.
_PERSONNEL_RE = re.compile(r"""(.*?) # recipient
    (\d{2}/{1}\d{2}/{1}\d{2})\s+ # date-start
    (\d{2}/{1}\d{2}/{1}\d{2}) # date-end
    (.*?)\s+ # description
    (-?[0-9,]+\.\d{2}) # amount
    """, re.VERBOSE)

# Summary line: a dot-leadered label followed by two amounts.
_SUMMARY_RE = re.compile(r"""(.*?)\.+\s+ # category
    (-?[0-9,]+\.\d{2})\s+ # 2009
    (-?[0-9,]+\.\d{2}) # 2009-Q3
    """, re.VERBOSE)


def _resolve_recipient(recip, oldrecip):
    """Return ``(sunrecip, oldrecip)`` for a cleaned recipient string.

    The literal ``'DO '`` (what is left of a dot-leadered ``DO`` entry after
    stripping the dots) is treated as "same as the previous recipient": the
    remembered name is reused and left unchanged. Any other value is used
    as-is and becomes the new remembered name.
    """
    if recip == 'DO ':
        return oldrecip, oldrecip
    return recip, recip


def _process_lines(lines, fsummary, fdetail, trashcan):
    """Classify every input line and route it to the matching output.

    lines     -- iterable of text lines (e.g. an open file)
    fsummary  -- csv writer receiving [member, category, amount, amount]
    fdetail   -- csv writer receiving [member, year, category, date,
                 transaction code, record id, resolved recipient,
                 raw recipient, start date, end date, description, amount]
    trashcan  -- writable text file receiving lines that matched nothing
                 and are not known noise, one per line
    """
    thismem = ''
    thiscat = ''
    thisyear = ''
    # Start empty so a ditto entry appearing before any named recipient
    # yields an empty resolved name instead of raising UnboundLocalError.
    oldrecip = ''

    for line in lines:
        # replace the en dash with a normal hyphen and strip
        line = line.replace('–', '-').strip()

        # new member
        if line.startswith(_MEMBER_PREFIXES):
            # NOTE(review): these fixed offsets assume a 4-digit year plus a
            # space; for a "FISCAL YEAR " line they yield year "FISC".
            # Kept as in the original - confirm the intent.
            thismem = line.replace('—', '')[5:]
            thisyear = line[:4]
            if thismem.endswith("Con."):
                thismem = thismem[:-4]
            continue

        # category
        if line in _CATS:
            thiscat = line
            continue

        # regular record
        ma = _REGULAR_RE.search(line)
        if ma:
            (date1, transcode, recordid, raw_recip,
             date2, date3, raw_descrip, amount) = ma.groups()
            recip = raw_recip.strip().rstrip('.')
            sunrecip, oldrecip = _resolve_recipient(recip, oldrecip)
            descrip = raw_descrip.strip().rstrip('.')
            fdetail.writerow([thismem, thisyear, thiscat, date1, transcode, recordid, sunrecip, recip, date2, date3, descrip, amount])
            continue

        # personnel record
        ma = _PERSONNEL_RE.search(line)
        if ma:
            raw_recip, date2, date3, raw_descrip, amount = ma.groups()
            recip = raw_recip.strip().rstrip('.')
            sunrecip, oldrecip = _resolve_recipient(recip, oldrecip)
            descrip = raw_descrip.strip().rstrip('.')
            fdetail.writerow([thismem, thisyear, thiscat, "", "", "", sunrecip, recip, date2, date3, descrip, amount])
            continue

        # summary record
        ma = _SUMMARY_RE.search(line)
        if ma:
            label, amount_a, amount_b = ma.groups()
            if label.strip() in _CATS:
                # The label is written unstripped, as the original did.
                fsummary.writerow([thismem, label, amount_a, amount_b])
            # NOTE(review): indentation was lost in the copy this was
            # restored from; this ``continue`` is assumed to belong to the
            # outer ``if`` like the branches above. It only affects whether
            # non-category summary rows reach the trash file.
            continue

        if not known_bad(line):
            # The line was stripped above, so put the newline back to keep
            # one rejected line per row in the trash file.
            trashcan.write(line + '\n')


def main():
    """Convert ``members-only.txt`` into the detail and summary CSV files.

    Reads ``members-only.txt`` from the current directory and writes
    ``house-disburse-summary.csv``, ``house-disburse-detail.csv`` and
    ``trashlines.txt`` (lines that could not be parsed) next to it.
    All files are closed on exit, including when an error occurs.
    """
    # NOTE(review): on Python 3 the csv docs recommend opening csv files with
    # newline=''; the open() calls are left as they were so the script still
    # runs on Python 2.
    with open('members-only.txt', "r") as f:
        with open("house-disburse-summary.csv", "w") as summary_file:
            with open("house-disburse-detail.csv", "w") as detail_file:
                with open('trashlines.txt', 'w') as trashcan:
                    fsummary = csv.writer(summary_file, quoting=csv.QUOTE_ALL)
                    fdetail = csv.writer(detail_file, quoting=csv.QUOTE_ALL)
                    _process_lines(f, fsummary, fdetail, trashcan)
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment