FIS Scraper


Currently, the DC CFO makes fiscal impact statements (FIS) available at http://app.cfo.dc.gov/services/fiscal_impact/search.asp. But there's no bulk data. So, scrape.

Once the PDFs were downloaded, each one was converted to plain text (a sketch of that conversion step appears after the download script below), and the text was reinserted into the JSON. Final result: a single searchable JSON file.

#!/usr/bin/env python
# Step 1: scrape the Fiscal Impact Statement database into blob.json
# http://app.cfo.dc.gov/services/fiscal_impact/search.asp
import mechanize
from bs4 import BeautifulSoup
import re
import json

# initialize outfile
out = open('blob.json', 'w')
data = []

def addPage(counter, data, rows, source_url):
    # pull the columns out of each result row and append one record per FIS
    for row in rows:
        col = row('td')
        fis_url = "http://app.cfo.dc.gov/services/fiscal_impact/" + col[2].find("a")["href"]
        data.append({
            "no": counter,
            "number": col[0].get_text(),
            "type": col[1].get_text(),
            "title": col[2].get_text(),
            "date": col[3].get_text(),
            "url": fis_url,
            "source": source_url
        })
        counter = counter + 1
    return [counter, data]

def getPage(year, page):
    # fetch one page of search results for a given session year
    source_url = "http://app.cfo.dc.gov/services/fiscal_impact/search.asp?session_time=" + year + "&num=&title=&page=" + page
    r = BeautifulSoup(br.open(source_url).read())
    # result rows alternate between white and E1ECF2 backgrounds
    colors = re.compile(r"white|E1ECF2")
    return [source_url, r.find_all("tr", bgcolor=colors)]

# initialize the browser to grab the landing page
br = mechanize.Browser()

# Start at the very beginning: session year 2001, page 1
i = 0
year = '2001'
page = '1'
while int(year) < 2014:
    while int(page) < 35:
        p = getPage(year, page)
        o = addPage(i, data, p[1], p[0])
        data = o[1]
        i = o[0]
        page = str(int(page) + 1)
    page = '1'
    year = str(int(year) + 1)
out.write(json.dumps(data, indent=2))
out.close()
#!/usr/bin/env python
# Step 2: download each FIS PDF listed in blob.json
import json
import urllib

f = open('./blob.json', 'r')
records = json.load(f)
f_log = open('../public/log.txt', 'w')
for fis in records:
    fis_id = fis['no']
    url = fis['url']
    # PDFs are named by record number so the text step can find them later
    fname = '../public/fis/' + str(fis_id) + '.pdf'
    try:
        urllib.urlretrieve(url, fname)
        f_log.write(fname + ' added.\n')
    except:
        f_log.write('ERROR: There was an error here for: ' + fname + ' url: ' + url + '\n')
#!/usr/bin/env python
# Step 3: fold the extracted text back into the JSON
import json

f = open('./blob.json', 'r')
data = json.load(f)
outdata = []
for d in data:
    i = d["no"]
    txt = open('./public/fis/txt/' + str(i) + '.pdf.txt', 'r').read()
    # store the raw text; json.dumps below handles the escaping
    # (running it through JSONEncoder first would double-encode it)
    d["text"] = txt
    outdata.append(d)
out = open('./fisblob.json', 'w')
out.write(json.dumps(outdata, indent=2))
out.close()
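
With the text folded in, fisblob.json can be searched directly. For example (a sketch; the query string is just illustrative):

#!/usr/bin/env python
# example: find every FIS whose text mentions a phrase
import json

f = open('./fisblob.json', 'r')
fises = json.load(f)
query = 'property tax'  # illustrative query
for fis in fises:
    if query.lower() in fis['text'].lower():
        print(fis['number'] + ' | ' + fis['title'].strip() + ' | ' + fis['url'])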