Skip to content

Instantly share code, notes, and snippets.

@chris1610
Created January 9, 2015 04:02
Show Gist options
  • Save chris1610/12d7701df5c335ec98aa to your computer and use it in GitHub Desktop.
Save chris1610/12d7701df5c335ec98aa to your computer and use it in GitHub Desktop.
#Parse 2014 MN Capital budget - https://www.revisor.mn.gov/laws/?year=2014&type=0&doctype=Chapter&id=294
#Store the summary in a DataFrame for eventual manipulation
from __future__ import print_function
import os.path
from collections import defaultdict
import string
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
# Local file the downloaded page is cached in; main() passes it to get_data()
filename = "MNBudget-2014.html"
# Page holding the 2014 MN capital budget law (Chapter 294) that is parsed
url = "https://www.revisor.mn.gov/laws/?year=2014&type=0&doctype=Chapter&id=294"
def convert_num(val):
    """
    Convert a formatted number string to a float.

    - Remove leading/trailing whitespace
    - Remove thousands-separator commas
    - If wrapped in (), then it is a negative number

    Raises ValueError if the cleaned-up text is not a valid number.
    """
    # Use the str methods rather than string.strip(): the module-level
    # string functions only exist in Python 2.
    cleaned = val.strip().replace(",", "").replace("(", "-").replace(")", "")
    return float(cleaned)
# As we work through the process, it is easier to
# download it once and work with the saved copy instead of
# trying to hit the server each time
# Just delete the output file to force a new download
def get_data(url, cachefile):
    """
    Return the page at url as a BeautifulSoup object, using a local cache.

    If cachefile already exists its contents are parsed and no request is
    made; otherwise the page is downloaded, saved to cachefile and parsed.

    Raises requests.HTTPError if the download returns an error status, so
    that an error page is never written to the cache.
    """
    if os.path.isfile(cachefile):
        print("Loading the data via the file.")
        # Binary mode: result.content is bytes, so the cache holds the raw
        # response and BeautifulSoup does the decoding in both branches.
        with open(cachefile, 'rb') as f:
            c = f.read()
    else:
        print("Fetching the data via the URL.")
        result = requests.get(url)
        result.raise_for_status()
        c = result.content
        with open(cachefile, 'wb') as f:
            f.write(c)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(c, "html.parser")
    return soup
def process_data(soup):
    """
    Pull the funding and expense lines out of the parsed budget page.

    Returns a (funding_lines, expense_lines) tuple. Each is a defaultdict
    mapping the text of a row's first cell to a list of the amounts found
    in that row's third cell. Rows before the "TOTAL" row are expenses,
    rows after it are funding; the "TOTAL" row itself is not stored.

    Raises ValueError if the summary section or its data table is missing.
    """
    # Use a defaultdict with an empty list because it eases the DataFrame creation
    expense_lines = defaultdict(list)
    funding_lines = defaultdict(list)
    funding = False
    # After looking at the data, we can see that the summary has a div id we can use
    summary = soup.find("div", {"class": "bill_section", "id": "laws.1.1.0"})
    if summary is None:
        raise ValueError("Could not find the budget summary section")
    # Get all the tables in the summary
    tables = summary.find_all('table')
    if len(tables) < 2:
        raise ValueError("Could not find the budget summary data table")
    # The first table is header info that is not useful
    # The second table contains all the data we need (the list is 0 indexed)
    data_table = tables[1]
    # Go through each row of the table and pull out our data
    for row in data_table.find_all("tr"):
        cells = row.find_all("td")
        # Ignore lines that don't have 3 cells of data because it is just spacing
        if len(cells) != 3:
            continue
        # str.strip() rather than string.strip(), which is Python 2 only
        label = cells[0].text.strip()
        amount = convert_num(cells[2].text)
        # Once we get to the total line we start getting the funding lines
        if label == "TOTAL":
            funding = True
            # We don't want to capture the total because we can calc it
            continue
        if funding:
            funding_lines[label].append(amount)
        else:
            expense_lines[label].append(amount)
    return funding_lines, expense_lines
def graph_data_to_file(data, filename, title, plot_kwargs=None):
    """
    Draw data as a horizontal bar chart and save it to filename.

    data        -- dict mapping a label to a list of amounts
    filename    -- path of the image file to write
    title       -- chart title
    plot_kwargs -- optional dict of extra keyword arguments passed on to
                   DataFrame.plot (for example {"figsize": [7, 13]})
    """
    # Avoid a shared mutable default argument
    if plot_kwargs is None:
        plot_kwargs = {}
    # Create the DataFrame using from_dict
    data_df = pd.DataFrame.from_dict(data, orient='index')
    # Label our column
    data_df.rename(columns={0: 'Amount'}, inplace=True)
    # DataFrame.sort() has been removed from pandas; sort_values replaces it
    data_df = data_df.sort_values(by='Amount')
    # Set some nicer defaults for plots. The pandas mpl_style option has been
    # removed; a matplotlib style sheet is the supported replacement.
    # NOTE(review): 'ggplot' is assumed to be the closest match to the old
    # pandas 'default' style - confirm the look is acceptable.
    plt.style.use('ggplot')
    data_df.plot(kind='barh', title=title, **plot_kwargs)
    plt.savefig(filename)
def main():
    """Load the budget page, split it into funding/expenses and chart both."""
    soup = get_data(url, filename)
    funding_lines, expense_lines = process_data(soup)

    # Funding has only a few lines, so the default figure size is fine
    graph_data_to_file(
        funding_lines,
        "MN-2014-Funding.png",
        "2014 MN Capital Budget Funding",
    )
    # The expense chart has many bars, so it gets a taller figure
    tall_figure = {"figsize": [7, 13]}
    graph_data_to_file(
        expense_lines,
        "MN-2014-Expense.png",
        "2014 MN Capital Budget Spending",
        tall_figure,
    )


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment