Skip to content

Instantly share code, notes, and snippets.

@vphill
Created May 10, 2014 15:59
Show Gist options
  • Save vphill/222e88019267478845bf to your computer and use it in GitHub Desktop.
Save vphill/222e88019267478845bf to your computer and use it in GitHub Desktop.
Code to parse the HTML found at http://www.capitol.state.tx.us/BillLookup/History.aspx?LegSess=83R&Bill=HB5 and produce a JSON version.
# coding=UTF-8
import datetime
import json
import re
import sys

from bs4 import BeautifulSoup
from dateutil import parser
# The script takes exactly one command-line argument: the path to a saved
# bill-history HTML page.
if len(sys.argv) != 2:
    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print("usage: parse_bill_html.py <bill_html>")
    # sys.exit is always available; the bare exit() helper is injected by the
    # site module and is absent when Python runs with -S or is embedded.
    sys.exit(-1)

# Collects the fields that are dumped as JSON at the end of the script.
bill = {}
# Lookup table from a legislator's name as it is split out of the bill page's
# author/sponsor cells to the "Last, First" form written to the JSON output.
# Keys are usually a bare surname; several are "Surname, Given" (for example
# the Davis, King and Turner entries).  Every name containing accented
# characters is a unicode literal so keys and values are the same text type.
leg_map_83rd = {
    # senators
    "Birdwell": "Birdwell, Brian",
    "Campbell": "Campbell, Donna",
    "Carona": "Carona, John",
    "Davis": "Davis, Wendy",
    "Deuell": "Deuell, Bob",
    "Dewhurst": "Dewhurst, David",
    "Duncan": "Duncan, Robert",
    "Ellis": "Ellis, Rodney",
    "Eltife": "Eltife, Kevin",
    "Estes": "Estes, Craig",
    "Fraser": "Fraser, Troy",
    "Garcia": "Garcia, Sylvia",
    "Hancock": "Hancock, Kelly",
    "Hegar": "Hegar, Glenn",
    "Hinojosa": "Hinojosa, Juan",
    "Huffman": "Huffman, Joan",
    "Lucio": "Lucio, Eddie Jr.",
    "Nelson": "Nelson, Jane",
    "Nichols": "Nichols, Robert",
    "Patrick": "Patrick, Dan",
    "Paxton": "Paxton, Ken",
    u"Rodríguez": u"Rodríguez, José",
    "Schwertner": "Schwertner, Charles",
    "Seliger": "Seliger, Kel",
    "Taylor": "Taylor, Larry",
    "Uresti": "Uresti, Carlos",
    "Van de Putte": "Van de Putte, Leticia",
    "Watson": "Watson, Kirk",
    "West": "West, Royce",
    "Whitmire": "Whitmire, John",
    "Williams": "Williams, Tommy",
    "Zaffirini": "Zaffirini, Judith",
    # house
    "Allen": "Allen, Alma",
    "Alonzo": "Alonzo, Roberto",
    "Alvarado": "Alvarado, Carol",
    "Anchia": "Anchia, Rafael",
    "Anderson": "Anderson, Charles",
    "Ashby": "Ashby, Trent",
    "Aycock": "Aycock, Jimmie Don",
    "Bell": "Bell, Cecil",
    "Bohac": "Bohac, Dwayne",
    "Bonnen, Dennis": "Bonnen, Dennis",
    "Bonnen, Greg": "Bonnen, Greg",
    "Branch": "Branch, Dan",
    "Burkett": "Burkett, Cindy",
    "Burnam": "Burnam, Lon",
    "Button": "Button, Angie",
    "Callegari": "Callegari, Bill",
    "Canales": "Canales, Terry",
    "Capriglione": "Capriglione, Giovanni",
    "Carter": "Carter, Stefani",
    "Clardy": "Clardy, Travis",
    "Coleman": "Coleman, Garnet",
    "Collier": "Collier, Nicole",
    "Cook": "Cook, Byron",
    "Cortez": "Cortez, Philip",
    "Craddick": "Craddick, Tom",
    "Creighton": "Creighton, Brandon",
    "Crownover": "Crownover, Myra",
    "Dale": "Dale, Tony",
    "Darby": "Darby, Drew",
    "Davis, John": "Davis, John",
    "Davis, Sarah": "Davis, Sarah",
    "Davis, Yvonne": "Davis, Yvonne",
    "Deshotel": "Deshotel, Joe",
    "Dukes": "Dukes, Dawnna",
    "Dutton": "Dutton, Harold V. Jr.",
    "Eiland": "Eiland, Craig",
    "Elkins": "Elkins, Gary",
    "Fallon": "Fallon, Pat",
    "Farias": "Farias, Joe",
    "Farney": "Farney, Marsha",
    "Farrar": "Farrar, Jessica",
    "Fletcher": "Fletcher, Allen",
    "Flynn": "Flynn, Dan",
    "Frank": "Frank, James",
    "Frullo": "Frullo, John",
    "Geren": "Geren, Charlie",
    "Giddings": "Giddings, Helen",
    "Goldman": "Goldman, Craig",
    "Gonzales, Larry": "Gonzales, Larry",
    "Gonzalez, Naomi": "Gonzalez, Naomi",
    u"González, Mary": u"González, Mary",
    "Gooden": "Gooden, Lance",
    "Guerra": "Guerra, Robert D.",
    "Guillen": "Guillen, Ryan",
    "Gutierrez": "Gutierrez, Roland",
    "Harless": "Harless, Patricia",
    "Harper-Brown": "Harper-Brown, Linda",
    "Hernandez Luna": "Hernandez Luna, Ana",
    "Herrero": "Herrero, Abel",
    "Hilderbran": "Hilderbran, Harvey",
    "Howard": "Howard, Donna",
    "Huberty": "Huberty, Dan",
    "Hughes": "Hughes, Bryan",
    "Hunter": "Hunter, Todd",
    "Isaac": "Isaac, Jason",
    "Israel": "Israel, Celia",
    "Johnson": "Johnson, Eric",
    "Kacal": "Kacal, Kyle",
    "Keffer": "Keffer, Jim",
    "King, Ken": "King, Ken",
    "King, Phil": "King, Phil",
    "King, Susan": "King, Susan",
    "King, Tracy O.": "King, Tracy O.",
    "Kleinschmidt": "Kleinschmidt, Tim",
    "Klick": "Klick, Stephanie",
    "Kolkhorst": "Kolkhorst, Lois",
    "Krause": "Krause, Matt",
    "Kuempel": "Kuempel, John",
    "Larson": "Larson, Lyle",
    "Laubenberg": "Laubenberg, Jodie",
    "Lavender": "Lavender, George",
    "Leach": "Leach, Jeff",
    "Lewis": "Lewis, Tryon",
    "Longoria": "Longoria, Oscar",
    "Lozano": "Lozano, Jose Manuel",
    "Lucio III": "Lucio, Eddie III",
    "Martinez Fischer": "Martinez Fischer, Trey",
    'Martinez, "Mando"': "Martinez, Armando",
    "McClendon": "McClendon, Ruth",
    u"Menéndez": u"Menéndez, José",
    "Miles": "Miles, Borris",
    "Miller, Doug": "Miller, Doug",
    "Miller, Rick": "Miller, Rick",
    "Moody": "Moody, Joe",
    "Morrison": "Morrison, Geanie",
    "Murphy": "Murphy, Jim",
    u"Muñoz, Jr.": u"Muñoz, Sergio Jr.",
    u"Márquez": u"Márquez, Marisa",
    "Naishtat": "Naishtat, Elliott",
    u"Nevárez": u"Nevárez, Poncho",
    "Oliveira": u"Oliveira, René",
    "Orr": "Orr, Rob",
    "Otto": "Otto, John",
    "Paddie": "Paddie, Chris",
    "Parker": "Parker, Tan",
    "Patrick, Diane": "Patrick, Diane",
    "Perez": "Perez, Mary Ann",
    "Perry": "Perry, Charles",
    "Phillips": "Phillips, Larry",
    "Pickett": "Pickett, Joseph",
    "Pitts": "Pitts, Jim",
    "Price": 'Price, Walter "Four"',
    "Raney": "Raney, John",
    "Ratliff": "Ratliff, Bennett",
    "Raymond": u"Raymond, Richard Peña",
    "Reynolds": "Reynolds, Ron",
    "Riddle": "Riddle, Debbie",
    "Ritter": "Ritter, Allan",
    "Rodriguez, Eddie": "Rodriguez, Eddie",
    "Rodriguez, Justin": "Rodriguez, Justin",
    "Rose": "Rose, Toni",
    "Sanford": "Sanford, Scott",
    "Schaefer": "Schaefer, Matt",
    "Sheets": "Sheets, Kenneth",
    "Sheffield, J. D.": "Sheffield, Jesse David",
    "Sheffield, Ralph": "Sheffield, Ralph",
    "Simmons": "Simmons, Ron",
    "Simpson": "Simpson, David",
    "Smith": "Smith, Wayne",
    "Smithee": "Smithee, John",
    "Springer": "Springer, Drew",
    "Stephenson": "Stephenson, Phil",
    "Stickland": "Stickland, Jonathan",
    "Strama": "Strama, Mark",
    "Straus": "Straus, Joe",
    "Taylor, Van": "Taylor, Van",
    "Thompson, Ed": "Thompson, Ed",
    "Thompson, Senfronia": "Thompson, Senfronia",
    "Toth": "Toth, Steve",
    "Turner, Chris": "Turner, Chris",
    "Turner, Scott": "Turner, Scott",
    "Turner, Sylvester": "Turner, Sylvester",
    "Villalba": "Villalba, Jason",
    "Villarreal": "Villarreal, Mike",
    "Vo": "Vo, Hubert",
    "Walle": "Walle, Armando",
    "White": "White, James",
    "Workman": "Workman, Paul",
    "Wu": "Wu, Gene",
    "Zedler": "Zedler, Bill",
    "Zerwas": "Zerwas, John",
}
def _cell_names(page, cell_id):
    """Return the "|"-separated names held in the element with id *cell_id*.

    Each name is stripped of surrounding whitespace.  An empty list is
    returned when *page* contains no element with that id.
    """
    cell = page.find(id=cell_id)
    if not cell:
        return []
    return [name.strip() for name in cell.text.split("|")]


# Read the saved page inside a context manager so the file handle is closed
# as soon as the contents are in memory.
with open(sys.argv[1]) as html_file:
    html_doc = html_file.read()

# NOTE(review): no parser is named, so Beautiful Soup picks whichever one is
# installed; left as-is here so parse results do not change.
soup = BeautifulSoup(html_doc)

# Get last action date: the first space-delimited token of the cell text.
last_action_cell = soup.find(id="cellLastAction")
if last_action_cell:
    last_action_date = last_action_cell.text.split(" ", 1)[0]
else:
    # Fallback when the page has no last-action cell (presumably the year of
    # the 83rd session -- confirm).
    last_action_date = "2013"

# Names exactly as they appear on the page; each list is empty when the
# corresponding cell is missing.
authors = _cell_names(soup, "cellAuthors")
co_authors = _cell_names(soup, "cellCoauthors")
sponsors = _cell_names(soup, "cellSponsors")
co_sponsors = _cell_names(soup, "cellCosponsors")

# Keep the string of every child node of the subjects cell that has one;
# children whose .string is empty or None are dropped.
subject_list = []
subjects_cell = soup.find(id="cellSubjects")
if subjects_cell:
    subjects = subjects_cell.contents
    subject_list = [node.string for node in subjects if node.string]
# Try to parse the date into yyyy-mm-dd format, only if it is non-blank.
if last_action_date.strip() != "":
    try:
        # dateutil fills any field missing from the text from `default`; with
        # no explicit default it uses today's date, so a bare year such as
        # the "2013" fallback would pick up the month and day of whenever the
        # script happens to run.  A fixed default keeps the output stable.
        last_action_date = parser.parse(
            last_action_date,
            default=datetime.datetime(1900, 1, 1),
        ).strftime("%Y-%m-%d")
    except (ValueError, OverflowError):
        # Best effort: text dateutil cannot interpret is kept unchanged.
        # Only the parser's documented failures are caught, so
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        pass
def _full_names(page_names):
    """Translate names taken from the page into their leg_map_83rd values.

    Blank entries are skipped -- splitting an empty cell on "|" yields a
    single empty string -- so all four name lists are handled the same way.
    A non-blank name that is missing from leg_map_83rd still raises KeyError,
    which exposes the gap in the table.
    """
    return [leg_map_83rd[name] for name in page_names if name.strip() != ""]


bill["date"] = last_action_date
bill["filename"] = sys.argv[1]
bill["authors"] = _full_names(authors)
bill["co_authors"] = _full_names(co_authors)
bill["sponsors"] = _full_names(sponsors)
bill["co_sponsors"] = _full_names(co_sponsors)

bill["subjects"] = []
for subject in subject_list:
    # Remove parenthesised text.  The match is greedy: everything from the
    # first "(" to the last ")" goes.  A raw string avoids the invalid "\("
    # escape in a plain literal; the pattern itself is unchanged.
    subject = re.sub(r"\(.*\)", "", subject)
    subject = subject.replace("--", " - ")
    subject = subject.strip()
    # Subjects written entirely in capitals are converted to title case.
    if subject.isupper():
        subject = subject.title()
    bill["subjects"].append(subject)

# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(json.dumps(bill, sort_keys=True, indent=4, separators=(',', ': ')))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment