Created
May 10, 2014 15:59
-
-
Save vphill/222e88019267478845bf to your computer and use it in GitHub Desktop.
Code to parse the HTML found at http://www.capitol.state.tx.us/BillLookup/History.aspx?LegSess=83R&Bill=HB5 and produce a JSON version.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=UTF-8 | |
import sys | |
import re | |
import json | |
from dateutil import parser | |
from bs4 import BeautifulSoup | |
# The script takes exactly one argument: the path to a saved bill-history
# HTML page.
if len(sys.argv) != 2:
    # A parenthesized single-argument print emits the same text on Python 2
    # and is also valid on Python 3.
    print("usage: parse_bill_html.py <bill_html>")
    # sys.exit instead of the bare exit() helper: exit() is installed by the
    # site module for interactive sessions and is missing under "python -S".
    sys.exit(-1)

# Collects the fields that are serialized to JSON at the end of the script.
bill = {}
# Maps a legislator's name as it is looked up from the bill page's
# author/sponsor cells (surname only, or "Surname, First" where a surname is
# shared) to the "Surname, First" form written to the JSON output.
# Presumably covers the 83rd Texas Legislature -- confirm against the roster.
leg_map_83rd = {
    #senators
    "Birdwell": "Birdwell, Brian",
    "Campbell": "Campbell, Donna",
    "Carona": "Carona, John",
    "Davis": "Davis, Wendy",
    "Deuell": "Deuell, Bob",
    "Dewhurst": "Dewhurst, David",
    "Duncan": "Duncan, Robert",
    "Ellis": "Ellis, Rodney",
    "Eltife": "Eltife, Kevin",
    "Estes": "Estes, Craig",
    "Fraser": "Fraser, Troy",
    "Garcia": "Garcia, Sylvia",
    "Hancock": "Hancock, Kelly",
    "Hegar": "Hegar, Glenn",
    "Hinojosa": "Hinojosa, Juan",
    "Huffman": "Huffman, Joan",
    "Lucio": "Lucio, Eddie Jr.",
    "Nelson": "Nelson, Jane",
    "Nichols": "Nichols, Robert",
    "Patrick": "Patrick, Dan",
    "Paxton": "Paxton, Ken",
    # Value is a unicode literal like every other accented entry.
    u"Rodríguez": u"Rodríguez, José",
    "Schwertner": "Schwertner, Charles",
    "Seliger": "Seliger, Kel",
    "Taylor": "Taylor, Larry",
    "Uresti": "Uresti, Carlos",
    "Van de Putte": "Van de Putte, Leticia",
    "Watson": "Watson, Kirk",
    "West": "West, Royce",
    "Whitmire": "Whitmire, John",
    "Williams": "Williams, Tommy",
    "Zaffirini": "Zaffirini, Judith",
    #house
    "Allen": "Allen, Alma",
    "Alonzo": "Alonzo, Roberto",
    "Alvarado": "Alvarado, Carol",
    "Anchia": "Anchia, Rafael",
    "Anderson": "Anderson, Charles",
    "Ashby": "Ashby, Trent",
    "Aycock": "Aycock, Jimmie Don",
    "Bell": "Bell, Cecil",
    "Bohac": "Bohac, Dwayne",
    "Bonnen, Dennis": "Bonnen, Dennis",
    "Bonnen, Greg": "Bonnen, Greg",
    "Branch": "Branch, Dan",
    "Burkett": "Burkett, Cindy",
    "Burnam": "Burnam, Lon",
    "Button": "Button, Angie",
    "Callegari": "Callegari, Bill",
    "Canales": "Canales, Terry",
    "Capriglione": "Capriglione, Giovanni",
    "Carter": "Carter, Stefani",
    "Clardy": "Clardy, Travis",
    "Coleman": "Coleman, Garnet",
    "Collier": "Collier, Nicole",
    "Cook": "Cook, Byron",
    "Cortez": "Cortez, Philip",
    "Craddick": "Craddick, Tom",
    "Creighton": "Creighton, Brandon",
    "Crownover": "Crownover, Myra",
    "Dale": "Dale, Tony",
    "Darby": "Darby, Drew",
    "Davis, John": "Davis, John",
    "Davis, Sarah": "Davis, Sarah",
    "Davis, Yvonne": "Davis, Yvonne",
    "Deshotel": "Deshotel, Joe",
    "Dukes": "Dukes, Dawnna",
    "Dutton": "Dutton, Harold V. Jr.",
    "Eiland": "Eiland, Craig",
    "Elkins": "Elkins, Gary",
    "Fallon": "Fallon, Pat",
    "Farias": "Farias, Joe",
    "Farney": "Farney, Marsha",
    "Farrar": "Farrar, Jessica",
    "Fletcher": "Fletcher, Allen",
    "Flynn": "Flynn, Dan",
    "Frank": "Frank, James",
    "Frullo": "Frullo, John",
    "Geren": "Geren, Charlie",
    "Giddings": "Giddings, Helen",
    "Goldman": "Goldman, Craig",
    "Gonzales, Larry": "Gonzales, Larry",
    "Gonzalez, Naomi": "Gonzalez, Naomi",
    u"González, Mary": u"González, Mary",
    "Gooden": "Gooden, Lance",
    "Guerra": "Guerra, Robert D.",
    "Guillen": "Guillen, Ryan",
    "Gutierrez": "Gutierrez, Roland",
    "Harless": "Harless, Patricia",
    "Harper-Brown": "Harper-Brown, Linda",
    "Hernandez Luna": "Hernandez Luna, Ana",
    "Herrero": "Herrero, Abel",
    "Hilderbran": "Hilderbran, Harvey",
    "Howard": "Howard, Donna",
    "Huberty": "Huberty, Dan",
    "Hughes": "Hughes, Bryan",
    "Hunter": "Hunter, Todd",
    "Isaac": "Isaac, Jason",
    "Israel": "Israel, Celia",
    "Johnson": "Johnson, Eric",
    "Kacal": "Kacal, Kyle",
    "Keffer": "Keffer, Jim",
    "King, Ken": "King, Ken",
    "King, Phil": "King, Phil",
    "King, Susan": "King, Susan",
    "King, Tracy O.": "King, Tracy O.",
    "Kleinschmidt": "Kleinschmidt, Tim",
    "Klick": "Klick, Stephanie",
    "Kolkhorst": "Kolkhorst, Lois",
    "Krause": "Krause, Matt",
    "Kuempel": "Kuempel, John",
    "Larson": "Larson, Lyle",
    "Laubenberg": "Laubenberg, Jodie",
    "Lavender": "Lavender, George",
    "Leach": "Leach, Jeff",
    "Lewis": "Lewis, Tryon",
    "Longoria": "Longoria, Oscar",
    "Lozano": "Lozano, Jose Manuel",
    "Lucio III": "Lucio, Eddie III",
    "Martinez Fischer": "Martinez Fischer, Trey",
    'Martinez, "Mando"': "Martinez, Armando",
    "McClendon": "McClendon, Ruth",
    u"Menéndez": u"Menéndez, José",
    "Miles": "Miles, Borris",
    "Miller, Doug": "Miller, Doug",
    "Miller, Rick": "Miller, Rick",
    "Moody": "Moody, Joe",
    "Morrison": "Morrison, Geanie",
    "Murphy": "Murphy, Jim",
    u"Muñoz, Jr.": u"Muñoz, Sergio Jr.",
    u"Márquez": u"Márquez, Marisa",
    "Naishtat": "Naishtat, Elliott",
    u"Nevárez": u"Nevárez, Poncho",
    "Oliveira": u"Oliveira, René",
    "Orr": "Orr, Rob",
    "Otto": "Otto, John",
    "Paddie": "Paddie, Chris",
    "Parker": "Parker, Tan",
    "Patrick, Diane": "Patrick, Diane",
    "Perez": "Perez, Mary Ann",
    "Perry": "Perry, Charles",
    "Phillips": "Phillips, Larry",
    "Pickett": "Pickett, Joseph",
    "Pitts": "Pitts, Jim",
    "Price": 'Price, Walter "Four"',
    "Raney": "Raney, John",
    "Ratliff": "Ratliff, Bennett",
    "Raymond": u"Raymond, Richard Peña",
    "Reynolds": "Reynolds, Ron",
    "Riddle": "Riddle, Debbie",
    "Ritter": "Ritter, Allan",
    "Rodriguez, Eddie": "Rodriguez, Eddie",
    "Rodriguez, Justin": "Rodriguez, Justin",
    "Rose": "Rose, Toni",
    "Sanford": "Sanford, Scott",
    "Schaefer": "Schaefer, Matt",
    "Sheets": "Sheets, Kenneth",
    "Sheffield, J. D.": "Sheffield, Jesse David",
    "Sheffield, Ralph": "Sheffield, Ralph",
    "Simmons": "Simmons, Ron",
    "Simpson": "Simpson, David",
    "Smith": "Smith, Wayne",
    "Smithee": "Smithee, John",
    "Springer": "Springer, Drew",
    "Stephenson": "Stephenson, Phil",
    "Stickland": "Stickland, Jonathan",
    "Strama": "Strama, Mark",
    "Straus": "Straus, Joe",
    "Taylor, Van": "Taylor, Van",
    "Thompson, Ed": "Thompson, Ed",
    "Thompson, Senfronia": "Thompson, Senfronia",
    "Toth": "Toth, Steve",
    "Turner, Chris": "Turner, Chris",
    "Turner, Scott": "Turner, Scott",
    "Turner, Sylvester": "Turner, Sylvester",
    "Villalba": "Villalba, Jason",
    "Villarreal": "Villarreal, Mike",
    "Vo": "Vo, Hubert",
    "Walle": "Walle, Armando",
    "White": "White, James",
    "Workman": "Workman, Paul",
    "Wu": "Wu, Gene",
    "Zedler": "Zedler, Bill",
    "Zerwas": "Zerwas, John",
}
# Read the whole page up front; the context manager closes the file handle
# even if the read fails.
with open(sys.argv[1]) as html_file:
    html_doc = html_file.read()
# NOTE(review): no parser is named here, so BeautifulSoup chooses one from
# whatever is installed -- consider pinning one so results are reproducible.
soup = BeautifulSoup(html_doc)
# Last action date: the first space-delimited token of the "cellLastAction"
# element, or the bare year "2013" when the page has no such element.
last_action_cell = soup.find(id="cellLastAction")
last_action_date = (
    last_action_cell.text.split(" ", 1)[0] if last_action_cell else "2013"
)
def _cell_names(cell_id):
    """Return the non-blank, "|"-separated names in the element *cell_id*.

    Each name is stripped of surrounding whitespace. An empty list is
    returned when the page has no element with that id.
    """
    cell = soup.find(id=cell_id)
    if cell is None:
        return []
    # An empty cell splits to [""]; dropping blanks here keeps the later
    # leg_map_83rd lookups from failing on an empty-string key.
    return [name.strip() for name in cell.text.split("|") if name.strip()]


authors = _cell_names("cellAuthors")
co_authors = _cell_names("cellCoauthors")
sponsors = _cell_names("cellSponsors")
co_sponsors = _cell_names("cellCosponsors")
# Collect the string of every direct child of the "cellSubjects" element,
# skipping children whose .string is empty or None.
subject_list = []
subjects_cell = soup.find(id="cellSubjects")
if subjects_cell:
    subject_list = [node.string for node in subjects_cell.contents if node.string]
# Normalize the date to yyyy-mm-dd, but only when there is something to parse.
# NOTE(review): dateutil fills fields missing from the input from the current
# date, so a year-only value (such as the "2013" fallback) would pick up
# today's month and day -- confirm that is intended.
if last_action_date.strip():
    try:
        last_action_date = parser.parse(last_action_date).strftime("%Y-%m-%d")
    except (ValueError, OverflowError, TypeError):
        # Best effort: a value that cannot be parsed or formatted is kept
        # exactly as scraped. Unlike a bare "except:", this lets
        # KeyboardInterrupt and SystemExit propagate.
        pass
bill["date"] = last_action_date
bill["filename"] = sys.argv[1]

# Translate every scraped name to its full form through leg_map_83rd.
# Blank entries are skipped for all four roles, so an empty cell cannot
# trigger a lookup on "". A non-blank name that is missing from the map
# still raises KeyError, which exposes legislators the map does not cover.
for role, names in (
    ("authors", authors),
    ("co_authors", co_authors),
    ("sponsors", sponsors),
    ("co_sponsors", co_sponsors),
):
    bill[role] = [leg_map_83rd[name] for name in names if name.strip() != ""]
# Matches from the first "(" to the last ")" in a subject -- presumably the
# trailing subject code. Raw string so "\(" is not an invalid string escape.
_subject_paren_re = re.compile(r"\(.*\)")

bill["subjects"] = []
for raw_subject in subject_list:
    # Drop the parenthesized part, then turn the "--" separators into " - ".
    subject = _subject_paren_re.sub("", raw_subject)
    subject = subject.replace("--", " - ").strip()
    if not subject:
        # Whitespace-only nodes would otherwise become "" subjects.
        continue
    # Subjects written entirely in capitals are converted to title case.
    if subject.isupper():
        subject = subject.title()
    bill["subjects"].append(subject)
# Emit the record as pretty-printed JSON with sorted keys on stdout. The
# parenthesized call prints the same text on Python 2 and also runs on
# Python 3, where the print statement is a syntax error.
print(json.dumps(bill, sort_keys=True, indent=4, separators=(',', ': ')))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment