Skip to content

Instantly share code, notes, and snippets.

@greglinch
Last active March 10, 2017 22:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save greglinch/5197267b6ff8fcb19192ba5443f1f71d to your computer and use it in GitHub Desktop.
Save greglinch/5197267b6ff8fcb19192ba5443f1f71d to your computer and use it in GitHub Desktop.
Converts an HTML table from the congressional bio directory to a CSV file. For downloading images, see https://gist.github.com/greglinch/608001fa0ae39834af18354c9e8c6f09
from bs4 import BeautifulSoup
'''
Prereqs:
- Go to the congressional bio directory http://bioguide.congress.gov/biosearch/biosearch.asp
- Search using the parameters you want
- Use your browser's "Inspect Element" on the results table and copy its HTML
- Paste the HTML into a file and (optional?) wrap it with <html></html> tags
'''
def convert_html_to_csv(file_in='FILENAME.html', new_file='FILENAME.csv'):
    '''
    Convert a saved congressional bio directory results table to a csv.

    Reads the HTML in file_in, skips the first <tr> (treated as the header
    row) and writes one "id,lastname,statepostal,chamber" line per remaining
    row to new_file. Rows that lack an expected link or cell are printed and
    skipped; the count of successfully processed rows is printed at the end.

    Args:
        file_in: path of the HTML file to read.
        new_file: path of the csv file to write (overwritten if it exists).
    '''
    ## open the file and hand its contents to the HTML parser
    with open(file_in, 'r') as read_file:
        soup = BeautifulSoup(read_file.read(), 'html.parser')

    ## find all rows and drop the header row; slicing (instead of pop(0))
    ## means an input without any <tr> no longer raises IndexError
    rows = soup.find_all('tr')[1:]

    ## data row count
    ## (print with a single parenthesised argument behaves the same on
    ## Python 2 and Python 3)
    print('Total data rows:\t' + str(len(rows)) + '\n')

    ## collect the output lines and join them once at the end
    lines = ['id,lastname,statepostal,chamber\n']
    rows_processed = 0

    for row in rows:
        try:
            ## look each tag list up once per row
            link = row.find_all('a')[0]
            cells = row.find_all('td')
            ## the id is whatever follows the first '=' in the link's href
            member_id = link['href'].split('=')[1]
            ## link text is split on ','; the part before it is the last name
            member_lastname = link.string.split(',')[0].title()
            member_statepostal = cells[-2].string
            member_chamber = cells[2].string
        except (IndexError, KeyError, AttributeError):
            ## missing <a>/<td>, missing href, no '=' in the href, or a link
            ## whose .string is None -- report the row and move on
            print('Error:\t\t' + str(row) + '\n')
            continue

        lines.append('%s,%s,%s,%s\n' % (
            member_id, member_lastname, member_statepostal, member_chamber))
        rows_processed += 1

    print('Rows processed:\t\t' + str(rows_processed) + '\n')

    ## write to the csv
    with open(new_file, 'w') as write_file:
        write_file.write(''.join(lines))
## execute the function; this call is unguarded, so it runs whenever the
## file is executed or imported. With no arguments it reads FILENAME.html
## and writes FILENAME.csv, both relative to the current working directory.
convert_html_to_csv()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment