Created
April 3, 2012 04:20
-
-
Save NickCarneiro/2289281 to your computer and use it in GitHub Desktop.
Cap 10k parsing script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup

# Produce tab-delimited CSV files from large HTML files.
#
# For each year in 2008..2012 this reads "<year>.html" from the current
# directory, takes the first <table> in the document, and writes the text of
# the <b> element in every <td> cell to "<year>.csv".
#
# Output layout: the first <tr> becomes one output line on its own; after
# that, every two consecutive <tr> elements are merged into a single output
# line (a line is emitted whenever the row index is even). Every field is
# followed by a tab, so each line carries a trailing tab before the newline.
for year in range(2008, 2013):
    # Parenthesised single-argument print works on both Python 2 and 3.
    print('processing ' + str(year))
    infile = str(year) + '.html'
    outfile = str(year) + '.csv'

    # Context managers guarantee the handles are closed even if parsing
    # or writing raises part-way through.
    with open(infile, 'r') as f:
        html = f.read()

    soup = BeautifulSoup(html)
    table = soup.find("table")
    rows = table.find_all('tr')

    # Write data to the CSV file.
    with open(outfile, 'w') as out:
        fields = []
        for i, row in enumerate(rows):
            for col in row.find_all('td'):
                b = col.find('b')
                # A cell without a <b> child yields an empty field instead of
                # crashing on None.string; this keeps the columns aligned.
                fields.append(str(b.string) if b is not None else '')
            if i % 2 == 0:
                out.write(''.join(field + '\t' for field in fields) + '\n')
                fields = []
        # Lines are only emitted on even row indices, so a trailing
        # odd-indexed row would otherwise be dropped silently; flush it.
        if fields:
            out.write(''.join(field + '\t' for field in fields) + '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment