Skip to content

Instantly share code, notes, and snippets.

@mila
Created March 29, 2018 13:08
Show Gist options
  • Save mila/912dc9e7d6dfe763b873e2bbea155e4b to your computer and use it in GitHub Desktop.
Save mila/912dc9e7d6dfe763b873e2bbea155e4b to your computer and use it in GitHub Desktop.
Extracts CSV from the tables in an HTML page (using regular expressions).
#!/usr/bin/env python3
import argparse
import csv
import re
# Tag-matching patterns (case-insensitive).  The ``\b`` after each tag name
# stops a pattern from matching a longer tag that merely starts with the same
# letters: without it ``<t[dh]`` also matches ``<thead>`` and ``<tr`` also
# matches ``<track>``.
table_open_re = re.compile(r'<table\b[^>]*>', re.IGNORECASE)
table_close_re = re.compile(r'</table\b[^>]*>', re.IGNORECASE)
tr_open_re = re.compile(r'<tr\b[^>]*>', re.IGNORECASE)
tr_close_re = re.compile(r'</tr\b[^>]*>', re.IGNORECASE)
# Cells: both data cells (<td>) and header cells (<th>).
td_open_re = re.compile(r'<t[dh]\b[^>]*>', re.IGNORECASE)
td_close_re = re.compile(r'</t[dh]\b[^>]*>', re.IGNORECASE)
def _scan(pattern, string, start, end):
match = pattern.search(string, start)
if not match:
return end, end
return match.start(), match.end()
def _find_elements(start_re, close_re, html, pos, end):
    """Yield ``(content_start, content_stop)`` index pairs for elements in *html*.

    Scanning (delegated to ``_scan``) begins at *pos*.  Each yielded span
    starts right after an opening tag matched by *start_re* and stops at
    whichever comes first: the next match of *close_re* or the next match of
    *start_re*.  Iteration ends once the next content start is not below
    *end*.  Nested elements of the same kind are not tracked.
    """
    # Skip ahead to just past the first opening tag.
    content_start = _scan(start_re, html, pos, end)[1]
    while content_start < end:
        next_open_start, next_open_end = _scan(
            start_re, html, content_start, end)
        closing_start = _scan(close_re, html, content_start, end)[0]
        # An element ends at its closing tag, or implicitly where the next
        # sibling opens if that comes sooner.
        if closing_start < next_open_start:
            content_stop = closing_start
        else:
            content_stop = next_open_start
        yield content_start, content_stop
        content_start = next_open_end
def read_stream(stream):
    """Read all of *stream* and return the row iterator from ``read_html``.

    *stream* only needs a ``read()`` method returning the HTML text.
    """
    return read_html(stream.read())
def read_html(html):
    """Yield, in document order, the rows of every table found in *html*.

    Tables are located with ``_find_elements`` over the whole string and
    each one is handed to ``read_table``; its rows are passed through as-is.
    """
    document_end = len(html)
    table_spans = _find_elements(
        table_open_re, table_close_re, html, 0, document_end)
    for span_start, span_stop in table_spans:
        for row in read_table(html, span_start, span_stop):
            yield row
def read_table(html, table_start, table_end):
    """Yield one ``read_row`` result per row element of a table.

    Rows are located with ``_find_elements`` using the ``<tr>`` patterns,
    scanning *html* from *table_start* with *table_end* as the limit.
    """
    row_spans = _find_elements(
        tr_open_re, tr_close_re, html, table_start, table_end)
    yield from (read_row(html, begin, stop) for begin, stop in row_spans)
def read_row(html, row_start, row_end):
    """Return the cells of one table row as a list of strings.

    Cells are located with ``_find_elements`` using the ``<td>``/``<th>``
    patterns, scanning *html* from *row_start* with *row_end* as the limit.
    Each cell is the raw slice of *html* for that span; no tag stripping or
    entity decoding is applied.
    """
    cell_spans = _find_elements(
        td_open_re, td_close_re, html, row_start, row_end)
    return [html[begin:stop] for begin, stop in cell_spans]
def main():
    """Command-line entry point: write the tables of an HTML file as CSV.

    Takes two positional command-line arguments: ``source``, the path of the
    HTML file to read, and ``target``, the path of the CSV file to write
    (overwritten if it exists).  Every row returned by ``read_stream`` is
    written to the target with ``csv.writer``.
    """
    parser = argparse.ArgumentParser(
        description='Extracts CSV from the tables in a HTML page.',
    )
    parser.add_argument('source')
    parser.add_argument('target')
    args = parser.parse_args()
    # newline='' is required by the csv module: the writer emits its own line
    # terminators, and without it newlines embedded in cells are translated
    # and platforms that write '\r\n' end up with '\r\r\n' row endings.
    with open(args.source) as input_stream, \
            open(args.target, 'w', newline='') as output_stream:
        writer = csv.writer(output_stream)
        writer.writerows(read_stream(input_stream))
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment