mila/naive_html2csv.py

## naive_html2csv.py
#!/usr/bin/env python3

import argparse
import csv
import re


# Precompiled, case-insensitive tag matchers. Each pattern matches one whole
# tag, from its "<" up to the next ">", so attributes are included.
# NOTE: no word boundary follows the tag name, so longer names sharing the
# prefix also match (e.g. "<thead>" matches td_open_re).
table_open_re = re.compile('<table[^>]*>', flags=re.I)
table_close_re = re.compile('</table[^>]*>', flags=re.I)
tr_open_re = re.compile('<tr[^>]*>', flags=re.I)
tr_close_re = re.compile('</tr[^>]*>', flags=re.I)
# One pair of patterns covers both data cells (<td>) and header cells (<th>).
td_open_re = re.compile('<t[dh][^>]*>', flags=re.I)
td_close_re = re.compile('</t[dh][^>]*>', flags=re.I)


def _scan(pattern, string, start, end):
    """Find the first match of *pattern* within ``string[start:end]``.

    Args:
        pattern: Compiled regular expression to search with.
        string: Text to search.
        start: Index at which the search begins.
        end: Index at which the search stops; a match must lie entirely
            before it to be reported.

    Returns:
        A ``(match_start, match_end)`` tuple for the first match, or
        ``(end, end)`` when nothing matches inside the bounds.
    """
    # Passing ``end`` as ``endpos`` keeps the search inside the current
    # element. Without it, a tag belonging to a later row or table would be
    # reported and the caller's slice would run past its own boundary.
    match = pattern.search(string, start, end)
    if match is None:
        return end, end
    return match.span()


def _find_elements(start_re, close_re, html, pos, end):
    """Yield ``(start, end)`` index pairs for the contents of successive elements.

    Scanning begins at *pos*. Each yielded span starts right after an opening
    tag matched by *start_re* and stops at the next closing tag matched by
    *close_re* or at the next opening tag, whichever comes first, so an
    element whose closing tag is missing ends where the next one begins.
    Nested elements of the same kind are not handled: an inner opening tag
    simply ends the outer span.

    Args:
        start_re: Compiled pattern matching the opening tag.
        close_re: Compiled pattern matching the closing tag.
        html: The document text.
        pos: Index at which scanning starts.
        end: Index bounding the region to scan.

    Yields:
        ``(content_start, content_end)`` pairs, with ``content_end`` never
        greater than *end*.
    """
    # Move to just past the first opening tag; if that position is not
    # below ``end`` the loop body never runs and nothing is yielded.
    _, pos = _scan(start_re, html, pos, end)
    while pos < end:
        close_start, _ = _scan(close_re, html, pos, end)
        next_open_start, next_open_end = _scan(start_re, html, pos, end)
        # Clamp to ``end`` so the yielded span can never leave the requested
        # range, whatever positions the scans report.
        yield pos, min(close_start, next_open_start, end)
        pos = next_open_end


def read_stream(stream):
    """Read all of *stream* and parse the table rows it contains.

    Args:
        stream: Object whose ``read()`` result is handed to ``read_html``.

    Returns:
        Whatever ``read_html`` returns for the stream's contents.
    """
    return read_html(stream.read())


def read_html(html):
    """Yield every row of every table found in *html*, in document order.

    Rows of consecutive tables are chained into a single flat sequence; no
    marker separates one table's rows from the next.

    Args:
        html: The document text.

    Yields:
        The rows produced by ``read_table`` for each table span.
    """
    document_end = len(html)
    for span in _find_elements(
            table_open_re, table_close_re, html, 0, document_end):
        yield from read_table(html, *span)


def read_table(html, table_start, table_end):
    """Yield one parsed row for each ``<tr>`` span inside a table.

    Args:
        html: The document text.
        table_start: Index where the table's contents begin.
        table_end: Index where the table's contents end.

    Yields:
        The result of ``read_row`` for each row span, in order.
    """
    for bounds in _find_elements(
            tr_open_re, tr_close_re, html, table_start, table_end):
        yield read_row(html, *bounds)


def read_row(html, row_start, row_end):
    """Return the cells of one table row as a list of strings.

    Each cell is the raw slice of *html* between its tags; nested markup and
    character entities are left exactly as they appear in the document.

    Args:
        html: The document text.
        row_start: Index where the row's contents begin.
        row_end: Index where the row's contents end.

    Returns:
        A list with one string per ``<td>``/``<th>`` span found in the row.
    """
    spans = _find_elements(td_open_re, td_close_re, html, row_start, row_end)
    return [html[begin:stop] for begin, stop in spans]


def main():
    """Command-line entry point: write the tables of an HTML file as CSV.

    Takes two positional arguments: ``source``, the HTML file to read, and
    ``target``, the CSV file to write. The target is opened in ``'w'`` mode,
    so any existing content is replaced.
    """
    parser = argparse.ArgumentParser(
        description='Extracts CSV from the tables in a HTML page.',
    )
    parser.add_argument('source')
    parser.add_argument('target')
    args = parser.parse_args()

    with open(args.source) as input_stream:
        # newline='' hands line-ending control to the csv writer, as the csv
        # module documentation requires. Without it, platforms that write
        # "\r\n" emit an extra "\r" per row, and newlines embedded in cells
        # (common in HTML) are not preserved correctly.
        with open(args.target, 'w', newline='') as output_stream:
            writer = csv.writer(output_stream)
            writer.writerows(read_stream(input_stream))


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python3

	import argparse
	import csv
	import re


	# Precompiled, case-insensitive tag matchers. Each pattern matches one whole
	# tag, from its "<" up to the next ">", so attributes are included.
	# NOTE: no word boundary follows the tag name, so longer names sharing the
	# prefix also match (e.g. "<thead>" matches td_open_re).
	table_open_re = re.compile('<table[^>]*>', flags=re.I)
	table_close_re = re.compile('</table[^>]*>', flags=re.I)
	tr_open_re = re.compile('<tr[^>]*>', flags=re.I)
	tr_close_re = re.compile('</tr[^>]*>', flags=re.I)
	# One pair of patterns covers both data cells (<td>) and header cells (<th>).
	td_open_re = re.compile('<t[dh][^>]*>', flags=re.I)
	td_close_re = re.compile('</t[dh][^>]*>', flags=re.I)


	def _scan(pattern, string, start, end):
		"""Find the first match of *pattern* within ``string[start:end]``.

		Args:
			pattern: Compiled regular expression to search with.
			string: Text to search.
			start: Index at which the search begins.
			end: Index at which the search stops; a match must lie entirely
				before it to be reported.

		Returns:
			A ``(match_start, match_end)`` tuple for the first match, or
			``(end, end)`` when nothing matches inside the bounds.
		"""
		# Passing ``end`` as ``endpos`` keeps the search inside the current
		# element. Without it, a tag belonging to a later row or table would
		# be reported and the caller's slice would run past its own boundary.
		match = pattern.search(string, start, end)
		if match is None:
			return end, end
		return match.span()


	def _find_elements(start_re, close_re, html, pos, end):
		"""Yield ``(start, end)`` index pairs for the contents of successive elements.

		Scanning begins at *pos*. Each yielded span starts right after an
		opening tag matched by *start_re* and stops at the next closing tag
		matched by *close_re* or at the next opening tag, whichever comes
		first, so an element whose closing tag is missing ends where the next
		one begins. Nested elements of the same kind are not handled: an inner
		opening tag simply ends the outer span.

		Args:
			start_re: Compiled pattern matching the opening tag.
			close_re: Compiled pattern matching the closing tag.
			html: The document text.
			pos: Index at which scanning starts.
			end: Index bounding the region to scan.

		Yields:
			``(content_start, content_end)`` pairs, with ``content_end`` never
			greater than *end*.
		"""
		# Move to just past the first opening tag; if that position is not
		# below ``end`` the loop body never runs and nothing is yielded.
		_, pos = _scan(start_re, html, pos, end)
		while pos < end:
			close_start, _ = _scan(close_re, html, pos, end)
			next_open_start, next_open_end = _scan(start_re, html, pos, end)
			# Clamp to ``end`` so the yielded span can never leave the
			# requested range, whatever positions the scans report.
			yield pos, min(close_start, next_open_start, end)
			pos = next_open_end


	def read_stream(stream):
		"""Read all of *stream* and parse the table rows it contains.

		Args:
			stream: Object whose ``read()`` result is handed to ``read_html``.

		Returns:
			Whatever ``read_html`` returns for the stream's contents.
		"""
		html = stream.read()
		return read_html(html)


	def read_html(html):
		"""Yield every row of every table found in *html*, in document order.

		Rows of consecutive tables are chained into a single flat sequence; no
		marker separates one table's rows from the next.

		Args:
			html: The document text.

		Yields:
			The rows produced by ``read_table`` for each table span.
		"""
		tables = _find_elements(table_open_re, table_close_re, html, 0, len(html))
		for table_start, table_end in tables:
			yield from read_table(html, table_start, table_end)


	def read_table(html, table_start, table_end):
		"""Yield one parsed row for each ``<tr>`` span inside a table.

		Args:
			html: The document text.
			table_start: Index where the table's contents begin.
			table_end: Index where the table's contents end.

		Yields:
			The result of ``read_row`` for each row span, in order.
		"""
		rows = _find_elements(tr_open_re, tr_close_re, html, table_start, table_end)
		for row_start, row_end in rows:
			yield read_row(html, row_start, row_end)


	def read_row(html, row_start, row_end):
		"""Return the cells of one table row as a list of strings.

		Each cell is the raw slice of *html* between its tags; nested markup
		and character entities are left exactly as they appear in the document.

		Args:
			html: The document text.
			row_start: Index where the row's contents begin.
			row_end: Index where the row's contents end.

		Returns:
			A list with one string per ``<td>``/``<th>`` span found in the row.
		"""
		cells = _find_elements(td_open_re, td_close_re, html, row_start, row_end)
		return [html[cell_start:cell_end] for cell_start, cell_end in cells]


	def main():
		"""Command-line entry point: write the tables of an HTML file as CSV.

		Takes two positional arguments: ``source``, the HTML file to read, and
		``target``, the CSV file to write. The target is opened in ``'w'``
		mode, so any existing content is replaced.
		"""
		parser = argparse.ArgumentParser(
			description='Extracts CSV from the tables in a HTML page.',
		)
		parser.add_argument('source')
		parser.add_argument('target')
		args = parser.parse_args()

		with open(args.source) as input_stream:
			# newline='' hands line-ending control to the csv writer, as the
			# csv module documentation requires. Without it, platforms that
			# write "\r\n" emit an extra "\r" per row, and newlines embedded
			# in cells are not preserved correctly.
			with open(args.target, 'w', newline='') as output_stream:
				data = read_stream(input_stream)
				writer = csv.writer(output_stream)
				writer.writerows(data)


	# Run the command-line interface only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == '__main__':
		main()