mdengler/html2csv.py

## html2csv.py
#!/bin/env python
# -*- coding: utf-8 -*-
"""

Examples:

%(progname)s http://en.wikipedia.org/wiki/List_of_Olympic_records_in_athletics

This is essentially the following logic, dressed up with a few options:

for table in lxml.html.fromstring(some_html).cssselect("table"):
    for row in table.cssselect("tr"):
        print ",".join([td.text_content() for td in row.cssselect("td")])


There are a few ways one can narrow down which tables are shown:

1) a table number (counted from 1) given as a second argument after the URL
2) --widest
3) --longest
4) --biggest
5) --starting-closest-to-text="text you know comes right before the main table"

All of these selectors will cause a maximum of one table to be displayed.


Author: Martin Dengler
License: GPL v3+
"""

import csv
import lxml
import lxml.html
import optparse
import os
import sys
import urllib


def tables_to_tuples(tables):
    """Flatten HTML tables into a list of rows of stripped cell text.

    Args:
        tables: iterable of elements as returned by ``cssselect("table")``.

    Returns:
        A list with one entry per ``<tr>`` of every table, in the order
        ``cssselect`` reports them.  Each entry is the list of
        whitespace-stripped text contents of that row's ``<td>`` cells; a
        row without ``<td>`` cells yields an empty list.

    Note:
        ``<span>`` elements hidden through an inline ``display: none`` style
        are dropped from the tree that was passed in, so their text does
        not show up in the output.
    """
    import re

    # Match a real "display: none" declaration only, so that a style such as
    # "display:inline; float:none" no longer hides visible text.
    hidden_style = re.compile(r"display\s*:\s*none\b", re.IGNORECASE)

    returned_rows = []
    for table in tables:
        for row in table.cssselect("tr"):
            cells = row.cssselect("td")
            for cell in cells:
                for span in cell.cssselect("span[style]"):
                    # lxml.html offers no get_text(); the inline style is a
                    # plain attribute of the element.
                    if hidden_style.search(span.get("style", "")):
                        span.drop_tree()
            returned_rows.append(
                [cell.text_content().strip() for cell in cells])
    return returned_rows


def gettree(args, starting_closest_to_text=None):
    """Parse HTML from stdin or from a URL into an lxml document tree.

    Args:
        args: positional command-line arguments.  When empty the HTML is
            read from stdin, otherwise ``args[0]`` is fetched as a URL.
        starting_closest_to_text: optional marker text.  When given, every
            line before the first line containing it is discarded before
            parsing.

    Returns:
        The element returned by ``lxml.html.fromstring`` for the (possibly
        truncated) input.

    Raises:
        ValueError: if ``starting_closest_to_text`` is given but occurs in
            no input line.
    """
    if len(args) == 0:
        lines = sys.stdin.readlines()
    else:
        # urlopen lives in urllib.request on Python 3 and in urllib on
        # Python 2.
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen
        response = urlopen(args[0])
        try:
            lines = response.readlines()
        finally:
            response.close()

    if starting_closest_to_text is not None:
        needle = starting_closest_to_text
        # Lines fetched on Python 3 are bytes; search with a matching type.
        needs_bytes = bool(lines) and isinstance(lines[0], bytes)
        if needs_bytes and not isinstance(needle, bytes):
            needle = needle.encode("utf-8")

        for index, line in enumerate(lines):
            if needle in line:
                lines = lines[index:]
                break
        else:
            raise ValueError("Could not find text [%s] in input" % starting_closest_to_text)

    # readlines() keeps each line terminator, so join without a separator;
    # joining on os.linesep would double every line break.
    empty = lines[0][:0] if lines else ""
    return lxml.html.fromstring(empty.join(lines))


def compare_by_rows(table_a, table_b):
    """Three-way compare two tables by their number of rows.

    Rows are counted as ``cssselect("tr")`` matches, so rows wrapped in
    ``<tbody>`` or ``<thead>`` count too, while non-row children such as
    ``<caption>`` do not.

    Returns:
        -1, 0 or 1 when ``table_a`` has fewer, as many or more rows than
        ``table_b``.
    """
    rows_a = len(table_a.cssselect("tr"))
    rows_b = len(table_b.cssselect("tr"))
    # Same result as the Python 2 builtin cmp(), which Python 3 removed.
    return (rows_a > rows_b) - (rows_a < rows_b)


def compare_by_columns(table_a, table_b):
    """Three-way compare two tables by the width of their widest row.

    The width of a row is the number of ``<td>``/``<th>`` elements found
    below it; a table without rows has width 0.

    Returns:
        -1, 0 or 1 when ``table_a`` is narrower than, as wide as or wider
        than ``table_b``.
    """
    def column_count(table):
        # The leading 0 keeps max() defined for a table without any row.
        return max([0] + [len(row.cssselect("td, th"))
                          for row in table.cssselect("tr")])

    cols_a = column_count(table_a)
    cols_b = column_count(table_b)
    # Same result as the Python 2 builtin cmp(), which Python 3 removed.
    return (cols_a > cols_b) - (cols_a < cols_b)


def compare_by_cells(table_a, table_b):
    """Three-way compare two tables by their total number of cells.

    Cells are the ``<td>`` and ``<th>`` elements found anywhere below the
    table.

    Returns:
        -1, 0 or 1 when ``table_a`` has fewer, as many or more cells than
        ``table_b``.
    """
    # iterdescendants() is an iterator and has no len(); count the matched
    # cell elements instead.
    cells_a = len(table_a.cssselect("td, th"))
    cells_b = len(table_b.cssselect("td, th"))
    # Same result as the Python 2 builtin cmp(), which Python 3 removed.
    return (cells_a > cells_b) - (cells_a < cells_b)


def longest(tables):
    """Return the table that ranks highest under ``compare_by_rows``."""
    winner = most(compare_by_rows, tables)
    return winner


def widest(tables):
    """Return the table that ranks highest under ``compare_by_columns``."""
    winner = most(compare_by_columns, tables)
    return winner


def biggest(tables):
    """Return the table that ranks highest under ``compare_by_cells``."""
    winner = most(compare_by_cells, tables)
    return winner


def most(comparator, tables):
    """Return the element of *tables* that ranks highest under *comparator*.

    Args:
        comparator: two-argument function returning a negative number, zero
            or a positive number, like the old ``cmp`` builtin.
        tables: sequence of items to rank.

    Returns:
        The highest-ranked item; among equally ranked items the one that
        comes last in *tables*.

    Raises:
        IndexError: if *tables* is empty.
    """
    # sorted() lost its cmp= argument in Python 3; cmp_to_key adapts the
    # comparator on both Python 2.7 and Python 3.
    from functools import cmp_to_key

    return sorted(tables, key=cmp_to_key(comparator))[-1]


def gettables(tree, args, biggest=False, longest=False, widest=False, only_first=False):
    """Select the tables of *tree* that should be written out.

    Args:
        tree: parsed document; must support ``cssselect``.
        args: positional command-line arguments.  If a second argument is
            present it is the number, counted from 1, of the only table to
            keep; a number past the last table leaves no table.
        biggest: keep only the table ranked highest by ``compare_by_cells``.
        longest: keep only the table ranked highest by ``compare_by_rows``.
        widest: keep only the table ranked highest by
            ``compare_by_columns``.
        only_first: keep only the first remaining table.

    Returns:
        A list of table elements, possibly empty.  ``biggest``, ``longest``
        and ``widest`` are checked in that order and the first one set wins.
    """
    tables = list(tree.cssselect("table"))

    if len(args) > 1:
        n = int(args[1]) - 1
        tables = tables[n:n+1]

    # The flag parameters shadow the module-level biggest()/longest()/
    # widest() helpers, so rank through most() with the matching comparator.
    # The winner is wrapped in a list because callers iterate over tables.
    if tables:
        if biggest:
            return [most(compare_by_cells, tables)]
        if longest:
            return [most(compare_by_rows, tables)]
        if widest:
            return [most(compare_by_columns, tables)]

    if only_first:
        tables = tables[0:1]

    return tables


def writetables(tables, outputfh=None):
    """Write every row of *tables* as CSV.

    Args:
        tables: iterable of table elements; rows are extracted with
            ``tables_to_tuples``.
        outputfh: file-like object to write to; defaults to ``sys.stdout``
            as it is bound when the function is called.

    Cell text is reduced to its ASCII characters before writing; every
    other character is dropped.
    """
    if outputfh is None:
        outputfh = sys.stdout
    csvwriter = csv.writer(outputfh)

    def clean(s):
        # Keep only characters below 128.  A plain comparison avoids the
        # Python 2-only unicode() call and a range(128) lookup per character.
        return "".join([c for c in s if ord(c) < 128])

    for row in tables_to_tuples(tables):
        csvwriter.writerow([clean(c) for c in row])


def main(args):
    """Command-line entry point: convert the selected HTML tables to CSV.

    Args:
        args: command-line arguments without the program name.  Options
            are ``--starting-closest-to-text=TEXT``, ``--biggest``,
            ``--widest`` and ``--longest``; the remaining positional
            arguments are handed to ``gettree`` and ``gettables``.

    Side effects:
        May set ``LANG`` in ``os.environ`` and replaces ``sys.stdout`` with
        an encoding writer that substitutes unencodable characters.
    """
    # from http://wjd.nu/notes/2009#unicodeencodeerror-python-redirect-pipe
    import codecs
    import locale

    # fix for the fix below: give the locale lookup a UTF-8 LANG to find
    if "LANG" not in os.environ or os.environ["LANG"] == "C":
        os.environ["LANG"] = "en_US.utf8"
    # The lookup can still come back without an encoding (for instance when
    # another locale variable overrides LANG), so fall back to UTF-8.
    encoding = locale.getdefaultlocale()[1] or "utf-8"
    # The codecs writer emits bytes; on Python 3 those must go to the binary
    # buffer underneath the text stream, when there is one.
    raw_stdout = getattr(sys.stdout, "buffer", sys.stdout)
    sys.stdout = codecs.getwriter(encoding)(raw_stdout, "replace")

    option_parser = optparse.OptionParser()
    option_parser.add_option("--starting-closest-to-text", type="string")
    option_parser.add_option("--biggest", action="store_true")
    option_parser.add_option("--widest", action="store_true")
    option_parser.add_option("--longest", action="store_true")

    options, remaining_args = option_parser.parse_args(args)

    tree = gettree(remaining_args,
                   starting_closest_to_text=options.starting_closest_to_text)
    # store_true options are None unless given on the command line.
    tables = gettables(tree, remaining_args,
                       biggest=bool(options.biggest),
                       widest=bool(options.widest),
                       longest=bool(options.longest),
                       only_first=options.starting_closest_to_text is not None)

    writetables(tables)

if __name__ == "__main__":
    # Run as a script: hand everything after the program name to main().
    cli_arguments = sys.argv[1:]
    main(cli_arguments)
	#!/bin/env python
	# -- coding: utf-8 --
	"""

	Examples:

	%(progname)s http://en.wikipedia.org/wiki/List_of_Olympic_records_in_athletics

	This is essentially the following logic, dressed up with a few options:

	for table in lxml.html.fromstring(some_html).cssselect("table"):
	    for row in table.cssselect("tr"):
	        print ",".join([td.text_content() for td in row.cssselect("td")])


	There are a few ways one can narrow down which tables are shown:

	1) a table number (counted from 1) given as a second argument after the URL
	2) --widest
	3) --longest
	4) --biggest
	5) --starting-closest-to-text="text you know comes right before the main table"

	All of these selectors will cause a maximum of one table to be displayed.


	Author: Martin Dengler
	License: GPL v3+
	"""

	import csv
	import lxml
	import lxml.html
	import optparse
	import os
	import sys
	import urllib


	def tables_to_tuples(tables):
	    """Flatten HTML tables into a list of rows of stripped cell text.

	    Args:
	        tables: iterable of elements as returned by ``cssselect("table")``.

	    Returns:
	        A list with one entry per ``<tr>`` of every table, in the order
	        ``cssselect`` reports them.  Each entry is the list of
	        whitespace-stripped text contents of that row's ``<td>`` cells; a
	        row without ``<td>`` cells yields an empty list.

	    Note:
	        ``<span>`` elements hidden through an inline ``display: none`` style
	        are dropped from the tree that was passed in, so their text does
	        not show up in the output.
	    """
	    import re

	    # Match a real "display: none" declaration only, so that a style such as
	    # "display:inline; float:none" no longer hides visible text.
	    hidden_style = re.compile(r"display\s*:\s*none\b", re.IGNORECASE)

	    returned_rows = []
	    for table in tables:
	        for row in table.cssselect("tr"):
	            cells = row.cssselect("td")
	            for cell in cells:
	                for span in cell.cssselect("span[style]"):
	                    # lxml.html offers no get_text(); the inline style is a
	                    # plain attribute of the element.
	                    if hidden_style.search(span.get("style", "")):
	                        span.drop_tree()
	            returned_rows.append(
	                [cell.text_content().strip() for cell in cells])
	    return returned_rows


	def gettree(args, starting_closest_to_text=None):
	    """Parse HTML from stdin or from a URL into an lxml document tree.

	    Args:
	        args: positional command-line arguments.  When empty the HTML is
	            read from stdin, otherwise ``args[0]`` is fetched as a URL.
	        starting_closest_to_text: optional marker text.  When given, every
	            line before the first line containing it is discarded before
	            parsing.

	    Returns:
	        The element returned by ``lxml.html.fromstring`` for the (possibly
	        truncated) input.

	    Raises:
	        ValueError: if ``starting_closest_to_text`` is given but occurs in
	            no input line.
	    """
	    if len(args) == 0:
	        lines = sys.stdin.readlines()
	    else:
	        # urlopen lives in urllib.request on Python 3 and in urllib on
	        # Python 2.
	        try:
	            from urllib.request import urlopen
	        except ImportError:
	            from urllib import urlopen
	        response = urlopen(args[0])
	        try:
	            lines = response.readlines()
	        finally:
	            response.close()

	    if starting_closest_to_text is not None:
	        needle = starting_closest_to_text
	        # Lines fetched on Python 3 are bytes; search with a matching type.
	        needs_bytes = bool(lines) and isinstance(lines[0], bytes)
	        if needs_bytes and not isinstance(needle, bytes):
	            needle = needle.encode("utf-8")

	        for index, line in enumerate(lines):
	            if needle in line:
	                lines = lines[index:]
	                break
	        else:
	            raise ValueError("Could not find text [%s] in input" % starting_closest_to_text)

	    # readlines() keeps each line terminator, so join without a separator;
	    # joining on os.linesep would double every line break.
	    empty = lines[0][:0] if lines else ""
	    return lxml.html.fromstring(empty.join(lines))


	def compare_by_rows(table_a, table_b):
	    """Three-way compare two tables by their number of rows.

	    Rows are counted as ``cssselect("tr")`` matches, so rows wrapped in
	    ``<tbody>`` or ``<thead>`` count too, while non-row children such as
	    ``<caption>`` do not.

	    Returns:
	        -1, 0 or 1 when ``table_a`` has fewer, as many or more rows than
	        ``table_b``.
	    """
	    rows_a = len(table_a.cssselect("tr"))
	    rows_b = len(table_b.cssselect("tr"))
	    # Same result as the Python 2 builtin cmp(), which Python 3 removed.
	    return (rows_a > rows_b) - (rows_a < rows_b)


	def compare_by_columns(table_a, table_b):
	    """Three-way compare two tables by the width of their widest row.

	    The width of a row is the number of ``<td>``/``<th>`` elements found
	    below it; a table without rows has width 0.

	    Returns:
	        -1, 0 or 1 when ``table_a`` is narrower than, as wide as or wider
	        than ``table_b``.
	    """
	    def column_count(table):
	        # The leading 0 keeps max() defined for a table without any row.
	        return max([0] + [len(row.cssselect("td, th"))
	                          for row in table.cssselect("tr")])

	    cols_a = column_count(table_a)
	    cols_b = column_count(table_b)
	    # Same result as the Python 2 builtin cmp(), which Python 3 removed.
	    return (cols_a > cols_b) - (cols_a < cols_b)


	def compare_by_cells(table_a, table_b):
	    """Three-way compare two tables by their total number of cells.

	    Cells are the ``<td>`` and ``<th>`` elements found anywhere below the
	    table.

	    Returns:
	        -1, 0 or 1 when ``table_a`` has fewer, as many or more cells than
	        ``table_b``.
	    """
	    # iterdescendants() is an iterator and has no len(); count the matched
	    # cell elements instead.
	    cells_a = len(table_a.cssselect("td, th"))
	    cells_b = len(table_b.cssselect("td, th"))
	    # Same result as the Python 2 builtin cmp(), which Python 3 removed.
	    return (cells_a > cells_b) - (cells_a < cells_b)


	def longest(tables):
	    """Return the table that ranks highest under ``compare_by_rows``."""
	    winner = most(compare_by_rows, tables)
	    return winner


	def widest(tables):
	    """Return the table that ranks highest under ``compare_by_columns``."""
	    winner = most(compare_by_columns, tables)
	    return winner


	def biggest(tables):
	    """Return the table that ranks highest under ``compare_by_cells``."""
	    winner = most(compare_by_cells, tables)
	    return winner


	def most(comparator, tables):
	    """Return the element of *tables* that ranks highest under *comparator*.

	    Args:
	        comparator: two-argument function returning a negative number, zero
	            or a positive number, like the old ``cmp`` builtin.
	        tables: sequence of items to rank.

	    Returns:
	        The highest-ranked item; among equally ranked items the one that
	        comes last in *tables*.

	    Raises:
	        IndexError: if *tables* is empty.
	    """
	    # sorted() lost its cmp= argument in Python 3; cmp_to_key adapts the
	    # comparator on both Python 2.7 and Python 3.
	    from functools import cmp_to_key

	    return sorted(tables, key=cmp_to_key(comparator))[-1]


	def gettables(tree, args, biggest=False, longest=False, widest=False, only_first=False):
	    """Select the tables of *tree* that should be written out.

	    Args:
	        tree: parsed document; must support ``cssselect``.
	        args: positional command-line arguments.  If a second argument is
	            present it is the number, counted from 1, of the only table to
	            keep; a number past the last table leaves no table.
	        biggest: keep only the table ranked highest by ``compare_by_cells``.
	        longest: keep only the table ranked highest by ``compare_by_rows``.
	        widest: keep only the table ranked highest by
	            ``compare_by_columns``.
	        only_first: keep only the first remaining table.

	    Returns:
	        A list of table elements, possibly empty.  ``biggest``, ``longest``
	        and ``widest`` are checked in that order and the first one set wins.
	    """
	    tables = list(tree.cssselect("table"))

	    if len(args) > 1:
	        n = int(args[1]) - 1
	        tables = tables[n:n+1]

	    # The flag parameters shadow the module-level biggest()/longest()/
	    # widest() helpers, so rank through most() with the matching comparator.
	    # The winner is wrapped in a list because callers iterate over tables.
	    if tables:
	        if biggest:
	            return [most(compare_by_cells, tables)]
	        if longest:
	            return [most(compare_by_rows, tables)]
	        if widest:
	            return [most(compare_by_columns, tables)]

	    if only_first:
	        tables = tables[0:1]

	    return tables


	def writetables(tables, outputfh=None):
	    """Write every row of *tables* as CSV.

	    Args:
	        tables: iterable of table elements; rows are extracted with
	            ``tables_to_tuples``.
	        outputfh: file-like object to write to; defaults to ``sys.stdout``
	            as it is bound when the function is called.

	    Cell text is reduced to its ASCII characters before writing; every
	    other character is dropped.
	    """
	    if outputfh is None:
	        outputfh = sys.stdout
	    csvwriter = csv.writer(outputfh)

	    def clean(s):
	        # Keep only characters below 128.  A plain comparison avoids the
	        # Python 2-only unicode() call and a range(128) lookup per character.
	        return "".join([c for c in s if ord(c) < 128])

	    for row in tables_to_tuples(tables):
	        csvwriter.writerow([clean(c) for c in row])



	def main(args):
	    """Command-line entry point: convert the selected HTML tables to CSV.

	    Args:
	        args: command-line arguments without the program name.  Options
	            are ``--starting-closest-to-text=TEXT``, ``--biggest``,
	            ``--widest`` and ``--longest``; the remaining positional
	            arguments are handed to ``gettree`` and ``gettables``.

	    Side effects:
	        May set ``LANG`` in ``os.environ`` and replaces ``sys.stdout`` with
	        an encoding writer that substitutes unencodable characters.
	    """
	    # from http://wjd.nu/notes/2009#unicodeencodeerror-python-redirect-pipe
	    import codecs
	    import locale

	    # fix for the fix below: give the locale lookup a UTF-8 LANG to find
	    if "LANG" not in os.environ or os.environ["LANG"] == "C":
	        os.environ["LANG"] = "en_US.utf8"
	    # The lookup can still come back without an encoding (for instance when
	    # another locale variable overrides LANG), so fall back to UTF-8.
	    encoding = locale.getdefaultlocale()[1] or "utf-8"
	    # The codecs writer emits bytes; on Python 3 those must go to the binary
	    # buffer underneath the text stream, when there is one.
	    raw_stdout = getattr(sys.stdout, "buffer", sys.stdout)
	    sys.stdout = codecs.getwriter(encoding)(raw_stdout, "replace")

	    option_parser = optparse.OptionParser()
	    option_parser.add_option("--starting-closest-to-text", type="string")
	    option_parser.add_option("--biggest", action="store_true")
	    option_parser.add_option("--widest", action="store_true")
	    option_parser.add_option("--longest", action="store_true")

	    options, remaining_args = option_parser.parse_args(args)

	    tree = gettree(remaining_args,
	                   starting_closest_to_text=options.starting_closest_to_text)
	    # store_true options are None unless given on the command line.
	    tables = gettables(tree, remaining_args,
	                       biggest=bool(options.biggest),
	                       widest=bool(options.widest),
	                       longest=bool(options.longest),
	                       only_first=options.starting_closest_to_text is not None)

	    writetables(tables)

	if __name__ == "__main__":
	    # Run as a script: hand everything after the program name to main().
	    cli_arguments = sys.argv[1:]
	    main(cli_arguments)