Skip to content

Instantly share code, notes, and snippets.

@mdengler
Created June 21, 2012 04:45
Show Gist options
  • Save mdengler/2963878 to your computer and use it in GitHub Desktop.
Save mdengler/2963878 to your computer and use it in GitHub Desktop.
html2csv.py
#!/bin/env python
# -*- coding: utf-8 -*-
"""
Examples:
%(progname)s http://en.wikipedia.org/wiki/List_of_Olympic_records_in_athletics
This is essentially the following logic, done up:
    for table in lxml.html.fromstring(some_html).cssselect("table"):
        for row in table.cssselect("tr"):
            print ",".join([td.text_content() for td in row.cssselect("td")])
There are a few ways one can narrow down which tables are shown:
1) a 1-based table number given as a second positional argument (after the URL)
2) --widest
3) --longest
4) --biggest
5) --starting-closest-to-text="text you know comes right before the main table"
Each of the option-based selectors causes at most one table to be displayed.
Author: Martin Dengler
License: GPL v3+
"""
import csv
import functools
import optparse
import os
import sys
import urllib

import lxml
import lxml.html
def tables_to_tuples(tables):
    """Flatten tables into a list of rows of stripped cell texts.

    ``tables`` is an iterable of elements as returned by
    ``cssselect("table")``.  Every ``tr`` found in each table contributes one
    row; a row is the list of ``text_content().strip()`` values of its ``td``
    cells (rows without ``td`` cells yield an empty list).

    Spans whose inline style hides them (``display:none``) are removed from
    the cell before its text is read, so invisible text does not leak into
    the output.  Note that this mutates the passed-in tree.
    """
    returned_rows = []
    for table in tables:
        for row in table.cssselect("tr"):
            cells = row.cssselect("td")
            for cell in cells:
                for span in cell.cssselect("span[style]"):
                    # Read the attribute straight off the element, then
                    # normalise whitespace/case so "display: none" matches
                    # while e.g. "display:inline; text-decoration:none"
                    # does not.
                    style_text = span.get("style") or ""
                    normalized = "".join(style_text.split()).lower()
                    if "display:none" in normalized:
                        span.drop_tree()
            returned_rows.append(
                [cell.text_content().strip() for cell in cells])
    return returned_rows
def gettree(args, starting_closest_to_text=None):
    """Return the lxml document tree for stdin or for the URL in ``args[0]``.

    args -- positional command-line arguments; when empty the HTML is read
        from stdin, otherwise ``args[0]`` is opened as a URL.
    starting_closest_to_text -- optional marker text.  When given, every
        input line before the first line containing the marker is discarded
        before parsing.

    Raises Exception if the marker text is given but does not occur in the
    input.
    """
    if len(args) == 0:
        lines = sys.stdin.readlines()
    else:
        # urlopen lives in different modules on Python 2 and Python 3.
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen
        response = urlopen(args[0])
        try:
            lines = response.readlines()
        finally:
            response.close()

    if starting_closest_to_text is not None:
        needle = starting_closest_to_text
        # On Python 3 a URL response yields bytes lines; match like with like.
        if lines and isinstance(lines[0], bytes) and not isinstance(needle, bytes):
            needle = needle.encode("utf-8")
        found_line = None
        for text_line, line in enumerate(lines):
            if needle in line:
                found_line = text_line
                break
        if found_line is None:
            raise Exception("Could not find text [%s] in input" % starting_closest_to_text)
        lines = lines[found_line:]

    # readlines() keeps each line terminator, so join with an empty separator
    # of the same type (str or bytes) instead of inserting extra newlines.
    html = lines[0][:0].join(lines) if lines else ""
    return lxml.html.fromstring(html)
def _row_count(table):
    """Return the number of ``tr`` rows found inside *table*."""
    return len(table.cssselect("tr"))


def compare_by_rows(table_a, table_b):
    """cmp-style comparison of two tables by their number of ``tr`` rows.

    Returns -1, 0 or 1 when table_a has fewer, as many, or more rows than
    table_b.  Rows are counted with ``cssselect("tr")`` so that a wrapping
    ``tbody`` does not hide them.
    """
    rows_a = _row_count(table_a)
    rows_b = _row_count(table_b)
    # Portable replacement for the Python 2-only cmp() builtin.
    return (rows_a > rows_b) - (rows_a < rows_b)
def _column_count(table):
    """Return the cell count (``td`` and ``th``) of the widest row of *table*."""
    # Seed with 0 so a table without rows has width 0 instead of raising.
    return max([0] + [len(row.cssselect("td, th"))
                      for row in table.cssselect("tr")])


def compare_by_columns(table_a, table_b):
    """cmp-style comparison of two tables by the width of their widest row.

    Returns -1, 0 or 1 when table_a is narrower than, as wide as, or wider
    than table_b.
    """
    width_a = _column_count(table_a)
    width_b = _column_count(table_b)
    # Portable replacement for the Python 2-only cmp() builtin.
    return (width_a > width_b) - (width_a < width_b)
def _cell_count(table):
    """Return the total number of ``td`` and ``th`` cells inside *table*."""
    return len(table.cssselect("td, th"))


def compare_by_cells(table_a, table_b):
    """cmp-style comparison of two tables by their total number of cells.

    Returns -1, 0 or 1 when table_a has fewer, as many, or more cells than
    table_b.
    """
    cells_a = _cell_count(table_a)
    cells_b = _cell_count(table_b)
    # Portable replacement for the Python 2-only cmp() builtin.
    return (cells_a > cells_b) - (cells_a < cells_b)
def longest(tables):
    """Return the table ranked highest by compare_by_rows."""
    winner = most(compare_by_rows, tables)
    return winner
def widest(tables):
    """Return the table ranked highest by compare_by_columns."""
    winner = most(compare_by_columns, tables)
    return winner
def biggest(tables):
    """Return the table ranked highest by compare_by_cells."""
    winner = most(compare_by_cells, tables)
    return winner
def most(comparator, tables):
    """Return the element of *tables* that ranks highest under *comparator*.

    comparator -- cmp-style function taking two elements and returning a
        negative, zero or positive number.
    tables -- iterable of elements to rank.

    The sort is stable, so among equally ranked maxima the one appearing
    last in *tables* is returned.

    Raises IndexError if *tables* is empty.
    """
    # cmp_to_key works on both Python 2.7 and 3, unlike sorted(cmp=...).
    ranked = sorted(tables, key=functools.cmp_to_key(comparator))
    if not ranked:
        raise IndexError("no tables to choose from")
    return ranked[-1]
def gettables(tree, args, biggest=False, longest=False, widest=False, only_first=False):
    """Return the list of ``table`` elements of *tree* selected by the options.

    tree -- parsed document exposing ``cssselect``.
    args -- positional command-line arguments; when a second argument is
        present it is the 1-based number of the only table to keep.
    biggest / longest / widest -- keep only the table with the most cells /
        rows / columns.  Checked in that order; the first one set wins.
    only_first -- keep only the first table (ignored when one of the three
        selectors above is set).

    Always returns a list (possibly empty), so the result can be passed
    straight to writetables().
    """
    tables = list(tree.cssselect("table"))
    if len(args) > 1:
        n = int(args[1]) - 1
        tables = tables[n:n + 1]
    if not tables:
        return []

    # The boolean parameters shadow the module-level functions of the same
    # names, so rank through most() and the comparators directly.
    if biggest:
        return [most(compare_by_cells, tables)]
    if longest:
        return [most(compare_by_rows, tables)]
    if widest:
        return [most(compare_by_columns, tables)]
    if only_first:
        tables = tables[:1]
    return tables
def writetables(tables, outputfh=None):
    """Write the cells of *tables* as CSV rows to *outputfh*.

    tables -- iterable of table elements, as accepted by tables_to_tuples().
    outputfh -- writable file-like object; defaults to sys.stdout.

    Every cell is reduced to its ASCII characters before being written;
    non-ASCII characters are dropped.
    """
    if outputfh is None:
        outputfh = sys.stdout
    csvwriter = csv.writer(outputfh)

    def clean(s):
        """Return *s* as text containing only its ASCII characters."""
        if isinstance(s, bytes):
            # Python 2 byte strings: decode leniently rather than crash on
            # stray non-ASCII bytes.
            s = s.decode("utf-8", "replace")
        else:
            s = u"%s" % (s,)
        return "".join([c for c in s if ord(c) < 128])

    for row in tables_to_tuples(tables):
        csvwriter.writerow([clean(c) for c in row])
def main(args):
    """Parse command-line *args*, fetch the HTML and print its tables as CSV.

    args -- argument list without the program name (e.g. ``sys.argv[1:]``).
        Positional arguments: an optional URL (stdin is read when absent)
        followed by an optional 1-based table number.  Options:
        --starting-closest-to-text, --biggest, --widest, --longest.
    """
    if sys.version_info[0] < 3:
        # Python 2 only: make redirected/piped stdout accept unicode.
        # Python 3 text streams already encode, and wrapping them in a
        # codecs writer would break writing str.
        # fix for the below fix
        if "LANG" not in os.environ or os.environ["LANG"] == "C":
            os.environ["LANG"] = "en_US.utf8"
        # from http://wjd.nu/notes/2009#unicodeencodeerror-python-redirect-pipe
        import codecs
        import locale
        # getdefaultlocale() may report no encoding; fall back to UTF-8.
        encoding = locale.getdefaultlocale()[1] or "utf-8"
        sys.stdout = codecs.getwriter(encoding)(sys.stdout, 'replace')

    option_parser = optparse.OptionParser()
    option_parser.add_option("--starting-closest-to-text", type="string")
    option_parser.add_option("--biggest", action="store_true")
    option_parser.add_option("--widest", action="store_true")
    option_parser.add_option("--longest", action="store_true")
    options, remaining_args = option_parser.parse_args(args)

    tree = gettree(remaining_args,
                   starting_closest_to_text=options.starting_closest_to_text)
    # store_true options are None when absent; bool() also stays correct if
    # an explicit default of False is ever added.
    tables = gettables(tree, remaining_args,
                       biggest=bool(options.biggest),
                       widest=bool(options.widest),
                       longest=bool(options.longest),
                       only_first=options.starting_closest_to_text is not None)
    writetables(tables)
if __name__ == "__main__":
    # Script entry point: hand everything after the program name to main().
    cli_args = sys.argv[1:]
    main(cli_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment