Skip to content

Instantly share code, notes, and snippets.

@kroger
Created April 1, 2012 18:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kroger/2277731 to your computer and use it in GitHub Desktop.
Save kroger/2277731 to your computer and use it in GitHub Desktop.
Download a web page, clean it, and generate a PDF using htmldoc
#!/usr/bin/env python
import os
import subprocess
from lxml import html
from lxml.html import builder as E
from lxml.html.clean import clean_html
# Page to convert. The last path component of the URL doubles as the name of
# the intermediate HTML file written to the current directory.
url = "http://dangerousprototypes.com/docs/Bus_Blaster"
htmlfile = os.path.basename(url)

# Fetch and parse the page, then keep only the main content container.
node = html.parse(url)
matches = node.xpath("//div[@id='content']")
if not matches:
    # Fail with a readable message instead of an IndexError on `[0]`.
    raise SystemExit("no <div id='content'> found in %s" % url)
content = matches[0]

# Remove the table of contents. The leading "." keeps the search inside
# `content`; a bare "//" is an absolute path and scans the whole document.
for toc in content.xpath(".//table[@id='toc']"):
    toc.getparent().remove(toc)

# Remove cruft (the 'jump-to-nav' block).
for item in content.xpath(".//div[@id='jump-to-nav']"):
    item.getparent().remove(item)

# Wrap the content in a minimal document. Links are resolved against the
# page's own URL so that page-relative references still point at the
# original site once the HTML is stored locally.
result = E.HTML(E.BODY(content))
result.make_links_absolute(url)

# html.tostring() returns bytes by default, so the file must be opened in
# binary mode (text mode raises TypeError on Python 3).
with open(htmlfile, 'wb') as out:
    out.write(html.tostring(result))

# Convert the saved HTML to "<htmlfile>.pdf" and surface a failing exit
# status instead of silently discarding it.
status = subprocess.call(["htmldoc", "--webpage", "-f", htmlfile + '.pdf', htmlfile])
if status != 0:
    raise SystemExit("htmldoc exited with status %d" % status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment