kroger/url2pdf.py

## url2pdf.py
#!/usr/bin/env python

import os
import subprocess
from lxml import html
from lxml.html import builder as E
from lxml.html.clean import clean_html


# Wiki page to convert; only its main content <div> ends up in the PDF.
url = "http://dangerousprototypes.com/docs/Bus_Blaster"

# Intermediate HTML file, named after the last URL path segment
# ("Bus_Blaster"); the PDF is written next to it as "<name>.pdf".
htmlfile = os.path.basename(url)

node = html.parse(url)
matches = node.xpath("//div[@id='content']")
if not matches:
    # Exit with a readable message instead of a bare IndexError on [0].
    raise SystemExit("no <div id='content'> found at %s" % url)
content = matches[0]

# remove table of contents
# (".//" keeps the search inside `content`; a leading "//" is evaluated
# against the whole document even when called on an element)
for toc in content.xpath(".//table[@id='toc']"):
    toc.getparent().remove(toc)

# remove cruft
for item in content.xpath(".//div[@id='jump-to-nav']"):
    item.getparent().remove(item)

# Wrap the extracted content in a minimal document and resolve relative
# links against the site root so they still work outside the wiki.
result = E.HTML(E.BODY(content))
result.make_links_absolute("http://dangerousprototypes.com")

# html.tostring() returns bytes by default, so the file must be opened in
# binary mode (text mode raises TypeError on Python 3).
with open(htmlfile, 'wb') as out:
    out.write(html.tostring(result))


# Convert the saved HTML to PDF with the external `htmldoc` tool and surface
# a failure through the exit status instead of silently ignoring it.
status = subprocess.call(["htmldoc", "--webpage", "-f", htmlfile + '.pdf', htmlfile])
if status != 0:
    raise SystemExit("htmldoc exited with status %d" % status)
	#!/usr/bin/env python

	import os
	import subprocess
	from lxml import html
	from lxml.html import builder as E
	from lxml.html.clean import clean_html


	# Wiki page to convert; only its main content <div> ends up in the PDF.
	url = "http://dangerousprototypes.com/docs/Bus_Blaster"

	# Intermediate HTML file, named after the last URL path segment
	# ("Bus_Blaster"); the PDF is written next to it as "<name>.pdf".
	htmlfile = os.path.basename(url)

	node = html.parse(url)
	matches = node.xpath("//div[@id='content']")
	if not matches:
	    # Exit with a readable message instead of a bare IndexError on [0].
	    raise SystemExit("no <div id='content'> found at %s" % url)
	content = matches[0]

	# remove table of contents
	# (".//" keeps the search inside `content`; a leading "//" is evaluated
	# against the whole document even when called on an element)
	for toc in content.xpath(".//table[@id='toc']"):
	    toc.getparent().remove(toc)

	# remove cruft
	for item in content.xpath(".//div[@id='jump-to-nav']"):
	    item.getparent().remove(item)

	# Wrap the extracted content in a minimal document and resolve relative
	# links against the site root so they still work outside the wiki.
	result = E.HTML(E.BODY(content))
	result.make_links_absolute("http://dangerousprototypes.com")

	# html.tostring() returns bytes by default, so the file must be opened in
	# binary mode (text mode raises TypeError on Python 3).
	with open(htmlfile, 'wb') as out:
	    out.write(html.tostring(result))


	# Convert the saved HTML to PDF with the external `htmldoc` tool and surface
	# a failure through the exit status instead of silently ignoring it.
	status = subprocess.call(["htmldoc", "--webpage", "-f", htmlfile + '.pdf', htmlfile])
	if status != 0:
	    raise SystemExit("htmldoc exited with status %d" % status)