magical/html2data.py

## html2data.py
#!/usr/bin/env python3

"""Download an HTML page, translating <img> tags to data: URIs"""

import sys

import lxml.html as etree

from urllib.request import urlopen
from base64 import b64encode
from functools import lru_cache, partial

# Decorator factory: ``@memoize()`` caches every distinct argument forever.
memoize = partial(lru_cache, maxsize=None)

@memoize()
def data_from_url(url):
    """Fetch *url* and return its content as a base64 ``data:`` URI.

    Results are cached per URL, so a resource referenced several times is
    only downloaded once.

    The MIME type is taken from the response's Content-Type header with any
    parameters (e.g. ``; charset=...``) dropped.  When the header is missing
    or carries no media type, ``application/octet-stream`` is used instead.

    Errors raised by ``urlopen`` are not caught here.
    """
    with urlopen(url) as f:
        # A missing header comes back as None; normalise to '' so the
        # parsing below never fails on it.
        content_type = f.headers.get('content-type') or ''
        data = f.read()
    mime_type = content_type.partition(';')[0].strip() or 'application/octet-stream'
    return 'data:{0};base64,{1}'.format(mime_type, b64encode(data).decode('ascii'))

# The page to convert is the first command-line argument.
page_url = sys.argv[1]

# Parse the page, then resolve relative links against the page URL so that
# every image source can be fetched on its own.
tree = etree.parse(page_url)
tree.getroot().make_links_absolute(page_url)

# Select only <img> elements that carry a src attribute: a tag without one
# has nothing to inline and would otherwise raise KeyError below.
for img in tree.xpath('//img[@src]'):
    img.attrib['src'] = data_from_url(img.attrib['src'])

# Write raw UTF-8 bytes to stdout, bypassing the text layer's encoding.
sys.stdout.buffer.write(etree.tostring(tree, encoding='utf-8'))
	#!/usr/bin/env python3

	"""Download an HTML page, translating <img> tags to data: URIs"""

	import sys

	import lxml.html as etree

	from urllib.request import urlopen
	from base64 import b64encode
	from functools import lru_cache, partial

	# Decorator factory: ``@memoize()`` caches every distinct argument forever.
	memoize = partial(lru_cache, maxsize=None)

	@memoize()
	def data_from_url(url):
	    """Fetch *url* and return its content as a base64 ``data:`` URI.

	    Results are cached per URL, so a resource referenced several times is
	    only downloaded once.

	    The MIME type is taken from the response's Content-Type header with any
	    parameters (e.g. ``; charset=...``) dropped.  When the header is missing
	    or carries no media type, ``application/octet-stream`` is used instead.

	    Errors raised by ``urlopen`` are not caught here.
	    """
	    with urlopen(url) as f:
	        # A missing header comes back as None; normalise to '' so the
	        # parsing below never fails on it.
	        content_type = f.headers.get('content-type') or ''
	        data = f.read()
	    mime_type = content_type.partition(';')[0].strip() or 'application/octet-stream'
	    return 'data:{0};base64,{1}'.format(mime_type, b64encode(data).decode('ascii'))

	# The page to convert is the first command-line argument.
	page_url = sys.argv[1]

	# Parse the page, then resolve relative links against the page URL so that
	# every image source can be fetched on its own.
	tree = etree.parse(page_url)
	tree.getroot().make_links_absolute(page_url)

	# Select only <img> elements that carry a src attribute: a tag without one
	# has nothing to inline and would otherwise raise KeyError below.
	for img in tree.xpath('//img[@src]'):
	    img.attrib['src'] = data_from_url(img.attrib['src'])

	# Write raw UTF-8 bytes to stdout, bypassing the text layer's encoding.
	sys.stdout.buffer.write(etree.tostring(tree, encoding='utf-8'))