Skip to content

Instantly share code, notes, and snippets.

@magical
Created March 19, 2012 04:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save magical/2095875 to your computer and use it in GitHub Desktop.
Save magical/2095875 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""Download an HTML page, translating <img> tags to data: URIs"""
import sys
import lxml.html as etree
from urllib.request import urlopen
from base64 import b64encode
from functools import lru_cache, partial
# Decorator factory for an unbounded cache, so a URL that appears several
# times on the page is only downloaded once per run.
memoize = partial(lru_cache, maxsize=None)


@memoize()
def data_from_url(url):
    """Fetch *url* and return its content encoded as a base64 ``data:`` URI.

    The media type is taken from the response's Content-Type header with any
    parameters (e.g. ``; charset=...``) stripped.  When the header is missing
    or carries no media type, ``application/octet-stream`` is used instead of
    failing.

    Results are cached per URL for the lifetime of the process.

    Raises whatever ``urlopen`` raises when the resource cannot be fetched.
    """
    with urlopen(url) as f:
        # Header lookup yields None when the server sent no Content-Type.
        content_type = f.headers.get('content-type') or ''
        data = f.read()

    # Keep only the "type/subtype" part in front of the first ';'.
    mime_type = content_type.partition(';')[0].strip()
    if not mime_type:
        mime_type = 'application/octet-stream'

    payload = b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, payload)
# Script body: read the page named on the command line, inline every image
# as a data: URI, and write the resulting HTML to stdout as UTF-8 bytes.
if len(sys.argv) < 2:
    sys.exit('usage: {0} PAGE_URL'.format(sys.argv[0]))
page_url = sys.argv[1]

tree = etree.parse(page_url)
# Resolve relative links against the page URL so image sources can be
# fetched on their own.
tree.getroot().make_links_absolute(page_url)

# Only <img> elements that actually have a src attribute can be inlined;
# selecting on @src avoids a KeyError for images without one.
for img in tree.xpath('//img[@src]'):
    img.set('src', data_from_url(img.get('src')))

# Write bytes directly so the output encoding does not depend on the
# terminal/locale settings of sys.stdout.
sys.stdout.buffer.write(etree.tostring(tree, encoding='utf-8'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment