public
Last active

Python 3 data URL handler

  • Download Gist
dataurl.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
import binascii
import urllib.request
import urllib.parse
import email.message
import io
 
__all__ = ['parse_data_url','DataResponse','DataHandler']
 
def parse_data_url(url):
    """Split a ``data:`` URL into its decoded payload and media type.

    Args:
        url: A string of the form ``data:[<mediatype>][;base64],<data>``.

    Returns:
        A ``(data, mediatype)`` tuple. ``data`` is the payload as ``bytes``
        (percent-decoded, and base64-decoded when the media type ends with
        ``;base64``). ``mediatype`` is the text before the comma with any
        trailing ``;base64`` removed, or ``None`` when that text is empty.

    Raises:
        ValueError: If the scheme is not ``data`` or the ``,`` separator
            between media type and payload is missing.
        binascii.Error: If a base64 payload cannot be decoded.
    """
    scheme, _, remainder = url.partition(":")
    # URL schemes are case-insensitive, and validation must survive
    # ``python -O`` (which strips assert statements), so raise explicitly.
    if scheme.lower() != "data":
        raise ValueError("unsupported scheme: " + scheme)
    mediatype, comma, payload = remainder.partition(",")
    if not comma:
        raise ValueError("malformed data URL: missing ',' before the data")
    # base64 urls might have a padding which might (should) be quoted:
    payload = urllib.parse.unquote_to_bytes(payload)
    if mediatype.endswith(";base64"):
        return binascii.a2b_base64(payload), mediatype[:-len(";base64")] or None
    return payload, mediatype or None
 
# DataResponse carries the parsed media type and mimics part of the
# HTTPResponse interface: msg, headers, length, info, geturl, getheader and
# getheaders.
class DataResponse(io.BytesIO):
    """In-memory, file-like response built from a ``data:`` URL.

    The decoded payload is the BytesIO content. ``url`` is the URL the
    response was created from, ``mediatype`` is the media type returned by
    ``parse_data_url`` (possibly ``None``), ``length`` is the payload size in
    bytes, and ``headers``/``msg`` refer to the same ``email.message.Message``.
    """

    __slots__ = 'url', 'mediatype', 'msg', 'headers', 'length'

    def __init__(self, url):
        """Parse *url* and initialise the buffer and response attributes."""
        payload, media = parse_data_url(url)
        super().__init__(payload)
        self.url = url
        self.mediatype = media
        self.length = len(payload)
        message = email.message.Message()
        # msg and headers are two names for one Message object.
        self.headers = message
        self.msg = message
        if media is not None:
            message.add_header("Content-Type", media)

    def getheader(self, name, default=None):
        """Return the values of header *name* joined by ``', '``.

        When the header is absent, *default* is used instead; a string or
        non-iterable default is returned unchanged.
        """
        values = self.headers.get_all(name) or default
        if not isinstance(values, str) and hasattr(values, '__iter__'):
            return ', '.join(values)
        return values

    def getheaders(self):
        """Return all headers as a list of ``(name, value)`` pairs."""
        pairs = self.headers.items()
        return list(pairs)

    def geturl(self):
        """Return the URL this response was created from."""
        return self.url

    def info(self):
        """Return the header container (the same object as ``headers``)."""
        return self.headers
 
class DataHandler(urllib.request.BaseHandler):
    """urllib handler whose ``data_open`` hook answers ``data:`` requests."""

    def data_open(self, req):
        """Build a DataResponse from the full URL of request *req*."""
        target = req.full_url
        return DataResponse(target)
dataurl_simple.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
import binascii
import urllib.request
import urllib.parse
import io
 
__all__ = ['parse_data_url','DataHandler']
 
def parse_data_url(url):
    """Split a ``data:`` URL into its decoded payload and media type.

    Args:
        url: A string of the form ``data:[<mediatype>][;base64],<data>``.

    Returns:
        A ``(data, mediatype)`` tuple. ``data`` is the payload as ``bytes``
        (percent-decoded, and base64-decoded when the media type ends with
        ``;base64``). ``mediatype`` is the text before the comma with any
        trailing ``;base64`` removed, or ``None`` when that text is empty.

    Raises:
        ValueError: If the scheme is not ``data`` or the ``,`` separator
            between media type and payload is missing.
        binascii.Error: If a base64 payload cannot be decoded.
    """
    scheme, _, remainder = url.partition(":")
    # URL schemes are case-insensitive, and validation must survive
    # ``python -O`` (which strips assert statements), so raise explicitly.
    if scheme.lower() != "data":
        raise ValueError("unsupported scheme: " + scheme)
    mediatype, comma, payload = remainder.partition(",")
    if not comma:
        raise ValueError("malformed data URL: missing ',' before the data")
    # base64 urls might have a padding which might (should) be quoted:
    payload = urllib.parse.unquote_to_bytes(payload)
    if mediatype.endswith(";base64"):
        return binascii.a2b_base64(payload), mediatype[:-len(";base64")] or None
    return payload, mediatype or None
 
class DataHandler(urllib.request.BaseHandler):
    """urllib handler whose ``data_open`` hook answers ``data:`` requests."""

    def data_open(self, req):
        """Return the decoded payload of *req*'s URL wrapped in a BytesIO.

        The media type returned by ``parse_data_url`` is discarded.
        """
        payload, _mediatype = parse_data_url(req.full_url)
        return io.BytesIO(payload)
get_data_url_bin.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13
#!/usr/bin/env python3
import sys
from dataurl import DataHandler
from urllib.request import urlopen, build_opener, install_opener
 
# Install an opener that includes DataHandler so urlopen() below uses it.
install_opener(build_opener(DataHandler))

# The payloads are bytes, so write through the binary buffer underneath the
# text stream. Rebinding sys.stdout to a second file object opened on the same
# descriptor would leave two objects owning (and eventually closing) it.
binary_stdout = sys.stdout.buffer

# Write the decoded payload of every URL given on the command line, in order.
for url in sys.argv[1:]:
    with urlopen(url) as f:
        binary_stdout.write(f.read())
get_data_url_text.py
Python
1 2 3 4 5 6 7 8 9 10 11
#!/usr/bin/env python3
import sys
from dataurl import DataHandler
from urllib.request import urlopen, build_opener, install_opener
 
# Install an opener that includes DataHandler so urlopen() below uses it.
install_opener(build_opener(DataHandler))

# Print the decoded text of every URL given on the command line, in order.
for url in sys.argv[1:]:
    with urlopen(url) as f:
        # Assumes UTF-8 encoded text when no charset is given.
        # get_content_charset() matches the "charset" parameter name
        # case-insensitively, so "Charset=..." is honoured as well.
        charset = f.headers.get_content_charset('UTF-8')
        sys.stdout.write(f.read().decode(charset))

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.