Skip to content

Instantly share code, notes, and snippets.

@keithcallenberg
Forked from pansapiens/standalone_html.py
Last active January 18, 2022 16:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save keithcallenberg/6a52f526db2a9d3530b2151ec251ffc7 to your computer and use it in GitHub Desktop.
Save keithcallenberg/6a52f526db2a9d3530b2151ec251ffc7 to your computer and use it in GitHub Desktop.
Convert HTML to a self-contained file with inline Base64-encoded PNG images
#!/usr/bin/env python
# A simple script to suck up HTML, convert any images to inline Base64
# encoded format and write out the converted file.
#
# Usage: python standalone_html.py <input_file.html> <output_file.html>
#
# TODO: Consider MHTML format: https://en.wikipedia.org/wiki/MHTML
import os
from bs4 import BeautifulSoup
def guess_type(filepath):
    """
    Return the mimetype of a file, given its path.

    This is a wrapper around two alternative methods - Unix 'file'-style
    magic which guesses the type based on file content (used when the
    python-magic module can be imported), and simple guessing based on the
    file extension (eg .jpg).

    When the extension-based guess finds nothing, the generic
    'application/octet-stream' is returned instead of None, so the result
    can always be interpolated into a data URI.

    :param filepath: Path to the file.
    :type filepath: str
    :return: Mimetype string.
    :rtype: str
    """
    # Only the import is guarded: an ImportError here means python-magic is
    # not installed and the extension-based fallback should be used.
    try:
        import magic  # python-magic
    except ImportError:
        import mimetypes
        # mimetypes.guess_type() returns (None, encoding) for extensions it
        # does not know; substitute a generic binary type in that case.
        return mimetypes.guess_type(filepath)[0] or 'application/octet-stream'
    return magic.from_file(filepath, mime=True)
def file_to_base64(filepath):
    """
    Read a file in binary mode and return its bytes as Base64 text.

    :param filepath: Path to the file.
    :type filepath: str
    :return: The file content, Base64 encoded.
    :rtype: str
    """
    import base64

    # Pull the raw bytes first so the file is closed before encoding starts.
    with open(filepath, 'rb') as handle:
        raw_bytes = handle.read()

    # b64encode() yields bytes; convert them to str for embedding in markup.
    return str(base64.b64encode(raw_bytes), 'utf-8')
def _is_local_reference(ref):
    """Return True when *ref* is a non-empty reference that is neither a data URI nor a remote URL."""
    if not ref:
        return False
    # 'data:' references are already inline; '//host/..' and 'scheme://..'
    # point at remote resources that cannot be opened as local files.
    return not (ref.startswith(('data:', '//')) or '://' in ref)


def make_html_images_inline(in_filepath, out_filepath):
    """
    Takes an HTML file and writes a new version with inline Base64 encoded
    images.

    Every <img> whose 'src' is a local path has that attribute replaced by
    a 'data:' URI holding the file content. Every <object> with
    type 'image/svg+xml' and a local 'data' path is renamed to <img>, given
    an inline 'src', and has its 'type' and 'data' attributes removed.
    Image paths are resolved relative to the directory of the input file.
    Elements whose reference is missing, empty, already a 'data:' URI, or a
    remote URL are left unchanged.

    :param in_filepath: Input file path (HTML)
    :type in_filepath: str
    :param out_filepath: Output file path (HTML)
    :type out_filepath: str
    """
    basepath = os.path.dirname(in_filepath.rstrip(os.path.sep))

    # Parse inside a context manager so the input handle is always closed.
    with open(in_filepath, 'r') as in_file:
        soup = BeautifulSoup(in_file, 'html.parser')

    for img in soup.find_all('img'):
        src = img.attrs.get('src')
        if not _is_local_reference(src):
            continue
        img_path = os.path.join(basepath, src)
        # The SVG mimetype is forced from the extension rather than guessed.
        if os.path.splitext(src)[1].lower() == '.svg':
            mimetype = 'image/svg+xml'
        else:
            # Guard against a None guess so the URI never reads 'data:None'.
            mimetype = guess_type(img_path) or 'application/octet-stream'
        img.attrs['src'] = \
            "data:%s;base64,%s" % (mimetype, file_to_base64(img_path))

    for obj in soup.find_all('object'):
        if obj.attrs.get('type') != 'image/svg+xml':
            continue
        data = obj.attrs.get('data')
        if not _is_local_reference(data):
            continue
        # Turn the <object> into an <img> carrying the inlined SVG.
        obj.name = 'img'
        img_path = os.path.join(basepath, data)
        mimetype = 'image/svg+xml'
        obj.attrs['src'] = \
            "data:%s;base64,%s" % (mimetype, file_to_base64(img_path))
        del obj['type']
        del obj['data']

    with open(out_filepath, 'w') as of:
        of.write(str(soup))
if __name__ == '__main__':
    import sys

    # Both the input and output paths are required; exit with a usage hint
    # (written to stderr, non-zero status) instead of a bare IndexError.
    if len(sys.argv) < 3:
        sys.exit("Usage: python %s <input_file.html> <output_file.html>"
                 % sys.argv[0])
    make_html_images_inline(sys.argv[1], sys.argv[2])
@keithcallenberg
Copy link
Author

Forced mimetype for files with extension of svg since mimetypes.guess_type() does a bad job with these. This seems to work for fMRIprep HTML reports.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment