Skip to content

Instantly share code, notes, and snippets.

@ChenyangGao
Last active August 27, 2022 13:08
Show Gist options
  • Save ChenyangGao/5088ae89bc7359b5c931343f5e690e87 to your computer and use it in GitHub Desktop.
Save ChenyangGao/5088ae89bc7359b5c931343f5e690e87 to your computer and use it in GitHub Desktop.
Python 把 HTML 中 img 元素的 src 转换成 data URL | Convert the src links of img tags in HTML into data URLs
#!/usr/bin/env python3
# coding: utf-8
# Module metadata: author contact, version tuple, and the public API.
__author__ = "ChenyangGao <https://chenyanggao.github.io/>"
__version__ = (0, 1)
# Only `process` is exported by `from <module> import *`.
__all__ = ["process"]
if __name__ == "__main__":
    # When run as a script, the command line is parsed here, ahead of the
    # remaining module-level imports and definitions.
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="把 html 文件中 <img> 元素的 src 属性转换成 datauri。注意:所有 html 文件会被原地替换,并不生成新文件",
    )
    # One or more HTML files and/or directories to process.
    parser.add_argument(
        "pathlist",
        metavar="path_to_file_or_dir",
        nargs="+",
        help="html 所在的文件或文件夹",
    )
    # Optional text encoding used for reading and writing the HTML files.
    parser.add_argument("-e", "--encoding", help="html 文件的编码")
    args = parser.parse_args()
from base64 import b64encode
from contextlib import contextmanager
from mimetypes import guess_type
from os import chdir, getcwd, path
from re import compile as re_compile
from urllib.parse import urlsplit
from urllib.request import urlopen
# Regular expression matching an <img> element that has a double-quoted src attribute.
# Group 1 captures everything from "<img" up to and including `src="`;
# the named group "uri" captures the attribute value, excluding the closing quote.
cre_src = re_compile(r'(<img\b[^>]+?\bsrc=")(?P<uri>[^"]+)')
@contextmanager
def tempcwd(dir_):
    """Context manager that temporarily changes the current working directory.

    :param dir_: directory to switch into; ``""`` and ``"."`` mean "stay where
        we are", in which case the working directory is not touched at all.

    On exit the previous working directory is restored, even if the body of
    the ``with`` statement raised.
    """
    if dir_ not in ("", "."):
        previous = getcwd()
        chdir(dir_)
        try:
            yield
        finally:
            # Always go back to where we started.
            chdir(previous)
    else:
        # Nothing to switch to: just run the body in place.
        yield
# TODO: support ftp(s)://, dav(s)://, sftp:// and similar schemes in the future
def _read(path):
    """Return the raw bytes of a resource.

    :param path: an ``http://`` / ``https://`` URL, which is fetched with
        ``urlopen``, or otherwise a local file path, which is opened in
        binary mode (relative paths resolve against the current working
        directory).
    :return: the complete content as ``bytes``.

    Any error raised by ``urlopen`` or ``open`` propagates to the caller.
    """
    if path.startswith(("http://", "https://")):
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(path) as response:
            return response.read()
    # Likewise close the file handle as soon as the data has been read.
    with open(path, "rb") as file:
        return file.read()
def _repl(m):
    """Replacement callback for ``cre_src.sub``: turn a matched src URI into a data URI.

    :param m: match object whose group 1 is the text from ``<img`` up to and
        including ``src="`` and whose named group ``uri`` is the src value.
    :return: group 1 followed by a base64 ``data:`` URI of the image, or the
        whole matched text unchanged when the URI is already a data URI, its
        MIME type cannot be guessed, or its content cannot be read.
    """
    uri = m["uri"]
    # Already a data URI: return the matched text untouched.
    # (m[0] is the entire match; returning uri[0] would replace the whole
    # `<img ... src="..."` prefix by a single character and corrupt the HTML.)
    if uri.startswith("data:"):
        return m[0]
    # Guess the MIME type from the path component only, ignoring query/fragment.
    mimetype = guess_type(urlsplit(uri).path)[0]
    # Unknown MIME type: leave the element as it is.
    if mimetype is None:
        return m[0]
    # Best effort: if the image cannot be loaded, leave the element as it is.
    # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
    try:
        content = _read(uri)
    except Exception:
        return m[0]
    # Data URIs carry the binary payload base64-encoded.
    payload = b64encode(content).decode("ascii")
    return f"{m[1]}data:{mimetype};base64,{payload}"
def process(path_file, path_save=None, encoding=None):
    """Read an HTML file, replace every readable image src with a data URI, and save it.

    :param path_file: path of the original HTML file
    :param path_save: path where the modified HTML is written; if None, it
        defaults to `path_file` (the file is rewritten in place)
    :param encoding: text encoding used for both reading and writing

    The output is written when `path_save` differs from `path_file`, or when
    the substitution actually changed the text; an unchanged file is not
    rewritten in place.
    """
    if path_save is None:
        path_save = path_file
    dir_, name = path.split(path_file)
    # Work inside the HTML file's directory so that relative image paths in
    # src attributes are opened relative to the HTML file.
    with tempcwd(dir_):
        with open(name, encoding=encoding) as source:
            text = source.read()
        text_new = cre_src.sub(_repl, text)
    # Written after the working directory has been restored, so a relative
    # `path_save` is interpreted the same way as the caller's `path_file`.
    if path_file != path_save or text != text_new:
        # `with` guarantees the data is flushed and the handle closed.
        with open(path_save, "w", encoding=encoding) as target:
            target.write(text_new)
if __name__ == "__main__":
    from glob import glob

    encoding = args.encoding

    def process_one(path_):
        """Process a single HTML file in place and report the outcome on stdout."""
        print("-" * 20)
        print("[PROCESSING]", path_)
        try:
            process(path_, encoding=encoding)
        except Exception as exc:
            # One failing file must not abort the whole run.
            print("😭 FAILED:", exc)
            return
        print("😄 SUCCESS")

    for path_ in args.pathlist:
        # Anything that is not a directory is handed over as a file directly.
        if not path.isdir(path_):
            process_one(path_)
            continue
        # Directories are searched recursively: all *.htm files first, then *.html.
        for pattern in ("*.htm", "*.html"):
            for p in glob(path.join(path_, "**", pattern), recursive=True):
                process_one(p)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment