Last active
August 27, 2022 13:08
-
-
Save ChenyangGao/5088ae89bc7359b5c931343f5e690e87 to your computer and use it in GitHub Desktop.
Python 把 HTML 中的 img 元素的 src 转换成 data URL | Convert the src attributes of img elements in HTML into data URLs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# coding: utf-8 | |
__author__ = "ChenyangGao <https://chenyanggao.github.io/>" | |
__version__ = (0, 1) | |
__all__ = ["process"] | |
if __name__ == "__main__":
    # Build the command-line interface and parse the arguments; the result is
    # stored in ``args`` for the script section at the end of the file.
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="把 html 文件中 <img> 元素的 src 属性转换成 datauri。注意:所有 html 文件会被原地替换,并不生成新文件",
    )
    # One or more paths, each either a single file or a directory.
    parser.add_argument(
        "pathlist",
        nargs="+",
        metavar="path_to_file_or_dir",
        help="html 所在的文件或文件夹",
    )
    # Optional text encoding used for reading and writing the html files.
    parser.add_argument(
        "-e",
        "--encoding",
        help="html 文件的编码",
    )
    args = parser.parse_args()
from base64 import b64encode | |
from contextlib import contextmanager | |
from mimetypes import guess_type | |
from os import chdir, getcwd, path | |
from re import compile as re_compile | |
from urllib.parse import urlsplit | |
from urllib.request import urlopen | |
# Regular expression matching an <img> element that has a double-quoted src
# attribute. Group 1 captures everything from "<img" up to and including the
# opening quote of src; the named group "uri" captures the attribute value.
cre_src = re_compile(
    r'(<img\b[^>]+?\bsrc=")'
    r'(?P<uri>[^"]+)'
)
@contextmanager
def tempcwd(dir_):
    """Context manager that temporarily switches the current working directory.

    When ``dir_`` is ``""`` or ``"."`` the working directory is left untouched.
    Otherwise the process changes into ``dir_`` for the duration of the
    ``with`` body and the previous working directory is restored afterwards,
    even if the body raises.

    :param dir_: directory to switch to
    """
    stay_put = dir_ == "" or dir_ == "."
    if not stay_put:
        previous = getcwd()
        chdir(dir_)
    try:
        yield
    finally:
        # Only restore when we actually moved.
        if not stay_put:
            chdir(previous)
# TODO: support more schemes in the future, e.g. ftp(s)://, dav(s)://, sftp://
def _read(path):
    """Return the binary content found at ``path``.

    :param path: an ``http://`` / ``https://`` URL, or a path to a local file
    :return: the content as ``bytes``

    Errors raised by ``urlopen`` or ``open`` (for example when the resource
    does not exist) propagate to the caller.
    """
    if path.startswith(("http://", "https://")):
        # Close the HTTP response as soon as the body has been read.
        with urlopen(path) as response:
            return response.read()
    # Close the file handle deterministically instead of relying on the GC.
    with open(path, "rb") as file:
        return file.read()
def _repl(m):
    """Replacement callback for regex substitution: turn a matched src into a data URI.

    :param m: match object whose group 1 is the text up to and including the
        opening quote of the src attribute, and whose named group ``uri`` is
        the attribute value
    :return: group 1 followed by a base64 data URI of the image content, or
        the whole matched text unchanged when the URI is already a data URI,
        its MIME type cannot be guessed, or its content cannot be read
    """
    uri = m["uri"]
    # Already a data URI: keep the matched text as it is.
    # NOTE: the unchanged text is the whole match ``m[0]``; returning
    # ``uri[0]`` (the first character of the URI) would destroy the tag.
    if uri.startswith("data:"):
        return m[0]
    mimetype = guess_type(urlsplit(uri).path)[0]
    # The MIME type cannot be determined: keep the matched text as it is.
    if mimetype is None:
        return m[0]
    # Reading failed: keep the matched text as it is (deliberate best effort).
    # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
    # and SystemExit still propagate.
    try:
        content = _read(uri)
    except Exception:
        return m[0]
    # The data URI carries the binary data encoded as base64.
    encoded = b64encode(content).decode("ascii")
    return f"{m[1]}data:{mimetype};base64,{encoded}"
def process(path_file, path_save=None, encoding=None):
    """Read an html file, inline its readable images as data URIs, and save the result.

    Every ``<img>`` src matched by ``cre_src`` is passed through ``_repl``.
    Relative image paths are resolved against the directory of ``path_file``
    by temporarily switching the working directory there.

    :param path_file: path of the original html file
    :param path_save: path where the modified html is written; if None, it
        is the same as ``path_file``
    :param encoding: text encoding of the file
    """
    if path_save is None:
        path_save = path_file
    dir_, name = path.split(path_file)
    with tempcwd(dir_):
        # Inside the html file's directory, so only the base name is needed.
        with open(name, encoding=encoding) as file:
            text = file.read()
        text_new = cre_src.sub(_repl, text)
    # Write from the caller's working directory so a relative ``path_save``
    # resolves as the caller expects. Skip the write when saving in place
    # and nothing changed.
    if path_file != path_save or text != text_new:
        with open(path_save, "w", encoding=encoding) as file:
            file.write(text_new)
if __name__ == "__main__":
    from glob import glob

    encoding = args.encoding

    def process_one(path_):
        """Process a single html file and print whether it succeeded or failed."""
        print("-" * 20)
        print("[PROCESSING]", path_)
        try:
            process(path_, encoding=encoding)
        except Exception as exc:
            # Report the failure and carry on with the remaining files.
            print("😭 FAILED:", exc)
            return
        print("😄 SUCCESS")

    for path_ in args.pathlist:
        # Anything that is not a directory is processed as a single file.
        if not path.isdir(path_):
            process_one(path_)
            continue
        # Directories are searched recursively: first all *.htm, then all *.html.
        for pattern in ("*.htm", "*.html"):
            for p in glob(path.join(path_, "**", pattern), recursive=True):
                process_one(p)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment