Skip to content

Instantly share code, notes, and snippets.

@wlkz
Last active February 24, 2022 19:00
Show Gist options
  • Save wlkz/6e76361442c0cc67e17c708480b06b3c to your computer and use it in GitHub Desktop.
Save wlkz/6e76361442c0cc67e17c708480b06b3c to your computer and use it in GitHub Desktop.
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
# Directory that receives one sub-folder of extracted images per EPUB.
# The path is relative, so it resolves against the current working directory.
output_path = Path('output')
def parse_container(container_path):
    """Return the package-document paths listed in an EPUB ``container.xml``.

    Args:
        container_path: Path-like object offering ``read_bytes()`` (for
            example ``pathlib.Path`` or ``zipfile.Path``) that points at the
            container file.

    Returns:
        A list holding the ``full-path`` attribute of every ``rootfile``
        element found under ``rootfiles``, in document order.

    Raises:
        xml.etree.ElementTree.ParseError: If the content is not well-formed XML.
        KeyError: If a ``rootfile`` element lacks the ``full-path`` attribute.
    """
    # Hand the raw bytes to the parser so it honours the encoding declared in
    # the XML itself. A text-mode read would decode with the platform's
    # default encoding instead, and the bare ``open().read()`` never closed
    # the handle.
    root = ET.fromstring(container_path.read_bytes())
    ns = '{urn:oasis:names:tc:opendocument:xmlns:container}'
    return [
        rootfile.attrib['full-path']
        for rootfile in root.findall(f'./{ns}rootfiles/{ns}rootfile')
    ]
def _extract_jpegs(epub_path):
    """Copy the JPEG manifest items of one EPUB into ``output_path/<stem>``.

    Images are written as ``001.jpg``, ``002.jpg``, ... in the order their
    ``item`` elements appear in the package document.

    Args:
        epub_path: Filesystem path of the EPUB archive.

    Raises:
        NotImplementedError: If the container does not list exactly one
            package document.
    """
    # Own the ZipFile so the archive is closed once this book is processed.
    with zipfile.ZipFile(epub_path) as archive:
        epub_root = zipfile.Path(archive)
        container_path = epub_root / 'META-INF' / 'container.xml'
        rootfiles = parse_container(container_path)
        # Assume there is exactly one package (metadata) document.
        if len(rootfiles) != 1:
            raise NotImplementedError(
                f'{epub_path}: expected exactly one rootfile, '
                f'found {len(rootfiles)}'
            )
        main_opf_path = epub_root / rootfiles[0]
        # Manifest hrefs are joined onto the directory holding the OPF file.
        source_path = main_opf_path.parent
        # Parse bytes so the XML parser applies the declared encoding.
        root = ET.fromstring(main_opf_path.read_bytes())
        ns = '{http://www.idpf.org/2007/opf}'
        # The folder is named after the EPUB file name. To use the title
        # stored in the book's metadata instead, switch to:
        # title = root.find('.//{http://purl.org/dc/elements/1.1/}title').text
        title = epub_path.stem
        book_dir = output_path / title
        book_dir.mkdir(exist_ok=True)
        img_idx = 1
        for item in root.findall(f'.//{ns}item'):
            href = item.attrib['href']
            media_type = item.attrib['media-type']
            if media_type != 'image/jpeg':
                continue
            target_path = book_dir / f'{img_idx:0>3}.jpg'
            # Copy raw bytes. On Python 3.9+ ``zipfile.Path.open('r')`` is
            # text mode, which fails when decoding JPEG data or when writing
            # the resulting str to a binary file.
            target_path.write_bytes((source_path / href).read_bytes())
            img_idx += 1


def main():
    """Extract the JPEG images of every EPUB below the current directory.

    Creates ``output_path`` if needed, then processes each ``*.epub`` file
    found in the current working directory or any of its sub-directories.

    Raises:
        NotImplementedError: If an EPUB lists anything other than exactly one
            package document.
    """
    output_path.mkdir(exist_ok=True)
    # Walk the current directory and all of its sub-directories for EPUB files.
    for epub_path in Path.cwd().glob('**/*.epub'):
        _extract_jpegs(epub_path)
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment