# wlkz/main.py

## main.py
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path

output_path = Path('output')

def parse_container(container_path):
    """Return the package-document paths listed in an EPUB ``container.xml``.

    Args:
        container_path: Path-like object (``pathlib.Path`` or
            ``zipfile.Path``) pointing at ``META-INF/container.xml``.

    Returns:
        A list with the ``full-path`` attribute of every ``rootfile``
        element, in document order.

    Raises:
        KeyError: If a ``rootfile`` element has no ``full-path`` attribute.
    """
    # Parse the raw bytes so the XML parser honours the encoding declared in
    # the document rather than the locale default; read_bytes() also closes
    # the underlying handle, which a bare ``open().read()`` does not.
    root = ET.fromstring(container_path.read_bytes())
    ns = '{urn:oasis:names:tc:opendocument:xmlns:container}'
    return [
        rootfile.attrib['full-path']
        for rootfile in root.findall(f'./{ns}rootfiles/{ns}rootfile')
    ]


def main():
    """Extract the JPEG images of every EPUB under the current directory.

    Recursively searches the working directory for ``*.epub`` files. For
    each one, ``META-INF/container.xml`` is read to locate the OPF package
    document, and every manifest item whose media type is ``image/jpeg`` is
    copied to ``output/<epub file stem>/NNN.jpg``, numbered from 001 in
    manifest order.

    Raises:
        NotImplementedError: If a container does not list exactly one
            rootfile (multiple package documents are not supported).
        KeyError: If a JPEG manifest item has no ``href`` attribute.
    """
    output_path.mkdir(exist_ok=True)
    opf_ns = '{http://www.idpf.org/2007/opf}'

    # Walk the current directory and all of its subdirectories for EPUBs.
    for epub_path in Path.cwd().glob('**/*.epub'):
        # Open the archive explicitly so it is closed once this book is done.
        with zipfile.ZipFile(epub_path) as archive:
            epub_root = zipfile.Path(archive)
            container_path = epub_root / 'META-INF' / 'container.xml'
            rootfiles = parse_container(container_path)

            # Only a single package document per EPUB is supported.
            if len(rootfiles) != 1:
                raise NotImplementedError(
                    f'{epub_path}: expected exactly one rootfile, '
                    f'found {len(rootfiles)}'
                )

            main_opf_path = epub_root / rootfiles[0]
            # Manifest hrefs are resolved relative to the OPF's directory.
            source_path = main_opf_path.parent
            # Bytes in, so the XML declaration decides the encoding.
            root = ET.fromstring(main_opf_path.read_bytes())

            # The EPUB file name (without extension) names the output
            # directory. To use the title from the OPF metadata instead:
            # title = root.find('.//{http://purl.org/dc/elements/1.1/}title').text
            title = epub_path.stem
            target_dir = output_path / title
            target_dir.mkdir(exist_ok=True)

            jpeg_hrefs = [
                item.attrib['href']
                for item in root.findall(f'.//{opf_ns}item')
                if item.get('media-type') == 'image/jpeg'
            ]

            for img_idx, href in enumerate(jpeg_hrefs, start=1):
                target_path = target_dir / f'{img_idx:0>3}.jpg'
                # Copy raw bytes: zipfile.Path.open('r') is text mode on
                # Python 3.9+, which cannot carry JPEG data.
                target_path.write_bytes((source_path / href).read_bytes())

# Run the extractor only when executed as a script, not when imported.
# A tab-mangled duplicate copy of this whole module used to follow the guard;
# it made the file unparsable and redefined the functions above with a
# corrupted glob pattern, so it has been dropped.
if __name__ == '__main__':
    main()