# ilius/epub_ungrouped.py

## epub_ungrouped.py
# -*- coding: utf-8 -*-

from formats_common import *
import zipfile

# Plugin identity: internal name, format name and human-readable description.
lname = "epub_ungrouped"
format = "EpubUngrouped"
description = "EPUB (Ungrouped)"
enable = True

# File handling: an archive format with no registered extensions;
# ".epub" is the extension used on creation.
kind = "archive"
extensionCreate = ".epub"
extensions = ()

# No wiki page or website is set for this format.
website = None
wiki = ""

# Maps each option/argument name to its Option instance (none defined here).
optionsProp = {}


class Reader(object):
	"""
	Read an EPUB (zip) archive as a glossary, one entry per HTML file.

	Each archive member whose name ends with ".html" becomes one entry:
	the word is the text of the document's <title> tag and the definition
	is the HTML source of the file. Links that point at another HTML file
	of the archive are rewritten to "bword://<title>" links. Every other
	archive member, except those listed in ``ignoreFiles``, is emitted as a
	data entry.
	"""

	# archive members that are never emitted as data entries
	ignoreFiles = {
		"resources.opf",
		"mimetype",
		"toc.ncx",
		"META-INF/container.xml",
		"META-INF/calibre_bookmarks.txt",
	}

	def __init__(self, glos: "GlossaryType") -> None:
		"""Store the glossary object; no file is opened until open()."""
		self._glos = glos
		self._filename = ""
		self._wordCount = 0
		self._zf = None
		self._htmlFiles = []

	def __len__(self) -> int:
		"""
		Return the number of entries, or 0 if it is not known.

		Returning 0 disables the progress bar. Nothing in this class
		updates ``_wordCount``, so this currently always returns 0.
		"""
		return self._wordCount

	def open(self, filename) -> None:
		"""
		Open the archive and collect the sorted names of its HTML members.

		``filename`` is passed unchanged to ``zipfile.ZipFile``.
		"""
		self._filename = filename
		self._zf = zipfile.ZipFile(filename)
		self._htmlFiles = sorted(
			fpath
			for fpath in self._zf.namelist()
			if fpath.endswith(".html")
		)

	def close(self) -> None:
		"""
		Close the archive; called after reading/conversion is finished.

		Safe to call when nothing is open, and safe to call twice.
		"""
		if self._zf is not None:
			self._zf.close()
			self._zf = None
		self._htmlFiles = []

	@staticmethod
	def _titleFromFileName(fname: str) -> str:
		"""Return a fallback title: ``fname`` without ".html", capitalized."""
		return fname[:-len(".html")].capitalize()

	def __iter__(self) -> "Iterator[BaseEntry]":
		"""
		Yield one entry per HTML file, then one data entry per other file.

		Raises RuntimeError if the reader is not open.
		"""
		import re

		from lxml.html import document_fromstring

		glos = self._glos
		zf = self._zf
		if zf is None:
			raise RuntimeError("Reader is not open, call open() first")

		# first pass: map every HTML file name to the title of its document,
		# so that links between files can be rewritten in the second pass
		titleByFileName = {}
		for fname in self._htmlFiles:
			doc = document_fromstring(zf.read(fname))
			titleElem = doc.find(".//title")
			if titleElem is None:
				title = self._titleFromFileName(fname)
				log.warning(f"Using title {title!r} for file {fname!r} with no title tag")
			else:
				# .text is None for an empty <title> tag; such a file gets
				# no entry and links to it are left untouched (see below)
				title = titleElem.text
			titleByFileName[fname] = title

		# matches the start of an href attribute pointing at an .html file;
		# the closing quote is not consumed, so it survives the substitution
		re_href = re.compile(
			r' href="([^<>"]*?\.html)',
			re.I,
		)

		def sub_href(m):
			# rewrite links to known files, leave all other links unchanged
			title = titleByFileName.get(m.group(1))
			if title is None:
				return m.group(0)
			return f' href="bword://{title}'

		# second pass: one HTML entry per titled file
		for fname in self._htmlFiles:
			title = titleByFileName.get(fname)
			if title is None:
				continue
			defi = zf.read(fname).decode("utf-8")
			defi = re_href.sub(sub_href, defi)
			yield glos.newEntry(
				title,
				defi,
				defiFormat="h",
			)

		# finally, the remaining archive members as data entries
		for fpath in zf.namelist():
			if fpath.endswith(".html"):
				continue
			if fpath in self.ignoreFiles:
				continue
			yield glos.newDataEntry(fpath, zf.read(fpath))