Last active
January 10, 2022 23:35
-
-
Save ilius/b5a4cbec5a81ff77557f4a54e7221692 to your computer and use it in GitHub Desktop.
PyGlossary plugin epub_ungrouped.py (issue #352)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re
import zipfile

from formats_common import *
# Module-level attributes describing this PyGlossary plugin.
enable = True
lname = "epub_ungrouped"
format = "EpubUngrouped"
description = "EPUB (Ungrouped)"
# No file extensions are registered for detection; only the extension used
# when creating files is given.
extensions = ()
extensionCreate = ".epub"
kind = "archive"
wiki = ""
website = None

# key is option/argument name, value is instance of Option
optionsProp = {}
class Reader(object):
    """Read an EPUB (zip) archive as a glossary.

    Every ``*.html`` member of the archive becomes one entry whose word is
    the document's ``<title>`` text and whose definition is the raw HTML.
    Links between HTML members are rewritten to ``bword://<title>`` links.
    Every other member, except those in ``ignoreFiles``, is yielded as a
    data entry.
    """

    # Archive members that are never yielded as data entries.
    ignoreFiles = {
        "resources.opf",
        "mimetype",
        "toc.ncx",
        "META-INF/container.xml",
        "META-INF/calibre_bookmarks.txt",
    }

    def __init__(self, glos: "GlossaryType") -> None:
        self._glos = glos
        self._filename = ""
        self._wordCount = 0
        self._zf = None
        # Sorted names of the ``*.html`` members; filled in by open().
        self._htmlFiles = []

    def __len__(self) -> int:
        """Return the number of entries, or 0 when it is not known.

        ``self._wordCount`` is never updated by this reader, so this
        currently always returns 0.
        """
        return self._wordCount

    def open(self, filename) -> None:
        """Open the archive at *filename* and index its HTML members.

        Raises whatever ``zipfile.ZipFile`` raises for a missing or invalid
        archive.
        """
        self._filename = filename
        self._zf = zipfile.ZipFile(filename)
        # Sorted so that entries are produced in a deterministic order.
        self._htmlFiles = sorted(
            fpath
            for fpath in self._zf.namelist()
            if fpath.endswith(".html")
        )

    def close(self) -> None:
        """Close the archive opened by open(); safe to call more than once."""
        if self._zf is not None:
            self._zf.close()
            self._zf = None
        self._htmlFiles = []

    @staticmethod
    def _fallbackTitle(fname: str) -> str:
        """Derive a title from an ``*.html`` member name that has no title."""
        # Strip the ".html" suffix (negative slice), then capitalize.
        return fname[:-len(".html")].capitalize()

    def __iter__(self) -> "Iterator[BaseEntry]":
        """Yield one entry per HTML member, then the remaining data entries.

        Raises:
            RuntimeError: if the reader has not been opened.
        """
        from lxml.html import document_fromstring

        zf = self._zf
        if zf is None:
            raise RuntimeError("iterating over a reader while it's not open")
        glos = self._glos

        # First pass: every title must be known before any link is rewritten.
        titleByFileName = {}
        for fname in self._htmlFiles:
            doc = document_fromstring(zf.read(fname))
            titleElem = doc.find(".//title")
            title = titleElem.text if titleElem is not None else None
            if not title or not title.strip():
                # Missing or blank <title>: fall back to the file name rather
                # than silently dropping the entry.
                title = self._fallbackTitle(fname)
                log.warning(
                    f"Using title {title!r} for file {fname!r}"
                    " with missing or empty title tag"
                )
            titleByFileName[fname] = title

        re_href = re.compile(
            r' href="([^<>"]*?\.html)',
            re.I,
        )

        def sub_href(m):
            # NOTE(review): the href is looked up verbatim against archive
            # member names, so a link written relative to a sub-directory
            # does not match and is left unchanged.
            target = titleByFileName.get(m.group(1))
            if target is None:
                return m.group(0)
            return f' href="bword://{target}'

        # Second pass: emit the HTML entries with inter-file links rewritten.
        for fname, title in titleByFileName.items():
            defi = zf.read(fname).decode("utf-8")
            defi = re_href.sub(sub_href, defi)
            yield glos.newEntry(
                title,
                defi,
                defiFormat="h",
            )

        # Everything else (images, stylesheets, ...) becomes a data entry.
        for fpath in zf.namelist():
            if fpath.endswith(".html"):
                continue
            if fpath in self.ignoreFiles:
                continue
            if fpath.endswith("/"):
                # Directory members carry no data of their own.
                continue
            yield glos.newDataEntry(fpath, zf.read(fpath))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment