Skip to content

Instantly share code, notes, and snippets.

@ilius
Last active January 10, 2022 23:35
Show Gist options
  • Save ilius/b5a4cbec5a81ff77557f4a54e7221692 to your computer and use it in GitHub Desktop.
Save ilius/b5a4cbec5a81ff77557f4a54e7221692 to your computer and use it in GitHub Desktop.
PyGlossary plugin epub_ungrouped.py (issue #352)
# -*- coding: utf-8 -*-
from formats_common import *
import zipfile
# Plugin metadata for the "EPUB (Ungrouped)" format.
# NOTE(review): these names are presumably read by the PyGlossary plugin
# loader (not visible in this file); confirm the exact meaning of each one
# against the PyGlossary plugin documentation before changing it.
enable = True
lname = "epub_ungrouped"
# NOTE: `format` shadows the builtin of the same name; it is presumably part
# of the plugin interface, so it must not be renamed here.
format = "EpubUngrouped"
description = "EPUB (Ungrouped)"
# Empty tuple: presumably no file extension is auto-associated with this
# format -- TODO confirm.
extensions = ()
extensionCreate = ".epub"
kind = "archive"
wiki = ""
website = None
# key is option/argument name, value is instance of Option
# (empty: this plugin declares no options)
optionsProp = {}
class Reader:
    """Read an EPUB (zip) archive as a flat list of glossary entries.

    Every ``*.html`` member of the archive becomes one entry whose headword
    is the text of the document's ``<title>`` tag and whose definition is the
    file's HTML.  Links between HTML files are rewritten to ``bword://``
    links that point at the target file's title.  All remaining archive
    members, except EPUB packaging metadata, are yielded as data entries.
    """

    # Archive members that are EPUB packaging metadata; they are never
    # emitted as data entries.
    ignoreFiles = {
        "resources.opf",
        "mimetype",
        "toc.ncx",
        "META-INF/container.xml",
        "META-INF/calibre_bookmarks.txt",
    }

    def __init__(self, glos: "GlossaryType") -> None:
        """Store the glossary object; no file is touched until `open`."""
        self._glos = glos
        self._filename = ""
        self._wordCount = 0
        self._zf = None
        # Sorted names of the ".html" archive members, filled in by `open`.
        self._htmlFiles = []

    def __len__(self) -> int:
        """Return the known number of entries.

        This reader never sets ``_wordCount``, so the value stays 0.
        """
        return self._wordCount

    def open(self, filename) -> None:
        """Open the archive at *filename* and index its ``.html`` members.

        Raises whatever `zipfile.ZipFile` raises for a missing or invalid
        archive.
        """
        self._filename = filename
        self._zf = zipfile.ZipFile(filename)
        self._htmlFiles = sorted(
            fpath
            for fpath in self._zf.namelist()
            if fpath.endswith(".html")
        )

    def close(self) -> None:
        """Close the archive opened by `open`; safe to call repeatedly."""
        if self._zf is not None:
            self._zf.close()
            self._zf = None
        self._htmlFiles = []

    @staticmethod
    def _titleFromFileName(fname: str) -> str:
        """Derive a fallback title from an ``.html`` member name."""
        # Strip the ".html" suffix (callers only pass names ending in it).
        return fname[:-len(".html")].capitalize()

    def __iter__(self) -> "Iterator[BaseEntry]":
        """Yield one entry per HTML file, then one data entry per resource.

        Raises:
            RuntimeError: if called before `open` (or after `close`).
        """
        from lxml.html import document_fromstring

        glos = self._glos
        zf = self._zf
        if zf is None:
            raise RuntimeError("Reader is not open, call open() first")

        # First pass: collect every file's title, because a link in one file
        # may point at a file that is only visited later.
        titleByFileName = {}
        for fname in self._htmlFiles:
            doc = document_fromstring(zf.read(fname))
            titleElem = doc.find(".//title")
            if titleElem is None:
                title = self._titleFromFileName(fname)
                log.warning(f"Using title {title!r} for file {fname!r} with no title tag")
            else:
                # May be None for an empty <title> tag; such files are
                # skipped in the second pass below.
                title = titleElem.text
            titleByFileName[fname] = title

        # Matches the opening part of an href attribute that targets an
        # ".html" file; the closing quote is left in place.
        re_href = re.compile(
            r' href="([^<>"]*?\.html)',
            re.I,
        )

        def sub_href(m):
            """Point a link at the target's title, if the target is known."""
            target = titleByFileName.get(m.group(1))
            if target is None:
                # Unknown (or untitled) target: keep the link unchanged.
                return m.group(0)
            return f' href="bword://{target}'

        # Second pass: emit the HTML files as entries with rewritten links.
        for fname in self._htmlFiles:
            title = titleByFileName.get(fname)
            if title is None:
                continue
            defi = zf.read(fname).decode("utf-8")
            defi = re_href.sub(sub_href, defi)
            yield glos.newEntry(
                title,
                defi,
                defiFormat="h",
            )

        # Finally emit every other archive member as a data entry.
        for fpath in zf.namelist():
            if fpath.endswith(".html") or fpath in self.ignoreFiles:
                continue
            if fpath.endswith("/"):
                # Directory member of the zip: there is no file content.
                continue
            yield glos.newDataEntry(fpath, zf.read(fpath))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment