Skip to content

Instantly share code, notes, and snippets.

@ilius
Created November 10, 2023 07:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ilius/6f67b94cebd7b2991bcb04d8ceb41442 to your computer and use it in GitHub Desktop.
appledict-bin-to-slob.py
#!/usr/bin/env python
import sys
from os.path import extsep, join, splitext
from pyglossary import slob
from pyglossary.core import cacheDir
from pyglossary.entry import Entry
from pyglossary.glossary_v2 import Glossary
from pyglossary.plugins.aard2_slob import Writer as SlobWriter
from pyglossary.plugins.appledict_bin import KeyData, Reader
class NewEntry(Entry):
    """Entry rebuilt from another entry's word list and definition.

    Only ``l_word`` and ``defi`` are taken from the wrapped entry; nothing
    else is forwarded to ``Entry.__init__``.  ``NewReader.__iter__`` sets an
    ``anchorByKeyword`` attribute on instances after they are constructed.
    """

    def __init__(self, entry):
        """Copy the word list and definition text from *entry*."""
        super().__init__(word=entry.l_word, defi=entry.defi)
class NewReader(Reader):
    """Reader whose article entries also carry a keyword -> anchor mapping."""

    def __iter__(self):
        """Yield resource entries first, then one ``NewEntry`` per article.

        Entries produced by ``readResDir`` are passed through unchanged.
        Every article for which ``createEntry`` returns a non-None entry is
        wrapped in ``NewEntry`` and gets an ``anchorByKeyword`` dict holding
        the anchor of each key-data record that has a truthy anchor.
        """
        yield from self.readResDir(self._contentsPath, recurse=True)
        yield from self.readResDir(
            join(self._contentsPath, "Resources"),
            recurse=True,
        )

        fieldOrder = self._properties.key_text_variable_fields
        articles = self.yieldEntryBytes(self._file, self._properties)
        for rawBytes, address in articles:
            baseEntry = self.createEntry(rawBytes, address)
            if baseEntry is None:
                # createEntry returned nothing for this article; skip it
                continue
            wrapped = NewEntry(baseEntry)

            # Collect keyword -> anchor; a later record with the same
            # keyword overwrites an earlier one.
            anchors = {}
            for rawKeyData in self._keyTextData.get(address, []):
                keyData = KeyData.fromRaw(rawKeyData, fieldOrder)
                if keyData.anchor:
                    anchors[keyData.keyword] = keyData.anchor

            wrapped.anchorByKeyword = anchors
            yield wrapped
# MIME type stored with every article written by addEntry.
content_type = "text/html; charset=utf-8"
# When true, addEntry prepends a title built by glos.wordTitleStr to the
# definition of "h"/"m" formatted entries.
word_title: bool = False


def addEntry(entry) -> None:
    """Write one dictionary article to the slob file.

    The definition is stored once and referenced by every key of the entry:
    the headword itself, plus ``"<alt>, <headword>"`` for each alternate
    word.  Each key carries the anchor recorded for that word in
    ``entry.anchorByKeyword`` as its fragment (empty string when there is
    none).

    Uses the module-level ``writer`` (slob writer) and ``glos`` objects.

    Args:
        entry: a non-data entry exposing ``l_word``, ``defi``,
            ``detectDefiFormat()``, ``defiFormat`` and ``anchorByKeyword``.

    Raises:
        ValueError: if ``entry.l_word`` is empty (no headword to unpack).
    """
    words = entry.l_word
    b_defi = entry.defi.encode("utf-8")
    entry.detectDefiFormat()
    defiFormat = entry.defiFormat

    if word_title and defiFormat in ("h", "m"):
        if defiFormat == "m":
            # once a title is prepended the definition is handled as "h"
            defiFormat = "h"
        title = glos.wordTitleStr(words[0])
        b_defi = title.encode("utf-8") + b_defi

    if defiFormat == "h":
        # Strip the "bword://" scheme that directly follows an opening quote
        b_defi = b_defi.replace(b'"bword://', b'"')
        b_defi = b_defi.replace(b"'bword://", b"'")

    anchorByKeyword = entry.anchorByKeyword
    headword, *alts = words

    keys = [(headword, anchorByKeyword.get(headword, ""))]
    keys.extend(
        (f"{alt}, {headword}", anchorByKeyword.get(alt, ""))
        for alt in alts
    )
    # A single add() call with all keys stores the definition blob once;
    # calling add() per key would store a separate copy for every alternate.
    writer.add(b_defi, *keys, content_type=content_type)
def addDataEntry(entry) -> None:
    """Write one resource (data) entry to the slob file.

    The entry's ``s_word`` is used as the key.  The MIME type is looked up
    in ``SlobWriter.resourceMimeTypes`` by the lower-cased file extension.
    Entries with no known MIME type, or whose key cannot be encoded with
    ``writer.encoding``, are reported on stdout and skipped.

    Uses the module-level ``writer`` (slob writer) object.
    """
    rel_path = entry.s_word
    ext = splitext(rel_path)[1].lstrip(extsep).lower()
    content_type = SlobWriter.resourceMimeTypes.get(ext)
    if not content_type:
        print(f"Aard2 slob: unknown content type for {rel_path!r}")
        return

    content = entry.data
    key = rel_path
    try:
        # Encoding is only a validity check; the encoded bytes are discarded
        key.encode(writer.encoding)
    except UnicodeEncodeError:
        print(f'Failed to add, broken unicode in key: {key!a}')
    else:
        writer.add(content, key, content_type=content_type)
# Command line: <script> INPUT OUTPUT
# Fail with a usage message instead of a bare IndexError traceback when
# arguments are missing.  Extra arguments are ignored.
if len(sys.argv) < 3:
    sys.exit(f"Usage: {sys.argv[0]} <input> <output.slob>")

inputFilename = sys.argv[1]
outputFilename = sys.argv[2]

Glossary.init()
glos = Glossary()

reader = NewReader(glos)
# open() is iterated to completion and its yielded values are discarded.
# NOTE(review): presumably open() is a generator that does its work while
# being iterated -- confirm against Reader.open.
for _ in reader.open(inputFilename):
    pass

writer = slob.Writer(
    outputFilename,
    workdir=cacheDir,
    compression="zlib",
    version_info=False,
)
# TODO: tag the output with the glossary name, e.g.
#   writer.tag("label", <glossary name>)

# Resource (data) entries and dictionary articles take different paths.
for entry in reader:
    if entry.isData():
        addDataEntry(entry)
    else:
        addEntry(entry)

writer.finalize()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment