Skip to content

Instantly share code, notes, and snippets.

@karlb
Last active June 15, 2020 07:06
Show Gist options
  • Save karlb/44532c8d45356367f65bc32662fef6fb to your computer and use it in GitHub Desktop.
Save karlb/44532c8d45356367f65bc32662fef6fb to your computer and use it in GitHub Desktop.
tei2html for pyglossary
#!/usr/bin/env python3
from typing import List, Union, Callable
from lxml import etree as ET
from io import BytesIO
# Namespace URI shared by every element of a TEI document.
_TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'

# Clark-notation prefix ("{uri}") for comparing against ``element.tag``.
tei = '{' + _TEI_NAMESPACE + '}'

# Prefix map passed to ``find``/``findall``.  The ``None`` key registers the
# TEI namespace as the default, so paths can use unprefixed tag names.
ns = {None: _TEI_NAMESPACE}
def to_string(el):
    """Return element *el* serialized as pretty-printed HTML text.

    The element is serialized with lxml's ``html`` output method and the
    resulting bytes are decoded as UTF-8.
    """
    serialized = ET.tostring(el, method='html', pretty_print=True)
    return serialized.decode('utf-8')
def make_list(
    hf: ET.htmlfile,
    input_elements: List[ET.Element],
    processor: Callable,
    single_prefix=None,
    skip_single=True
):
    """Write elements through *processor*, as an <ol> when there are several.

    Nothing is written for an empty list.  A single element is written
    inline, preceded by ``single_prefix`` when one is given.  Two or more
    elements are each wrapped in an ``<li>`` inside a single ``<ol>``.

    Args:
        hf: incremental HTML writer; only its ``write`` and ``element``
            methods are used here.
        input_elements: elements to render, in output order.
        processor: called as ``processor(hf, element)`` to write one element.
        single_prefix: text written before a lone element; ``None`` (the
            default) writes no prefix.
        skip_single: accepted for interface compatibility; this function
            never reads it.
    """
    if not input_elements:
        return

    if len(input_elements) == 1:
        # Only emit a prefix that was actually supplied, rather than handing
        # None to the writer and relying on it to ignore the value.
        if single_prefix is not None:
            hf.write(single_prefix)
        processor(hf, input_elements[0])
        return

    with hf.element('ol'):
        for el in input_elements:
            with hf.element('li'):
                processor(hf, el)
def process_sense(hf: ET.htmlfile, sense: ET.Element):
    """Write one TEI ``<sense>``: its translations, then its definitions.

    Translations are the texts of the ``cit/quote`` descendants, joined with
    ", ".  Definitions are the elements matched by ``sense/def`` relative to
    *sense*; they are delegated to ``make_list``, which writes a lone
    definition inline after " — " and several definitions as an ``<ol>``.

    Args:
        hf: incremental HTML writer the output is written to.
        sense: the ``<sense>`` element to render.
    """
    # translations
    translations = [
        quote.text
        for quote in sense.findall('cit/quote', ns)
        # An empty <quote/> has text None, which str.join rejects with a
        # TypeError; skipping it also avoids stray ", " separators.
        if quote.text
    ]
    hf.write(', '.join(translations))

    # definitions
    make_list(
        hf,
        sense.findall('sense/def', ns),
        lambda hf, el: hf.write(el.text),
        single_prefix=' — ',
    )
def get_entry_html(entry):
    """Render one TEI ``<entry>`` element as an HTML fragment.

    The fragment is a single ``<div>`` holding, in order: each ``form/orth``
    text in ``<b>`` followed by a space, each ``gramGrp/pos`` text in
    ``<i>``, a ``<br>`` and newline, and finally the entry's ``<sense>``
    children rendered through ``make_list``/``process_sense``.

    Returns:
        A ``(keywords, html)`` tuple: the list of ``form/orth`` texts (taken
        as-is, so an element without text contributes ``None``) and the
        fragment as bytes.
    """
    buffer = BytesIO()
    headwords = [orth.text for orth in entry.findall('form/orth', ns)]

    with ET.htmlfile(buffer) as hf, hf.element('div'):
        # Headword line: bold spellings, then italic parts of speech.
        for headword in headwords:
            with hf.element('b'):
                hf.write(headword)
            hf.write(' ')
        for part_of_speech in entry.findall('gramGrp/pos', ns):
            with hf.element('i'):
                hf.write(part_of_speech.text)
        hf.write(ET.Element('br'))
        hf.write('\n')

        # Senses: inline when there is one, an ordered list otherwise.
        make_list(hf, entry.findall('sense', ns), process_sense)

    # Read the buffer only after the writer context has been closed.
    return headwords, buffer.getvalue()
def get_metadata(header):
    """Extract dictionary metadata from a ``<teiHeader>`` element.

    Each field is taken from the first matching descendant of *header*.  A
    field whose element is absent is reported as ``None`` instead of raising
    ``AttributeError``.

    Args:
        header: the ``<teiHeader>`` element.

    Returns:
        A dict with the keys ``'title'`` and ``'edition'`` (element text) and
        ``'availability'`` (the element serialized to HTML by ``to_string``).
    """
    def _text(path):
        # Text of the first match for *path*, or None when nothing matches.
        node = header.find(path, ns)
        return None if node is None else node.text

    availability = header.find('.//availability', ns)
    return {
        'title': _text('.//title'),
        'edition': _text('.//edition'),
        'availability': (
            None if availability is None else to_string(availability)
        ),
    }
if __name__ == '__main__':
    # Command-line entry point: convert the TEI file named by the first
    # argument and print the metadata and one HTML fragment per entry.
    import sys

    # Exit with a usage message instead of an IndexError traceback when the
    # input path is missing.
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} TEI_FILE')
    filename = sys.argv[1]

    print()

    # Parse incrementally; with "end" events each element is complete
    # (children included) by the time it is yielded.
    context = ET.iterparse(filename, events=("end",))
    for action, elem in context:
        if elem.tag == f'{tei}teiHeader':
            metadata = get_metadata(elem)
            print('<code>', metadata, '</code>')
        if elem.tag == f'{tei}entry':
            keywords, html = get_entry_html(elem)
            print(html.decode('utf8'))
            print('<hr/>')
            # Clean up preceding siblings to save memory.  This runs only
            # for finished entries: doing it for every element would drop
            # an entry's earlier children before the entry itself is
            # rendered.
            while elem.getprevious() is not None:
                del elem.getparent()[0]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment