Last active
June 15, 2020 07:06
-
-
Save karlb/44532c8d45356367f65bc32662fef6fb to your computer and use it in GitHub Desktop.
tei2html for pyglossary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from typing import List, Union, Callable | |
from lxml import etree as ET | |
from io import BytesIO | |
# Namespace URI used by both lookup helpers below.
_TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0'

# Prefix for comparing against fully qualified tags, e.g. f'{tei}entry'.
tei = '{' + _TEI_NAMESPACE + '}'

# Namespace map passed to find()/findall(); the None key makes unprefixed
# path steps resolve in the TEI namespace.
ns = {None: _TEI_NAMESPACE}
def to_string(el):
    """Return *el* serialized as pretty-printed HTML text.

    The element is serialized by ``ET.tostring`` with the ``html`` method
    and the resulting bytes are decoded as UTF-8.
    """
    serialized = ET.tostring(el, method='html', pretty_print=True)
    return serialized.decode('utf-8')
def make_list(
    hf: 'ET.htmlfile',
    input_elements: 'List[ET.Element]',
    processor: Callable,
    single_prefix=None,
    skip_single=True
):
    """Write *input_elements* to *hf*, using an ``<ol>`` when appropriate.

    Args:
        hf: Incremental HTML writer; only its ``write()`` and ``element()``
            methods are used here.
        input_elements: Items to render. Nothing is written when empty.
        processor: Called as ``processor(hf, element)`` to render one item.
        single_prefix: Written before a lone item that is rendered without
            list markup. ``None`` (the default) writes no prefix.
        skip_single: When true (the default), a lone item is rendered
            inline instead of being wrapped in a one-entry ``<ol>``.
    """
    if not input_elements:
        return

    if skip_single and len(input_elements) == 1:
        # Only emit the prefix when the caller actually supplied one.
        if single_prefix is not None:
            hf.write(single_prefix)
        processor(hf, input_elements[0])
        return

    with hf.element('ol'):
        for el in input_elements:
            with hf.element('li'):
                processor(hf, el)
def process_sense(hf: 'ET.htmlfile', sense: 'ET.Element'):
    """Write the HTML for one ``sense`` element to *hf*.

    The texts of the ``cit/quote`` elements under *sense* are written as a
    single comma-separated run. The ``sense/def`` elements are then handed
    to ``make_list``: a lone definition follows ' — ' inline, several are
    rendered as a list.

    Args:
        hf: Incremental HTML writer positioned inside an open element.
        sense: The element whose quotes and definitions are rendered.
    """
    # translations
    translations = [
        quote.text
        for quote in sense.findall('cit/quote', ns)
        # A quote without direct text has .text None, which join() rejects.
        if quote.text is not None
    ]
    hf.write(', '.join(translations))

    make_list(
        hf,
        sense.findall('sense/def', ns),
        lambda hf, el: hf.write(el.text),
        single_prefix=' — ',
    )
def get_entry_html(entry):
    """Render one entry element as an HTML fragment.

    Returns:
        A ``(keywords, html)`` tuple. *keywords* lists the text of every
        ``form/orth`` element in document order; *html* is the bytes
        written to the in-memory buffer by the incremental HTML writer.
    """
    headwords = []
    buffer = BytesIO()
    with ET.htmlfile(buffer) as hf, hf.element('div'):
        # Each form/orth text is collected as a keyword and written in
        # bold, followed by a single space.
        for orth in entry.findall('form/orth', ns):
            headwords.append(orth.text)
            with hf.element('b'):
                hf.write(orth.text)
            hf.write(' ')

        # gramGrp/pos texts are written in italics.
        for pos_el in entry.findall('gramGrp/pos', ns):
            with hf.element('i'):
                hf.write(pos_el.text)

        hf.write(ET.Element('br'))
        hf.write('\n')

        # Senses are rendered inline when there is one, as a list otherwise.
        make_list(hf, entry.findall('sense', ns), process_sense)

    return headwords, buffer.getvalue()
def get_metadata(header):
    """Collect title, edition and availability from a header element.

    Args:
        header: Element searched (at any depth) for ``title``, ``edition``
            and ``availability`` elements.

    Returns:
        A dict with keys ``'title'`` and ``'edition'`` (the text of the
        first matching element) and ``'availability'`` (the first matching
        element serialized by ``to_string``). A value is ``None`` when the
        corresponding element is not present.
    """
    def text_of(path):
        # find() returns None for a missing element; don't touch .text then.
        found = header.find(path, ns)
        return None if found is None else found.text

    availability = header.find('.//availability', ns)
    return {
        'title': text_of('.//title'),
        'edition': text_of('.//edition'),
        'availability': (
            None if availability is None else to_string(availability)
        ),
    }
if __name__ == '__main__':
    # Script mode: stream the TEI file named by the first command-line
    # argument and print its metadata and every entry as HTML.
    import sys

    print()
    tei_path = sys.argv[1]
    header_tag = f'{tei}teiHeader'
    entry_tag = f'{tei}entry'

    # Only "end" events are requested, so each element is complete when seen.
    for _event, node in ET.iterparse(tei_path, events=("end",)):
        if node.tag == header_tag:
            print('<code>', get_metadata(node), '</code>')
        elif node.tag == entry_tag:
            _keywords, entry_html = get_entry_html(node)
            print(entry_html.decode('utf8'))
            print('<hr/>')
            # Free memory: delete the parent's leading children until this
            # entry has no preceding sibling left.
            while node.getprevious() is not None:
                del node.getparent()[0]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment