Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
tei_to_fulltext.py
# Prefix -> URI map passed as ``namespaces=`` to XPath queries on TEI trees.
NS = {
    'tei': 'http://www.tei-c.org/ns/1.0',
}
def tei_to_fulltext(tei):
    """Parse a TEI XML document and return its full text as one string.

    :param tei: the TEI XML, either as a text string or as bytes. Text
        input is encoded to UTF-8 bytes before parsing; bytes are handed
        to the parser unchanged.
    :returns: whatever ``get_fulltext`` produces for the parsed root, or
        an empty string when the parser could not build any tree at all.
    """
    # Imported locally so the module can be loaded without lxml/six present.
    from lxml import etree
    from six import text_type

    # lxml rejects text strings that carry an XML encoding declaration,
    # so always give it bytes.
    if isinstance(tei, text_type):
        tei = tei.encode('utf-8')

    # recover=True asks the parser to salvage what it can from broken XML
    # instead of raising on the first syntax error.
    parser = etree.XMLParser(encoding='UTF-8', recover=True)
    root = etree.fromstring(tei, parser)

    # In recover mode the parser may yield no root element (e.g. empty or
    # hopelessly malformed input); there is no text to extract in that case.
    if root is None:
        return ''

    return get_fulltext(root)
def get_fulltext(root):
    """Return all text found under the TEI ``<text>`` element, space-joined.

    Every text node matched by ``/tei:TEI/tei:text//text()`` is collected
    and the pieces are concatenated with a single space between them.

    TODO: move it to invenio_grobid.

    :param root: parsed tree node exposing an lxml-style ``xpath`` method.
    :returns: the joined text nodes as a single string.
    """
    text_nodes = root.xpath('/tei:TEI/tei:text//text()', namespaces=NS)
    separator = ' '
    return separator.join(text_nodes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment