Skip to content

Instantly share code, notes, and snippets.

@frederik-elwert
Created December 18, 2014 18:03
Show Gist options
  • Save frederik-elwert/6abd6b5b07d210b040bd to your computer and use it in GitHub Desktop.
Save frederik-elwert/6abd6b5b07d210b040bd to your computer and use it in GitHub Desktop.
TEI to TCF conversion using TCFlib
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
from glob import glob
import logging
from lxml import etree
from tcflib.service import ImportingWorker, run_as_cli
from tcflib.tcf import (TextCorpus, Text, Tokens, Token, TextStructure, TextSpan,
Lemmas, POStags, NamedEntities, NamedEntity, References, Reference,
Entity, Wsd, Sentences, Sentence)
# XML namespace prefixes used for XPath queries and qualified tag names.
NSS = dict(
    tei='http://www.tei-c.org/ns/1.0',
    dcr='http://www.isocat.org/ns/dcr',
)

# Local names of the elements that are treated as named-entity wrappers
# when they are the direct parent of a word.
NAME_TAGS = (
    'persName',
    'addName',
    'roleName',
    'placeName',
    'name',
)

# Maps a (tag, type, subtype) triple -- the element's local name plus its
# ``type`` and ``subtype`` attribute values (``None`` when absent) -- to the
# named-entity class assigned to the wrapped token.
NAME2TYPE = {
    ('persName', None, None): 'Person',
    ('persName', 'royal', None): 'Royal',
    ('persName', 'deity', None): 'Deity',
    ('addName', 'epithet', 'royal'): 'EpithetRoyal',
    ('addName', 'epithet', 'deity'): 'EpithetDeity',
    ('roleName', None, None): 'Title',
    ('placeName', None, None): 'Place',
}
class TEIImporter(ImportingWorker):
    """Importing worker that converts a TEI document into a TCF corpus.

    ``setup`` parses the raw input and collects the pointer rewriting rules,
    ``import_`` creates the corpus layers and dispatches on the root element,
    and ``add_tei`` fills the layers from the lemmatised words of the text.
    """

    __options__ = {
        'basedir': '',
    }

    def resolve_pointer(self, pointer):
        """Return the element that *pointer* refers to.

        The pointer is first rewritten with the first ``cRefPattern`` rule
        (collected in ``setup``) whose match pattern matches it. The result
        must have the form ``file#id``; an empty file part refers to the
        input document itself, any other value to a support file located
        relative to the ``basedir`` option.

        Raises:
            ValueError: if the pointer does not split into exactly one file
                part and one ID part around ``#``.
            IndexError: if the ID is not found in the input document.
            KeyError: if the ID is not found in the support file.
        """
        # Only the first matching rule is applied.
        for pattern, repl in self.patterns.items():
            if re.match(pattern, pointer):
                pointer = re.sub(pattern, repl, pointer)
                break
        try:
            file_, id_ = pointer.split('#')
        except ValueError:
            raise ValueError('pointer "{}" contains no ID reference'.format(
                pointer))
        if not file_:
            # Local reference: look the ID up in the input document.
            return self.input_tree.xpath('id($id)', id=id_)[0]
        if file_ not in self.support_files:
            # Parse each support file once and cache its ID -> element map.
            logging.debug('Loading support file %s.', file_)
            filepath = os.path.join(self.options.basedir, file_)
            _, self.support_files[file_] = etree.parseid(filepath,
                                                         self.parser)
        return self.support_files[file_][id_]

    def setup(self, input_data):
        """Parse *input_data* and prepare the state used by the import.

        Sets ``self.support_files`` (empty cache), ``self.parser``,
        ``self.input_tree`` and ``self.patterns``, a mapping from each
        ``cRefPattern``'s ``matchPattern`` to its rewritten
        ``replacementPattern``, taken from the first ``refsDecl``.
        """
        self.support_files = {}
        self.parser = etree.XMLParser(load_dtd=True, remove_blank_text=True)
        input_root = etree.fromstring(input_data, parser=self.parser)
        self.input_tree = etree.ElementTree(input_root)
        self.patterns = {}
        ref_patterns = self.input_tree.xpath(
            '//tei:refsDecl[1]/tei:cRefPattern', namespaces=NSS)
        for pattern in ref_patterns:
            # Replacement patterns reference groups as "$1"; ``re.sub``
            # expects "\1", so every "$" is turned into a backslash.
            replacement = pattern.get('replacementPattern').replace('$', '\\')
            self.patterns[pattern.get('matchPattern')] = replacement

    def import_(self):
        """Build and return the ``TextCorpus`` for the parsed input.

        Creates the corpus with its annotation layers, processes the document
        if its root is a single ``TEI`` element, and finally stores the
        collected sentence strings, one per line, in the text layer.

        Raises:
            NotImplementedError: if the root element is a TEI corpus.
        """
        self.corpus = TextCorpus()
        self.corpus.add_layer(Text(''))
        self.corpus.add_layer(Tokens())
        self.corpus.add_layer(Sentences())
        self.corpus.add_layer(Lemmas())
        self.corpus.add_layer(NamedEntities('TLA'))
        self.corpus.add_layer(Wsd('TLA'))
        self.corpus.add_layer(POStags('DC-1345'))
        self.corpus.add_layer(TextStructure())
        # One string per sentence; joined with newlines at the end.
        self.text = []
        root = self.input_tree.getroot()
        # The TEI element is spelled "teiCorpus"; "TEICorpus" is still
        # accepted because the previous version of this check used it.
        corpus_tags = (
            etree.QName(NSS['tei'], 'teiCorpus').text,
            etree.QName(NSS['tei'], 'TEICorpus').text,
        )
        if root.tag == etree.QName(NSS['tei'], 'TEI').text:
            # Single TEI
            logging.debug('Found single TEI document.')
            self.add_tei(root)
        elif root.tag in corpus_tags:
            # TEI corpus
            raise NotImplementedError(
                'Importing TEI corpus documents is not supported.')
        else:
            # Keep returning an (empty) corpus, but make the reason visible.
            logging.warning('Unexpected root element %s; nothing imported.',
                            root.tag)
        self.corpus.text.text = '\n'.join(self.text)
        return self.corpus

    def add_tei(self, teiroot):
        """Add the words of every ``text[@type="text"]`` to the corpus.

        For each ``s`` element, one string (the normalised text of its direct
        children joined by spaces) is appended to ``self.text`` and one
        ``Sentence`` is created. Every descendant ``w`` element carrying a
        ``lemmaRef`` becomes a ``Token`` whose lemma, word sense and POS tag
        are read from the dictionary entry the ``lemmaRef`` points to. A
        word whose direct parent is one of ``NAME_TAGS`` also yields a
        ``NamedEntity``.
        """
        for text in teiroot.xpath('//tei:text[@type="text"]', namespaces=NSS):
            span_text = TextSpan(type='text')
            for sentence in text.xpath('.//tei:s', namespaces=NSS):
                self.text.append(' '.join(
                    child.xpath('normalize-space(.)') for child in sentence))
                span_sent = Sentence()
                words = sentence.xpath('.//tei:w[@lemmaRef]', namespaces=NSS)
                for word in words:
                    # Get corresponding dictionary entry
                    entry = self.resolve_pointer(word.get('lemmaRef'))
                    token = Token(word.xpath('normalize-space(.)'))
                    token.lemma = entry.xpath(
                        'normalize-space(tei:form/tei:orth)', namespaces=NSS)
                    # The entry's xml:id serves as the word sense identifier.
                    token.wordsenses = [entry.xpath('@xml:id')[0]]
                    # POS is looked up in the dictionary entry. An earlier
                    # variant resolved the pointers in the word's
                    # tei:fs/@feats and used the feature whose dcr:datcat is
                    # DC-1345; it was disabled because words without such
                    # features were not handled (TODO: still unresolved).
                    token.tag = entry.xpath(
                        'normalize-space(tei:gramGrp/tei:pos)',
                        namespaces=NSS)
                    # Named Entity
                    parent = word.getparent()
                    parent_tag = etree.QName(parent).localname
                    if parent_tag in NAME_TAGS:
                        name_key = (parent_tag, parent.get('type'),
                                    parent.get('subtype'))
                        class_ = NAME2TYPE.get(name_key)
                        if class_ is None:
                            # Not every tag/type/subtype combination has a
                            # mapping (e.g. plain "name"); skip the entity
                            # annotation instead of aborting the import.
                            logging.warning(
                                'No named entity type for %s; skipping.',
                                name_key)
                        else:
                            self.corpus.namedentities.append(
                                NamedEntity(class_=class_, tokens=[token]))
                    # Add token to text spans and corpus
                    span_sent.tokens.append(token)
                    span_text.tokens.append(token)
                    self.corpus.tokens.append(token)
                self.corpus.sentences.append(span_sent)
            self.corpus.textstructure.append(span_text)
# Script entry point: hand the worker class to run_as_cli when this file is
# executed directly rather than imported.
if __name__ == '__main__':
    run_as_cli(TEIImporter)
@dseddah
Copy link

dseddah commented Aug 23, 2017

Hi, thanks for your code! Do you know if there is a TCF-to-TEI version of your script?

Thanks in advance,
Djamé

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment