# danem/anno2spacy.py

## anno2spacy.py
import os

class AnnoEntry(object):
    """Plain record for one annotated span: id, start/end offsets, label and word."""

    def __init__ (self, id, start, end, label, word):
        # Each argument is stored unchanged under the attribute of the same name.
        self.id, self.start, self.end = id, start, end
        self.label, self.word = label, word

    def __repr__ (self):
        """Return a one-line debug string listing every field."""
        template = "AE[id: {0}, start: {1}, end: {2}, label: {3}, word: {4}]"
        fields = (self.id, self.start, self.end, self.label, self.word)
        return template.format(*fields)

class AnnoRelation (object):
    """Plain record for a labelled link between a head and a tail annotation."""

    def __init__ (self, id, label, head, tail):
        # Each argument is stored unchanged under the attribute of the same name.
        self.id, self.label = id, label
        self.head, self.tail = head, tail

    def __repr__ (self):
        """Return a one-line debug string listing every field."""
        template = "AR[id: {0}, label: {1}, head: {2}, tail: {3}]"
        fields = (self.id, self.label, self.head, self.tail)
        return template.format(*fields)

class Sentence (object):
    """A slice of a source text plus per-word dependency slots and entity spans.

    ``txt`` is ``src[start:end]`` and ``start``/``end`` are kept as offsets
    into ``src``. ``deps`` and ``heads`` get one placeholder ("-" and 0) per
    piece produced by splitting ``txt`` on single spaces; ``entities`` starts
    out empty.
    """

    def __init__ (self, src, start, end):
        self.start, self.end = start, end
        self.txt = src[start:end]
        wordCount = len(self.txt.split(' '))
        self.deps = ["-" for _ in range(wordCount)]
        self.heads = [0 for _ in range(wordCount)]
        self.entities = []

    def isInSentence (self, idx):
        """Return True when idx lies between start and end, both inclusive."""
        return self.start <= idx <= self.end

    def relativeWordIndex (self, start, end):
        """Return how many spaces precede offset ``start`` inside this sentence.

        That count is the position of the word in ``txt.split(' ')``.
        Returns -1 when either offset lies outside the sentence.
        """
        if not (self.isInSentence(start) and self.isInSentence(end)):
            return -1
        localStart = start - self.start
        return sum(1 for pos in range(localStart) if self.txt[pos] == ' ')

    def data (self):
        """Return ``(txt, deps, heads, (txt, {"entities": entities}))``."""
        entityPart = (self.txt, {"entities":self.entities})
        return (self.txt, self.deps, self.heads, entityPart)

    def __repr__ (self):
        """Return a one-line debug string with the text and its offsets."""
        template = "Sent[ txt: {0}, start: {1}, end: {2}]"
        return template.format(self.txt, self.start, self.end)

class LabeledData (object):
    """Plain record bundling a text with its dependency and entity labels."""

    def __init__ (self, txt, deps, ents):
        # Each argument is stored unchanged under the attribute of the same name.
        self.txt, self.deps, self.ents = txt, deps, ents

    def __repr__ (self):
        """Return a one-line debug string listing every field."""
        template = "LD[ txt: {0}, deps: {1}, ents: {2}]"
        return template.format(self.txt, self.deps, self.ents)

def parseAnnoEntry (line, entryLookup, relLookup):
    """Parse one annotation line and register the result in the matching lookup.

    The line is split on tabs; the first field is the annotation id.

    * If the id starts with 'T', the second field is read as
      ``label start end`` and the third field as the covered word. An
      AnnoEntry is created and stored in ``entryLookup`` under its id.
    * Any other line is read as a relation whose second field looks like
      ``label name:ID1 name:ID2``. Both ids are resolved through
      ``entryLookup`` and an AnnoRelation is stored in ``relLookup``.

    NOTE(review): this layout resembles the brat standoff format - confirm.

    Returns the created AnnoEntry or AnnoRelation.
    Raises KeyError when a relation refers to an id not yet in entryLookup,
    and IndexError/ValueError when the line does not have the expected shape.
    """
    # Lines read from a file keep their terminator. Without stripping it the
    # newline ends up in `word`, or in the last id of a relation line that has
    # no trailing tab, which then fails the entryLookup lookup.
    fields = line.rstrip('\r\n').split('\t')
    annId = fields[0]
    if annId[0] == 'T':
        label, start, end = fields[1].split(' ')[:3]
        res = AnnoEntry(annId, int(start), int(end), label, fields[2])
        entryLookup[annId] = res
    else:
        parts = fields[1].split(' ')
        label = parts[0]
        # Arguments are written as "name:ID"; only the ID part is looked up.
        head = entryLookup[parts[1].split(':')[1]]
        tail = entryLookup[parts[2].split(':')[1]]
        res = AnnoRelation(annId, label, head, tail)
        relLookup[annId] = res
    return res

def parseAnnoFile (file):
    """Parse every line of an open annotation file.

    ``file`` must provide ``readlines()``. Each non-blank line is handed to
    parseAnnoEntry together with two lookup dicts that live for the duration
    of this call, so relations can resolve entries parsed on earlier lines.

    Returns ``(entries, relations)``: two lists in file order, the first with
    everything that is not an AnnoRelation, the second with the AnnoRelations.
    """
    entries, relations = [], []
    relLookup, entryLookup = {}, {}
    for line in file.readlines():
        # A blank line (e.g. at the end of the file) has no id field and would
        # otherwise crash the line parser with an IndexError.
        if not line.strip():
            continue
        parsed = parseAnnoEntry(line, entryLookup, relLookup)
        if isinstance(parsed, AnnoRelation):
            relations.append(parsed)
        else:
            entries.append(parsed)
    return entries, relations

def parseTxtFile (file):
    """Read the whole file and return one Sentence per span found by absSplit.

    Spans are computed by splitting the text on '.'; every Sentence keeps
    its offsets into the full text.
    """
    contents = file.read()
    return [Sentence(contents, span[0], span[1]) for span in absSplit(contents, '.')]


def absSplit (string, char):
    """Return the ``(start, end)`` offsets of the pieces of ``string`` split on ``char``.

    Each pair is a half-open slice: ``string[start:end]`` is the piece without
    its terminating separator. Text after the last separator is returned as a
    final span unless it is empty or whitespace only, so a text that does not
    end in ``char`` no longer loses its last piece.
    """
    splits = []
    lastSplit = 0
    for i, current in enumerate(string):
        if current == char:
            splits.append((lastSplit, i))
            lastSplit = i + 1
    # Keep a non-blank remainder; a trailing newline alone is not a piece.
    if string[lastSplit:].strip():
        splits.append((lastSplit, len(string)))
    return splits

def convertAnnoFile (annFile, txtFile):
    """Parse an annotation file and its text file.

    Returns ``(entries, relations, sentences)``: the two lists produced by
    parseAnnoFile followed by the sentence list produced by parseTxtFile.
    """
    annoEntries, annoRelations = parseAnnoFile(annFile)
    return annoEntries, annoRelations, parseTxtFile(txtFile)

def _firstContainingSentence (sents, startAt, offsets):
    """Return the index of the first sentence at or after startAt that contains every offset, or -1."""
    for idx in range(startAt, len(sents)):
        if all(sents[idx].isInSentence(offset) for offset in offsets):
            return idx
    return -1

def processFiles (ents, rels, sents):
    """Write relations and entities into the sentences that contain them.

    For every relation whose head and tail both lie inside one sentence, the
    sentence's ``heads`` and ``deps`` lists are updated at the head and tail
    word positions: the head points at itself and carries the head's label,
    the tail points at the head and carries the relation's label. For every
    entity that lies inside one sentence, ``(start, end, label)`` relative to
    the sentence start is appended to that sentence's ``entities``.

    A relation or entity that fits in no single sentence is skipped (a
    relation also prints "TOO LONG"). It no longer causes all later items to
    be dropped, and an empty ``sents`` no longer raises IndexError.

    Returns ``(ents, rels, sents)`` where ents and rels are sorted copies and
    sents is the list that was passed in, with its sentences mutated.
    """
    # TODO consider using a heap so we don't need to sort
    # Sort by end offset so the matching sentence index never decreases. The
    # search for the next item can then resume at the current sentence.
    rels = sorted(rels, key=lambda v: min(v.head.end, v.tail.end))
    ents = sorted(ents, key=lambda v: v.end)

    currSentence = 0
    for r in rels:
        h, t = r.head, r.tail
        # TODO: Assumes all relations are within a single sentence
        found = _firstContainingSentence(sents, currSentence,
                                         (h.start, h.end, t.start, t.end))
        if found < 0:
            # Leave currSentence untouched so later relations can still match.
            print("TOO LONG")
            continue
        currSentence = found
        sentence = sents[currSentence]
        hidx = sentence.relativeWordIndex(h.start, h.end)
        tidx = sentence.relativeWordIndex(t.start, t.end)
        sentence.heads[hidx] = hidx
        sentence.heads[tidx] = hidx
        sentence.deps[hidx] = h.label
        sentence.deps[tidx] = r.label

    currSentence = 0
    for e in ents:
        found = _firstContainingSentence(sents, currSentence, (e.start, e.end))
        if found < 0:
            # E.g. an entity that itself contains the sentence separator.
            continue
        currSentence = found
        sentence = sents[currSentence]
        sentence.entities.append((e.start - sentence.start, e.end - sentence.start, e.label))

    return ents, rels, sents

def processAnnoDirectory (path):
    """Convert every non-empty ``.ann`` file in ``path`` together with its ``.txt`` file.

    The text file is expected next to the annotation file, with the same name
    and a ``.txt`` extension. Zero-byte annotation files are skipped.

    Returns ``(entities, relations, sentences)``: three parallel lists with
    one element per processed file, each being the corresponding value
    returned by processFiles.
    """
    # os.listdir returns names in arbitrary order; sort so the order of the
    # returned lists is reproducible across runs and platforms.
    annPaths = sorted(os.path.join(path, name)
                      for name in os.listdir(path) if name.endswith(".ann"))
    entities, relations, sentences = [], [], []
    for annPath in annPaths:
        if os.path.getsize(annPath) == 0:
            continue
        txtPath = os.path.splitext(annPath)[0] + ".txt"
        with open(annPath, 'r') as annFile, open(txtPath, 'r') as txtFile:
            ent, rels, sents = convertAnnoFile(annFile, txtFile)
        ent, rels, sents = processFiles(ent, rels, sents)
        entities.append(ent)
        relations.append(rels)
        sentences.append(sents)
    return entities, relations, sentences
	import os

	class AnnoEntry(object):
	    """Plain record for one annotated span: id, start/end offsets, label and word."""

	    def __init__ (self, id, start, end, label, word):
	        # Each argument is stored unchanged under the attribute of the same name.
	        self.id = id
	        self.start = start
	        self.end = end
	        self.label = label
	        self.word = word

	    def __repr__ (self):
	        """Return a one-line debug string listing every field."""
	        return "AE[id: {0}, start: {1}, end: {2}, label: {3}, word: {4}]".format(
	            self.id, self.start, self.end, self.label, self.word)

	class AnnoRelation (object):
	    """Plain record for a labelled link between a head and a tail annotation."""

	    def __init__ (self, id, label, head, tail):
	        # Each argument is stored unchanged under the attribute of the same name.
	        self.id = id
	        self.label = label
	        self.head = head
	        self.tail = tail

	    def __repr__ (self):
	        """Return a one-line debug string listing every field."""
	        return "AR[id: {0}, label: {1}, head: {2}, tail: {3}]".format(
	            self.id, self.label, self.head, self.tail)

	class Sentence (object):
	    """A slice of a source text plus per-word dependency slots and entity spans.

	    ``txt`` is ``src[start:end]`` and ``start``/``end`` are kept as offsets
	    into ``src``. ``deps`` and ``heads`` get one placeholder ("-" and 0) per
	    piece produced by splitting ``txt`` on single spaces; ``entities`` starts
	    out empty.
	    """

	    def __init__ (self, src, start, end):
	        self.txt = src[start:end]
	        self.start = start
	        self.end = end
	        wordCount = len(self.txt.split(' '))
	        self.deps = ["-"] * wordCount
	        self.heads = [0] * wordCount
	        self.entities = []

	    def isInSentence (self, idx):
	        """Return True when idx lies between start and end, both inclusive."""
	        return self.start <= idx <= self.end

	    def relativeWordIndex (self, start, end):
	        """Return how many spaces precede offset ``start`` inside this sentence.

	        That count is the position of the word in ``txt.split(' ')``.
	        Returns -1 when either offset lies outside the sentence.
	        """
	        if not self.isInSentence(start) or not self.isInSentence(end):
	            return -1
	        localStart = start - self.start
	        return sum(1 for pos in range(localStart) if self.txt[pos] == ' ')

	    def data (self):
	        """Return ``(txt, deps, heads, (txt, {"entities": entities}))``."""
	        return (self.txt, self.deps, self.heads, (self.txt, {"entities":self.entities}))

	    def __repr__ (self):
	        """Return a one-line debug string with the text and its offsets."""
	        return "Sent[ txt: {0}, start: {1}, end: {2}]".format(self.txt, self.start, self.end)

	class LabeledData (object):
	    """Plain record bundling a text with its dependency and entity labels."""

	    def __init__ (self, txt, deps, ents):
	        # Each argument is stored unchanged under the attribute of the same name.
	        self.txt = txt
	        self.deps = deps
	        self.ents = ents

	    def __repr__ (self):
	        """Return a one-line debug string listing every field."""
	        return "LD[ txt: {0}, deps: {1}, ents: {2}]".format(self.txt, self.deps, self.ents)

	def parseAnnoEntry (line, entryLookup, relLookup):
	    """Parse one annotation line and register the result in the matching lookup.

	    The line is split on tabs; the first field is the annotation id.

	    * If the id starts with 'T', the second field is read as
	      ``label start end`` and the third field as the covered word. An
	      AnnoEntry is created and stored in ``entryLookup`` under its id.
	    * Any other line is read as a relation whose second field looks like
	      ``label name:ID1 name:ID2``. Both ids are resolved through
	      ``entryLookup`` and an AnnoRelation is stored in ``relLookup``.

	    Returns the created AnnoEntry or AnnoRelation.
	    Raises KeyError when a relation refers to an id not yet in entryLookup,
	    and IndexError/ValueError when the line does not have the expected shape.
	    """
	    # Lines read from a file keep their terminator. Without stripping it the
	    # newline ends up in `word`, or in the last id of a relation line that has
	    # no trailing tab, which then fails the entryLookup lookup.
	    fst = line.rstrip('\r\n').split('\t')
	    annId = fst[0]
	    if annId[0] == 'T':
	        word = fst[2]
	        parts = fst[1].split(' ')
	        label = parts[0]
	        start = int(parts[1])
	        end = int(parts[2])
	        res = AnnoEntry(annId, start, end, label, word)
	        entryLookup[annId] = res
	    else:
	        snd = fst[1].split(' ')
	        label = snd[0]
	        # Arguments are written as "name:ID"; only the ID part is looked up.
	        a1 = entryLookup[snd[1].split(':')[1]]
	        a2 = entryLookup[snd[2].split(':')[1]]
	        res = AnnoRelation(annId, label, a1, a2)
	        relLookup[annId] = res
	    return res

	def parseAnnoFile (file):
	    """Parse every line of an open annotation file.

	    ``file`` must provide ``readlines()``. Each non-blank line is handed to
	    parseAnnoEntry together with two lookup dicts that live for the duration
	    of this call, so relations can resolve entries parsed on earlier lines.

	    Returns ``(entries, relations)``: two lists in file order, the first with
	    everything that is not an AnnoRelation, the second with the AnnoRelations.
	    """
	    entries = []
	    relations = []
	    rLookup = {}
	    eLookup = {}
	    for l in file.readlines():
	        # A blank line has no id field and would crash the line parser.
	        if not l.strip():
	            continue
	        res = parseAnnoEntry(l, eLookup, rLookup)
	        if isinstance(res, AnnoRelation):
	            relations.append(res)
	        else:
	            entries.append(res)
	    return entries, relations

	def parseTxtFile (file):
	    """Read the whole file and return one Sentence per span found by absSplit.

	    Spans are computed by splitting the text on '.'; every Sentence keeps
	    its offsets into the full text.
	    """
	    txt = file.read()
	    sentences = []
	    for s in absSplit(txt, '.'):
	        sentences.append(Sentence(txt, s[0], s[1]))
	    return sentences


	def absSplit (string, char):
	    """Return the ``(start, end)`` offsets of the pieces of ``string`` split on ``char``.

	    Each pair is a half-open slice: ``string[start:end]`` is the piece without
	    its terminating separator. Text after the last separator is returned as a
	    final span unless it is empty or whitespace only, so a text that does not
	    end in ``char`` no longer loses its last piece.
	    """
	    splits = []
	    lastSplit = 0
	    for i, current in enumerate(string):
	        if current == char:
	            splits.append((lastSplit, i))
	            lastSplit = i + 1
	    # Keep a non-blank remainder; a trailing newline alone is not a piece.
	    if string[lastSplit:].strip():
	        splits.append((lastSplit, len(string)))
	    return splits

	def convertAnnoFile (annFile, txtFile):
	    """Parse an annotation file and its text file.

	    Returns ``(entries, relations, sentences)``: the two lists produced by
	    parseAnnoFile followed by the sentence list produced by parseTxtFile.
	    """
	    entries, relations = parseAnnoFile(annFile)
	    sentences = parseTxtFile(txtFile)
	    return entries, relations, sentences

	def _firstContainingSentence (sents, startAt, offsets):
	    """Return the index of the first sentence at or after startAt that contains every offset, or -1."""
	    for idx in range(startAt, len(sents)):
	        if all(sents[idx].isInSentence(offset) for offset in offsets):
	            return idx
	    return -1

	def processFiles (ents, rels, sents):
	    """Write relations and entities into the sentences that contain them.

	    For every relation whose head and tail both lie inside one sentence, the
	    sentence's ``heads`` and ``deps`` lists are updated at the head and tail
	    word positions: the head points at itself and carries the head's label,
	    the tail points at the head and carries the relation's label. For every
	    entity that lies inside one sentence, ``(start, end, label)`` relative to
	    the sentence start is appended to that sentence's ``entities``.

	    A relation or entity that fits in no single sentence is skipped (a
	    relation also prints "TOO LONG"). It no longer causes all later items to
	    be dropped, and an empty ``sents`` no longer raises IndexError.

	    Returns ``(ents, rels, sents)`` where ents and rels are sorted copies and
	    sents is the list that was passed in, with its sentences mutated.
	    """
	    # TODO consider using a heap so we don't need to sort
	    # Sort by end offset so the matching sentence index never decreases. The
	    # search for the next item can then resume at the current sentence.
	    rels = sorted(rels, key=lambda v: min(v.head.end, v.tail.end))
	    ents = sorted(ents, key=lambda v: v.end)

	    currSentence = 0
	    for r in rels:
	        h, t = r.head, r.tail
	        # TODO: Assumes all relations are within a single sentence
	        found = _firstContainingSentence(sents, currSentence,
	                                         (h.start, h.end, t.start, t.end))
	        if found < 0:
	            # Leave currSentence untouched so later relations can still match.
	            print("TOO LONG")
	            continue
	        currSentence = found
	        sentence = sents[currSentence]
	        hidx = sentence.relativeWordIndex(h.start, h.end)
	        tidx = sentence.relativeWordIndex(t.start, t.end)
	        sentence.heads[hidx] = hidx
	        sentence.heads[tidx] = hidx
	        sentence.deps[hidx] = h.label
	        sentence.deps[tidx] = r.label

	    currSentence = 0
	    for e in ents:
	        found = _firstContainingSentence(sents, currSentence, (e.start, e.end))
	        if found < 0:
	            # E.g. an entity that itself contains the sentence separator.
	            continue
	        currSentence = found
	        sentence = sents[currSentence]
	        sentence.entities.append((e.start - sentence.start, e.end - sentence.start, e.label))

	    return ents, rels, sents

	def processAnnoDirectory (path):
	    """Convert every non-empty ``.ann`` file in ``path`` together with its ``.txt`` file.

	    The text file is expected next to the annotation file, with the same name
	    and a ``.txt`` extension. Zero-byte annotation files are skipped.

	    Returns ``(entities, relations, sentences)``: three parallel lists with
	    one element per processed file, each being the corresponding value
	    returned by processFiles.
	    """
	    # os.listdir returns names in arbitrary order; sort so the order of the
	    # returned lists is reproducible across runs and platforms.
	    anns = sorted(os.path.join(path, f) for f in os.listdir(path) if f.endswith(".ann"))
	    txt = [os.path.splitext(f)[0] + ".txt" for f in anns]
	    entities, relations, sentences = [], [], []
	    for a, t in zip(anns, txt):
	        if os.path.getsize(a) == 0:
	            continue
	        with open(a, 'r') as af, open(t, 'r') as bf:
	            ent, rels, sents = convertAnnoFile(af, bf)
	            ent, rels, sents = processFiles(ent, rels, sents)
	            entities.append(ent)
	            relations.append(rels)
	            sentences.append(sents)
	    return entities, relations, sentences