Skip to content

Instantly share code, notes, and snippets.

@danem
Last active May 7, 2019 18:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danem/e7b6303939133302017eb83b0e73fac9 to your computer and use it in GitHub Desktop.
Save danem/e7b6303939133302017eb83b0e73fac9 to your computer and use it in GitHub Desktop.
Brat Standoff To Spacy
import os
class AnnoEntry(object):
    """A text-bound annotation parsed from a ``T`` line of a brat .ann file.

    Attributes (stored exactly as passed to the constructor):
        id:    annotation identifier, e.g. ``"T1"``.
        start: start offset of the annotated span.
        end:   end offset of the annotated span.
        label: annotation type.
        word:  the text covered by the span.
    """

    def __init__ (self, id, start, end, label, word):
        self.id = id
        self.start = start
        self.end = end
        self.label = label
        self.word = word

    def __repr__ (self):
        return "AE[id: {0}, start: {1}, end: {2}, label: {3}, word: {4}]".format(
            self.id, self.start, self.end, self.label, self.word)
class AnnoRelation (object):
    """A binary relation between two annotations.

    Attributes (stored exactly as passed to the constructor):
        id:    relation identifier, e.g. ``"R1"``.
        label: relation type.
        head:  the first argument of the relation.
        tail:  the second argument of the relation.
    """

    def __init__ (self, id, label, head, tail):
        self.id = id
        self.label = label
        self.head = head
        self.tail = tail

    def __repr__ (self):
        return "AR[id: {0}, label: {1}, head: {2}, tail: {3}]".format(
            self.id, self.label, self.head, self.tail)
class Sentence (object):
    """The slice ``src[start:end]`` of a document plus per-word annotation slots.

    Words are the pieces obtained by splitting the sentence text on single
    spaces, so a leading space yields an empty first word.

    Attributes:
        txt:      the sentence text.
        start:    offset of the sentence start within ``src``.
        end:      offset just past the sentence within ``src``.
        deps:     one label per word, initialised to ``"-"``.
        heads:    one head index per word, initialised to ``0``.
        entities: list of entity tuples, initially empty.
    """

    def __init__ (self, src, start, end):
        self.txt = src[start:end]
        self.start = start
        self.end = end
        wordCount = len(self.txt.split(' '))
        self.deps = ["-"] * wordCount
        self.heads = [0] * wordCount
        self.entities = []

    def isInSentence (self, idx):
        """Return True if the document offset ``idx`` lies in [start, end].

        The upper bound is inclusive, so an offset equal to ``self.end``
        (one past the last character) still counts as inside.
        """
        return self.start <= idx <= self.end

    def relativeWordIndex (self, start, end):
        """Return the index of the word beginning at document offset ``start``.

        The index is the number of spaces that precede ``start`` within the
        sentence, which matches the positions produced by splitting the text
        on ``' '``. Returns -1 if either offset is outside the sentence.
        ``end`` is only used for that bounds check.
        """
        if not (self.isInSentence(start) and self.isInSentence(end)):
            return -1
        return self.txt.count(' ', 0, start - self.start)

    def data (self):
        """Return ``(txt, deps, heads, (txt, {"entities": entities}))``."""
        return (self.txt, self.deps, self.heads, (self.txt, {"entities": self.entities}))

    def __repr__ (self):
        return "Sent[ txt: {0}, start: {1}, end: {2}]".format(self.txt, self.start, self.end)
class LabeledData (object):
    """Plain container bundling a text with its dependency and entity labels.

    Attributes (stored exactly as passed to the constructor):
        txt:  the text.
        deps: dependency information for the text.
        ents: entity information for the text.
    """

    def __init__ (self, txt, deps, ents):
        self.txt = txt
        self.deps = deps
        self.ents = ents

    def __repr__ (self):
        return "LD[ txt: {0}, deps: {1}, ents: {2}]".format(self.txt, self.deps, self.ents)
def parseAnnoEntry (line, entryLookup, relLookup):
    """Parse one line of a brat .ann file into an AnnoEntry or AnnoRelation.

    A line whose id starts with ``T`` has the form
    ``ID<TAB>LABEL START END<TAB>TEXT`` and becomes an AnnoEntry, which is
    also registered in ``entryLookup`` under its id. Any other line is read
    as a relation, ``ID<TAB>LABEL ROLE:ID ROLE:ID``, whose two arguments are
    resolved through ``entryLookup``; the resulting AnnoRelation is
    registered in ``relLookup``.

    Returns the object that was created.

    Raises:
        ValueError: if the line is blank or does not have the expected shape
            (this includes non-integer offsets).
        KeyError: if a relation refers to an id missing from ``entryLookup``.
    """
    # Lines usually come from readlines() and still carry their line ending;
    # strip it so it does not end up inside the entity text.
    fields = line.rstrip('\r\n').split('\t')
    annId = fields[0]
    if not annId or len(fields) < 2:
        raise ValueError("Malformed annotation line: {0!r}".format(line))

    parts = fields[1].split(' ')

    if annId[0] == 'T':
        if len(fields) < 3 or len(parts) < 3:
            raise ValueError("Malformed text-bound annotation: {0!r}".format(line))
        res = AnnoEntry(annId, int(parts[1]), int(parts[2]), parts[0], fields[2])
        entryLookup[annId] = res
        return res

    # Both arguments must look like ROLE:ID.
    if len(parts) < 3 or ':' not in parts[1] or ':' not in parts[2]:
        raise ValueError("Malformed relation annotation: {0!r}".format(line))
    head = entryLookup[parts[1].split(':')[1]]
    tail = entryLookup[parts[2].split(':')[1]]
    res = AnnoRelation(annId, parts[0], head, tail)
    relLookup[annId] = res
    return res
def parseAnnoFile (file):
    """Parse an open brat .ann file into ``(entries, relations)``.

    Only text-bound annotations (ids starting with ``T``) and relations
    (ids starting with ``R``) are read. Blank lines and every other kind of
    line are skipped, because parseAnnoEntry would otherwise try to read
    them as relations and fail.

    All entity lines are parsed before any relation line so that a relation
    can refer to an entity that appears later in the file. Within each
    returned list the original file order is preserved.
    """
    entityLines = []
    relationLines = []
    for line in file.readlines():
        if line.startswith('T'):
            entityLines.append(line)
        elif line.startswith('R'):
            relationLines.append(line)

    eLookup = {}
    rLookup = {}
    entries = [parseAnnoEntry(l, eLookup, rLookup) for l in entityLines]
    relations = [parseAnnoEntry(l, eLookup, rLookup) for l in relationLines]
    return entries, relations
def parseTxtFile (file):
    """Read an open text file and return its sentences as Sentence objects.

    The text is cut at every ``'.'`` (via absSplit); each resulting span
    becomes one Sentence over the full document text, so sentence offsets
    stay relative to the start of the document.
    """
    txt = file.read()
    return [Sentence(txt, begin, end) for begin, end in absSplit(txt, '.')]
def absSplit (string, char):
    """Split ``string`` at each ``char`` and return absolute ``(start, end)`` spans.

    Each span covers the text between two delimiters, so ``string[start:end]``
    never includes the delimiter itself. Adjacent delimiters produce an empty
    span. Text after the last delimiter is returned as a final span unless it
    is empty or whitespace only, so a document without a closing delimiter
    does not lose its last sentence.

    ``char`` is compared against single characters of ``string``.
    """
    splits = []
    lastSplit = 0
    for i, c in enumerate(string):
        if c == char:
            splits.append((lastSplit, i))
            lastSplit = i + 1
    # Keep a meaningful tail; a bare trailing newline is not a sentence.
    if string[lastSplit:].strip():
        splits.append((lastSplit, len(string)))
    return splits
def convertAnnoFile (annFile, txtFile):
    """Parse an open .ann file and its open .txt file.

    Returns ``(entries, relations, sentences)``: the two lists produced by
    parseAnnoFile followed by the sentence list produced by parseTxtFile.
    """
    entries, relations = parseAnnoFile(annFile)
    sentences = parseTxtFile(txtFile)
    return entries, relations, sentences
def _findSentence (sents, first, offsets):
    """Return the index of the first sentence at or after ``first`` that
    contains every offset in ``offsets``, or None if there is none."""
    for idx in range(first, len(sents)):
        candidate = sents[idx]
        if all(candidate.isInSentence(o) for o in offsets):
            return idx
    return None


def processFiles (ents, rels, sents):
    """Attach relations and entities to the sentences that contain them.

    For every relation whose head and tail both lie inside one sentence, the
    head word is marked as its own head with the head's label, and the tail
    word points at the head word with the relation's label. For every entity
    that lies inside one sentence, a ``(start, end, label)`` tuple with
    sentence-relative offsets is appended to that sentence's ``entities``.

    A relation or entity that does not fit inside a single sentence is
    skipped (relations also print "TOO LONG"); the remaining items are still
    processed.

    Returns ``(ents, rels, sents)`` where ``ents`` and ``rels`` are sorted
    copies of the inputs and the objects in ``sents`` have been updated in
    place.
    """
    # TODO consider using a heap so we don't need to sort
    # Sort by end offset so that items are visited in sentence order. This
    # lets each search resume from the sentence of the previous match
    # instead of rescanning from the beginning.
    rels = sorted(rels, key=lambda v: min(v.head.end, v.tail.end))
    ents = sorted(ents, key=lambda v: v.end)

    currSentence = 0
    for r in rels:
        h, t = r.head, r.tail
        # TODO: Assumes all relations are within a single sentence
        found = _findSentence(sents, currSentence, (h.start, h.end, t.start, t.end))
        if found is None:
            # Leave the cursor where it was so later relations can still match.
            print("TOO LONG")
            continue
        currSentence = found
        sentence = sents[currSentence]
        hidx = sentence.relativeWordIndex(h.start, h.end)
        tidx = sentence.relativeWordIndex(t.start, t.end)
        sentence.heads[hidx] = hidx
        sentence.heads[tidx] = hidx
        sentence.deps[hidx] = h.label
        sentence.deps[tidx] = r.label

    currSentence = 0
    for e in ents:
        found = _findSentence(sents, currSentence, (e.start, e.end))
        if found is None:
            continue
        currSentence = found
        sentence = sents[currSentence]
        sentence.entities.append((e.start - sentence.start, e.end - sentence.start, e.label))

    return ents, rels, sents
def processAnnoDirectory (path):
    """Convert every non-empty .ann file in ``path`` together with its .txt file.

    For each ``NAME.ann`` the companion ``NAME.txt`` in the same directory is
    read, both are parsed with convertAnnoFile and the result is passed
    through processFiles.

    Returns ``(entities, relations, sentences)``: three parallel lists with
    one element per processed document, ordered by file name.

    Raises whatever opening a missing or unreadable file raises.
    """
    # Local import: io.open accepts ``encoding`` on both Python 2 and 3.
    import io

    entities, relations, sentences = [], [], []
    # os.listdir gives no ordering guarantee; sort for reproducible output.
    for name in sorted(os.listdir(path)):
        if not name.endswith(".ann"):
            continue
        annPath = os.path.join(path, name)
        if os.path.getsize(annPath) == 0:
            continue
        txtPath = os.path.splitext(annPath)[0] + ".txt"
        # Read as UTF-8 rather than the platform's default encoding
        # (assumed here to be what brat writes).
        with io.open(annPath, 'r', encoding='utf-8') as af, \
                io.open(txtPath, 'r', encoding='utf-8') as bf:
            ent, rels, sents = convertAnnoFile(af, bf)
        # Both files are fully read at this point, so they can be closed
        # before the in-memory processing starts.
        ent, rels, sents = processFiles(ent, rels, sents)
        entities.append(ent)
        relations.append(rels)
        sentences.append(sents)
    return entities, relations, sentences
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment