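"""Split inline XML-style entity annotations (e.g. <personName>...</personName>)
out of a text: strip the tags, tokenize the remainder with spaCy, and map each
token index to the annotations derived from the surrounding tags."""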
import re
import spacy


def annotate(xml):
    # Turn one XML tag into an annotation label: an opening tag
    # maps to "name-start", a closing tag to "name-end".
    if xml[1] == "/":
        return xml[2:-1] + "-end"
    else:
        return xml[1:-1] + "-start"
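
# Illustrative usage (not in the original gist):
#   annotate("<personName>")  -> "personName-start"
#   annotate("</personName>") -> "personName-end"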


def strip_word(word, matches, nlp, stripped, all_tokens, no_space, annotations):
    pattern_start = re.compile("<[a-zA-Z_]+>")
    pattern_end = re.compile("</[a-zA-Z_]+>")
    w_annotations = []
    for match in matches:
        w_annotations.append(annotate(match))
    splitted_start = re.split(pattern_start, word)
    # TODO: we assume no word contains more than one annotation
    if len(splitted_start) > 1:
        prefix, rest = splitted_start
        if prefix:
            # e.g. the "#" in "#<locationName>London</locationName>."
            tokens = list(nlp(prefix))
            all_tokens.extend(tokens)
            # The prefix keeps its preceding space, but the tag content
            # that follows it must be glued to it (no space in between)
            no_space[len(stripped) + 1] = True
            stripped.append(prefix)
    else:
        rest = splitted_start[0]
    splitted_end = re.split(pattern_end, rest)
    tag = splitted_end[0]
    stripped.append(tag)
    tokens = list(nlp(tag))
    n_tokens = len(all_tokens)
    # Map every token of the tagged content to this word's annotations
    for j, _ in enumerate(tokens):
        annotations[n_tokens + j] = w_annotations
    all_tokens.extend(tokens)
    if len(splitted_end) > 1:
        suffix = splitted_end[1]
        if suffix:
            tokens = list(nlp(suffix))
            all_tokens.extend(tokens)
            # The suffix is glued to the tagged content (no space before it)
            no_space[len(stripped)] = True
            stripped.append(suffix)
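
# Worked example (illustrative): with empty accumulators and
# word = "#<locationName>London</locationName>.", strip_word appends
# "#", "London", "." to stripped, flags no_space for "London" (glued to "#")
# and "." (glued to "London"), and maps London's token index to
# ["locationName-start", "locationName-end"].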


def split_annotations(txt, nlp):
    pattern = re.compile("</?[a-zA-Z_]+>")
    original_words = txt.split()
    stripped = []
    # A mapping from token index to that token's annotations
    annotations = {}
    all_tokens = []
    # A mapping from stripped-word index to True when that word must
    # *not* be preceded by a space on reassembly
    no_space = {}
    for word in original_words:
        matches = re.findall(pattern, word)
        if matches:
            strip_word(word, matches, nlp, stripped, all_tokens, no_space, annotations)
        else:
            stripped.append(word)
            tokens = list(nlp(word))
            all_tokens.extend(tokens)
    return (stripped, annotations, no_space)
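
# Illustrative example (assuming the model tokenizes each word as one token):
#   split_annotations("Hi <personName>Bob</personName>", nlp)
#   -> (["Hi", "Bob"], {1: ["personName-start", "personName-end"]}, {})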


def reassemble_txt(stripped_words, no_space):
    # Rejoin the stripped words, omitting the space before any word
    # flagged in no_space (e.g. a suffix that was glued to a tag)
    stripped_txt = stripped_words[0]
    for i, word in enumerate(stripped_words[1:]):
        if i + 1 in no_space:
            stripped_txt += word
        else:
            stripped_txt += " " + word
    return stripped_txt
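
# Illustrative example (indices refer to positions in stripped_words):
#   reassemble_txt(["in", "#", "London", "."], {2: True, 3: True})
#   -> "in #London."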


def main():
    # spaCy v1/v2 shortcut; on spaCy v3+ use spacy.load("en_core_web_sm")
    nlp = spacy.load("en")
    txt = ("<personName>Harry Potter</personName> goes to "
           "<orgName>Hogwarts</orgName>. <personName>Sally</personName> "
           "lives in #<locationName>London</locationName>.")
    stripped_words, annotations, no_space = split_annotations(txt, nlp)
    stripped_txt = reassemble_txt(stripped_words, no_space)
    doc = nlp(stripped_txt)
    n_tokens = 0
    print(txt)
    for sent_ind, sent in enumerate(doc.sents):
        print("sentence{}: {}".format(sent_ind, sent))
        for tok in sent:
            anns = annotations.get(n_tokens, [])
            print("\t token{}: {}, annotations: {}".format(n_tokens, tok, anns))
            n_tokens += 1


if __name__ == "__main__":
    main()
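
# Illustrative output for the text above (exact tokenization depends on the
# spaCy model; the scheme assumes the per-word token counts collected in
# split_annotations line up with the tokenization of the reassembled text):
#
#   sentence0: Harry Potter goes to Hogwarts.
#        token0: Harry, annotations: ['personName-start']
#        token1: Potter, annotations: ['personName-end']
#        ...
#        token4: Hogwarts, annotations: ['orgName-start', 'orgName-end']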