Last active
August 29, 2015 14:24
-
-
Save jczaplew/cc47481a02a51708c1bb to your computer and use it in GitHub Desktop.
nlp2json.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# nlp2json: read an NLP annotation XML dump and print it as indented JSON.
#
# Input : dump/sentences.txt.xml (path is relative to the working directory).
#         NOTE(review): the element names used below (token/word/lemma/POS/NER,
#         parse, dependencies/dep, entity/span) look like Stanford CoreNLP XML
#         output -- confirm against the producer of the dump.
# Output: a single JSON object {"sentences": [...]} written to stdout.
#
# Every value is copied from the XML as a string; nothing is converted to int.
from bs4 import BeautifulSoup
import json

# Read raw bytes so the XML parser can honour the encoding declared in the
# document itself instead of relying on the platform's default text encoding.
with open("dump/sentences.txt.xml", "rb") as xml_file:
    xml = xml_file.read()

soup = BeautifulSoup(xml, "xml")

output = {"sentences": [{
    "id": sentence["id"],
    # One entry per <token> element found anywhere below this sentence.
    "tokens": [{
        "id": token["id"],
        "word": token.word.text,
        "lemma": token.lemma.text,
        "characterOffsetBegin": token.CharacterOffsetBegin.text,
        "characterOffsetEnd": token.CharacterOffsetEnd.text,
        "pos": token.POS.text,
        "ner": token.NER.text
    } for token in sentence.find_all("token")],
    # Text of the first <parse> element; raises IndexError if there is none.
    "parse": sentence.find_all("parse")[0].text,
    # One entry per <dependencies> element, each holding its own <dep> list.
    "dependencies": [{
        "type": dependency["type"],
        "dependencies": [{
            "type": sub_dep["type"],
            "governor": {
                "idx": sub_dep.governor["idx"],
                "value": sub_dep.governor.text
            },
            "dependent": {
                "idx": sub_dep.dependent["idx"],
                "value": sub_dep.dependent.text
            }
        } for sub_dep in dependency.find_all("dep")]
    } for dependency in sentence.find_all("dependencies")],
    # Entities are optional: with no <entity> elements this is an empty list.
    "machineReading": {"entities": [{
        "id": entity["id"],
        "val": entity.text.strip(),
        "start": entity.span["start"],
        "end": entity.span["end"]
    } for entity in sentence.find_all("entity")]}
} for sentence in soup.root.document.sentences.find_all("sentence")]}

# Function-call form of print works on both Python 2 and Python 3 when given
# a single argument, unlike the Python 2-only print statement.
print(json.dumps(output, indent=2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example output: