Created
November 17, 2016 19:10
-
-
Save ento/f38f9f74782cdd109c604dc11784a343 to your computer and use it in GitHub Desktop.
Simpler formatting of syntax trees: ((The quick brown fox) (jumps (over (the lazy dog))))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Patch to make nltk/parse/bllip.py work in Python 3
def _ensure_ascii(words): | |
try: | |
for i, word in enumerate(words): | |
if isinstance(word, bytes): | |
word.decode('ascii') | |
else: | |
word.encode('ascii') | |
except UnicodeDecodeError: | |
raise ValueError("Token %d (%r) is non-ASCII. BLLIP Parser " | |
"currently doesn't support non-ASCII inputs." % | |
(i, word)) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
pip install nltk bllipparser prompt_toolkit
python -m nltk.downloader bllip_wsj_no_aux
"""
from nltk.data import find | |
from nltk.parse.bllip import BllipParser | |
from nltk.compat import string_types | |
from nltk.tree import Tree | |
from prompt_toolkit import prompt | |
def format_flat(tree, parens='()'):
    """Render *tree* as a flat bracketed string without node labels.

    Children are rendered in order:

    * ``Tree`` children are formatted recursively with the same *parens*;
    * tuple children are joined with ``"/"`` (presumably ``(word, tag)``
      pairs -- confirm against the caller);
    * string children are used as-is;
    * children of any other type are skipped.

    :param tree: iterable of children, typically an ``nltk.tree.Tree``
    :param parens: two-item sequence holding the opening and closing bracket
    :return: ``''`` when nothing was rendered, the single rendered child
        when there is exactly one, otherwise the space-separated children
        wrapped in the brackets
    """
    pieces = []
    for node in tree:
        if isinstance(node, Tree):
            rendered = format_flat(node, parens)
        elif isinstance(node, tuple):
            rendered = "/".join(node)
        elif isinstance(node, string_types):
            rendered = node
        else:
            # Unknown child types contribute nothing to the output.
            continue
        pieces.append(rendered)

    if not pieces:
        return ''
    if len(pieces) == 1:
        # A lone child needs no brackets of its own.
        return pieces[0]
    return ''.join([parens[0], " ".join(pieces), parens[1]])
def main():
    """Load the BLLIP model and run an interactive parsing prompt.

    Each entered line is split on whitespace, handed to
    ``bllip.parse_one`` and the result is printed with ``format_flat``.
    The loop ends on Ctrl-D (``EOFError``) or Ctrl-C
    (``KeyboardInterrupt``) instead of crashing with a traceback.
    """
    print('Loading parser..')
    model_dir = find('models/bllip_wsj_no_aux').path
    bllip = BllipParser.from_unified_model_dir(model_dir)
    print('Done.')
    print('Enter a sentence:')
    while True:
        try:
            text = prompt('> ')
        except (EOFError, KeyboardInterrupt):
            # prompt_toolkit reports Ctrl-D / Ctrl-C this way; exit quietly.
            break

        tokens = text.split()
        if not tokens:
            # Nothing to parse on a blank line; just ask again.
            continue

        try:
            top_result = bllip.parse_one(tokens)
        except ValueError as err:
            # e.g. the non-ASCII token check; report it and keep the
            # session alive rather than aborting the whole program.
            print(err)
            continue

        if top_result is None:
            # parse_one yields None when the parser produced no parse,
            # which format_flat cannot iterate over.
            print('No parse found.')
            continue
        print(format_flat(top_result))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment