Skip to content

Instantly share code, notes, and snippets.

@ento
Created November 17, 2016 19:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ento/f38f9f74782cdd109c604dc11784a343 to your computer and use it in GitHub Desktop.
Save ento/f38f9f74782cdd109c604dc11784a343 to your computer and use it in GitHub Desktop.
Simpler formatting of syntax trees: ((The quick brown fox) (jumps (over (the lazy dog))))
# Patch to make nltk.parse.bllip.py work in Python 3
def _ensure_ascii(words):
try:
for i, word in enumerate(words):
if isinstance(word, bytes):
word.decode('ascii')
else:
word.encode('ascii')
except UnicodeDecodeError:
raise ValueError("Token %d (%r) is non-ASCII. BLLIP Parser "
"currently doesn't support non-ASCII inputs." %
(i, word))
"""
pip install nltk bllipparser prompt_toolkit
python -m nltk.downloader bllip_wsj_no_aux
"""
from nltk.data import find
from nltk.parse.bllip import BllipParser
from nltk.compat import string_types
from nltk.tree import Tree
from prompt_toolkit import prompt
def format_flat(tree, parens='()'):
    """Render *tree* as a flat, label-free bracketed string.

    Each child is rendered according to its type: a ``Tree`` is rendered
    recursively, a tuple is joined with ``/``, and a string is used as is.
    Children of any other type are skipped.

    Args:
        tree: Iterable of children (subtrees, tuples or strings).
        parens: Two-item sequence holding the opening and closing bracket.

    Returns:
        ``''`` when nothing was rendered, the single rendered child when
        there is exactly one, otherwise the rendered children separated by
        spaces and wrapped in the brackets.
    """
    pieces = []
    for node in tree:
        if isinstance(node, Tree):
            rendered = format_flat(node, parens)
        elif isinstance(node, tuple):
            rendered = "/".join(node)
        elif isinstance(node, string_types):
            rendered = node
        else:
            # Unrecognised child types contribute nothing to the output.
            continue
        pieces.append(rendered)

    if not pieces:
        return ''
    if len(pieces) == 1:
        # A lone child is returned bare, without surrounding brackets.
        return pieces[0]
    joined = " ".join(pieces)
    return ''.join([parens[0], joined, parens[1]])
def main():
    """Run an interactive prompt that parses sentences with the BLLIP parser.

    Loads the ``bllip_wsj_no_aux`` model, then repeatedly reads a line,
    splits it on whitespace, parses the tokens and prints the best parse
    flattened by ``format_flat``. The session ends on Ctrl-D or Ctrl-C.
    """
    print('Loading parser..')
    model_dir = find('models/bllip_wsj_no_aux').path
    bllip = BllipParser.from_unified_model_dir(model_dir)
    print('Done.')
    print('Enter a sentence:')
    while True:
        try:
            text = prompt('> ')
        except (EOFError, KeyboardInterrupt):
            # prompt() reports Ctrl-D / Ctrl-C through these exceptions;
            # leave the loop quietly instead of dying with a traceback.
            break

        tokens = text.split()
        if not tokens:
            # Nothing to parse on a blank line; just ask again.
            continue

        try:
            top_result = bllip.parse_one(tokens)
        except ValueError as err:
            # Report rejected input (e.g. non-ASCII tokens) and keep the
            # session alive rather than aborting it.
            print(err)
            continue

        if top_result is None:
            # parse_one() yields None when the parser produced no parse.
            print('No parse found.')
            continue
        print(format_flat(top_result))


# Start the interactive session only when executed as a script.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment