Created
April 28, 2017 03:27
-
-
Save hosford42/2902cf75b47ca8f0aded32e51f3896ee to your computer and use it in GitHub Desktop.
Example of how to use Pyramids, a pure-Python rule-based natural language parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Installation note: Unless you plan to write your own grammar, you will need to install not only the pyramids package
# itself, but the pyramids_categories package as well. The pyramids package should automatically detect the presence of
# the pyramids_categories package and use its contents to initialize the default parser instance.
import time

import pyramids
def main():
    """Interactively parse one sentence with Pyramids and report the results.

    Reads a sentence from standard input, parses it with a time limit, prints
    up to three of the remaining parse forests together with details about
    each tree's root token, then scores the forests and saves the parser's
    scoring measures.
    """
    # If your input method supports long sections of text, it is recommended that you first split your text into
    # sentences using another NLP tool before passing them to the parser. Parser performance may also degrade
    # significantly for very long inputs, particularly for those containing the words "and" or "or".
    sentence = input("> ")
    max_time = 10  # Maximum seconds to spend parsing.

    # Parsing takes place in two phases: identification and disambiguation. The parser reduces the combinatorial
    # explosion of possible combinations of subtrees that must be considered, by combining subtrees of similar type
    # together during the identification phase. Then, once a valid structure covering the entirety of the input has been
    # identified, the combined structures must be separated again, which is the disambiguation phase. Both phases
    # can be interrupted by the timeout. If there is insufficient time to disambiguate because the identification phase
    # timed out, a fast heuristic mechanism is used to pick out the most likely forest, to avoid returning nothing at
    # all. This is called "emergency disambiguation", and the like-named flag is returned to indicate whether the fast
    # heuristic mechanism had to be used. The timeout flags indicate whether the phase had to be interrupted before it
    # was finished considering every option.
    (
        forests,  # The actual parses. Each parse can contain multiple trees which together attempt to cover the input.
        emergency_disambiguation,  # Whether emergency disambiguation was used.
        identification_timed_out,  # Whether identification timed out before all options were considered.
        disambiguation_timed_out,  # Whether disambiguation timed out before all options were considered.
    ) = pyramids.parse(
        sentence,  # The sentence to be parsed
        'sentence',  # The grammatical category to be parsed
        timeout=time.time() + max_time,  # The (optional) time at which parsing is terminated if it is not complete
    )

    if emergency_disambiguation:
        print("Emergency disambiguation! The parse forest is probably incorrect.")
    if identification_timed_out:
        print("Identification phase timed out. The parse forest is less likely to be correct.")
    if disambiguation_timed_out:
        print("Disambiguation phase timed out. The parse forest is less likely to be correct.")

    # Eliminate any forest that contains zero trees or has gaps in its coverage of the input text.
    forests = [forest for forest in forests if forest.parse_trees and not forest.has_gaps()]

    if not forests:
        print("No proper parses were identified.")
        # Nothing to display or score; everything below indexes into a non-empty list of forests.
        return

    # Eliminate any forests with more than the minimum number of trees required to cover the input text.
    best_size = min(len(forest.parse_trees) for forest in forests)
    forests = [forest for forest in forests if len(forest.parse_trees) == best_size]

    for index, forest in enumerate(forests[:3]):
        print("========")
        print()
        print("Parse #%s" % (index + 1))
        print("Score: %s avg / %s sum" % forest.get_weighted_score())

        graphs = pyramids.get_parse_graphs(forest)
        for tree, graph in zip(forest.parse_trees, graphs):
            print()
            if tree.category.has_properties('command'):
                print("Possible command.")
            if tree.category.has_properties('question'):
                print("Possible question.")
            if tree.category.has_properties('statement'):
                print("Possible statement.")
            print("Graph:")
            print(graph)
            print()

            # The graph can be accessed as an indexed sequence, ordered by token position. There are also several
            # different methods for getting additional information about a token position.
            root_index = graph.root_index
            (
                _,  # Disregard
                spelling,  # The spelling for the token at the root node in the tree.
                span,  # The span of the token in the original input string
                category,  # The grammatical category for the token
            ) = graph[root_index]

            print("Root token: %s" % spelling)
            print("Root token position: %s" % root_index)
            print("Root token category: %s" % category)
            print("Root phrase category: %s" % graph.get_phrase_category(root_index))
            print("Root token starts at string index %s and ends at string index %s." % span)

            # Sort once and reuse the result for both the summary line and the per-edge report.
            sink_indices = sorted(graph.get_sinks(root_index))
            print("Root token has outbound directed edges to these token positions: %s" % sink_indices)
            for sink_index in sink_indices:
                labels = graph.get_labels(root_index, sink_index)
                print("There is a directed edge from %s to %s with the following labels: %s" %
                      (root_index, sink_index, sorted(labels)))
                # NOTE(review): the nesting level of this `break` is an assumption; here it limits the
                # edge report to the first outbound edge, if there is one. Confirm against the intended output.
                break

        print()

    print("========")
    print()
    print()

    # You will likely have some additional criteria which help you to pick the best parse forest from among those
    # that were generated. If so, you can adjust the scoring for each parse by providing a score between 0 and 1.
    best_forest = forests[0]
    for forest in forests:
        score = 1 if forest is best_forest else 0
        forest.adjust_score(score)

    # If you want your scores to affect parsing on future runs, you have to save the parser's scoring measures
    # before your program terminates. Unfortunately, I haven't yet defined a function to access the default parser,
    # so you'll have to access a "private" variable to do this. I don't plan on removing _default_parser anytime in
    # the foreseeable future, so this should be safe despite its ugliness.
    pyramids._default_parser.save_scoring_measures()
# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I am getting issues in running your example: