Last active
March 23, 2017 01:25
-
-
Save claytantor/ddaa1dbb1d4ce7ac8182b9bf2481d3b8 to your computer and use it in GitHub Desktop.
Allows for the generation of Rasa NLU models from a simpler form. It seems the native format has a lot of redundant information. This allows for a normalized model.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
import sys | |
import json | |
import re | |
import traceback | |
def load_model(model_file):
    """Read a statements model from a JSON file.

    Args:
        model_file: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        IOError / OSError: If the file cannot be opened.
        ValueError: If the file does not contain valid JSON.
    """
    with open(model_file, 'r') as f:
        # Parse straight from the file object; no intermediate string or
        # placeholder dict is needed.
        return json.load(f)
def make_rasa_model_from_statements(statements_model):
    """Wrap the generated examples in the top-level Rasa NLU layout.

    Args:
        statements_model: Decoded statements document, passed unchanged to
            ``handle_statements_model``.

    Returns:
        A dict with the single key ``"rasa_nlu_data"`` whose value holds the
        ``entity_examples`` and ``intent_examples`` lists.
    """
    generated = handle_statements_model(statements_model)
    entity_examples, intent_examples = generated

    nlu_data = {}
    nlu_data['entity_examples'] = entity_examples
    nlu_data['intent_examples'] = intent_examples
    return {"rasa_nlu_data": nlu_data}
def handle_model(intent_name, entities, template, start, merge_model, items, keys):
    """Expand a template over every combination of entity synonyms.

    For each combination of synonyms of the entities at index ``start`` and
    beyond, the chosen synonym is stored in ``merge_model`` under the
    entity's ``'name'`` and ``add_items`` is called to render and record the
    example. Combinations are visited with the earliest entity varying
    slowest, which is the order in which the previous recursive version
    first produced each text.

    Args:
        intent_name: Intent label passed through to ``add_items``.
        entities: List of entity dicts. Only entities with a non-empty
            ``'synonyms'`` list are expanded.
        template: ``str.format`` template passed through to ``add_items``.
        start: Index of the first entity to vary; substitutions for earlier
            entities are whatever ``merge_model`` already holds.
        merge_model: Substitution dict, mutated in place. Its ``'entities'``
            key is set to ``entities`` and one key per expanded entity name
            is overwritten for every combination.
        items: Accumulator passed through to ``add_items``.
        keys: Accumulator passed through to ``add_items``.
    """
    # Function-scope import so the block does not depend on the file header.
    import itertools

    merge_model['entities'] = entities

    expandable = [e for e in entities[start:] if e.get('synonyms')]
    # NOTE(review): the original indentation was lost; this assumes that
    # nothing is generated when no entity has synonyms -- confirm.
    if not expandable:
        return

    names = [e['name'] for e in expandable]
    synonym_lists = [e['synonyms'] for e in expandable]

    # One pass over the Cartesian product replaces the old recursion, which
    # re-expanded later entities from the outer loop and relied on the
    # duplicate check in add_items to discard the repeated results.
    for combination in itertools.product(*synonym_lists):
        merge_model.update(zip(names, combination))
        add_items(intent_name, template, merge_model, items, keys)
def add_items(intent_name, template, merge_model, items, keys):
    """Render one example from the template and record it if it is new.

    The template is formatted with ``merge_model`` as keyword arguments. If
    the resulting text is not already in ``keys``, every entity in
    ``merge_model['entities']`` is located in the text via
    ``parse_item_entity`` and the example is appended to ``items``; the text
    is appended to ``keys``.

    This is best effort: any failure while rendering or parsing is reported
    on stdout (the inputs as JSON, then a short traceback) and the example is
    skipped, so one bad template does not abort the whole run.

    Args:
        intent_name: Intent label stored in the example.
        template: ``str.format`` template for the example text.
        merge_model: Substitution values plus the ``'entities'`` list.
        items: List that receives the new example dict.
        keys: List of texts already recorded, used to skip duplicates.
    """
    try:
        merged_text = template.format(**merge_model)
        if merged_text in keys:
            return

        entities_parsed = [
            parse_item_entity(merged_text, entity)
            for entity in merge_model['entities']
        ]
        items.append({
            "text": merged_text,
            "intent": intent_name,
            "entities": entities_parsed
        })
        keys.append(merged_text)
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C and
        # sys.exit() still propagate. Single-argument print(...) behaves the
        # same on Python 2 and Python 3.
        print(json.dumps({"template": template, "merge_model": merge_model}, indent=4))
        print("*** print_exception:")
        traceback.print_exc(limit=2, file=sys.stdout)
def parse_item_entity(speech_text, entity):
    """Build the Rasa entity annotation for one word of an example text.

    The text is split on single spaces and the word at ``entity['word']`` is
    the annotated span. Its character offsets are computed from the lengths
    of the preceding words, so the span always refers to that exact word,
    even when the same characters occur earlier in the text or the word
    contains regular-expression metacharacters.

    Args:
        speech_text: The rendered example text.
        entity: Dict with ``'word'`` (index into the space-separated words),
            ``'value'`` and ``'name'``.

    Returns:
        A dict with ``'value'``, ``'entity'``, ``'start'`` and ``'end'``,
        where ``speech_text[start:end]`` is the selected word.

    Raises:
        IndexError: If ``entity['word']`` is outside the word list.
        KeyError: If ``entity`` lacks one of the required keys.
    """
    # split the text into words
    parts = speech_text.split(" ")
    word_index = entity['word']
    entity_word = parts[word_index]

    # Splitting on a single space means each preceding word contributes its
    # own length plus one separator to the offset of the selected word.
    start = sum(len(part) + 1 for part in parts[:word_index])

    return {
        'value': entity['value'],
        'entity': entity['name'],
        'start': start,
        'end': start + len(entity_word),
    }
def handle_statements_model(statements_model):
    """Generate Rasa entity and intent examples for every statement.

    Each entry of ``statements_model['statements']`` is expanded through
    ``handle_model`` into a shared list of examples, with a shared list of
    already-seen texts.

    Args:
        statements_model: Dict with a ``'statements'`` list; each statement
            provides ``'intent'``, ``'entities'`` and ``'text'``.

    Returns:
        A ``(entity_examples, intent_examples)`` tuple. The entity examples
        are the generated example dicts; each intent example repeats only
        the ``"text"`` and ``"intent"`` of the matching entity example.
    """
    # Shape of one generated entity example:
    #   "text": "provider github push branchname",
    #   "intent": "provider_branchpush",
    #   "entities": [{
    #       "start": 0,
    #       "end": 8,
    #       "value": "provider",
    #       "entity": "element"
    #   }]
    generated = []
    seen_texts = []
    for stmt in statements_model['statements']:
        # Each statement is expanded with its own empty substitution dict.
        handle_model(stmt['intent'], stmt['entities'], stmt['text'], 0, {}, generated, seen_texts)

    # Derive both example lists from the generated examples.
    entity_examples = list(generated)
    intent_examples = [
        {"text": example['text'], "intent": example['intent']}
        for example in generated
    ]
    return entity_examples, intent_examples
def main(args):
    """Convert a statements file to Rasa NLU JSON and print it to stdout.

    Args:
        args: Command-line arguments without the program name; ``args[0]``
            is the path of the statements JSON file.

    Raises:
        SystemExit: With a usage message if no path was given.
    """
    if not args:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit("usage: {0} STATEMENTS_JSON".format(sys.argv[0]))

    statements_model = load_model(args[0])
    rasa_model = make_rasa_model_from_statements(statements_model)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(json.dumps(rasa_model, indent=4))


if __name__ == "__main__":
    main(sys.argv[1:])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"statements": [{ | |
"text": "{element} {provider_subject_vcs} {provider_verb} {provider_object_branchname}", | |
"intent": "provider_branchpush", | |
"entities": [ | |
{ | |
"word": 0, | |
"value":"provider", | |
"synonyms": ["provider"], | |
"name": "element" | |
}, { | |
"word": 1, | |
"value":"vcs", | |
"synonyms": ["vcs","github", "travis", "stash", "bitbucket"], | |
"name": "provider_subject_vcs" | |
}, { | |
"word": 2, | |
"value":"push", | |
"synonyms": ["pushed", "push", "committed", "commit"], | |
"name": "provider_verb" | |
}, { | |
"word": 3, | |
"value":"branchname", | |
"synonyms": ["branchname"], | |
"name": "provider_object_branchname" | |
}] | |
},{ | |
"text": "{element} {provider_subject_vcs} {provider_object_record} {provider_verb}", | |
"intent": "provider_prcreate", | |
"entities": [ | |
{ | |
"word": 0, | |
"value":"provider", | |
"synonyms": ["provider"], | |
"name": "element" | |
}, { | |
"word": 1, | |
"value":"vcs", | |
"synonyms": ["vcs","github", "travis", "stash", "bitbucket"], | |
"name": "provider_subject_vcs" | |
}, { | |
"word": 2, | |
"value":"pr", | |
"synonyms": ["pr", "pullrequest"], | |
"name": "provider_object_record" | |
}, { | |
"word": 3, | |
"value":"create", | |
"synonyms": ["create"], | |
"name": "provider_verb" | |
}] | |
}] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I wrote this because the native Rasa format is difficult to craft: building the training model requires a lot of handwork and manual parsing of words. The file format above is simpler and is similar to API.ai's synonym-based training format, which I have never been able to get working with Rasa.
to make a query:
curl -X POST -H "Content-Type: application/json" -H "Cache-Control: no-cache" -d '{"q":"provider bitbucket push feature-8"}' "http://localhost:5010/parse"
with the result: