Created
November 30, 2014 03:01
-
-
Save kirsle/fc2aa51eaa598869aba5 to your computer and use it in GitHub Desktop.
Experiments for upgrading RiveScript's tag processing algorithm, to support nested tags such as <set oldname=<get name>> which previously didn't work, as I was doing a simple find/replace one tag-type at a time and <set> always came before <get>.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Experiments to upgrade RiveScript's tag processing algorithm.""" | |
from collections import deque | |
import re | |
import random | |
# Sample replies fed to the tag processor by the driver loop at the bottom.
tests = [
    # A <get> nested inside a <set>, followed by a plain <get>.
    "<set oldname=<get name>>I thought your name was <get oldname>?",
    # An argument-less tag (<formal>) nested inside a <set>.
    "<set name=<formal>>Nice to meet you, <get name>.",
    # Curly-brace {random}...{/random} pair with pipe-separated choices.
    "This {random}sentence|phrase{/random} has a random word.",
]
def process_tags(message):
    """Resolve angle-bracket tags in a reply, innermost tags first.

    This is the algorithm I went with, nice and simple. It only matches
    <tags> that do not contain tags within (i.e. no < character inside), so
    on a <set old=<get new>>, the <get new> is matched on the first pass and
    handled, and the <set old=xxx> is then handled on a second pass. On each
    pass exactly ONE tag occurrence (the leftmost match) is handled, so if the
    same tag is referenced from multiple spots it is handled left-to-right,
    example:

        "My name is Bob"
        (quick static tags like <star> are already handled first)

        1. <set oldname=<get name>>I thought you were <get oldname>? <set name=bob>
        2. <set oldname=Alice>I thought you were <get oldname>? <set name=bob>
        3. I thought you were <get oldname>? <set name=bob>
        4. I thought you were Alice? <set name=bob>
        5. I thought you were Alice?

        Done. oldname=Alice, name=bob at the end.

    This experiment has no variable store: <get> always yields "Soandso",
    <formal> yields "Formal Name", <set> only logs the assignment, and any
    other tag is removed. Curly-brace tags such as {random} are left as-is.

    Args:
        message: Reply text that may contain <tags>.

    Returns:
        The message with every matchable angle-bracket tag substituted.
    """
    # Single-argument print() calls emit the same text on Python 2 and 3.
    print("INBOUND MESSAGE: {}".format(message))

    # Angle bracket tags first.
    while True:
        match = re.search(r'<([^<]+?)>', message)
        if not match:
            break

        body = match.group(1)
        group = body.split(" ", 1)
        tag = group[0]
        data = group[1] if len(group) > 1 else ""
        print("TAG FOUND: {} {}".format(tag, group))

        insert = ""
        if tag == "get":
            print("\tGET tag: {}".format(group))
            insert = "Soandso"
        elif tag == "set":
            print("\tSET tag: {}".format(group))
            # partition() splits on the first "=" only, so values containing
            # "=" survive intact, and a missing "=" is detectable instead of
            # raising IndexError.
            name, separator, value = data.partition("=")
            if separator:
                print("\tSetting {}={}".format(name, value))
            else:
                print("\tMalformed SET tag (no '='): {}".format(body))
        elif tag == "formal":
            insert = "Formal Name"

        # Splice out only the occurrence that was matched. str.replace() would
        # rewrite every identical tag at once and break the one-tag-per-pass,
        # left-to-right ordering described above.
        message = message[:match.start()] + insert + message[match.end():]

    return message
def __process_tags(message):
    """Abandoned this method. This would scan through the tokenized tags and
    try to work from the deepest outward but it was really hairy.

    The message is split on the tag characters <, >, { and } (the delimiters
    are kept as their own tokens), then the token list is walked repeatedly,
    mutating it in place whenever a closing character is reached. Progress is
    reported with print statements only; nothing is returned.

    NOTE(review): the nesting shown here was reconstructed from a flattened
    copy of the file and is a best guess. The function is never called by the
    script and is known not to work; see the inline notes before reusing it.
    """
    # The capture group in the pattern makes re.split keep the delimiters.
    tokens = [ x for x in re.split(r'(<|\{|\}|>)', message) if x != "" ]
    print "MESSAGE:", message
    print "TOKENS:", tokens

    response = ""  # Never read or written again in this function.
    passes = 0
    # NOTE(review): no break or return appears anywhere below, so this loop
    # only ends when an exception is raised.
    while True:
        print "PASS #{} - Tokens:".format(passes), tokens
        passes += 1
        depth = 0 # Opening-tag depth.
        types = [] # Types of tag characters. For syntax checking.
        raw_input()  # Debug aid: waits for a line on stdin before each pass.

        i = 0
        ie = len(tokens)
        while i < ie:
            print i, "<", ie
            token = tokens[i]
            print "...", i, token, tokens
            print "- Depth:", depth, "Types:", types
            if token == "<" or token == "{":
                depth += 1
                types.append(token)
                print "\tOpening tag character found:", token
            elif token == ">" or token == "}":
                depth -= 1
                # NOTE(review): raises IndexError when a closer appears with
                # no opener recorded, which includes the i == 0 case checked
                # just below.
                types.pop()

                # A tag has been closed. In case of crazy nested tags,
                # ie {random}<set name=<get abc>>{/random} this should be the
                # inner-most tag, i.e. "get abc" here.
                if i == 0:
                    # Syntax error!
                    print "Closing tag character found at position 0!"
                    tokens[i] = ""
                    # i is not advanced here; the blanked token is re-read and
                    # falls through to the "not a tag character" branch.
                    continue

                tag = tokens[i-1].split(" ")
                print "\tTag just closed:", tag

                # Process the type of tag.
                if tag[0] == "get":
                    print "\tGET tag!"
                    # Replace this tag's text and brackets.
                    tokens.pop(i-2) # Pop off the tags.
                    # NOTE(review): the pop above shifts everything left by
                    # one, so index i now refers to what was originally the
                    # token AFTER the closer, not the closer itself.
                    tokens.pop(i)
                    i -= 1
                    ie = len(tokens)
                    tokens[i-1] = "Soandso" # fill in the variable's value
                    print "NEW TOKENS:", tokens
                    print "I=", i, "IE=", ie
                elif tag[0] == "set":
                    print "\tSET tag!"
                    # NOTE(review): tag was split on spaces, so tag[1] is the
                    # text after "set " (e.g. "name=" when a nested tag
                    # follows), and tokens[i+1] is whatever follows the closer.
                    name = tag[1]
                    value = tokens[i+1]
                    print "\tSet {}={}".format(name, value)
                elif tag[0] == "random":
                    print "\tRANDOM tag! Begin randomized text!"
                    texts = tokens[i+1]
                    print "\tChoose random from:", texts
                    # NOTE(review): the random module has no choose(); this
                    # line raises AttributeError (random.choice was probably
                    # intended - confirm). texts is a single string here, not
                    # a list of alternatives.
                    choice = random.choose(texts)
                    tokens[i-1] = choice
            else:
                # Not a tag character.
                pass

            # If anything in these tokens were tag characters, rinse and repeat.
            # if token in ["<", ">", "{", "}"]:
            #     continue

            i += 1
        print ""
# Run every sample reply through the tag processor.
for sample in tests:
    process_tags(sample)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment