Skip to content

Instantly share code, notes, and snippets.

@kirsle
Created November 30, 2014 03:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kirsle/fc2aa51eaa598869aba5 to your computer and use it in GitHub Desktop.
Save kirsle/fc2aa51eaa598869aba5 to your computer and use it in GitHub Desktop.
Experiments for upgrading RiveScript's tag-processing algorithm to support nested tags such as <set oldname=<get name>>. Nested tags previously didn't work because tags were handled with a simple find/replace, one tag type at a time, and <set> was always processed before <get>.
#!/usr/bin/env python
"""Experiments to upgrade RiveScript's tag processing algorithm."""
from collections import deque
import re
import random
# Sample messages fed to process_tags() at the bottom of the script.
tests = [
    # A <get> nested inside a <set>, followed by a plain <get>.
    "<set oldname=<get name>>I thought your name was <get oldname>?",
    # A <formal> nested inside a <set>.
    "<set name=<formal>>Nice to meet you, <get name>.",
    # Curly-brace {random} tag; contains no angle-bracket tags.
    "This {random}sentence|phrase{/random} has a random word.",
]
def process_tags(message):
    """Resolve <angle bracket> tags in a message, innermost tags first.

    This is the algorithm I went with, nice and simple. It will only match
    <tags> that do not contain tags within (i.e. no < character inside), so
    on a <set old=<get new>>, the <get new> is matched on the first pass and
    handled, and the <set old=xxx> is then handled on a second pass. On each
    pass, only ONE tag is handled, so if the same tag was referenced from
    multiple spots it is handled left-to-right, example:

        "My name is Bob"
        (quick static tags like <star> are already handled first)

        1. <set oldname=<get name>>I thought you were <get oldname>? <set name=bob>
        2. <set oldname=Alice>I thought you were <get oldname>? <set name=bob>
        3. I thought you were <get oldname>? <set name=bob>
        4. I thought you were Alice? <set name=bob>
        5. I thought you were Alice?

        Done. oldname=Alice, name=bob at the end.

    In this experiment the tag handlers are stubs: <get> always yields
    "Soandso", <formal> yields "Formal Name", <set> only prints what it would
    store, and any other tag is replaced with an empty string. Curly-brace
    tags such as {random} are not touched here.

    Args:
        message: The text containing zero or more <tags>.

    Returns:
        The message with every matched <tag> replaced by its output.
    """
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("INBOUND MESSAGE: {}".format(message))

    # Angle bracket tags first.
    while True:
        # "[^<]" keeps the match from spanning an opening bracket, so the
        # left-most tag that has no tag nested inside it is found.
        match = re.search(r'<([^<]+?)>', message)
        if not match:
            break

        body = match.group(1)
        group = body.split(" ", 1)
        tag = group[0]
        data = group[1] if len(group) > 1 else ""
        print("TAG FOUND: {} {}".format(tag, group))

        insert = ""
        if tag == "get":
            print("\tGET tag: {}".format(group))
            insert = "Soandso"
        elif tag == "set":
            print("\tSET tag: {}".format(group))
            # partition() tolerates a missing "=" (value becomes "") and
            # keeps any further "=" characters as part of the value.
            name, _, value = data.partition("=")
            print("\tSetting {}={}".format(name, value))
        elif tag == "formal":
            insert = "Formal Name"

        # Splice out exactly the matched occurrence. str.replace() would
        # substitute every identical tag in one go, breaking the documented
        # one-tag-per-pass, left-to-right order.
        message = message[:match.start()] + insert + message[match.end():]

    return message
def __process_tags(message):
    """Abandoned token-based tag processor, kept for reference only.

    Abandoned this method. This would scan through the tokenized tags and
    try to work from the deepest outward but it was really hairy.

    The message is split into tokens on the characters <, >, { and }, and
    the token list is then walked repeatedly, rewriting it in place whenever
    a closing character is reached. Nothing in this file calls this function.

    NOTE(review): no ``break`` or ``return`` is visible in the outer loop, so
    the function appears never to terminate; it also blocks on raw_input()
    at the start of every pass.
    """
    # The capturing group makes re.split() keep the delimiters as tokens;
    # empty strings produced by adjacent delimiters are dropped.
    tokens = [ x for x in re.split(r'(<|\{|\}|>)', message) if x != "" ]
    print "MESSAGE:", message
    print "TOKENS:", tokens

    response = ""  # Never read again in this function.
    passes = 0
    while True:
        print "PASS #{} - Tokens:".format(passes), tokens
        passes += 1
        depth = 0 # Opening-tag depth. Only ever printed, never tested.
        types = [] # Types of tag characters. For syntax checking.
        # Waits for a line on stdin before each pass, presumably to step
        # through the passes by hand while debugging.
        raw_input()

        i = 0
        ie = len(tokens)
        while i < ie:
            print i, "<", ie
            token = tokens[i]
            print "...", i, token, tokens
            print "- Depth:", depth, "Types:", types
            if token == "<" or token == "{":
                depth += 1
                types.append(token)
                print "\tOpening tag character found:", token
            elif token == ">" or token == "}":
                depth -= 1
                # NOTE(review): pop() raises IndexError when no opening
                # character has been seen yet in this pass, which looks like
                # it makes the "i == 0" branch below unreachable - confirm.
                types.pop()

                # A tag has been closed. In case of crazy nested tags,
                # ie {random}<set name=<get abc>>{/random} this should be the
                # inner-most tag, i.e. "get abc" here.
                if i == 0:
                    # Syntax error!
                    print "Closing tag character found at position 0!"
                    # Blank the token so the retry at the same index falls
                    # through to the "not a tag character" branch.
                    tokens[i] = ""
                    continue

                # The token just before the closing character is taken to be
                # the tag body, split into words.
                tag = tokens[i-1].split(" ")
                print "\tTag just closed:", tag

                # Process the type of tag.
                if tag[0] == "get":
                    print "\tGET tag!"

                    # Replace this tag's text and brackets.
                    # NOTE(review): after the first pop every later index
                    # shifts down by one, so pop(i) appears to remove the
                    # token *after* the closing character rather than the
                    # closing character itself - verify.
                    tokens.pop(i-2) # Pop off the tags.
                    tokens.pop(i)
                    i -= 1
                    ie = len(tokens)
                    tokens[i-1] = "Soandso" # fill in the variable's value
                    print "NEW TOKENS:", tokens
                    print "I=", i, "IE=", ie
                elif tag[0] == "set":
                    print "\tSET tag!"
                    # Only prints; the token list is not modified here.
                    name = tag[1]
                    value = tokens[i+1]
                    print "\tSet {}={}".format(name, value)
                elif tag[0] == "random":
                    print "\tRANDOM tag! Begin randomized text!"
                    texts = tokens[i+1]
                    print "\tChoose random from:", texts
                    # NOTE(review): the standard random module provides
                    # choice(), not choose(); this line looks like it would
                    # raise AttributeError if reached. texts is also a single
                    # string token, not a list of options - confirm intent.
                    choice = random.choose(texts)
                    tokens[i-1] = choice
            else:
                # Not a tag character.
                pass

            # If anything in these tokens were tag characters, rinse and repeat.
            # if token in ["<", ">", "{", "}"]:
            # continue
            i += 1
        print ""
# Run every sample message through the tag processor; the output is the
# trace that process_tags() prints.
for t in tests:
    process_tags(t)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment