kirsle/iterative regexp.py

## iterative regexp.py
#!/usr/bin/env python

"""Experiments to upgrade RiveScript's tag processing algorithm."""

from collections import deque
import re
import random

# Sample messages fed to the tag processor by the loop at the bottom of the
# script.
tests = [
    # A <get ...> tag nested inside a <set ...> tag, plus a later <get ...>.
    "<set oldname=<get name>>I thought your name was <get oldname>?",
    # The argument-less <formal> tag nested inside a <set ...> tag.
    "<set name=<formal>>Nice to meet you, <get name>.",
    # A curly-brace {random}...{/random} pair with "|" between two words.
    "This {random}sentence|phrase{/random} has a random word.",
]

def process_tags(message):
    """Resolve the angle-bracket tags in a message, innermost tags first.

    Only <tags> that contain no other tag (i.e. no < character inside) are
    matched, so for <set old=<get new>> the <get new> is handled on one pass
    and the resulting <set old=xxx> on a later pass. Each pass substitutes
    exactly ONE occurrence (the left-most such tag), so a tag that appears
    in several spots is resolved left-to-right. Intended order, assuming
    real variable storage:

    "My name is Bob"
    (quick static tags like <star> are already handled first)
    1. <set oldname=<get name>>I thought you were <get oldname>? <set name=bob>
    2. <set oldname=Alice>I thought you were <get oldname>? <set name=bob>
    3. I thought you were <get oldname>? <set name=bob>
    4. I thought you were Alice? <set name=bob>
    5. I thought you were Alice?
    Done. oldname=Alice, name=bob at the end.

    Tag handling in this experiment is stubbed: <get ...> always inserts
    "Soandso", <formal> inserts "Formal Name", <set ...> only prints the
    assignment and inserts nothing, and any other tag is removed. Curly
    brace tags such as {random} are left untouched.

    Args:
        message: Text that may contain angle-bracket tags.

    Returns:
        The message with every matched angle-bracket tag substituted.
    """
    # Single-argument print() with %-formatting emits the same text on
    # Python 2 and Python 3.
    print("INBOUND MESSAGE: %s" % (message,))

    # Angle bracket tags first.
    while True:
        match = re.search(r'<([^<]+?)>', message)
        if not match:
            break

        body = match.group(1)
        group = body.split(" ", 1)
        tag = group[0]
        data = group[1] if len(group) > 1 else ""
        print("TAG FOUND: %s %s" % (tag, group))
        insert = ""

        if tag == "get":
            print("\tGET tag: %s" % (group,))
            insert = "Soandso"
        elif tag == "set":
            print("\tSET tag: %s" % (group,))
            # partition() never raises: a tag without "=" gets an empty
            # value, and any further "=" characters stay in the value.
            name, _, value = data.partition("=")
            print("\tSetting {}={}".format(name, value))
        elif tag == "formal":
            insert = "Formal Name"

        # Splice out only the occurrence that was matched. str.replace()
        # would rewrite every identical tag at once and break the
        # one-tag-per-pass, left-to-right ordering described above.
        message = message[:match.start()] + insert + message[match.end():]

    return message

def __process_tags(message):
    """Abandoned this method. This would scan through the tokenized tags and
    try to work from the deepest outward but it was really hairy.

    The message is split on the characters <, {, } and > (the delimiters are
    kept as their own tokens, empty strings are dropped) and the token list
    is then scanned repeatedly, printing debug output along the way.

    NOTE(review): as written this function never returns. The outer
    ``while True`` has no ``break``/``return``, and every pass waits on
    ``raw_input()``. Several branches below can also raise; see the inline
    notes. Treat it as a record of the experiment, not as working code.
    """
    # The capturing group makes re.split() keep the delimiters as tokens.
    tokens = [ x for x in re.split(r'(<|\{|\}|>)', message) if x != "" ]
    print "MESSAGE:", message
    print "TOKENS:", tokens

    # NOTE(review): assigned but never read anywhere in this function.
    response = ""

    passes = 0
    while True:
        print "PASS #{} - Tokens:".format(passes), tokens
        passes += 1

        depth = 0  # Opening-tag depth.
        types = [] # Types of tag characters. For syntax checking.
        # Blocks until a line is read from stdin, i.e. one pass per keypress.
        raw_input()

        # i walks the token list; ie caches its length and is only refreshed
        # in the "get" branch below.
        i  = 0
        ie = len(tokens)
        while i < ie:
            print i, "<", ie
            token = tokens[i]

            print "...", i, token, tokens
            print "- Depth:", depth, "Types:", types

            if token == "<" or token == "{":
                depth += 1
                types.append(token)

                print "\tOpening tag character found:", token
            elif token == ">" or token == "}":
                depth -= 1
                # NOTE(review): raises IndexError when no opening character
                # was seen earlier in this pass. That includes a closing
                # character at index 0, so the "i == 0" check below cannot
                # be reached without this line failing first.
                types.pop()

                # A tag has been closed. In case of crazy nested tags,
                # ie {random}<set name=<get abc>>{/random} this should be the
                # inner-most tag, i.e. "get abc" here.
                if i == 0:
                    # Syntax error!
                    print "Closing tag character found at position 0!"
                    tokens[i] = ""
                    continue

                # The token just before the closing character is taken to be
                # the tag body; its first word selects the tag type.
                tag = tokens[i-1].split(" ")
                print "\tTag just closed:", tag

                # Process the type of tag.
                if tag[0] == "get":
                    print "\tGET tag!"

                    # Replace this tag's text and brackets.
                    tokens.pop(i-2) # Pop off the tags.
                    # NOTE(review): after the pop above, the closing
                    # character has shifted to index i-1, so this removes the
                    # token AFTER it (or raises IndexError if there is none).
                    tokens.pop(i)
                    i -= 1
                    ie = len(tokens)
                    tokens[i-1] = "Soandso" # fill in the variable's value
                    print "NEW TOKENS:", tokens
                    print "I=", i, "IE=", ie
                elif tag[0] == "set":
                    print "\tSET tag!"
                    # Second space-separated word of the tag body; raises
                    # IndexError if the body has no space.
                    name = tag[1]
                    # Token following the closing character; raises
                    # IndexError if the closing character is the last token.
                    value = tokens[i+1]
                    print "\tSet {}={}".format(name, value)
                elif tag[0] == "random":
                    print "\tRANDOM tag! Begin randomized text!"
                    # Token following the closing "}" of {random}.
                    texts = tokens[i+1]
                    print "\tChoose random from:", texts
                    # NOTE(review): the random module has no choose()
                    # function (the standard one is random.choice), so this
                    # line raises AttributeError. `texts` is also a single
                    # string here, not a list of alternatives.
                    choice = random.choose(texts)
                    tokens[i-1] = choice
            else:
                # Not a tag character.
                pass

            # If anything in these tokens were tag characters, rinse and repeat.
            # if token in ["<", ">", "{", "}"]:
            #     continue
            i += 1

    # NOTE(review): unreachable, because the loop above never terminates
    # normally.
    print ""

# Feed every sample message through the tag processor.
for sample in tests:
    process_tags(sample)
	#!/usr/bin/env python

	"""Experiments to upgrade RiveScript's tag processing algorithm."""

	from collections import deque
	import re
	import random

	# Sample messages fed to the tag processor by the loop at the bottom of
	# the script.
	tests = [
	    "<set oldname=<get name>>I thought your name was <get oldname>?",
	    "<set name=<formal>>Nice to meet you, <get name>.",
	    # Plain "|" between the two words: the "\|" form was a Markdown
	    # escape that put a stray backslash into the string.
	    "This {random}sentence|phrase{/random} has a random word.",
	]

	def process_tags(message):
	    """Resolve the angle-bracket tags in a message, innermost tags first.

	    Only <tags> that contain no other tag (i.e. no < character inside) are
	    matched, so for <set old=<get new>> the <get new> is handled on one
	    pass and the resulting <set old=xxx> on a later pass. Each pass
	    substitutes exactly ONE occurrence (the left-most such tag), so a tag
	    that appears in several spots is resolved left-to-right. Intended
	    order, assuming real variable storage:

	    "My name is Bob"
	    (quick static tags like <star> are already handled first)
	    1. <set oldname=<get name>>I thought you were <get oldname>? <set name=bob>
	    2. <set oldname=Alice>I thought you were <get oldname>? <set name=bob>
	    3. I thought you were <get oldname>? <set name=bob>
	    4. I thought you were Alice? <set name=bob>
	    5. I thought you were Alice?
	    Done. oldname=Alice, name=bob at the end.

	    Tag handling in this experiment is stubbed: <get ...> always inserts
	    "Soandso", <formal> inserts "Formal Name", <set ...> only prints the
	    assignment and inserts nothing, and any other tag is removed. Curly
	    brace tags such as {random} are left untouched.

	    Args:
	        message: Text that may contain angle-bracket tags.

	    Returns:
	        The message with every matched angle-bracket tag substituted.
	    """
	    # Single-argument print() with %-formatting emits the same text on
	    # Python 2 and Python 3.
	    print("INBOUND MESSAGE: %s" % (message,))

	    # Angle bracket tags first.
	    while True:
	        match = re.search(r'<([^<]+?)>', message)
	        if not match:
	            break

	        body = match.group(1)
	        group = body.split(" ", 1)
	        tag = group[0]
	        data = group[1] if len(group) > 1 else ""
	        print("TAG FOUND: %s %s" % (tag, group))
	        insert = ""

	        if tag == "get":
	            print("\tGET tag: %s" % (group,))
	            insert = "Soandso"
	        elif tag == "set":
	            print("\tSET tag: %s" % (group,))
	            # partition() never raises: a tag without "=" gets an empty
	            # value, and any further "=" characters stay in the value.
	            name, _, value = data.partition("=")
	            print("\tSetting {}={}".format(name, value))
	        elif tag == "formal":
	            insert = "Formal Name"

	        # Splice out only the occurrence that was matched. str.replace()
	        # would rewrite every identical tag at once and break the
	        # one-tag-per-pass, left-to-right ordering described above.
	        message = message[:match.start()] + insert + message[match.end():]

	    return message

	def __process_tags(message):
	"""Abandoned this method. This would scan through the tokenized tags and
	try to work from the deepest outward but it was really hairy."""
	tokens = [ x for x in re.split(r'(<\|\{\|\}\|>)', message) if x != "" ]
	print "MESSAGE:", message
	print "TOKENS:", tokens

	response = ""

	passes = 0
	while True:
	print "PASS #{} - Tokens:".format(passes), tokens
	passes += 1

	depth = 0 # Opening-tag depth.
	types = [] # Types of tag characters. For syntax checking.
	raw_input()

	i = 0
	ie = len(tokens)
	while i < ie:
	print i, "<", ie
	token = tokens[i]

	print "...", i, token, tokens
	print "- Depth:", depth, "Types:", types

	if token == "<" or token == "{":
	depth += 1
	types.append(token)

	print "\tOpening tag character found:", token
	elif token == ">" or token == "}":
	depth -= 1
	types.pop()

	# A tag has been closed. In case of crazy nested tags,
	# ie {random}<set name=<get abc>>{/random} this should be the
	# inner-most tag, i.e. "get abc" here.
	if i == 0:
	# Syntax error!
	print "Closing tag character found at position 0!"
	tokens[i] = ""
	continue

	tag = tokens[i-1].split(" ")
	print "\tTag just closed:", tag

	# Process the type of tag.
	if tag[0] == "get":
	print "\tGET tag!"

	# Replace this tag's text and brackets.
	tokens.pop(i-2) # Pop off the tags.
	tokens.pop(i)
	i -= 1
	ie = len(tokens)
	tokens[i-1] = "Soandso" # fill in the variable's value
	print "NEW TOKENS:", tokens
	print "I=", i, "IE=", ie
	elif tag[0] == "set":
	print "\tSET tag!"
	name = tag[1]
	value = tokens[i+1]
	print "\tSet {}={}".format(name, value)
	elif tag[0] == "random":
	print "\tRANDOM tag! Begin randomized text!"
	texts = tokens[i+1]
	print "\tChoose random from:", texts
	choice = random.choose(texts)
	tokens[i-1] = choice
	else:
	# Not a tag character.
	pass

	# If anything in these tokens were tag characters, rinse and repeat.
	# if token in ["<", ">", "{", "}"]:
	# continue
	i += 1

	print ""

	# Feed every sample message through the tag processor.
	for t in tests:
	    process_tags(t)