bcho/nfa_re.py

## nfa_re.py
# coding: utf-8

'''
Support operations:

- character match
- seq
- or
- repeatition

    match_char = NFARE.from_char('a')
    match_char.match('a')  # => True
    match_char.match('b')  # => False
    match_char.match('aa')  # => False

    a, b = NFARE.from_char('a'), NFARE.from_char('b')
    ab = NFARE.rule_seq(a, b)
    ab.match('ab')  # => True
    ab.match('ba')  # => False
    ab.match('aa')  # => False
    ab.match('abc')  # => False

    a, b = NFARE.from_char('a'), NFARE.from_char('b')
    a_b = NFARE.rule_or(a, b)
    a_b.match('a')  # => True
    a_b.match('b')  # => True
    a_b.match('ab')  # => False
    a_b.match('c')  # => False

    many_a = NFARE.rule_repeat(NFARE.from_char('a')))))
    many_a.match('')  # => True
    many_a.match('a')  # => True
    many_a.match('a' * 100)  # => True

    a, b, c = NFARE.from_char('a'), NFARE.from_char('b'), NFARE.from_char('c')
    ab = NFARE.rule_seq(a, b)
    many_ab = NFARE.rule_repeat(ab)
    many_ab_or_c = NFARE.rule_or(many_ab, c)
    many_ab_or_c.match('ab')  # => True
    many_ab_or_c.match('c')  # => True
    many_ab_or_c.match('ababab')  # => True
    many_ab_or_c.match('ccc')  # => False
    many_ab_or_c.match('abc')  # => False
'''

from collections import defaultdict


class NFAState(object):
    '''Represents a NFA state.

    :param is_final: indicates if the state is final. Defaults to False.
    '''

    SUPPORTED_CHARSET = list(map(chr, range(128)))

    def __init__(self, is_final=False):
        # State type.
        self.is_final = is_final

        # Transition type.
        self.on_chars = defaultdict(list)
        self.on_empty = []

    def add_char_edge(self, char, next_state):
        '''Add a char edge to next state.

        :param char: edge's char.
        :param next_state: next state.
        '''
        if not isinstance(char, str):
            char = str(char)

        assert char in self.SUPPORTED_CHARSET

        self.on_chars[char].append(next_state)

    def add_empty_edge(self, next_state):
        '''Add an empty edge to next state.

        :param next_state: next state.
        '''
        self.on_empty.append(next_state)

    def match(self, something, visited_states=None):
        '''Match something.

        :param something: just something.
        :param visited_states: visited states.
        '''
        if visited_states is None:
            visited_states = []

        # No way out.
        if self in visited_states:
            return False

        visited_states.append(self)

        # End of something, check if we reach the final state.
        if not something:
            # Reach the final state.
            if self.is_final:
                return True

            # See if we can reach the end through an empty edge.
            return any([i.match('', visited_states) for i in self.on_empty])

        # Eat the character and match the rest.
        char = something[0]
        for next_state in self.on_chars[char]:
            if next_state.match(something[1:]):
                return True

        # Not works, try to transit to other state.
        for next_state in self.on_empty:
            if next_state.match(something, visited_states):
                return True

        # We have tried our best :/
        return False


class NFARE(object):
    '''Regex object.

    :param start: start state.
    :param end: end state.
    '''

    def __init__(self, start, end):
        # Start & end state.
        self.start_state, self.end_state = start, end

    def match(self, something):
        return self.start_state.match(something)

    @staticmethod
    def from_char(c):
        return NFARE.rule_char(c)

    @staticmethod
    def rule_char(c):
        assert isinstance(c, str)

        start_state = NFAState()
        end_state = NFAState(is_final=True)
        start_state.add_char_edge(c, end_state)

        return NFARE(start_state, end_state)

    @staticmethod
    def rule_empty():
        start_state = NFAState()
        end_state = NFAState(is_final=True)
        start_state.add_empty_edge(end_state)

        return NFARE(start_state, end_state)

    @staticmethod
    def _union_two_re(first, second):
        assert isinstance(first, NFARE)
        assert isinstance(second, NFARE)

        first.end_state.is_final = False
        second.end_state.is_final = True
        first.end_state.add_empty_edge(second.start_state)

        return NFARE(first.start_state, second.end_state)

    @staticmethod
    def rule_seq(*res):
        assert len(res) >= 2

        re = NFARE._union_two_re(res[0], res[1])
        for next_re in res[2:]:
            re = NFARE._union_two_re(re, next_re)

        return re

    @staticmethod
    def _or_two_re(choice1, choice2):
        assert isinstance(choice1, NFARE)
        assert isinstance(choice2, NFARE)

        choice1.end_state.is_final = False
        choice2.end_state.is_final = False
        start_state = NFAState()
        end_state = NFAState(is_final=True)
        start_state.add_empty_edge(choice1.start_state)
        start_state.add_empty_edge(choice2.start_state)
        choice1.end_state.add_empty_edge(end_state)
        choice2.end_state.add_empty_edge(end_state)

        return NFARE(start_state, end_state)

    @staticmethod
    def rule_or(*choices):
        assert len(choices) >= 2

        re = NFARE._or_two_re(choices[0], choices[1])
        for choice in choices[2:]:
            re = NFARE._or_two_re(re, choice)

        return re

    @staticmethod
    def rule_repeat(re):
        assert isinstance(re, NFARE)

        re.end_state.add_empty_edge(re.start_state)
        re.start_state.add_empty_edge(re.end_state)

        return re


if __name__ == '__main__':
    match_char = NFARE.from_char('a')
    match_char.match('a')  # => True
    match_char.match('b')  # => False
    match_char.match('aa')  # => False

    a, b = NFARE.from_char('a'), NFARE.from_char('b')
    ab = NFARE.rule_seq(a, b)
    assert ab.match('ab')  # => True
    assert not ab.match('ba')  # => False
    assert not ab.match('aa')  # => False
    assert not ab.match('abc')  # => False

    a, b = NFARE.from_char('a'), NFARE.from_char('b')
    a_b = NFARE.rule_or(a, b)
    assert a_b.match('a')  # => True
    assert a_b.match('b')  # => True
    assert not a_b.match('ab')  # => False
    assert not a_b.match('c')  # => False

    many_a = NFARE.rule_repeat(NFARE.rule_char('a'))
    assert many_a.match('')  # => True
    assert many_a.match('a')  # => True
    assert many_a.match('a' * 100)  # => True

    a, b, c = NFARE.from_char('a'), NFARE.from_char('b'), NFARE.from_char('c')
    ab = NFARE.rule_seq(a, b)
    many_ab = NFARE.rule_repeat(ab)
    many_ab_or_c = NFARE.rule_or(many_ab, c)
    assert many_ab_or_c.match('ab')  # => True
    assert many_ab_or_c.match('c')  # => True
    assert many_ab_or_c.match('ababab' * 20)  # => True
    assert not many_ab_or_c.match('ccc')  # => False
    assert not many_ab_or_c.match('abc')  # => False

    print('All passed.')
	# coding: utf-8

	'''
	Support operations:

	- character match
	- seq
	- or
	- repeatition

	match_char = NFARE.from_char('a')
	match_char.match('a') # => True
	match_char.match('b') # => False
	match_char.match('aa') # => False

	a, b = NFARE.from_char('a'), NFARE.from_char('b')
	ab = NFARE.rule_seq(a, b)
	ab.match('ab') # => True
	ab.match('ba') # => False
	ab.match('aa') # => False
	ab.match('abc') # => False

	a, b = NFARE.from_char('a'), NFARE.from_char('b')
	a_b = NFARE.rule_or(a, b)
	a_b.match('a') # => True
	a_b.match('b') # => True
	a_b.match('ab') # => False
	a_b.match('c') # => False

	many_a = NFARE.rule_repeat(NFARE.from_char('a')))))
	many_a.match('') # => True
	many_a.match('a') # => True
	many_a.match('a' * 100) # => True

	a, b, c = NFARE.from_char('a'), NFARE.from_char('b'), NFARE.from_char('c')
	ab = NFARE.rule_seq(a, b)
	many_ab = NFARE.rule_repeat(ab)
	many_ab_or_c = NFARE.rule_or(many_ab, c)
	many_ab_or_c.match('ab') # => True
	many_ab_or_c.match('c') # => True
	many_ab_or_c.match('ababab') # => True
	many_ab_or_c.match('ccc') # => False
	many_ab_or_c.match('abc') # => False
	'''

	from collections import defaultdict


	class NFAState(object):
	'''Represents a NFA state.

	:param is_final: indicates if the state is final. Defaults to False.
	'''

	SUPPORTED_CHARSET = list(map(chr, range(128)))

	def __init__(self, is_final=False):
	# State type.
	self.is_final = is_final

	# Transition type.
	self.on_chars = defaultdict(list)
	self.on_empty = []

	def add_char_edge(self, char, next_state):
	'''Add a char edge to next state.

	:param char: edge's char.
	:param next_state: next state.
	'''
	if not isinstance(char, str):
	char = str(char)

	assert char in self.SUPPORTED_CHARSET

	self.on_chars[char].append(next_state)

	def add_empty_edge(self, next_state):
	'''Add an empty edge to next state.

	:param next_state: next state.
	'''
	self.on_empty.append(next_state)

	def match(self, something, visited_states=None):
	'''Match something.

	:param something: just something.
	:param visited_states: visited states.
	'''
	if visited_states is None:
	visited_states = []

	# No way out.
	if self in visited_states:
	return False

	visited_states.append(self)

	# End of something, check if we reach the final state.
	if not something:
	# Reach the final state.
	if self.is_final:
	return True

	# See if we can reach the end through an empty edge.
	return any([i.match('', visited_states) for i in self.on_empty])

	# Eat the character and match the rest.
	char = something[0]
	for next_state in self.on_chars[char]:
	if next_state.match(something[1:]):
	return True

	# Not works, try to transit to other state.
	for next_state in self.on_empty:
	if next_state.match(something, visited_states):
	return True

	# We have tried our best :/
	return False


	class NFARE(object):
	'''Regex object.

	:param start: start state.
	:param end: end state.
	'''

	def __init__(self, start, end):
	# Start & end state.
	self.start_state, self.end_state = start, end

	def match(self, something):
	return self.start_state.match(something)

	@staticmethod
	def from_char(c):
	return NFARE.rule_char(c)

	@staticmethod
	def rule_char(c):
	assert isinstance(c, str)

	start_state = NFAState()
	end_state = NFAState(is_final=True)
	start_state.add_char_edge(c, end_state)

	return NFARE(start_state, end_state)

	@staticmethod
	def rule_empty():
	start_state = NFAState()
	end_state = NFAState(is_final=True)
	start_state.add_empty_edge(end_state)

	return NFARE(start_state, end_state)

	@staticmethod
	def _union_two_re(first, second):
	assert isinstance(first, NFARE)
	assert isinstance(second, NFARE)

	first.end_state.is_final = False
	second.end_state.is_final = True
	first.end_state.add_empty_edge(second.start_state)

	return NFARE(first.start_state, second.end_state)

	@staticmethod
	def rule_seq(*res):
	assert len(res) >= 2

	re = NFARE._union_two_re(res[0], res[1])
	for next_re in res[2:]:
	re = NFARE._union_two_re(re, next_re)

	return re

	@staticmethod
	def _or_two_re(choice1, choice2):
	assert isinstance(choice1, NFARE)
	assert isinstance(choice2, NFARE)

	choice1.end_state.is_final = False
	choice2.end_state.is_final = False
	start_state = NFAState()
	end_state = NFAState(is_final=True)
	start_state.add_empty_edge(choice1.start_state)
	start_state.add_empty_edge(choice2.start_state)
	choice1.end_state.add_empty_edge(end_state)
	choice2.end_state.add_empty_edge(end_state)

	return NFARE(start_state, end_state)

	@staticmethod
	def rule_or(*choices):
	assert len(choices) >= 2

	re = NFARE._or_two_re(choices[0], choices[1])
	for choice in choices[2:]:
	re = NFARE._or_two_re(re, choice)

	return re

	@staticmethod
	def rule_repeat(re):
	assert isinstance(re, NFARE)

	re.end_state.add_empty_edge(re.start_state)
	re.start_state.add_empty_edge(re.end_state)

	return re


	if __name__ == '__main__':
	match_char = NFARE.from_char('a')
	match_char.match('a') # => True
	match_char.match('b') # => False
	match_char.match('aa') # => False

	a, b = NFARE.from_char('a'), NFARE.from_char('b')
	ab = NFARE.rule_seq(a, b)
	assert ab.match('ab') # => True
	assert not ab.match('ba') # => False
	assert not ab.match('aa') # => False
	assert not ab.match('abc') # => False

	a, b = NFARE.from_char('a'), NFARE.from_char('b')
	a_b = NFARE.rule_or(a, b)
	assert a_b.match('a') # => True
	assert a_b.match('b') # => True
	assert not a_b.match('ab') # => False
	assert not a_b.match('c') # => False

	many_a = NFARE.rule_repeat(NFARE.rule_char('a'))
	assert many_a.match('') # => True
	assert many_a.match('a') # => True
	assert many_a.match('a' * 100) # => True

	a, b, c = NFARE.from_char('a'), NFARE.from_char('b'), NFARE.from_char('c')
	ab = NFARE.rule_seq(a, b)
	many_ab = NFARE.rule_repeat(ab)
	many_ab_or_c = NFARE.rule_or(many_ab, c)
	assert many_ab_or_c.match('ab') # => True
	assert many_ab_or_c.match('c') # => True
	assert many_ab_or_c.match('ababab' * 20) # => True
	assert not many_ab_or_c.match('ccc') # => False
	assert not many_ab_or_c.match('abc') # => False

	print('All passed.')