clee704/matcher.py

## matcher.py
LPAR = 0
RPAR = 1
AND = 2
OR = 3
NOT = 4
IDENT = 5
EOF = 6

token_names = ('LPAR', 'RPAR', 'AND', 'OR', 'NOT', 'IDENT', 'EOF')


class Lexer(object):

    symbol_map = {
        '(': LPAR,
        ')': RPAR,
        '+': AND,
        ',': OR,
        '~': NOT,
    }

    keyword_map = {
        'and': AND,
        'or': OR,
        'not': NOT,
    }

    def __init__(self, input):
        self.input = input
        self.curpos = -1
        self.nextpos = 0
        self.token = (EOF, '')

    def next(self):
        n = len(self.input)
        i = self.nextpos
        while i < n and self.input[i].isspace():
            i += 1
        self.curpos = i
        if i == n:
            self.nextpos = i
            self.token = (EOF, '')
            return self.token
        c = self.input[i]
        if c in self.symbol_map:
            self.nextpos = i + 1
            self.token = (self.symbol_map[c], c)
            return self.token
        j = i + 1
        while j < n and not (self.input[j].isspace() or
                             self.input[j] in self.symbol_map):
            j += 1
        self.nextpos = j
        tok = self.input[i:j]
        self.token = (self.keyword_map.get(tok.lower(), IDENT), tok)
        return self.token

    def generate(self):
        if self.token[0] == EOF:
            self.next()
        while self.token[0] != EOF:
            yield self.token
            self.next()


class Node(object):

    def __init__(self, kind, *children):
        self.kind = kind
        self.children = tuple(children)

    def __eq__(self, rhs):
        return (isinstance(rhs, Node) and self.kind == rhs.kind and
                self.children == rhs.children)

    def __repr__(self):
        return 'Node(kind={!r}, {})'.format(self.kind,
                ', '.join(repr(c) for c in self.children))


class ParserException(Exception):
    pass


class Parser(object):

    def __init__(self, lexer):
        self.lexer = lexer
        self.stash = None

    def parse(self):
        ret = self.parse_expr()
        tok = self._next_token()
        if tok[0] != EOF:
            raise ParserException('expected EOF, got {} at pos {}'.format(
                    token_names[tok[0]], self.lexer.curpos + 1))
        return ret

    def parse_expr(self):
        return self._parse_nary(OR, self.parse_term)

    def parse_term(self):
        return self._parse_nary(AND, self.parse_atom)

    def parse_atom(self):
        tok = self._next_token()
        if tok[0] == IDENT:
            return tok[1]
        elif tok[0] == NOT:
            return Node(NOT, self.parse_atom())
        elif tok[0] == LPAR:
            ret = self.parse_expr()
            tok = self._next_token()
            if tok[0] != RPAR:
                raise ParserException('expected RPAR, got {} at pos {}'
                        .format(token_names[tok[0]], self.lexer.curpos + 1))
            return ret
        else:
            raise ParserException('expected IDENT, NOT, or LPAR, got {} at '
                    'pos {}' .format(token_names[tok[0]],
                    self.lexer.curpos + 1))

    def _next_token(self):
        if self.stash is not None:
            ret = self.stash
            self.stash = None
            return ret
        else:
            return self.lexer.next()

    def _putback(self, token):
        assert self.stash is None
        self.stash = token

    def _parse_nary(self, kind, parse_child):
        nodes = [parse_child()]
        tok = self._next_token()
        while tok[0] == kind:
            nodes.append(parse_child())
            tok = self._next_token()
        self._putback(tok)
        return nodes[0] if len(nodes) == 1 else Node(kind, *nodes)


class And(object):
    def __init__(self, *matchers):
        self.matchers = matchers
    def __call__(self, arg):
        return all(m(arg) for m in self.matchers)
    def __repr__(self):
        return 'And({})'.format(', '.join(repr(m) for m in self.matchers))


class Or(object):
    def __init__(self, *matchers):
        self.matchers = matchers
    def __call__(self, arg):
        return any(m(arg) for m in self.matchers)
    def __repr__(self):
        return 'Or({})'.format(', '.join(repr(m) for m in self.matchers))


class Not(object):
    def __init__(self, matcher):
        self.matcher = matcher
    def __call__(self, arg):
        return not self.matcher(arg)
    def __repr__(self):
        return 'Not({!r})'.format(self.matcher)


class Contains(object):
    def __init__(self, expected):
        self.expected = expected
    def __call__(self, arg):
        return self.expected in arg
    def __repr__(self):
        return 'Contains({!r})'.format(self.expected)


def get_matcher(expr):
    return _get_matcher(Parser(Lexer(expr)).parse())


def _get_matcher(ast):
    if isinstance(ast, Node):
        if ast.kind == AND:
            return And(*[_get_matcher(c) for c in ast.children])
        elif ast.kind == OR:
            return Or(*[_get_matcher(c) for c in ast.children])
        elif ast.kind == NOT:
            return Not(_get_matcher(ast.children[0]))
        else:
            raise RuntimeError('unknown AST type: {!r}'.format(ast.kind))
    else:
        return Contains(ast)

## test_matcher.py
import pytest
from matcher import (AND, IDENT, LPAR, NOT, OR, RPAR, Lexer, Node, Parser,
                     ParserException, get_matcher)


@pytest.mark.parametrize('input, output', [
    ('a + b', [(IDENT, 'a'), (AND, '+'), (IDENT, 'b')]),
    ('((anderson and oreo) or not notton)', [
        (LPAR, '('),
        (LPAR, '('),
        (IDENT, 'anderson'),
        (AND, 'and'),
        (IDENT, 'oreo'),
        (RPAR, ')'),
        (OR, 'or'),
        (NOT, 'not'),
        (IDENT, 'notton'),
        (RPAR, ')'),
    ]),
    ('Dumb And Dumber', [(IDENT, 'Dumb'), (AND, 'And'), (IDENT, 'Dumber')]),
    ('    a\n    +\n    b    \n', [(IDENT, 'a'), (AND, '+'), (IDENT, 'b')]),
    ('a+b', [(IDENT, 'a'), (AND, '+'), (IDENT, 'b')]),
])
def test_lexer(input, output):
    assert list(Lexer(input).generate()) == output


@pytest.mark.parametrize('input, output', [
    ('a + b', Node(AND, 'a', 'b')),
    ('(a and b)', Node(AND, 'a', 'b')),
    ('((a and ((b))))', Node(AND, 'a', 'b')),
    ('((anderson and oreo) or not notton)',
        Node(OR,
            Node(AND, 'anderson', 'oreo'),
            Node(NOT, 'notton'))),
    ('Dumb And Dumber', Node(AND, 'Dumb', 'Dumber')),
    ('a and b or c', Node(OR, Node(AND, 'a', 'b'), 'c')),
    ('a and (b or c)', Node(AND, 'a', Node(OR, 'b', 'c'))),
    ('a or b and c', Node(OR, 'a', Node(AND, 'b', 'c'))),
    ('a and b and c', Node(AND, 'a', 'b', 'c')),
    ('a or b or c', Node(OR, 'a', 'b', 'c')),
    ('not a or b', Node(OR, Node(NOT, 'a'), 'b')),
    ('a or not b', Node(OR, 'a', Node(NOT, 'b'))),
    ('not (a or b)', Node(NOT, Node(OR, 'a', 'b'))),
    ('~a+b,c', Node(OR, Node(AND, Node(NOT, 'a'), 'b'), 'c')),
    ('a,b,c,d', Node(OR, 'a', 'b', 'c', 'd')),
    ('a+b+c+d', Node(AND, 'a', 'b', 'c', 'd')),
    ('~a+b+~c+d', Node(AND, Node(NOT, 'a'), 'b', Node(NOT, 'c'), 'd')),
    ('~a+b+c,~d',
        Node(OR,
            Node(AND, Node(NOT, 'a'), 'b', 'c'),
            Node(NOT, 'd'))),
    ('(a,b)+c', Node(AND, Node(OR, 'a', 'b'), 'c')),
])
def test_parser(input, output):
    assert Parser(Lexer(input)).parse() == output


@pytest.mark.parametrize('input, error', [
    ('', "expected IDENT, NOT, or LPAR, got EOF at pos 1"),
    ('a +', "expected IDENT, NOT, or LPAR, got EOF at pos 4"),
    ('a a', "expected EOF, got IDENT at pos 3"),
    ('a (and) b', "expected EOF, got LPAR at pos 3"),
    ('and and and', "expected IDENT, NOT, or LPAR, got AND at pos 1"),
    ('(())', "expected IDENT, NOT, or LPAR, got RPAR at pos 3"),
])
def test_parser_error(input, error):
    parser = Parser(Lexer(input))
    try:
        parser.parse()
    except ParserException as e:
        assert e.message == error
    else:
        assert False, 'exception should have been thrown'


@pytest.mark.parametrize('input, arg, result', [
    ('a', ['a'], True),
    ('a or b', ['a'], True),
    ('a or b', ['b'], True),
    ('a or b', ['c'], False),
    ('a and b or c', ['a'], False),
    ('a and b or c', ['b'], False),
    ('a and b or c', ['a', 'b'], True),
    ('a and b or c', ['c'], True),
    ('not a', ['a'], False),
    ('not a', ['b'], True),
    ('(not a or not b) and (c or d)', ['a'], False),
    ('(not a or not b) and (c or d)', ['a', 'b'], False),
    ('(not a or not b) and (c or d)', ['a', 'c'], True),
    ('(not a or not b) and (c or d)', ['a', 'b', 'c'], False),
    ('(not a or not b) and (c or d)', ['e'], False),
    ('(not a or not b) and (c or d)', ['c'], True),
    ('(not a or not b) and (c or d)', ['d'], True),
    ('(not a or not b) and (c or d)', ['c', 'e'], True),
])
def test_get_matcher(input, arg, result):
    assert get_matcher(input)(arg) == result
	LPAR = 0
	RPAR = 1
	AND = 2
	OR = 3
	NOT = 4
	IDENT = 5
	EOF = 6

	token_names = ('LPAR', 'RPAR', 'AND', 'OR', 'NOT', 'IDENT', 'EOF')


	class Lexer(object):

	symbol_map = {
	'(': LPAR,
	')': RPAR,
	'+': AND,
	',': OR,
	'~': NOT,
	}

	keyword_map = {
	'and': AND,
	'or': OR,
	'not': NOT,
	}

	def __init__(self, input):
	self.input = input
	self.curpos = -1
	self.nextpos = 0
	self.token = (EOF, '')

	def next(self):
	n = len(self.input)
	i = self.nextpos
	while i < n and self.input[i].isspace():
	i += 1
	self.curpos = i
	if i == n:
	self.nextpos = i
	self.token = (EOF, '')
	return self.token
	c = self.input[i]
	if c in self.symbol_map:
	self.nextpos = i + 1
	self.token = (self.symbol_map[c], c)
	return self.token
	j = i + 1
	while j < n and not (self.input[j].isspace() or
	self.input[j] in self.symbol_map):
	j += 1
	self.nextpos = j
	tok = self.input[i:j]
	self.token = (self.keyword_map.get(tok.lower(), IDENT), tok)
	return self.token

	def generate(self):
	if self.token[0] == EOF:
	self.next()
	while self.token[0] != EOF:
	yield self.token
	self.next()


	class Node(object):

	def __init__(self, kind, *children):
	self.kind = kind
	self.children = tuple(children)

	def __eq__(self, rhs):
	return (isinstance(rhs, Node) and self.kind == rhs.kind and
	self.children == rhs.children)

	def __repr__(self):
	return 'Node(kind={!r}, {})'.format(self.kind,
	', '.join(repr(c) for c in self.children))


	class ParserException(Exception):
	pass


	class Parser(object):

	def __init__(self, lexer):
	self.lexer = lexer
	self.stash = None

	def parse(self):
	ret = self.parse_expr()
	tok = self._next_token()
	if tok[0] != EOF:
	raise ParserException('expected EOF, got {} at pos {}'.format(
	token_names[tok[0]], self.lexer.curpos + 1))
	return ret

	def parse_expr(self):
	return self._parse_nary(OR, self.parse_term)

	def parse_term(self):
	return self._parse_nary(AND, self.parse_atom)

	def parse_atom(self):
	tok = self._next_token()
	if tok[0] == IDENT:
	return tok[1]
	elif tok[0] == NOT:
	return Node(NOT, self.parse_atom())
	elif tok[0] == LPAR:
	ret = self.parse_expr()
	tok = self._next_token()
	if tok[0] != RPAR:
	raise ParserException('expected RPAR, got {} at pos {}'
	.format(token_names[tok[0]], self.lexer.curpos + 1))
	return ret
	else:
	raise ParserException('expected IDENT, NOT, or LPAR, got {} at '
	'pos {}' .format(token_names[tok[0]],
	self.lexer.curpos + 1))

	def _next_token(self):
	if self.stash is not None:
	ret = self.stash
	self.stash = None
	return ret
	else:
	return self.lexer.next()

	def _putback(self, token):
	assert self.stash is None
	self.stash = token

	def _parse_nary(self, kind, parse_child):
	nodes = [parse_child()]
	tok = self._next_token()
	while tok[0] == kind:
	nodes.append(parse_child())
	tok = self._next_token()
	self._putback(tok)
	return nodes[0] if len(nodes) == 1 else Node(kind, *nodes)


	class And(object):
	def __init__(self, *matchers):
	self.matchers = matchers
	def __call__(self, arg):
	return all(m(arg) for m in self.matchers)
	def __repr__(self):
	return 'And({})'.format(', '.join(repr(m) for m in self.matchers))


	class Or(object):
	def __init__(self, *matchers):
	self.matchers = matchers
	def __call__(self, arg):
	return any(m(arg) for m in self.matchers)
	def __repr__(self):
	return 'Or({})'.format(', '.join(repr(m) for m in self.matchers))


	class Not(object):
	def __init__(self, matcher):
	self.matcher = matcher
	def __call__(self, arg):
	return not self.matcher(arg)
	def __repr__(self):
	return 'Not({!r})'.format(self.matcher)


	class Contains(object):
	def __init__(self, expected):
	self.expected = expected
	def __call__(self, arg):
	return self.expected in arg
	def __repr__(self):
	return 'Contains({!r})'.format(self.expected)


	def get_matcher(expr):
	return _get_matcher(Parser(Lexer(expr)).parse())


	def _get_matcher(ast):
	if isinstance(ast, Node):
	if ast.kind == AND:
	return And(*[_get_matcher(c) for c in ast.children])
	elif ast.kind == OR:
	return Or(*[_get_matcher(c) for c in ast.children])
	elif ast.kind == NOT:
	return Not(_get_matcher(ast.children[0]))
	else:
	raise RuntimeError('unknown AST type: {!r}'.format(ast.kind))
	else:
	return Contains(ast)
	import pytest
	from matcher import (AND, IDENT, LPAR, NOT, OR, RPAR, Lexer, Node, Parser,
	ParserException, get_matcher)


	@pytest.mark.parametrize('input, output', [
	('a + b', [(IDENT, 'a'), (AND, '+'), (IDENT, 'b')]),
	('((anderson and oreo) or not notton)', [
	(LPAR, '('),
	(LPAR, '('),
	(IDENT, 'anderson'),
	(AND, 'and'),
	(IDENT, 'oreo'),
	(RPAR, ')'),
	(OR, 'or'),
	(NOT, 'not'),
	(IDENT, 'notton'),
	(RPAR, ')'),
	]),
	('Dumb And Dumber', [(IDENT, 'Dumb'), (AND, 'And'), (IDENT, 'Dumber')]),
	(' a\n +\n b \n', [(IDENT, 'a'), (AND, '+'), (IDENT, 'b')]),
	('a+b', [(IDENT, 'a'), (AND, '+'), (IDENT, 'b')]),
	])
	def test_lexer(input, output):
	assert list(Lexer(input).generate()) == output


	@pytest.mark.parametrize('input, output', [
	('a + b', Node(AND, 'a', 'b')),
	('(a and b)', Node(AND, 'a', 'b')),
	('((a and ((b))))', Node(AND, 'a', 'b')),
	('((anderson and oreo) or not notton)',
	Node(OR,
	Node(AND, 'anderson', 'oreo'),
	Node(NOT, 'notton'))),
	('Dumb And Dumber', Node(AND, 'Dumb', 'Dumber')),
	('a and b or c', Node(OR, Node(AND, 'a', 'b'), 'c')),
	('a and (b or c)', Node(AND, 'a', Node(OR, 'b', 'c'))),
	('a or b and c', Node(OR, 'a', Node(AND, 'b', 'c'))),
	('a and b and c', Node(AND, 'a', 'b', 'c')),
	('a or b or c', Node(OR, 'a', 'b', 'c')),
	('not a or b', Node(OR, Node(NOT, 'a'), 'b')),
	('a or not b', Node(OR, 'a', Node(NOT, 'b'))),
	('not (a or b)', Node(NOT, Node(OR, 'a', 'b'))),
	('~a+b,c', Node(OR, Node(AND, Node(NOT, 'a'), 'b'), 'c')),
	('a,b,c,d', Node(OR, 'a', 'b', 'c', 'd')),
	('a+b+c+d', Node(AND, 'a', 'b', 'c', 'd')),
	('~a+b+~c+d', Node(AND, Node(NOT, 'a'), 'b', Node(NOT, 'c'), 'd')),
	('~a+b+c,~d',
	Node(OR,
	Node(AND, Node(NOT, 'a'), 'b', 'c'),
	Node(NOT, 'd'))),
	('(a,b)+c', Node(AND, Node(OR, 'a', 'b'), 'c')),
	])
	def test_parser(input, output):
	assert Parser(Lexer(input)).parse() == output


	@pytest.mark.parametrize('input, error', [
	('', "expected IDENT, NOT, or LPAR, got EOF at pos 1"),
	('a +', "expected IDENT, NOT, or LPAR, got EOF at pos 4"),
	('a a', "expected EOF, got IDENT at pos 3"),
	('a (and) b', "expected EOF, got LPAR at pos 3"),
	('and and and', "expected IDENT, NOT, or LPAR, got AND at pos 1"),
	('(())', "expected IDENT, NOT, or LPAR, got RPAR at pos 3"),
	])
	def test_parser_error(input, error):
	parser = Parser(Lexer(input))
	try:
	parser.parse()
	except ParserException as e:
	assert e.message == error
	else:
	assert False, 'exception should have been thrown'


	@pytest.mark.parametrize('input, arg, result', [
	('a', ['a'], True),
	('a or b', ['a'], True),
	('a or b', ['b'], True),
	('a or b', ['c'], False),
	('a and b or c', ['a'], False),
	('a and b or c', ['b'], False),
	('a and b or c', ['a', 'b'], True),
	('a and b or c', ['c'], True),
	('not a', ['a'], False),
	('not a', ['b'], True),
	('(not a or not b) and (c or d)', ['a'], False),
	('(not a or not b) and (c or d)', ['a', 'b'], False),
	('(not a or not b) and (c or d)', ['a', 'c'], True),
	('(not a or not b) and (c or d)', ['a', 'b', 'c'], False),
	('(not a or not b) and (c or d)', ['e'], False),
	('(not a or not b) and (c or d)', ['c'], True),
	('(not a or not b) and (c or d)', ['d'], True),
	('(not a or not b) and (c or d)', ['c', 'e'], True),
	])
	def test_get_matcher(input, arg, result):
	assert get_matcher(input)(arg) == result