Created
April 8, 2012 04:52
-
-
Save asv/2334780 to your computer and use it in GitHub Desktop.
A simple RFC 4515 (LDAP search filter string representation) parser built with funcparserlib
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
from funcparserlib.lexer import make_tokenizer | |
from funcparserlib.parser import some, skip, forward_decl, oneplus, maybe | |
# https://svn.process-one.net/tsung/trunk/src/lib/rfc4515_parser.erl | |
_ABNF_GRAMMAR_DESCRIPTION = r""" | |
filter = LPAREN filtercomp RPAREN | |
filtercomp = and / or / not / item | |
and = AMPERSAND filterlist | |
or = VERTBAR filterlist | |
not = EXCLAMATION filter | |
filterlist = 1*filter | |
item = simple / present / substring / extensible | |
simple = attr filtertype assertionvalue | |
filtertype = equal / approx / greaterorequal / lessorequal | |
equal = EQUALS | |
approx = TILDE EQUALS | |
greaterorequal = RANGLE EQUALS | |
lessorequal = LANGLE EQUALS | |
extensible = ( attr [dnattrs] [atchingrule] COLON EQUALS assertionvalue ) / ( [dnattrs] matchingrule COLON EQUALS assertionvalue ) | |
present = attr EQUALS ASTERISK | |
substring = attr EQUALS [initial] any [final] | |
initial = assertionvalue | |
any = ASTERISK *(assertionvalue ASTERISK) | |
final = assertionvalue | |
attr = attributedescription | |
attributedescription = attributetype options | |
attributetype = oid | |
oid = descr / numericoid | |
descr = keystring | |
keystring = leadkeychar *keychar | |
leadkeychar = ALPHA | |
keychar = ALPHA / DIGIT / HYPHEN | |
numericoid = number 1*( DOT number ) | |
number = DIGIT / ( LDIGIT 1*DIGIT ) | |
options = *( SEMI option ) | |
option = 1*keychar | |
dnattrs = COLON "dn" | |
matchingrule = COLON oid | |
assertionvalue = valueencoding | |
valueencoding = 0*(normal / escaped) | |
normal = UTF1SUBSET / UTFMB | |
escaped = ESC HEX HEX | |
UTF1SUBSET = %x01-27 / %x2B-5B / %x5D-7F | |
UTFMB = UTF2 / UTF3 / UTF4 | |
UTF0 = %x80-BF | |
UTF1 = %x00-7F | |
UTF2 = %xC2-DF UTF0 | |
UTF3 = %xE0 %xA0-BF UTF0 / %xE1-EC 2(UTF0) / %xED %x80-9F UTF0 / %xEE-EF 2(UTF0) | |
UTF4 = %xF0 %x90-BF 2(UTF0) / %xF1-F3 3(UTF0) / %xF4 %x80-8F 2(UTF0) | |
ESC = %x5C ; backslash ("\") | |
ALPHA = %x41-5A / %x61-7A ; "A"-"Z" / "a"-"z" | |
DIGIT = %x30 / LDIGIT ; "0"-"9" | |
LDIGIT = %x31-39 ; "1"-"9" | |
HEX = DIGIT / %x41-46 / %x61-66 ; "0"-"9" / "A"-"F" / "a"-"f" | |
EXCLAMATION = %x21 ; exclamation mark ("!") | |
AMPERSAND = %x26 ; ampersand (or AND symbol) ("&") | |
ASTERISK = %x2A ; asterisk ("*") | |
COLON = %x3A ; colon (":") | |
VERTBAR = %x7C ; vertical bar (or pipe) ("|") | |
TILDE = %x7E ; tilde ("~") | |
""" | |
def tokenize(s):
    """Split an RFC 4515 filter string into a list of funcparserlib Tokens.

    Whitespace tokens are dropped from the result.

    >>> tokenize('(test=abc)')
    [Token('LParen', '('), Token('AttrName', 'test'), Token('Equal', '='), Token('AttrValue', 'abc'), Token('RParen', ')')]
    """
    # Fragments of the grammar's 'valueencoding' rule:
    #   normal  -- one or more chars excluding NUL, '(', ')', '*' and '\'
    #   escaped -- ESC HEX HEX, i.e. a backslash followed by two hex digits
    # (The original had these two labels swapped; the matched language is
    # unchanged because both alternatives are OR-ed together in AttrValue.)
    # 'ur' string prefixes were dropped: they are invalid in Python 3 and
    # the patterns are pure ASCII, so raw str literals are equivalent.
    valueencodings = {
        'normal': r"""
        [\x01-\x27\x2B-\x5B\x5D-\x7F]+
        """,
        'escaped': r"""
        \\
        [0-9A-Fa-f]{2}
        """,
    }
    specs = [
        ('Space', (r'[ \t\r\n]+',)),
        ('And', (r'\x26',)),            # '&'
        ('Or', (r'\x7C',)),             # '|'
        ('Not', (r'\x21',)),            # '!'
        ('LParen', (r'\x28',)),         # '('
        ('RParen', (r'\x29',)),         # ')'
        ('Asterisk', (r'\x2A',)),       # '*'
        ('Equal', ('=',)),
        ('Approx', ('~=',)),
        ('Greaterorequal', ('>=',)),
        ('Lessorequal', ('<=',)),
        # attr: descr (keystring) or numericoid per RFC 4515
        ('AttrName', (r"""[a-zA-Z][a-zA-Z0-9-]* | [0-9][0-9.]+[0-9]""", re.VERBOSE)),
        ('AttrValue', (r'(%(normal)s | %(escaped)s)*' % valueencodings, re.VERBOSE)),
    ]
    # Token types to discard (the stray literal 0 in the original list was
    # dead data -- token types are always strings).
    useless = ['Space']
    t = make_tokenizer(specs)
    return [x for x in t(s) if x.type not in useless]
def parse(tokens):
    """Parse a token list (as produced by tokenize()) into a parse tree.

    The combinator grammar mirrors RFC 4515:
    filter = LPAREN filtercomp RPAREN, where filtercomp is an AND/OR/NOT
    combination or a single match item.
    """
    def tok(kind):
        # Parser accepting exactly one token of the given type.
        return some(lambda candidate: candidate.type == kind)

    # Forward declaration: 'filter' is recursive via and/or/not.
    ldap_filter = forward_decl()

    # Composite filters: '&'/'|' take one-or-more sub-filters, '!' takes one.
    subfilters = oneplus(ldap_filter)
    conjunction = tok('And') + subfilters
    disjunction = tok('Or') + subfilters
    negation = tok('Not') + ldap_filter

    # Building blocks for match items.
    attribute = tok('AttrName')
    eq = tok('Equal')
    star = tok('Asterisk')
    value = tok('AttrValue')
    comparator = eq | tok('Approx') | tok('Greaterorequal') | tok('Lessorequal')

    simple_match = attribute + comparator + value       # attr <op> value
    presence = attribute + eq + star                    # attr=*
    # NOTE(review): RFC 4515 'any' is ASTERISK *(assertionvalue ASTERISK);
    # this form only allows one optional trailing value-or-star, so
    # multi-'*' substrings may not parse -- confirm intended coverage.
    wildcard_core = star + maybe(value | star)
    substring_match = attribute + eq + maybe(value) + wildcard_core + maybe(value)

    match_item = simple_match | presence | substring_match
    body = conjunction | disjunction | negation | match_item
    ldap_filter.define(skip(tok('LParen')) + body + skip(tok('RParen')))
    return ldap_filter.parse(tokens)
if __name__ == '__main__':
    # Smoke test moved under the __main__ guard so importing this module
    # has no side effects (the original printed on import).
    print(tokenize('(test=abc)'))
    # TODO: exercise parse(tokenize(...)) once substring handling is verified.

    import doctest
    doctest.testmod(verbose=True, raise_on_error=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment