erikrose/gist:9233244

## gistfile1.py
from parsimonious import Grammar  # Get Parsimonious from https://pypi.python.org/pypi/parsimonious/.


# PEG grammar (written in Parsimonious's rule syntax) that recognizes a subset
# of Python's regex language, minus lookaround assertions, non-greedy
# quantifiers, and named and other special sorts of groups. Lucene doesn't
# support those, though we might be able to fake it later via some
# transformation.
#
# Shape of the grammar, top down:
#   regexp  -- one branch followed by any number of "|"-prefixed branches
#   branch  -- zero or more pieces, so empty alternatives such as "(a|)" or a
#              leading "|" are accepted
#   piece   -- an atom, optionally followed by a quantifier
#              (*, +, ?, or a {n} / {n,} / {n,m} repeat)
#   atom    -- a parenthesized regexp, a character class, an anchor (^ or $),
#              ".", or a single (possibly backslash-escaped) character
#
# The grammar text is a raw string, so the backslashes inside it reach
# Parsimonious exactly as written here.
regex_grammar = Grammar(r"""
    regexp = branch another_branch*
    branch = piece*
    another_branch = "|" branch
    piece = quantified / atom
    quantified = atom quantifier
    quantifier = "*" / "+" / "?" / repeat
    repeat = "{" number ("," number?)? "}"
    number = ~r"\d+"

    # By making each parenthesized subexpr just a "regexp", visit_regexp can
    # assign group numbers, starting from 0, and the top-level expression
    # conveniently ends up in the conventional group 0.
    atom = ("(" regexp ")") / class / "^" / "$" / "." / char  # Optimize: vacuum up any harmless sequence of chars in one regex, first: [^()[\]^$.?*+{}]+

    # Character classes are pretty complex little beasts, even though we're
    # just scanning right over them rather than trying to pull any info out:
    class = "[" (inverted_class_start / positive_class_start) initial_class_char class_char* "]"
    inverted_class_start = "^"
    positive_class_start = !"^"
    # An unescaped ] is treated as a literal when the first char of a positive
    # or inverted character class:
    initial_class_char = "]" / class_char
    class_char = backslash_char / ~r"[^\]]"

    char = backslash_char / literal_char
    backslash_char = "\\" backslash_operand
    backslash_operand = backslash_special / backslash_hex / backslash_normal
    # We require escaping ]{} even though these are tolerated unescaped by
    # Python's re parser:
    literal_char = ~r"[^^$?*+()[\]{}|.\\]"
    # Char class abbreviations and untypeable chars:
    backslash_special = ~r"[AbBdDsSwWZabefnrtv]"
    backslash_hex = ~r"x[0-9a-fA-F]{2}"
    # Normal char with no special meaning:
    backslash_normal = ~"."
    """)


# Smoke test: parse a handful of sample patterns and print what the grammar
# returns for each. print() is called with exactly one argument, so the output
# is the same whether this runs as a Python 2 print statement with a
# parenthesized expression or as the Python 3 print function.
for sample in (
        'hello+ dolly',         # quantified atom in the middle of a branch
        'hello+|hi',            # top-level alternation
        r'(hello|hi) dolly',    # alternation inside a group
        r'(hello|hi|) dolly',   # trailing empty alternative
        r'(hello||hi) dolly',   # empty alternative in the middle
        r'|hello|hi',           # leading empty alternative
        r'about \d{2}',         # backslash escape followed by a {n} repeat
):
    print(regex_grammar.parse(sample))
	from parsimonious import Grammar # Get Parsimonious from https://pypi.python.org/pypi/parsimonious/.


	# PEG grammar (written in Parsimonious's rule syntax) that recognizes a
	# subset of Python's regex language, minus lookaround assertions, non-greedy
	# quantifiers, and named and other special sorts of groups. Lucene doesn't
	# support those, though we might be able to fake it later via some
	# transformation.
	#
	# Shape of the grammar, top down:
	#   regexp  -- one branch followed by any number of "|"-prefixed branches
	#   branch  -- zero or more pieces, so empty alternatives are accepted
	#   piece   -- an atom, optionally followed by a quantifier
	#              (*, +, ?, or a {n} / {n,} / {n,m} repeat)
	#   atom    -- a parenthesized regexp, a character class, an anchor (^ or
	#              $), ".", or a single (possibly backslash-escaped) character
	#
	# The alternation separator in another_branch must be the bare "|": a
	# backslash-escaped pipe is already consumed by branch as an ordinary
	# escaped character (backslash_char), so a rule that demanded a backslash
	# before the pipe could never match.
	regex_grammar = Grammar(r"""
	regexp = branch another_branch*
	branch = piece*
	another_branch = "|" branch
	piece = quantified / atom
	quantified = atom quantifier
	quantifier = "*" / "+" / "?" / repeat
	repeat = "{" number ("," number?)? "}"
	number = ~r"\d+"

	# By making each parenthesized subexpr just a "regexp", visit_regexp can
	# assign group numbers, starting from 0, and the top-level expression
	# conveniently ends up in the conventional group 0.
	atom = ("(" regexp ")") / class / "^" / "$" / "." / char # Optimize: vacuum up any harmless sequence of chars in one regex, first: [^()[\]^$.?*+{}]+

	# Character classes are pretty complex little beasts, even though we're
	# just scanning right over them rather than trying to pull any info out:
	class = "[" (inverted_class_start / positive_class_start) initial_class_char class_char* "]"
	inverted_class_start = "^"
	positive_class_start = !"^"
	# An unescaped ] is treated as a literal when the first char of a positive
	# or inverted character class:
	initial_class_char = "]" / class_char
	class_char = backslash_char / ~r"[^\]]"

	char = backslash_char / literal_char
	backslash_char = "\\" backslash_operand
	backslash_operand = backslash_special / backslash_hex / backslash_normal
	# We require escaping ]{} even though these are tolerated unescaped by
	# Python's re parser:
	literal_char = ~r"[^^$?*+()[\]{}\|.\\]"
	# Char class abbreviations and untypeable chars:
	backslash_special = ~r"[AbBdDsSwWZabefnrtv]"
	backslash_hex = ~r"x[0-9a-fA-F]{2}"
	# Normal char with no special meaning:
	backslash_normal = ~"."
	""")


	# Smoke test: parse a handful of sample patterns and print what the grammar
	# returns for each. The samples use a bare "|" so that they exercise the
	# alternation rule. print() is called with exactly one argument, so the
	# output is the same on Python 2 and Python 3.
	for sample in (
			'hello+ dolly',         # quantified atom in the middle of a branch
			'hello+|hi',            # top-level alternation
			r'(hello|hi) dolly',    # alternation inside a group
			r'(hello|hi|) dolly',   # trailing empty alternative
			r'(hello||hi) dolly',   # empty alternative in the middle
			r'|hello|hi',           # leading empty alternative
			r'about \d{2}',         # backslash escape followed by a {n} repeat
	):
		print(regex_grammar.parse(sample))