Skip to content

Instantly share code, notes, and snippets.

@erikrose
Last active August 29, 2015 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save erikrose/9233244 to your computer and use it in GitHub Desktop.
Save erikrose/9233244 to your computer and use it in GitHub Desktop.
Regex parser in Parsimonious
from parsimonious import Grammar # Get Parsimonious from https://pypi.python.org/pypi/parsimonious/.
# This recognizes a subset of Python's regex language, minus lookaround
# assertions, non-greedy quantifiers, and named and other special sorts of
# groups. Lucene doesn't support those, though we might be able to fake it
# later via some transformation.
# Grammar for the supported regex subset, compiled once at import time.
# The top-level rule is ``regexp``: one ``branch`` followed by any number of
# "|"-prefixed alternatives. Because ``branch`` is ``piece*`` it may match
# nothing, so empty alternatives (e.g. "a||b", "|a", "(a|)") are accepted.
regex_grammar = Grammar(r"""
regexp = branch another_branch*
branch = piece*
another_branch = "|" branch
piece = quantified / atom
quantified = atom quantifier
quantifier = "*" / "+" / "?" / repeat
repeat = "{" number ("," number?)? "}"
number = ~r"\d+"
# By making each parenthesized subexpr just a "regexp", visit_regexp can
# assign group numbers, starting from 0, and the top-level expression
# conveniently ends up in the conventional group 0.
atom = ("(" regexp ")") / class / "^" / "$" / "." / char # Optimize: vacuum up any harmless sequence of chars in one regex, first: [^()[\]^$.?*+{}]+
# Character classes are pretty complex little beasts, even though we're
# just scanning right over them rather than trying to pull any info out:
class = "[" (inverted_class_start / positive_class_start) initial_class_char class_char* "]"
inverted_class_start = "^"
positive_class_start = !"^"
# An unescaped ] is treated as a literal when the first char of a positive
# or inverted character class:
initial_class_char = "]" / class_char
class_char = backslash_char / ~r"[^\]]"
char = backslash_char / literal_char
backslash_char = "\\" backslash_operand
backslash_operand = backslash_special / backslash_hex / backslash_normal
# We require escaping ]{} even though these are tolerated unescaped by
# Python's re parser:
literal_char = ~r"[^^$?*+()[\]{}|.\\]"
# Char class abbreviations and untypeable chars:
backslash_special = ~r"[AbBdDsSwWZabefnrtv]"
backslash_hex = ~r"x[0-9a-fA-F]{2}"
# Normal char with no special meaning:
backslash_normal = ~"."
""")

# Smoke-test inputs, parsed and printed in order when the script runs.
sample_patterns = (
    'hello+ dolly',          # quantifier applied to a single literal char
    'hello+|hi',             # top-level alternation
    r'(hello|hi) dolly',     # alternation inside a group
    r'(hello|hi|) dolly',    # trailing empty alternative
    r'(hello||hi) dolly',    # empty alternative in the middle
    r'|hello|hi',            # leading empty alternative
    r'about \d{2}',          # backslash escape followed by a {n} repeat
)

# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare ``print x`` statement is a SyntaxError on Python 3.
for pattern in sample_patterns:
    print(regex_grammar.parse(pattern))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment