Last active
August 29, 2015 13:56
-
-
Save erikrose/9233244 to your computer and use it in GitHub Desktop.
Regex parser in Parsimonious
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from parsimonious import Grammar # Get Parsimonious from https://pypi.python.org/pypi/parsimonious/. | |
# Grammar for a subset of Python's regex language. It deliberately leaves out
# lookaround assertions, non-greedy quantifiers, and named and other special
# sorts of groups: Lucene doesn't support those, though we might be able to
# fake them later via some transformation.
#
# The grammar text is a raw string so that the backslashes in the rules below
# reach Parsimonious exactly as written.
regex_grammar = Grammar(r"""
    regexp = branch another_branch*
    branch = piece*
    another_branch = "|" branch
    piece = quantified / atom
    quantified = atom quantifier
    quantifier = "*" / "+" / "?" / repeat
    repeat = "{" number ("," number?)? "}"
    number = ~r"\d+"

    # By making each parenthesized subexpr just a "regexp", visit_regexp can
    # assign group numbers, starting from 0, and the top-level expression
    # conveniently ends up in the conventional group 0.
    #
    # Optimize: vacuum up any harmless sequence of chars in one regex, first:
    # [^()[\]^$.?*+{}]+
    atom = ("(" regexp ")") / class / "^" / "$" / "." / char

    # Character classes are pretty complex little beasts, even though we're
    # just scanning right over them rather than trying to pull any info out:
    class = "[" (inverted_class_start / positive_class_start) initial_class_char class_char* "]"
    inverted_class_start = "^"
    positive_class_start = !"^"

    # An unescaped ] is treated as a literal when the first char of a positive
    # or inverted character class:
    initial_class_char = "]" / class_char
    class_char = backslash_char / ~r"[^\]]"

    char = backslash_char / literal_char
    backslash_char = "\\" backslash_operand
    backslash_operand = backslash_special / backslash_hex / backslash_normal

    # We require escaping ]{} even though these are tolerated unescaped by
    # Python's re parser:
    literal_char = ~r"[^^$?*+()[\]{}|.\\]"

    # Char class abbreviations and untypeable chars:
    backslash_special = ~r"[AbBdDsSwWZabefnrtv]"
    backslash_hex = ~r"x[0-9a-fA-F]{2}"

    # Normal char with no special meaning:
    backslash_normal = ~"."
    """)
# Smoke-test the grammar: parse a handful of sample patterns and print each
# resulting parse tree. The samples cover a quantifier, top-level alternation,
# a parenthesized group, empty branches (trailing, doubled, and leading "|"),
# and a backslash escape followed by a repeat count.
#
# print(...) with a single argument produces the same output under Python 2
# and Python 3, unlike the Python-2-only `print expr` statement form.
for _pattern in (
    'hello+ dolly',
    'hello+|hi',
    r'(hello|hi) dolly',
    r'(hello|hi|) dolly',
    r'(hello||hi) dolly',
    r'|hello|hi',
    r'about \d{2}',
):
    print(regex_grammar.parse(_pattern))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment