Last active
June 30, 2023 12:31
-
-
Save jonashaag/b833d947958745bd4ca2ae6a4296185f to your computer and use it in GitHub Desktop.
Cython prematcher compiler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import textwrap
from dataclasses import dataclass
@dataclass
class Pattern:
    """A regular expression together with its cheap substring prefilters.

    The code generator only runs the regex when at least one prematcher
    occurs in the input. The generated matcher searches a lowercased,
    ASCII-only copy of the input, so prematchers are compared against that
    form of the text.
    """

    # Regular expression source, passed verbatim to ``re.compile``.
    pattern: str
    # Substrings used as a fast prefilter before the regex is tried.
    prematchers: list[str]
# Skeleton of the generated Cython module.
#
# Placeholders filled in via ``str.format``:
#   {pattern_cases}     -- one "if (strstr ...) ..." line per pattern; lines
#                          after the first must already carry the 4-space
#                          indentation of the ``match`` body.
#   {compiled_patterns} -- the already-indented "re.compile(...)" entries of
#                          the ``_patterns`` tuple.
#
# The template starts with a newline for readability; the
# "# cython: language_level" directive only ends up at the top of the output
# once that leading whitespace is stripped from the formatted result.
#
# NOTE: ``strstr`` stops at the first NUL byte, so input text containing
# "\0" is only prefiltered up to that byte.
CODE_TEMPLATE = """
# cython: language_level = 3

import re

cdef extern from "string.h":
    char* strstr(const char *haystack, const char *needle)


def match(s: str):
    cdef bytes s_bytes = s.lower().encode("ascii", "ignore")
    cdef const char *s_data = s_bytes
    matches = set()

    {pattern_cases}

    return [_patterns[idx] for idx in sorted(matches)]


# Compiled re.Pattern objects, indexed by pattern number.
cdef tuple _patterns = (
{compiled_patterns}
)
"""
def pattern_case(pattern_idx: int, pattern: Pattern) -> str:
    """Generate code for a single accelerated pattern check.

    The returned line is a one-line Cython ``if`` statement that first tests
    the cheap ``strstr`` prematchers and only then runs the regex stored at
    ``_patterns[pattern_idx]``; on success it records ``pattern_idx`` in
    ``matches``.

    Args:
        pattern_idx: Index of the pattern inside the generated ``_patterns``
            tuple.
        pattern: The pattern whose ``prematchers`` guard the regex search.

    Returns:
        A single line of Cython source (no trailing newline).
    """
    # The generated matcher searches a lowercased, ASCII-only copy of the
    # input, so bring every needle into the same form; otherwise a prematcher
    # such as "Mar" or "Rübe" could never be found.
    needles = [
        prematcher.lower().encode("ascii", "ignore").decode("ascii")
        for prematcher in pattern.prematchers
    ]
    regex_check = f"_patterns[{pattern_idx}].search(s) is not None"
    action = f"matches.add({pattern_idx})"
    if not needles:
        # Without prematchers there is nothing to prefilter on. Emitting
        # "if () and ..." would test an empty tuple (always false) and the
        # pattern would silently never match, so run the regex directly.
        return f"if {regex_check}: {action}"
    fast_cond = " or ".join(
        f"strstr(s_data, {needle!r}) is not NULL" for needle in needles
    )
    return f"if ({fast_cond}) and {regex_check}: {action}"
def gen_cython(patterns: list[Pattern]) -> str:
    """Render the Cython source of a prefiltered multi-pattern matcher.

    Args:
        patterns: Patterns to compile into the generated module. A pattern's
            position in this list is the index used for it in the generated
            ``_patterns`` tuple.

    Returns:
        The generated Cython source, with leading whitespace stripped so the
        ``# cython:`` directive comment is at the very top.
    """
    # One "re.compile(...)," entry per pattern. Every entry ends with a comma
    # so the generated literal is a tuple even when there is exactly one
    # pattern; "(re.compile(...))" would just be a parenthesized expression
    # and could not be assigned to "cdef tuple _patterns".
    compiled_patterns = "\n".join(
        f"re.compile({pattern.pattern!r})," for pattern in patterns
    )
    # One "if (strstr ...)" line per pattern. The separator re-creates the
    # indentation of the ``match`` body for every line after the first; the
    # first line is indented by the template itself.
    pattern_cases = "\n    ".join(
        pattern_case(pattern_idx, pattern)
        for pattern_idx, pattern in enumerate(patterns)
    )
    return CODE_TEMPLATE.format(
        compiled_patterns=textwrap.indent(compiled_patterns, "    "),
        pattern_cases=pattern_cases,
    ).lstrip()
if __name__ == "__main__":
    # Demo: print the generated Cython module for two sample patterns.
    # The regexes are raw strings so that "\d" reaches ``re`` unchanged
    # instead of being an invalid string escape sequence.
    patterns = [
        # Some date in March, eg. Mar 25, 2023
        Pattern(r"Mar \d\d?, \d\d\d\d", ["mar"]),
        # Counting carrots in German. The prematchers are the lowercased
        # words with the non-ASCII letters dropped.
        Pattern(r"\d+ (Rübe|Möhre)n?", ["rbe", "mhre"]),
    ]
    print(gen_cython(patterns))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment