Skip to content

Instantly share code, notes, and snippets.

@jonashaag
Last active June 30, 2023 12:31
Show Gist options
  • Save jonashaag/b833d947958745bd4ca2ae6a4296185f to your computer and use it in GitHub Desktop.
Save jonashaag/b833d947958745bd4ca2ae6a4296185f to your computer and use it in GitHub Desktop.
Cython prematcher compiler
import textwrap
from dataclasses import dataclass
@dataclass
class Pattern:
    """A regular expression together with the substring hints that guard it.

    The generated matcher only runs the regex when at least one prematcher
    occurs in the input, so every string the regex can match must contain at
    least one of the prematchers.
    """

    # Regular expression source; emitted as a literal into a "re.compile()" call.
    pattern: str
    # Plain substrings probed with C "strstr" before the regex is tried.  The
    # generated matcher searches a lower-cased copy of the input with all
    # non-ASCII characters dropped, so a hint for "Rübe" has to read "rbe".
    prematchers: list[str]
# Cython source skeleton that is filled in with str.format().
#
# Placeholders:
#   pattern_cases      one guarded regex check per pattern; it sits inside the
#                      body of the generated "match" function, hence the
#                      4-space indent in front of it.
#   compiled_patterns  the "re.compile(...)" entries of the "_patterns" tuple.
CODE_TEMPLATE = """
# cython: language_level = 3
import re

cdef extern from "string.h":
    char* strstr(const char *haystack, const char *needle)

def match(s: str):
    cdef bytes s_bytes = s.lower().encode("ascii", "ignore")
    cdef const char *s_data = s_bytes
    matches = set()
    {pattern_cases}
    return [_patterns[idx] for idx in matches]

# Compiled re.Pattern objects, indexed by pattern position.
cdef tuple _patterns = (
{compiled_patterns}
)
"""
def pattern_case(pattern_idx: int, pattern: "Pattern") -> str:
    """Generate the Cython statement that checks a single pattern.

    The statement is a one-line ``if``: cheap ``strstr`` probes for the
    pattern's prematchers come first, and the regex search only runs when one
    of them hits.  On a regex match ``pattern_idx`` is added to ``matches``.

    Args:
        pattern_idx: Index of the pattern in the generated ``_patterns`` tuple.
        pattern: The pattern whose ``prematchers`` guard the regex search.

    Returns:
        One line of Cython code, without indentation or trailing newline.
    """
    regex_check = f"_patterns[{pattern_idx}].search(s) is not None"
    action = f"matches.add({pattern_idx})"
    if not pattern.prematchers:
        # Nothing to pre-filter on.  An empty guard would read "if () and ...",
        # which is always false, so the pattern could never match.
        return f"if {regex_check}: {action}"
    # The generated matcher searches a lower-cased copy of the input with all
    # non-ASCII characters dropped, so fold every needle the same way;
    # otherwise an upper-case or accented prematcher could never be found.
    needles = [p.lower().encode("ascii", "ignore") for p in pattern.prematchers]
    fast_cond = " or ".join(f"strstr(s_data, {needle!r}) is not NULL" for needle in needles)
    return f"if ({fast_cond}) and {regex_check}: {action}"
def gen_cython(patterns: "list[Pattern]") -> str:
    """Generate the Cython source of a prematcher-accelerated ``match`` function.

    Args:
        patterns: Patterns to compile in; a pattern's position in this list is
            its index in the generated ``_patterns`` tuple.

    Returns:
        ``CODE_TEMPLATE`` with both placeholders filled in and leading
        whitespace removed, so the ``# cython:`` comment is the first line.
    """
    # One "re.compile()" call per line.  Every entry ends in a comma so that a
    # single pattern still yields a one-element tuple instead of a plain
    # parenthesised expression.
    compiled_patterns = "\n".join(f"re.compile({pattern.pattern!r})," for pattern in patterns)
    # One "if (strstr ...)" case per line.  Continuation lines are indented
    # four spaces, the usual indent for the body of the generated "match"
    # function; the first line takes whatever indent precedes the placeholder
    # in the template.
    pattern_cases = "\n    ".join(
        pattern_case(pattern_idx, pattern) for pattern_idx, pattern in enumerate(patterns)
    )
    return CODE_TEMPLATE.format(
        compiled_patterns=textwrap.indent(compiled_patterns, "    "),
        pattern_cases=pattern_cases,
    ).lstrip()
if __name__ == "__main__":
    # Demo: print the generated Cython module for two sample patterns.
    patterns = [
        # Some date in March, e.g. Mar 25, 2023.  Raw strings keep "\d" from
        # being read as an (invalid) string escape sequence.
        Pattern(r"Mar \d\d?, \d\d\d\d", ["mar"]),
        # Counting carrots in German.  The prematchers are "rübe" / "möhre" as
        # they look after lower-casing and dropping non-ASCII characters.
        Pattern(r"\d+ (Rübe|Möhre)n?", ["rbe", "mhre"]),
    ]
    print(gen_cython(patterns))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment