Created
October 31, 2023 08:53
-
-
Save averykhoo/5a8a2506722a570390618dfbf027fb0e to your computer and use it in GitHub Desktop.
convert glob patterns to regex matchers, with support for globstar
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import warnings | |
from typing import Pattern | |
def _translate_char_set(pattern: str, start: int) -> tuple:
    """
    translate the character set whose opening "[" sits just before `start`

    :param pattern: the full (already globstar-normalized) glob pattern
    :param start: index of the first character after the opening "["
    :return: `(regex, next_index)` -- the regex fragment to emit and the index
             at which the caller should resume parsing; when the set is never
             closed, the fragment is a literal "[" and `next_index == start`
    """
    # find the closing bracket: an optional negation marker and a `]` in first
    # position are part of the set's content, so step over them before searching
    end = start
    if end < len(pattern) and pattern[end] in '!^':
        end += 1
    if end < len(pattern) and pattern[end] == ']':
        end += 1
    end = pattern.find(']', end)

    # no end to the character set, assume it's a literal `[` character
    if end == -1:
        return re.escape('['), start

    stuff = pattern[start:end]
    next_index = end + 1

    # a one-char set is just that literal character
    if len(stuff) == 1:
        return re.escape(stuff), next_index

    # warn that set operations are not supported
    # stacklevel=3 skips this helper and `translate_url_pattern`, so the warning
    # is attributed to the code that supplied the pattern
    unsupported = (
        ('--', 'character set difference not supported'),
        ('&&', 'character set intersection not supported'),
        ('||', 'character set union not supported'),
        ('~~', 'character set symmetric difference not supported'),
    )
    for operator, message in unsupported:
        if operator in stuff:
            warnings.warn(message, stacklevel=3)

    # both the posix `!` and the regex-style `^` negate the set
    negated = stuff[0] in '!^'
    members = stuff[1:] if negated else stuff

    # escape everything except `-`, which must stay bare to keep ranges working
    escaped = ''.join(char if char == '-' else re.escape(char) for char in members)
    prefix = '^' if negated else ''
    return f'[{prefix}{escaped}]', next_index


def translate_url_pattern(pattern: str,  # noqa: max-complexity: 20
                          case_sensitive: bool = True,
                          ) -> Pattern:
    r"""
    translates a url pattern (like a glob pattern) to a regular expression
    see test cases at the bottom of this file for examples of expected behavior

    the compiled regex is NOT anchored: use `.fullmatch(...)` to test a whole url/path

    supported notation:
    * `?` -------------- match any single character (except the path separator "/")
    * `*` -------------- match zero or more of any character (except the path separator "/")
    * `[a-z]` ---------- character set
    * `[?]` ------------ match a literal "?" char
    * `[*]` ------------ match a literal "*" char
    * `[!a-z123]` ------ negated character set (note: this can match "/")
    * `[^a-z123]` ------ alternative form of negated character set
    * `**` (globstar) -- match zero or more path segments (note: "a**b" matches "a*b" and "a*/**/*b")

    unsupported notation:
    * empty set "[]"
    * empty negated set "[!]" or "[^]"
    * backslash escapes (all "\" characters are treated as literal)
    * nested character sets
    * set operations within character sets (a warning is emitted for "--", "&&", "||", "~~")
    * "]" character within character sets, unless it's the first char (i.e. "[][]" ok, "[[]]" not ok)
    * "[" character outside of character sets (instead, use `[[]`)
    * named character classes (`[[:alpha:]]`), collating symbols (`[[.a-acute.]]`), or equivalence classes (`[[=a=]]`)
    * extglob notation

    :param pattern: the glob-like pattern to translate
    :param case_sensitive: if false, the regex is compiled with `re.IGNORECASE`
    :return: the compiled regular expression
    :raises re.error: if a character set produces an invalid regex (e.g. a reversed range like `[z-a]`)
    """
    # preprocessing: normalize the ** notation
    # 1. runs of 2+ stars collapse to a single globstar
    # 2. `/**` not followed by `/` becomes `/**/*`
    # 3. `**/` not preceded by `/` becomes `*/**/`
    pattern = re.sub(r'\*{2,}', '**', pattern)
    pattern = re.sub(r'/\*\*(?!/)', '/**/*', pattern)
    pattern = re.sub(r'(?<!/)\*\*/', '*/**/', pattern)

    # incrementally parse through the pattern
    i = 0
    n = len(pattern)
    out = []
    while i < n:
        # get current char
        c = pattern[i]
        i += 1

        if c == '*':
            if not pattern.startswith('*', i):
                # a single `*` matches any number of non-separator characters
                out.append('[^/]*')
            elif not pattern.startswith('/', i + 1):
                # a bare `**` matches any number of path segments plus a trailing partial segment
                i += 1
                out.append('(?:[^/]*/)*[^/]*')
            else:
                # `**/` has simpler handling, since it ends in a path segment
                # note also that `**/` has been expanded to `*/**/` in the preprocessing above
                i += 2
                out.append('(?:[^/]*/)*')

        elif c == '?':
            # `?` matches any single character except the path separator
            out.append('[^/]')

        elif c == '[':
            # `[...]` matches a set of characters (or a literal `[` if never closed)
            fragment, i = _translate_char_set(pattern, i)
            out.append(fragment)

        else:
            # literal character
            out.append(re.escape(c))

    flags = 0 if case_sensitive else re.IGNORECASE
    return re.compile(''.join(out), flags)
def test_cases():
    """Check the exact regex source produced for a table of representative patterns."""
    # each entry is (glob pattern, expected regex source)
    expectations = [
        # simple matching
        (r'abc', r'abc'),
        (r'a.b.c', r'a\.b\.c'),

        # character sets (cannot be escaped)
        (r'[a-z]', r'[a-z]'),
        (r'\[a-z]', r'\\[a-z]'),
        (r'[a-z\]', r'[a-z\\]'),
        (r'[a\-z]', r'[a\\-z]'),
        (r'[a-z123]', r'[a-z123]'),
        (r'[\-z]', r'[\\-z]'),
        (r'[][]', r'[\]\[]'),

        # negated character sets
        (r'[!a-z]', r'[^a-z]'),
        (r'[^a-z]', r'[^a-z]'),
        (r'[!!]', r'[^!]'),
        (r'[^^]', r'[^\^]'),

        # empty (negated) sets
        (r'[]', r'\[\]'),
        (r'[!]', r'\[!\]'),
        (r'[^]', r'\[\^\]'),

        # incomplete character set notation
        (r'[', r'\['),
        (r']', r'\]'),
        (r'[[]]', r'\[\]'),  # see r'[[]' below

        # single-char character sets
        (r'[a]', r'a'),
        (r'[?]', r'\?'),
        (r'[*]', r'\*'),
        (r'[\]', r'\\'),
        (r'[[]', r'\['),
        (r'[]]', r'\]'),

        # question mark (cannot be escaped)
        (r'?', r'[^/]'),
        (r'\?', r'\\[^/]'),
        (r'a?b?c', r'a[^/]b[^/]c'),

        # asterisk (cannot be escaped)
        (r'*', r'[^/]*'),
        (r'\*', r'\\[^/]*'),
        (r'*/', r'[^/]*/'),
        (r'/*', r'/[^/]*'),
        (r'/*/', r'/[^/]*/'),

        # globstar
        (r'**', '(?:[^/]*/)*[^/]*'),
        (r'**/', '[^/]*/(?:[^/]*/)*'),
        (r'/**', '/(?:[^/]*/)*[^/]*'),
        (r'/**/', '/(?:[^/]*/)*'),
    ]
    for glob_pattern, expected_regex in expectations:
        assert translate_url_pattern(glob_pattern).pattern == expected_regex
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment