Skip to content

Instantly share code, notes, and snippets.

@averykhoo
Created October 31, 2023 08:53
Show Gist options
  • Save averykhoo/5a8a2506722a570390618dfbf027fb0e to your computer and use it in GitHub Desktop.
Save averykhoo/5a8a2506722a570390618dfbf027fb0e to your computer and use it in GitHub Desktop.
convert glob patterns to regex matchers, with support for globstar
import re
import warnings
from typing import Pattern
# (operator, name) pairs for regex-style set operations that are NOT interpreted;
# their characters are simply treated as literal members of the set
_UNSUPPORTED_SET_OPERATIONS = (
    ('--', 'difference'),
    ('&&', 'intersection'),
    ('||', 'union'),
    ('~~', 'symmetric difference'),
)


def _find_char_set_end(pattern: str, start: int) -> int:
    """Return the index of the `]` closing the set opened just before `start`, or -1 if unclosed."""
    j = start
    # a leading negation marker is not a set member
    if pattern.startswith(('!', '^'), j):
        j += 1
    # a `]` in first position is a literal member, not the terminator (so `[]]` and `[!]]` work)
    if pattern.startswith(']', j):
        j += 1
    return pattern.find(']', j)


def _translate_char_set(stuff: str) -> str:
    """Translate the (non-empty) text between `[` and `]` into the equivalent regex fragment."""
    # a one-char set is just that literal character
    if len(stuff) == 1:
        return re.escape(stuff)

    # warn that set operations are not supported
    # stacklevel=3 skips this helper and `translate_url_pattern`, so the warning is
    # attributed to the code that supplied the pattern
    for operator, name in _UNSUPPORTED_SET_OPERATIONS:
        if operator in stuff:
            warnings.warn(f'character set {name} not supported', stacklevel=3)

    # both the posix `!` and the regex `^` negate the set
    negated = stuff[0] in '!^'
    members = stuff[1:] if negated else stuff

    # escape everything except `-`, which must stay bare so that ranges like `a-z` keep working
    escaped = ''.join(char if char == '-' else re.escape(char) for char in members)
    prefix = '^' if negated else ''
    return f'[{prefix}{escaped}]'


def translate_url_pattern(pattern: str,
                          case_sensitive: bool = True,
                          ) -> Pattern:
    r"""
    translates a url pattern (like a glob pattern) to a compiled regular expression

    the returned regex contains no anchors, so use `.fullmatch()` to test a whole url
    see `test_cases` for examples of the exact regex produced

    supported notation:
    * `?` -------------- match any single character (except the path separator "/")
    * `*` -------------- match zero or more of any character (except the path separator "/")
    * `[a-z]` ---------- character set
    * `[?]` ------------ match a literal "?" char
    * `[*]` ------------ match a literal "*" char
    * `[!a-z123]` ------ negated character set
    * `[^a-z123]` ------ alternative form of negated character set
    * `**` (globstar) -- match zero or more path segments (note: "a**b" matches "a*b" and "a*/**/*b")

    unsupported notation:
    * empty set "[]"
    * empty negated set "[!]" or "[^]"
    * backslash escapes (every `\` character is treated as a literal backslash)
    * nested character sets
    * set operations within character sets (a warning is emitted, the chars are taken literally)
    * "]" character within character sets, unless it's the first char (i.e. "[][]" ok, "[[]]" not ok)
    * "[" character outside of character sets (instead, use `[[]`)
    * named character classes (`[[:alpha:]]`), collating symbols (`[[.a-acute.]]`), or equivalence classes (`[[=a=]]`)
    * extglob notation

    :param pattern: the glob-like url pattern to translate
    :param case_sensitive: if False, the regex is compiled with `re.IGNORECASE`
    :return: the compiled regular expression
    :raises re.error: if a character set does not form a valid regex set (e.g. the reversed range `[z-a]`)
    """
    # preprocessing: normalize the ** notation
    pattern = re.sub(r'\*{2,}', '**', pattern)  # any longer run of `*` is the same as `**`
    pattern = re.sub(r'/\*\*(?!/)', '/**/*', pattern)  # `/**x` -> `/**/*x`
    pattern = re.sub(r'(?<!/)\*\*/', '*/**/', pattern)  # `x**/` -> `x*/**/`

    # incrementally parse through the pattern
    i = 0
    n = len(pattern)
    out = []
    while i < n:
        # get current char
        c = pattern[i]
        i += 1

        if c == '*':
            if not pattern.startswith('*', i):
                # a single `*` stays within one path segment
                out.append('[^/]*')
            elif not pattern.startswith('/', i + 1):
                # `**` not followed by `/`: any number of segments, then a trailing partial segment
                i += 1
                out.append('(?:[^/]*/)*[^/]*')
            else:
                # `**/` consumes whole segments only; the preprocessing above guarantees
                # that it is either at a segment boundary or was expanded to `*/**/`
                i += 2
                out.append('(?:[^/]*/)*')
            continue

        # `?` matches any single character within a path segment
        if c == '?':
            out.append('[^/]')
            continue

        # `[...]` matches a set of characters
        if c == '[':
            end = _find_char_set_end(pattern, i)
            if end >= 0:
                out.append(_translate_char_set(pattern[i:end]))
                # jump forwards to after the character set
                i = end + 1
                continue
            # no end to the character set: fall through and treat `[` as a literal character

        # literal character
        out.append(re.escape(c))

    flags = 0 if case_sensitive else re.IGNORECASE
    return re.compile(''.join(out), flags=flags)
def test_cases():
    """Check the exact regex source produced by `translate_url_pattern` for a range of patterns."""

    def expect(glob, regex):
        # the translation is compared on the regex source, not on matching behaviour
        assert translate_url_pattern(glob).pattern == regex

    # plain text: only regex metacharacters get escaped
    expect(r'abc', r'abc')
    expect(r'a.b.c', r'a\.b\.c')

    # character sets: a backslash is an ordinary member and never escapes anything
    expect(r'[a-z]', r'[a-z]')
    expect(r'\[a-z]', r'\\[a-z]')
    expect(r'[a-z\]', r'[a-z\\]')
    expect(r'[a\-z]', r'[a\\-z]')
    expect(r'[a-z123]', r'[a-z123]')
    expect(r'[\-z]', r'[\\-z]')
    expect(r'[][]', r'[\]\[]')

    # negation via `!` or `^`
    expect(r'[!a-z]', r'[^a-z]')
    expect(r'[^a-z]', r'[^a-z]')
    expect(r'[!!]', r'[^!]')
    expect(r'[^^]', r'[^\^]')

    # empty sets (plain or negated) degrade to literal text
    expect(r'[]', r'\[\]')
    expect(r'[!]', r'\[!\]')
    expect(r'[^]', r'\[\^\]')

    # unbalanced brackets are literals too
    expect(r'[', r'\[')
    expect(r']', r'\]')
    expect(r'[[]]', r'\[\]')  # i.e. the set `[[]` followed by a literal `]`

    # a set holding one character collapses to that (escaped) character
    expect(r'[a]', r'a')
    expect(r'[?]', r'\?')
    expect(r'[*]', r'\*')
    expect(r'[\]', r'\\')
    expect(r'[[]', r'\[')
    expect(r'[]]', r'\]')

    # `?` is always a wildcard, even after a backslash
    expect(r'?', r'[^/]')
    expect(r'\?', r'\\[^/]')
    expect(r'a?b?c', r'a[^/]b[^/]c')

    # `*` is always a wildcard, even after a backslash
    expect(r'*', r'[^/]*')
    expect(r'\*', r'\\[^/]*')
    expect(r'*/', r'[^/]*/')
    expect(r'/*', r'/[^/]*')
    expect(r'/*/', r'/[^/]*/')

    # globstar, alone and next to path separators
    expect(r'**', '(?:[^/]*/)*[^/]*')
    expect(r'**/', '[^/]*/(?:[^/]*/)*')
    expect(r'/**', '/(?:[^/]*/)*[^/]*')
    expect(r'/**/', '/(?:[^/]*/)*')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment