Created
July 24, 2018 02:17
-
-
Save perkinslr/209f892b2d760dae184147f902d5f3ae to your computer and use it in GitHub Desktop.
Regex Specificity Matcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
def _exapnd_regex(a, p, out_strings): | |
one = False | |
last = 0 | |
d = p.finditer(a) | |
for e in d: | |
one = True | |
grp = e.group() | |
if grp.startswith('(') and '){' in grp: | |
grp = grp[1:] | |
pattern, iter_name = grp.split('){') | |
else: | |
pattern, iter_name = grp.split('{') | |
if iter_name.endswith('}'): | |
iter_name = iter_name[:-1] | |
min_iter, max_iter = iter_name.split(',') | |
min_iter = int(min_iter) | |
max_iter = int(max_iter) | |
if min_iter == max_iter: | |
pattern *= max_iter | |
for i, s in enumerate(out_strings): | |
out_strings[i] = s + a[last:e.start()] + pattern | |
else: | |
if min_iter > max_iter: | |
raise Exception("Cannot expand regex: " + a) | |
else: | |
old_out_strings = out_strings[:] | |
out_strings.clear() | |
for z in range(min_iter, max_iter + 1): | |
for i in old_out_strings: | |
out_strings.append(i + a[last:e.start()] + pattern * z) | |
last = e.end() | |
return one, last | |
def expand_regex(a): | |
out_strings = [''] | |
b = re.compile('.\{[0-9]*,[0-9]*\}') | |
c = re.compile('\(.*?\)\{[0-9]*,[0-9]*\}') | |
one, last = _exapnd_regex(a, c, out_strings) | |
if one: | |
for i in range(len(out_strings)): | |
out_strings[i] += a[last:] | |
else: | |
one, last = _exapnd_regex(a, b, out_strings) | |
if one: | |
for i in range(len(out_strings)): | |
out_strings[i] += a[last:] | |
else: | |
yield a | |
return | |
for i in out_strings: | |
yield from expand_regex(i) | |
class u_set(set): | |
def __add__(self, other): | |
return self.__class__(self.union(other)) | |
class RegexElement(object): | |
def __init__(self, string): | |
self.string = string | |
self.supersets = u_set() | |
self.subsets = u_set() | |
self.disjoints = u_set() | |
self.intersects = u_set() | |
self.maybes = [] | |
self.precompilation = {} | |
self.compiled = re.compile(string, re.IGNORECASE) | |
def __hash__(self): | |
return hash(self.string) | |
SUPERSET = 'SUPERSET' | |
SUBSET = 'SUBSET' | |
INTERSECT = 'INTERSECT' | |
DISJOINT = 'DISJOINT' | |
EQUAL = 'EQUAL' | |
class SubsetGraph(object): | |
def __init__(self, tests): | |
self.regexps = [] | |
self.tests = tests | |
self._dirty = True | |
self._roots = None | |
@property | |
def roots(self): | |
if self._dirty: | |
r = self._roots = [i for i in self.regexps if not i.supersets] | |
return r | |
return self._roots | |
def add_regex(self, new_regex): | |
roots = self.roots | |
new_re = RegexElement(new_regex) | |
for element in roots: | |
self.process(new_re, element) | |
self.regexps.append(new_re) | |
def process(self, new_re, element): | |
relationship = self.compare(new_re, element) | |
if relationship: | |
getattr(self, 'add_' + relationship)(new_re, element) | |
def add_SUPERSET(self, new_re, element): | |
for i in element.subsets: | |
i.supersets.add(new_re) | |
new_re.subsets.add(i) | |
element.supersets.add(new_re) | |
new_re.subsets.add(element) | |
def add_SUBSET(self, new_re, element): | |
for i in element.subsets: | |
self.process(new_re, i) | |
element.subsets.add(new_re) | |
new_re.supersets.add(element) | |
def add_DISJOINT(self, new_re, element): | |
for i in element.subsets: | |
i.disjoints.add(new_re) | |
new_re.disjoints.add(i) | |
new_re.disjoints.add(element) | |
element.disjoints.add(new_re) | |
def add_INTERSECT(self, new_re, element): | |
for i in element.subsets: | |
self.process(new_re, i) | |
new_re.intersects.add(element) | |
element.intersects.add(new_re) | |
def add_EQUAL(self, new_re, element): | |
new_re.supersets = element.supersets.copy() | |
new_re.subsets = element.subsets.copy() | |
new_re.disjoints = element.disjoints.copy() | |
new_re.intersects = element.intersects.copy() | |
def compare(self, a, b): | |
for test in self.tests: | |
result = test(a.string, b.string) | |
if result: | |
return result | |
def match(self, text, strict=True): | |
matches = set() | |
self._match(text, self.roots, matches) | |
out = [] | |
for e in matches: | |
for s in e.subsets: | |
if s in matches: | |
break | |
else: | |
out.append(e) | |
if strict and len(out) > 1: | |
for i in out: | |
print(i.string) | |
raise Exception("Multiple equally specific matches found for " + text) | |
return out | |
def _match(self, text, elements, matches): | |
new_elements = [] | |
for element in elements: | |
m = element.compiled.match(text) | |
if m: | |
matches.add(element) | |
new_elements.extend(element.subsets) | |
if new_elements: | |
self._match(text, new_elements, matches) | |
def check_equal(a, b): | |
if a == b: | |
return EQUAL | |
def check_prefixes(a, b): | |
for i in range(len(a)): | |
if i == len(b): | |
break | |
if a[i] in "(.[" or b[i] in "(.[)": | |
break | |
if a[i] != b[i]: | |
return DISJOINT | |
def check_suffixes(a, b): | |
for i in range(len(a)): | |
if i == len(b): | |
break | |
if a[-i - 1] in ").]*+}?": | |
try: | |
if a[-i - 2] == "\\": | |
continue | |
except IndexError: | |
pass | |
break | |
elif b[-i - 1] in ").]*+}?": | |
try: | |
if b[-i - 2] == "\\": | |
continue | |
except IndexError: | |
pass | |
break | |
elif a[-i - 1] != b[-i - 1]: | |
return DISJOINT | |
def check_dotstar(a, b): | |
astar = False | |
bstar = False | |
if a.endswith(".*"): | |
a = a[:-2] | |
astar = True | |
elif a.endswith("(/.*)?"): | |
a = a[:-6] | |
astar = True | |
if b.endswith(".*"): | |
b = b[:-2] | |
bstar = True | |
elif b.endswith("(/.*)?"): | |
b = b[:-6] | |
bstar = True | |
if astar and bstar: | |
if a.startswith(b): | |
return SUBSET | |
elif b.startswith(a): | |
return SUPERSET | |
elif bstar and a.startswith(b): | |
return SUBSET | |
elif astar and b.startswith(a): | |
return SUPERSET | |
def check_fixed(a, b): | |
a_fixed = True | |
b_fixed = True | |
i = 0 | |
while i < len(a): | |
if a[i] == "\\": | |
i += 1 | |
if a[i] in r"([.*{?+": | |
a_fixed = False | |
i += 1 | |
i = 0 | |
while i < len(b): | |
if b[i] == "\\": | |
i += 1 | |
if b[i] in r"([.*{?+": | |
b_fixed = False | |
i += 1 | |
if a_fixed and not b_fixed: | |
if re.match(b, a): | |
return SUBSET | |
elif b_fixed and not a_fixed: | |
if re.match(a, b): | |
return SUPERSET | |
def check_match(a, b): | |
if re.compile(a).match(b.rstrip('$').lstrip('^')): | |
return SUPERSET | |
if re.compile(b).match(a.rstrip('$').lstrip('^')): | |
return SUBSET | |
def check_expand(a, b): | |
c = list(expand_regex(a)) | |
d = list(expand_regex(b)) | |
bina = False | |
ainb = False | |
allainb = True | |
allbina = True | |
for i in c: | |
if i in d: | |
ainb = True | |
else: | |
allainb = False | |
for i in d: | |
if i in c: | |
bina = True | |
else: | |
allbina = False | |
if allainb and allbina: | |
return EQUAL | |
if allainb and not allbina: | |
return SUBSET | |
if allbina and not allainb: | |
return SUPERSET | |
if ainb or bina: | |
return INTERSECT | |
default_tests = [check_expand, check_prefixes, check_fixed, check_dotstar, check_suffixes, check_equal, check_match] | |
if __name__=='__main__': | |
sg = SubsetGraph(default_tests) | |
sg.add_regex('..:..:..:..:..:..') | |
sg.add_regex('^00:11:22:..:..:..') | |
sg.add_regex('..:..:..:33:44:55$') | |
print([i.string for i in sg.match('00:11:22:33:44:55', False)]) | |
sg.add_regex('^00:11:22:33:44:55$') | |
print([i.string for i in sg.match('00:11:22:33:44:55', True)]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment