Created
November 24, 2015 15:15
-
-
Save mewo2/b03d07f512692c250c46 to your computer and use it in GitHub Desktop.
Making regexes out of wordlists
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sexpr(words): | |
assert words | |
if len(words) == 1: | |
return "lit", words[0] | |
if "" in words: | |
r = sexpr([w for w in words if w]) | |
return "maybe", r | |
heads = [w[0] for w in words] | |
tails = [w[1:] for w in words] | |
if len(set(heads)) == 1: | |
return "cons", (heads[0], sexpr(tails)) | |
poss = sorted(set(heads)) | |
subregexes = [sexpr([w for h, w in zip(heads, words) if h==p]) for p in poss] | |
if all(x[0] == "lit" and len(x[1]) == 1 for x in subregexes): | |
return "charset", tuple(x[1] for x in subregexes) | |
return "choice", tuple(subregexes) | |
def toregex(s): | |
head, tail = s | |
if head == "lit": | |
return tail | |
elif head == "maybe": | |
r = toregex(tail) | |
if (tail[0] == "lit" and len(tail[1]) == 1) or tail[0] in ["charset", "maybe"]: | |
return r + "?" | |
else: | |
return "(" + r + ")?" | |
elif head == "cons": | |
r = toregex(tail[1]) | |
if tail[1][0] == "choice": | |
r = "(" + r + ")" | |
return tail[0] + r | |
elif head == "choice": | |
rs = [] | |
for x in tail: | |
r = toregex(x) | |
if x[0] not in ["charset", "maybe", "lit", "cons"]: | |
r = "(" + r + ")" | |
rs.append(r) | |
return "|".join(rs) | |
elif head == "charset": | |
return "[" + "".join(tail) + "]" | |
def regexify(words): | |
return toregex(sexpr(words)) | |
print regexify(['unsavoriness', 'unsavory', 'unsawed', 'unsawn', 'unsay']) | |
# unsa(vor(iness|y)|w(ed|n)|y) | |
print regexify(['filmily', 'filminess', 'filmish', 'filmist', 'filmize']) | |
# filmi(ly|ness|s[ht]|ze) | |
print regexify(['bead', 'beaded', 'beader', 'beadflush', 'beadhouse']) | |
# bead(e[dr]|flush|house)? |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment