Skip to content

Instantly share code, notes, and snippets.

@mewo2
Created November 24, 2015 15:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mewo2/b03d07f512692c250c46 to your computer and use it in GitHub Desktop.
Save mewo2/b03d07f512692c250c46 to your computer and use it in GitHub Desktop.
Making regexes out of wordlists
def sexpr(words):
assert words
if len(words) == 1:
return "lit", words[0]
if "" in words:
r = sexpr([w for w in words if w])
return "maybe", r
heads = [w[0] for w in words]
tails = [w[1:] for w in words]
if len(set(heads)) == 1:
return "cons", (heads[0], sexpr(tails))
poss = sorted(set(heads))
subregexes = [sexpr([w for h, w in zip(heads, words) if h==p]) for p in poss]
if all(x[0] == "lit" and len(x[1]) == 1 for x in subregexes):
return "charset", tuple(x[1] for x in subregexes)
return "choice", tuple(subregexes)
def toregex(s):
head, tail = s
if head == "lit":
return tail
elif head == "maybe":
r = toregex(tail)
if (tail[0] == "lit" and len(tail[1]) == 1) or tail[0] in ["charset", "maybe"]:
return r + "?"
else:
return "(" + r + ")?"
elif head == "cons":
r = toregex(tail[1])
if tail[1][0] == "choice":
r = "(" + r + ")"
return tail[0] + r
elif head == "choice":
rs = []
for x in tail:
r = toregex(x)
if x[0] not in ["charset", "maybe", "lit", "cons"]:
r = "(" + r + ")"
rs.append(r)
return "|".join(rs)
elif head == "charset":
return "[" + "".join(tail) + "]"
def regexify(words):
return toregex(sexpr(words))
print regexify(['unsavoriness', 'unsavory', 'unsawed', 'unsawn', 'unsay'])
# unsa(vor(iness|y)|w(ed|n)|y)
print regexify(['filmily', 'filminess', 'filmish', 'filmist', 'filmize'])
# filmi(ly|ness|s[ht]|ze)
print regexify(['bead', 'beaded', 'beader', 'beadflush', 'beadhouse'])
# bead(e[dr]|flush|house)?
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment