Skip to content

Instantly share code, notes, and snippets.

@Quacky2200
Forked from mdeous/repermute.py
Last active February 13, 2024 18:31
Show Gist options
  • Save Quacky2200/714acad06f3f80f6bdb92d7d49dea4bf to your computer and use it in GitHub Desktop.
Save Quacky2200/714acad06f3f80f6bdb92d7d49dea4bf to your computer and use it in GitHub Desktop.
Generate all possible permutations of a regex (Python v3)
# -*- coding: utf-8 -*-
#
# Used like this:
#
# from repermute import ipermute
#
# for s in ipermute('[A-Z]\d'):
# print s
#
# Almost all regular expression constructs are supported except for '*'
# (which in the Python sre_parse implementation matches 0 to 65535
# times), '+' and '^'. Non-ASCII characters doesn't work, but I think
# that is a limitation in the sre_parse module. It works by first
# parsing the regexp to string sequences so that [A-Z] becomes ['A',
# 'B', ... 'Z'], \d becomes ['1', ... , '9']. Then a Cartesian join is
# applied on all string sequences to get all possible permutations of
# them all.
import itertools
from sre_constants import *
import sre_parse
import string
category_chars = {
CATEGORY_DIGIT : string.digits,
CATEGORY_SPACE : string.whitespace,
CATEGORY_WORD : string.digits + string.ascii_letters + '_'
}
def unique_extend(res_list, list):
for item in list:
if item not in res_list:
res_list.append(item)
def handle_any(val):
"""
This is different from normal regexp matching. It only matches
printable ASCII characters.
"""
return string.printable
def handle_branch(_tuple):
(tok, val) = _tuple
all_opts = []
for toks in val:
opts = permute_toks(toks)
unique_extend(all_opts, opts)
return all_opts
def handle_category(val):
return list(category_chars[val])
def handle_in(val):
out = []
for tok, val in val:
out += handle_tok(tok, val)
return out
def handle_literal(val):
return [chr(val)]
def handle_max_repeat(repeat):
(min, max, val) = repeat
"""
Handle a repeat token such as {x,y} or ?.
"""
subtok, subval = val[0]
if max > 5000:
# max is the number of cartesian join operations needed to be
# carried out. More than 5000 consumes way to much memory.
raise ValueError("To many repetitions requested (%d)" % max)
optlist = handle_tok(subtok, subval)
iterlist = []
for x in range(min, max + 1):
joined = join([optlist] * x)
iterlist.append(joined)
return (''.join(it) for it in itertools.chain(*iterlist))
def handle_range(val):
lo, hi = val
return (chr(x) for x in range(lo, hi + 1))
def handle_subpattern(val):
return list(permute_toks(val[3]))
def handle_tok(tok, val):
"""
Returns a list of strings of possible permutations for this regexp
token.
"""
handlers = {
ANY : handle_any,
BRANCH : handle_branch,
CATEGORY : handle_category,
LITERAL : handle_literal,
IN : handle_in,
MAX_REPEAT : handle_max_repeat,
RANGE : handle_range,
SUBPATTERN : handle_subpattern}
try:
return handlers[tok](val)
except KeyError as e:
fmt = "Unsupported regular expression construct: %s"
raise ValueError(fmt % tok)
def permute_toks(toks):
"""
Returns a generator of strings of possible permutations for this
regexp token list.
"""
lists = [handle_tok(tok, val) for tok, val in toks]
return (''.join(it) for it in join(lists))
def join(iterlist):
"""
Cartesian join as an iterator of the supplied sequences. Borrowed
from:
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/302478
"""
def rloop(seqin, comb):
if seqin:
for item in seqin[0]:
newcomb = comb + [item]
for item in rloop(seqin[1:], newcomb):
yield item
else:
yield comb
return rloop(iterlist, [])
########## PUBLIC API ####################
def ipermute(p):
toks = [tok_n_val for tok_n_val in sre_parse.parse(p)]
return permute_toks(toks)
def permute(p):
return list(ipermute(p))
@robins80
Copy link

I was trying to generate permutations of the regex "C(AB){3}" and it bombed with TypeError: 'int' object is not iterable. I think the parentheses confused it.

@Quacky2200
Copy link
Author

Quacky2200 commented Feb 13, 2024

Change the handle_subpattern function to the following:

def handle_subpattern(val):
    return list(permute_toks(val[3]))

You can see the issue when I printed from this function:

def handle_subpattern(val):
    print('handle_subpattern val')
    print(val)
    return list(permute_toks(val[1]))

Returns:

handle_subpattern val
(1, 0, 0, [(LITERAL, 65), (LITERAL, 66)])

Changing the handle_subpattern function to val[3] results in the following list:

['CABABAB']

Did you want the following regexes instead?
(C[AB]){3} -> ['CACACA', 'CACACB', 'CACBCA', 'CACBCB', 'CBCACA', 'CBCACB', 'CBCBCA', 'CBCBCB']
(C[AB]){2} -> ['CACA', 'CACB', 'CBCA', 'CBCB']
C[AB]{3} -> ['CAAA', 'CAAB', 'CABA', 'CABB', 'CBAA', 'CBAB', 'CBBA', 'CBBB'] (similar to below:)
C(A|B){3} -> ['CAAA', 'CAAB', 'CABA', 'CABB', 'CBAA', 'CBAB', 'CBBA', 'CBBB']

@robins80
Copy link

That fixed it! Thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment