Last active
March 19, 2024 07:22
-
-
Save cls/27b3eeeb502028a1e44bfb37ebb690e0 to your computer and use it in GitHub Desktop.
Python regular expression bytecode disassembler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sre_compile | |
import sre_parse | |
from sre_constants import * | |
def _invert_codes(codes):
    """Return a dict mapping each numeric code in *codes* back to its name.

    *codes* may be a name -> number dict, or a sequence of int-like named
    constants (the form sre_constants uses on newer Python versions), in
    which case each constant is keyed by its own integer value.
    """
    if hasattr(codes, 'items'):
        return {number: name for name, number in codes.items()}
    return {int(const): const for const in codes}


# Reverse lookup tables: numeric code -> symbolic name/constant.
opcodes = _invert_codes(OPCODES)
atcodes = _invert_codes(ATCODES)
chcodes = _invert_codes(CHCODES)
def print_dis(s, indent):
    """Print the disassembly line *s*, prefixed by *indent* spaces."""
    padding = ' ' * indent
    print(padding + s)
def _discharset(code): | |
i = 0 | |
c = 0 | |
cset = set() | |
while i < 8: | |
bits = code.pop(0) | |
bit = 1 | |
while bit < sre_compile.MAXCODE: | |
if bits & bit: | |
cset.add(chr(c)) | |
bit <<= 1 | |
c += 1 | |
i += 1 | |
return cset | |
def _disin(code, rep, indent=0):
    """Disassemble character-set members from the front of *code*.

    Words are popped off *code* (modified in place) until at least *rep* of
    them have been consumed.  One line per set member is printed through
    print_dis() at nesting level *indent*.

    Raises NotImplementedError for a set opcode that has no handler here.
    """
    initlen = len(code)
    while len(code) > initlen - rep:
        opcode = code.pop(0)
        op = opcodes.get(opcode)
        if op is FAILURE:
            # Nothing to print (presumably the end-of-set marker).
            pass
        elif op is CATEGORY:
            chcode = code.pop(0)
            ch = chcodes.get(chcode)
            print_dis("{}({}),".format(op, ch), indent)
        elif op is CHARSET:
            cset = _discharset(code)
            print_dis("{}{},".format(op, tuple(sorted(cset))), indent)
        elif op is LITERAL:
            literal = repr(chr(code.pop(0)))
            print_dis("{}({}),".format(op, literal), indent)
        elif op is RANGE:
            minchr = repr(chr(code.pop(0)))
            maxchr = repr(chr(code.pop(0)))
            print_dis("{}(min={}, max={}),".format(op, minchr, maxchr), indent)
        else:
            # Say which opcode was hit so the gap is easy to diagnose.
            raise NotImplementedError(
                "unsupported set opcode {} (code {})".format(op, int(opcode)))
def _disop(code, rep, indent=0):
    """Disassemble regular-expression opcodes from the front of *code*.

    Words are popped off *code* (a list, modified in place) until at least
    *rep* of them have been consumed; a handler may pop several words, so
    slightly more than *rep* can be used.  Each decoded instruction is
    printed through print_dis() at nesting level *indent*; instructions with
    a nested body recurse with ``indent + 1``.

    Raises NotImplementedError for an INFO block flagged SRE_INFO_PREFIX and
    for any opcode that has no handler here.
    """
    initlen = len(code)
    while len(code) > initlen - rep:
        opcode = code.pop(0)
        op = opcodes.get(opcode)
        if op is FAILURE or op is SUCCESS:
            # These produce no output of their own.
            pass
        elif op is ANY or op is ANY_ALL:
            print_dis("{};".format(op), indent)
        elif op is ASSERT or op is ASSERT_NOT:
            sublen = code.pop(0)
            width = code.pop(0)
            print_dis("{}(len={}) {{ # {}".format(op, width, 'lookahead' if width == 0 else 'lookbehind'), indent)
            # Two words (sublen and width) have already been popped.
            _disop(code, sublen-2, indent+1)
            print_dis("}", indent)
        elif op is AT:
            atcode = code.pop(0)
            at = atcodes.get(atcode)
            print_dis("{}({});".format(op, at), indent)
        elif op is BRANCH:
            print_dis("{} {{".format(op), indent)
            i = 0
            sublen = code.pop(0)
            # Each alternative starts with its length word; a zero word
            # ends the list of alternatives.
            while sublen != 0:
                print_dis(" case {}:".format(i), indent)
                # NOTE(review): presumably each alternative ends in a JUMP
                # whose operand is popped by the JUMP handler, which is why
                # sublen-2 (not sublen-1) is requested here -- confirm.
                _disop(code, sublen-2, indent+1)
                sublen = code.pop(0)
                i += 1
            print_dis("};", indent)
        elif op is GROUPREF_EXISTS:
            label = code.pop(0)
            yeslen = code.pop(0)
            print_dis("if {}({}) {{".format(op, label), indent)
            # Peek ahead: if a JUMP sits where the "yes" part would end, the
            # word after it is taken as the length of an "else" part.
            nolen = code[yeslen-3] if code[yeslen-4] is OPCODES[JUMP] else 0
            if nolen:
                # Leave the JUMP and its operand for the else handling below.
                yeslen -= 2
            _disop(code, yeslen-2, indent+1)
            if nolen:
                assert code[0] is OPCODES[JUMP]
                code.pop(0)  # jump
                nolen = code.pop(0)
                print_dis("} else {", indent)
                _disop(code, nolen-1, indent+1)
            print_dis("};", indent)
        elif op is IN or op is IN_IGNORE:
            sublen = code.pop(0)
            if code[0] is OPCODES[NEGATE]:
                # A leading NEGATE is reported as an attribute of the set.
                code.pop(0)
                sublen -= 1
                print_dis("{}(negate=True) {{".format(op), indent)
            else:
                print_dis("{} {{".format(op), indent)
            _disin(code, sublen-1, indent+1)
            print_dis("};", indent)
        elif op is INFO:
            sublen = code.pop(0)
            mask = code.pop(0)
            minlen = code.pop(0)
            maxlen = code.pop(0)
            masklist = []
            if mask & SRE_INFO_PREFIX:
                masklist.append('PREFIX')
            if mask & SRE_INFO_LITERAL:
                masklist.append('LITERAL')
            if mask & SRE_INFO_CHARSET:
                masklist.append('CHARSET')
            if sublen > 4:
                # More than the four header words: the block has a payload.
                print_dis("{}(mask={}, min={}, max={}) {{".format(op, 0 if mask == 0 else '|'.join(masklist), minlen, maxlen), indent)
                if mask & SRE_INFO_PREFIX:
                    raise NotImplementedError(
                        "disassembly of an INFO prefix is not implemented")
                elif mask & SRE_INFO_CHARSET:
                    _disin(code, sublen-4, indent+1)
                print_dis("};", indent)
            else:
                print_dis("{}(mask={}, min={}, max={}) {{}};".format(op, mask, minlen, maxlen), indent)
        elif op is JUMP:
            # The jump operand is discarded; only "break;" is shown.
            code.pop(0)
            print_dis("break;", indent)
        elif op is LITERAL or op is NOT_LITERAL or op is LITERAL_IGNORE or op is NOT_LITERAL_IGNORE:
            literal = repr(chr(code.pop(0)))
            print_dis("{}({});".format(op, literal), indent)
        elif op is MARK or op is GROUPREF:
            label = code.pop(0)
            print_dis("{}({});".format(op, label), indent)
        elif op is REPEAT or op is REPEAT_ONE or op is MIN_REPEAT_ONE:
            sublen = code.pop(0)
            minrep = code.pop(0)
            maxrep = code.pop(0)
            print_dis("{}(min={}, max={}) {{".format(op, minrep, 'MAXREPEAT' if maxrep == MAXREPEAT else maxrep), indent)
            # Three words (sublen, minrep, maxrep) have already been popped.
            _disop(code, sublen-3, indent+1)
            if op is REPEAT:
                # REPEAT is followed by a separate MAX_UNTIL/MIN_UNTIL word.
                untilopcode = code.pop(0)
                untilop = opcodes.get(untilopcode)
                assert untilop is MAX_UNTIL or untilop is MIN_UNTIL
                print_dis("}} {};".format(untilop), indent)
            else:
                print_dis('};', indent)
        else:
            # Say which opcode was hit so the gap is easy to diagnose.
            raise NotImplementedError(
                "unsupported opcode {} (code {})".format(op, int(opcode)))
def dis(p, flags=None):
    """Print a disassembly of the regular expression *p*.

    *p* is either a pattern given as str or bytes, or an object exposing
    ``pattern`` and ``flags`` attributes (such as a compiled pattern).
    *flags* overrides the flags used for compilation; when it is None, 0 is
    used for a plain pattern and ``p.flags`` for any other object.

    The pattern is parsed with sre_parse.parse(), turned into a code list
    with sre_compile._code(), and that list is printed by _disop().
    """
    # isinstance() so that str subclasses and bytes patterns are treated as
    # patterns instead of being asked for a .pattern attribute.
    if isinstance(p, (str, bytes)):
        pattern = p
        if flags is None:
            flags = 0
    else:
        pattern = p.pattern
        if flags is None:
            flags = p.flags
    parsed = sre_parse.parse(pattern, flags)
    code = sre_compile._code(parsed, flags)
    _disop(code, len(code))
if __name__ == '__main__':
    # Command-line use: dissre [-ailmsx] pattern...
    # Option letters accumulate into the flags used for every pattern that
    # follows them; '--' or the first pattern ends option parsing.
    import sys

    # Single-letter option -> re flag it switches on.
    flag_letters = {
        'a': re.ASCII,
        'i': re.IGNORECASE,
        'l': re.LOCALE,
        'm': re.MULTILINE,
        's': re.DOTALL,
        'x': re.VERBOSE,
    }

    opts = True
    flags = 0
    for arg in sys.argv[1:]:
        # startswith() instead of arg[0]: an empty-string pattern argument
        # must not raise IndexError.
        if opts and arg.startswith('-'):
            if arg == '--':
                opts = False
                continue
            for c in arg[1:]:
                if c not in flag_letters:
                    print('usage: dissre [-ailmsx] pattern...', file=sys.stderr)
                    # sys.exit() rather than the site-provided exit(), which
                    # is not guaranteed to exist (e.g. under python -S).
                    sys.exit(2)
                flags |= flag_letters[c]
        else:
            opts = False
            dis(arg, flags)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment