Skip to content

Instantly share code, notes, and snippets.

@cls
Last active March 19, 2024 07:22
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cls/27b3eeeb502028a1e44bfb37ebb690e0 to your computer and use it in GitHub Desktop.
Save cls/27b3eeeb502028a1e44bfb37ebb690e0 to your computer and use it in GitHub Desktop.
Python regular expression bytecode disassembler
import re
import sre_compile
import sre_parse
from sre_constants import *
def _invert_codes(table):
    """Build a ``{code number: code name}`` lookup from an sre_constants table.

    ``table`` is one of OPCODES / ATCODES / CHCODES.  Two shapes are accepted:

    * a dict mapping each name to its number (the shape ``.items()`` was
      originally written for), or
    * a sequence of named integer constants, where ``int(const)`` is the
      number and the constant itself is the name.
    """
    if isinstance(table, dict):
        return {number: name for name, number in table.items()}
    return {int(const): const for const in table}


# Reverse lookups: numeric code word -> symbolic name.
opcodes = _invert_codes(OPCODES)
atcodes = _invert_codes(ATCODES)
chcodes = _invert_codes(CHCODES)
def print_dis(s, indent, file=None):
    """Print one line of disassembly, prefixed by ``indent`` spaces.

    Args:
        s: The text of the line (a string).
        indent: Number of leading spaces; the nesting depth of the line.
        file: Optional text stream to write to.  ``None`` (the default)
            lets ``print`` write to ``sys.stdout``.
    """
    print(' ' * indent + s, file=file)
def _discharset(code):
    """Decode a CHARSET bitmap from the front of ``code``.

    The bitmap has one bit per character code 0-255, packed
    least-significant-bit first into consecutive code words.  The width of
    a code word is taken from ``sre_compile.MAXCODE`` (32 bits gives 8
    words, 16 bits gives 16 words), so the whole bitmap is always consumed.

    Args:
        code: Mutable list of code words; the bitmap words are removed
            from its front.

    Returns:
        The set of one-character strings whose bit is set.

    Raises:
        IndexError: If ``code`` holds fewer words than the bitmap needs
            (``code`` is left untouched in that case).
    """
    word_bits = sre_compile.MAXCODE.bit_length()
    nwords = 256 // word_bits
    if len(code) < nwords:
        raise IndexError("truncated CHARSET bitmap")
    words = code[:nwords]
    del code[:nwords]

    cset = set()
    for index, word in enumerate(words):
        base = index * word_bits
        for offset in range(word_bits):
            if (word >> offset) & 1:
                cset.add(chr(base + offset))
    return cset
def _disin(code, rep, indent=0):
    """Disassemble the members of a character set and print them.

    Used for the body of an IN / IN_IGNORE instruction and for the charset
    stored in an INFO block.  One line is printed per member through
    ``print_dis``; the FAILURE word that terminates a set prints nothing.

    Args:
        code: Mutable list of code words, consumed from the front.
        rep: Number of code words belonging to the set.
        indent: Indentation passed on to ``print_dis``.

    Raises:
        NotImplementedError: For any set opcode other than FAILURE,
            CATEGORY, CHARSET, LITERAL and RANGE.
    """
    stop = len(code) - rep
    while len(code) > stop:
        opcode = code.pop(0)
        op = opcodes.get(opcode)
        if op == FAILURE:
            # End-of-set marker: nothing to show.
            continue
        if op == CATEGORY:
            ch = chcodes.get(code.pop(0))
            print_dis("{}({}),".format(op, ch), indent)
        elif op == CHARSET:
            members = sorted(_discharset(code))
            # Join the reprs by hand: formatting a tuple would render a
            # single member as "('a',)" with a stray trailing comma.
            listing = ', '.join(repr(member) for member in members)
            print_dis("{}({}),".format(op, listing), indent)
        elif op == LITERAL:
            literal = repr(chr(code.pop(0)))
            print_dis("{}({}),".format(op, literal), indent)
        elif op == RANGE:
            minchr = repr(chr(code.pop(0)))
            maxchr = repr(chr(code.pop(0)))
            print_dis("{}(min={}, max={}),".format(op, minchr, maxchr), indent)
        else:
            raise NotImplementedError(
                "unsupported set opcode {} ({!r})".format(opcode, op))
def _disprefix(code, rep, indent=0):
    """Print the literal-prefix data of an INFO block and consume it.

    The ``rep`` words are expected to be: prefix length, prefix skip, the
    prefix characters, then an overlap table with one entry per prefix
    character.

    Raises:
        NotImplementedError: If ``rep`` does not match that layout
            (``code`` is left untouched in that case).
    """
    prefix_len = code[0]
    if rep != 2 + 2 * prefix_len:
        raise NotImplementedError(
            "unexpected INFO prefix layout: {} words for a prefix of "
            "length {}".format(rep, prefix_len))
    prefix_skip = code[1]
    chars = ''.join(chr(word) for word in code[2:2 + prefix_len])
    overlap = code[2 + prefix_len:rep]
    del code[:rep]
    print_dis("prefix(skip={}, chars={!r}, overlap={});".format(
        prefix_skip, chars, overlap), indent)


def _disop(code, rep, indent=0):
    """Disassemble a run of SRE instructions and print them.

    Instructions are taken from the front of ``code`` until ``rep`` words
    have been consumed.  An instruction is always decoded whole, so the
    operands of the last instruction are consumed even when they lie past
    the ``rep`` boundary; the BRANCH case relies on this to swallow the
    offset word of the JUMP that ends each alternative.

    Args:
        code: Mutable list of code words, consumed from the front.
        rep: Number of code words belonging to this run.
        indent: Indentation passed on to ``print_dis``; nested bodies are
            printed one level deeper.

    Raises:
        NotImplementedError: For opcodes this disassembler does not know.
    """
    stop = len(code) - rep
    while len(code) > stop:
        opcode = code.pop(0)
        op = opcodes.get(opcode)
        if op in (FAILURE, SUCCESS):
            # Terminators carry no operands and are not shown.
            continue
        if op in (ANY, ANY_ALL):
            print_dis("{};".format(op), indent)
        elif op in (ASSERT, ASSERT_NOT):
            sublen = code.pop(0)
            width = code.pop(0)
            direction = 'lookahead' if width == 0 else 'lookbehind'
            print_dis("{}(len={}) {{ # {}".format(op, width, direction), indent)
            # sublen counts its own word and the width word as well.
            _disop(code, sublen - 2, indent + 1)
            print_dis("}", indent)
        elif op == AT:
            at = atcodes.get(code.pop(0))
            print_dis("{}({});".format(op, at), indent)
        elif op == BRANCH:
            print_dis("{} {{".format(op), indent)
            case = 0
            sublen = code.pop(0)
            # Each alternative starts with its length; a zero word ends
            # the list of alternatives.
            while sublen != 0:
                print_dis(" case {}:".format(case), indent)
                _disop(code, sublen - 2, indent + 1)
                sublen = code.pop(0)
                case += 1
            print_dis("};", indent)
        elif op == GROUPREF_EXISTS:
            label = code.pop(0)
            yeslen = code.pop(0)
            print_dis("if {}({}) {{".format(op, label), indent)
            # Heuristic: an else branch is assumed when the word two
            # before the end of the "yes" span is a JUMP.  An operand that
            # happens to equal JUMP's number can fool it; the assert below
            # catches that.  The yeslen >= 4 guard keeps the probe from
            # turning into a negative index that wraps to the list's end.
            has_else = yeslen >= 4 and code[yeslen - 4] == OPCODES[JUMP]
            nolen = code[yeslen - 3] if has_else else 0
            if nolen:
                yeslen -= 2
            _disop(code, yeslen - 2, indent + 1)
            if nolen:
                assert code[0] == OPCODES[JUMP], "expected JUMP before else branch"
                code.pop(0)  # jump
                nolen = code.pop(0)
                print_dis("} else {", indent)
                _disop(code, nolen - 1, indent + 1)
            print_dis("};", indent)
        elif op in (IN, IN_IGNORE):
            sublen = code.pop(0)
            if code[0] == OPCODES[NEGATE]:
                code.pop(0)
                sublen -= 1
                print_dis("{}(negate=True) {{".format(op), indent)
            else:
                print_dis("{} {{".format(op), indent)
            _disin(code, sublen - 1, indent + 1)
            print_dis("};", indent)
        elif op == INFO:
            sublen = code.pop(0)
            mask = code.pop(0)
            minlen = code.pop(0)
            maxlen = code.pop(0)
            # sublen counts its own word plus mask, min and max; anything
            # beyond those four words is prefix or charset data.
            extra = sublen - 4
            if extra > 0:
                masklist = []
                if mask & SRE_INFO_PREFIX:
                    masklist.append('PREFIX')
                if mask & SRE_INFO_LITERAL:
                    masklist.append('LITERAL')
                if mask & SRE_INFO_CHARSET:
                    masklist.append('CHARSET')
                shown_mask = 0 if mask == 0 else '|'.join(masklist)
                print_dis("{}(mask={}, min={}, max={}) {{".format(
                    op, shown_mask, minlen, maxlen), indent)
                if mask & SRE_INFO_PREFIX:
                    _disprefix(code, extra, indent + 1)
                elif mask & SRE_INFO_CHARSET:
                    _disin(code, extra, indent + 1)
                print_dis("};", indent)
            else:
                print_dis("{}(mask={}, min={}, max={}) {{}};".format(
                    op, mask, minlen, maxlen), indent)
        elif op == JUMP:
            code.pop(0)  # jump offset, not shown
            print_dis("break;", indent)
        elif op in (LITERAL, NOT_LITERAL, LITERAL_IGNORE, NOT_LITERAL_IGNORE):
            literal = repr(chr(code.pop(0)))
            print_dis("{}({});".format(op, literal), indent)
        elif op in (MARK, GROUPREF):
            label = code.pop(0)
            print_dis("{}({});".format(op, label), indent)
        elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
            sublen = code.pop(0)
            minrep = code.pop(0)
            maxrep = code.pop(0)
            shown_max = 'MAXREPEAT' if maxrep == MAXREPEAT else maxrep
            print_dis("{}(min={}, max={}) {{".format(op, minrep, shown_max), indent)
            _disop(code, sublen - 3, indent + 1)
            if op == REPEAT:
                # The closing MAX_UNTIL / MIN_UNTIL follows the body.
                untilop = opcodes.get(code.pop(0))
                assert untilop in (MAX_UNTIL, MIN_UNTIL), "REPEAT without UNTIL"
                print_dis("}} {};".format(untilop), indent)
            else:
                print_dis('};', indent)
        else:
            raise NotImplementedError(
                "unsupported opcode {} ({!r})".format(opcode, op))
def dis(p, flags=None):
    """Disassemble a regular expression and print its SRE bytecode.

    Args:
        p: Pattern source (``str`` or ``bytes``), or an object exposing
            ``pattern`` and ``flags`` attributes such as a compiled
            pattern.
        flags: Compilation flags.  When ``None``, source patterns use 0
            and pattern objects use their own ``flags``.
    """
    if isinstance(p, (str, bytes)):
        pattern = p
        if flags is None:
            flags = 0
    else:
        pattern = p.pattern
        if flags is None:
            flags = p.flags
    parsed = sre_parse.parse(pattern, flags)
    code = sre_compile._code(parsed, flags)
    _disop(code, len(code))
def main(argv=None):
    """Command-line entry point: ``dissre [-ailmsx] pattern...``.

    Leading arguments that start with ``-`` are bundles of single-letter
    regex flags; ``--`` ends option parsing, as does the first pattern.
    Every remaining argument is disassembled with the accumulated flags.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        The process exit status: 0 on success, 2 after a usage error.
    """
    import sys

    if argv is None:
        argv = sys.argv[1:]
    flag_letters = {
        'a': re.ASCII,
        'i': re.IGNORECASE,
        'l': re.LOCALE,
        'm': re.MULTILINE,
        's': re.DOTALL,
        'x': re.VERBOSE,
    }
    opts = True
    flags = 0
    for arg in argv:
        # startswith() rather than arg[0]: an empty-string argument is a
        # valid (empty) pattern and must not raise IndexError.
        if opts and arg.startswith('-'):
            if arg == '--':
                opts = False
                continue
            for c in arg[1:]:
                if c not in flag_letters:
                    print('usage: dissre [-ailmsx] pattern...', file=sys.stderr)
                    return 2
                flags |= flag_letters[c]
        else:
            opts = False
            dis(arg, flags)
    return 0


if __name__ == '__main__':
    import sys
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment