Skip to content

Instantly share code, notes, and snippets.

@alexander-hanel
Created September 24, 2018 21:18
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save alexander-hanel/a266251ad9bf569c2985dcd625b5913a to your computer and use it in GitHub Desktop.
Save alexander-hanel/a266251ad9bf569c2985dcd625b5913a to your computer and use it in GitHub Desktop.
python recursive traversal disassembly using capstone and pefile
import sys
import re
import pefile
import string
import struct
from capstool import CapsTool
from capstone import *
from capstone.x86 import *
# Mnemonics for which disassemble() records the operand-derived target as a
# pending address: conditional jumps, loop instructions and calls.
BCC = [
    # conditional jumps
    "je", "jne", "js", "jns", "jp", "jnp", "jo", "jno",
    "jl", "jle", "jg", "jge", "jb", "jbe", "ja", "jae",
    "jcxz", "jecxz", "jrcxz",
    # loop instructions
    "loop", "loopne", "loope",
    # calls
    "call", "lcall",
]

# Mnemonics that end the current path; disassemble() then resumes from a
# pending branch target or stops.
END = [
    "ret", "retn", "retf", "iret", "int3",
]

# Unconditional jumps; disassemble() moves straight to the computed target.
BNC = [
    "jmp", "jmpf", "ljmp",
]
def get_pe_data(_data):
    """Parse a PE image and locate its entry point.

    Args:
        _data: Raw contents of the file, passed to ``pefile.PE(data=...)``.

    Returns:
        A ``(ok, entry_point, bit)`` tuple.  On success ``ok`` is True,
        ``entry_point`` is the file offset that pefile maps the header's
        ``AddressOfEntryPoint`` to, and ``bit`` is 32 or 64.  If the data
        cannot be parsed, or the machine type is neither of the two handled
        below, the result is ``(False, None, None)``.
    """
    try:
        pe = pefile.PE(data=_data)
        entry_rva = pe.OPTIONAL_HEADER.AddressOfEntryPoint
        entry_point = pe.get_offset_from_rva(entry_rva)
    except Exception as e:
        # Best effort: any parsing problem is reported and turned into a
        # failure result instead of propagating to the caller.
        print(e)
        return False, None, None

    machine = pe.FILE_HEADER.Machine
    if machine == 0x14c:  # IMAGE_FILE_MACHINE_I386
        bit = 32
    elif machine == 0x8664:  # IMAGE_FILE_MACHINE_AMD64
        bit = 64
    else:
        return False, None, None
    return True, entry_point, bit
def to_signed_32(n):
    """Reinterpret the low 32 bits of ``n`` as a two's-complement signed int.

    Bits above bit 31 are discarded first, so the result is always in the
    range ``-2**31 .. 2**31 - 1``.
    """
    n = n & 0xffffffff
    # Flipping the sign bit and subtracting its weight maps the upper half
    # of the unsigned range onto the negative numbers.
    return (n ^ 0x80000000) - 0x80000000
def to_signed_64(n):
    """Reinterpret the low 64 bits of ``n`` as a two's-complement signed int.

    Bits above bit 63 are discarded first, so the result is always in the
    range ``-2**63 .. 2**63 - 1``.
    """
    n = n & 0xffffffffffffffff
    # Flipping the sign bit and subtracting its weight maps the upper half
    # of the unsigned range onto the negative numbers.
    return (n ^ 0x8000000000000000) - 0x8000000000000000
def get_op_dist(bit, addr):
    """Return the first operand of the instruction at ``addr`` as a signed int.

    NOTE: the operand is read through the module-level ``cs`` object, not
    through a parameter, so ``cs`` must be assigned before this is called.

    Args:
        bit: Operand width used for the sign conversion, 32 or 64.
        addr: Address of the instruction whose operand 0 is read.

    Returns:
        ``(True, value)`` with the operand reinterpreted as a signed 32- or
        64-bit integer, or ``(False, None)`` when the operand is not an
        integer (e.g. a register) or ``bit`` is not a supported width.
    """
    # Function-scope import keeps this block self-contained.
    import numbers

    opp = cs.get_operand_value(addr, 0)
    # numbers.Integral also accepts Python 2 ``long`` values, which a plain
    # ``isinstance(opp, int)`` check would reject as non-integers.
    if not isinstance(opp, numbers.Integral):
        return False, None
    # Convert to a signed value of the requested width.
    if bit == 32:
        return True, to_signed_32(opp)
    if bit == 64:
        return True, to_signed_64(opp)
    # Unsupported width: report "no usable operand" rather than failing on
    # an unassigned result variable.
    return False, None
def get_false_key(addr_bcc):
    """Find a key in ``addr_bcc`` whose value is exactly ``False``.

    Args:
        addr_bcc: Mapping whose values are flags; only the ``False`` object
            itself counts as a match (other falsy values such as 0 do not).

    Returns:
        ``(True, key)`` for the first matching key met while iterating the
        mapping, or ``(False, None)`` when there is none.
    """
    for key, flag in addr_bcc.items():
        if flag is False:
            return True, key
    return False, None
def disassemble(addr, cs, debug=False):
# Follow control flow starting at `addr` and collect the instruction
# addresses that are reached.
#
# Parameters:
#   addr  - address to start from.
#   cs    - object used for all decoding; this function calls its get_mnem,
#           dword, word, byte, get_many_bytes and next_head methods.
#   debug - when true, print the current address, mnemonic and addr_bcc on
#           every loop iteration.
#
# Returns (visited, strings).
#
# NOTE(review): `bit` is not a parameter; it is read from module scope when
# get_op_dist(bit, addr) is called below.
# NOTE(review): the indentation of this block is missing in this copy, so the
# nesting of the if/else/continue statements cannot be read off the text.
# The comments describe individual statements, not their nesting. Restore the
# original indentation before running.
#
# visited: addresses recorded so far, in the order they were first seen.
visited = []
# addr_bcc: branch target address -> flag; targets are inserted as False and
# set to True once the walk reaches them (see get_false_key).
addr_bcc = {}
# strings: address of a branching instruction -> printable bytes read after it.
strings = {}
while True:
instr = cs.get_mnem(addr)
if debug:
print hex(addr), instr , addr_bcc # , [hex(x) for x in visited]
# No mnemonic, or a zero dword at addr: abandon this path and resume from a
# target still flagged False, or stop when none is left.
if instr is None or cs.dword(addr) == 0x0:
status, t_addr = get_false_key(addr_bcc)
if status:
addr = t_addr
continue
else:
break
# The current address is a recorded branch target: flag it True if it was
# still False; otherwise look for another target that is still False.
if addr in addr_bcc:
if addr_bcc[addr] is False:
addr_bcc[addr] = True
else:
status, t_addr = get_false_key(addr_bcc)
if status:
addr = t_addr
continue
else:
break
if addr not in visited:
visited.append(addr)
# Unconditional jump: the new address is addr plus the signed operand value.
if instr in BNC:
status, op_dist = get_op_dist(bit, addr)
if status:
addr = addr + op_dist
if addr in visited:
if addr in addr_bcc:
if addr_bcc[addr] is False:
addr_bcc[addr] = True
else:
addr_bcc[addr] = False
status, t_addr = get_false_key(addr_bcc)
if status:
addr = t_addr
continue
continue
# Conditional jump / loop / call: record the target as not yet processed.
elif instr in BCC:
# NOTE(review): 0x15ff presumably matches opcode bytes FF 15 (an indirect
# call through memory) read as a little-endian word - confirm.
if cs.word(addr) != 0x15ff:
status, op_dist = get_op_dist(bit, addr)
if status:
cal_addr = addr + op_dist
if cal_addr not in addr_bcc:
if cal_addr not in visited:
addr_bcc[cal_addr] = False
# A zero byte just before the target: read the bytes from addr + 5 and
# keep them in `strings` if every one is printable.
# NOTE(review): addr + 5 presumably assumes a 5-byte instruction - confirm.
if cs.byte(cal_addr - 1) == 0x00:
temp_data = cs.get_many_bytes(addr + 5, op_dist - 6)
if temp_data:
if all(c in string.printable for c in temp_data):
strings[addr] = temp_data
status, t_addr = get_false_key(addr_bcc)
if status:
addr = t_addr
continue
# Return-like instruction: resume from a target still flagged False, or stop.
elif instr in END:
status, t_addr = get_false_key(addr_bcc)
if status:
addr = t_addr
continue
else:
break
# Move on to the address cs.next_head() reports for the current one.
addr = cs.next_head(addr)
return visited, strings
# Script entry point: disassemble the PE file named on the command line and
# print every visited address with its disassembly, then the strings found.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: %s <pe-file>" % sys.argv[0])

    with open(sys.argv[1], "rb") as infile:
        data = infile.read()

    status, addr, bit = get_pe_data(data)
    if not status:
        # Without a parsed entry point and bitness there is nothing to walk.
        sys.exit("error: %s is not a supported 32/64-bit PE file" % sys.argv[1])

    # `cs` and `bit` are deliberately left at module level: get_op_dist() and
    # disassemble() read them as globals.
    # The first two bytes of the file are replaced with NULs, as in the
    # original script. The prefix is a bytes literal so the concatenation
    # also works on Python 3, where the file contents are bytes.
    cs = CapsTool(b"\x00\x00" + data[2:], bit)
    yy, ss = disassemble(addr, cs)
    for x in yy:
        # %-formatting prints the same text on Python 2 and Python 3.
        print("%s %s" % (hex(x), cs.get_disasm(x)))
    print(ss)
@ContegoCode
Copy link

ContegoCode commented Apr 18, 2020

In this section of code near the very bottom line 140 it says:
cs = CapsTool("\x00\x00" + data[2:], bit)

What is the "\x00\x00" doing, and why won't it run if it's there?
When I try with "\x00\x00" I get an error saying:

Traceback (most recent call last):
File "rtd.py", line 140, in
cs = CapsTool("\x00\x00" + data[2:], bit)
TypeError: must be str, not bytes

but without the "\x00\x00" I get some disassembled information
btw im using python3.6

@alexander-hanel
Copy link
Author

@ContegoCode
This code was written for Python2.7. I have tested it but if you change "\x00\x00" from a string to a bytes literal b"\x00\x00" it should work. I think the "\x00\x00" were inserted because I was testing with a portable executable file that I was having issues with or I wanted to test on raw data. I'm not 100% sure what I was thinking or how good this code is.... I probably need to review it. If you are looking for more details on recursive descent for disassembling, I have heard good things about No Starch's Binary Analysis book. I have it on my bookshelf but haven't given it a full read but it does have some code https://nostarch.com/binaryanalysis

Cheers,
Alex

@ContegoCode
Copy link

ContegoCode commented Apr 21, 2020 via email

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment