Last active
July 9, 2021 18:09
-
-
Save Auscitte/37aa7b2d3be058cb6b4d5b8b4c13477a to your computer and use it in GitHub Desktop.
Retrieving types and variables from a pdb file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This file is a copy of gdata.py from pdbparse library ver. 1.5
# (see https://github.com/moyix/pdbparse)
# with a few modifications that are necessary for my scripts to work correctly
# Ry Auscitte
# Python 2 and 3
from construct import * | |
def _gsym_leaf_type(ctx):
    """Return the leaf type parsed just before the payload; it selects the Switch case."""
    return ctx.leaf_type


# Field and struct names below become keys of the parsed containers, so they
# are reproduced verbatim (including the historical spelling "sumname").
_gsym_name = "name" / CString(encoding="utf8")

# Layout taken from struct REFSYM2 in cvinfo.h; shared by 0x1125 and 0x1127.
_gsym_refsym2 = "proc_ref" / Struct(
    "sumname" / Int32ul,
    "offset" / Int32ul,
    "iMod" / Int16ul,
    _gsym_name,
)

# Layout taken from struct DATASYM32 in cvinfo.h; shared by 0x110d and 0x110c.
_gsym_datasym32 = "datasym" / Struct(
    "typind" / Int32ul,
    "offset" / Int32ul,
    "segment" / Int16ul,
    _gsym_name,
)

# Payload layout for every leaf type this parser understands. The symbolic
# names in the comments are the ones cvinfo.h assigns to these values.
_gsym_cases = {
    0x110E: "data_v3" / Struct(  # S_PUB32
        "symtype" / Int32ul,
        "offset" / Int32ul,
        "segment" / Int16ul,
        _gsym_name,
    ),
    0x1009: "data_v2" / Struct(  # S_PUB32_ST: name is length-prefixed
        "symtype" / Int32ul,
        "offset" / Int32ul,
        "segment" / Int16ul,
        "name" / PascalString(lengthfield="length" / Int8ul, encoding="utf8"),
    ),
    0x1125: _gsym_refsym2,  # S_PROCREF
    0x1127: _gsym_refsym2,  # S_LPROCREF
    0x1108: "udt" / Struct(  # S_UDT, from struct UDTSYM in cvinfo.h
        "typind" / Int32ul,
        _gsym_name,
    ),
    0x110D: _gsym_datasym32,  # S_GDATA32
    0x110C: _gsym_datasym32,  # S_LDATA32
    0x1107: "const" / Struct(  # S_CONSTANT
        "typind" / Int32ul,  # Type index (containing enum if enumerate) or metadata token
        "value" / Int16ul,  # numeric leaf containing value
        _gsym_name,
    ),
}

# One symbol record body: a 16-bit leaf type followed by a payload whose
# layout depends on that leaf type. No default case is supplied for leaf
# types missing from the table.
gsym = Struct(
    "leaf_type" / Int16ul,
    "data" / Switch(_gsym_leaf_type, _gsym_cases),
)
# A single length-prefixed record: `length` counts the bytes that follow it,
# and exactly those bytes are re-parsed as one gsym.
_globals_record = Struct(
    "length" / Int16ul,
    "symbol" / RestreamData(Bytes(this.length), gsym),
)

# The whole input is a back-to-back sequence of such records; GreedyRange
# keeps parsing records until one fails to parse.
GlobalsData = "globals" / GreedyRange(_globals_record)
def parse(data):
    """Parse a bytes object with GlobalsData and return the flattened symbols.

    The nested result of GlobalsData is passed through merge_structures(),
    whose return value is handed back unchanged.
    """
    return merge_structures(GlobalsData.parse(data))
def parse_stream(stream):
    """Parse a file-like object with GlobalsData and return the flattened symbols.

    Same as parse(), except that the records are read from *stream*.
    """
    return merge_structures(GlobalsData.parse_stream(stream))
def merge_structures(con):
    """Flatten parsed GlobalsData records into one Container per symbol.

    Every resulting Container holds 'length' and 'leaf_type'; when the record
    carries a non-empty payload, all of the payload's keys are copied next to
    them.

    :param con: iterable of records exposing .length, .symbol.leaf_type
        and .symbol.data
    :return: a ListContainer of flat Containers, in input order
    """
    flattened = []
    for record in con:
        fields = {'length': record.length, 'leaf_type': record.symbol.leaf_type}
        payload = record.symbol.data
        if payload:
            # RAusc: copy whichever fields the record type defines rather than
            # assuming a fixed symtype/offset/segment/name layout
            for key in payload.keys():
                fields[key] = payload[key]
        flattened.append(Container(fields))
    return ListContainer(flattened)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" A quick and dirty way of retrieving symbols' types from a pdb file using the pdbparse library.

    The script is incomplete and merely aims to demonstrate one of the possible methods.

    :Copyright:
        Ry Auscitte 2020. This script is distributed under GPL.

    :Authors:
        Ry Auscitte
"""
import pdbparse | |
from pdbparse import tpi | |
import sys | |
import construct as cs | |
from argparse import ArgumentParser | |
# Reference to a procedure, as defined in
# https://github.com/microsoft/microsoft-pdb/blob/master/include/cvinfo.h
S_PROCREF = 0x1125

# base_type_size has been borrowed from pdbparse's pdb_print_ctypes.py
# (see https://github.com/moyix/pdbparse)
# It maps the name of a primitive type to its size in bytes.
_POINTER_TARGETS = ("PRCHAR", "PUCHAR", "PULONG", "PUQUAD", "PUSHORT", "PVOID", "PLONG")

# Pointer types: the size is fixed by the T_32 / T_64 prefix alone.
base_type_size = {
    prefix + target: width
    for prefix, width in (("T_32", 4), ("T_64", 8))
    for target in _POINTER_TARGETS
}

# Scalar types.
base_type_size.update({
    "T_RCHAR": 1,
    "T_UCHAR": 1,
    "T_SHORT": 2,
    "T_USHORT": 2,
    "T_WCHAR": 2,
    "T_INT4": 4,
    "T_UINT4": 4,
    "T_LONG": 4,
    "T_ULONG": 4,
    "T_REAL32": 4,
    "T_INT8": 8,
    "T_QUAD": 8,
    "T_UQUAD": 8,
    "T_REAL64": 8,
    "T_REAL80": 10,
})
def get_type_name(tp):
    """Returns a name for pdbparse's TPI types.

    Only some types are supported: primitive types, structures, unions,
    pointers, modifiers and one-dimensional arrays.

    :param tp: either a TPI type record (an object with a ``tpi_idx``
        attribute) or a primitive type, which has no record of its own
    :return: a C-like spelling of the type, or "UNKNOWN" for record kinds
        that are not handled
    """
    # a primitive type does not have a record
    if not hasattr(tp, "tpi_idx"):
        return str(tp)

    # for structures and unions, just print out the name
    if tp.leaf_type == "LF_UNION" or tp.leaf_type == "LF_STRUCTURE":
        return tp.name

    # a pointer to a known type
    if tp.leaf_type == "LF_POINTER":
        return get_type_name(tp.utype) + "*"

    # handling 'const', 'volatile', and 'unaligned' modifiers
    if tp.leaf_type == "LF_MODIFIER":
        words = [mod for mod in ("const", "volatile", "unaligned") if tp.modifier[mod]]
        # Joining the modifiers together with the type name avoids a stray
        # leading space when none of the modifier flags is set.
        words.append(get_type_name(tp.modified_type))
        return " ".join(words)

    # only 1D arrays are supported; the element count can be computed for
    # elements of a primitive type only
    if tp.leaf_type == "LF_ARRAY":
        element_name = get_type_name(tp.element_type)
        try:
            count = str(tp.size // base_type_size[tp.element_type])
        except (KeyError, TypeError):
            # The element type is missing from base_type_size (KeyError) or is
            # an unhashable record rather than a primitive (TypeError): the
            # element size is unknown, so the bounds are left empty instead of
            # aborting the whole listing.
            count = ""
        return element_name + "[" + count + "]"

    return "UNKNOWN"
def print_struct_definition(pdb, sname):
    """Retrieves and prints a definition of the structure <sname> from the TPI stream

    The first LF_STRUCTURE record named <sname> is used; each of its fields is
    printed as "<name> : <type>". A message is printed instead when no such
    record exists.
    """
    types = pdb.STREAM_TPI.types
    matches = [
        idx for idx in types
        if types[idx].leaf_type == "LF_STRUCTURE" and types[idx].name == sname
    ]
    if not matches:
        print("Structure", sname, "is not defined in the tpi stream.")
        return

    print("struct", sname, "{")
    for field in types[matches[0]].fieldlist.substructs:
        print("\t", field.name, ":", get_type_name(field.index))
    print("}")
def print_variable_declaration(pdb, vname):
    """Outputs a declaration for the variable <vname>

    The global symbols are scanned for the first symbol named <vname> that
    carries a type index; its declaration is printed as "<type> <vname>;".
    Symbols with a matching name but no type index are reported and skipped.
    If nothing suitable is found, a message saying so is printed.
    """
    types = pdb.STREAM_TPI.types
    for sym in pdb.STREAM_GSYM.globals:
        if "name" not in sym or sym.name != vname:
            continue

        if "typind" not in sym:
            print("Found a symbol named", vname, "but, it did not have an associated type.")
            continue

        if sym.typind in types:
            var_type = types[sym.typind]
        else:
            # Primitive types do not have records in the TPI stream; the index
            # is turned into pdbparse's base-type enum the same way the
            # named-parameter printer in this file does it.
            var_type = tpi.base_type.parse(sym.typind.to_bytes(16, byteorder='little'))

        print(get_type_name(var_type), " ", vname, ";", sep="")
        return

    # The original referenced an undefined name here and raised NameError.
    print("Could not find variable", vname)
def print_function_declaration_from_tpi_by_idx(pdb, fname, typind):
    """Outputs a prototype for the function <fname> with the type index <typind>

    The prototype has the form "<call_conv> <return type> <fname>(<arg types>)".
    Nothing but a diagnostic is printed when <typind> is absent from the TPI
    stream or refers to a record other than LF_PROCEDURE.
    """
    types = pdb.STREAM_TPI.types
    if typind not in types:
        print("There is no record with the index", typind, "in the TPI stream")
        return

    proc = types[typind]
    # not dealing with static and member functions
    if proc.leaf_type != "LF_PROCEDURE":
        print("The type at", typind, "is not a fuction, but", proc.leaf_type)
        return

    formalparams = ", ".join([get_type_name(arg) for arg in proc.arglist.arg_type])
    print(proc.call_conv, " ", get_type_name(proc.return_type), " ", fname, "(", formalparams, ")", sep="")
# The parsing constructs below follow the definitions of PROCSYM32, FRAMEPROCSYM, REGREL32, and CALLSITEINFO
# from https://github.com/microsoft/microsoft-pdb/blob/master/include/cvinfo.h
# Fields of a PROCSYM32 record in on-disk order, as (name, construct) pairs.
_procsym32_fields = (
    ("reclen", cs.Int16ul),
    ("rectyp", cs.Int16ul),
    ("pParent", cs.Int32ul),
    ("pEnd", cs.Int32ul),
    ("pNext", cs.Int32ul),
    ("len", cs.Int32ul),
    ("DbgStart", cs.Int32ul),
    ("DbgEnd", cs.Int32ul),
    ("typind", cs.Int32ul),
    ("offset", cs.Int32ul),
    ("seg", cs.Int16ul),
    ("flags", cs.Int8ul),
    ("name", cs.CString(encoding="utf8")),
)

# Parser for one procedure symbol record; parsing starts at the record's
# reclen field and stops after the NUL-terminated name.
GlobalProcSym = "PROCSYM32" / cs.Struct(
    *[field_name / field_type for field_name, field_type in _procsym32_fields]
)
def print_function_declaration_from_mods_stream(pdb, fname):
    """Prints out a declaration for the function named <fname>

    For every S_PROCREF symbol named <fname> in the global symbols, the
    referenced PROCSYM32 record is read from the module's stream and a
    prototype is printed for the type index stored in that record. Nothing is
    printed when there is no such symbol.
    """
    for ref in pdb.STREAM_GSYM.globals:
        if ref.leaf_type != S_PROCREF or ref.name != fname:
            continue

        # iMod is 1-based while DBIExHeaders is indexed from 0
        data = pdb.streams[pdb.STREAM_DBI.DBIExHeaders[ref.iMod - 1].stream].data
        fn = GlobalProcSym.parse(data[ref.offset:])

        # The type index is already known at this point, so the by-index
        # printer is the right callee; print_function_declaration_from_tpi()
        # accepts only (pdb, fname) and raised TypeError when handed a third
        # argument.
        print_function_declaration_from_tpi_by_idx(pdb, fn.name, fn.typind)
# Body of one record found between a PROCSYM32 record and the end of its
# procedure: a 16-bit record type followed by a type-specific remainder.
# Record types other than the three listed keep their numeric rectyp and,
# since the Switch has no default case, get no parsed remainder.
ProcFrameData = cs.Struct(
    "rectyp" / cs.Enum(cs.Int16ul, S_FRAMEPROC = 0x1012, S_CALLSITEINFO = 0x1139, S_REGREL32 = 0x1111),
    "reminder" / cs.Switch(
        lambda ctx: ctx.rectyp, {
            "S_FRAMEPROC":
                # Field order and widths follow FRAMEPROCSYM in cvinfo.h. The
                # earlier layout left out cbPad and sectExHdlr and read
                # offExHdlr as 16 bits, so everything after cbFrame was
                # decoded from the wrong bytes.
                "FRAMEPROCSYM" / cs.Struct(
                    "cbFrame" / cs.Int32ul,     # bytes of total frame of the procedure
                    "cbPad" / cs.Int32ul,       # bytes of padding in the frame
                    "offPad" / cs.Int32ul,      # offset to where the padding starts
                    "cbSaveRegs" / cs.Int32ul,  # bytes of callee-saved registers
                    "offExHdlr" / cs.Int32ul,   # offset of the exception handler
                    "sectExHdlr" / cs.Int16ul,  # section id of the exception handler
                    "flags" / cs.Int32ul,
                ),
            "S_REGREL32":
                "REGREL32" / cs.Struct(
                    "off" / cs.Int32ul,
                    "typind" / cs.Int32ul,
                    "reg" / cs.Int16ul,
                    "name" / cs.CString(encoding = "utf8"),
                ),
            "S_CALLSITEINFO":
                "CALLSITEINFO" / cs.Struct(
                    "off" / cs.Int32ul,
                    "sect" / cs.Int16ul,
                    "__reserved_0" / cs.Int16ul,
                    "typind" / cs.Int32ul,
                ),
        })
)
# One length-prefixed record: reclen counts the bytes that follow it, and
# exactly those bytes are re-parsed as ProcFrameData.
_proc_frame_record = cs.Struct(
    "reclen" / cs.Int16ul,
    "frame_entry" / cs.RestreamData(cs.Bytes(cs.this.reclen), ProcFrameData),
)

# A back-to-back run of such records; parsing continues until a record fails
# to parse.
ProcFrameEntries = cs.GreedyRange(_proc_frame_record)
def _procsym32_tail_padding(ctx):
    """Bytes left in the PROCSYM32 record once its name has been read.

    reclen does not include the reclen field itself, hence the
    cs.Int16ul.sizeof() correction.
    """
    return ctx.PROCSYM32.reclen + cs.Int16ul.sizeof() - ctx._io.tell()


def _frame_data_length(ctx):
    """Bytes between the end of the PROCSYM32 record and offset pEnd.

    pEnd points to the region immediately following the last element of
    ProcFrameEntries; the caller-supplied entry_offest parameter is subtracted
    from it together with the full size of the PROCSYM32 record.
    """
    return ctx.PROCSYM32.pEnd - ctx._params.entry_offest - ctx.PROCSYM32.reclen - cs.Int16ul.sizeof()


# A procedure symbol together with the records that follow it. Callers must
# pass entry_offest=<offset the parsed data starts at> to parse().
GlobalProc = cs.Struct(
    # GlobalProcSym is already named "PROCSYM32", so it is reused as the first
    # member instead of spelling the same field list out again
    GlobalProcSym,
    # making sure the entire length of PROCSYM32 has been parsed
    cs.Padding(_procsym32_tail_padding),
    "frame_data" / cs.RestreamData(cs.Bytes(_frame_data_length), ProcFrameEntries),
)
def flatten_frame_data(cont):
    """Flattens the nested structure of ProcFrameData

    :param cont: iterable of records parsed by ProcFrameEntries
    :return: a ListContainer with one Container per record holding 'reclen',
        'rectyp' and, for record types ProcFrameData knows how to parse, the
        fields of the record body (keys starting with "_" are left out)
    """
    flat = cs.lib.ListContainer()
    for record in cont:
        entry = cs.lib.Container()
        entry["reclen"] = record.reclen
        entry["rectyp"] = record.frame_entry.rectyp

        body = record.frame_entry.reminder
        # A record type without a case in ProcFrameData's Switch parses to
        # None; iterating it used to raise TypeError, so such records are kept
        # with just their length and type.
        if body is not None:
            for key in body:
                if key.startswith("_"):
                    continue
                entry[key] = body[key]

        flat.append(entry)
    return flat
def print_function_declaration_from_tpi(pdb, fname):
    """Outputs <fname>'s prototype using TPI records only

    The first S_PROCREF symbol named <fname> locates the procedure record in
    its module's stream; the type index found there is handed to
    print_function_declaration_from_tpi_by_idx(). A message is printed when
    no such symbol exists.
    """
    refs = [s for s in pdb.STREAM_GSYM.globals if s.leaf_type == S_PROCREF and s.name == fname]
    if not refs:
        print("There is no S_PROCREF-type reference to", fname, "in the global symbols stream.")
        return

    ref = refs[0]
    # Indices given by iMod are 1-based while pdb.STREAM_DBI.DBIExHeaders[]
    # is a standard python list with 0-based indexing
    module = pdb.STREAM_DBI.DBIExHeaders[ref.iMod - 1]
    data = pdb.streams[module.stream].data
    fn = GlobalProc.parse(data[ref.offset:], entry_offest = ref.offset)
    print_function_declaration_from_tpi_by_idx(pdb, fname, fn.PROCSYM32.typind)
def print_function_declaration_from_mods_stream_named_params(pdb, fname):
    """Outputs <fname>'s prototype using TPI and module streams

    Parameter names and types are taken from the S_REGREL32 records that
    follow the procedure record: the first <number of arguments> of them are
    treated as the formal parameters.
    """
    refs = [s for s in pdb.STREAM_GSYM.globals if s.leaf_type == S_PROCREF and s.name == fname]
    if not refs:
        print("There is no S_PROCREF-type reference to", fname, "in the global symbols stream.")
        return

    ref = refs[0]
    data = pdb.streams[pdb.STREAM_DBI.DBIExHeaders[ref.iMod - 1].stream].data
    fn = GlobalProc.parse(data[ref.offset:], entry_offest = ref.offset)

    types = pdb.STREAM_TPI.types
    typind = fn.PROCSYM32.typind
    if typind not in types:
        print("There is no type record for", fname, "( PROCSYM32.typind =", typind, ") in the TPI stream")
        return

    tp = types[typind]
    paramcnt = tp.arglist.count  # variable number of arguments is not supported
    regrel = [e for e in flatten_frame_data(fn.frame_data) if e.rectyp == "S_REGREL32"]

    params = []
    for e in regrel[0:paramcnt]:
        if e.typind in types:
            param_type = types[e.typind]
        else:
            # Primitive types do not have records in the TPI stream; for them,
            # typind is an enum rather than an index. It must be parsed by
            # means of pdbparse's constructs to produce a compatible enum
            # expected by get_type_name()
            param_type = tpi.base_type.parse(e.typind.to_bytes(16, byteorder='little'))
        params.append(get_type_name(param_type) + " " + e.name)

    print(tp.call_conv, " ", get_type_name(tp.return_type), " ", fname, "(", ", ".join(params), ")", sep = "")
if __name__ == '__main__':
    # Command-line entry point: parse the pdb once, then serve every request
    # in the fixed order -s, -v, -f, -fna.
    ap = ArgumentParser(description = "Retrieves types and variable declarations from a pdb file and outputs them in a human-readable language.")
    ap.add_argument("-p", required = True, help = "a path to the pdb file")
    ap.add_argument("-s", required = False, nargs='+', help = "names of structures to output the definitions for")
    ap.add_argument("-v", required = False, nargs='+', help = "names of variables to output the declarations for")
    ap.add_argument("-f", required = False, nargs='+', help = "names of functions to output the prototypes for (with unnamed formal parameters)")
    ap.add_argument("-fna", required = False, nargs='+', help = "names of functions to output the prototypes for (with named formal parameters)")
    args = ap.parse_args()

    pdb = pdbparse.parse(args.p)

    # Each option is paired with the printer that handles its names; an
    # option that was not given is None and contributes nothing.
    requests = (
        (args.s, print_struct_definition),
        (args.v, print_variable_declaration),
        (args.f, print_function_declaration_from_tpi),
        (args.fna, print_function_declaration_from_mods_stream_named_params),
    )
    for names, handler in requests:
        for itm in names or ():
            handler(pdb, itm)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The detailed explanation can be found here.