Skip to content

Instantly share code, notes, and snippets.

@Auscitte
Last active July 9, 2021 18:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Auscitte/37aa7b2d3be058cb6b4d5b8b4c13477a to your computer and use it in GitHub Desktop.
Save Auscitte/37aa7b2d3be058cb6b4d5b8b4c13477a to your computer and use it in GitHub Desktop.
Retrieving types and variables from a pdb file
# This file is a copy of gdata.py from pdbparse library ver. 1.5
# (see https://github.com/moyix/pdbparse)
# with a few modifications that are necessary for my scripts to work correctly
# Ry Auscitte
# Python 2 and 3
from construct import *
# Body of one global-symbol record: a 16-bit record kind ("leaf_type") followed
# by a payload whose layout is selected by that kind. No explicit default case
# is supplied to the Switch.
gsym = Struct(
    "leaf_type" / Int16ul,
    "data" / Switch(
        lambda record: record.leaf_type,
        {
            # S_PUB32: public symbol with a zero-terminated name
            0x110E: "data_v3" / Struct(
                "symtype" / Int32ul,
                "offset" / Int32ul,
                "segment" / Int16ul,
                "name" / CString(encoding = "utf8"),
            ),
            # S_PUB32_ST: older variant whose name is length-prefixed
            0x1009: "data_v2" / Struct(
                "symtype" / Int32ul,
                "offset" / Int32ul,
                "segment" / Int16ul,
                "name" / PascalString(lengthfield = "length" / Int8ul, encoding = "utf8"),
            ),
            # S_PROCREF (from struct REFSYM2 in cvinfo.h)
            0x1125: "proc_ref" / Struct(
                "sumname" / Int32ul,
                "offset" / Int32ul,
                "iMod" / Int16ul,
                "name" / CString(encoding = "utf8"),
            ),
            # S_LPROCREF (from struct REFSYM2 in cvinfo.h)
            0x1127: "proc_ref" / Struct(
                "sumname" / Int32ul,
                "offset" / Int32ul,
                "iMod" / Int16ul,
                "name" / CString(encoding = "utf8"),
            ),
            # S_UDT (from struct UDTSYM in cvinfo.h)
            0x1108: "udt" / Struct(
                "typind" / Int32ul,
                "name" / CString(encoding = "utf8"),
            ),
            # S_GDATA32 (from struct DATASYM32 in cvinfo.h)
            0x110d: "datasym" / Struct(
                "typind" / Int32ul,
                "offset" / Int32ul,
                "segment" / Int16ul,
                "name" / CString(encoding = "utf8"),
            ),
            # S_LDATA32: same layout as the global data symbol above
            0x110c: "datasym" / Struct(
                "typind" / Int32ul,
                "offset" / Int32ul,
                "segment" / Int16ul,
                "name" / CString(encoding = "utf8"),
            ),
            # S_CONSTANT
            0x1107: "const" / Struct(
                # Type index (containing enum if enumerate) or metadata token
                "typind" / Int32ul,
                # numeric leaf containing value
                "value" / Int16ul,
                "name" / CString(encoding = "utf8"),
            ),
        },
    ),
)
# The stream is read as a back-to-back run of records: a 16-bit byte count
# followed by that many bytes, which are then re-parsed as a ``gsym`` body.
GlobalsData = "globals" / GreedyRange(
    Struct(
        "length" / Int16ul,
        "symbol" / RestreamData(Bytes(lambda record: record.length), gsym),
    )
)
def parse(data):
    """Parse the global symbols stream from the bytes-like object <data>.

    Returns the flattened records produced by merge_structures().
    """
    return merge_structures(GlobalsData.parse(data))
def parse_stream(stream):
    """Parse the global symbols stream from the file-like object <stream>.

    Returns the flattened records produced by merge_structures().
    """
    return merge_structures(GlobalsData.parse_stream(stream))
def merge_structures(con):
    """Flatten the nested records parsed by GlobalsData.

    Each entry of <con> contributes one Container holding the record's
    'length' and 'leaf_type' plus, when the record has a non-empty payload,
    every field of that payload. The Containers are returned in a
    ListContainer, in the order of <con>.
    """
    flattened = []
    for record in con:
        fields = {'length': record.length, 'leaf_type': record.symbol.leaf_type}
        payload = record.symbol.data
        if payload:
            # RAusc: copy whatever fields this record kind defines rather than
            # assuming the symtype/offset/segment/name quartet
            fields.update((key, payload[key]) for key in payload.keys())
        flattened.append(Container(fields))
    return ListContainer(flattened)
""" A quick and dirty way of retrieving symbols' types from a pdb file using the pdbparse library.
The script is incomplete and merely aims to demonstrate one of the possible methods.
:Copyright:
Ry Auscitte 2020. This script is distributed under GPL.
:Authors:
Ry Auscitte
"""
import pdbparse
from pdbparse import tpi
import sys
import construct as cs
from argparse import ArgumentParser
# Record kind of a reference to a procedure, as defined in
# https://github.com/microsoft/microsoft-pdb/blob/master/include/cvinfo.h
S_PROCREF = 0x1125
# base_type_size has been borrowed from pdbparse's pdb_print_ctypes.py
# (see https://github.com/moyix/pdbparse)
base_type_size = {
    # T_32P* entries are all 4 bytes wide
    "T_32PRCHAR": 4,
    "T_32PUCHAR": 4,
    "T_32PULONG": 4,
    "T_32PUQUAD": 4,
    "T_32PUSHORT": 4,
    "T_32PVOID": 4,
    "T_32PLONG": 4,
    # T_64P* entries are all 8 bytes wide
    "T_64PRCHAR": 8,
    "T_64PUCHAR": 8,
    "T_64PULONG": 8,
    "T_64PUQUAD": 8,
    "T_64PUSHORT": 8,
    "T_64PVOID": 8,
    "T_64PLONG": 8,
    # remaining (non-T_32P*/T_64P*) entries, size in bytes
    "T_INT4": 4,
    "T_INT8": 8,
    "T_LONG": 4,
    "T_QUAD": 8,
    "T_RCHAR": 1,
    "T_REAL32": 4,
    "T_REAL64": 8,
    "T_REAL80": 10,
    "T_SHORT": 2,
    "T_UCHAR": 1,
    "T_UINT4": 4,
    "T_ULONG": 4,
    "T_UQUAD": 8,
    "T_USHORT": 2,
    "T_WCHAR": 2,
}
def get_type_name(tp):
    """Returns a name for pdbparse's TPI types.

    Only some types are supported: primitive types, structures, unions,
    pointers, modifiers ('const', 'volatile', 'unaligned') and 1D arrays.
    Any other record yields the string "UNKNOWN".

    <tp> is either a TPI record (an object with a ``tpi_idx`` attribute) or
    a primitive-type value, which is simply converted with str().
    """
    #a primitive type does not have a record
    if not hasattr(tp, "tpi_idx"):
        return str(tp)

    kind = tp.leaf_type

    #for structures and unions, just print out the name
    if kind in ("LF_UNION", "LF_STRUCTURE"):
        return tp.name

    #a pointer to a known type
    if kind == "LF_POINTER":
        return get_type_name(tp.utype) + "*"

    #handling 'const', 'volatile', and 'unaligned' modifiers
    if kind == "LF_MODIFIER":
        parts = [mod for mod in ('const', 'volatile', 'unaligned') if tp.modifier[mod]]
        parts.append(get_type_name(tp.modified_type))
        #joining all the parts at once avoids a leading space when no flag is set
        return " ".join(parts)

    #1D arrays; the element count is only known for elements of a primitive type
    if kind == "LF_ARRAY":
        element = tp.element_type
        if hasattr(element, "tpi_idx"):
            #the element is itself a TPI record, so it has no entry in base_type_size
            element_size = None
        else:
            element_size = base_type_size.get(str(element))
        if not element_size:
            #element size unknown: leave the count out instead of failing
            return get_type_name(element) + "[]"
        #tp.size is the size of the whole array in bytes
        return get_type_name(element) + "[" + str(tp.size // element_size) + "]"

    return "UNKNOWN"
def print_struct_definition(pdb, sname):
    """Retrieves and prints a definition of the structure <sname> from the TPI stream.

    The first LF_STRUCTURE record named <sname> is used; its fields are
    printed one per line as "<name> : <type>". If there is no such record,
    a message saying so is printed instead.
    """
    #stop at the first match; .name is only read for LF_STRUCTURE records
    record = next(
        (t for t in pdb.STREAM_TPI.types.values()
         if t.leaf_type == "LF_STRUCTURE" and t.name == sname),
        None,
    )
    if record is None:
        print("Structure", sname, "is not defined in the tpi stream.")
        return

    print("struct", sname, "{")
    for field in record.fieldlist.substructs:
        print("\t", field.name, ":", get_type_name(field.index))
    print("}")
def print_variable_declaration(pdb, vname):
    """Outputs a declaration for the variable <vname>.

    Scans the global symbols for the first symbol named <vname> that carries
    a type index and prints "<type> <vname>;". Symbols with the right name
    but no type index are reported and skipped. If nothing suitable is found,
    a message saying so is printed.
    """
    for sym in pdb.STREAM_GSYM.globals:
        if "name" not in sym or sym.name != vname:
            continue

        if "typind" not in sym:
            print("Found a symbol named", vname, "but, it did not have an associated type.")
            continue

        types = pdb.STREAM_TPI.types
        if sym.typind in types:
            tp = types[sym.typind]
        else:
            #Primitive types do not have records in the TPI stream; for them, typind is an enum
            #value that must be parsed by means of pdbparse's constructs to be accepted by get_type_name()
            tp = tpi.base_type.parse(sym.typind.to_bytes(16, byteorder='little'))

        print(get_type_name(tp), " ", vname, ";", sep = "")
        return

    print("Could not find variable", vname)
def print_function_declaration_from_tpi_by_idx(pdb, fname, typind):
    """Outputs a prototype for the function <fname> with the type index <typind>.

    Prints "<call_conv> <return type> <fname>(<parameter types>)". A message
    is printed instead when <typind> has no record in the TPI stream or the
    record is not an LF_PROCEDURE.
    """
    types = pdb.STREAM_TPI.types
    if typind not in types:
        print("There is no record with the index", typind, "in the TPI stream")
        return

    proc = types[typind]
    #not dealing with static and member functions
    if proc.leaf_type != "LF_PROCEDURE":
        print("The type at", typind, "is not a fuction, but", proc.leaf_type)
        return

    arg_names = ", ".join([get_type_name(arg) for arg in proc.arglist.arg_type])
    print(proc.call_conv, " ", get_type_name(proc.return_type), " ", fname, "(", arg_names, ")", sep="")
# The parsing constructs below follow the definitions of PROCSYM32, FRAMEPROCSYM, REGREL32, and CALLSITEINFO
# from https://github.com/microsoft/microsoft-pdb/blob/master/include/cvinfo.h
# A single PROCSYM32 record, including its leading reclen/rectyp header.
GlobalProcSym = "PROCSYM32" / cs.Struct(
    # record header
    "reclen" / cs.Int16ul,
    "rectyp" / cs.Int16ul,
    # 32-bit fields of the record
    "pParent" / cs.Int32ul,
    "pEnd" / cs.Int32ul,
    "pNext" / cs.Int32ul,
    "len" / cs.Int32ul,
    "DbgStart" / cs.Int32ul,
    "DbgEnd" / cs.Int32ul,
    "typind" / cs.Int32ul,
    "offset" / cs.Int32ul,
    # 16-bit, 8-bit and string fields closing the record
    "seg" / cs.Int16ul,
    "flags" / cs.Int8ul,
    "name" / cs.CString(encoding = "utf8"),
)
def print_function_declaration_from_mods_stream(pdb, fname):
    """Prints out a declaration for the function named <fname>.

    For every S_PROCREF entry named <fname> in the global symbols stream,
    the referenced PROCSYM32 record is parsed from its module stream and the
    prototype for its type index is printed. Nothing is printed when there
    is no such entry.
    """
    for ref in pdb.STREAM_GSYM.globals:
        if ref.leaf_type != S_PROCREF or ref.name != fname:
            continue
        #iMod is 1-based while DBIExHeaders is indexed from 0
        data = pdb.streams[pdb.STREAM_DBI.DBIExHeaders[ref.iMod - 1].stream].data
        proc = GlobalProcSym.parse(data[ref.offset:])
        #the (pdb, name, type index) printer is the *_by_idx variant
        print_function_declaration_from_tpi_by_idx(pdb, proc.name, proc.typind)
# Body of one record found between a PROCSYM32 and its end: a 16-bit record
# kind followed by a payload ("reminder") whose layout depends on that kind.
# Only S_FRAMEPROC, S_REGREL32 and S_CALLSITEINFO have a payload layout here.
ProcFrameData = cs.Struct(
    "rectyp" / cs.Enum(cs.Int16ul, S_FRAMEPROC = 0x1012, S_CALLSITEINFO = 0x1139, S_REGREL32 = 0x1111),
    "reminder" / cs.Switch(
        lambda ctx: ctx.rectyp,
        {
            # Field order and widths follow struct FRAMEPROCSYM in cvinfo.h;
            # cbPad and sectExHdlr must be present and offExHdlr must be
            # 32 bits wide, otherwise the fields after cbFrame are read from
            # the wrong bytes.
            "S_FRAMEPROC": "FRAMEPROCSYM" / cs.Struct(
                "cbFrame" / cs.Int32ul,
                "cbPad" / cs.Int32ul,
                "offPad" / cs.Int32ul,
                "cbSaveRegs" / cs.Int32ul,
                "offExHdlr" / cs.Int32ul,
                "sectExHdlr" / cs.Int16ul,
                "flags" / cs.Int32ul,
            ),
            "S_REGREL32": "REGREL32" / cs.Struct(
                "off" / cs.Int32ul,
                "typind" / cs.Int32ul,
                "reg" / cs.Int16ul,
                "name" / cs.CString(encoding = "utf8"),
            ),
            "S_CALLSITEINFO": "CALLSITEINFO" / cs.Struct(
                "off" / cs.Int32ul,
                "sect" / cs.Int16ul,
                "__reserved_0" / cs.Int16ul,
                "typind" / cs.Int32ul,
            ),
        },
    ),
)
# A run of length-prefixed records: a 16-bit byte count followed by that many
# bytes, which are re-parsed as a ProcFrameData body.
ProcFrameEntries = cs.GreedyRange(
    cs.Struct(
        "reclen" / cs.Int16ul,
        "frame_entry" / cs.RestreamData(cs.Bytes(lambda entry: entry.reclen), ProcFrameData),
    )
)
# A PROCSYM32 record together with the records that follow it up to pEnd.
# Parsing expects the keyword argument entry_offest: the offset of the
# PROCSYM32 record within its module stream.
GlobalProc = cs.Struct(
    #GlobalProcSym is the same "PROCSYM32" record layout, defined once above
    GlobalProcSym,
    #making sure the entire length of PROCSYM32 has been parsed
    cs.Padding(lambda ctx: ctx.PROCSYM32.reclen + cs.Int16ul.sizeof() - ctx._io.tell()),
    "frame_data" / cs.RestreamData(
        #ctx.PROCSYM32.pEnd points to the region immediately following the last element of ProcFrameEntries
        #ctx.PROCSYM32.reclen does not include the reclen field hence the cs.Int16ul.sizeof() correction
        cs.Bytes(
            lambda ctx: ctx.PROCSYM32.pEnd
            - ctx._params.entry_offest
            - ctx.PROCSYM32.reclen
            - cs.Int16ul.sizeof()
        ),
        ProcFrameEntries,
    ),
)
def flatten_frame_data(cont):
    """Flattens the nested structure of ProcFrameData.

    For every entry of <cont> (as parsed by ProcFrameEntries) a Container is
    produced holding 'reclen', 'rectyp' and the fields of the entry's payload,
    except those whose names start with an underscore. Entries without a
    payload contribute only 'reclen' and 'rectyp'. The result is a
    ListContainer in the order of <cont>.
    """
    flat = cs.lib.ListContainer()
    for entry in cont:
        item = cs.lib.Container()
        item["reclen"] = entry.reclen
        item["rectyp"] = entry.frame_entry.rectyp
        #NOTE(review): presumably the payload is None when rectyp matches none of
        #the Switch cases in ProcFrameData; such entries must not break the loop
        payload = entry.frame_entry.reminder
        for key in payload or ():
            #skipping the parser's bookkeeping fields
            if key.startswith("_"):
                continue
            item[key] = payload[key]
        flat.append(item)
    return flat
def print_function_declaration_from_tpi(pdb, fname):
    """Outputs <fname>'s prototype using TPI records only.

    The first S_PROCREF entry named <fname> in the global symbols stream is
    followed into its module stream to obtain the function's type index; a
    message is printed if there is no such entry.
    """
    refs = [s for s in pdb.STREAM_GSYM.globals if s.leaf_type == S_PROCREF and s.name == fname]
    if not refs:
        print("There is no S_PROCREF-type reference to", fname, "in the global symbols stream.")
        return

    ref = refs[0]
    #Indices given by iMod are 1-based while pdb.STREAM_DBI.DBIExHeaders[] is a standard python list with 0-based indexing
    module = pdb.STREAM_DBI.DBIExHeaders[ref.iMod - 1]
    data = pdb.streams[module.stream].data
    proc = GlobalProc.parse(data[ref.offset:], entry_offest = ref.offset)
    print_function_declaration_from_tpi_by_idx(pdb, fname, proc.PROCSYM32.typind)
def print_function_declaration_from_mods_stream_named_params(pdb, fname):
    """Outputs <fname>'s prototype using TPI and module streams.

    Parameter names are taken from the S_REGREL32 records that follow the
    function's PROCSYM32 record; the first <number of arguments> of them are
    treated as the formal parameters. A message is printed if <fname> has no
    S_PROCREF entry or its type index has no record in the TPI stream.
    """
    refs = [s for s in pdb.STREAM_GSYM.globals if s.leaf_type == S_PROCREF and s.name == fname]
    if not refs:
        print("There is no S_PROCREF-type reference to", fname, "in the global symbols stream.")
        return

    ref = refs[0]
    #iMod is 1-based while DBIExHeaders is indexed from 0
    module = pdb.STREAM_DBI.DBIExHeaders[ref.iMod - 1]
    data = pdb.streams[module.stream].data
    fn = GlobalProc.parse(data[ref.offset:], entry_offest = ref.offset)

    types = pdb.STREAM_TPI.types
    proc_typind = fn.PROCSYM32.typind
    if proc_typind not in types:
        print("There is no type record for", fname, "( PROCSYM32.typind =", proc_typind, ") in the TPI stream")
        return

    tp = types[proc_typind]
    paramcnt = tp.arglist.count #variable number of arguments is not supported
    regrels = [e for e in flatten_frame_data(fn.frame_data) if e.rectyp == "S_REGREL32"]

    params = []
    for reg in regrels[0:paramcnt]:
        if reg.typind in types:
            param_type = types[reg.typind]
        else:
            #Primitive types do not have records in the TPI stream; for them, typind is enum rather than an index
            #In this case, typind must be parsed by means of pdbparse's constructs to produce a compatible enum
            #expected by get_type_name()
            param_type = tpi.base_type.parse(reg.typind.to_bytes(16, byteorder='little'))
        params.append(get_type_name(param_type) + " " + reg.name)

    print(tp.call_conv, " ", get_type_name(tp.return_type), " ", fname, "(", ", ".join(params), ")", sep = "")
if __name__ == '__main__':
    # Command-line entry point: parse the options, load the pdb file, then run
    # the requested printers in the order -s, -v, -f, -fna.
    ap = ArgumentParser(description = "Retrieves types and variable declarations from a pdb file and outputs them in a human-readable language.")
    ap.add_argument("-p", required = True, help = "a path to the pdb file")
    ap.add_argument("-s", required = False, nargs='+', help = "names of structures to output the definitions for")
    ap.add_argument("-v", required = False, nargs='+', help = "names of variables to output the declarations for")
    ap.add_argument("-f", required = False, nargs='+', help = "names of functions to output the prototypes for (with unnamed formal parameters)")
    ap.add_argument("-fna", required = False, nargs='+', help = "names of functions to output the prototypes for (with named formal parameters)")
    args = ap.parse_args()

    pdb = pdbparse.parse(args.p)

    # An option that was not given is None and contributes nothing
    requests = (
        (args.s, print_struct_definition),
        (args.v, print_variable_declaration),
        (args.f, print_function_declaration_from_tpi),
        (args.fna, print_function_declaration_from_mods_stream_named_params),
    )
    for names, printer in requests:
        if names is None:
            continue
        for itm in names:
            printer(pdb, itm)
@Auscitte
Copy link
Author

Auscitte commented Jul 3, 2021

The detailed explanation can be found here.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment