@jesboat
Created May 6, 2020 22:18
kernel debug notes
import json
import copy
import sys
from core.cvalue import cast, dereference, addressof, getfieldoffset, sizeof, unsigned
import xnu
from xnu import kern, IterateQueue, GetLLDBThreadForKernelThread, GetObjectAtIndexFromArray
ST_RUNNABLE = 'runnable'
ST_WAIT = 'wait'
ST_IDLE = 'idle'
class Queue(object):
"""
Efficient FIFO queue which supports adding new elements and iterating
simultaneously.
"""
def __init__(self, things=[]):
""" Creates a new queue, (optionally) pre-adding everything in things """
self.out_side = list(things)
self.out_idx = 0
self.in_side = []
def add(self, thing):
""" Add a single element to the end of the queue """
self.in_side.append(thing)
def __iter__(self):
"""
Iterate over items in the queue, including all items which are in the
queue at the start of the iter and any items which are added during the
course of the iteration. Items are removed from the queue when they are
returned by the iterator.
"""
while self.in_side or self.out_side:
if self.out_side:
while self.out_idx < len(self.out_side):
item = self.out_side[self.out_idx]
self.out_idx += 1
yield item
self.out_side = None
if self.in_side:
self.out_side = self.in_side
self.out_idx = 0
self.in_side = []
def size(self):
""" Returns the number of pending items in the queue. """
size = len(self.in_side)
if self.out_side:
size += len(self.out_side) - self.out_idx
return size
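# A (hypothetical) usage sketch: items enqueued while the queue is being
# iterated are still visited before the iteration ends, which is what
# FullIpcScanner.drain_queue() below relies on:
#
#   q = Queue(['first'])
#   for item in q:
#       if item == 'first':
#           q.add('second')   # discovered while handling 'first'
#   # the loop only ends once 'second' has also been yielded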
class WaitqScanner(object):
"""
Utility for scanning a collection of waitqs in order to build a map from
wqsets to the waitqs in them.
Conceptually, wqsets contain waitqs, and each waitq can be in zero or more
wqsets. However, xnu stores this inverted; instead of wqsets having a list
of their members, each waitq refers (through a somewhat complicated
datastructure) to the wqsets which directly contain it. WaitqScanner has
the logic for interpreting that datastructure, and the logic for building
the effective map from wqsets to their members.
Usage is:
scanner = WaitqScanner()
for waitq in waitqs:
scanner.process_waitq(waitq)
for wqset in wqsets:
# what you wish you could write
members = [wq for wq in waitqs if wqset.contains(wq)]
# what you can write
members = scanner.get_members_of_wqset(wqset)
"""
def __init__(self):
if hasattr(kern.globals, 'g_lt_idx_max'):
self.idx_mask = int(kern.globals.g_lt_idx_max)
else:
self.idx_mask = 0x000000000003ffff
# Cache from waitq_set_id to array of waitq_set proxies for the sets
# directly containing waitqs which have `waitq->waitq_set_id == that`.
self.setid_cache = {}
# Map from address of waitq_set to waitqs they hold
self.set_to_members = {}
def _read_id(self, id):
""" Given an ID, return a proxy for the `struct waitq_link` """
# borrowed from lldbmacros/ipc.py
idx = int(id & self.idx_mask)
slab_slot = idx / kern.globals.g_wqlinktable.slab_elem
slab = kern.globals.g_wqlinktable.table[int(slab_slot)]
addr = int(slab) + int(kern.globals.g_wqlinktable.elem_sz)*(
idx - cast(slab, 'lt_elem*').lt_id.idx)
return kern.GetValueFromAddress(addr, 'waitq_link *')
def _expand_setid(self, id):
"""
Given an ID (as in the waitq_set_id field of a `struct waitq` or in the
wql_link.left_setid or wql_link.right_setid of a `struct waitq_link`),
returns a list of proxies for all the wqsets which directly contain any
waitqs which have that ID in their waitq_set_id field.
"""
if id not in self.setid_cache:
self.setid_cache[id] = self._do_expand_setid(id)
return self.setid_cache[id]
def _do_expand_setid(self, id):
""" implementation of _expand_setid """
sets = []
id_queue = Queue([id])
for id in id_queue:
link = self._read_id(id)
wqe_type = {
0: 'FREE',
1: 'ELEM',
2: 'LINK',
3: 'RSVD'
}[(link.wqte.lt_bits >> 29) & 0x3]
if wqe_type == 'ELEM':
sets.append(link.wql_wqs.wql_set)
elif wqe_type == 'LINK':
id_queue.add(link.wql_link.left_setid)
id_queue.add(link.wql_link.right_setid)
else:
assert(wqe_type == 'FREE')
return sets
def process_waitq(self, waitq_proxy):
"""
Slurp in all data necessary to determine which wqsets contain that
waitq
"""
waitq_addr = int(waitq_proxy)
setid = int(waitq_proxy.waitq_set_id)
if setid:
wqsets = self._expand_setid(setid)
else:
wqsets = []
for wqset in wqsets:
wqset_addr = int(wqset)
self.set_to_members.setdefault(wqset_addr, {
'wqset_addr': wqset_addr,
'wqset_proxy': wqset,
'members': {},
})
self.set_to_members[wqset_addr]['members'][waitq_addr] = waitq_proxy
def get_members_of_wqset(self, wqset_proxy):
"""
Return all processed waitqs which are members of the wqset
"""
wqset_addr = int(wqset_proxy)
info = self.set_to_members.get(wqset_addr, None)
if info:
return info['members'].values()
else:
return []
class FullIpcScanner(object):
"""
Read and cross-reference basically everything about possible IPC. Used
like:
s = FullIpcScanner()
s.process_system()
All the data is accumulated in `s.store` but it isn't really in a
nice-to-consume format yet.
"""
def __init__(self):
self.queue = Queue()
self.store = {
'task': {},
'port': {},
'pset': {},
'kmsg': {},
}
self.waitq_scanner = WaitqScanner()
self.waitq_to_obj = {}
def process_system(self):
""" Read/process everything """
for task_proxy in kern.tasks:
self.discovered_object('task', task_proxy, 'system', 'tasks')
self.drain_queue()
for task_proxy in kern.terminated_tasks:
self.discovered_object('task', task_proxy, 'system', 'terminated_tasks')
self.store['task'][int(task_proxy)]['terminated'] = True
self.drain_queue()
self.finalize_psets()
def discovered_object(self, kind, proxy, example_from_kind, example_from_thing):
"""
Called when the first reference is discovered to some object `(kind, proxy)`.
Initializes storage for the object (so that any in/out references now known
can be recorded immediately) and enqueues it for later traversal.
"""
addr = int(proxy)
self.store[kind].setdefault(addr, {
'addr': addr,
'proxy': proxy,
'in': [],
'out': [],
})
self.queue.add((kind, proxy, example_from_kind, example_from_thing))
def drain_queue(self):
""" Process all enqueued objects until the queue is empty. """
for kind, proxy, example_from_kind, example_from_thing in self.queue:
sys.stderr.write(
"Debug: processing %s [%s] (referenced from %s [%s])...\n"
% (kind, proxy, example_from_kind, example_from_thing))
if kind == 'task':
self.process_task(proxy)
elif kind == 'port':
self.process_port(proxy)
elif kind == 'pset':
self.process_pset(proxy)
elif kind == 'kmsg':
self.process_kmsg(proxy)
else:
raise Exception()
def add_reference(self, frm, to, attributes):
"""
Called when a reference is discovered from some already-known object
`frm = (from_type, from_addr)` to some object `to = (to_kind, to_proxy)`
which may or may not yet be known.
"""
from_type, from_addr = frm
to_type, to_proxy = to
to_addr = int(to_proxy)
assert(from_addr in self.store[from_type])
if to_addr not in self.store[to_type]:
self.discovered_object(to_type, to_proxy, from_type, hex(from_addr))
self.store[from_type][from_addr]['out'].append({
'to_type': to_type,
'to_addr': to_addr,
'attributes': attributes,
})
self.store[to_type][to_addr]['in'].append({
'from_type': from_type,
'from_addr': from_addr,
'attributes': attributes,
})
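# For example (hypothetical addresses and name), after a task at 0xAAAA gains a
# send right named 1003 to a port at 0xBBBB, the store would contain roughly:
#
#   store['task'][0xAAAA]['out'] == [{'to_type': 'port', 'to_addr': 0xBBBB,
#                                     'attributes': {'name': '1003', 'rights': 'Sendmany'}}]
#   store['port'][0xBBBB]['in']  == [{'from_type': 'task', 'from_addr': 0xAAAA,
#                                     'attributes': {'name': '1003', 'rights': 'Sendmany'}}]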
def process_task(self, task_proxy):
""" Discover all out-references from the specified task """
task_addr = int(task_proxy)
bsd_info = cast(task_proxy.bsd_info, 'proc*')
if int(bsd_info):
task_name = str(bsd_info.p_name)
elif 'terminated' in self.store['task'][task_addr]:
task_name = '__terminated'
else:
task_name = '__unknown??'
self.store['task'][task_addr]['task_name'] = task_name
space_proxy = task_proxy.itk_space
num_entries = int(space_proxy.is_table_size)
table_proxy = space_proxy.is_table
sys.stderr.write("Processing task '%s' (approx %d ports)...\n" % (task_name, num_entries))
for idx in range(0, num_entries):
entry_proxy = GetObjectAtIndexFromArray(table_proxy, idx)
ie_bits = int(entry_proxy.ie_bits)
if ie_bits & 0x000f0000 and ie_bits & 0x001f0000:
entry_name = "{:x}".format((idx << 8 | ie_bits >> 24))
if ie_bits & 0x00100000:
pass # dead
elif ie_bits & 0x00080000:
right = 'Set'
pset_proxy = cast(entry_proxy.ie_object, 'ipc_pset*')
self.add_reference(('task', task_addr), ('pset', pset_proxy), {'name': entry_name})
else: # port
port_proxy = cast(entry_proxy.ie_object, 'ipc_port_t')
if ie_bits & 0x00010000:
if ie_bits & 0x00020000:
rights = 'SendmanyRecv'
else:
rights = 'Sendmany'
elif ie_bits & 0x00020000:
rights = 'Recv'
elif ie_bits & 0x00040000:
rights = 'Sendonce'
else:
rights = 'Unknown'
if int(entry_proxy.index.request):
requestsval = port_proxy.ip_requests
sorightval = requestsval[int(entry_proxy.index.request)].notify.port
soright_ptr = int(sorightval)
if soright_ptr != 0:
rights += '/dead'
if soright_ptr & 0x1: rights += '/sendPossibleArmed'
if soright_ptr & 0x2: rights += '/sendPossibleRequested'
if port_proxy.ip_nsrequest != 0: rights += '/nosendersNotificationRequested'
if port_proxy.ip_pdrequest != 0: rights += '/destroyNotificationRequested'
self.add_reference(('task', task_addr), ('port', port_proxy),
{'name': entry_name, 'rights': rights})
for specialkind, specials in [
('Sendmany', [
'itk_self', 'itk_host', 'itk_bootstrap',
'itk_seatbelt', 'itk_gssd', 'itk_debug_control', 'itk_task_access',
]),
('Naked', ['itk_nself']),
('Recv', ['itk_resume']),
]:
for special in specials:
port_proxy = getattr(task_proxy, special)
if int(port_proxy) > 0:
self.add_reference(('task', task_addr), ('port', port_proxy),
{'special': special, 'rights': specialkind})
tr_max = sizeof(task_proxy.itk_registered) / sizeof(task_proxy.itk_registered[0])
for idx in range(0, tr_max):
port_proxy = task_proxy.itk_registered[idx]
if int(port_proxy) > 0:
self.add_reference(('task', task_addr), ('port', port_proxy),
{'special': 'registered:' + str(idx), 'rights': 'Sendmany'})
ex_max = sizeof(task_proxy.exc_actions) / sizeof(task_proxy.exc_actions[0])
for idx in range(0, ex_max):
port_proxy = task_proxy.exc_actions[idx].port
if int(port_proxy) > 0:
self.add_reference(('task', task_addr), ('port', port_proxy),
{'special': 'exc:' + str(idx), 'rights': 'Sendmany'})
for thread_proxy in IterateQueue(task_proxy.threads, 'thread*', 'task_threads'):
if int(thread_proxy.ith_sself) > 0:
self.add_reference(('task', task_addr), ('port', thread_proxy.ith_self),
{'special': 'thread_self:' + hex(int(thread_proxy)),
'rights': 'Sendmany'})
if int(thread_proxy.ith_special_reply_port) > 0:
self.add_reference(('task', task_addr), ('port', thread_proxy.ith_special_reply_port),
{'special': 'thread_reply:' + hex(int(thread_proxy)),
'rights': 'Sendonce'})
if int(thread_proxy.ith_voucher) > 0:
vport = thread_proxy.ith_voucher.iv_port
if int(vport) > 0:
self.add_reference(('task', task_addr), ('port', vport),
{'special': 'thread_voucher:' + hex(int(thread_proxy)),
'rights': 'Voucher'})
if int(thread_proxy.exc_actions) > 0:
for idx in range(0, ex_max):
export = thread_proxy.exc_actions[idx].port
if int(export) > 0:
self.add_reference(('task', task_addr), ('port', export),
{'special': 'thread_ex' + hex(int(thread_proxy)) + ':' + str(idx),
'rights': 'Sendmany'})
def process_port(self, port_proxy):
""" Discover all out-references from the specified port """
if port_proxy.ip_messages.data.port.msgcount > 0:
kmsgheadp = cast(port_proxy.ip_messages.data.port.messages.ikmq_base, 'ipc_kmsg_t')
kmsgp = kmsgheadp
while unsigned(kmsgp) > 0:
self.add_reference(('port', int(port_proxy)), ('kmsg', kmsgp), {})
kmsgp = kmsgp.ikm_next
if kmsgp == kmsgheadp:
break
if int(port_proxy.ip_nsrequest):
self.add_reference(
('port', int(port_proxy)),
('port', cast(port_proxy.ip_nsrequest, 'ipc_port*')),
{'kind': 'no-senders'})
if int(port_proxy.ip_pdrequest):
self.add_reference(
('port', int(port_proxy)),
('port', cast(port_proxy.ip_pdrequest, 'ipc_port*')),
{'kind': 'port-death'})
if int(port_proxy.ip_requests):
table = cast(port_proxy.ip_requests, 'struct ipc_port_request *')
sz = int(table.name.size.its_size)
for i in range(sz):
if i == 0:
continue
ipr = table[i]
if int(ipr.name.name):
ipr_bits = int(ipr.notify.port) & 3
ipr_port = kern.GetValueFromAddress(int(ipr.notify.port) & ~3, 'ipc_port*')
if ipr_bits == 3: ## send-possible armed and requested
ipr_disp = "sendPossibleArmedRequested"
elif ipr_bits & 2: ## send-possible requested
ipr_disp = "sendPossibleRequested"
elif ipr_bits & 1: ## send-possible armed
ipr_disp = "sendPossibleArmed"
else:
ipr_disp = "iprOther"
self.add_reference(
('port', int(port_proxy)),
('port', ipr_port),
{'kind': ipr_disp})
waitq = port_proxy.ip_messages.data.port.waitq
self.waitq_scanner.process_waitq(waitq)
self.waitq_to_obj[int(waitq)] = ('port', int(port_proxy))
def process_pset(self, pset_proxy):
"""
Ensure that, once all ports and psets have been processed,
`self.finalize_psets()` will discover all references from the
specified pset.
"""
waitq = pset_proxy.ips_messages.data.pset.setq.wqset_q
self.waitq_scanner.process_waitq(waitq)
self.waitq_to_obj[int(waitq)] = ('pset', int(pset_proxy))
def finalize_psets(self):
""" Finish pset processing """
assert not self.queue.size(), 'should only be called at end'
for pset_addr, info in self.store['pset'].items():
pset_proxy = info['proxy']
wqset = pset_proxy.ips_messages.data.pset.setq
for waitq in self.waitq_scanner.get_members_of_wqset(wqset):
mem_type, mem_addr = self.waitq_to_obj[int(waitq)]
self.add_reference(
('pset', pset_addr),
(mem_type, mem_addr),
{})
assert not self.queue.size(), \
'psets should only be able to point to things we encountered already'
def process_kmsg(self, kmsg_proxy):
""" Discover all out-references from an in-flight message """
def disposition_str(disp):
disps = {
16: 'recv',
24: 'dispose-receive',
17: 'move-send',
19: 'copy-send',
20: 'make-send',
25: 'dispose-send',
18: 'send-once',
21: 'make-send-once',
26: 'dispose-send-once',
}
return (disps[disp] if disp in disps else 'unknown:' + str(disp))
kmsgh = dereference(kmsg_proxy.ikm_header)
if int(kmsgh.msgh_remote_port):
self.add_reference(
('kmsg', int(kmsg_proxy)),
('port', kmsgh.msgh_remote_port),
{'where': 'remote', 'disposition': disposition_str(kmsgh.msgh_bits & 0x1f)})
if int(kmsgh.msgh_local_port):
self.add_reference(
('kmsg', int(kmsg_proxy)),
('port', kmsgh.msgh_local_port),
{'where': 'local', 'disposition': disposition_str((kmsgh.msgh_bits & 0x1f00) >> 8)})
if int(kmsg_proxy.ikm_voucher):
self.add_reference(
('kmsg', int(kmsg_proxy)),
('port', kmsg_proxy.ikm_voucher),
{'where': 'voucher', 'disposition': 'voucher'})
if kmsgh.msgh_bits & 0x80000000:
body = cast(addressof(cast(addressof(kmsgh), 'char *')[sizeof(kmsgh)]), 'mach_msg_body_t *')
dsc_count = dereference(cast(body, 'uint32_t *'))
dschead = cast(addressof(cast(addressof(body[0]), 'char *')[sizeof('uint32_t')]), 'mach_msg_descriptor_t *')
dsc_list = []
for i in range(dsc_count):
dsc_list.append(dschead[i])
for dsc in dsc_list:
dsc_type = unsigned(dsc.type.type)
if dsc_type == 0: # port
disp = dsc.port.disposition
ports = [dsc.port.name]
elif dsc_type == 2: # ool port
disp = dsc.ool_ports.disposition
ports = [cast(dsc.ool_ports.address, 'ipc_port*')[pidx]
for pidx in range(dsc.ool_ports.count)]
else:
continue
for port in ports:
self.add_reference(
('kmsg', int(kmsg_proxy)),
('port', port),
{'where': 'sent', 'disposition': disposition_str(disp)})
def get_fork_child_info(child_thread_proxy):
"""
Given a thread which is waiting to become the child in a vfork (or
posix_spawn), try to find information about the parent.
During a posix_spawn-doing-exec, the parent first creates a new task and a
thread in the new task with continuation task_wait_to_return. Then, the
parent loads the image into the new task (if applicable), sets the child
thread's saved state so that a return_from_trap will leave the new thread's
userland at the entry point of the new image, and unwaits it. That means
that, for a decent interval, the information on what's going on is only
available in the parent. Furthermore, the only thing which can unwait the
child is the parent, which means if we're trying to find out what the child
is waiting for, we need to walk to the parent.
1. Parent is doing a fork. fork has called fork1 which has called cloneproc
which has called fork_create_child which has called thread_create_waiting
which has made the child thread. The next steps are for thread_create_waiting
to return to fork_create_child to return to cloneproc to return to fork1,
at which point fork1 can call thread_dup, but that hasn't happened yet. AFAICT
there is no way to associate the child with the parent thread (other than
looking at the parent process and hoping only one thread is forking, or handling
every possible intermediate state which seems unrealistic.)
2. Parent is doing a fork, and has gotten as far as thread_dup. We can identify
the parent thread by looking for which thread in the parent process has the same
thread-local segment (as the thread's userspace would see it.)
3. Parent is doing a posix_spawn with POSIX_SPAWN_SETEXEC, which makes
posix_spawn behave like an execve-without-fork. posix_spawn has called
fork_create_child which has called thread_create_waiting. The next steps
are for thread_create_waiting to return to fork_create_child to return to
posix_spawn which will stick it in `imgp->ip_new_thread`, but that hasn't
happened yet. As in 1, AFAICT there is no way to associate them.
4. Parent is doing posix_spawn with POSIX_SPAWN_SETEXEC and has gotten far enough
to set `imgp->ip_new_thread` to the new thread. Obvious.
5. Parent is doing posix_spawn normally. It has called posix_spawn which has
called fork1 which has called cloneproc which has called fork_create_child
which has called thread_create_waiting. The next steps are for
thread_create_waiting, fork_create_child, cloneproc, and fork1 to return; as part
of the tail of fork1, it writes (via ptr used as an out-arg) the new thread into
imgp->ip_new_thread, but that hasn't happened yet. As in 1, AFAICT there is no way
to associate them.
6. Parent is doing posix_spawn normally, and it has gotten far enough to set
imgp->ip_new_thread in the new thread. Obvious.
7. Parent is doing an exec unrelated to a fork. (Note that, while this does
not produce a new BSD process, it does produce a new Mach task/thread for
security reasons.) execve has called fork_create_child which has called
thread_create_waiting. The next step is for thread_create_waiting and
fork_create_child to return so execve can stick the new thread into
`imgp->ip_new_thread`, but that hasn't happened yet. Yada yada.
8. Parent is doing an exec unrelated to a fork, and has gotten far enough
to stick the new thread in `imgp->ip_new_thread`. This is PROBABLY easy to handle,
though I'm not sure exactly how the parent thread/task disappears.
9. vfork is a giant pile of horror which I haven't even started to understand yet.
The current implementation handles only a subset of case 6.
"""
for maybe_parent_task_proxy in kern.tasks:
maybe_parent_bsd = cast(maybe_parent_task_proxy.bsd_info, 'proc*')
for maybe_parent_thread_proxy in IterateQueue(maybe_parent_task_proxy.threads, 'thread *', 'task_threads'):
maybe_parent_thread_lldb = xnu.GetLLDBThreadForKernelThread(maybe_parent_thread_proxy)
for frame in maybe_parent_thread_lldb.frames:
if frame.name == 'exec_activate_image':
imgp_proxy = kern.GetValueFromAddress(
frame.GetValueForVariablePath('imgp').value,
'image_params*'
)
if int(child_thread_proxy) == int(imgp_proxy.ip_new_thread):
return {
'parent_thread': maybe_parent_thread_proxy,
'parent_task': maybe_parent_task_proxy,
'parent_task_pid': int(maybe_parent_bsd.p_pid),
'parent_task_name': str(maybe_parent_bsd.p_comm),
'child_exe_name': str(imgp_proxy.ip_vp.v_name),
'child_argv0': str(imgp_proxy.ip_startargv),
}
return None
# Places where a thread can eventually go after it finishes doing whatever it's
# currently in kernel for.
CONTINUATION_BASE_CASES = set([
# Thread will return to (or make an upcall into) userspace
'USERSPACE',
# Thread will terminate
'TERMINATE',
# Thread is a kernel thread which stays in kernel forever (or almost
# forever)
'KERNEL_DAEMON',
# The eventual destination of a thread is unknown
'UNKNOWN',
])
# Given the symbol name of a continuation, maps to a callable which takes the
# thread proxy blocked on that continuation and returns the /next steps/ after
# the continuation. Generally, the /next steps/ are where the thread will go
# once it is continued; in other words, we imagine that the kernel had been
# written so that this wait happened on an ordinary stack, and we return what
# the frame below the wait would then be. Code-wise, the /next steps/ are returned as either
# a string, a list of strings, or a proxy for another continuation.
CONTINUATION_TO_NEXT = {
'ipc_mqueue_receive_continue':
lambda thread_proxy: thread_proxy.saved.receive.continuation,
'workq_unpark_continue':
lambda thread_proxy: 'USERSPACE:wq',
'_sleep_continue':
lambda thread_proxy: cast(thread_proxy.uthread, 'uthread*').uu_continuation,
'kqueue_scan_continue':
lambda thread_proxy: cast(thread_proxy.uthread, 'uthread*').uu_save.uus_kqueue_scan.cont,
'IOWorkLoop::threadMain()':
lambda thread_proxy: 'TERMINATE',
'semaphore_wait_continue':
lambda thread_proxy: thread_proxy.saved.sema.continuation,
'thread_call_thread':
lambda thread_proxy: [
'thread_call_queue:' + str(thread_proxy.thc_state.thc_group.tcg_name),
'KERNEL_DAEMON',
],
'ulock_wait_continue':
lambda thread_proxy: 'unix_syscall_return',
'task_wait_to_return':
lambda thread_proxy: 'USERSPACE:fork_or_vfork',
'filt_wlwait_continue':
lambda thread_proxy: ['kevent_register_wait_return', 'unix_syscall_return'],
'psynch_cvcontinue':
lambda thread_proxy: 'USERSPACE:syscall',
'psynch_mtxcontinue':
lambda thread_proxy: 'unix_syscall_return',
'sigcontinue':
lambda thread_proxy: 'unix_syscall_return',
'unix_syscall_return':
lambda thread_proxy: 'USERSPACE:syscall',
'selcontinue':
lambda thread_proxy: ['_sleep_continue', 'unix_syscall_return'],
'wait4_nocancel':
lambda thread_proxy: ['_sleep_continue', 'unix_syscall_return'],
'thread_syscall_return':
lambda thread_proxy: 'USERSPACE:syscall',
'kevent_continue':
lambda thread_proxy: 'unix_syscall_return',
'__posix_sem_syscall_return':
lambda thread_proxy: 'unix_syscall_return',
'nxprov_detacher_cont':
lambda thread_proxy: 'UNKNOWN',
'kauth_resolver_getwork_continue':
lambda thread_proxy: [
'kauth_resolver_getwork2',
'kauth_resolver_getwork_continue',
'_sleep_continue',
'unix_syscall_return',
],
'wait1continue':
lambda thread_proxy: ['_sleep_continue', 'unix_syscall_return'],
}
# Set of continuations which are kernel daemons without doing anything
# complicated. Conceptually, the semantics of this are identical to
#
# for c in [this_list]:
# CONTINUATION_TO_NEXT[c] = lambda thread_proxy: 'KERNEL_DAEMON'
#
# but it's written this way for increased clarity and easier-to-read code.
CONTINUATIONS_KERNEL_DAEMONS = set([
'aio_work_thread',
'async_work_continue',
'audit_worker',
'bcleanbuf_thread',
'cfil_info_udp_expire',
'flowadv_thread_cont',
'ifnet_detacher_thread_cont',
'io_reprioritize_thread',
'mbuf_worker_thread',
'memorystatus_thread',
'nwk_wq_thread_cont',
'pf_purge_thread_cont',
'sched_timeshare_maintenance_continue',
'thread_call_daemon_continue',
'thread_exception_daemon',
'thread_stack_daemon',
'thread_terminate_daemon',
'vm_compressor_swap_trigger_thread',
'vm_object_reaper_thread',
'vm_pageout_continue',
'vm_pageout_garbage_collect',
'vm_pageout_iothread_external_continue',
'vm_pageout_iothread_internal_continue',
'vm_pressure_thread',
'vm_swapfile_create_thread',
'vm_swapfile_gc_thread',
'vm_swapout_thread',
])
def trace_kstack(thread_proxy):
"""
Given a waiting thread, trace where it will go after it's unblocked.
For the many threads which have normal kernel stacks, we just return the
stack; for threads which gave up their stack and saved an explicit continuation, we have dedicated
handling which understands what the kernel's common continuations will do
when they are resumed.
"""
if int(thread_proxy.kernel_stack):
return [str(frame.function.addr) for frame in GetLLDBThreadForKernelThread(thread_proxy).frames]
is_kernel = (int(kern.GetGlobalVariable('kernel_task')) == int(thread_proxy.task))
def continuation_to_name(cont):
if isinstance(cont, basestring):
return cont
else:
cont_symbol_maybe = xnu.kern.SymbolicateFromAddress(int(cont))
if cont_symbol_maybe:
return cont_symbol_maybe[0].name
else:
return 'UNKNOWN:0x' + str(cont)
fakestack = [continuation_to_name(thread_proxy.continuation)]
while True:
last_name = fakestack[-1]
if last_name.split(':')[0] in CONTINUATION_BASE_CASES:
break
elif len(fakestack) > 100:
fakestack.append('RECURSION')
return fakestack
if last_name in CONTINUATION_TO_NEXT:
nexts = CONTINUATION_TO_NEXT[last_name](thread_proxy)
if not isinstance(nexts, list):
nexts = [nexts]
elif last_name in CONTINUATIONS_KERNEL_DAEMONS and is_kernel:
nexts = ['KERNEL_DAEMON']
else:
nexts = ['UNKNOWN']
nexts = map(continuation_to_name, nexts)
assert nexts and all([isinstance(x, basestring) for x in nexts])
fakestack.extend(nexts)
if fakestack[-1] == 'USERSPACE:fork_or_vfork':
fork_info = get_fork_child_info(thread_proxy)
if fork_info:
fakestack[-1] += (
(':parent=%d,%s,%x' % (
fork_info['parent_task_pid'],
fork_info['parent_task_name'],
int(fork_info['parent_thread']),
)) +
(':child=%s/%s' % (
fork_info['child_exe_name'],
fork_info['child_argv0'],
))
)
return fakestack
def analyze_threads():
"""
Do basic analysis on all threads in the kernel figuring out what (if anything)
they're blocked on.
This is not integrated with the system-wide analysis of IPC state.
Returns a list of dicts each of which contains basic information about a
thread and is suitable for aggregation or more detailed analysis.
"""
threads = []
kernel_task_proxy = kern.GetGlobalVariable('kernel_task')
for task_proxy in kern.tasks:
proc_proxy = dereference(cast(task_proxy.bsd_info, 'proc*'))
is_kernel = (int(task_proxy) == int(kernel_task_proxy))
for thread_proxy in IterateQueue(task_proxy.threads, 'thread *', 'task_threads'):
is_wait = bool(thread_proxy.state & 0x1)
is_idle = bool(thread_proxy.state & 0x80)
task_name = str(proc_proxy.p_name)
assert(not is_wait or not is_idle)
thri = {
'proxy': thread_proxy,
'task': task_proxy,
'task_name': task_name,
'state': (ST_IDLE if is_idle else ST_WAIT if is_wait else ST_RUNNABLE),
'wait': ({} if is_wait else None),
'is_kernel': is_kernel,
}
if is_wait:
if int(thread_proxy.wait_timer_is_set.sbvalue.value):
thri['wait']['timer'] = int(thread_proxy.wait_timer.call_entry.deadline.sbvalue.value)
else:
thri['wait']['timer'] = None
thri['wait']['block_hint'] = thread_proxy.block_hint.sbvalue.value
thri['wait']['kstack'] = trace_kstack(thread_proxy)
if int(thread_proxy.kernel_stack) == 0:
thri['wait']['continuation'] = thri['wait']['kstack'][0]
else:
thri['wait']['continuation'] = None
threads.append(thri)
return threads
def summarize(thri):
"""
Given a thread-info (as returned by analyze_threads), return a summary
which is intended to be sufficiently detailed to help debugging but
sufficiently vague that it can be aggregated efficiently.
"""
if thri['state'] == ST_IDLE:
return 'idle'
elif thri['state'] == ST_RUNNABLE:
return 'runnable'
else:
assert(thri['state'] == ST_WAIT)
summary = {
'timer': bool(thri['wait']['timer']),
'resume': thri['wait']['kstack'],
'is_kernel': thri['is_kernel'],
'block_hint': (thri['wait']['block_hint'] if thri['wait']['block_hint'] != 'kThreadWaitNone' else None),
}
return summary
def analyze_receiving_thread(thread_proxy):
"""
Given a thread waiting in kThreadWaitPortReceive, find the port or portset
it's trying to receive something from.
"""
assert(str(thread_proxy.block_hint.sbvalue.value) == 'kThreadWaitPortReceive')
waitq = thread_proxy.waitq
waitq_type = int(waitq.waitq_type)
if waitq_type == 3:
mqueue = kern.GetValueFromAddress(
hex(int(waitq) - getfieldoffset('struct ipc_mqueue', 'data.pset.setq')),
'struct ipc_mqueue*')
pset = kern.GetValueFromAddress(
hex(int(mqueue) - getfieldoffset('struct ipc_pset', 'ips_messages')),
'struct ipc_pset*')
return ('pset', pset)
elif waitq_type == 2:
mqueue = kern.GetValueFromAddress(
hex(int(waitq) - getfieldoffset('struct ipc_mqueue', 'data.port.waitq')),
'struct ipc_mqueue*')
port = kern.GetValueFromAddress(
hex(int(mqueue) - getfieldoffset('struct ipc_port', 'ip_messages')),
'struct ipc_port*')
return ('port', port)
else:
assert False, 'Unknown waitq_type ' + str(waitq_type)
def dumps(thing):
"""Dump to json canonically"""
return json.dumps(thing, sort_keys=True)
def histogram(things, keyfn=(lambda v: v)):
"""
Return a histogram of values, grouped by some key.
This is the moral equivalent of
SELECT key, arbitrary(value), count(*)
FROM (SELECT keyfn(value) as key, value FROM things)
GROUP BY key
Params:
things: iterable of things to make a histogram from
keyfn: function from thing to key to group them by, which
must be suitable as the key in a dict
Return:
dict mapping keys (as returned by keyfn) to dicts of the form
{'val': some_thing_with_that_key, 'count': number_of_things_with_that_key}
"""
ret = {}
for thing in things:
key = keyfn(thing)
info = ret.setdefault(key, {'val': thing, 'count': 0})
info['count'] = info['count'] + 1
return ret
def print_histogram(histog):
""" Render a histogram to the screen. """
for key, info in sorted(
histog.items(),
key=(lambda entry: (-entry[1]['count'], entry[0])),
):
print("%d\t%r\n" % (info['count'], info['val']))
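# A (hypothetical) example of how these pieces are meant to be combined from
# the lldb kernel-debugging Python prompt, assuming this module's names are in
# scope there; none of these exact invocations appear above, they are just one
# plausible workflow:
#
#   # Bucket every thread in the system by a vague-but-useful summary and
#   # print the most common states first:
#   thris = analyze_threads()
#   print_histogram(histogram([summarize(t) for t in thris], keyfn=dumps))
#
#   # For threads stuck receiving, find which port/pset they are blocked on:
#   receivers = [t for t in thris
#                if t['state'] == ST_WAIT
#                and t['wait']['block_hint'] == 'kThreadWaitPortReceive']
#   kind, obj = analyze_receiving_thread(receivers[0]['proxy'])
#
#   # Cross-reference the whole IPC graph (tasks, ports, psets, in-flight
#   # messages); results accumulate in s.store:
#   s = FullIpcScanner()
#   s.process_system()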
#!/usr/bin/env perl
use strict;
use warnings;
use 5.010;
use autodie;
use Fcntl 'SEEK_SET', 'SEEK_CUR', 'SEEK_END';
use List::Util 'min';
use YAML::XS 'Dump';
# Magic number at start of file, encoded big-endian, as if a string. In C,
#
# #define MACH_CORE_FILEHEADER_SIGNATURE 0x0063614d20646152ULL
my $MACH_CORE_FILEHEADER_SIGNATURE_BE = pack('H*', '0063614d20646152');
# Magic number at start of file, encoded little-endian, as if a string. In C,
#
# #define MACH_CORE_FILEHEADER_SIGNATURE 0x0063614d20646152ULL
my $MACH_CORE_FILEHEADER_SIGNATURE_LE =
reverse($MACH_CORE_FILEHEADER_SIGNATURE_BE);
# Maximum number of subfiles in dump. In C,
#
# #define MACH_CORE_FILEHEADER_MAXFILES 16
my $MACH_CORE_FILEHEADER_MAXFILES = 16;
# Maximum length of a dump subfile's filename. In C,
#
# #define MACH_CORE_FILEHEADER_NAMELEN 16
my $MACH_CORE_FILEHEADER_NAMELEN = 16;
# Pack pattern for a `struct mach_core_details` part of a dump header, if
# encoded big-endian. In C,
#
# struct mach_core_details {
# uint64_t gzip_offset;
# uint64_t gzip_length;
# char core_name[MACH_CORE_FILEHEADER_NAMELEN];
# };
my $MACH_CORE_DETAILS_PACKPAT_BE = "(QQ)>Z$MACH_CORE_FILEHEADER_NAMELEN";
# Size of a `struct mach_core_details` part of a dump header, in bytes.
my $MACH_CORE_DETAILS_SIZE = 8 + 8 + $MACH_CORE_FILEHEADER_NAMELEN;
# Pack pattern for a `struct mach_core_fileheader` dump header, treating
# the embedded `struct mach_core_details` as opaque, if encoded in big-endian.
# In C,
#
# struct mach_core_fileheader {
# uint64_t signature;
# uint64_t log_offset;
# uint64_t log_length;
# uint64_t num_files;
# // treated as `char files[MACH_CORE_FILEHEADER_MAXFILES][sizeof(struct mach_core_details)]`
# struct mach_core_details files[MACH_CORE_FILEHEADER_MAXFILES];
# };
#
my $MACH_CORE_FILEHEADER_PACKPAT_BE =
"(QQQQ)>(a$MACH_CORE_DETAILS_SIZE)$MACH_CORE_FILEHEADER_MAXFILES";
# Size of a `struct mach_core_fileheader` header, in bytes (including the
# contained `struct mach_core_details`s.)
my $MACH_CORE_FILEHEADER_SIZE =
8 + 8 + 8 + 8
+ $MACH_CORE_DETAILS_SIZE * $MACH_CORE_FILEHEADER_MAXFILES;
sub bswap64 {
my ($num) = @_;
return unpack('Q<', pack('Q>', $num));
}
sub parse_header {
my ($filename, $fh) = @_;
seek($fh, 0, SEEK_SET);
my $read = read($fh, my($header_buf), $MACH_CORE_FILEHEADER_SIZE);
$read == $MACH_CORE_FILEHEADER_SIZE
or die "$0: fatal: $filename: unexpected EOF\n";
my $little_endian = {
$MACH_CORE_FILEHEADER_SIGNATURE_BE => 0,
$MACH_CORE_FILEHEADER_SIGNATURE_LE => 1,
}->{substr($header_buf, 0, length($MACH_CORE_FILEHEADER_SIGNATURE_BE))};
if (not defined $little_endian) {
die "$0: fatal: $filename: bad signature\n";
}
my ($signature, $log_offset, $log_length, $num_files, @file_bufs) =
unpack($MACH_CORE_FILEHEADER_PACKPAT_BE, $header_buf);
if ($little_endian) {
$signature = bswap64($signature);
$log_offset = bswap64($log_offset);
$log_length = bswap64($log_length);
$num_files = bswap64($num_files);
}
if ($num_files > @file_bufs) {
die "$0: fatal: $filename: claims to have too many ($num_files) files\n";
} elsif ($num_files == 0) {
warn "$0: warning: $filename: claims to have no subfiles\n";
}
my $file_infos = [];
for my $i (0 .. ($num_files - 1)) {
my ($gzip_offset, $gzip_length, $name) =
unpack($MACH_CORE_DETAILS_PACKPAT_BE, $file_bufs[$i]);
if ($little_endian) {
$gzip_offset = bswap64($gzip_offset);
$gzip_length = bswap64($gzip_length);
}
push @$file_infos, {
gzip_offset => $gzip_offset,
gzip_length => $gzip_length,
name => $name,
};
}
return {
signature => $signature,
log_offset => $log_offset,
log_length => $log_length,
file_infos => $file_infos,
};
}
sub extract_range {
my ($filename, $fh, $offset, $length, $outdir, $outfilename) = @_;
seek($fh, $offset, SEEK_SET);
open(my($outfh), '>', "$outdir/$outfilename");
my $last_nz = 0;
for (my $wrote = 0; $wrote < $length;) {
my $bufsize = min($length - $wrote, 4096);
my $read = read($fh, my($buf), $bufsize);
$read == $bufsize
or die "$0: fatal: $filename: short read while extracting $outfilename\n";
if ($buf =~ /[^\x00]/) {
print {$outfh} $buf;
$last_nz = $wrote + $bufsize;
} else {
# `$buf` is entirely null. Make sparse output file.
truncate($outfh, $wrote + $bufsize);
seek($outfh, $bufsize, SEEK_CUR);
}
$wrote += $bufsize;
}
if ($outfilename eq '_trailing.bin' and $last_nz < $length) {
truncate($outfh, $last_nz);
}
close($outfh);
return {begin => $offset, end => $offset + $length, name => $outfilename};
}
sub extract_subfile {
my ($filename, $fh, $file_info, $outdir) = @_;
my $offset = $file_info->{gzip_offset};
my $length = $file_info->{gzip_length};
my $origname = $file_info->{name};
length($origname)
or die "$0: fatal: $filename: subfile with empty name\n";
($origname !~ /^[_.]/)
or die "$0: fatal: $filename: subfile with reserved name $origname\n";
my $outfilename = "$origname.gz";
if (-f "$outdir/$outfilename") {
die "$0: fatal: $filename: multiple subfiles named $origname\n";
}
return extract_range($filename, $fh, $offset, $length, $outdir, $outfilename);
}
sub extract_unclaimed {
my ($filename, $fh, $claimed_ranges, $outdir) = @_;
seek($fh, 0, SEEK_END);
my $flen = tell($fh);
# Compute all ranges of `[0, $flen)` which are not contained within any
# range `[$i->{begin}, $i->{end}) for $i in @$claimed_ranges`. The
# algorithm is quadratic, but bounded by O(MACH_CORE_FILEHEADER_MAXFILES)
# so meh.
my $unclaimed_ranges = [];
my $unclaimed_name_counter = 0;
my $offset = 0;
RANGE:
while ($offset < $flen)
{
# See if `[$offset, $offset+1)` is in any claimed range
for my $range (@$claimed_ranges) {
if ($range->{begin} <= $offset and $offset < $range->{end}) {
# If so, skip ahead to the end of that range
$offset = $range->{end};
next RANGE;
}
}
# If not, figure out which claimed range starts next
# after $offset.
my $starts_next;
for my $range (@$claimed_ranges) {
if ($range->{begin} > $offset) {
if (not($starts_next) or $range->{begin} < $starts_next->{begin}) {
$starts_next = $range;
}
}
}
# Was there one?
if ($starts_next) {
# If there was any, then `[$offset, $starts_next->{begin})` is unclaimed
push @$unclaimed_ranges, {
begin => $offset,
end => $starts_next->{begin},
name => sprintf('_bonus%02u.bin', $unclaimed_name_counter++),
};
$offset = $starts_next->{begin};
} else {
# No range starts after => here to EOF is unclaimed
push @$unclaimed_ranges, {
begin => $offset,
end => $flen,
name => '_trailing.bin',
};
$offset = $flen;
}
}
# Write them out, and claim them as we do.
for my $unclaimed (@$unclaimed_ranges) {
push @$claimed_ranges, extract_range(
$filename, $fh,
$unclaimed->{begin}, $unclaimed->{end} - $unclaimed->{begin},
$outdir, $unclaimed->{name},
);
}
}
sub write_manifest {
my ($filename, $fh, $claimed_ranges, $header, $outdir) = @_;
my $manifest = {
dumpname => $filename,
files => [
sort {
$a->{begin} <=> $b->{begin}
or
$a->{end} <=> $b->{end}
or
$a->{name} cmp $b->{name}
}
@$claimed_ranges
],
};
open my($outfh), '>', "$outdir/_manifest.yaml";
print {$outfh} Dump($manifest);
close($outfh);
}
sub extract_all_parts {
my ($filename, $fh, $outdir) = @_;
my $header = parse_header($filename, $fh);
my $claimed_ranges = [];
push @$claimed_ranges, extract_range(
$filename, $fh,
0, $MACH_CORE_FILEHEADER_SIZE,
$outdir, "_header.bin",
);
push @$claimed_ranges, extract_range(
$filename, $fh,
$header->{log_offset}, $header->{log_length},
$outdir, "_panic.log",
);
for my $file_info (@{$header->{file_infos}}) {
push @$claimed_ranges,
extract_subfile($filename, $fh, $file_info, $outdir);
}
extract_unclaimed($filename, $fh, $claimed_ranges, $outdir);
write_manifest($filename, $fh, $claimed_ranges, $header, $outdir);
}
sub extract {
my ($infile, $outdir) = @_;
if (-e $outdir) {
die "$0: fatal: output directory already exists\n";
}
mkdir($outdir);
open my($infh), "<", $infile;
extract_all_parts($infile, $infh, $outdir);
}
sub main {
my (@args) = @_;
@args == 2
or die "Usage: $0 infile outdir\n";
extract($args[0], $args[1]);
return 0;
}
exit(main(@ARGV)) if not caller;

Reboot to recovery mode. Turn off SIP and secure boot. Boot back to normal mode.

Go to https://developer.apple.com/download/more/ and click "More" in the upper-right. Search for "Kernel". Download and install the Kernel Debug Kit (KDK) for your build of macOS (run sw_vers on the command line to get the build number). A nearby KDK will often work; installing one is harmless.

For the KDK, an exact build-number match is best and should definitely work. Otherwise, after installing it, you can check whether the release kernel in the KDK (e.g. under /Library/Developer/KDKs/KDK_10.14.6_18G2016.kdk/System/Library/Kernels/) matches the actual kernel (/System/Library/Kernels/kernel) with diff/shasum/whatever. If they match, the KDK will probably work. If they don't, the KDK might still work. Probably nothing horrible will happen if you try and it doesn't.
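
For example (using the KDK build shown above; adjust the path for yours), something like

shasum /Library/Developer/KDKs/KDK_10.14.6_18G2016.kdk/System/Library/Kernels/kernel /System/Library/Kernels/kernel

should print the same hash for both files if they match.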

Once you decide to use a KDK (or not), install the development kernel:

sudo mount -uw /
sudo install /Library/Developer/KDKs/KDK_10.14.6_18G2016.kdk/System/Library/Kernels/kernel.development /System/Library/Kernels/
sudo kextcache -invalidate /

Next, allocate some space for a core dump partition. Run diskutil list. The standard configuration will look something like

/dev/disk0 (internal):
   #:                       TYPE NAME                    SIZE       IDENTIFIER
   0:      GUID_partition_scheme                         1.0 TB     disk0
   1:                        EFI EFI                     314.6 MB   disk0s1
   2:                 Apple_APFS Container disk1         997.0 GB   disk0s2

/dev/disk1 (synthesized):
   #:                       TYPE NAME                    SIZE       IDENTIFIER
   0:      APFS Container Scheme -                      +997.0 GB   disk1
                                 Physical Store disk0s2
   1:                APFS Volume Preboot                 43.5 MB    disk1s2
   2:                APFS Volume Recovery                510.4 MB   disk1s3
   3:                APFS Volume VM                      18.3 GB    disk1s4
   4:                APFS Volume your-root-volume-name-here            439.5 GB   disk1s5

This may look different on Catalina. You need to shrink the APFS container a bit to make room for the kernel core partition. Kernel cores don't include most userspace memory and are compressed, so the partition doesn't need to be huge. (I have ~4GB for a machine with 32GB RAM.)

You can run diskutil apfs list disk1 to see the current exact size of the PV ("size" under the "Physical Store" entry) or diskutil apfs resizeContainer disk0s2 limits to see the limits (adjust disk1 and disk0s2 as appropriate.) If there is less space available according to limits than df, you might need to delete some time machine snapshots; see sudo tmutil listlocalsnapshotdates and sudo tmutil deletelocalsnapshots DATE.

Once you get the physical store's current size and decide how big a core dump partition you want, run something like

diskutil apfs resizeContainer \
    disk0s2 \
    $(( 1000240963584 - 1024 * 1024 * 1024 * 4 )) \
    %5361644D-6163-11AA-AA11-00306543ECAC% kcores 0

which would shrink disk0s2 from a starting size of 1000240963584 bytes down by 4GB, and then allocate a core dump partition in the remaining space. (If you already have an atypical configuration with free space after the APFS physical store, I'm not sure whether this would make the core dump partition take up that free space or just the space reclaimed from APFS.)

You can actually do this live; however, your system will freeze for a bit during the resize. It will unfreeze eventually.

After that, diskutil list for the SSD should look something like

/dev/disk0 (internal):
   #:                       TYPE NAME                    SIZE       IDENTIFIER
   0:      GUID_partition_scheme                         1.0 TB     disk0
   1:                        EFI EFI                     314.6 MB   disk0s1
   2:                 Apple_APFS Container disk1         997.0 GB   disk0s2
   3:       Apple_KernelCoreDump                         3.2 GB     disk0s3

I recommend zeroing the partition first (e.g. sudo dd if=/dev/zero bs=$(( 1024 * 1024 )) of=/dev/disk0s3); make sure you point it at the right device.

Finally, run

sudo nvram boot-args="-v msgbuf=1048576 kcsuffix=development debug=0x84d44"

The -v turns on verbose mode (prints kernel and bootloader stuff at boot/shutdown instead of hiding it with a splash screen). The msgbuf increases the size of the dmesg buffer so that all the startup stuff gets captured at boot. The kcsuffix=development tells the bootloader to load kernel.development instead of kernel, and the debug flags break down into

Pick:
    DB_ARP                  0x00040          useful if I want to do network panics later
    DB_NMI                  0x00004          NMI (and some other events) enters Debugger
    DB_LOG_PI_SCRN          0x00100          log panic info to the screen
    DB_KERN_DUMP_ON_PANIC   0x00400          panic triggers core dump
    DB_KERN_DUMP_ON_NMI     0x00800          NMI triggers core dump too
    DB_REBOOT_POST_CORE     0x04000          reboot post core (so you know when it's done)
    DB_DISABLE_CROSS_PANIC  0x80000          maybe necessary for machines with bridgeos?
    ----
                            0x84d44
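
As a quick sanity check that those flags OR together to the value used above:

python -c 'print(hex(0x00040 | 0x00004 | 0x00100 | 0x00400 | 0x00800 | 0x04000 | 0x80000))'   # prints 0x84d44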

The result is that, when the machine panics or when you do the NMI keystroke (cmd-ctrl-opt-shift-esc, which is not actually an NMI), it will appear to freeze, write a core dump to disk0s3, and then reboot.

Once you get a dump, copy it from the partition into a file with dd and then extract it using extract-kdump.pl, e.g.

sudo dd if=/dev/disk0s3 bs=$(( 1024 * 1024 )) of=blah-20200506-whatever.kdumpimg
sudo chown $(whoami) blah-20200506-whatever.kdumpimg
./extract-kdump.pl blah-20200506-whatever.kdumpimg blah-20200506-whatever.kdumpdir

The actual core dump comes out in blah-20200506-whatever.kdumpdir/kernel.gz.

You can load it with lldb; the KDK ships with a bunch of macros which are definitely easy to use and very well documented. You might need to run settings set target.load-script-from-symbol-file true in lldb. Some of the tools Apple built for the macros are actually pretty general, and you can use them to build nontrivial analyses. For example, I wrote example.py last summer to debug some recurring system deadlocks caused by Santa.
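
For example (paths are illustrative, reusing the KDK build and dump names from above), loading the dump looks something like

gunzip blah-20200506-whatever.kdumpdir/kernel.gz
lldb
(lldb) settings set target.load-script-from-symbol-file true
(lldb) target create /Library/Developer/KDKs/KDK_10.14.6_18G2016.kdk/System/Library/Kernels/kernel.development --core blah-20200506-whatever.kdumpdir/kernel
(lldb) showalltasks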
