Skip to content

Instantly share code, notes, and snippets.

@geohot
Last active March 25, 2024 19:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save geohot/66856adb1b1600ad24e2851ad10c9fdb to your computer and use it in GitHub Desktop.
Save geohot/66856adb1b1600ad24e2851ad10c9fdb to your computer and use it in GitHub Desktop.
MES Crash in KFD_IOC_QUEUE_TYPE_COMPUTE_AQL created without an EOP buffer
# one hit, no loop needed
# this is caused by creating a KFD_IOC_QUEUE_TYPE_COMPUTE_AQL without an EOP buffer
# this causes the MES to page fault
import os, ctypes, pathlib, re, fcntl, functools, mmap, time
import tinygrad.runtime.autogen.kfd as kfd
from tinygrad.helpers import to_mv
from extra.hip_gpu_driver import hip_ioctl
import tinygrad.runtime.autogen.hsa as hsa
libc = ctypes.CDLL("libc.so.6")
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mmap.restype = ctypes.c_void_p
MAP_NORESERVE = 0x4000
MAP_FIXED = 0x10
def kfd_ioctl(idir, nr, user_struct, fd, **kwargs):
made = user_struct(**kwargs)
ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(user_struct)<<16) | (ord('K')<<8) | nr, made)
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
return made
def format_struct(s):
sdats = []
for field_name, field_type in s._fields_:
dat = getattr(s, field_name)
if isinstance(dat, int): sdats.append(f"{field_name}:0x{dat:X}")
else: sdats.append(f"{field_name}:{dat}")
return sdats
idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
def ioctls_from_header():
hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
matches = re.findall(pattern, hdr, re.MULTILINE)
fxns = {}
for name, idir, nr, sname in matches:
fxns[name.replace("AMDKFD_IOC_", "").lower()] = functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
return type("KIO", (object, ), fxns)
kio = ioctls_from_header()
# sudo su -c "echo 'file drivers/gpu/drm/amd/* +p' > /sys/kernel/debug/dynamic_debug/control"
def gpu_alloc_userptr(fd, size, flags):
addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
assert addr != 0xffffffffffffffff
mem = kio.alloc_memory_of_gpu(fd, va_addr=addr, size=size, gpu_id=GPU_ID, flags=flags, mmap_offset=addr)
return mem
def gpu_alloc(fd, size, flags):
addr = libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
assert addr != 0xffffffffffffffff
mem = kio.alloc_memory_of_gpu(fd, va_addr=addr, size=size, gpu_id=GPU_ID, flags=flags)
buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, drm_fd, mem.mmap_offset)
assert buf != 0xffffffffffffffff
assert addr == buf == mem.va_addr
return mem
if __name__ == "__main__":
fd = os.open("/dev/kfd", os.O_RDWR)
drm_fd = os.open("/dev/dri/renderD128", os.O_RDWR)
with open("/sys/devices/virtual/kfd/kfd/topology/nodes/1/gpu_id", "r") as f: GPU_ID = int(f.read())
#ver = kio.get_version(fd)
st = kio.acquire_vm(fd, drm_fd=drm_fd, gpu_id=GPU_ID)
#exit(0)
# 0xF0000001 = KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
# 0xD6000002 = KFD_IOC_ALLOC_MEM_FLAGS_GTT | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
# 0xD6000004 = KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
# 0x94000010 = KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
#addr = libc.mmap(0, 0x1000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS, -1, 0)
#addr = libc.mmap(0, 0x1000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
#mem = kio.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(fd, va_addr=addr, size=0x1000, gpu_id=GPU_ID, flags=0xD6000004)
#mem = gpu_alloc(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
# kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
# kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
#arr = (ctypes.c_int32 * 1)(GPU_ID)
#stm = kio.map_memory_to_gpu(fd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
arr = (ctypes.c_int32 * 1)(GPU_ID)
rw_ptr = gpu_alloc(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
stm = kio.map_memory_to_gpu(fd, handle=rw_ptr.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
assert stm.n_success == 1
ring_base = gpu_alloc_userptr(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
stm = kio.map_memory_to_gpu(fd, handle=ring_base.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
assert stm.n_success == 1
#eop_buffer = gpu_alloc(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
# kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
# kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
#113.00 ms + 0.00 ms : 0 = AMDKFD_IOC_CREATE_QUEUE ring_base_address:0x797465200000 write_pointer_address:0x79751C068038 read_pointer_address:0x79751C068080 doorbell_offset:0x0 ring_size:0x800000 gpu_id:0x433D queue_type:0x2 queue_per
#centage:0x64 queue_priority:0x7 queue_id:0x0 eop_buffer_address:0x79751C064000 eop_buffer_size:0x1000 ctx_save_restore_address:0x796E52400000 ctx_save_restore_size:0x2BEA000 ctl_stack_size:0xA000
#113.84 ms + 0.59 ms : 0 = AMDKFD_IOC_CREATE_QUEUE ring_base_address:0x71AC3F600000 write_pointer_address:0x71B302AB0038 read_pointer_address:0x71B302AB0080 doorbell_offset:0xD0CF400000000008 ring_size:0x800000 gpu_id:0x433D queue_typ
#e:0x2 queue_percentage:0x64 queue_priority:0x7 queue_id:0x1 eop_buffer_address:0x71B302AAC000 eop_buffer_size:0x1000 ctx_save_restore_address:0x71AC3C800000 ctx_save_restore_size:0x2BEA000 ctl_stack_size:0xA000
nq = kio.create_queue(fd, ring_base_address=ring_base.va_addr, ring_size=0x1000, gpu_id=GPU_ID,
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE,
queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
#eop_buffer_address=eop_buffer.va_addr, eop_buffer_size=0x1000,
# ctx_save_restore_address, ctx_save_restore_size
#ctl_stack_size = 0xa000,
write_pointer_address=rw_ptr.va_addr+0x38, read_pointer_address=rw_ptr.va_addr+0x80)
doorbell = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, fd, nq.doorbell_offset)
print(hex(doorbell))
#define KFD_MMAP_TYPE_SHIFT 62
#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT)
evt = kio.create_event(fd, auto_reset=1)
BARRIER_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
BARRIER_HEADER |= hsa.HSA_PACKET_TYPE_BARRIER_AND << hsa.HSA_PACKET_HEADER_TYPE
EMPTY_SIGNAL = hsa.hsa_signal_t()
packet = hsa.hsa_barrier_and_packet_t.from_address(rw_ptr.va_addr+0x38)
packet.reserved0 = 0
packet.reserved1 = 0
for i in range(5): packet.dep_signal[i] = EMPTY_SIGNAL
packet.reserved2 = 0
packet.completion_signal = hsa.hsa_signal_t(evt.event_id)
packet.header = BARRIER_HEADER
# ring doorbell
to_mv(doorbell, 4).cast("I")[0] = 1
evt_arr = (kfd.struct_kfd_event_data * 1)()
evt_arr[0].event_id = evt.event_id
kio.wait_events(fd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=0, timeout=1000)
#nq = kio.create_queue(fd, ring_base_address=buf, ring_size=0x1000, gpu_id=GPU_ID,
# queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE,
# queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY, write_pointer_address=buf+8, read_pointer_address=buf+0x10)
#print(nq)
#mv = to_mv(buf, 0x1000)
#addr = libc.mmap(0, 0x1000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS, -1, 0)
#print('\n'.join(format_struct(ver)))
#print('\n'.join(format_struct(st)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment