jlevon/cr3.md

## cr3.md

      
    Raw
  

              cr3.md
            
          
    Let's do a little dtracing:
# cat repro
#!/bin/bash

dtrace -n 'hilevel_intr_prolog:entry { c++; }'
[root@00-30-48-cf-be-f0 /zones/jlevon]# ./repro
dtrace: description 'hilevel_intr_prolog:entry ' matched 1 probe

My guest is busy eating CPU (other socket is offline):
# ./cpuinfo
 ID ADDR             FLG NRUN BSPL PRI RNRN KRNRN SWITCH THREAD           PROC
  0 fffffffffbc4c000  1b    0    0   1   no    no t-4    fffffe64816cbb40 bhyve
  1 fffffe645df85000  1b    0    0   1   no    no t-29   fffffe64bf4adb20 bhyve
  2 fffffe645df95000  1b    0    0   1   no    no t-0    fffffe6491ee2120 bhyve
  3 fffffe645e530000  1b    0    0   1   no    no t-104  fffffe64bf4b44a0 bhyve
  4 fffffe645e52a000  1b    0    0   1   no    no t-3    fffffe648594eb00 bhyve
  5 fffffe645e522000  1b    0    0   1   no    no t-166  fffffe647a0a7080 bhyve
  6 fffffe645e520000  2f    0    0  -1   no    no t-777204 fffffe008b869c40 (idle)
  7 fffffe645e51c000  2f    0    0  -1   no    no t-772847 fffffe008b8efc40 (idle)
  8 fffffe645e518000  2f    0    0  -1   no    no t-788230 fffffe008b975c40 (idle)
  9 fffffe645e815000  2f    0    0  -1   no    no t-788227 fffffe008b9fbc40 (idle)
 10 fffffe645e813000  2f    0    0  -1   no    no t-868237 fffffe008ba81c40 (idle)
 11 fffffe645e80d000  2f    0    0  -1   no    no t-868233 fffffe008bb07c40 (idle)
 12 fffffe645e80b000  1b    0    0   1   no    no t-192  fffffe64bf460520 bhyve
 13 fffffe645e807000  1b    0    0   1   no    no t-190  fffffe64bf460c60 bhyve
 14 fffffe645e801000  1b    0    0   1   no    no t-160  fffffe64bf4b7440 bhyve
 15 fffffe645e7ff000  1b    0    0   1   no    no t-273  fffffe64bf462c20 bhyve
 16 fffffe645e7f9000  1b    0    0   1   no    no t-2    fffffe64bf4b8080 bhyve
 17 fffffe645ea8e000  1b    1    0   1   no    no t-0    fffffe6477cfa440 mdb
 18 fffffe645ea8a000  2f    0    0  -1   no    no t-868231 fffffe008bee1c40 (idle)
 19 fffffe645ea84000  2f    0    0  -1   no    no t-868229 fffffe008bf67c40 (idle)
 20 fffffe645ea82000  2f    0    0  -1   no    no t-868227 fffffe008bfedc40 (idle)
 21 fffffe645ea7e000  2f    0    0  -1   no    no t-868224 fffffe008c073c40 (idle)
 22 fffffe645ea7a000  2f    0    0  -1   no    no t-775318 fffffe008c0f9c40 (idle)
 23 fffffe645ea76000  2f    0    0  -1   no    no t-764701 fffffe008c17fc40 (idle)

Wait for the Bad Thing:
# ./watchpf.d 
dtrace: script './watchpf.d' matched 14 probes
dtrace: allowing destructive actions

CPU     ID                    FUNCTION:NAME
 17   7451                       trap:entry CPU17 took fault at fffffc7fe9e09e10 (bhyve) 10 times after 110 us!
 17   7451                       trap:entry CPU17 took fault at fffffc7fe9e09e10 (bhyve) 20 times after 215 us!
 17   7451                       trap:entry CPU17 took fault at fffffc7fe9e09e10 (bhyve) 30 times after 315 us!
...
 17   7451                       trap:entry CPU17 took fault at fffffc7fe9e09e10 (bhyve) 1000 times after 10017 us!
 17   7451                       trap:entry CPU17 took fault at fffffc7fe9e09e10 (bhyve) 1010 times after 10117 us!
 17   7451                       trap:entry struct kpti_frame {
    uint64_t kf_lower_redzone = 0xdeadbeefdeadbeef
    greg_t kf_tr_cr3 = 0x1e000000 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< !!!
    greg_t kf_r14 = 0
    greg_t kf_r13 = 0xfffffe008d495ae0
    greg_t kf_err = 0x2
    greg_t kf_rip = 0xfffffffffb87097b
    greg_t kf_cs = 0x30
    greg_t kf_rflags = 0x10246
    greg_t kf_rsp = 0xfffffe008d495710
    greg_t kf_ss = 0x38
    greg_t kf_tr_rsp = 0
    uint64_t kf_tr_flag = 0
    uint64_t kf_middle_redzone = 0xdeadbeefdeadbeef
    greg_t kf_kernel_cr3 = 0x8000001171570000
    greg_t kf_user_cr3 = 0x8000001170f74001
    greg_t kf_tr_ret_rsp = 0xfffffe645ea8ff20
    uint64_t kf_unused = 0
    uint64_t kf_upper_redzone = 0xdeadbeefdeadbeef
}
              unix`0xfffffffffb8001d6
              unix`0xfffffffffb87097b
              genunix`ddi_copyout+0x21
              vmm`vmmdev_do_ioctl+0x6a3
              vmm`vmm_ioctl+0x12c
              genunix`cdev_ioctl+0x39
              specfs`spec_ioctl+0x60
              genunix`fop_ioctl+0x55
              genunix`ioctl+0x9b
              unix`sys_syscall+0x19f

So we're trying to resolve a fault with a curproc of bhyve, but on to pagefault we apparently
have khat's %cr3 ? So it's no surprise we hard-loop: we populate a PTE in curproc's page tables,
but (presumably) return to kas %cr3 and immediately fault again. It's only being taken off CPU
that stops this being a hard hang.
# cat watchpf.d
#!/usr/sbin/dtrace -Cws

BEGIN {
        f[0] = 0;
}
trap:entry /args[0]->r_trapno == 0xe/ {
        if (arg1 == f[cpu]) {
                same[cpu]++;
        } else {
                same[cpu] = 0;
                self->t = timestamp;
        }
        f[cpu] = arg1;
}

trap:entry /args[0]->r_trapno == 0xe && same[cpu] >= 10 && (same[cpu] % 10) == 0/ {
        printf("CPU%d took fault at %p (%s) %d times after %lu us!",
            cpu, arg1, execname, same[cpu], (timestamp - self->t) / 1000);
        if (same[cpu] > 1000) {
                print(((cpu_t *)curcpu)->cpu_m.mcpu_kpti_flt);
                stack();
                system("echo ::cpuinfo | mdb -k");
                exit(1);
        }
}

It's not always easy to catch but in this case you can see the thread staying on CPU
for way longer than you'd expect. (I first saw this with scheduler changes.)
It is essential that "repro" is running (i.e. we're dtracing the intr prolog), and kmdb is loaded.
kpti=0 makes no difference.