Skip to content

Instantly share code, notes, and snippets.

@thejh
Last active March 3, 2024 09:01
Show Gist options
  • Save thejh/39a408fbf767e28670a8088e5425b6f6 to your computer and use it in GitHub Desktop.
Save thejh/39a408fbf767e28670a8088e5425b6f6 to your computer and use it in GitHub Desktop.
Testing whether mapping a pkey-protected page at address 0 makes misspeculated NULL dereferences less page-walky (tested on Tiger Lake)
# perf stat -e task-clock:u -e cycles:u -e instructions:u -e branches:u -e branch-misses:u -e dTLB-loads:u -e dTLB-load-misses:u -e dtlb_load_misses.walk_active:u ./spec_test map
Performance counter stats for './spec_test map':
1,150.38 msec task-clock:u # 1.000 CPUs utilized
5,370,185,772 cycles:u # 4.668 GHz
1,331,717,669 instructions:u # 0.25 insn per cycle
307,326,910 branches:u # 267.153 M/sec
102,502,300 branch-misses:u # 33.35% of all branches
102,427,183 dTLB-loads # 89.038 M/sec
180 dTLB-load-misses # 0.00% of all dTLB cache accesses
9,842 dtlb_load_misses.walk_active:u # 8.555 K/sec
1.150692567 seconds time elapsed
1.150681000 seconds user
0.000000000 seconds sys
# perf stat -e task-clock:u -e cycles:u -e instructions:u -e branches:u -e branch-misses:u -e dTLB-loads:u -e dTLB-load-misses:u -e dtlb_load_misses.walk_active:u ./spec_test nomap
Performance counter stats for './spec_test nomap':
1,146.92 msec task-clock:u # 1.000 CPUs utilized
5,367,627,921 cycles:u # 4.680 GHz
1,331,715,636 instructions:u # 0.25 insn per cycle
307,326,565 branches:u # 267.957 M/sec
102,502,200 branch-misses:u # 33.35% of all branches
102,426,610 dTLB-loads # 89.305 M/sec
102,395,299 dTLB-load-misses # 99.97% of all dTLB cache accesses
2,850,800,022 dtlb_load_misses.walk_active:u # 2.486 G/sec
1.147238142 seconds time elapsed
1.147193000 seconds user
0.000000000 seconds sys
#
// compile with "gcc -o spec_test spec_test.c -O3 -mno-red-zone"
#define _GNU_SOURCE
#include <err.h>
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/mman.h>
#define LOAD_ADDR 0
/*
 * Benchmark body: repeatedly executes a retpoline-style call/ret sequence
 * whose speculated return target contains a load from LOAD_ADDR.
 *
 * Runs 100000 outer iterations times 1024 inner iterations
 * (0x10000 / 64), i.e. 102,400,000 executions of the asm block.
 *
 * Per iteration:
 *   - "call 1f" pushes the address of the load instruction as the return
 *     address and jumps to label 1.
 *   - At label 1 the saved return address on the stack is overwritten with
 *     the address of label 2, so the "ret" architecturally lands on 2 and
 *     the load and ud2 are never architecturally executed.
 *   - The new return address is made data-dependent on a chain of popcnt
 *     instructions (rbx is 0 throughout, so the xor leaves rax unchanged),
 *     which delays the store that the "ret" depends on.
 *
 * The asm stores below the caller's stack pointer, so the file must be built
 * with -mno-red-zone as the compile line at the top of the file says.
 */
__attribute__((noinline))
void bench(void) {
	for (unsigned long i = 0; i < 100000UL; i++) {
		for (unsigned long j = 0; j < 0x10000; j += 64) {
			asm volatile(
				"lfence\n\t"
				// basically retpoline
				"call 1f\n\t"
				// MISSPECULATION START
				"mov (%[load_addr]), %%rax\n\t"
				"ud2\n\t"
				// MISSPECULATION END
				"1:\n\t"
				"mov $0, %%rbx\n\t"
				"lea 2f(%%rip), %%rax\n\t"
				// slow dependent ops.
				// each popcnt should be 3 cycles on Tiger Lake according to
				// <https://www.agner.org/optimize/instruction_tables.pdf>.
				"popcnt %%rbx, %%rbx\n\t"
				"popcnt %%rbx, %%rbx\n\t"
				"popcnt %%rbx, %%rbx\n\t"
				"popcnt %%rbx, %%rbx\n\t"
				// rbx is still 0 here; the xor only adds the data dependency
				"xor %%rbx, %%rax\n\t"
				// replace the return address pushed by the call
				"mov %%rax, (%%rsp)\n\t"
				"ret\n\t"
				"2:\n\t"
				: // out
				: // in
				// Pass the address as a 64-bit value: an int operand would
				// be substituted as a 32-bit register, truncating any
				// LOAD_ADDR that does not fit in 32 bits.
				[load_addr] "r"((unsigned long)LOAD_ADDR)
				: // clobber
				// "cc": popcnt/xor modify flags.
				// "memory": the call and the mov write to the stack.
				"rax", "rbx", "rcx", "rdx", "cc", "memory"
			);
		}
	}
}
/*
 * Entry point. Takes exactly one argument selecting the mode:
 *
 *   map   - allocate a protection key with access and write disabled,
 *           request a read-only anonymous page at address 0, touch it once,
 *           then tag the page with that key before running the benchmark.
 *   nomap - run the benchmark without any extra setup.
 *
 * Any other invocation prints the usage message and exits with status 1.
 * Failure of any setup call exits with status 1 via err().
 */
int main(int argc, char **argv) {
	/* Treat a wrong argument count like an unrecognised mode. */
	const char *mode = (argc == 2) ? argv[1] : "";
	int want_map = (strcmp(mode, "map") == 0);

	if (!want_map && strcmp(mode, "nomap") != 0)
		errx(1, "usage: ./spec_test <map|nomap>");

	if (want_map) {
		int key = pkey_alloc(0, PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
		if (key == -1)
			err(1, "pkey_alloc");

		/*
		 * NOTE(review): only MAP_FAILED is checked; the returned address is
		 * not compared against 0 - confirm the kernel honours
		 * MAP_FIXED_NOREPLACE.
		 */
		unsigned char *zero_page = mmap(NULL, 0x1000, PROT_READ,
				MAP_PRIVATE|MAP_FIXED_NOREPLACE|MAP_ANONYMOUS, -1, 0);
		if (zero_page == MAP_FAILED)
			err(1, "mmap");

		*(volatile char *)zero_page; // fault in the zeropage

		if (pkey_mprotect(zero_page, 0x1000, PROT_READ, key) != 0)
			err(1, "pkey_mprotect");
	}

	bench();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment