// WARNING: must be run as root on an M1 device
// WARNING: fragile, uses private Apple APIs
// Currently there is no command-line interface; see the variables at the top
// of main.
/*
Based on https://github.com/travisdowns/robsize
Henry Wong <henry@stuffedcow.net>
http://blog.stuffedcow.net/2013/05/measuring-rob-capacity/
2014-10-14
*/
#include <assert.h>
#include <dlfcn.h>
#include <libkern/OSCacheControl.h>
#include <pthread.h>
#include <ptrauth.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
// Loop count handed to the generated routine for each timed call in main;
// the reported cycle figures are divided by it.
static int its = 2048; // 8192 / 4

// Number of accepted timing samples main collects for every icount value.
static int outer_its = 128; // 64 * 2

// Additional divisor applied to the reported figures (1 = no scaling).
static int unroll = 1;

// Tab separator string. Nothing in the visible code reads it; main's printf
// format strings spell out "\t" directly.
const char *delim = "\t";
// X-macro table of the kperf entry points this program uses. Each row is
// F(return type, symbol name, parameter types...). The table is expanded
// twice: right below to declare the function pointers, and inside
// init_rdtsc() to fill them in with dlsym().
#define KPERF_LIST                                             \
  /* ret, name, params */                                      \
  F(int, kpc_get_counting, void)                               \
  F(int, kpc_force_all_ctrs_set, int)                          \
  F(int, kpc_set_counting, uint32_t)                           \
  F(int, kpc_set_thread_counting, uint32_t)                    \
  F(int, kpc_set_config, uint32_t, void *)                     \
  F(int, kpc_get_config, uint32_t, void *)                     \
  F(int, kpc_set_period, uint32_t, void *)                     \
  F(int, kpc_get_period, uint32_t, void *)                     \
  F(uint32_t, kpc_get_counter_count, uint32_t)                 \
  F(uint32_t, kpc_get_config_count, uint32_t)                  \
  F(int, kperf_sample_get, int *)                              \
  F(int, kpc_get_thread_counters, int, unsigned int, void *)

// For every row: a function type named <symbol>proc and a file-local
// pointer named <symbol>, which stays NULL until init_rdtsc() resolves it.
#define F(ret, name, ...)              \
  typedef ret name##proc(__VA_ARGS__); \
  static name##proc *name;
KPERF_LIST
#undef F
// Mode-enable bits that are OR-ed into a counter configuration word.
// CFGWORD_ALLMODES_MASK is the union of the four individual bits.
#define CFGWORD_EL0A32EN_MASK (0x10000)
#define CFGWORD_EL0A64EN_MASK (0x20000)
#define CFGWORD_EL1EN_MASK (0x40000)
#define CFGWORD_EL3EN_MASK (0x80000)
#define CFGWORD_ALLMODES_MASK (0xf0000)

// Event selector values placed in the low bits of a configuration word.
// Only CPMU_CORE_CYCLE is used by live code in this file; the others are
// referenced from comments or kept for experimentation.
#define CPMU_NONE 0
#define CPMU_CORE_CYCLE 0x02
#define CPMU_INST_A64 0x8c
#define CPMU_INST_BRANCH 0x8d
#define CPMU_SYNC_DC_LOAD_MISS 0xbf
#define CPMU_SYNC_DC_STORE_MISS 0xc0
#define CPMU_SYNC_DTLB_MISS 0xc1
#define CPMU_SYNC_ST_HIT_YNGR_LD 0xc4
#define CPMU_SYNC_BR_ANY_MISP 0xcb
#define CPMU_FED_IC_MISS_DEM 0xd3
#define CPMU_FED_ITLB_MISS 0xd4

// Counter class indices and the matching one-bit masks passed to kpc_*.
#define KPC_CLASS_FIXED (0)
#define KPC_CLASS_CONFIGURABLE (1)
#define KPC_CLASS_POWER (2)
#define KPC_CLASS_RAWPMU (3)
#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED)
#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE)
#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER)
#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU)

// Counts that init_rdtsc() requires the kernel to report for KPC_MASK.
#define COUNTERS_COUNT 10
#define CONFIG_COUNT 8

// The classes this program drives: fixed plus configurable counters.
#define KPC_MASK (KPC_CLASS_CONFIGURABLE_MASK | KPC_CLASS_FIXED_MASK)

// Destination buffer for kpc_get_thread_counters() in rdtsc().
uint64_t g_counters[COUNTERS_COUNT];
// Configuration words handed to kpc_set_config(). Note it is sized with
// COUNTERS_COUNT, not CONFIG_COUNT, so it has two more entries than
// CONFIG_COUNT.
uint64_t g_config[COUNTERS_COUNT];
static void configure_rdtsc() { | |
if (kpc_set_config(KPC_MASK, g_config)) { | |
printf("kpc_set_config failed\n"); | |
return; | |
} | |
if (kpc_force_all_ctrs_set(1)) { | |
printf("kpc_force_all_ctrs_set failed\n"); | |
return; | |
} | |
if (kpc_set_counting(KPC_MASK)) { | |
printf("kpc_set_counting failed\n"); | |
return; | |
} | |
if (kpc_set_thread_counting(KPC_MASK)) { | |
printf("kpc_set_thread_counting failed\n"); | |
return; | |
} | |
} | |
static void init_rdtsc() { | |
void *kperf = dlopen( | |
"/System/Library/PrivateFrameworks/kperf.framework/Versions/A/kperf", | |
RTLD_LAZY); | |
if (!kperf) { | |
printf("kperf = %p\n", kperf); | |
return; | |
} | |
#define F(ret, name, ...) \ | |
name = (name##proc *)(dlsym(kperf, #name)); \ | |
if (!name) { \ | |
printf("%s = %p\n", #name, (void *)name); \ | |
return; \ | |
} | |
KPERF_LIST | |
#undef F | |
// TODO: KPC_CLASS_RAWPMU_MASK | |
if (kpc_get_counter_count(KPC_MASK) != COUNTERS_COUNT) { | |
printf("wrong fixed counters count\n"); | |
return; | |
} | |
if (kpc_get_config_count(KPC_MASK) != CONFIG_COUNT) { | |
printf("wrong fixed config count\n"); | |
return; | |
} | |
// Not all counters can count all things: | |
// CPMU_CORE_CYCLE {0-7} | |
// CPMU_FED_IC_MISS_DEM {0-7} | |
// CPMU_FED_ITLB_MISS {0-7} | |
// CPMU_INST_BRANCH {3, 4, 5} | |
// CPMU_SYNC_DC_LOAD_MISS {3, 4, 5} | |
// CPMU_SYNC_DC_STORE_MISS {3, 4, 5} | |
// CPMU_SYNC_DTLB_MISS {3, 4, 5} | |
// CPMU_SYNC_BR_ANY_MISP {3, 4, 5} | |
// CPMU_SYNC_ST_HIT_YNGR_LD {3, 4, 5} | |
// CPMU_INST_A64 {5} | |
// using "CFGWORD_ALLMODES_MASK" is much noisier | |
g_config[0] = CPMU_CORE_CYCLE | CFGWORD_EL0A64EN_MASK; | |
// configs[3] = CPMU_SYNC_DC_LOAD_MISS | CFGWORD_EL0A64EN_MASK; | |
// configs[4] = CPMU_SYNC_DTLB_MISS | CFGWORD_EL0A64EN_MASK; | |
// configs[5] = CPMU_INST_A64 | CFGWORD_EL0A64EN_MASK; | |
configure_rdtsc(); | |
} | |
static unsigned long long int rdtsc() { | |
if (kpc_get_thread_counters(0, COUNTERS_COUNT, g_counters)) { | |
printf("kpc_get_thread_counters failed\n"); | |
return 1; | |
} | |
return g_counters[2]; | |
} | |
// Permutes array[0..n-1] in place using rand().
//
// Walks the array front to back and swaps each slot with one chosen from
// the slots at or after it. rand() is called exactly n - 1 times, so the
// result is reproducible for a given srand() seed. Arrays with fewer than
// two elements are left untouched and rand() is not called for them.
static void shuffle(int *array, size_t n) {
  if (n <= 1) {
    return;
  }
  for (size_t pos = 0; pos + 1 < n; pos++) {
    // Scale rand() down to [0, remaining) by division rather than modulo.
    size_t remaining = n - pos;
    size_t pick = pos + rand() / (RAND_MAX / remaining + 1);
    int held = array[pick];
    array[pick] = array[pos];
    array[pos] = held;
  }
}
// Builds the two pointer-chasing buffers used by the generated routine.
//
// Each buffer covers 256 MiB, viewed as 64-byte lines. The first 8-byte
// word of every line holds the index (in uint64_t units) of the first word
// of another line, so that following the indices visits every line once in
// a shuffled order and then wraps around. Both buffers are given exactly
// the same chain.
//
// out_data1  receives the start of the first buffer.
// out_data2  receives the start of the second buffer, which lies 64 bytes
//            into its own allocation. NOTE(review): presumably so the two
//            buffers are offset from each other by one line; confirm.
//
// Neither buffer is ever freed, and *out_data2 is not a pointer that could
// be passed to free(). If any allocation fails, a message is written to
// stderr and the process exits with status 1 (previously a NULL result was
// dereferenced).
static void init_dbufs(uint64_t **out_data1, uint64_t **out_data2) {
  const size_t size = 256 * 1024 * 1024;
  const size_t cache_line_size = 64;
  const size_t count = size / cache_line_size;
  const size_t stride = cache_line_size / sizeof(void *);

  // A shuffled list of line numbers defines the visiting order.
  int *numbers = malloc(count * sizeof *numbers);
  if (numbers == NULL) {
    fprintf(stderr, "init_dbufs: out of memory\n");
    exit(1);
  }
  for (size_t i = 0; i < count; i++) {
    numbers[i] = (int)i;
  }
  shuffle(numbers, count);

  uint64_t *data1 = calloc(size, 1);
  char *data2_alloc = calloc(size + 64, 1);
  if (data1 == NULL || data2_alloc == NULL) {
    fprintf(stderr, "init_dbufs: out of memory\n");
    exit(1);
  }
  uint64_t *data2 = (uint64_t *)(data2_alloc + 64);

  // Line numbers[i] links to line numbers[i - 1]; line numbers[0] links to
  // the last entry, which closes the chain into a single cycle.
  int next = numbers[count - 1];
  for (size_t i = 0; i < count; i++) {
    int n = numbers[i];
    data1[stride * n] = next * stride;
    data2[stride * n] = next * stride;
    next = n;
  }

  *out_data1 = data1;
  *out_data2 = data2;
  free(numbers);
}
// Placeholder: emits nothing into ibuf and reports that zero instruction
// words were added. Both parameters are ignored, and no code visible in
// this file calls it.
static int add_filler(uint32_t *ibuf, int j) {
  (void)ibuf;
  (void)j;
  return 0;
}
// Evaluates to the 64-bit value of the system register named by SR.
// SR must be a string literal because it is pasted into the asm template.
// The `mrs` is bracketed by `isb` instructions on both sides. Relies on the
// GNU statement-expression extension.
#define SREG_READ(SR)                                        \
  ({                                                         \
    uint64_t sreg_val_ = 0;                                  \
    __asm__ volatile("isb \r\n mrs %0, " SR " \r\n isb \r\n" \
                     : "=r"(sreg_val_));                     \
    sreg_val_;                                               \
  })
// Returns the low three bits of TPIDRRO_EL0, which this program treats as
// the number (0-7) of the core the caller is currently running on.
int get_current_core() {
  uint64_t tpidrro = SREG_READ("TPIDRRO_EL0");
  return tpidrro & 7;
}
void *thread(void *arg) { | |
pthread_set_qos_class_self_np(QOS_CLASS_USER_INTERACTIVE, 0); | |
int v = SREG_READ("TPIDRRO_EL0") & 7; | |
while (1) { | |
while ((SREG_READ("TPIDRRO_EL0") & 7) == v) { | |
} | |
v = SREG_READ("TPIDRRO_EL0") & 7; | |
} | |
return NULL; | |
} | |
// Writes a small AArch64 test routine into the JIT buffer `ibuf`.
//
// The generated function is called by main as
//   routine(idx1, idx2, buf1, buf2, count)  ->  x0..x4
// and its loop body, executed `count` times, is:
//   1. a dependent load that advances idx1 through buf1,
//   2. `icount` filler instructions (currently byte stores to buf2 + 8),
//   3. a dependent load that advances idx2 through buf2,
//   4. DSB ISH + ISB.
// It returns the final idx1 in x0.
//
// ibuf    start of a writable MAP_JIT mapping; must be large enough for
//         icount + 44 instruction words.
// icount  number of filler instructions between the two loads.
// it      unused.
//
// JIT write protection is lifted for the duration of the writes and
// restored afterwards, then the instruction cache is invalidated for the
// bytes written. Asserts that the backward branch offset fits in the
// branch's 19-bit immediate field.
void make_routine(uint32_t *ibuf, int icount, int it) {
  // Prologue: reserve a 0xC0-byte frame and spill registers into it.
  static const uint32_t save_regs[] = {
      0xa9b47bfd, // stp x29, x30, [sp, #-0xC0]!
      0xa9013ff0, // stp x16, x15, [sp, #0x10]
      0xa90247f2, // stp x18, x17, [sp, #0x20]
      0xa9034ff4, // stp x20, x19, [sp, #0x30]
      0xa90457f6, // stp x22, x21, [sp, #0x40]
      0xa9055ff8, // stp x24, x23, [sp, #0x50]
      0xa90667fa, // stp x26, x25, [sp, #0x60]
      0xa9076ffc, // stp x28, x27, [sp, #0x70]
      0x6d083bef, // stp d15, d14, [sp, #0x80]
      0x6d0933ed, // stp d13, d12, [sp, #0x90]
      0x6d0a2beb, // stp d11, d10, [sp, #0xA0]
      0x6d0b23e9, // stp d9, d8, [sp, #0xB0]
  };
  // Prep: move the arguments into the registers the loop works with.
  static const uint32_t load_args[] = {
      0xaa0003fa, // mov x26, x0
      0xaa0203fb, // mov x27, x2
      0xaa0403fc, // mov x28, x4
      0xaa0303fd, // mov x29, x3
      0xaa0103fe, // mov x30, x1
      0xaa0303f1, // mov x17, x3
  };
  // Epilogue: reload the spilled registers, release the frame, return.
  static const uint32_t restore_regs[] = {
      0xa9413ff0, // ldp x16, x15, [sp, #0x10]
      0xa94247f2, // ldp x18, x17, [sp, #0x20]
      0xa9434ff4, // ldp x20, x19, [sp, #0x30]
      0xa94457f6, // ldp x22, x21, [sp, #0x40]
      0xa9455ff8, // ldp x24, x23, [sp, #0x50]
      0xa94667fa, // ldp x26, x25, [sp, #0x60]
      0xa9476ffc, // ldp x28, x27, [sp, #0x70]
      0x6d483bef, // ldp d15, d14, [sp, #0x80]
      0x6d4933ed, // ldp d13, d12, [sp, #0x90]
      0x6d4a2beb, // ldp d11, d10, [sp, #0xA0]
      0x6d4b23e9, // ldp d9, d8, [sp, #0xB0]
      0xa8cc7bfd, // ldp x29, x30, [sp], #0xC0
      0xd65f03c0, // ret
  };
  const int n_save = (int)(sizeof save_regs / sizeof save_regs[0]);
  const int n_args = (int)(sizeof load_args / sizeof load_args[0]);
  const int n_restore = (int)(sizeof restore_regs / sizeof restore_regs[0]);

  (void)it;
  pthread_jit_write_protect_np(0);

  int pc = 0; // index of the next instruction word to write
  for (int k = 0; k < n_save; k++) {
    ibuf[pc++] = save_regs[k];
  }
  for (int k = 0; k < n_args; k++) {
    ibuf[pc++] = load_args[k];
  }

  const int loop_top = pc;
  ibuf[pc++] = 0xf87a7b7a; // ldr x26, [x27, x26, lsl #3]

  // Filler instructions between the two loads. Variants that were tried:
  //   size 60:
  //     0xf81e03e5  stur x5, [sp, #-32]
  //     0xf90007a5  str  x5, [x29, #8]
  //     0x390023a5  strb w5, [x29, #8]   <- the one currently emitted
  //   size 107:
  //     0xf9000765  str  x5, [x27, #8]
  //     0xf9000465  str  x5, [x3, #8]
  //     0xf9000625  str  x5, [x17, #8]
  for (int filler = icount; filler > 0; filler--) {
    ibuf[pc++] = 0x390023a5; // strb w5, [x29, #8]
  }

  ibuf[pc++] = 0xf87e7bbe; // ldr x30, [x29, x30, lsl #3]
  // replace with this for size 107 (even with x17 store):
  //   0xf87e7a3e  ldr x30, [x17, x30, lsl #3]

  // lfence mode?
  ibuf[pc++] = 0xD5033B9F; // DSB ISH
  ibuf[pc++] = 0xD5033FDF; // ISB

  // Count down and branch back to loop_top. The offset is measured in
  // instruction words from the branch itself.
  ibuf[pc++] = 0x5100079c; // sub w28, w28, #1
  int off = loop_top - pc;
  assert(off < 0 && off > -0x40000);
  // 0xb5...1c is the 64-bit form, cbnz x28; the 32-bit sub above leaves the
  // upper half of x28 zero, so it tests the same value as w28.
  ibuf[pc++] = 0xb500001c | ((off & 0x7ffff) << 5); // cbnz x28, loop_top

  ibuf[pc++] = 0xaa1a03e0; // mov x0, x26
  for (int k = 0; k < n_restore; k++) {
    ibuf[pc++] = restore_regs[k];
  }

  pthread_jit_write_protect_np(1);
  sys_icache_invalidate(ibuf, pc * 4);
}
int main(int argc, char **argv) { | |
int test_high_perf_cores = 1; | |
int instr_type = 1; | |
int start_icount = test_high_perf_cores ? 50 : 15; | |
int stop_icount = 130; | |
int stride_icount = 1; | |
// TODO: can we force this to run on the fast cores? | |
// counters seemingly fail to update if we initialise | |
// them, then switch cores, although the fixed thread | |
// counters don't have this problem. | |
// QOS_CLASS_BACKGROUND does seem to pin it to the slow | |
// cores though. | |
if (test_high_perf_cores) { | |
pthread_set_qos_class_self_np(QOS_CLASS_USER_INTERACTIVE, 0); | |
} else { | |
#if 1 | |
pthread_t ids[4]; | |
for (int i = 0; i < 4; i++) { | |
if (pthread_create(&ids[i], NULL, thread, NULL) != 0) { | |
perror("pthread_create() error"); | |
exit(1); | |
} | |
} | |
#else | |
pthread_set_qos_class_self_np(QOS_CLASS_BACKGROUND, 0); | |
#endif | |
} | |
init_rdtsc(); | |
uint64_t *data1, *data2; | |
init_dbufs(&data1, &data2); | |
void *mapping = mmap(NULL, 0x400000, PROT_READ | PROT_WRITE | PROT_EXEC, | |
MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0); | |
uint32_t *ibuf = (uint32_t *)mapping; | |
uint64_t next = 0; | |
for (int icount = start_icount; icount <= stop_icount; | |
icount += stride_icount) { | |
uint64_t min_diff = 0x7fffffffffffffffLL; | |
uint64_t max_diff = 0x0; | |
uint64_t sum_diff = 0; | |
for (int i = 0; i < outer_its; i++) { | |
make_routine(ibuf, icount, i); | |
uint64_t (*routine)(uint64_t, uint64_t, uint64_t *, uint64_t *, | |
uint64_t) = | |
ptrauth_sign_unauthenticated((void *)ibuf, | |
ptrauth_key_function_pointer, 0); | |
next = routine(next, next, data1, data2, 2); | |
// in case we were on the wrong core earlier | |
configure_rdtsc(); | |
int start_core = get_current_core(); | |
long long start = rdtsc(); | |
next = routine(next, next, data1, data2, its); | |
long long stop = rdtsc(); | |
int end_core = get_current_core(); | |
if (start_core != end_core || | |
(start_core < 4) != (!test_high_perf_cores)) { | |
i--; | |
continue; | |
} | |
uint64_t cycles = stop - start; | |
sum_diff += cycles; | |
if (min_diff > cycles) { | |
min_diff = cycles; | |
} | |
if (max_diff < cycles) { | |
max_diff = cycles; | |
} | |
} | |
printf("%d\t%.2f\t%.2f\t%.2f\t", icount, 1.0 * min_diff / its / unroll, | |
1.0 * sum_diff / its / unroll / outer_its, | |
1.0 * max_diff / its / unroll); | |
int n = 1.0 * min_diff / its / unroll; | |
for (int x = 0; x * 20 < n; x++) { | |
printf("-"); | |
} | |
printf("\n"); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment