Skip to content

Instantly share code, notes, and snippets.

@dougallj
Created April 7, 2021 12:07
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save dougallj/0d4972967c625852956fcfe427b2054c to your computer and use it in GitHub Desktop.
Save dougallj/0d4972967c625852956fcfe427b2054c to your computer and use it in GitHub Desktop.
// WARNING: must be run as root on an Apple M1 device.
// WARNING: fragile; relies on private Apple APIs.
// There is currently no command-line interface; edit the variables at the top of main().
/*
Based on https://github.com/travisdowns/robsize
Henry Wong <henry@stuffedcow.net>
http://blog.stuffedcow.net/2013/05/measuring-rob-capacity/
2014-10-14
*/
#include <assert.h>
#include <dlfcn.h>
#include <pthread.h>
#include <ptrauth.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <libkern/OSCacheControl.h>
// Benchmark tunables (edit here; the program takes no command-line options).
// Loop count handed to the generated routine for each timed call in main
// (passed as the routine's fifth argument).
static int its = 8192 / 4;
// Number of accepted timed samples gathered for every icount value in main.
static int outer_its = 64 * 2;
// Divisor applied to the cycles-per-iteration figures printed by main.
static int unroll = 1;
// Not referenced by any other code visible in this file.
const char *delim = "\t";
// X-macro listing the kperf entry points this program uses. Each F(...) row
// supplies a return type, a symbol name, and that symbol's parameter list.
// The list is expanded twice with different definitions of F: once below to
// declare the function pointers, and once in init_rdtsc() to resolve them.
#define KPERF_LIST \
/* ret, name, params */ \
F(int, kpc_get_counting, void) \
F(int, kpc_force_all_ctrs_set, int) \
F(int, kpc_set_counting, uint32_t) \
F(int, kpc_set_thread_counting, uint32_t) \
F(int, kpc_set_config, uint32_t, void *) \
F(int, kpc_get_config, uint32_t, void *) \
F(int, kpc_set_period, uint32_t, void *) \
F(int, kpc_get_period, uint32_t, void *) \
F(uint32_t, kpc_get_counter_count, uint32_t) \
F(uint32_t, kpc_get_config_count, uint32_t) \
F(int, kperf_sample_get, int *) \
F(int, kpc_get_thread_counters, int, unsigned int, void *)
// First expansion: for every row, declare a function type `<name>proc` and a
// file-local function pointer called `<name>` (zero-initialised, i.e. NULL,
// until init_rdtsc() assigns it).
#define F(ret, name, ...) \
typedef ret name##proc(__VA_ARGS__); \
static name##proc *name;
KPERF_LIST
#undef F
// Flag bits that are ORed together with an event number to build one entry of
// g_config (see init_rdtsc). Judging by the names they select the execution
// modes in which the counter counts -- NOTE(review): confirm against kperf/PMU
// documentation. CFGWORD_ALLMODES_MASK is the OR of the four individual bits.
#define CFGWORD_EL0A32EN_MASK (0x10000)
#define CFGWORD_EL0A64EN_MASK (0x20000)
#define CFGWORD_EL1EN_MASK (0x40000)
#define CFGWORD_EL3EN_MASK (0x80000)
#define CFGWORD_ALLMODES_MASK (0xf0000)
// Event numbers placed in the low bits of a g_config entry. Only
// CPMU_CORE_CYCLE is actually used by the code in this file; the comment in
// init_rdtsc lists which counter slots each event is said to work in.
#define CPMU_NONE 0
#define CPMU_CORE_CYCLE 0x02
#define CPMU_INST_A64 0x8c
#define CPMU_INST_BRANCH 0x8d
#define CPMU_SYNC_DC_LOAD_MISS 0xbf
#define CPMU_SYNC_DC_STORE_MISS 0xc0
#define CPMU_SYNC_DTLB_MISS 0xc1
#define CPMU_SYNC_ST_HIT_YNGR_LD 0xc4
#define CPMU_SYNC_BR_ANY_MISP 0xcb
#define CPMU_FED_IC_MISS_DEM 0xd3
#define CPMU_FED_ITLB_MISS 0xd4
// Counter class indices and the corresponding single-bit masks passed to the
// kpc_* calls.
#define KPC_CLASS_FIXED (0)
#define KPC_CLASS_CONFIGURABLE (1)
#define KPC_CLASS_POWER (2)
#define KPC_CLASS_RAWPMU (3)
#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED)
#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE)
#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER)
#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU)
// Values init_rdtsc() expects kpc_get_counter_count(KPC_MASK) and
// kpc_get_config_count(KPC_MASK) to return; it bails out on a mismatch.
#define COUNTERS_COUNT 10
#define CONFIG_COUNT 8
// Classes this program works with: fixed plus configurable counters.
#define KPC_MASK (KPC_CLASS_CONFIGURABLE_MASK | KPC_CLASS_FIXED_MASK)
// Destination buffer for kpc_get_thread_counters() in rdtsc().
uint64_t g_counters[COUNTERS_COUNT];
// Configuration words handed to kpc_set_config() in configure_rdtsc().
// Note it is sized with COUNTERS_COUNT (10), not CONFIG_COUNT (8).
uint64_t g_config[COUNTERS_COUNT];
/*
 * Apply the configuration in g_config and enable counting for KPC_MASK.
 *
 * The four kperf calls are issued in a fixed order (set config, force all
 * counters, enable counting, enable per-thread counting). The first call that
 * returns non-zero prints a "<call> failed" line to stdout and ends the
 * sequence; later calls are then skipped. Nothing is reported to the caller.
 */
static void configure_rdtsc() {
  if (kpc_set_config(KPC_MASK, g_config) != 0) {
    printf("kpc_set_config failed\n");
  } else if (kpc_force_all_ctrs_set(1) != 0) {
    printf("kpc_force_all_ctrs_set failed\n");
  } else if (kpc_set_counting(KPC_MASK) != 0) {
    printf("kpc_set_counting failed\n");
  } else if (kpc_set_thread_counting(KPC_MASK) != 0) {
    printf("kpc_set_thread_counting failed\n");
  }
}
static void init_rdtsc() {
void *kperf = dlopen(
"/System/Library/PrivateFrameworks/kperf.framework/Versions/A/kperf",
RTLD_LAZY);
if (!kperf) {
printf("kperf = %p\n", kperf);
return;
}
#define F(ret, name, ...) \
name = (name##proc *)(dlsym(kperf, #name)); \
if (!name) { \
printf("%s = %p\n", #name, (void *)name); \
return; \
}
KPERF_LIST
#undef F
// TODO: KPC_CLASS_RAWPMU_MASK
if (kpc_get_counter_count(KPC_MASK) != COUNTERS_COUNT) {
printf("wrong fixed counters count\n");
return;
}
if (kpc_get_config_count(KPC_MASK) != CONFIG_COUNT) {
printf("wrong fixed config count\n");
return;
}
// Not all counters can count all things:
// CPMU_CORE_CYCLE {0-7}
// CPMU_FED_IC_MISS_DEM {0-7}
// CPMU_FED_ITLB_MISS {0-7}
// CPMU_INST_BRANCH {3, 4, 5}
// CPMU_SYNC_DC_LOAD_MISS {3, 4, 5}
// CPMU_SYNC_DC_STORE_MISS {3, 4, 5}
// CPMU_SYNC_DTLB_MISS {3, 4, 5}
// CPMU_SYNC_BR_ANY_MISP {3, 4, 5}
// CPMU_SYNC_ST_HIT_YNGR_LD {3, 4, 5}
// CPMU_INST_A64 {5}
// using "CFGWORD_ALLMODES_MASK" is much noisier
g_config[0] = CPMU_CORE_CYCLE | CFGWORD_EL0A64EN_MASK;
// configs[3] = CPMU_SYNC_DC_LOAD_MISS | CFGWORD_EL0A64EN_MASK;
// configs[4] = CPMU_SYNC_DTLB_MISS | CFGWORD_EL0A64EN_MASK;
// configs[5] = CPMU_INST_A64 | CFGWORD_EL0A64EN_MASK;
configure_rdtsc();
}
/*
 * Sample the current thread's counters into g_counters and return entry 2.
 *
 * Returns g_counters[2] on success. If kpc_get_thread_counters() reports a
 * non-zero status, a message is printed to stdout and 1 is returned instead.
 * NOTE(review): index 2 is presumably the first configurable counter (the
 * one programmed via g_config[0]), following the fixed counters -- confirm.
 */
static unsigned long long int rdtsc() {
  const int status = kpc_get_thread_counters(0, COUNTERS_COUNT, g_counters);
  if (status == 0) {
    return g_counters[2];
  }
  printf("kpc_get_thread_counters failed\n");
  return 1;
}
/*
 * Permute the first n elements of array in place using rand().
 *
 * For each position from 0 to n-2, one rand() value is scaled into the range
 * of the not-yet-fixed tail and the element found there is swapped into the
 * current position. Arrays with fewer than two elements are left untouched
 * (and rand() is not called). The result depends only on the rand() stream,
 * so seeding with srand() makes it reproducible.
 */
static void shuffle(int *array, size_t n) {
  if (n < 2) {
    return;
  }

  const size_t last = n - 1;
  size_t pos = 0;
  while (pos < last) {
    /* Width of each bucket when [0, RAND_MAX] is split over the tail. */
    const size_t remaining = n - pos;
    const size_t bucket = RAND_MAX / remaining + 1;
    const size_t pick = pos + rand() / bucket;

    const int held = array[pick];
    array[pick] = array[pos];
    array[pos] = held;

    pos++;
  }
}
/*
 * Allocate and initialise the two 256MB data buffers walked by the generated
 * routine.
 *
 * Both buffers hold the same linked list: the first 8-byte word of every
 * 64-byte line stores the uint64_t index (within the buffer) of the next line
 * to visit. The lines are linked in a shuffled order that forms a single
 * cycle through all of them. Index 0 is always a valid starting point.
 *
 * out_data1 receives the start of the first buffer. out_data2 receives a
 * pointer 64 bytes into a second, slightly larger allocation, so it must not
 * be passed to free(). Neither buffer is ever released by this program.
 *
 * Any allocation failure prints a message to stderr and terminates the
 * process, instead of dereferencing a null pointer later.
 */
static void init_dbufs(uint64_t **out_data1, uint64_t **out_data2) {
  const size_t size = 256 * 1024 * 1024;
  const size_t cache_line_size = 64;
  const size_t count = size / cache_line_size;
  /* Elements are uint64_t, so the per-line stride is measured in those. */
  const size_t stride = cache_line_size / sizeof(uint64_t);

  int *numbers = malloc(count * sizeof *numbers);
  if (numbers == NULL) {
    fprintf(stderr, "init_dbufs: allocation failed\n");
    exit(1);
  }
  for (size_t i = 0; i < count; i++) {
    numbers[i] = (int)i;
  }
  shuffle(numbers, count);

  uint64_t *data1 = calloc(size, 1);
  if (data1 == NULL) {
    fprintf(stderr, "init_dbufs: allocation failed\n");
    exit(1);
  }

  char *raw2 = calloc(size + 64, 1);
  if (raw2 == NULL) {
    fprintf(stderr, "init_dbufs: allocation failed\n");
    exit(1);
  }
  /* Second buffer deliberately starts 64 bytes into its allocation. */
  uint64_t *data2 = (uint64_t *)(raw2 + 64);

  /* Link each line to the one that precedes it in shuffled order; the first
   * one links to the last, closing the cycle. */
  int next = numbers[count - 1];
  for (size_t i = 0; i < count; i++) {
    int n = numbers[i];
    data1[stride * n] = next * stride;
    data2[stride * n] = next * stride;
    next = n;
  }

  *out_data1 = data1;
  *out_data2 = data2;
  free(numbers);
}
/*
 * Stub: ignores both arguments, writes nothing through ibuf, and always
 * returns 0. No other code visible in this file calls it.
 */
static int add_filler(uint32_t *ibuf, int j) {
  (void)ibuf;
  (void)j;
  return 0;
}
// Read a system register and yield its value as a uint64_t.
// SR must be a string literal naming the register, because it is pasted into
// the assembly text by string concatenation. The `mrs` is bracketed by an
// `isb` before and after. Implemented as a GNU statement expression, so it
// can be used wherever an expression is expected.
#define SREG_READ(SR) \
({ \
uint64_t VAL = 0; \
__asm__ volatile("isb \r\n mrs %0, " SR " \r\n isb \r\n" : "=r"(VAL)); \
VAL; \
})
/*
 * Return the low three bits of TPIDRRO_EL0 (a value in 0..7).
 * NOTE(review): this presumably is the index of the core the caller is
 * running on; main treats values below 4 differently from 4 and above --
 * confirm against the platform's use of TPIDRRO_EL0.
 */
int get_current_core() {
  const uint64_t tpidrro = SREG_READ("TPIDRRO_EL0");
  return tpidrro & 7;
}
/*
 * pthread entry point that spins forever.
 *
 * After requesting QOS_CLASS_USER_INTERACTIVE for itself, the thread
 * busy-waits until the low three bits of TPIDRRO_EL0 differ from the value it
 * last recorded, records the new value, and repeats. It never exits, so the
 * final return is unreachable. arg is unused.
 * NOTE(review): presumably these threads exist to keep some cores busy so the
 * main thread is scheduled elsewhere -- confirm.
 */
void *thread(void *arg) {
  (void)arg;
  pthread_set_qos_class_self_np(QOS_CLASS_USER_INTERACTIVE, 0);

  int seen = SREG_READ("TPIDRRO_EL0") & 7;
  for (;;) {
    while ((SREG_READ("TPIDRRO_EL0") & 7) == seen) {
      /* spin until the value changes */
    }
    seen = SREG_READ("TPIDRRO_EL0") & 7;
  }
  return NULL;
}
/* Copy `count` instruction words into ibuf starting at index `at` and return
 * the index just past the last word written. */
static int emit_words(uint32_t *ibuf, int at, const uint32_t *words,
                      int count) {
  for (int k = 0; k < count; k++) {
    ibuf[at + k] = words[k];
  }
  return at + count;
}

/*
 * Write the benchmark routine, as raw AArch64 instruction words, into ibuf.
 *
 * ibuf   - destination for the generated code; main passes a MAP_JIT mapping.
 *          No bound is checked here, so the caller must provide room for
 *          icount + 37 words.
 * icount - number of filler store instructions placed between the two loads
 *          of the loop body (values <= 0 emit none).
 * it     - unused.
 *
 * Generated code, called as routine(x0, x1, x2, x3, x4):
 *   - saves registers on a 0xC0-byte stack frame;
 *   - copies x0/x2/x4/x3/x1 to x26/x27/x28/x29/x30 and x3 to x17;
 *   - loop: load x26 from [x27 + x26*8], icount filler stores, load x30 from
 *     [x29 + x30*8], DSB ISH, ISB, decrement w28, branch back while non-zero;
 *   - returns x26 in x0 after restoring the saved registers.
 *
 * JIT write protection is turned off for the duration of the writes and back
 * on afterwards, then the instruction cache is invalidated for the bytes
 * written.
 */
void make_routine(uint32_t *ibuf, int icount, int it) {
  static const uint32_t prologue[] = {
      0xa9b47bfd, // stp x29, x30, [sp, #-0xC0]!
      0xa9013ff0, // stp x16, x15, [sp, #0x10]
      0xa90247f2, // stp x18, x17, [sp, #0x20]
      0xa9034ff4, // stp x20, x19, [sp, #0x30]
      0xa90457f6, // stp x22, x21, [sp, #0x40]
      0xa9055ff8, // stp x24, x23, [sp, #0x50]
      0xa90667fa, // stp x26, x25, [sp, #0x60]
      0xa9076ffc, // stp x28, x27, [sp, #0x70]
      0x6d083bef, // stp d15, d14, [sp, #0x80]
      0x6d0933ed, // stp d13, d12, [sp, #0x90]
      0x6d0a2beb, // stp d11, d10, [sp, #0xA0]
      0x6d0b23e9, // stp d9, d8, [sp, #0xB0]
  };
  static const uint32_t prep[] = {
      0xaa0003fa, // mov x26, x0
      0xaa0203fb, // mov x27, x2
      0xaa0403fc, // mov x28, x4
      0xaa0303fd, // mov x29, x3
      0xaa0103fe, // mov x30, x1
      0xaa0303f1, // mov x17, x3
  };
  static const uint32_t loop_tail[] = {
      0xf87e7bbe, // ldr x30, [x29, x30, lsl #3]
      // replace with this for size 107 (even with x17 store):
      //   0xf87e7a3e  ldr x30, [x17, x30, lsl #3]
      // lfence mode?
      0xD5033B9F, // DSB ISH
      0xD5033FDF, // ISB
      0x5100079c, // sub w28, w28, #1
  };
  static const uint32_t epilogue[] = {
      0xa9413ff0, // ldp x16, x15, [sp, #0x10]
      0xa94247f2, // ldp x18, x17, [sp, #0x20]
      0xa9434ff4, // ldp x20, x19, [sp, #0x30]
      0xa94457f6, // ldp x22, x21, [sp, #0x40]
      0xa9455ff8, // ldp x24, x23, [sp, #0x50]
      0xa94667fa, // ldp x26, x25, [sp, #0x60]
      0xa9476ffc, // ldp x28, x27, [sp, #0x70]
      0x6d483bef, // ldp d15, d14, [sp, #0x80]
      0x6d4933ed, // ldp d13, d12, [sp, #0x90]
      0x6d4a2beb, // ldp d11, d10, [sp, #0xA0]
      0x6d4b23e9, // ldp d9, d8, [sp, #0xB0]
      0xa8cc7bfd, // ldp x29, x30, [sp], #0xC0
      0xd65f03c0, // ret
  };

  (void)it;
  pthread_jit_write_protect_np(0);

  int n = 0;
  n = emit_words(ibuf, n, prologue, (int)(sizeof prologue / sizeof prologue[0]));
  n = emit_words(ibuf, n, prep, (int)(sizeof prep / sizeof prep[0]));

  /* Top of the timed loop: first pointer-chasing load. */
  const int loop_top = n;
  ibuf[n++] = 0xf87a7b7a; // ldr x26, [x27, x26, lsl #3]

  /*
   * Filler stores. Alternatives the author experimented with:
   *   size 60:
   *     0xf81e03e5  stur x5, [sp, #-32]
   *     0xf90007a5  str x5, [x29, #8]
   *     0x390023a5  strb w5, [x29, #8]   <- currently emitted
   *   size 107:
   *     0xf9000765  str x5, [x27, #8]
   *     0xf9000465  str x5, [x3, #8]
   *     0xf9000625  str x5, [x17, #8]
   */
  for (int filler = 0; filler < icount; filler++) {
    ibuf[n++] = 0x390023a5; // strb w5, [x29, #8]
  }

  n = emit_words(ibuf, n, loop_tail,
                 (int)(sizeof loop_tail / sizeof loop_tail[0]));

  /*
   * Loop back to the top. The displacement is counted in instructions from
   * the branch itself and has to fit the signed 19-bit immediate. The opcode
   * 0xb500001c has the 64-bit bit set (cbnz x28); that is equivalent to
   * testing w28 here because the preceding 32-bit sub clears the upper half
   * of x28.
   */
  const int off = loop_top - n;
  assert(off < 0 && off > -0x40000);
  ibuf[n++] = 0xb500001c | ((off & 0x7ffff) << 5); // cbnz w28

  ibuf[n++] = 0xaa1a03e0; // mov x0, x26

  n = emit_words(ibuf, n, epilogue, (int)(sizeof epilogue / sizeof epilogue[0]));

  pthread_jit_write_protect_np(1);
  sys_icache_invalidate(ibuf, (size_t)n * sizeof ibuf[0]);
}
/*
 * Benchmark driver. There are no command-line options (argc/argv are
 * ignored); adjust the locals at the top of this function instead.
 *
 * For every icount from start_icount to stop_icount the routine is
 * regenerated and timed until outer_its samples have been accepted. A sample
 * is accepted only if get_current_core() gives the same value before and
 * after the timed call and that value is on the requested side of 4 (>= 4
 * when test_high_perf_cores is set, < 4 otherwise); rejected samples are
 * retried without limit.
 *
 * One line is printed per icount: icount, then minimum, mean and maximum
 * counter delta per loop iteration, then a bar with one '-' per 20 units of
 * the (truncated) minimum.
 *
 * Returns 0 on completion; exits with status 1 if a helper thread or the JIT
 * mapping cannot be created.
 */
int main(int argc, char **argv) {
  (void)argc;
  (void)argv;

  int test_high_perf_cores = 1;
  int start_icount = test_high_perf_cores ? 50 : 15;
  int stop_icount = 130;
  int stride_icount = 1;

  // TODO: can we force this to run on the fast cores?
  // counters seemingly fail to update if we initialise
  // them, then switch cores, although the fixed thread
  // counters don't have this problem.
  // QOS_CLASS_BACKGROUND does seem to pin it to the slow
  // cores though.
  if (test_high_perf_cores) {
    pthread_set_qos_class_self_np(QOS_CLASS_USER_INTERACTIVE, 0);
  } else {
#if 1
    pthread_t ids[4];
    for (int i = 0; i < 4; i++) {
      // pthread_create reports failure through its return value and does not
      // set errno, so print the returned code rather than using perror().
      int rc = pthread_create(&ids[i], NULL, thread, NULL);
      if (rc != 0) {
        fprintf(stderr, "pthread_create() error: %d\n", rc);
        exit(1);
      }
    }
#else
    pthread_set_qos_class_self_np(QOS_CLASS_BACKGROUND, 0);
#endif
  }

  init_rdtsc();

  uint64_t *data1, *data2;
  init_dbufs(&data1, &data2);

  void *mapping = mmap(NULL, 0x400000, PROT_READ | PROT_WRITE | PROT_EXEC,
                       MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0);
  // mmap signals failure with MAP_FAILED, not NULL; writing code through that
  // value would crash in make_routine.
  if (mapping == MAP_FAILED) {
    perror("mmap");
    exit(1);
  }
  uint32_t *ibuf = (uint32_t *)mapping;

  // Current list position; carried across calls so every run continues the
  // pointer chase where the previous one stopped.
  uint64_t next = 0;
  for (int icount = start_icount; icount <= stop_icount;
       icount += stride_icount) {
    uint64_t min_diff = 0x7fffffffffffffffLL;
    uint64_t max_diff = 0x0;
    uint64_t sum_diff = 0;
    for (int i = 0; i < outer_its; i++) {
      make_routine(ibuf, icount, i);
      uint64_t (*routine)(uint64_t, uint64_t, uint64_t *, uint64_t *,
                          uint64_t) =
          ptrauth_sign_unauthenticated((void *)ibuf,
                                       ptrauth_key_function_pointer, 0);
      // Short untimed call (two iterations) before the measured one.
      next = routine(next, next, data1, data2, 2);
      // in case we were on the wrong core earlier
      configure_rdtsc();
      int start_core = get_current_core();
      long long start = rdtsc();
      next = routine(next, next, data1, data2, its);
      long long stop = rdtsc();
      int end_core = get_current_core();
      if (start_core != end_core ||
          (start_core < 4) != (!test_high_perf_cores)) {
        // Discard this sample and retry the same slot.
        i--;
        continue;
      }
      uint64_t cycles = stop - start;
      sum_diff += cycles;
      if (min_diff > cycles) {
        min_diff = cycles;
      }
      if (max_diff < cycles) {
        max_diff = cycles;
      }
    }
    printf("%d\t%.2f\t%.2f\t%.2f\t", icount, 1.0 * min_diff / its / unroll,
           1.0 * sum_diff / its / unroll / outer_its,
           1.0 * max_diff / its / unroll);
    int n = 1.0 * min_diff / its / unroll;
    for (int x = 0; x * 20 < n; x++) {
      printf("-");
    }
    printf("\n");
  }
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment