Last active
December 20, 2023 19:33
-
-
Save azat/2dc33fdadbb2feaf18e9cb591392f6cb to your computer and use it in GitHub Desktop.
Answers the question "Does cache oblivious in jemalloc still make sense?" - Yes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <bits/time.h> | |
#include <stdint.h> | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <sys/types.h> | |
#include <time.h> | |
// Answers the question "Does cache oblivious in jemalloc still make sense?" | |
// The short answer is "Yes"! | |
// | |
// $ clang -O3 -g3 bench-malloc.c -o bench-malloc && prlimit --cpu=10 ./bench-malloc | |
// | |
// $ LD_PRELOAD=/src/oss/jemalloc/.build/lib/libjemalloc.so.2 ./bench-malloc | |
// elapsed: 205832268 | |
// elapsed: 2061036 | |
// elapsed: 526032 | |
// elapsed: 515628 | |
// | |
// $ LD_PRELOAD=/src/oss/jemalloc/.build-no-cache-oblivious/lib/libjemalloc.so.2 ./bench-malloc | |
// elapsed: 206214588 | |
// elapsed: 3120804 | |
// elapsed: 2628288 | |
// elapsed: 2583684 | |
// | |
// *(Numbers from AMD Ryzen Threadripper PRO 5975WX)* | |
// | |
// Refs: | |
// - https://github.com/jemalloc/jemalloc/issues/1098 | |
// - https://www.cs.tau.ac.il/~mad/publications/ismm2011-CIF.pdf | |
/*
 * Read the CPU time-stamp counter (x86-64, GCC/Clang inline asm).
 *
 * CPUID is executed first to serialize the instruction stream, then RDTSC
 * is read. Both asm statements carry a "memory" clobber so the compiler
 * does not move loads/stores of the measured region across the timestamp.
 *
 * Declared `static` so the function has internal linkage: a plain C99/C11
 * `inline` definition emits no out-of-line symbol, which breaks the link
 * whenever the call is not inlined (e.g. at -O0).
 *
 * Returns the 64-bit counter assembled from EDX (high half) and EAX (low half).
 */
static __inline__ uint64_t rdtsc(void)
{
    uint32_t lo, hi;

    __asm__ __volatile__ ( // serialize
        "xorl %%eax,%%eax \n cpuid"
        ::: "%rax", "%rbx", "%rcx", "%rdx", "memory");
    /* We cannot use "=A", since this would use %rax on x86_64 and return only the lower 32bits of the TSC */
    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi) : : "memory");

    return ((uint64_t)hi << 32) | lo;
}
/* Number of separately allocated buffers touched by the benchmark. */
#define N 65535

/*
 * Benchmark entry point.
 *
 * Allocates N buffers of 16 KiB each, then runs four timed passes; each pass
 * updates the first int of every buffer and prints the elapsed TSC ticks
 * reported by rdtsc().
 *
 * Returns 0 on success, EXIT_FAILURE if any allocation fails.
 */
int main(void)
{
    int **array = calloc(N, sizeof *array);
    if (array == NULL)
    {
        perror("calloc");
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < N; ++i)
    {
        // 16K or above is required: jemalloc's cache-oblivious mode only makes a difference for such sizes
        array[i] = malloc(16 << 10);
        if (array[i] == NULL)
        {
            perror("malloc");
            return EXIT_FAILURE;
        }
    }

    for (size_t n = 0; n < 4; ++n)
    {
        uint64_t start = rdtsc();
        // NOTE(review): the buffers are never initialized before this read-modify-write;
        // presumably intentional so the access pattern stays minimal -- confirm.
        for (size_t i = 0; i < N; ++i)
        {
            *array[i] *= 3;
        }
        uint64_t end = rdtsc();

        // uint64_t is not guaranteed to be unsigned long, so print via unsigned long long
        printf("elapsed: %llu\n", (unsigned long long)(end - start));
    }

    // The buffers are deliberately not freed; the process exits right away.
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Increasing the number of iterations to 100 makes these numbers show up in the profiler, and you can see that without cache oblivious there are more
L1-dcache-load-misses
and dTLB-load-misses
perf stat
jemalloc without cache oblivious
jemalloc with cache oblivious