@azat
Last active December 20, 2023 19:33
Answers the question "Does cache oblivious in jemalloc still make sense?" - Yes
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
// Answers the question "Does cache oblivious in jemalloc still make sense?"
// The short answer is "Yes"!
//
// $ clang -O3 -g3 bench-malloc.c -o bench-malloc && prlimit --cpu=10 ./bench-malloc
//
// $ LD_PRELOAD=/src/oss/jemalloc/.build/lib/libjemalloc.so.2 ./bench-malloc
// elapsed: 205832268
// elapsed: 2061036
// elapsed: 526032
// elapsed: 515628
//
// $ LD_PRELOAD=/src/oss/jemalloc/.build-no-cache-oblivious/lib/libjemalloc.so.2 ./bench-malloc
// elapsed: 206214588
// elapsed: 3120804
// elapsed: 2628288
// elapsed: 2583684
//
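// Note (assumption, not stated in the gist): the .build-no-cache-oblivious
// tree is presumably a jemalloc build configured with --disable-cache-oblivious.
//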
// *(Numbers from AMD Ryzen Threadripper PRO 5975WX)*
//
// Refs:
// - https://github.com/jemalloc/jemalloc/issues/1098
// - https://www.cs.tau.ac.il/~mad/publications/ismm2011-CIF.pdf
static inline uint64_t rdtsc(void)
{
    uint32_t lo, hi;
    /* cpuid serializes execution so that rdtsc is not reordered with earlier instructions */
    __asm__ __volatile__ (
        "xorl %%eax,%%eax \n cpuid"
        ::: "%rax", "%rbx", "%rcx", "%rdx");
    /* We cannot use "=A": on x86_64 it would bind to %rax and return only the lower 32 bits of the TSC */
    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
    return (uint64_t)hi << 32 | lo;
}
#define N 65535
int main()
{
    int ** array = calloc(N, sizeof(int *));
    for (size_t i = 0; i < N; ++i)
    {
        // Allocations need to be 16 KiB or larger: jemalloc's cache-oblivious
        // placement only makes a difference for large allocations.
        // (The sketch after this program illustrates why.)
        array[i] = malloc(16<<10);
    }
    for (size_t n = 0; n < 4; ++n)
    {
        uint64_t start = rdtsc();
        for (size_t i = 0; i < N; ++i)
            *array[i] *= 3;
        uint64_t end = rdtsc();
        printf("elapsed: %" PRIu64 "\n", end - start);
    }
    // The allocations are intentionally leaked; the process exits immediately anyway.
    return 0;
}
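
To illustrate why the 16 KiB allocations matter, here is a minimal standalone sketch (not part of the original gist): it prints the in-page offset of a handful of large allocations. Without cache-oblivious placement, jemalloc returns page-aligned pointers for large allocations, so the first word of every buffer maps to the same L1 cache set; with it enabled, the offset is randomized at cache-line granularity. The set calculation assumes 64-byte cache lines and 64 L1D sets (a typical 32 KiB, 8-way L1 data cache); the exact geometry depends on the CPU.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    for (int i = 0; i < 8; ++i)
    {
        void * p = malloc(16 << 10);
        uintptr_t addr = (uintptr_t)p;
        // In-page offset of the pointer and the L1D set its first cache line
        // maps to (assuming 64-byte lines and 64 sets).
        printf("ptr=%p page-offset=%4zu l1d-set=%2zu\n",
               p, (size_t)(addr & 4095), (size_t)((addr >> 6) & 63));
    }
    return 0;
}

Running this under each of the two LD_PRELOAD builds above should show identical page offsets for the build without cache-oblivious placement and varying, cache-line-aligned offsets for the build with it.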
azat commented Dec 20, 2023

Numbers from AMD Ryzen Threadripper PRO 5975WX:

$ clang -O3 -g3 bench-malloc.c -o bench-malloc && prlimit --cpu=10 ./bench-malloc

$ LD_PRELOAD=/src/oss/jemalloc/.build/lib/libjemalloc.so.2 ./bench-malloc
elapsed: 205832268
elapsed: 2061036
elapsed: 526032
elapsed: 515628

$ LD_PRELOAD=/src/oss/jemalloc/.build-no-cache-oblivious/lib/libjemalloc.so.2 ./bench-malloc
elapsed: 206214588
elapsed: 3120804
elapsed: 2628288
elapsed: 2583684

azat commented Dec 20, 2023

Increasing the number of timed iterations to 100 makes these numbers show up in the profiler; you can see that without cache-oblivious placement there are more L1-dcache-load-misses and dTLB-load-misses:
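As a minimal sketch of that change (nothing else in the benchmark is assumed to be modified), the timed loop in main() becomes:

for (size_t n = 0; n < 100; ++n)   // 100 passes instead of 4
{
    uint64_t start = rdtsc();
    for (size_t i = 0; i < N; ++i)
        *array[i] *= 3;
    uint64_t end = rdtsc();
    printf("elapsed: %" PRIu64 "\n", end - start);
}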

perf stat

jemalloc without cache-oblivious placement

$ LD_PRELOAD=/src/oss/jemalloc/.build-no-cache-oblivious/lib/libjemalloc.so.2 perf stat -ddd ./bench-malloc
...
 Performance counter stats for './bench-malloc':

            166.55 msec task-clock                       #    0.996 CPUs utilized
                 1      context-switches                 #    6.004 /sec
                 0      cpu-migrations                   #    0.000 /sec
            66,852      page-faults                      #  401.398 K/sec
       725,678,276      cycles                           #    4.357 GHz                         (17.78%)
        12,979,725      stalled-cycles-frontend          #    1.79% frontend cycles idle        (19.45%)
        21,721,064      stalled-cycles-backend           #    2.99% backend cycles idle         (21.25%)
       749,803,299      instructions                     #    1.03  insn per cycle
                                                  #    0.03  stalled cycles per insn     (21.49%)
       146,939,234      branches                         #  882.264 M/sec                       (21.61%)
           297,380      branch-misses                    #    0.20% of all branches             (21.61%)
       237,663,001      L1-dcache-loads                  #    1.427 G/sec                       (21.62%)
        39,964,059      L1-dcache-load-misses            #   16.82% of all L1-dcache accesses   (21.61%)
   <not supported>      LLC-loads
   <not supported>      LLC-load-misses
        47,339,691      L1-icache-loads                  #  284.241 M/sec                       (21.62%)
           507,670      L1-icache-load-misses            #    1.07% of all L1-icache accesses   (21.62%)
        16,649,890      dTLB-loads                       #   99.971 M/sec                       (21.11%)
         4,764,871      dTLB-load-misses                 #   28.62% of all dTLB cache accesses  (19.30%)
                74      iTLB-loads                       #  444.316 /sec                        (17.50%)
           288,943      iTLB-load-misses                 # 390463.51% of all iTLB cache accesses  (16.21%)
         6,954,391      L1-dcache-prefetches             #   41.756 M/sec                       (16.21%)
   <not supported>      L1-dcache-prefetch-misses

       0.167295842 seconds time elapsed

       0.083487000 seconds user
       0.083483000 seconds sys

jemalloc with cache-oblivious placement

$ LD_PRELOAD=/src/oss/jemalloc/.build/lib/libjemalloc.so.2 perf stat -ddd ./bench-malloc
 Performance counter stats for './bench-malloc':

            110.19 msec task-clock                       #    0.995 CPUs utilized
                 1      context-switches                 #    9.075 /sec
                 1      cpu-migrations                   #    9.075 /sec
            67,349      page-faults                      #  611.218 K/sec
       481,200,036      cycles                           #    4.367 GHz                         (17.80%)
        11,929,597      stalled-cycles-frontend          #    2.48% frontend cycles idle        (20.53%)
        16,922,271      stalled-cycles-backend           #    3.52% backend cycles idle         (23.25%)
       834,993,130      instructions                     #    1.74  insn per cycle
                                                  #    0.02  stalled cycles per insn     (24.50%)
       170,839,336      branches                         #    1.550 G/sec                       (24.50%)
           178,939      branch-misses                    #    0.10% of all branches             (24.50%)
       252,957,276      L1-dcache-loads                  #    2.296 G/sec                       (24.48%)
         7,838,587      L1-dcache-load-misses            #    3.10% of all L1-dcache accesses   (22.30%)
   <not supported>      LLC-loads
   <not supported>      LLC-load-misses
        55,823,354      L1-icache-loads                  #  506.619 M/sec                       (19.70%)
           828,421      L1-icache-load-misses            #    1.48% of all L1-icache accesses   (16.88%)
         4,731,785      dTLB-loads                       #   42.943 M/sec                       (16.34%)
           418,129      dTLB-load-misses                 #    8.84% of all dTLB cache accesses  (16.22%)
                42      iTLB-loads                       #  381.166 /sec                        (16.34%)
           228,691      iTLB-load-misses                 # 544502.38% of all iTLB cache accesses  (16.34%)
         2,439,273      L1-dcache-prefetches             #   22.137 M/sec                       (16.34%)
   <not supported>      L1-dcache-prefetch-misses

       0.110748395 seconds time elapsed

       0.033515000 seconds user
       0.077077000 seconds sys
