// See <https://gist.github.com/pervognsen/b58108d134824e61caedffcc01004e03> for
// Per Vognsen's gist on value speculation.
//
// I compile this on linux with
//
// $ clang --version
// clang version 12.0.0
// Target: x86_64-unknown-linux-gnu
// $ clang -static -Wall -O3 value-speculation-linux.c -o value-speculation-linux
//
// On an "Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz" I get:
//
// 16kB, 10000 iterations
// sum1: 8465052097154389858, 1.12us, 14.24GB/s, 3.91 cycles/elem, 1.02 instrs/cycle, 3.48GHz, 4.01 instrs/elem
// sum2: 8465052097154389858, 0.57us, 27.95GB/s, 1.99 cycles/elem, 5.02 instrs/cycle, 3.48GHz, 10.01 instrs/elem
// sum3: 8465052097154389858, 0.37us, 43.40GB/s, 1.28 cycles/elem, 3.91 instrs/cycle, 3.48GHz, 5.01 instrs/elem
// sum4: 8465052097154389858, 0.39us, 41.03GB/s, 1.36 cycles/elem, 3.69 instrs/cycle, 3.48GHz, 5.01 instrs/elem
// sum5: 8465052097154389858, 0.37us, 43.55GB/s, 1.28 cycles/elem, 3.92 instrs/cycle, 3.48GHz, 5.01 instrs/elem
// 128kB, 10000 iterations
// sum1: 6947699366156898439, 9.07us, 14.11GB/s, 3.95 cycles/elem, 1.01 instrs/cycle, 3.49GHz, 4.00 instrs/elem
// sum2: 6947699366156898439, 4.51us, 28.36GB/s, 1.97 cycles/elem, 5.09 instrs/cycle, 3.49GHz, 10.00 instrs/elem
// sum3: 6947699366156898439, 3.78us, 33.84GB/s, 1.65 cycles/elem, 3.04 instrs/cycle, 3.48GHz, 5.00 instrs/elem
// sum4: 6947699366156898439, 3.32us, 38.51GB/s, 1.45 cycles/elem, 3.45 instrs/cycle, 3.48GHz, 5.00 instrs/elem
// sum5: 6947699366156898439, 3.78us, 33.86GB/s, 1.65 cycles/elem, 3.04 instrs/cycle, 3.49GHz, 5.00 instrs/elem
// 5000kB, 100 iterations
// sum1: 2134986631019855758, 0.36ms, 13.96GB/s, 3.99 cycles/elem, 1.00 instrs/cycle, 3.48GHz, 4.00 instrs/elem
// sum2: 2134986631019855758, 0.19ms, 26.22GB/s, 2.12 cycles/elem, 4.71 instrs/cycle, 3.48GHz, 10.00 instrs/elem
// sum3: 2134986631019855758, 0.17ms, 28.90GB/s, 1.93 cycles/elem, 2.59 instrs/cycle, 3.48GHz, 5.00 instrs/elem
// sum4: 2134986631019855758, 0.17ms, 30.07GB/s, 1.85 cycles/elem, 2.70 instrs/cycle, 3.48GHz, 5.00 instrs/elem
// sum5: 2134986631019855758, 0.17ms, 28.59GB/s, 1.95 cycles/elem, 2.57 instrs/cycle, 3.48GHz, 5.00 instrs/elem
// 4294MB, 1 iterations
// sum1: 15446485409674718527, 0.45 s, 9.54GB/s, 5.84 cycles/elem, 0.68 instrs/cycle, 3.48GHz, 4.00 instrs/elem
// sum2: 15446485409674718527, 0.33 s, 13.15GB/s, 4.23 cycles/elem, 2.36 instrs/cycle, 3.48GHz, 10.00 instrs/elem
// sum3: 15446485409674718527, 0.31 s, 13.82GB/s, 4.02 cycles/elem, 1.24 instrs/cycle, 3.48GHz, 5.00 instrs/elem
// sum4: 15446485409674718527, 0.30 s, 14.45GB/s, 3.85 cycles/elem, 1.30 instrs/cycle, 3.47GHz, 5.00 instrs/elem
// sum5: 15446485409674718527, 0.31 s, 14.00GB/s, 3.97 cycles/elem, 1.26 instrs/cycle, 3.48GHz, 5.00 instrs/elem
//
// When the data fits in the cache (L1, L2, and L3 specifically) the increase in
// performance is pronounced (and in fact larger than in Per's experiments). With a
// bigger dataset (the size Per Vognsen uses) I believe I am hitting the limits of
// the RAM itself, given that:
//
// $ sysbench memory --memory-block-size=1G --memory-oper=read --threads=1 run
// ...
// 102400.00 MiB transferred (12081.43 MiB/sec)
//
// The first three datasets are sized to fit in L1, L2, and L3 respectively;
// the last one doesn't fit in any cache.
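//
// A rough sketch of why value speculation helps here (see Per's gist for the
// full story): the nodes below are allocated in one contiguous array, so
// node->next is almost always node + 1. A plain traversal serializes on the
// pointer load -- the CPU cannot start fetching node->next->value until
// node->next itself has arrived -- while the speculated traversal computes
// the guessed address with a cheap add that does not depend on the load:
//
//   node = node->next; // load on the critical path, several cycles
//   node = node + 1;   // plain add, usually the same address, one cycle
//
// When the guess is right the pipeline stays full; when it's wrong we just
// fall back to the loaded pointer.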
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
#include <sched.h>
#include <stdbool.h>
#define NOINLINE __attribute__((noinline))
// The functions we're benchmarking
typedef struct Node {
uint64_t value;
struct Node *next;
} Node;
NOINLINE
static uint64_t sum1(Node *node) {
uint64_t value = 0;
while (node) {
value += node->value;
node = node->next;
}
return value;
}
// This is the version from Per, without the loop unrolling -- it works (in
// clang anyway), but we can do a bit better (see `sum3`).
NOINLINE
static uint64_t sum2(Node *node) {
uint64_t value = 0;
while (node) {
value += node->value;
Node *predicted_next = node + 1;
Node *next = node->next;
if (next == predicted_next) {
// Prevent compilers from optimizing this apparently meaningless branch
// away by making them think we're changing predicted_next here.
//
// This trick, however, does not work with GCC, only with clang. GCC here
// derives that `next` and `predicted_next` are the same, and therefore
// merges them into the same variable, which re-introduces the data
// dependency we wanted to get rid of.
asm("" : "+r"(predicted_next));
node = predicted_next;
} else {
node = next;
}
}
return value;
}
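// The empty asm above acts as an optimization barrier: the "+r" constraint
// claims predicted_next may have been rewritten, so the compiler can no
// longer prove predicted_next == next inside the branch and fold the two
// variables together. The same trick in isolation (an illustrative sketch,
// not part of the benchmark):
//
//   uint64_t y = x;
//   asm("" : "+r"(y)); // y must now be treated as unrelated to x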
// We would like to just write the following C code:
//
// uint64_t sum2(Node* node) {
//   uint64_t value = 0;
//   while (node) {
//     value += node->value;
//     Node* next = node->next;
//     node++;
//     if (node != next) {
//       node = next;
//     }
//   }
//   return value;
// }
//
// But it's hard to stop compilers from optimizing away the semantically
// pointless node++ and the if.
//
// This uses 5 instructions per element, leveraging the fact that if next
// is the same as the predicted next, we know there's a next iteration.
//
// The original version used 6 instructions per element, but Rhialto pointed
// out on Twitter that a jump could be eliminated:
// <https://twitter.com/RhialtoTheM/status/1418926515526459394>
NOINLINE
static uint64_t sum3(Node *node) {
uint64_t value = 0;
Node *next = NULL;
asm (
"begin_loop_%=:\n"
" testq %[node], %[node]\n"
" je end_%=\n"
"loop_body_%=:\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" je loop_body_%=\n"
" movq %[next], %[node]\n"
" testq %[node], %[node]\n"
" jne loop_body_%=\n"
"end_%=:\n"
: [node]"+r"(node), [value]"+r"(value), [next]"+r"(next)
:
: "cc"
);
return value;
}
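// For the record, the five instructions per element in the hot path above
// are: addq (load the value and accumulate), movq (load node->next),
// addq $16 (compute the predicted next), and cmpq/je (verify the guess and
// loop). Note that the addq $16 does not depend on the movq load, so the
// next iteration's address is ready immediately whenever the guess holds.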
// Like `sum3`, but unroll the loop a bunch of times.
NOINLINE
static uint64_t sum4(Node *node) {
uint64_t value = 0;
Node *next = NULL;
asm (
"begin_loop_%=:\n"
" testq %[node], %[node]\n"
" je end_%=\n"
"loop_body_%=:\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" je loop_body_%=\n"
"assign_node_%=:\n"
" movq %[next], %[node]\n"
" testq %[node], %[node]\n"
" jne loop_body_%=\n"
"end_%=:\n"
: [node]"+r"(node), [value]"+r"(value), [next]"+r"(next)
:
: "cc"
);
return value;
}
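// A note on the unrolling: while the prediction keeps holding, sum4 runs
// through a straight sequence of not-taken jne branches instead of one
// taken back-edge per element, which should be a little easier on the
// front end. Judging by the numbers above the payoff is modest, and mostly
// visible once the data no longer fits in L1.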
// Version that needs no assembly, due to Alexander Monakov
// <https://twitter.com/_monoid/status/1418663360871141376>. Does
// roughly as well as sum3.
NOINLINE
static uint64_t sum5(Node *node) {
uint64_t value = 0;
for (; node; node = node->next) {
for (;;) {
value += node->value;
if (node + 1 != node->next) {
break;
}
node++;
}
}
return value;
}
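// The nested loops encode the speculation in plain C: the inner loop keeps
// advancing with node++ as long as the guess node + 1 == node->next holds,
// and the outer loop re-synchronizes through node = node->next when it
// fails. Because node++ genuinely feeds the next inner iteration, clang
// (in my build, at least) does not merge it with node->next the way GCC
// merges the variables in sum2.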
// Pin to the first CPU, so the benchmark doesn't migrate between cores
// mid-measurement.
static void pin_to_cpu_0(void) {
cpu_set_t cpu_mask;
CPU_ZERO(&cpu_mask);
CPU_SET(0, &cpu_mask);
if (sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask) != 0) {
fprintf(stderr, "Could not set CPU affinity\n");
exit(EXIT_FAILURE);
}
}
// perf instrumentation -- a mixture of man 2 perf_event_open and
// <https://stackoverflow.com/a/42092180>
static long
perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) {
return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}
static void setup_perf_event(
struct perf_event_attr *evt,
int *fd,
uint64_t *id,
uint32_t evt_type,
uint64_t evt_config,
int group_fd
) {
memset(evt, 0, sizeof(struct perf_event_attr));
evt->type = evt_type;
evt->size = sizeof(struct perf_event_attr);
evt->config = evt_config;
evt->disabled = 1;
evt->exclude_kernel = 1;
evt->exclude_hv = 1;
evt->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
*fd = perf_event_open(evt, 0, -1, group_fd, 0);
if (*fd == -1) {
fprintf(stderr, "Error opening leader %llx\n", evt->config);
exit(EXIT_FAILURE);
}
ioctl(*fd, PERF_EVENT_IOC_ID, id);
}
static struct perf_event_attr perf_cycles_evt;
static int perf_cycles_fd;
static uint64_t perf_cycles_id;
static struct perf_event_attr perf_clock_evt;
static int perf_clock_fd;
static uint64_t perf_clock_id;
static struct perf_event_attr perf_instrs_evt;
static int perf_instrs_fd;
static uint64_t perf_instrs_id;
static struct perf_event_attr perf_cache_misses_evt;
static int perf_cache_misses_fd;
static uint64_t perf_cache_misses_id;
static struct perf_event_attr perf_cache_references_evt;
static int perf_cache_references_fd;
static uint64_t perf_cache_references_id;
static void perf_init(void) {
// Cycles
setup_perf_event(
&perf_cycles_evt, &perf_cycles_fd, &perf_cycles_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES, -1
);
// Clock
setup_perf_event(
&perf_clock_evt, &perf_clock_fd, &perf_clock_id,
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK, perf_cycles_fd
);
// Instructions
setup_perf_event(
&perf_instrs_evt, &perf_instrs_fd, &perf_instrs_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, perf_cycles_fd
);
// Cache misses
setup_perf_event(
&perf_cache_misses_evt, &perf_cache_misses_fd, &perf_cache_misses_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, perf_cycles_fd
);
// Cache references
setup_perf_event(
&perf_cache_references_evt, &perf_cache_references_fd, &perf_cache_references_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, perf_cycles_fd
);
}
static void perf_close(void) {
close(perf_clock_fd);
close(perf_cycles_fd);
close(perf_instrs_fd);
close(perf_cache_misses_fd);
close(perf_cache_references_fd);
}
static void disable_perf_count(void) {
ioctl(perf_cycles_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
}
static void enable_perf_count(void) {
ioctl(perf_cycles_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}
static void reset_perf_count(void) {
ioctl(perf_cycles_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
}
struct perf_read_value {
uint64_t value;
uint64_t id;
};
struct perf_read_format {
uint64_t nr;
struct perf_read_value values[];
};
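// Layout of a PERF_FORMAT_GROUP | PERF_FORMAT_ID read on the leader fd:
// nr, followed by one { value, id } pair per event in the group. The ids
// are the ones PERF_EVENT_IOC_ID gave us in setup_perf_event, so
// read_perf_count can attribute each value regardless of ordering.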
static char perf_read_buf[4096];
struct perf_count {
uint64_t cycles;
double seconds;
uint64_t instructions;
uint64_t cache_misses;
uint64_t cache_references;
};
static void read_perf_count(struct perf_count *count) {
if (read(perf_cycles_fd, perf_read_buf, sizeof(perf_read_buf)) <= 0) {
fprintf(stderr, "Could not read cycles from perf\n");
exit(EXIT_FAILURE);
}
struct perf_read_format* rf = (struct perf_read_format *) perf_read_buf;
if (rf->nr != 5) {
fprintf(stderr, "Bad number of perf events\n");
exit(EXIT_FAILURE);
}
for (uint64_t i = 0; i < rf->nr; i++) {
struct perf_read_value *value = &rf->values[i];
if (value->id == perf_cycles_id) {
count->cycles = value->value;
} else if (value->id == perf_clock_id) {
// The task clock is reported in nanoseconds; convert to seconds.
count->seconds = ((double) (value->value / 1000ull)) / 1000000.0;
} else if (value->id == perf_instrs_id) {
count->instructions = value->value;
} else if (value->id == perf_cache_misses_id) {
count->cache_misses = value->value;
} else if (value->id == perf_cache_references_id) {
count->cache_references = value->value;
} else {
fprintf(stderr, "Spurious value in perf read (%lld)\n", value->id);
exit(EXIT_FAILURE);
}
}
}
// random numbers, from Per's other gist
static uint64_t random_uint64(void) {
static uint64_t x = 0x2545F4914F6CDD1D;
x ^= x << 13;
x ^= x >> 7;
x ^= x << 17;
return x;
}
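// (That's Marsaglia's 13/7/17 xorshift64 with a fixed seed, so the node
// values -- and therefore the sums printed above -- are reproducible from
// run to run.)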
// run our functions
NOINLINE
static void run_sum(int pre_iterations, int iterations, uint64_t n, Node *node, const char* name, uint64_t (*f)(Node *)) {
uint64_t value;
for (int i = 0; i < pre_iterations; i++) {
value = f(node);
}
struct perf_count counts;
counts.seconds = 0.0;
counts.cycles = 0;
counts.instructions = 0;
counts.cache_misses = 0;
counts.cache_references = 0;
// Enabling / disabling the counters inside each iteration does not seem
// to yield precise results for short durations. Maybe we can tune the
// sampling rate?
disable_perf_count();
reset_perf_count();
enable_perf_count();
for (int i = 0; i < iterations; i++) {
value = f(node);
}
disable_perf_count();
read_perf_count(&counts);
counts.seconds /= ((double) iterations);
counts.cycles /= ((uint64_t) iterations);
counts.instructions /= ((uint64_t) iterations);
counts.cache_misses /= ((uint64_t) iterations);
counts.cache_references /= ((uint64_t) iterations);
uint64_t bytes = n * sizeof(Node);
double gb_per_s = (((double) bytes) / 1000000000.0) / counts.seconds;
double time = counts.seconds;
const char *unit = " s";
if (time < 0.1) {
time *= 1000.0;
unit = "ms";
}
if (time < 0.1) {
time *= 1000.0;
unit = "us";
}
printf(
" %s: %20lu, %5.2f%s, %6.2fGB/s, %5.2f cycles/elem, %5.2f instrs/cycle, %5.2fGHz, %5.2f instrs/elem\n",
name, value, time, unit, gb_per_s, ((double) counts.cycles) / ((double) n), ((double) counts.instructions) / ((double) counts.cycles), ((double) counts.cycles) / counts.seconds / 1000000000.0, ((double) counts.instructions) / ((double) n)
);
}
static void run_tests(int iterations, uint64_t n) {
Node *nodes = malloc(n * sizeof(Node));
if (!nodes) {
fprintf(stderr, "Could not allocate nodes\n");
exit(EXIT_FAILURE);
}
for (uint64_t i = 0; i < n - 1; i++) {
nodes[i].value = random_uint64();
nodes[i].next = &nodes[i+1];
}
nodes[n-1].value = random_uint64();
nodes[n-1].next = NULL;
perf_init();
uint64_t amount = n * sizeof(Node);
const char *unit = "B";
if (amount > 10000ull) {
amount /= 1000ull;
unit = "kB";
}
if (amount > 10000ull) {
amount /= 1000ull;
unit = "MB";
}
if (amount > 10000ull) {
amount /= 1000ull;
unit = "GB";
}
printf("%lu%s, %d iterations\n", amount, unit, iterations);
run_sum(iterations, iterations, n, &nodes[0], "sum1", &sum1);
run_sum(iterations, iterations, n, &nodes[0], "sum2", &sum2);
run_sum(iterations, iterations, n, &nodes[0], "sum3", &sum3);
run_sum(iterations, iterations, n, &nodes[0], "sum4", &sum4);
run_sum(iterations, iterations, n, &nodes[0], "sum5", &sum5);
perf_close();
free(nodes);
}
int main() {
pin_to_cpu_0();
run_tests(10000, 1000llu); // 16kB -- should fit in L1
run_tests(10000, 8000llu); // 128kB -- should fit in L2
run_tests( 100, 312500llu); // 5MB -- should fit in L3
run_tests( 1, 1llu << 28); // 4GB -- doesn't fit anywhere
return EXIT_SUCCESS;
}