// See <https://gist.github.com/pervognsen/b58108d134824e61caedffcc01004e03> for
// Per Vognsen's gist on value speculation.
//
// I compile this on linux with
//
// $ clang --version
// clang version 12.0.0
// Target: x86_64-unknown-linux-gnu
// $ clang -static -Wall -O3 value-speculation-linux.c -o value-speculation-linux
//
// On an "Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz" I get:
//
// 16kB, 10000 iterations
// sum1: 8465052097154389858, 1.12us, 14.24GB/s, 3.91 cycles/elem, 1.02 instrs/cycle, 3.48GHz, 4.01 instrs/elem
// sum2: 8465052097154389858, 0.57us, 27.95GB/s, 1.99 cycles/elem, 5.02 instrs/cycle, 3.48GHz, 10.01 instrs/elem
// sum3: 8465052097154389858, 0.37us, 43.40GB/s, 1.28 cycles/elem, 3.91 instrs/cycle, 3.48GHz, 5.01 instrs/elem
// sum4: 8465052097154389858, 0.39us, 41.03GB/s, 1.36 cycles/elem, 3.69 instrs/cycle, 3.48GHz, 5.01 instrs/elem
// sum5: 8465052097154389858, 0.37us, 43.55GB/s, 1.28 cycles/elem, 3.92 instrs/cycle, 3.48GHz, 5.01 instrs/elem
// 128kB, 10000 iterations
// sum1: 6947699366156898439, 9.07us, 14.11GB/s, 3.95 cycles/elem, 1.01 instrs/cycle, 3.49GHz, 4.00 instrs/elem
// sum2: 6947699366156898439, 4.51us, 28.36GB/s, 1.97 cycles/elem, 5.09 instrs/cycle, 3.49GHz, 10.00 instrs/elem
// sum3: 6947699366156898439, 3.78us, 33.84GB/s, 1.65 cycles/elem, 3.04 instrs/cycle, 3.48GHz, 5.00 instrs/elem
// sum4: 6947699366156898439, 3.32us, 38.51GB/s, 1.45 cycles/elem, 3.45 instrs/cycle, 3.48GHz, 5.00 instrs/elem
// sum5: 6947699366156898439, 3.78us, 33.86GB/s, 1.65 cycles/elem, 3.04 instrs/cycle, 3.49GHz, 5.00 instrs/elem
// 5000kB, 100 iterations
// sum1: 2134986631019855758, 0.36ms, 13.96GB/s, 3.99 cycles/elem, 1.00 instrs/cycle, 3.48GHz, 4.00 instrs/elem
// sum2: 2134986631019855758, 0.19ms, 26.22GB/s, 2.12 cycles/elem, 4.71 instrs/cycle, 3.48GHz, 10.00 instrs/elem
// sum3: 2134986631019855758, 0.17ms, 28.90GB/s, 1.93 cycles/elem, 2.59 instrs/cycle, 3.48GHz, 5.00 instrs/elem
// sum4: 2134986631019855758, 0.17ms, 30.07GB/s, 1.85 cycles/elem, 2.70 instrs/cycle, 3.48GHz, 5.00 instrs/elem
// sum5: 2134986631019855758, 0.17ms, 28.59GB/s, 1.95 cycles/elem, 2.57 instrs/cycle, 3.48GHz, 5.00 instrs/elem
// 4294MB, 1 iterations
// sum1: 15446485409674718527, 0.45 s, 9.54GB/s, 5.84 cycles/elem, 0.68 instrs/cycle, 3.48GHz, 4.00 instrs/elem
// sum2: 15446485409674718527, 0.33 s, 13.15GB/s, 4.23 cycles/elem, 2.36 instrs/cycle, 3.48GHz, 10.00 instrs/elem
// sum3: 15446485409674718527, 0.31 s, 13.82GB/s, 4.02 cycles/elem, 1.24 instrs/cycle, 3.48GHz, 5.00 instrs/elem
// sum4: 15446485409674718527, 0.30 s, 14.45GB/s, 3.85 cycles/elem, 1.30 instrs/cycle, 3.47GHz, 5.00 instrs/elem
// sum5: 15446485409674718527, 0.31 s, 14.00GB/s, 3.97 cycles/elem, 1.26 instrs/cycle, 3.48GHz, 5.00 instrs/elem
//
// When the data fits in the cache (L1, L2, and L3 specifically) the increase in
// performance is pronounced (and in fact larger than in Per's experiments). With a
// bigger dataset (the size Per Vognsen uses) I believe I am hitting the limits of
// the RAM itself, given that:
//
// $ sysbench memory --memory-block-size=1G --memory-oper=read --threads=1 run
// ...
// 102400.00 MiB transferred (12081.43 MiB/sec)
//
// The first three datasets are sized to fit in L1, L2, and L3 respectively;
// the last one doesn't fit in any cache.
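//
// A rough sketch of why value speculation helps here (see Per's gist for the
// full story): the nodes below are allocated in one contiguous array, so
// node->next is almost always node + 1. A plain traversal serializes on the
// pointer load -- the CPU cannot start fetching node->next->value until
// node->next itself has arrived -- while the speculated traversal computes
// the guessed address with a cheap add that does not depend on the load:
//
//   node = node->next; // load on the critical path, several cycles
//   node = node + 1;   // plain add, usually the same address, one cycle
//
// When the guess is right the pipeline stays full; when it's wrong we just
// fall back to the loaded pointer.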
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>
#include <sched.h>
#include <stdbool.h>
#define NOINLINE __attribute__((noinline))
// The functions we're benchmarking
typedef struct Node {
uint64_t value;
struct Node *next;
} Node;
NOINLINE
static uint64_t sum1(Node *node) {
uint64_t value = 0;
while (node) {
value += node->value;
node = node->next;
}
return value;
}
// This is the version from Per, without the loop unrolling -- it works (in
// clang anyway), but we can do a bit better (see `sum3`).
NOINLINE
static uint64_t sum2(Node *node) {
uint64_t value = 0;
while (node) {
value += node->value;
Node *predicted_next = node + 1;
Node *next = node->next;
if (next == predicted_next) {
// Prevent compilers from optimizing this apparently meaningless branch
// away by making them think we're changing predicted_next here.
//
// This trick, however, does not work with GCC, only with clang. GCC here
// derives that `next` and `predicted_next` are the same, and therefore
// merges them into the same variable, which re-introduces the data
// dependency we wanted to get rid of.
asm("" : "+r"(predicted_next));
node = predicted_next;
} else {
node = next;
}
}
return value;
}
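// The empty asm above acts as an optimization barrier: the "+r" constraint
// claims predicted_next may have been rewritten, so the compiler can no
// longer prove predicted_next == next inside the branch and fold the two
// variables together. The same trick in isolation (an illustrative sketch,
// not part of the benchmark):
//
//   uint64_t y = x;
//   asm("" : "+r"(y)); // y must now be treated as unrelated to x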
// We would like to just write the following C code:
//
// uint64_t sum2(Node* node) {
//   uint64_t value = 0;
//   while (node) {
//     value += node->value;
//     Node* next = node->next;
//     node++;
//     if (node != next) {
//       node = next;
//     }
//   }
//   return value;
// }
//
// But it's hard to stop compilers from optimizing away the semantically
// pointless node++ and the if.
//
// This uses 5 instructions per element, leveraging the fact that if next
// is the same as the predicted next, we know there's a next iteration.
//
// The original version used 6 instructions per element, but Rhialto pointed
// out on Twitter that a jump could be eliminated:
// <https://twitter.com/RhialtoTheM/status/1418926515526459394>
NOINLINE
static uint64_t sum3(Node *node) {
uint64_t value = 0;
Node *next = NULL;
asm (
"begin_loop_%=:\n"
" testq %[node], %[node]\n"
" je end_%=\n"
"loop_body_%=:\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" je loop_body_%=\n"
" movq %[next], %[node]\n"
" testq %[node], %[node]\n"
" jne loop_body_%=\n"
"end_%=:\n"
: [node]"+r"(node), [value]"+r"(value), [next]"+r"(next)
:
: "cc"
);
return value;
}
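// For the record, the five instructions per element in the hot path above
// are: addq (load the value and accumulate), movq (load node->next),
// addq $16 (compute the predicted next), and cmpq/je (verify the guess and
// loop). Note that the addq $16 does not depend on the movq load, so the
// next iteration's address is ready immediately whenever the guess holds.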
// Like `sum3`, but unroll the loop a bunch of times.
NOINLINE
static uint64_t sum4(Node *node) {
uint64_t value = 0;
Node *next = NULL;
asm (
"begin_loop_%=:\n"
" testq %[node], %[node]\n"
" je end_%=\n"
"loop_body_%=:\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" jne assign_node_%=\n"
" addq (%[node]), %[value]\n"
" movq 8(%[node]), %[next]\n"
" addq $16, %[node]\n"
" cmpq %[node], %[next]\n"
" je loop_body_%=\n"
"assign_node_%=:\n"
" movq %[next], %[node]\n"
" testq %[node], %[node]\n"
" jne loop_body_%=\n"
"end_%=:\n"
: [node]"+r"(node), [value]"+r"(value), [next]"+r"(next)
:
: "cc"
);
return value;
}
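// A note on the unrolling: while the prediction keeps holding, sum4 runs
// through a straight sequence of not-taken jne branches instead of one
// taken back-edge per element, which should be a little easier on the
// front end. Judging by the numbers above the payoff is modest, and mostly
// visible once the data no longer fits in L1.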
// Version that needs no assembly, due to Alexander Monakov
// <https://twitter.com/_monoid/status/1418663360871141376>. Does
// roughly as well as sum3.
NOINLINE
static uint64_t sum5(Node *node) {
uint64_t value = 0;
for (; node; node = node->next) {
for (;;) {
value += node->value;
if (node + 1 != node->next) {
break;
}
node++;
}
}
return value;
}
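// The nested loops encode the speculation in plain C: the inner loop keeps
// advancing with node++ as long as the guess node + 1 == node->next holds,
// and the outer loop re-synchronizes through node = node->next when it
// fails. Because node++ genuinely feeds the next inner iteration, clang
// (in my build, at least) does not merge it with node->next the way GCC
// merges the variables in sum2.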
// Pin to the first CPU, so the benchmark doesn't migrate between cores
// mid-measurement.
static void pin_to_cpu_0(void) {
cpu_set_t cpu_mask;
CPU_ZERO(&cpu_mask);
CPU_SET(0, &cpu_mask);
if (sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask) != 0) {
fprintf(stderr, "Could not set CPU affinity\n");
exit(EXIT_FAILURE);
}
}
// perf instrumentation -- a mixture of man 2 perf_event_open and
// <https://stackoverflow.com/a/42092180>
static long
perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) {
return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}
static void setup_perf_event(
struct perf_event_attr *evt,
int *fd,
uint64_t *id,
uint32_t evt_type,
uint64_t evt_config,
int group_fd
) {
memset(evt, 0, sizeof(struct perf_event_attr));
evt->type = evt_type;
evt->size = sizeof(struct perf_event_attr);
evt->config = evt_config;
evt->disabled = 1;
evt->exclude_kernel = 1;
evt->exclude_hv = 1;
evt->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
*fd = perf_event_open(evt, 0, -1, group_fd, 0);
if (*fd == -1) {
fprintf(stderr, "Error opening leader %llx\n", evt->config);
exit(EXIT_FAILURE);
}
ioctl(*fd, PERF_EVENT_IOC_ID, id);
}
static struct perf_event_attr perf_cycles_evt;
static int perf_cycles_fd;
static uint64_t perf_cycles_id;
static struct perf_event_attr perf_clock_evt;
static int perf_clock_fd;
static uint64_t perf_clock_id;
static struct perf_event_attr perf_instrs_evt;
static int perf_instrs_fd;
static uint64_t perf_instrs_id;
static struct perf_event_attr perf_cache_misses_evt;
static int perf_cache_misses_fd;
static uint64_t perf_cache_misses_id;
static struct perf_event_attr perf_cache_references_evt;
static int perf_cache_references_fd;
static uint64_t perf_cache_references_id;
static void perf_init(void) {
// Cycles
setup_perf_event(
&perf_cycles_evt, &perf_cycles_fd, &perf_cycles_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES, -1
);
// Clock
setup_perf_event(
&perf_clock_evt, &perf_clock_fd, &perf_clock_id,
PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK, perf_cycles_fd
);
// Instructions
setup_perf_event(
&perf_instrs_evt, &perf_instrs_fd, &perf_instrs_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, perf_cycles_fd
);
// Cache misses
setup_perf_event(
&perf_cache_misses_evt, &perf_cache_misses_fd, &perf_cache_misses_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, perf_cycles_fd
);
// Cache references
setup_perf_event(
&perf_cache_references_evt, &perf_cache_references_fd, &perf_cache_references_id,
PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, perf_cycles_fd
);
}
static void perf_close(void) {
close(perf_clock_fd);
close(perf_cycles_fd);
close(perf_instrs_fd);
close(perf_cache_misses_fd);
close(perf_cache_references_fd);
}
static void disable_perf_count(void) {
ioctl(perf_cycles_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
}
static void enable_perf_count(void) {
ioctl(perf_cycles_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}
static void reset_perf_count(void) {
ioctl(perf_cycles_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
}
struct perf_read_value {
uint64_t value;
uint64_t id;
};
struct perf_read_format {
uint64_t nr;
struct perf_read_value values[];
};
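// Layout of a PERF_FORMAT_GROUP | PERF_FORMAT_ID read on the leader fd:
// nr, followed by one { value, id } pair per event in the group. The ids
// are the ones PERF_EVENT_IOC_ID gave us in setup_perf_event, so
// read_perf_count can attribute each value regardless of ordering.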
static char perf_read_buf[4096];
struct perf_count {
uint64_t cycles;
double seconds;
uint64_t instructions;
uint64_t cache_misses;
uint64_t cache_references;
};
static void read_perf_count(struct perf_count *count) {
if (read(perf_cycles_fd, perf_read_buf, sizeof(perf_read_buf)) <= 0) {
fprintf(stderr, "Could not read cycles from perf\n");
exit(EXIT_FAILURE);
}
struct perf_read_format* rf = (struct perf_read_format *) perf_read_buf;
if (rf->nr != 5) {
fprintf(stderr, "Bad number of perf events\n");
exit(EXIT_FAILURE);
}
for (uint64_t i = 0; i < rf->nr; i++) {
struct perf_read_value *value = &rf->values[i];
if (value->id == perf_cycles_id) {
count->cycles = value->value;
} else if (value->id == perf_clock_id) {
// The task clock is reported in nanoseconds; convert to seconds.
count->seconds = ((double) (value->value / 1000ull)) / 1000000.0;
} else if (value->id == perf_instrs_id) {
count->instructions = value->value;
} else if (value->id == perf_cache_misses_id) {
count->cache_misses = value->value;
} else if (value->id == perf_cache_references_id) {
count->cache_references = value->value;
} else {
fprintf(stderr, "Spurious value in perf read (%lld)\n", value->id);
exit(EXIT_FAILURE);
}
}
}
// random numbers, from Per's other gist
static uint64_t random_uint64(void) {
static uint64_t x = 0x2545F4914F6CDD1D;
x ^= x << 13;
x ^= x >> 7;
x ^= x << 17;
return x;
}
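// (That's Marsaglia's 13/7/17 xorshift64 with a fixed seed, so the node
// values -- and therefore the sums printed above -- are reproducible from
// run to run.)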
// run our functions
NOINLINE
static void run_sum(int pre_iterations, int iterations, uint64_t n, Node *node, const char* name, uint64_t (*f)(Node *)) {
uint64_t value;
for (int i = 0; i < pre_iterations; i++) {
value = f(node);
}
struct perf_count counts;
counts.seconds = 0.0;
counts.cycles = 0;
counts.instructions = 0;
counts.cache_misses = 0;
counts.cache_references = 0;
// Enabling / disabling the counters inside each iteration does not seem
// to yield precise results for short durations. Maybe we can tune the
// sampling rate?
disable_perf_count();
reset_perf_count();
enable_perf_count();
for (int i = 0; i < iterations; i++) {
value = f(node);
}
disable_perf_count();
read_perf_count(&counts);
counts.seconds /= ((double) iterations);
counts.cycles /= ((uint64_t) iterations);
counts.instructions /= ((uint64_t) iterations);
counts.cache_misses /= ((uint64_t) iterations);
counts.cache_references /= ((uint64_t) iterations);
uint64_t bytes = n * sizeof(Node);
double gb_per_s = (((double) bytes) / 1000000000.0) / counts.seconds;
double time = counts.seconds;
const char *unit = " s";
if (time < 0.1) {
time *= 1000.0;
unit = "ms";
}
if (time < 0.1) {
time *= 1000.0;
unit = "us";
}
printf(
" %s: %20lu, %5.2f%s, %6.2fGB/s, %5.2f cycles/elem, %5.2f instrs/cycle, %5.2fGHz, %5.2f instrs/elem\n",
name, value, time, unit, gb_per_s, ((double) counts.cycles) / ((double) n), ((double) counts.instructions) / ((double) counts.cycles), ((double) counts.cycles) / counts.seconds / 1000000000.0, ((double) counts.instructions) / ((double) n)
);
}
static void run_tests(int iterations, uint64_t n) {
Node *nodes = malloc(n * sizeof(Node));
if (!nodes) {
fprintf(stderr, "Could not allocate nodes\n");
exit(EXIT_FAILURE);
}
for (uint64_t i = 0; i < n - 1; i++) {
nodes[i].value = random_uint64();
nodes[i].next = &nodes[i+1];
}
nodes[n-1].value = random_uint64();
nodes[n-1].next = NULL;
perf_init();
uint64_t amount = n * sizeof(Node);
const char *unit = "B";
if (amount > 10000ull) {
amount /= 1000ull;
unit = "kB";
}
if (amount > 10000ull) {
amount /= 1000ull;
unit = "MB";
}
if (amount > 10000ull) {
amount /= 1000ull;
unit = "GB";
}
printf("%lu%s, %d iterations\n", amount, unit, iterations);
run_sum(iterations, iterations, n, &nodes[0], "sum1", &sum1);
run_sum(iterations, iterations, n, &nodes[0], "sum2", &sum2);
run_sum(iterations, iterations, n, &nodes[0], "sum3", &sum3);
run_sum(iterations, iterations, n, &nodes[0], "sum4", &sum4);
run_sum(iterations, iterations, n, &nodes[0], "sum5", &sum5);
perf_close();
free(nodes);
}
int main() {
pin_to_cpu_0();
run_tests(10000, 1000llu); // 16kB -- should fit in L1
run_tests(10000, 8000llu); // 128kB -- should fit in L2
run_tests( 100, 312500llu); // 5MB -- should fit in L3
run_tests( 1, 1llu << 28); // 4GB -- doesn't fit anywhere
return EXIT_SUCCESS;
}