Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
// Estimating CPU frequency...
// CPU frequency: 4.52 GHz
// sum1: value = 15182118497126522709, 0.31 secs, 5.14 cycles/elem
// sum2: value = 15182118497126522709, 0.17 secs, 2.93 cycles/elem
#define RW(x) asm("" : "+r"(x))
typedef struct Node {
u64 value;
struct Node *next;
} Node;
// Naive linked list accumulation. Speed limited by L1 latency of 4-5 cycles/elem on Zen 3.
NOINLINE
u64 sum1(Node *node) {
u64 value = 0;
for (;;) {
for (int i = 0; i < 4; i++) {
if (!node) return value;
value += node->value;
node = node->next;
}
}
}
// Value speculation (via control speculation) on the next pointers to break data dependencies.
NOINLINE
u64 sum2(Node *node) {
u64 value = 0;
for (;;) {
for (int i = 0; i < 4; i++) {
if (!node) return value;
value += node->value;
Node *predicted_next = node + 1;
Node *next = node->next;
if (next == predicted_next) {
RW(predicted_next);
node = predicted_next;
} else {
node = next;
}
}
}
}
int main() {
printf("Estimating CPU frequency...\n");
calc_cpufreq();
printf("CPU frequency: %.2f GHz\n", 1e-9 * cpufreq);
u64 n = 1ull << 28;
Node *nodes = malloc(n * sizeof(Node));
for (u64 i = 0; i < n - 1; i++) {
nodes[i].value = random_u64();
nodes[i].next = &nodes[i+1];
}
nodes[n-1].value = random_u64();
nodes[n-1].next = NULL;
start_timer();
u64 value1 = sum1(&nodes[0]);
double secs1 = stop_timer();
printf("sum1: value = %llu, %.2f secs, %.2f cycles/elem\n", value1, secs1, secs1 * cpufreq / n);
start_timer();
u64 value2 = sum2(&nodes[0]);
double secs2 = stop_timer();
printf("sum2: value = %llu, %.2f secs, %.2f cycles/elem\n", value2, secs2, secs2 * cpufreq / n);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment