Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
__builtin_prefetch() in action, but CPU being smarter
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <time.h>
const size_t memsize = 128 * 1024 * 1024;
const size_t elems = memsize / sizeof(uint32_t);
void shuffle(uint32_t* data, size_t elems);
void time_linear(const uint32_t* indices, size_t elems);
void time_noprefetch(const uint32_t* data, const uint32_t* indices, size_t elems);
void time_prefetch(const uint32_t* data, const uint32_t* indices, size_t elems, int prefetch);
int main(void) {
uint32_t* indices = malloc(memsize);
uint32_t* data = malloc(memsize);
for (uint32_t i = 0; i < elems; i++) {
indices[i] = i;
data[i] = i; // Get page faults out of the way now
}
shuffle(indices, elems);
for (int i = 0; i < 5; i++) {
time_linear(data, elems);
time_noprefetch(data, indices, elems);
time_prefetch(data, indices, elems, 1);
time_prefetch(data, indices, elems, 5);
time_prefetch(data, indices, elems, 10);
time_prefetch(data, indices, elems, 20);
}
return 0;
}
void time_linear(const uint32_t* data, size_t elems) {
struct timespec start_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
uint32_t sum = 0; // Let's keep the optimizer from optimizing away memory access
for (int i = 0; i < elems; i++) {
sum += data[i];
}
struct timespec end_time;
clock_gettime(CLOCK_MONOTONIC, &end_time);
int64_t millis = (end_time.tv_sec - start_time.tv_sec) * 1000 +
(end_time.tv_nsec - start_time.tv_nsec) / 1000000;
printf("Linear: %" PRIi64 " ms (%" PRIu32 ")\n", millis, sum);
}
void time_noprefetch(const uint32_t* data, const uint32_t* indices, size_t elems) {
struct timespec start_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
uint32_t sum = 0; // Let's keep the optimizer from optimizing away memory access
for (int i = 0; i < elems; i++) {
sum += data[indices[i]];
}
struct timespec end_time;
clock_gettime(CLOCK_MONOTONIC, &end_time);
int64_t millis = (end_time.tv_sec - start_time.tv_sec) * 1000 +
(end_time.tv_nsec - start_time.tv_nsec) / 1000000;
printf("No prefetch: %" PRIi64 " ms (%" PRIu32 ")\n", millis, sum);
}
void time_prefetch(const uint32_t* data, const uint32_t* indices, size_t elems, int prefetch) {
struct timespec start_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
uint32_t sum = 0; // Let's keep the optimizer from optimizing away memory access
// Assuming prefetch < elems
for (int i = 0; i < elems - prefetch; i++) {
__builtin_prefetch(data + indices[i + prefetch]);
sum += data[indices[i]];
}
for (int i = elems - prefetch; i < elems; i++) {
sum += data[indices[i]];
}
struct timespec end_time;
clock_gettime(CLOCK_MONOTONIC, &end_time);
int64_t millis = (end_time.tv_sec - start_time.tv_sec) * 1000 +
(end_time.tv_nsec - start_time.tv_nsec) / 1000000;
printf("Prefetch %d: %" PRIi64 " ms (%" PRIu32 ")\n", prefetch, millis, sum);
}
void shuffle(uint32_t* data, size_t elems) {
srand(42); // Fixed sequence
for (size_t i = elems; i > 0; i--) {
size_t target = rand() % i;
uint32_t tmp = data[target];
data[target] = data[i - 1];
data[i - 1] = tmp;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.