Skip to content

Instantly share code, notes, and snippets.

@osimola
Created December 11, 2013 20:11
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save osimola/7917568 to your computer and use it in GitHub Desktop.
Save osimola/7917568 to your computer and use it in GitHub Desktop.
Example of __builtin_prefetch() in action.
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <time.h>
const size_t memsize = 128 * 1024 * 1024;
const size_t elems = memsize / sizeof(uint32_t);
void shuffle(uint32_t* data, size_t elems);
void time_linear(const uint32_t* indices, size_t elems);
void time_noprefetch(const uint32_t* data, size_t elems);
void time_prefetch(const uint32_t* data, size_t elems, int prefetch);
int main(void) {
uint32_t* data = malloc(memsize);
for (uint32_t i = 0; i < elems; i++) {
data[i] = i; // Get page faults out of the way now
}
for (int i = 0; i < 5; i++) {
time_linear(data, elems);
time_noprefetch(data, elems);
time_prefetch(data, elems, 1);
time_prefetch(data, elems, 4);
time_prefetch(data, elems, 16);
time_prefetch(data, elems, 32);
time_prefetch(data, elems, 128);
time_prefetch(data, elems, 256);
}
return 0;
}
void time_linear(const uint32_t* data, size_t elems) {
struct timespec start_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
uint32_t sum = 0; // Let's keep the optimizer from optimizing away memory access
for (int i = 0; i < elems; i++) {
sum += data[i];
}
struct timespec end_time;
clock_gettime(CLOCK_MONOTONIC, &end_time);
int64_t millis = (end_time.tv_sec - start_time.tv_sec) * 1000 +
(end_time.tv_nsec - start_time.tv_nsec) / 1000000;
printf("Linear: %" PRIi64 " ms (%" PRIu32 ")\n", millis, sum);
}
void time_noprefetch(const uint32_t* data, size_t elems) {
struct timespec start_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
srand(42); // Fixed sequence
uint32_t sum = 0; // Let's keep the optimizer from optimizing away memory access
for (int i = 0; i < elems; i++) {
sum += data[rand() % elems];
}
struct timespec end_time;
clock_gettime(CLOCK_MONOTONIC, &end_time);
int64_t millis = (end_time.tv_sec - start_time.tv_sec) * 1000 +
(end_time.tv_nsec - start_time.tv_nsec) / 1000000;
printf("No prefetch: %" PRIi64 " ms (%" PRIu32 ")\n", millis, sum);
}
void time_prefetch(const uint32_t* data, size_t elems, int prefetch) {
struct timespec start_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
srand(42); // Fixed sequence
uint32_t sum = 0; // Let's keep the optimizer from optimizing away memory access
uint32_t buffer[prefetch];
for (size_t i = 0; i < prefetch; i++)
buffer[i] = rand() % elems;
// Iteration countt to number of elems without resorting to modulo
// prefetch or checking for two conditions inside inner
// loops. Note that elems is a multiple of prefetch :)
for (size_t i = 0; i < elems;) {
for (size_t j = 0; j < prefetch; j++) {
sum += data[buffer[j]];
uint32_t next = rand() % elems;
__builtin_prefetch(data + next);
buffer[j] = next;
i++;
}
}
struct timespec end_time;
clock_gettime(CLOCK_MONOTONIC, &end_time);
int64_t millis = (end_time.tv_sec - start_time.tv_sec) * 1000 +
(end_time.tv_nsec - start_time.tv_nsec) / 1000000;
printf("Prefetch %d: %" PRIi64 " ms (%" PRIu32 ")\n", prefetch, millis, sum);
}
void shuffle(uint32_t* data, size_t elems) {
srand(42); // Fixed sequence
for (size_t i = elems; i > 0; i--) {
size_t target = rand() % i;
uint32_t tmp = data[target];
data[target] = data[i - 1];
data[i - 1] = tmp;
}
}
@osimola
Copy link
Author

osimola commented Dec 11, 2013

Looks like I managed to write some Clang specific code, apologies to all gcc users. As if __builtin_prefetch() wasn't compiler specific itself.

@squillace91
Copy link

hey! thanks for posting it. is it compiling? tried here and having issues.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment