Skip to content

Instantly share code, notes, and snippets.

@lemire
Last active October 27, 2017 03:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lemire/a0cfdafda7448b98fd66f47086ac65f3 to your computer and use it in GitHub Desktop.
Save lemire/a0cfdafda7448b98fd66f47086ac65f3 to your computer and use it in GitHub Desktop.
ARM NEON poor man's memcpy to sanity check the NEON performance
///////////////////
// A decent machine ought to be able to copy one 32-bit integer per CPU cycle
// using vectorized instructions. Furthermore, the vectorized loads and stores
// should be close to the performance of a memcpy, one would expect, for sizeable
// arrays.
//
// This tests the sanity of your ARM (aarch64) system:
//
// cc -O3 -o copybenchmark copybenchmark.c -std=c99
// ./copybenchmark
//
// It should report that you can copy thousands of millions (that is, billions)
// of 32-bit integers per second.
// Here is the result on a softiron 1000 (AMD-based) processor:
// $ ./copybenchmark
// copying 1953 MB
// time = 0.056968 1755.371443 millions of uints/sec
// [memcpy] time = 0.056968 1797.429627 millions of uints/sec
//
// Code in the public domain, prepared by D. Lemire, October 2017.
///////////////////////
#include <arm_neon.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>
#include <time.h>
#include <string.h>
/*
 * Copy N 32-bit integers from input to output using 128-bit NEON loads and
 * stores, four integers per iteration.
 *
 * input  - source array holding at least N elements; never written to
 * output - destination array with room for at least N elements
 * N      - number of elements to copy; a value <= 0 copies nothing
 *
 * When N is not a multiple of 4, the remaining 1 to 3 elements are copied one
 * at a time so the whole range is always transferred. The copy proceeds
 * front to back; overlapping buffers are not handled specially.
 */
static void mycopy(const uint32_t *input, uint32_t *output, int N) {
  if (N <= 0) {
    return;
  }
  /* A signed counter avoids the signed/unsigned comparison that would turn a
   * negative bound into a huge unsigned one. */
  const int vector_count = N / 4;
  for (int i = 0; i < vector_count; i++) {
    uint32x4_t x = vld1q_u32(input);
    vst1q_u32(output, x);
    output += 4;
    input += 4;
  }
  /* Scalar tail: the elements that do not fill a whole 4-lane vector. */
  for (int i = vector_count * 4; i < N; i++) {
    *output++ = *input++;
  }
}
/*
 * Return the user-mode CPU time, in seconds, that elapsed between two
 * getrusage() snapshots. Both the seconds and microseconds fields are used,
 * so intervals that cross a one-second boundary are measured correctly.
 */
static double user_seconds_between(const struct rusage *start,
                                   const struct rusage *end) {
  return (double)(end->ru_utime.tv_sec - start->ru_utime.tv_sec) +
         (double)(end->ru_utime.tv_usec - start->ru_utime.tv_usec) / 1000000.0;
}

/*
 * Benchmark driver: copies an array of N 32-bit integers NTrials times with
 * mycopy() and then with memcpy(), verifies the destination after each pass,
 * and prints the user CPU time and throughput of both.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffers cannot be allocated.
 */
int main(void) {
  const int N = 500000;
  const int NTrials = 200;
  /* Heap buffers: two arrays of this size would put about 4 MB on the stack. */
  uint32_t *datain = malloc((size_t)N * sizeof *datain);
  uint32_t *recovdata = malloc((size_t)N * sizeof *recovdata);
  if (datain == NULL || recovdata == NULL) {
    fprintf(stderr, "out of memory\n");
    free(datain);
    free(recovdata);
    return EXIT_FAILURE;
  }
  /* Bytes divided by 1024 is kilobytes, so label it as such. */
  printf("copying %d KB \n", (int)(N * sizeof(uint32_t) / 1024));
  for (int k = 0; k < N; ++k) {
    datain[k] = (uint32_t)k;
  }

  struct rusage before;
  struct rusage after;

  getrusage(RUSAGE_SELF, &before);
  for (int i = 0; i < NTrials; i++) {
    mycopy(datain, recovdata, N);
  }
  getrusage(RUSAGE_SELF, &after);
  for (int k = 0; k < N; ++k) {
    assert(recovdata[k] == (uint32_t)k);
  }
  double t = user_seconds_between(&before, &after);
  printf("time = %f %f millions of uints/sec\n", t,
         N * NTrials / (t * 1000.0 * 1000.0));

  /* Clear the destination so the check below really validates memcpy rather
   * than the data left behind by the previous pass. */
  memset(recovdata, 0, (size_t)N * sizeof *recovdata);
  getrusage(RUSAGE_SELF, &before);
  for (int i = 0; i < NTrials; i++) {
    memcpy(recovdata, datain, (size_t)N * sizeof *recovdata);
  }
  getrusage(RUSAGE_SELF, &after);
  for (int k = 0; k < N; ++k) {
    assert(recovdata[k] == (uint32_t)k);
  }
  double t2 = user_seconds_between(&before, &after);
  /* Report t2 here: this line describes the memcpy pass, not the NEON one. */
  printf("[memcpy] time = %f %f millions of uints/sec\n", t2,
         N * NTrials / (t2 * 1000.0 * 1000.0));

  free(recovdata);
  free(datain);
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment