Last active
October 27, 2017 03:34
-
-
Save lemire/a0cfdafda7448b98fd66f47086ac65f3 to your computer and use it in GitHub Desktop.
ARM NEON poor man's memcpy to sanity-check NEON performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
///////////////////
// A decent machine ought to be able to copy one 32-bit integer per CPU cycle
// using vectorized instructions. Furthermore, one would expect the vectorized
// loads and stores to be close to the performance of a memcpy for sizeable
// arrays.
//
// This tests the sanity of your ARM (aarch64) system:
//
// cc -O3 -o copybenchmark copybenchmark.c -std=c99
// ./copybenchmark
//
// It should report that you can copy thousands of millions (that is, billions)
// of 32-bit integers per second.
// Here is the result on a softiron 1000 (AMD-based) processor:
// $ ./copybenchmark
// copying 1953 MB
// time = 0.056968 1755.371443 millions of uints/sec
// [memcpy] time = 0.056968 1797.429627 millions of uints/sec
//
// Code in the public domain, prepared by D. Lemire, October 2017.
///////////////////////
#include <arm_neon.h> | |
#include <assert.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <sys/resource.h> | |
#include <time.h> | |
#include <string.h> | |
// Copy N 32-bit integers from input to output using 128-bit NEON loads and
// stores, four integers per iteration.
//
// input:  source array holding at least N elements (only read).
// output: destination array with room for at least N elements.
// N:      number of elements to copy; values <= 0 copy nothing.
//
// When N is not a multiple of four, the remaining one to three elements are
// copied with plain scalar assignments instead of being dropped.
static void mycopy(const uint32_t *input, uint32_t *output, int N) {
  if (N <= 0) {
    return;
  }
  // Signed counter: comparing an unsigned index against the signed bound
  // would convert the bound to unsigned and hide sign problems.
  const int nvec = N / 4;
  for (int i = 0; i < nvec; i++) {
    vst1q_u32(output, vld1q_u32(input));
    input += 4;
    output += 4;
  }
  // Scalar tail for the elements that do not fill a whole vector.
  for (int i = nvec * 4; i < N; i++) {
    *output++ = *input++;
  }
}
// User-CPU time elapsed between two getrusage() snapshots, in seconds.
// Both the seconds and the microseconds fields are used so the result stays
// correct when the interval crosses a one-second boundary.
static double user_seconds(const struct rusage *before,
                           const struct rusage *after) {
  return (double)(after->ru_utime.tv_sec - before->ru_utime.tv_sec) +
         (double)(after->ru_utime.tv_usec - before->ru_utime.tv_usec) /
             1000000.0;
}

// Returns 1 when data[k] == k for every k in [0, n), 0 otherwise.
static int holds_identity(const uint32_t *data, int n) {
  for (int k = 0; k < n; ++k) {
    if (data[k] != (uint32_t)k) {
      return 0;
    }
  }
  return 1;
}

// Times NTrials copies of N integers with mycopy() and then with memcpy(),
// printing the user-CPU time and throughput of each.
// Returns 0 on success and -1 if either copy produced wrong data.
static int run_benchmarks(uint32_t *datain, uint32_t *recovdata, int N,
                          int NTrials) {
  struct rusage before;
  struct rusage after;

  for (int k = 0; k < N; ++k) {
    datain[k] = (uint32_t)k;
  }

  getrusage(RUSAGE_SELF, &before);
  for (int i = 0; i < NTrials; i++) {
    mycopy(datain, recovdata, N);
  }
  getrusage(RUSAGE_SELF, &after);
  if (!holds_identity(recovdata, N)) {
    fprintf(stderr, "mycopy produced incorrect data\n");
    return -1;
  }
  double t = user_seconds(&before, &after);
  printf("time = %f %f millions of uints/sec\n", t,
         (double)N * NTrials / (t * 1000.0 * 1000.0));

  // Wipe the destination (outside the timed region) so the check after the
  // memcpy run cannot pass on data left behind by the mycopy run.
  memset(recovdata, 0, (size_t)N * sizeof *recovdata);

  getrusage(RUSAGE_SELF, &before);
  for (int i = 0; i < NTrials; i++) {
    memcpy(recovdata, datain, (size_t)N * sizeof *recovdata);
  }
  getrusage(RUSAGE_SELF, &after);
  if (!holds_identity(recovdata, N)) {
    fprintf(stderr, "memcpy produced incorrect data\n");
    return -1;
  }
  double t2 = user_seconds(&before, &after);
  // Report t2 here: this line describes the memcpy run, not the mycopy run.
  printf("[memcpy] time = %f %f millions of uints/sec\n", t2,
         (double)N * NTrials / (t2 * 1000.0 * 1000.0));
  return 0;
}

// Benchmark entry point: allocates the source and destination buffers on the
// heap (two 2 MB arrays are too large to place on the stack safely), runs
// both benchmarks and releases the buffers.
// Exits with EXIT_FAILURE on allocation failure or a failed data check.
int main(void) {
  const int N = 500000;
  const int NTrials = 200;
  uint32_t *datain = malloc((size_t)N * sizeof *datain);
  uint32_t *recovdata = malloc((size_t)N * sizeof *recovdata);
  int status = EXIT_FAILURE;

  if (datain == NULL || recovdata == NULL) {
    fprintf(stderr, "out of memory\n");
  } else {
    // The value is bytes / 1024, i.e. kilobytes per copy.
    printf("copying %d KB \n", (int)((size_t)N * sizeof(uint32_t) / 1024));
    if (run_benchmarks(datain, recovdata, N, NTrials) == 0) {
      status = EXIT_SUCCESS;
    }
  }

  free(recovdata);
  free(datain);
  return status;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment