Created
March 31, 2016 09:38
-
-
Save ssvb/343379ceeb6d017c0023424b70fc90e2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Usage: gcc -DCPU_CLOCK_FREQUENCY=1200000000 test-arm-neon-mix.S && time ./a.out
 *        gcc -DCPU_CLOCK_FREQUENCY=1200000000 -DINTERLEAVED test-arm-neon-mix.S && time ./a.out
 *
 * The program executes about CPU_CLOCK_FREQUENCY copies of a 13-instruction
 * chunk (6 ARM + 7 NEON instructions). When CPU_CLOCK_FREQUENCY matches the
 * real core clock in Hz, the run time in seconds reported by `time` is
 * therefore the number of cycles spent per chunk, as listed below.
 *
 *             | -DINTERLEAVED | separate ARM and NEON code chunks
 * ------------+---------------+-----------------------------------
 * Cortex-A15  | ~6.7 cycles   | ~5.7 cycles (!)
 * ------------+---------------+-----------------------------------
 * Cortex-A8   |  7.0 cycles   |  7.0 cycles
 * ------------+---------------+-----------------------------------
 * Cortex-A9   |  7.0 cycles   | 10.0 cycles
 * ------------+---------------+-----------------------------------
 * Cortex-A53  |  7.0 cycles   | 10.0 cycles
 * ------------+---------------+-----------------------------------
 * Cortex-A7   | 20.0 cycles   | 20.0 cycles
 *
 * Also see http://ssvb.github.io/2011/08/03/discovering-instructions-scheduling-secrets.html
 */
/*
 * Build-time configuration.
 *
 * CPU_CLOCK_FREQUENCY  core clock in Hz, must be passed with -D on the
 *                      command line.
 * LOOP_UNROLL_FACTOR   how many copies of the measured instruction chunk
 *                      are emitted into the loop body.
 *
 * The loop counter is loaded with CPU_CLOCK_FREQUENCY / LOOP_UNROLL_FACTOR.
 */
#ifndef CPU_CLOCK_FREQUENCY
#error CPU_CLOCK_FREQUENCY must be defined
#endif

#define LOOP_UNROLL_FACTOR 30

/*
 * The loop is a "subs counter, #1 / branch if not zero" countdown that is
 * entered unconditionally. A starting count of zero would wrap around and
 * run for the full width of the counter register instead of not at all,
 * so reject such a configuration at build time.
 */
#if LOOP_UNROLL_FACTOR < 1
#error LOOP_UNROLL_FACTOR must be at least 1
#elif (CPU_CLOCK_FREQUENCY / LOOP_UNROLL_FACTOR) < 1
#error CPU_CLOCK_FREQUENCY must be at least LOOP_UNROLL_FACTOR
#endif
#ifdef __aarch64__ | |
/****************************************************************************/
/* 64-bit implementation                                                    */
/****************************************************************************/

/*
 * int main(void)
 *
 * Target: AArch64, GNU as syntax, AAPCS64 calling convention.
 *
 * Runs a countdown loop of CPU_CLOCK_FREQUENCY / LOOP_UNROLL_FACTOR
 * iterations. Each iteration contains LOOP_UNROLL_FACTOR copies of a
 * 13-instruction chunk made of 6 integer and 7 Advanced SIMD instructions.
 * With INTERLEAVED defined the two kinds alternate one by one; otherwise
 * the 6 integer instructions come first and the 7 SIMD ones follow.
 *
 * In:    nothing is read from the arguments.
 * Out:   w0 = 0.
 * Clobb: x1, x4-x8, v0-v6, flags. All of these are caller-saved under
 *        AAPCS64, so nothing is saved or restored and no stack is used.
 *
 * X, UX, TOP and v0-v6 are never initialised in this file, and TMP1/TMP2
 * are never read after being computed: only the execution time of the
 * instruction stream matters, not the values.
 */
        .cpu    cortex-a53+fp+simd
        .text
        .p2align 2
        .global main
        .type   main, %function

/* Symbolic names for the integer registers used by the measured chunk. */
TMP1    .req    x4
TMP2    .req    x5
TOP     .req    x6
X       .req    x7
UX      .req    x8

main:
        /* x1 = number of loop iterations (loaded from the literal pool) */
        ldr     x1, =(CPU_CLOCK_FREQUENCY / LOOP_UNROLL_FACTOR)
        b       1f

        /* Start the measured loop on a 64-byte boundary. */
        .balign 64
1:
.rept LOOP_UNROLL_FACTOR
#ifdef INTERLEAVED
        /* SIMD and integer instructions strictly alternating. */
        add     v0.16b, v0.16b, v0.16b
        asr     TMP1, X, #16
        add     v1.16b, v1.16b, v1.16b
        add     X, X, UX
        add     v2.16b, v2.16b, v2.16b
        add     TMP1, TOP, TMP1, lsl #1
        add     v3.16b, v3.16b, v3.16b
        asr     TMP2, X, #16
        add     v4.16b, v4.16b, v4.16b
        add     X, X, UX
        add     v5.16b, v5.16b, v5.16b
        add     TMP2, TOP, TMP2, lsl #1
        add     v6.16b, v6.16b, v6.16b
#else
        /* Integer part: two rounds of TMPn = TOP + ((X >> 16) << 1), */
        /* with X advanced by UX after each shift.                    */
        asr     TMP1, X, #16
        add     X, X, UX
        add     TMP1, TOP, TMP1, lsl #1
        asr     TMP2, X, #16
        add     X, X, UX
        add     TMP2, TOP, TMP2, lsl #1

        /* SIMD part: seven adds, each on its own register (vN += vN). */
        add     v0.16b, v0.16b, v0.16b
        add     v1.16b, v1.16b, v1.16b
        add     v2.16b, v2.16b, v2.16b
        add     v3.16b, v3.16b, v3.16b
        add     v4.16b, v4.16b, v4.16b
        add     v5.16b, v5.16b, v5.16b
        add     v6.16b, v6.16b, v6.16b
#endif
.endr
        subs    x1, x1, #1              /* one iteration done */
        b.ne    1b                      /* repeat until the counter hits 0 */

        mov     w0, #0                  /* exit status 0 */
        ret
        .size   main, .-main
/****************************************************************************/
#else | |
/****************************************************************************/
/* 32-bit implementation                                                    */
/****************************************************************************/

/*
 * int main(void)
 *
 * Target: ARMv7-A in ARM state with NEON, GNU as syntax, AAPCS calling
 * convention.
 *
 * Runs a countdown loop of CPU_CLOCK_FREQUENCY / LOOP_UNROLL_FACTOR
 * iterations. Each iteration contains LOOP_UNROLL_FACTOR copies of a
 * 13-instruction chunk made of 6 ARM integer and 7 NEON instructions.
 * With INTERLEAVED defined the two kinds alternate one by one; otherwise
 * the 6 ARM instructions come first and the 7 NEON ones follow.
 *
 * In:    nothing is read from the arguments.
 * Out:   r0 = 0.
 * Stack: 40 bytes (r4-r12 and lr), which keeps sp 8-byte aligned.
 * Clobb: q9-q15 and flags; r4-r8 and ip are restored on exit.
 *        q9-q15 are caller-saved under AAPCS, so they are not preserved.
 *
 * X, UX, TOP and q9-q15 are never initialised in this file, and TMP1/TMP2
 * are never read after being computed: only the execution time of the
 * instruction stream matters, not the values.
 */
        .text
        .arch   armv7-a
        .arm
        .fpu    neon
        .p2align 2
        .global main
        .type   main, %function

/* Symbolic names for the integer registers used by the measured chunk. */
TMP1    .req    r4
TMP2    .req    r5
TOP     .req    r6
X       .req    r7
UX      .req    r8

main:
        /* r4-r8 are callee-saved and get overwritten by the chunk below. */
        push    {r4-r12, lr}

        /* ip = number of loop iterations */
        ldr     ip, =(CPU_CLOCK_FREQUENCY / LOOP_UNROLL_FACTOR)
        b       1f

        /* Start the measured loop on a 64-byte boundary. */
        .balign 64
1:
.rept LOOP_UNROLL_FACTOR
#ifdef INTERLEAVED
        /* NEON and ARM instructions strictly alternating. */
        vadd.u32 q9, q9, q9
        asr     TMP1, X, #16
        vadd.u32 q10, q10, q10
        add     X, X, UX
        vadd.u32 q11, q11, q11
        add     TMP1, TOP, TMP1, lsl #1
        vadd.u32 q12, q12, q12
        asr     TMP2, X, #16
        vadd.u32 q13, q13, q13
        add     X, X, UX
        vadd.u32 q14, q14, q14
        add     TMP2, TOP, TMP2, lsl #1
        vadd.u32 q15, q15, q15
#else
        /* ARM part: two rounds of TMPn = TOP + ((X >> 16) << 1), */
        /* with X advanced by UX after each shift.                */
        asr     TMP1, X, #16
        add     X, X, UX
        add     TMP1, TOP, TMP1, lsl #1
        asr     TMP2, X, #16
        add     X, X, UX
        add     TMP2, TOP, TMP2, lsl #1

        /* NEON part: seven adds, each on its own register (qN += qN). */
        vadd.u32 q9, q9, q9
        vadd.u32 q10, q10, q10
        vadd.u32 q11, q11, q11
        vadd.u32 q12, q12, q12
        vadd.u32 q13, q13, q13
        vadd.u32 q14, q14, q14
        vadd.u32 q15, q15, q15
#endif
.endr
        subs    ip, ip, #1              /* one iteration done */
        bne     1b                      /* repeat until the counter hits 0 */

        mov     r0, #0                  /* exit status 0 */
        pop     {r4-r12, pc}            /* restore registers and return */
        .size   main, .-main
/****************************************************************************/
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment