/*
 * Source: GitHub gist ssvb/343379ceeb6d017c0023424b70fc90e2 by @ssvb,
 * created March 31, 2016 09:38. (GitHub page navigation text that was
 * captured together with the file has been folded into this comment.)
 */
/*
 * Usage: gcc -DCPU_CLOCK_FREQUENCY=1200000000 test-arm-neon-mix.S && time ./a.out
 *        gcc -DCPU_CLOCK_FREQUENCY=1200000000 -DINTERLEAVED test-arm-neon-mix.S && time ./a.out
 *
 * CPU_CLOCK_FREQUENCY is the CPU clock frequency in Hz. The program executes
 * the 13-instruction test block (6 ARM + 7 NEON instructions) about
 * CPU_CLOCK_FREQUENCY times in total, so, provided the CPU really runs at
 * that frequency, the number of seconds reported by "time" equals the number
 * of cycles spent per test block:
 *
 *             | -DINTERLEAVED | separate ARM and NEON code chunks
 * ------------+---------------+-----------------------------------
 * Cortex-A15  | ~6.7 cycles   | ~5.7 cycles (!)
 * ------------+---------------+-----------------------------------
 * Cortex-A8   | 7.0 cycles    | 7.0 cycles
 * ------------+---------------+-----------------------------------
 * Cortex-A9   | 7.0 cycles    | 10.0 cycles
 * ------------+---------------+-----------------------------------
 * Cortex-A53  | 7.0 cycles    | 10.0 cycles
 * ------------+---------------+-----------------------------------
 * Cortex-A7   | 20.0 cycles   | 20.0 cycles
 *
 * Also see http://ssvb.github.io/2011/08/03/discovering-instructions-scheduling-secrets.html
 */
#ifndef CPU_CLOCK_FREQUENCY
#error CPU_CLOCK_FREQUENCY must be defined
#endif
#define LOOP_UNROLL_FACTOR 30
#ifdef __aarch64__
/****************************************************************************/
/* 64-bit implementation */
/****************************************************************************/
/*
 * int main(void) -- AArch64 variant of the timing loop.
 *
 * Runs (CPU_CLOCK_FREQUENCY / LOOP_UNROLL_FACTOR) iterations of a loop whose
 * body is LOOP_UNROLL_FACTOR copies of one 13-instruction test block
 * (6 integer + 7 SIMD instructions), then returns 0 in w0.
 *
 * With -DINTERLEAVED the integer and SIMD instructions alternate; otherwise
 * the 6 integer instructions come first, followed by the 7 SIMD ones.
 * Both variants contain the same instructions; only their order differs.
 *
 * Registers: x1 is the iteration counter; x4-x8 (via the aliases below) and
 * v0-v6 are overwritten. All of these are caller-saved under AAPCS64 and the
 * function is a leaf, so nothing is saved or restored. X, UX, TOP and v0-v6
 * are never initialised and no result is read back.
 */

/*
 * The loop decrements its counter before testing it, so an iteration count
 * of 0 would wrap around and spin for 2^64 iterations. Reject that at build
 * time instead.
 */
#if (CPU_CLOCK_FREQUENCY / LOOP_UNROLL_FACTOR) < 1
#error CPU_CLOCK_FREQUENCY must be at least LOOP_UNROLL_FACTOR
#endif

        .cpu    cortex-a53+fp+simd
        .text
        .p2align 2
        .global main
        .type   main, %function

/* Symbolic names for the integer registers used by the test block. */
TMP1    .req    x4
TMP2    .req    x5
TOP     .req    x6
X       .req    x7
UX      .req    x8

main:
        /* x1 = number of loop iterations (constant loaded via literal). */
        ldr     x1, =(CPU_CLOCK_FREQUENCY / LOOP_UNROLL_FACTOR)
        /* Branch over the alignment padding straight to the loop head. */
        b       1f
        /* Start the loop on a 64-byte boundary. */
        .balign 64
1:
        .rept   LOOP_UNROLL_FACTOR
#ifdef INTERLEAVED
        /* Integer and SIMD instructions alternate one by one. */
        add     v0.16b, v0.16b, v0.16b
        asr     TMP1, X, #16                    /* TMP1 = X >> 16 */
        add     v1.16b, v1.16b, v1.16b
        add     X, X, UX                        /* X += UX */
        add     v2.16b, v2.16b, v2.16b
        add     TMP1, TOP, TMP1, lsl #1         /* TMP1 = TOP + (TMP1 << 1) */
        add     v3.16b, v3.16b, v3.16b
        asr     TMP2, X, #16                    /* TMP2 = X >> 16 */
        add     v4.16b, v4.16b, v4.16b
        add     X, X, UX                        /* X += UX */
        add     v5.16b, v5.16b, v5.16b
        add     TMP2, TOP, TMP2, lsl #1         /* TMP2 = TOP + (TMP2 << 1) */
        add     v6.16b, v6.16b, v6.16b
#else
        /* Integer chunk: six instructions. */
        asr     TMP1, X, #16                    /* TMP1 = X >> 16 */
        add     X, X, UX                        /* X += UX */
        add     TMP1, TOP, TMP1, lsl #1         /* TMP1 = TOP + (TMP1 << 1) */
        asr     TMP2, X, #16                    /* TMP2 = X >> 16 */
        add     X, X, UX                        /* X += UX */
        add     TMP2, TOP, TMP2, lsl #1         /* TMP2 = TOP + (TMP2 << 1) */
        /* SIMD chunk: seven mutually independent vector adds. */
        add     v0.16b, v0.16b, v0.16b
        add     v1.16b, v1.16b, v1.16b
        add     v2.16b, v2.16b, v2.16b
        add     v3.16b, v3.16b, v3.16b
        add     v4.16b, v4.16b, v4.16b
        add     v5.16b, v5.16b, v5.16b
        add     v6.16b, v6.16b, v6.16b
#endif
        .endr
        /* One iteration done: decrement the counter, loop until it is 0. */
        subs    x1, x1, #1
        bne     1b

        mov     w0, #0                          /* exit status 0 */
        ret
        /* Record the symbol size so that tools can attribute code to main. */
        .size   main, .-main

        /* Declare that this object does not need an executable stack. */
        .pushsection .note.GNU-stack, "", %progbits
        .popsection
/****************************************************************************/
#else
/****************************************************************************/
/* 32-bit implementation */
/****************************************************************************/
/*
 * int main(void) -- 32-bit ARM (ARMv7-A, ARM state, NEON) variant of the
 * timing loop.
 *
 * Runs (CPU_CLOCK_FREQUENCY / LOOP_UNROLL_FACTOR) iterations of a loop whose
 * body is LOOP_UNROLL_FACTOR copies of one 13-instruction test block
 * (6 ARM integer + 7 NEON instructions), then returns 0 in r0.
 *
 * With -DINTERLEAVED the ARM and NEON instructions alternate; otherwise the
 * 6 ARM instructions come first, followed by the 7 NEON ones. Both variants
 * contain the same instructions; only their order differs.
 *
 * Registers: ip (r12) is the iteration counter; r4-r8 (via the aliases
 * below) and q9-q15 are overwritten. r4-r12 and lr are saved on entry and
 * restored on exit; q9-q15 are not saved (they are outside the d8-d15 range
 * that the AAPCS VFP variant treats as callee-saved). X, UX, TOP and q9-q15
 * are never initialised and no result is read back.
 */

/*
 * The loop decrements its counter before testing it, so an iteration count
 * of 0 would wrap around and spin for 2^32 iterations. Reject that at build
 * time instead.
 */
#if (CPU_CLOCK_FREQUENCY / LOOP_UNROLL_FACTOR) < 1
#error CPU_CLOCK_FREQUENCY must be at least LOOP_UNROLL_FACTOR
#endif

        .text
        .arch   armv7-a
        .arm
        .fpu    neon
        .p2align 2
        .global main
        .type   main, %function

/* Symbolic names for the integer registers used by the test block. */
TMP1    .req    r4
TMP2    .req    r5
TOP     .req    r6
X       .req    r7
UX      .req    r8

main:
        /* Save r4-r12 and lr: 10 words, so sp stays 8-byte aligned. */
        push    {r4-r12, lr}
        /* ip = number of loop iterations (constant loaded via literal). */
        ldr     ip, =(CPU_CLOCK_FREQUENCY / LOOP_UNROLL_FACTOR)
        /* Branch over the alignment padding straight to the loop head. */
        b       1f
        /* Start the loop on a 64-byte boundary. */
        .balign 64
1:
        .rept   LOOP_UNROLL_FACTOR
#ifdef INTERLEAVED
        /* ARM and NEON instructions alternate one by one. */
        vadd.u32 q9, q9, q9
        asr     TMP1, X, #16                    /* TMP1 = X >> 16 */
        vadd.u32 q10, q10, q10
        add     X, X, UX                        /* X += UX */
        vadd.u32 q11, q11, q11
        add     TMP1, TOP, TMP1, lsl #1         /* TMP1 = TOP + (TMP1 << 1) */
        vadd.u32 q12, q12, q12
        asr     TMP2, X, #16                    /* TMP2 = X >> 16 */
        vadd.u32 q13, q13, q13
        add     X, X, UX                        /* X += UX */
        vadd.u32 q14, q14, q14
        add     TMP2, TOP, TMP2, lsl #1         /* TMP2 = TOP + (TMP2 << 1) */
        vadd.u32 q15, q15, q15
#else
        /* ARM chunk: six instructions. */
        asr     TMP1, X, #16                    /* TMP1 = X >> 16 */
        add     X, X, UX                        /* X += UX */
        add     TMP1, TOP, TMP1, lsl #1         /* TMP1 = TOP + (TMP1 << 1) */
        asr     TMP2, X, #16                    /* TMP2 = X >> 16 */
        add     X, X, UX                        /* X += UX */
        add     TMP2, TOP, TMP2, lsl #1         /* TMP2 = TOP + (TMP2 << 1) */
        /* NEON chunk: seven mutually independent vector adds. */
        vadd.u32 q9, q9, q9
        vadd.u32 q10, q10, q10
        vadd.u32 q11, q11, q11
        vadd.u32 q12, q12, q12
        vadd.u32 q13, q13, q13
        vadd.u32 q14, q14, q14
        vadd.u32 q15, q15, q15
#endif
        .endr
        /* One iteration done: decrement the counter, loop until it is 0. */
        subs    ip, ip, #1
        bne     1b

        mov     r0, #0                          /* exit status 0 */
        /* Restore the saved registers; loading pc returns to the caller. */
        pop     {r4-r12, pc}
        /* Record the symbol size so that tools can attribute code to main. */
        .size   main, .-main

        /*
         * Declare that this object does not need an executable stack;
         * without this note the linker may request one for the whole program.
         */
        .pushsection .note.GNU-stack, "", %progbits
        .popsection
/****************************************************************************/
#endif
/* (GitHub page footer, not part of the source file: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment") */