Skip to content

Instantly share code, notes, and snippets.

@gpakosz
Created August 5, 2011 17:22
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gpakosz/1128030 to your computer and use it in GitHub Desktop.
Save gpakosz/1128030 to your computer and use it in GitHub Desktop.
ARM NEON integer 16x8 dot product
# ------------------------------------------------------------------------------
# int32_t dotProduct_16x8_neon(int16_t const* __restrict u, int8_t const* __restrict v, int32_t size)
#
# Signed dot product of an int16_t vector with an int8_t vector (ARM32 + NEON).
#
# Elements are consumed in groups of 16: only the first (size / 16) * 16
# elements contribute, any trailing (size % 16) elements are ignored.
# If size < 16 (including zero or negative) no memory is read and 0 is returned.
# Accumulation is done in 32-bit lanes, so the result is the low 32 bits of the
# exact sum.
#
# calling conventions:
#---------------------
# in:  r0 = u
#      r1 = v
#      r2 = size
# out: r0 = dot product
# clobbers: r1, r2, flags, q0-q3, q8 (all caller-saved; the callee-saved
#           d8-d15 are deliberately left untouched)
#
# used neon registers:
# --------------------
# q0            = accumulator (4 x int32)
# q1 (d2, d3)   = u[0..7]
# q2 (d4, d5)   = v[0..7]   (8 bytes loaded into d4, then widened to int16)
# q3 (d6, d7)   = v[8..15]  (8 bytes loaded into d6, then widened to int16)
# q8 (d16, d17) = u[8..15]
.globl _dotProduct_16x8_neon
.private_extern _dotProduct_16x8_neon
.no_dead_strip _dotProduct_16x8_neon
_dotProduct_16x8_neon:
    pld [r0]
    pld [r1]
# accumulator = 0
    vmov.i32 q0, #0
# r2 = (number of 16-element groups) - 1; negative means nothing to process
    asr r2, r2, #4
    subs r2, r2, #1
    bmi .done16x8
.loop16x8:
# each iteration consumes 16 elements: 32 bytes of u and 16 bytes of v
    vld1.8 {d4}, [r1]!
    vld1.16 {d2-d3}, [r0]!
    vld1.8 {d6}, [r1]!
    vld1.16 {d16-d17}, [r0]!
    pld [r0, #128]
    pld [r1, #128]
# sign-extend the int8 values of v to int16 so they can feed vmlal.s16
    vmovl.s8 q2, d4
    vmovl.s8 q3, d6
# q0 += u * v, four widened 32-bit products per instruction
    vmlal.s16 q0, d2, d4
    vmlal.s16 q0, d3, d5
    vmlal.s16 q0, d16, d6
    vmlal.s16 q0, d17, d7
    subs r2, r2, #1
    bpl .loop16x8
.done16x8:
# horizontal reduction: 4 x int32 -> 2 x int64 -> 1 x int64, return low 32 bits
    vpaddl.s32 q0, q0
    vadd.i64 d0, d0, d1
    vmov.32 r0, d0[0]
    bx lr
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment