Skip to content

Instantly share code, notes, and snippets.

@n-west
Last active December 18, 2015 23:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save n-west/5861300 to your computer and use it in GitHub Desktop.
Save n-west/5861300 to your computer and use it in GitHub Desktop.
/*
 * Element-wise single-precision add: cVector[i] = aVector[i] + bVector[i]
 * for i in [0, num_points).
 *
 * cVector    - output buffer, num_points floats
 * aVector    - first input, num_points floats
 * bVector    - second input, num_points floats
 * num_points - number of elements to process (0 is allowed)
 *
 * On 32-bit ARM builds with NEON enabled, groups of four floats go through an
 * inline-assembly loop; whatever is left (or everything, on other targets)
 * is handled by the scalar loop at the bottom.
 */
static inline void volk_arm_32f_x2_add_32f_a_inlineneon(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) {
    unsigned int number = 0;
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

#if defined(__arm__) && (defined(__ARM_NEON__) || defined(__ARM_NEON))
    /* The mnemonics below are A32/T32 NEON syntax, hence the __arm__ check. */
    const unsigned int quarterPoints = num_points / 4;

    for (; number < quarterPoints; number++) {
        /*
         * Every "[reg]!" form writes the advanced address back to the
         * register, and the store address is read before it is updated, so
         * all three pointers are read-write ("+r") operands.  Declaring them
         * as plain inputs / a write-only output lets the compiler assume
         * aPtr and bPtr are unchanged after the asm and that cPtr's incoming
         * value is dead, which breaks the scalar tail below.
         */
        __asm__ volatile("vld1.32 {d0-d1}, [%[a]]!\n\t"
                         "vld1.32 {d2-d3}, [%[b]]!\n\t"
                         "vadd.f32 q1, q0, q1\n\t"
                         "vst1.32 {d2-d3}, [%[output]]!\n\t"
                         : [output] "+r"(cPtr), [a] "+r"(aPtr), [b] "+r"(bPtr)
                         :
                         /* only q0/q1 (d0-d3) are touched by the asm */
                         : "memory", "d0", "d1", "d2", "d3");
    }

    /* num_points rounded down to a multiple of 4: elements already done */
    number = quarterPoints * 4;
#endif

    /* Scalar tail: the remaining num_points % 4 elements (or all of them
     * when the NEON path is compiled out). */
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
@ volk_arm_32f_x2_add_32f_a_inlineneon -- compiler output (ARM, GAS syntax)
@ for the C kernel that stores a[i] + b[i] for num_points floats.
@
@ Register use visible in this listing:
@   r0 = output pointer, r1 = a pointer, r2 = b pointer, r3 = num_points
@   r7 = num_points >> 2, later reused as the running element index
@   r4-r10 are pushed on entry and popped at .L30 before returning.
@
@ The scalar tail loop (.L39) handles exactly one element per pass: one
@ pointer advance per input, one counter increment, one store.  An earlier
@ revision of this listing carried a second copy of the advance / load /
@ increment sequence inside that loop, which stepped over every other
@ element; the duplicate has been removed.
volk_arm_32f_x2_add_32f_a_inlineneon:
.LFB1897:
.loc 1 199 0
.cfi_startproc
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
.LVL22:
stmfd sp!, {r4, r5, r6, r7, r8, r9, sl}
.LCFI2:
.cfi_def_cfa_offset 28
.cfi_offset 4, -28
.cfi_offset 5, -24
.cfi_offset 6, -20
.cfi_offset 7, -16
.cfi_offset 8, -12
.cfi_offset 9, -8
.cfi_offset 10, -4
.loc 1 208 0
@ r7 = num_points / 4; skip the NEON loop entirely when that is zero
movs r7, r3, lsr #2
.LVL23:
beq .L31
mov ip, #0
.LVL24:
@ ---- inline-asm NEON loop: four floats per pass, pointers post-incremented
.L32:
.loc 1 209 0 discriminator 2
#APP
@ 209 "/home/MSS/nathan-west/code/volk/volk_arm/kernels/volk_arm/volk_arm_32f_x2_add_32f.h" 1
vld1.32 {d0-d1}, [r1]!
vld1.32 {d2-d3}, [r2]!
vadd.f32 q1, q0, q1
vst1.32 {d2-d3}, [r0]!
@ 0 "" 2
.loc 1 208 0 discriminator 2
add ip, ip, #1
.LVL25:
cmp ip, r7
bne .L32
.LVL26:
.L31:
.loc 1 221 0
@ r7 = (num_points / 4) * 4 = elements already processed
mov r7, r7, asl #2
.LVL27:
.loc 1 222 0
@ nothing left over -> return
cmp r3, r7
bls .L30
.loc 1 199 0
@ ---- compiler-generated guard for its own vectorised tail (.L35):
@ compares the output pointer against each input pointer +/- 16 bytes and
@ requires more than 3 remaining elements; otherwise falls to .L42.
add ip, r0, #16
add r4, r1, #16
cmp r0, r4
cmpcc r1, ip
add r5, r2, #16
movcc r4, #0
movcs r4, #1
cmp r0, r5
cmpcc r2, ip
movcc ip, #0
movcs ip, #1
rsb r9, r7, r3
and ip, r4, ip
mov r8, r9, lsr #2
cmp r9, #3
movls ip, #0
andhi ip, ip, #1
eor ip, ip, #1
mov sl, r8, asl #2
cmp r8, #0
orreq ip, ip, #1
cmp ip, #0
bne .L42
mov r6, r1
mov r5, r2
mov r4, r0
.LVL28:
@ ---- compiler-vectorised tail: four floats per pass through q8/q9
.L35:
.loc 1 223 0 discriminator 2
vld1.32 {q9}, [r5]!
add ip, ip, #1
cmp r8, ip
vld1.32 {q8}, [r6]!
vadd.f32 q8, q9, q8
vst1.32 {q8}, [r4]!
bhi .L35
.loc 1 199 0
@ advance index and pointers past what .L35 consumed
mov r5, sl, asl #2
cmp r9, sl
add r7, r7, sl
add r0, r0, r5
add r6, r1, r5
add r5, r2, r5
beq .L30
@ ---- choose between the 8x unrolled scalar loop (.L38) and the
@ one-at-a-time loop (.L37/.L39)
.L34:
add r2, r7, #1
.LVL29:
sub r8, r3, #7
cmp r3, r2
movcc r1, #0
.LVL30:
movcs r1, #1
cmp r2, r8
movcs r1, #0
cmp r3, #6
movls r2, #0
andhi r2, r1, #1
cmp r2, #0
beq .L43
add ip, r0, #32
add r2, r6, #32
add r1, r5, #32
@ ---- scalar loop unrolled 8x; flags from "cmp r8, r0" feed the final bhi
.L38:
.loc 1 223 0
flds s14, [r1, #-32]
add r0, r7, #9
flds s15, [r2, #-32]
cmp r8, r0
.loc 1 222 0
add r4, r7, #8
pld [r2, #48]
.loc 1 199 0
mov r6, r2
mov r5, r1
mov r0, ip
.loc 1 222 0
mov r7, r4
.loc 1 223 0
fadds s15, s14, s15
fsts s15, [ip, #-32]
flds s14, [r1, #-28]
flds s15, [r2, #-28]
fadds s15, s14, s15
fsts s15, [ip, #-28]
flds s14, [r1, #-24]
flds s15, [r2, #-24]
fadds s15, s14, s15
fsts s15, [ip, #-24]
flds s14, [r1, #-20]
flds s15, [r2, #-20]
fadds s15, s14, s15
fsts s15, [ip, #-20]
flds s14, [r1, #-16]
flds s15, [r2, #-16]
fadds s15, s14, s15
fsts s15, [ip, #-16]
flds s14, [r1, #-12]
flds s15, [r2, #-12]
fadds s15, s14, s15
fsts s15, [ip, #-12]
flds s14, [r1, #-8]
flds s15, [r2, #-8]
fadds s15, s14, s15
fsts s15, [ip, #-8]
flds s14, [r1, #-4]
add r1, r1, #32
flds s15, [r2, #-4]
add r2, r2, #32
fadds s15, s14, s15
fsts s15, [ip, #-4]
.LVL31:
add ip, ip, #32
bhi .L38
.LVL32:
.L37:
.loc 1 199 0
@ bias both input pointers by -4 so the loop can pre-increment
sub r1, r6, #4
sub r2, r5, #4
@ ---- scalar tail: exactly one element per pass
.L39:
.loc 1 223 0
add r2, r2, #4
add r1, r1, #4
flds s14, [r2, #0]
.loc 1 222 0
add r4, r4, #1
.loc 1 223 0
flds s15, [r1, #0]
.loc 1 222 0
cmp r3, r4
.loc 1 223 0
fadds s15, s14, s15
fstmias r0!, {s15}
.LVL33:
.loc 1 222 0
bhi .L39
.LVL34:
@ ---- common exit: restore saved registers and return
.L30:
.loc 1 226 0
ldmfd sp!, {r4, r5, r6, r7, r8, r9, sl}
bx lr
.L43:
.loc 1 199 0
mov r4, r7
b .L37
.LVL35:
.L42:
.loc 1 205 0
mov r5, r2
.loc 1 204 0
mov r6, r1
b .L34
.cfi_endproc
.LFE1897:
.size volk_arm_32f_x2_add_32f_a_inlineneon, .-volk_arm_32f_x2_add_32f_a_inlineneon
.align 2
.type volk_arm_32f_x2_multiply_32f_generic, %function
@ volk_arm_32f_x2_add_32f_a_inlineneon -- annotated compiler output (ARM, GAS
@ syntax) for the C kernel that stores a[i] + b[i] for num_points floats.
@ The trailing "@ name, name" comments are the compiler's own operand names.
@
@ Register use visible in this listing:
@   r0 = cPtr (output), r1 = aVector, r2 = bVector, r3 = num_points
@   r7 = quarterPoints, later reused as "number" (running element index)
@   r4-r10 are pushed on entry and popped at .L30 before returning.
@
@ NOTE(review): after the inline-asm loop the annotations still call r1/r2
@ "aVector"/"bVector" (see the moves before .L35 and at .L42) even though the
@ asm's "[r1]!"/"[r2]!" writeback has advanced them.  The tail reads the
@ right addresses only because the same registers are reused -- worth
@ confirming against the asm operand constraints in the C source.
volk_arm_32f_x2_add_32f_a_inlineneon:
.LFB1897:
.loc 1 199 0
.cfi_startproc
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
.LVL22:
stmfd sp!, {r4, r5, r6, r7, r8, r9, sl} @,
.LCFI2:
.cfi_def_cfa_offset 28
.cfi_offset 4, -28
.cfi_offset 5, -24
.cfi_offset 6, -20
.cfi_offset 7, -16
.cfi_offset 8, -12
.cfi_offset 9, -8
.cfi_offset 10, -4
.loc 1 208 0
@ quarterPoints = num_points / 4; skip the NEON loop when it is zero
movs r7, r3, lsr #2 @ quarterPoints, num_points,
.LVL23:
beq .L31 @,
mov ip, #0 @ number,
.LVL24:
@ ---- inline-asm NEON loop: four floats per pass, pointers post-incremented
.L32:
.loc 1 209 0 discriminator 2
#APP
@ 209 "/home/MSS/nathan-west/code/volk/volk_arm/kernels/volk_arm/volk_arm_32f_x2_add_32f.h" 1
vld1.32 {d0-d1}, [r1]! @ aVector
vld1.32 {d2-d3}, [r2]! @ bVector
vadd.f32 q1, q0, q1
vst1.32 {d2-d3}, [r0]! @ cPtr
@ 0 "" 2
.loc 1 208 0 discriminator 2
add ip, ip, #1 @ number, number,
.LVL25:
cmp ip, r7 @ number, quarterPoints
bne .L32 @,
.LVL26:
.L31:
.loc 1 221 0
@ number = quarterPoints * 4 = elements already processed
mov r7, r7, asl #2 @ number, quarterPoints,
.LVL27:
.loc 1 222 0
@ nothing left over -> return
cmp r3, r7 @ num_points, number
bls .L30 @,
.loc 1 199 0
@ ---- compiler-generated guard for its own vectorised tail (.L35):
@ compares cPtr against each input pointer +/- 16 bytes (presumably an
@ overlap check) and requires more than 3 remaining elements; otherwise
@ control goes to .L42.  Since r9 = num_points - (num_points/4)*4 here, the
@ remaining count is at most 3, so .L42 looks like the only path taken.
add ip, r0, #16 @ D.17262, cPtr,
add r4, r1, #16 @ tmp242, aVector,
cmp r0, r4 @ cPtr, tmp242
cmpcc r1, ip @, aVector, D.17262
add r5, r2, #16 @ tmp248, bVector,
movcc r4, #0 @, tmp245
movcs r4, #1 @, tmp245
cmp r0, r5 @ cPtr, tmp248
cmpcc r2, ip @, bVector, D.17262
movcc ip, #0 @, tmp251
movcs ip, #1 @, tmp251
rsb r9, r7, r3 @ D.17246, number, num_points
and ip, r4, ip @ tmp254, tmp245, tmp251
mov r8, r9, lsr #2 @ bnd.310, D.17246,
cmp r9, #3 @ D.17246,
movls ip, #0 @, tmp258
andhi ip, ip, #1 @,, tmp258, tmp254
eor ip, ip, #1 @ tmp260, tmp258,
mov sl, r8, asl #2 @ ratio_mult_vf.311, bnd.310,
cmp r8, #0 @ bnd.310,
orreq ip, ip, #1 @,, tmp266, tmp260
cmp ip, #0 @ tmp266,
bne .L42 @,
mov r6, r1 @ ivtmp.391, aVector
mov r5, r2 @ ivtmp.394, bVector
mov r4, r0 @ ivtmp.396, cPtr
.LVL28:
@ ---- compiler-vectorised tail: four floats per pass through q8/q9
.L35:
.loc 1 223 0 discriminator 2
vld1.32 {q9}, [r5]! @ tmp267, MEM[(const float *)vect_p.321_111]
add ip, ip, #1 @ ivtmp.390, ivtmp.390,
cmp r8, ip @ bnd.310, ivtmp.390
vld1.32 {q8}, [r6]! @ tmp268, MEM[(const float *)vect_p.316_107]
vadd.f32 q8, q9, q8 @ tmp269, tmp267, tmp268
vst1.32 {q8}, [r4]! @ tmp269, MEM[(float *)vect_p.327_116]
bhi .L35 @,
.loc 1 199 0
@ advance number and the three pointers past what .L35 consumed
mov r5, sl, asl #2 @ D.17281, ratio_mult_vf.311,
cmp r9, sl @ D.17246, ratio_mult_vf.311
add r7, r7, sl @ number, number, ratio_mult_vf.311
add r0, r0, r5 @ cPtr, cPtr, D.17281
add r6, r1, r5 @ aPtr, aVector, D.17281
add r5, r2, r5 @ bPtr, bVector, D.17281
beq .L30 @,
@ ---- choose between the 8x unrolled scalar loop (.L38) and the
@ one-at-a-time loop (.L37/.L39)
.L34:
add r2, r7, #1 @ D.17302, number,
.LVL29:
sub r8, r3, #7 @ D.17303, num_points,
cmp r3, r2 @ num_points, D.17302
movcc r1, #0 @ tmp274,
.LVL30:
movcs r1, #1 @ tmp274,
cmp r2, r8 @ D.17302, D.17303
movcs r1, #0 @, tmp274,
cmp r3, #6 @ num_points,
movls r2, #0 @, tmp280
andhi r2, r1, #1 @,, tmp280, tmp274
cmp r2, #0 @ tmp280,
beq .L43 @,
add ip, r0, #32 @ ivtmp.362, cPtr,
add r2, r6, #32 @ ivtmp.363, aPtr,
add r1, r5, #32 @ ivtmp.364, bPtr,
@ ---- scalar loop unrolled 8x.  The "offset: 42949672xxB" annotations are
@ the negative byte offsets (-32 .. -4) printed as unsigned 32-bit values.
@ Flags set by "cmp r8, r0" below are consumed by the final "bhi .L38".
.L38:
.loc 1 223 0
flds s14, [r1, #-32] @ MEM[base: bPtr_160, offset: 4294967264B], MEM[base: bPtr_160, offset: 4294967264B]
add r0, r7, #9 @ ivtmp.332, number,
flds s15, [r2, #-32] @ MEM[base: aPtr_161, offset: 4294967264B], MEM[base: aPtr_161, offset: 4294967264B]
cmp r8, r0 @ D.17303, ivtmp.332
.loc 1 222 0
add r4, r7, #8 @ number, number,
pld [r2, #48] @
.loc 1 199 0
mov r6, r2 @ aPtr, ivtmp.363
mov r5, r1 @ bPtr, ivtmp.364
mov r0, ip @ cPtr, ivtmp.362
.loc 1 222 0
mov r7, r4 @ number, number
.loc 1 223 0
fadds s15, s14, s15 @ tmp282, MEM[base: bPtr_160, offset: 4294967264B], MEM[base: aPtr_161, offset: 4294967264B]
fsts s15, [ip, #-32] @ tmp282, MEM[base: cPtr_159, offset: 4294967264B]
flds s14, [r1, #-28] @ MEM[base: bPtr_160, offset: 4294967268B], MEM[base: bPtr_160, offset: 4294967268B]
flds s15, [r2, #-28] @ MEM[base: aPtr_161, offset: 4294967268B], MEM[base: aPtr_161, offset: 4294967268B]
fadds s15, s14, s15 @ tmp285, MEM[base: bPtr_160, offset: 4294967268B], MEM[base: aPtr_161, offset: 4294967268B]
fsts s15, [ip, #-28] @ tmp285, MEM[base: cPtr_159, offset: 4294967268B]
flds s14, [r1, #-24] @ MEM[base: bPtr_160, offset: 4294967272B], MEM[base: bPtr_160, offset: 4294967272B]
flds s15, [r2, #-24] @ MEM[base: aPtr_161, offset: 4294967272B], MEM[base: aPtr_161, offset: 4294967272B]
fadds s15, s14, s15 @ tmp288, MEM[base: bPtr_160, offset: 4294967272B], MEM[base: aPtr_161, offset: 4294967272B]
fsts s15, [ip, #-24] @ tmp288, MEM[base: cPtr_159, offset: 4294967272B]
flds s14, [r1, #-20] @ MEM[base: bPtr_160, offset: 4294967276B], MEM[base: bPtr_160, offset: 4294967276B]
flds s15, [r2, #-20] @ MEM[base: aPtr_161, offset: 4294967276B], MEM[base: aPtr_161, offset: 4294967276B]
fadds s15, s14, s15 @ tmp291, MEM[base: bPtr_160, offset: 4294967276B], MEM[base: aPtr_161, offset: 4294967276B]
fsts s15, [ip, #-20] @ tmp291, MEM[base: cPtr_159, offset: 4294967276B]
flds s14, [r1, #-16] @ MEM[base: bPtr_160, offset: 4294967280B], MEM[base: bPtr_160, offset: 4294967280B]
flds s15, [r2, #-16] @ MEM[base: aPtr_161, offset: 4294967280B], MEM[base: aPtr_161, offset: 4294967280B]
fadds s15, s14, s15 @ tmp294, MEM[base: bPtr_160, offset: 4294967280B], MEM[base: aPtr_161, offset: 4294967280B]
fsts s15, [ip, #-16] @ tmp294, MEM[base: cPtr_159, offset: 4294967280B]
flds s14, [r1, #-12] @ MEM[base: bPtr_160, offset: 4294967284B], MEM[base: bPtr_160, offset: 4294967284B]
flds s15, [r2, #-12] @ MEM[base: aPtr_161, offset: 4294967284B], MEM[base: aPtr_161, offset: 4294967284B]
fadds s15, s14, s15 @ tmp297, MEM[base: bPtr_160, offset: 4294967284B], MEM[base: aPtr_161, offset: 4294967284B]
fsts s15, [ip, #-12] @ tmp297, MEM[base: cPtr_159, offset: 4294967284B]
flds s14, [r1, #-8] @ MEM[base: bPtr_160, offset: 4294967288B], MEM[base: bPtr_160, offset: 4294967288B]
flds s15, [r2, #-8] @ MEM[base: aPtr_161, offset: 4294967288B], MEM[base: aPtr_161, offset: 4294967288B]
fadds s15, s14, s15 @ tmp300, MEM[base: bPtr_160, offset: 4294967288B], MEM[base: aPtr_161, offset: 4294967288B]
fsts s15, [ip, #-8] @ tmp300, MEM[base: cPtr_159, offset: 4294967288B]
flds s14, [r1, #-4] @ MEM[base: bPtr_160, offset: 4294967292B], MEM[base: bPtr_160, offset: 4294967292B]
add r1, r1, #32 @ ivtmp.364, ivtmp.364,
flds s15, [r2, #-4] @ MEM[base: aPtr_161, offset: 4294967292B], MEM[base: aPtr_161, offset: 4294967292B]
add r2, r2, #32 @ ivtmp.363, ivtmp.363,
fadds s15, s14, s15 @ tmp303, MEM[base: bPtr_160, offset: 4294967292B], MEM[base: aPtr_161, offset: 4294967292B]
fsts s15, [ip, #-4] @ tmp303, MEM[base: cPtr_159, offset: 4294967292B]
.LVL31:
add ip, ip, #32 @ ivtmp.362, ivtmp.362,
bhi .L38 @,
.LVL32:
.L37:
.loc 1 199 0
@ bias both input pointers by -4 so the loop below can pre-increment
sub r1, r6, #4 @ ivtmp.343, aPtr,
sub r2, r5, #4 @ ivtmp.347, bPtr,
@ ---- scalar tail: one element per pass, loops while num_points > number
.L39:
.loc 1 223 0
add r2, r2, #4 @ ivtmp.347, ivtmp.347,
add r1, r1, #4 @ ivtmp.343, ivtmp.343,
flds s14, [r2, #0] @ MEM[base: D.17338_196, offset: 0B], MEM[base: D.17338_196, offset: 0B]
.loc 1 222 0
add r4, r4, #1 @ number, number,
.loc 1 223 0
flds s15, [r1, #0] @ MEM[base: D.17337_197, offset: 0B], MEM[base: D.17337_197, offset: 0B]
.loc 1 222 0
cmp r3, r4 @ num_points, number
.loc 1 223 0
fadds s15, s14, s15 @ tmp306, MEM[base: D.17338_196, offset: 0B], MEM[base: D.17337_197, offset: 0B]
fstmias r0!, {s15} @ cPtr, tmp306
.LVL33:
.loc 1 222 0
bhi .L39 @,
.LVL34:
@ ---- common exit: restore saved registers and return
.L30:
.loc 1 226 0
ldmfd sp!, {r4, r5, r6, r7, r8, r9, sl}
bx lr
@ entry to the one-at-a-time loop when the unrolled loop is not used
.L43:
.loc 1 199 0
mov r4, r7 @ number, number
b .L37 @
.LVL35:
@ guard failed: seed aPtr/bPtr from r1/r2 and go to the scalar paths
.L42:
.loc 1 205 0
mov r5, r2 @ bPtr, bVector
.loc 1 204 0
mov r6, r1 @ aPtr, aVector
b .L34 @
.cfi_endproc
.LFE1897:
.size volk_arm_32f_x2_add_32f_a_inlineneon, .-volk_arm_32f_x2_add_32f_a_inlineneon
.align 2
.type volk_arm_32f_x2_multiply_32f_generic, %function
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment