Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Auto-vectorization of ARM NEON float operations in GCC.
Build commands: `arm-linux-gnueabihf-gcc -mfpu=neon -O3 -S float_average.c` and `arm-linux-gnueabihf-gcc -mfpu=neon -Ofast -S float_average.c`
/*
 * Arithmetic mean of the first `size` floats stored at `array`.
 *
 * Returns 0 when `size` is zero or negative. Otherwise the elements are
 * added one by one, in index order, into a single float accumulator and
 * the sum is divided by `size`.
 */
float float_average(float* array, int size)
{
    float sum = 0;
    float *cursor;
    float *stop;

    /* Nothing to average: avoid dividing by zero (or by a negative count). */
    if (size <= 0)
        return 0;

    /* Walk the elements front to back, same order as an index loop. */
    cursor = array;
    stop = array + size;
    while (cursor != stop) {
        sum += *cursor;
        ++cursor;
    }

    return sum / size;
}
@ ----------------------------------------------------------------------
@ Compiler-generated listing for float_average.c (GCC 4.6.3, see .ident).
@ ARM Thumb code, unified syntax, scalar VFP instructions only.
@ NOTE(review): presumably the -O3 output, since no NEON vector
@ instructions appear in this listing -- confirm against the build log.
@
@ C equivalent: float float_average(float* array, int size)
@   In:   r0 = array, r1 = size
@   Out:  s0 = average (0.0 when size <= 0)
@   Uses: r0, r3, s13, s14, s15, flags; no stack, no saved registers
@ ----------------------------------------------------------------------
.syntax unified
.arch armv7-a
@ Build attributes recorded by the compiler; the tag numbers are defined
@ by the ARM EABI and are not interpreted here.
.eabi_attribute 27, 3
.eabi_attribute 28, 1
.fpu neon
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 2
.eabi_attribute 30, 2
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.thumb
.file "float_average.c"
.text
.align 2
.global float_average
.thumb
.thumb_func
.type float_average, %function
float_average:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
@ if (size <= 0) return 0.0
cmp r1, #0
ble .L4
@ Bias the pointer by one element so the loop can advance it before
@ each load; r3 = element counter, s15 = running total (0.0 from .L7).
subs r0, r0, #4
movs r3, #0
flds s15, .L7
@ Scalar loop: one float per iteration, accumulated in index order.
.L3:
adds r3, r3, #1
adds r0, r0, #4
@ Flags from this compare are consumed by the bne below; the VFP load
@ and add in between do not write them.
cmp r3, r1
flds s14, [r0, #0]
fadds s15, s15, s14
bne .L3
@ r3 == size here: convert the count to float and divide the total by it.
fmsr s13, r3 @ int
fsitos s14, s13
fdivs s0, s15, s14
bx lr
@ size <= 0: return the 0.0 constant.
.L4:
flds s0, .L7
bx lr
@ Literal pool: one zero word, loaded above as the float 0.0.
.L8:
.align 2
.L7:
.word 0
.size float_average, .-float_average
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",%progbits
@ ----------------------------------------------------------------------
@ Compiler-generated listing for float_average.c (GCC 4.6.3, see .ident).
@ ARM Thumb code, unified syntax, using NEON vector instructions.
@ NOTE(review): presumably the -Ofast output -- confirm against the
@ build log.
@
@ C equivalent: float float_average(float* array, int size)
@   In:   r0 = array, r1 = size
@   Out:  s0 = average (0.0 when size <= 0)
@   Uses: r2-r7, s13-s15, q8 (d16/d17), q9 (d18/d19), flags
@   Stack: r4-r7 are pushed on entry and popped at .L2
@
@ Structure:
@   1. peel loop (.L4)   - scalar adds for up to 3 leading elements,
@                          until the pointer is a multiple of 16 bytes
@   2. vector loop (.L7) - four floats per iteration into four lanes
@   3. tail loop (.L9)   - scalar adds for the leftover elements
@   4. .L5               - divide the total by size
@ The vector loop keeps four separate partial sums, so elements are not
@ added in strict index order.
@ ----------------------------------------------------------------------
.syntax unified
.arch armv7-a
@ Build attributes recorded by the compiler; the tag numbers are defined
@ by the ARM EABI and are not interpreted here.
.eabi_attribute 27, 3
.eabi_attribute 28, 1
.fpu neon
.eabi_attribute 23, 1
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 2
.eabi_attribute 30, 2
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.thumb
.file "float_average.c"
.text
.align 2
.global float_average
.thumb
.thumb_func
.type float_average, %function
float_average:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
@ if (size <= 0) return 0.0 -- the push sits between the compare and
@ the branch, so the early-out path also goes through the pop at .L2.
cmp r1, #0
push {r4, r5, r6, r7}
ble .L10
@ r4 = (-(address bits [3:2])) & 3: number of 4-byte elements to step
@ over before the address is a multiple of 16.
ubfx r4, r0, #2, #2
negs r4, r4
and r4, r4, #3
@ Clamp the peel count to size (unsigned compare).
cmp r4, r1
it cs
movcs r4, r1
@ No peeling needed: .L11 sets r3 = 0 and s15 = 0.0, then joins .L3.
cmp r4, #0
beq .L11
@ Peel loop setup: r2 = array - 4 (advanced before each load),
@ r3 = element index, s15 = running total (0.0 from .L16+16).
subs r2, r0, #4
movs r3, #0
flds s15, .L16+16
.L4:
adds r3, r3, #1
adds r2, r2, #4
cmp r3, r4
flds s14, [r2, #0]
@ r5 is rewritten at .L3 before anything in this listing reads it.
mov r5, r2
fadds s15, s15, s14
bcc .L4
@ If the peel consumed every element, go straight to the division.
cmp r1, r4
beq .L5
.L3:
@ r7 = elements remaining after the peel
@ r5 = number of full 4-float groups, r6 = elements those groups cover
subs r7, r1, r4
lsrs r5, r7, #2
lsls r6, r5, #2
@ Fewer than four elements left: skip the vector loop.
cbz r5, .L6
@ r4 = &array[peel count], r2 = vector iteration counter,
@ q8 (d16:d17) = four zero lanes loaded from the literal pool.
add r4, r0, r4, lsl #2
movs r2, #0
vldr d16, .L16
vldr d17, .L16+8
@ Vector loop: load four floats into q9 (r4 post-incremented) and add
@ them lane-wise into q8.
.L7:
adds r2, r2, #1
vldmia r4!, {d18-d19}
cmp r2, r5
vadd.f32 q8, q8, q9
bcc .L7
@ Reduce the four lanes to one value: d16 += d17, then a pairwise add
@ leaves lane0 + lane1 in d18[0].
movs r2, #0
vadd.f32 d16, d16, d17
@ Flags from this compare (remaining == vector-covered?) are consumed
@ by the beq further down; none of the instructions in between are
@ flag-setting forms.
cmp r7, r6
@ r3 = index of the first element after the vector part.
add r3, r3, r6
@ q9 is zeroed here; its low half d18 is overwritten by the vpadd below.
vdup.32 q9, r2
vpadd.f32 d18, d16, d16
@ Move the reduced sum through r2 into s13 and fold it into the total.
vmov.32 r2, d18[0]
fmsr s13, r2
fadds s15, s15, s13
@ No leftover elements: skip the tail loop.
beq .L5
.L6:
@ r0 = &array[r3]; from here on r0 is the tail cursor.
add r0, r0, r3, lsl #2
@ Tail loop: one float per iteration while size > r3 (signed compare).
.L9:
adds r3, r3, #1
fldmias r0!, {s14}
cmp r1, r3
fadds s15, s15, s14
bgt .L9
.L5:
@ Convert size to float and divide the total by it; result in s0.
fmsr s13, r1 @ int
fsitos s14, s13
fdivs s0, s15, s14
@ Common exit: restore r4-r7 and return.
.L2:
pop {r4, r5, r6, r7}
bx lr
@ size <= 0: return the 0.0 constant through the common exit.
.L10:
flds s0, .L16+16
b .L2
@ Peel count was zero: start at index 0 with a zero total.
.L11:
mov r3, r4
flds s15, .L16+16
b .L3
@ Literal pool, 8-byte aligned. Words 0-3 (.L16 .. .L16+12) are the
@ zero initialiser for q8; word 4 (.L16+16) is the scalar 0.0.
.L17:
.align 3
.L16:
.word 0
.word 0
.word 0
.word 0
.word 0
.size float_average, .-float_average
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",%progbits
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.