Skip to content

Instantly share code, notes, and snippets.

@mtklein
Last active December 27, 2015 06:39
Show Gist options
  • Save mtklein/7283313 to your computer and use it in GitHub Desktop.
Save mtklein/7283313 to your computer and use it in GitHub Desktop.
memset32 with a scalar tail loop vs. store_first --- the store_first version seems to be much slower
; ---------------------------------------------------------------------------
; Thumb-2 disassembly of a 32-bit-element memset whose last 0..3 elements are
; written by a scalar store loop (96ea2..96ea8).
; Symbol demangled by hand (TODO confirm with c++filt):
;   void arch_neon::memset_<simdpp::arch_neon::uint32x4, unsigned int>
;        (unsigned int* dst, unsigned int value, unsigned int count)
; In:    r0 = dst, r1 = value to store, r2 = count in 32-bit elements
; Saved: r4-r11 are pushed on entry and popped on exit; lr is not pushed.
; Frame (48 bytes):
;   [sp,#4]      count (rewritten after the head-alignment loop)
;   [sp,#8]      dst at entry to the 64-element loop
;   [sp,#12]     value
;   [sp,#16..31] value broadcast into a 128-bit vector
;   [sp,#32..47] scratch copy of that vector, refilled on every loop iteration
; ---------------------------------------------------------------------------
00096d14 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j>:
; Prologue: save callee-saved registers, reserve the frame, spill value/count.
96d14: e92d 0ff0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
96d18: 2a03 cmp r2, #3
96d1a: b08c sub sp, #48 ; 0x30
96d1c: 4607 mov r7, r0
96d1e: 9103 str r1, [sp, #12]
96d20: 9201 str r2, [sp, #4]
; count <= 3 (flags from the cmp at 96d18): use the small scalar path at 96eb2.
96d22: f240 80c6 bls.w 96eb2 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x19e>
; Head alignment: (dst << 28) == 0 means the low 4 address bits are zero,
; i.e. dst is already 16-byte aligned and the head loop is skipped.
96d26: 0702 lsls r2, r0, #28
96d28: 4603 mov r3, r0
96d2a: bf18 it ne
96d2c: 9a01 ldrne r2, [sp, #4]
96d2e: d007 beq.n 96d40 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x2c>
; Head loop: store one word, decrement the count, repeat until the write
; pointer is 16-byte aligned. The loop tests only the pointer, not the count
; (count is >= 4 on this path).
; NOTE(review): the low 4 bits only reach zero if dst is 4-byte aligned --
; confirm callers guarantee that.
96d30: f843 1b04 str.w r1, [r3], #4
96d34: 0718 lsls r0, r3, #28
96d36: f102 32ff add.w r2, r2, #4294967295 ; 0xffffffff
96d3a: 461f mov r7, r3
96d3c: d1f8 bne.n 96d30 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x1c>
96d3e: 9201 str r2, [sp, #4]
; Broadcast the spilled value into all four 32-bit lanes of d16/d17 and keep
; a copy of the vector at [sp,#16]. r7 = aligned dst, r2 = remaining count.
96d40: 9a01 ldr r2, [sp, #4]
96d42: ab03 add r3, sp, #12
96d44: 2a3f cmp r2, #63 ; 0x3f
96d46: f9e3 0caf vld1.32 {d16[]-d17[]}, [r3]
96d4a: edcd 0b04 vstr d16, [sp, #16]
96d4e: edcd 1b06 vstr d17, [sp, #24]
; count <= 63: skip the 64-element loop (96ec6 sets sl = count, then 96e0c).
96d52: f240 80b8 bls.w 96ec6 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x1b2>
; Setup for the 64-element loop: r4 -> vector copy, r5 -> scratch copy,
; r6 = block base, ip = elements remaining; the start address is saved.
96d56: ad08 add r5, sp, #32
96d58: ac04 add r4, sp, #16
96d5a: 463e mov r6, r7
96d5c: 4694 mov ip, r2
96d5e: 9702 str r7, [sp, #8]
; 64-element loop: each pass copies the vector [sp,#16] -> r0-r3 -> [sp,#32],
; reloads d16/d17 from that scratch copy, then issues sixteen 16-byte stores
; covering r6+0 .. r6+240 (256 bytes) and advances r6 by 256.
96d60: e894 000f ldmia.w r4, {r0, r1, r2, r3}
96d64: f106 0b20 add.w fp, r6, #32
96d68: f106 0a30 add.w sl, r6, #48 ; 0x30
96d6c: f106 0940 add.w r9, r6, #64 ; 0x40
96d70: f106 0850 add.w r8, r6, #80 ; 0x50
96d74: 4637 mov r7, r6
96d76: f1ac 0c40 sub.w ip, ip, #64 ; 0x40
96d7a: e885 000f stmia.w r5, {r0, r1, r2, r3}
96d7e: eddd 0b08 vldr d16, [sp, #32]
96d82: eddd 1b0a vldr d17, [sp, #40] ; 0x28
; This compare supplies the flags for the bhi at 96df6; nothing in between
; writes the condition flags.
96d86: f1bc 0f3f cmp.w ip, #63 ; 0x3f
96d8a: f947 0add vst1.64 {d16-d17}, [r7 :64]!
96d8e: f947 0adf vst1.64 {d16-d17}, [r7 :64]
96d92: f106 0760 add.w r7, r6, #96 ; 0x60
96d96: f94b 0adf vst1.64 {d16-d17}, [fp :64]
96d9a: f106 0b70 add.w fp, r6, #112 ; 0x70
96d9e: f94a 0adf vst1.64 {d16-d17}, [sl :64]
96da2: f106 0a80 add.w sl, r6, #128 ; 0x80
96da6: f949 0adf vst1.64 {d16-d17}, [r9 :64]
96daa: f106 0990 add.w r9, r6, #144 ; 0x90
96dae: f948 0adf vst1.64 {d16-d17}, [r8 :64]
96db2: f106 08a0 add.w r8, r6, #160 ; 0xa0
96db6: f947 0adf vst1.64 {d16-d17}, [r7 :64]
96dba: f106 07b0 add.w r7, r6, #176 ; 0xb0
96dbe: f94b 0adf vst1.64 {d16-d17}, [fp :64]
96dc2: f106 0bc0 add.w fp, r6, #192 ; 0xc0
96dc6: f94a 0adf vst1.64 {d16-d17}, [sl :64]
96dca: f106 0ad0 add.w sl, r6, #208 ; 0xd0
96dce: f949 0adf vst1.64 {d16-d17}, [r9 :64]
96dd2: f106 09e0 add.w r9, r6, #224 ; 0xe0
96dd6: f948 0adf vst1.64 {d16-d17}, [r8 :64]
96dda: f106 08f0 add.w r8, r6, #240 ; 0xf0
96dde: f506 7680 add.w r6, r6, #256 ; 0x100
96de2: f947 0adf vst1.64 {d16-d17}, [r7 :64]
96de6: f94b 0adf vst1.64 {d16-d17}, [fp :64]
96dea: f94a 0adf vst1.64 {d16-d17}, [sl :64]
96dee: f949 0adf vst1.64 {d16-d17}, [r9 :64]
96df2: f948 0adf vst1.64 {d16-d17}, [r8 :64]
; Loop while more than 63 elements remain.
96df6: d8b3 bhi.n 96d60 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x4c>
; Recompute state from the saved values: r3 = ((count - 64) >> 6) + 1 passes,
; sl = count & 63 elements left, r7 = saved start + passes * 256 bytes.
96df8: 9a01 ldr r2, [sp, #4]
96dfa: 9f02 ldr r7, [sp, #8]
96dfc: f1a2 0340 sub.w r3, r2, #64 ; 0x40
96e00: f002 0a3f and.w sl, r2, #63 ; 0x3f
96e04: 099b lsrs r3, r3, #6
96e06: 3301 adds r3, #1
96e08: eb07 2703 add.w r7, r7, r3, lsl #8
; Fewer than 16 elements left: skip the 16-element loop.
96e0c: f1ba 0f0f cmp.w sl, #15
96e10: d927 bls.n 96e62 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x14e>
96e12: ad08 add r5, sp, #32
96e14: ac04 add r4, sp, #16
96e16: 463e mov r6, r7
96e18: 46d0 mov r8, sl
; 16-element loop: same stack round trip of the vector, then four 16-byte
; stores at r6+0/+16/+32/+48 (64 bytes); r8 counts the elements remaining.
96e1a: e894 000f ldmia.w r4, {r0, r1, r2, r3}
96e1e: f1a8 0810 sub.w r8, r8, #16
96e22: f106 0b20 add.w fp, r6, #32
96e26: f106 0930 add.w r9, r6, #48 ; 0x30
96e2a: f1b8 0f0f cmp.w r8, #15
96e2e: 46b4 mov ip, r6
96e30: f106 0640 add.w r6, r6, #64 ; 0x40
96e34: e885 000f stmia.w r5, {r0, r1, r2, r3}
96e38: eddd 0b08 vldr d16, [sp, #32]
96e3c: eddd 1b0a vldr d17, [sp, #40] ; 0x28
96e40: f94c 0add vst1.64 {d16-d17}, [ip :64]!
96e44: f94c 0adf vst1.64 {d16-d17}, [ip :64]
96e48: f94b 0adf vst1.64 {d16-d17}, [fp :64]
96e4c: f949 0adf vst1.64 {d16-d17}, [r9 :64]
96e50: d8e3 bhi.n 96e1a <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x106>
; r3 = ((sl - 16) >> 4) + 1 passes; sl &= 15; r7 += passes * 64 bytes.
96e52: f1aa 0310 sub.w r3, sl, #16
96e56: f00a 0a0f and.w sl, sl, #15
96e5a: 091b lsrs r3, r3, #4
96e5c: 3301 adds r3, #1
96e5e: eb07 1783 add.w r7, r7, r3, lsl #6
; Fewer than 4 elements left: skip the 4-element loop (96ecc sets r3 = sl).
96e62: f1ba 0f03 cmp.w sl, #3
96e66: d931 bls.n 96ecc <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x1b8>
96e68: ad08 add r5, sp, #32
96e6a: ac04 add r4, sp, #16
96e6c: 46bc mov ip, r7
96e6e: 4656 mov r6, sl
; 4-element loop: one 16-byte store per pass; r6 counts elements remaining.
96e70: e894 000f ldmia.w r4, {r0, r1, r2, r3}
96e74: 3e04 subs r6, #4
96e76: 2e03 cmp r6, #3
96e78: e885 000f stmia.w r5, {r0, r1, r2, r3}
96e7c: eddd 0b08 vldr d16, [sp, #32]
96e80: eddd 1b0a vldr d17, [sp, #40] ; 0x28
96e84: f94c 0adf vst1.64 {d16-d17}, [ip :64]
96e88: f10c 0c10 add.w ip, ip, #16
96e8c: d8f0 bhi.n 96e70 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x15c>
; r2 = ((sl - 4) >> 2) + 1 passes; r3 = sl & 3 left; r7 += passes * 16 bytes.
96e8e: f1aa 0204 sub.w r2, sl, #4
96e92: f00a 0303 and.w r3, sl, #3
96e96: 0892 lsrs r2, r2, #2
96e98: 3201 adds r2, #1
96e9a: eb07 1702 add.w r7, r7, r2, lsl #4
; Scalar tail: r3 = 0..3 elements left. Reload the value once, then store one
; word per pass with post-increment until r3 reaches zero.
96e9e: b123 cbz r3, 96eaa <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x196>
96ea0: 9a03 ldr r2, [sp, #12]
96ea2: 3b01 subs r3, #1
96ea4: f847 2b04 str.w r2, [r7], #4
96ea8: d1fb bne.n 96ea2 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x18e>
; Epilogue: release the frame, restore r4-r11, return.
96eaa: b00c add sp, #48 ; 0x30
96eac: e8bd 0ff0 ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
96eb0: 4770 bx lr
; Small path (count <= 3): return if count == 0, otherwise store value word
; by word from dst up to end = dst + count * 4.
96eb2: 9a01 ldr r2, [sp, #4]
96eb4: 2a00 cmp r2, #0
96eb6: d0f8 beq.n 96eaa <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x196>
96eb8: eb00 0382 add.w r3, r0, r2, lsl #2
96ebc: f847 1b04 str.w r1, [r7], #4
96ec0: 429f cmp r7, r3
96ec2: d1fb bne.n 96ebc <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x1a8>
96ec4: e7f1 b.n 96eaa <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x196>
; Out-of-line stub for the count <= 63 case: sl = count, rejoin at 96e0c.
96ec6: f8dd a004 ldr.w sl, [sp, #4]
96eca: e79f b.n 96e0c <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0xf8>
; Out-of-line stub for fewer than 4 remaining: r3 = sl, rejoin the scalar tail.
96ecc: 4653 mov r3, sl
96ece: e7e6 b.n 96e9e <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x18a>
; ---------------------------------------------------------------------------
; Thumb-2 disassembly of a second build of the same symbol, at a different
; address. Here the last 0..3 elements are written with partial NEON stores
; and an out-of-line call to simdpp::arch_neon::move_l<8u> instead of a
; scalar loop.
; Symbol demangled by hand (TODO confirm with c++filt):
;   void arch_neon::memset_<simdpp::arch_neon::uint32x4, unsigned int>
;        (unsigned int* dst, unsigned int value, unsigned int count)
; In:    r0 = dst, r1 = value to store, r2 = count in 32-bit elements
; Saved: r4-r11 and lr are pushed (the function makes a call at 97132).
; Frame (132 bytes):
;   [sp,#4]      count (rewritten after the head-alignment loop)
;   [sp,#8]      dst at entry to the 64-element loop
;   [sp,#12]     value
;   [sp,#16..31] value broadcast into a 128-bit vector
;   [sp,#32..47] scratch copy used by the vector loops
;   [sp,#48], [sp,#64], [sp,#80], [sp,#96]  further 16-byte copies (tail only)
;   [sp,#112]    16-byte slot passed in r0 to move_l<8u> and read back after it
; Everything up to 970ea matches the scalar-tail build except for frame size,
; branch targets and r4 being set up only once.
; ---------------------------------------------------------------------------
00096f74 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j>:
; Prologue: save callee-saved registers and lr, reserve the frame, spill
; value and count.
96f74: e92d 4ff0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl, fp, lr}
96f78: 2a03 cmp r2, #3
96f7a: b0a1 sub sp, #132 ; 0x84
96f7c: 4607 mov r7, r0
96f7e: 9103 str r1, [sp, #12]
96f80: 9201 str r2, [sp, #4]
; count <= 3 (flags from the cmp at 96f78): use the small scalar path at 9714e.
96f82: f240 80e4 bls.w 9714e <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x1da>
; Head alignment: (dst << 28) == 0 means dst is already 16-byte aligned.
96f86: 0704 lsls r4, r0, #28
96f88: 4603 mov r3, r0
96f8a: bf18 it ne
96f8c: 9a01 ldrne r2, [sp, #4]
96f8e: d007 beq.n 96fa0 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x2c>
; Head loop: store one word, decrement the count, repeat until the write
; pointer is 16-byte aligned. Only the pointer is tested (count >= 4 here).
; NOTE(review): the low 4 bits only reach zero if dst is 4-byte aligned --
; confirm callers guarantee that.
96f90: f843 1b04 str.w r1, [r3], #4
96f94: 0718 lsls r0, r3, #28
96f96: f102 32ff add.w r2, r2, #4294967295 ; 0xffffffff
96f9a: 461f mov r7, r3
96f9c: d1f8 bne.n 96f90 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x1c>
96f9e: 9201 str r2, [sp, #4]
; Broadcast the spilled value into all four 32-bit lanes of d16/d17 and keep
; a copy of the vector at [sp,#16].
96fa0: 9a01 ldr r2, [sp, #4]
96fa2: ab03 add r3, sp, #12
96fa4: 2a3f cmp r2, #63 ; 0x3f
96fa6: f9e3 0caf vld1.32 {d16[]-d17[]}, [r3]
96faa: edcd 0b04 vstr d16, [sp, #16]
96fae: edcd 1b06 vstr d17, [sp, #24]
; count <= 63: skip the 64-element loop (9717a sets r4 and sl, then 9706c).
96fb2: f240 80e2 bls.w 9717a <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x206>
; Setup for the 64-element loop: r4 -> vector copy (stays valid through
; 970fa), r5 -> scratch copy, r6 = block base, ip = elements remaining.
96fb6: ad08 add r5, sp, #32
96fb8: ac04 add r4, sp, #16
96fba: 463e mov r6, r7
96fbc: 4694 mov ip, r2
96fbe: 9702 str r7, [sp, #8]
; 64-element loop: each pass copies the vector [sp,#16] -> r0-r3 -> [sp,#32],
; reloads d16/d17 from that scratch copy, then issues sixteen 16-byte stores
; covering r6+0 .. r6+240 (256 bytes) and advances r6 by 256.
96fc0: e894 000f ldmia.w r4, {r0, r1, r2, r3}
96fc4: f106 0b20 add.w fp, r6, #32
96fc8: f106 0a30 add.w sl, r6, #48 ; 0x30
96fcc: f106 0940 add.w r9, r6, #64 ; 0x40
96fd0: f106 0850 add.w r8, r6, #80 ; 0x50
96fd4: 4637 mov r7, r6
96fd6: f1ac 0c40 sub.w ip, ip, #64 ; 0x40
96fda: e885 000f stmia.w r5, {r0, r1, r2, r3}
96fde: eddd 0b08 vldr d16, [sp, #32]
96fe2: eddd 1b0a vldr d17, [sp, #40] ; 0x28
; This compare supplies the flags for the bhi at 97056.
96fe6: f1bc 0f3f cmp.w ip, #63 ; 0x3f
96fea: f947 0add vst1.64 {d16-d17}, [r7 :64]!
96fee: f947 0adf vst1.64 {d16-d17}, [r7 :64]
96ff2: f106 0760 add.w r7, r6, #96 ; 0x60
96ff6: f94b 0adf vst1.64 {d16-d17}, [fp :64]
96ffa: f106 0b70 add.w fp, r6, #112 ; 0x70
96ffe: f94a 0adf vst1.64 {d16-d17}, [sl :64]
97002: f106 0a80 add.w sl, r6, #128 ; 0x80
97006: f949 0adf vst1.64 {d16-d17}, [r9 :64]
9700a: f106 0990 add.w r9, r6, #144 ; 0x90
9700e: f948 0adf vst1.64 {d16-d17}, [r8 :64]
97012: f106 08a0 add.w r8, r6, #160 ; 0xa0
97016: f947 0adf vst1.64 {d16-d17}, [r7 :64]
9701a: f106 07b0 add.w r7, r6, #176 ; 0xb0
9701e: f94b 0adf vst1.64 {d16-d17}, [fp :64]
97022: f106 0bc0 add.w fp, r6, #192 ; 0xc0
97026: f94a 0adf vst1.64 {d16-d17}, [sl :64]
9702a: f106 0ad0 add.w sl, r6, #208 ; 0xd0
9702e: f949 0adf vst1.64 {d16-d17}, [r9 :64]
97032: f106 09e0 add.w r9, r6, #224 ; 0xe0
97036: f948 0adf vst1.64 {d16-d17}, [r8 :64]
9703a: f106 08f0 add.w r8, r6, #240 ; 0xf0
9703e: f506 7680 add.w r6, r6, #256 ; 0x100
97042: f947 0adf vst1.64 {d16-d17}, [r7 :64]
97046: f94b 0adf vst1.64 {d16-d17}, [fp :64]
9704a: f94a 0adf vst1.64 {d16-d17}, [sl :64]
9704e: f949 0adf vst1.64 {d16-d17}, [r9 :64]
97052: f948 0adf vst1.64 {d16-d17}, [r8 :64]
; Loop while more than 63 elements remain.
97056: d8b3 bhi.n 96fc0 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x4c>
; Recompute state from the saved values: r3 = ((count - 64) >> 6) + 1 passes,
; sl = count & 63 elements left, r7 = saved start + passes * 256 bytes.
97058: 9a01 ldr r2, [sp, #4]
9705a: 9f02 ldr r7, [sp, #8]
9705c: f1a2 0340 sub.w r3, r2, #64 ; 0x40
97060: f002 0a3f and.w sl, r2, #63 ; 0x3f
97064: 099b lsrs r3, r3, #6
97066: 3301 adds r3, #1
97068: eb07 2703 add.w r7, r7, r3, lsl #8
; Fewer than 16 elements left: skip the 16-element loop.
9706c: f1ba 0f0f cmp.w sl, #15
97070: d926 bls.n 970c0 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x14c>
; r4 is not reloaded here; it still points at [sp,#16] from 96fb8 or 9717a.
97072: ad08 add r5, sp, #32
97074: 463e mov r6, r7
97076: 46d0 mov r8, sl
; 16-element loop: stack round trip of the vector, then four 16-byte stores
; at r6+0/+16/+32/+48 (64 bytes); r8 counts the elements remaining.
97078: e894 000f ldmia.w r4, {r0, r1, r2, r3}
9707c: f1a8 0810 sub.w r8, r8, #16
97080: f106 0b20 add.w fp, r6, #32
97084: f106 0930 add.w r9, r6, #48 ; 0x30
97088: f1b8 0f0f cmp.w r8, #15
9708c: 46b4 mov ip, r6
9708e: f106 0640 add.w r6, r6, #64 ; 0x40
97092: e885 000f stmia.w r5, {r0, r1, r2, r3}
97096: eddd 0b08 vldr d16, [sp, #32]
9709a: eddd 1b0a vldr d17, [sp, #40] ; 0x28
9709e: f94c 0add vst1.64 {d16-d17}, [ip :64]!
970a2: f94c 0adf vst1.64 {d16-d17}, [ip :64]
970a6: f94b 0adf vst1.64 {d16-d17}, [fp :64]
970aa: f949 0adf vst1.64 {d16-d17}, [r9 :64]
970ae: d8e3 bhi.n 97078 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x104>
; r3 = ((sl - 16) >> 4) + 1 passes; sl &= 15; r7 += passes * 64 bytes.
970b0: f1aa 0310 sub.w r3, sl, #16
970b4: f00a 0a0f and.w sl, sl, #15
970b8: 091b lsrs r3, r3, #4
970ba: 3301 adds r3, #1
970bc: eb07 1783 add.w r7, r7, r3, lsl #6
; Fewer than 4 elements left: skip the 4-element loop.
970c0: f1ba 0f03 cmp.w sl, #3
970c4: d919 bls.n 970fa <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x186>
970c6: ad08 add r5, sp, #32
970c8: 46bc mov ip, r7
970ca: 4656 mov r6, sl
; 4-element loop: one 16-byte store per pass; r6 counts elements remaining.
970cc: e894 000f ldmia.w r4, {r0, r1, r2, r3}
970d0: 3e04 subs r6, #4
970d2: 2e03 cmp r6, #3
970d4: e885 000f stmia.w r5, {r0, r1, r2, r3}
970d8: eddd 0b08 vldr d16, [sp, #32]
970dc: eddd 1b0a vldr d17, [sp, #40] ; 0x28
970e0: f94c 0adf vst1.64 {d16-d17}, [ip :64]
970e4: f10c 0c10 add.w ip, ip, #16
970e8: d8f0 bhi.n 970cc <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x158>
; r3 = ((sl - 4) >> 2) + 1 passes; sl &= 3; r7 += passes * 16 bytes.
970ea: f1aa 0304 sub.w r3, sl, #4
970ee: f00a 0a03 and.w sl, sl, #3
970f2: 089b lsrs r3, r3, #2
970f4: 3301 adds r3, #1
970f6: eb07 1703 add.w r7, r7, r3, lsl #4
; Vector tail: sl = 0..3 elements left, r7 = dst. This part runs even when
; sl == 0: the vector is copied from [sp,#16] to [sp,#48] and r4 is
; repointed at that new copy.
970fa: e894 000f ldmia.w r4, {r0, r1, r2, r3}
970fe: f1ba 0f01 cmp.w sl, #1
97102: ac0c add r4, sp, #48 ; 0x30
97104: e884 000f stmia.w r4, {r0, r1, r2, r3}
; sl <= 1: no two-element store needed, go straight to the odd-element test.
97108: d91b bls.n 97142 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x1ce>
; sl is 2 or 3: write three more stack copies of the vector ([sp,#80],
; [sp,#64], [sp,#96]), load d16/d17 from [sp,#80], store the low 64 bits
; (two elements) at dst and advance dst by 8.
9710a: f10d 0e60 add.w lr, sp, #96 ; 0x60
9710e: ad14 add r5, sp, #80 ; 0x50
97110: f10d 0870 add.w r8, sp, #112 ; 0x70
97114: ae10 add r6, sp, #64 ; 0x40
97116: e885 000f stmia.w r5, {r0, r1, r2, r3}
9711a: eddd 0b14 vldr d16, [sp, #80] ; 0x50
9711e: eddd 1b16 vldr d17, [sp, #88] ; 0x58
97122: e886 000f stmia.w r6, {r0, r1, r2, r3}
97126: e88e 000f stmia.w lr, {r0, r1, r2, r3}
9712a: 4671 mov r1, lr
9712c: 4640 mov r0, r8
9712e: f947 07dd vst1.64 {d16}, [r7 :64]!
; Out-of-line call with r0 = sp+112 and r1 = sp+96 -- presumably the result
; slot and a pointer to the argument vector (TODO confirm against the callee).
; The 16 bytes at [sp,#112] are then copied to [sp,#80] and [sp,#48].
97132: f000 f95b bl 973ec <_ZN6simdpp9arch_neon6move_lILj8EEENS0_13basic_int8x16ES2_>
97136: e898 000f ldmia.w r8, {r0, r1, r2, r3}
9713a: e885 000f stmia.w r5, {r0, r1, r2, r3}
9713e: e884 000f stmia.w r4, {r0, r1, r2, r3}
; Odd element left (sl bit 0 set): finish at 97162; otherwise fall through.
97142: f01a 0f01 tst.w sl, #1
97146: d10c bne.n 97162 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x1ee>
; Epilogue: release the frame, restore r4-r11 and return by popping pc.
97148: b021 add sp, #132 ; 0x84
9714a: e8bd 8ff0 ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp, pc}
; Small path (count <= 3): return if count == 0, otherwise store value word
; by word from dst up to end = dst + count * 4.
9714e: 9a01 ldr r2, [sp, #4]
97150: 2a00 cmp r2, #0
97152: d0f9 beq.n 97148 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x1d4>
97154: eb00 0382 add.w r3, r0, r2, lsl #2
97158: f847 1b04 str.w r1, [r7], #4
9715c: 429f cmp r7, r3
9715e: d1fb bne.n 97158 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x1e4>
97160: e7f2 b.n 97148 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x1d4>
; Final odd element: copy the vector at [sp,#48] to [sp,#80], reload d16/d17
; from there and store lane 0 (one 32-bit element) at dst.
97162: e894 000f ldmia.w r4, {r0, r1, r2, r3}
97166: ac14 add r4, sp, #80 ; 0x50
97168: e884 000f stmia.w r4, {r0, r1, r2, r3}
9716c: eddd 0b14 vldr d16, [sp, #80] ; 0x50
97170: eddd 1b16 vldr d17, [sp, #88] ; 0x58
97174: f9c7 080f vst1.32 {d16[0]}, [r7]
97178: e7e6 b.n 97148 <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0x1d4>
; Out-of-line stub for the count <= 63 case: point r4 at the vector copy,
; sl = count, rejoin at 9706c.
9717a: ac04 add r4, sp, #16
9717c: f8dd a004 ldr.w sl, [sp, #4]
97180: e774 b.n 9706c <_ZN9arch_neonL8memset_TIN6simdpp9arch_neon8uint32x4EjEEvPT0_S4_j+0xf8>
; Follows an unconditional branch and is not targeted by any branch shown.
97182: bf00 nop
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment