; Assembly output of the three functions from https://lemire.me/blog/2021/01/06/memory-access-on-the-apple-m1-processor/#comments
;-----------------------------------------------------------------------
; _compute_two
; Presumed C prototype (the C source is not visible here -- TODO confirm):
;     int compute_two(char *array, size_t N, int *random)
; Target: AArch64, Mach-O/Darwin assembler syntax (';' starts a comment).
; In:    x0 = base address used for the byte loads
;        x1 = never read as an input; reused as scratch in the unrolled loop
;        x2 = base address of an array of signed 32-bit indices
;        _M = 64-bit global; (M << 1) index entries are consumed, two per pair
; Out:   x0 = zero-extended 32-bit sum, over all index pairs (a, b), of the
;             sign-extended low byte of (byte[x0 + a] ^ byte[x0 + b])
; Clobb: x1, x3, x8-x17, NZCV flags
; Leaf function: makes no calls and does not touch the stack.
;-----------------------------------------------------------------------
        .globl  _compute_two                    ; -- Begin function compute_two
        .p2align 2
_compute_two:                                   ; @compute_two
        .cfi_startproc
; %bb.0:
Lloh0:
        adrp    x8, _M@PAGE
Lloh1:
        ldr     x8, [x8, _M@PAGEOFF]            ; x8 = M
        tst     x8, #0x7fffffffffffffff         ; low 63 bits of M clear <=> (M << 1) == 0
        b.eq    LBB0_5                          ; no index entries to process: return 0
; %bb.1:
        lsl     x9, x8, #1                      ; x9 = M << 1 = loop bound, in index entries
        cmp     x9, #2                          ; =2
        mov     w8, #2
        csel    x8, x9, x8, hi                  ; x8 = unsigned max(x9, 2)
        sub     x8, x8, #1                      ; =1
        lsr     x10, x8, #1                     ; x10 = (number of pairs) - 1
        cbz     x10, LBB0_6                     ; exactly one pair: skip the unrolled loop
; %bb.2:
        mov     w8, #0                          ; w8  = accumulator, first pair of each step
        mov     w13, #0                         ; w13 = accumulator, second pair of each step
        add     x11, x10, #1                    ; =1  (x11 = total number of pairs)
        and     x12, x11, #0xfffffffffffffffe   ; x12 = pair count rounded down to even
        lsl     x10, x12, #1                    ; x10 = first index entry left for the tail loop
        add     x14, x2, #8                     ; =8  (x14 points at the second pair of a step)
        mov     x15, x12                        ; x15 = pairs still to do in the unrolled loop
; Unrolled loop: two index pairs (four 32-bit entries) per iteration.
LBB0_3:                                         ; =>This Inner Loop Header: Depth=1
        ldpsw   x16, x17, [x14, #-8]            ; first pair of indices, sign-extended
        ldrb    w16, [x0, x16]
        ldpsw   x1, x3, [x14], #16              ; second pair; then x14 += 16 bytes (4 entries)
        ldrb    w1, [x0, x1]
        ldrb    w17, [x0, x17]
        ldrb    w3, [x0, x3]
        eor     w16, w17, w16                   ; xor of the first pair's bytes
        eor     w17, w3, w1                     ; xor of the second pair's bytes
        add     w8, w8, w16, sxtb               ; add the sign-extended low byte
        add     w13, w13, w17, sxtb
        subs    x15, x15, #2                    ; =2
        b.ne    LBB0_3
; %bb.4:
        add     w8, w13, w8                     ; merge the two partial sums
        cmp     x11, x12                        ; was the pair count odd?
        b.ne    LBB0_7                          ; yes: one pair remains for the tail loop
        b       LBB0_8
LBB0_5:
        mov     w0, #0
        ret
LBB0_6:
        mov     w8, #0                          ; x10 is 0 here, so the tail loop starts at entry 0
; Tail loop: one index pair per iteration, entry index in x10.
LBB0_7:                                         ; =>This Inner Loop Header: Depth=1
        add     x11, x2, x10, lsl #2            ; x11 = x2 + 4 * x10 (address of the pair)
        ldpsw   x12, x11, [x11]
        ldrb    w12, [x0, x12]
        ldrb    w11, [x0, x11]
        eor     w11, w11, w12
        add     w8, w8, w11, sxtb
        add     x10, x10, #2                    ; =2
        cmp     x10, x9
        b.lo    LBB0_7                          ; unsigned compare against the bound in x9
LBB0_8:
        mov     x0, x8                          ; upper 32 bits are zero: x8 was only written via w8
        ret
        .loh AdrpLdr    Lloh0, Lloh1
        .cfi_endproc
                                                ; -- End function
;-----------------------------------------------------------------------
; _compute_two_plus
; Presumed C prototype (the C source is not visible here -- TODO confirm):
;     int compute_two_plus(char *array, size_t N, int *random)
; Target: AArch64, Mach-O/Darwin assembler syntax (';' starts a comment).
; In:    x0 = base address used for the byte loads
;        x2 = base address of an array of signed 32-bit indices
;        _M = 64-bit global; (M << 1) index entries are consumed, two per pair
;        (x1 is neither read nor written by this function)
; Out:   x0 = zero-extended 32-bit sum, over all index pairs (a, b), of the
;             sign-extended low byte of
;             byte[x0 + a] ^ byte[x0 + a + 1] ^ byte[x0 + b] ^ byte[x0 + b + 1]
; Clobb: x8-x13, NZCV flags
; Leaf function: makes no calls and does not touch the stack.
;-----------------------------------------------------------------------
        .globl  _compute_two_plus               ; -- Begin function compute_two_plus
        .p2align 2
_compute_two_plus:                              ; @compute_two_plus
        .cfi_startproc
; %bb.0:
Lloh2:
        adrp    x8, _M@PAGE
Lloh3:
        ldr     x10, [x8, _M@PAGEOFF]           ; x10 = M
        tst     x10, #0x7fffffffffffffff        ; low 63 bits of M clear <=> (M << 1) == 0
        b.eq    LBB1_4                          ; no index entries to process: return 0
; %bb.1:
        mov     w8, #0                          ; w8 = running sum
        mov     x9, #0                          ; x9 = current index entry
        lsl     x10, x10, #1                    ; x10 = M << 1 = loop bound, in index entries
; One index pair per iteration; each index selects two adjacent bytes.
LBB1_2:                                         ; =>This Inner Loop Header: Depth=1
        add     x11, x2, x9, lsl #2             ; x11 = x2 + 4 * x9 (address of the pair)
        ldpsw   x12, x11, [x11]                 ; both indices, sign-extended to 64 bits
        add     x12, x0, x12                    ; address selected by the first index
        ldrb    w13, [x12]
        ldrb    w12, [x12, #1]                  ; the byte right after it
        eor     w12, w12, w13
        add     x11, x0, x11                    ; address selected by the second index
        ldrb    w13, [x11]
        eor     w12, w12, w13
        ldrb    w11, [x11, #1]                  ; the byte right after it
        eor     w11, w12, w11                   ; xor of all four bytes
        add     w8, w8, w11, sxtb               ; add the sign-extended low byte
        add     x9, x9, #2                      ; =2
        cmp     x9, x10
        b.lo    LBB1_2                          ; unsigned compare against the bound in x10
; %bb.3:
        mov     x0, x8                          ; upper 32 bits are zero: x8 was only written via w8
        ret
LBB1_4:
        mov     w8, #0
        mov     x0, x8
        ret
        .loh AdrpLdr    Lloh2, Lloh3
        .cfi_endproc
                                                ; -- End function
;-----------------------------------------------------------------------
; _compute_three
; Presumed C prototype (the C source is not visible here -- TODO confirm):
;     int compute_three(char *array, size_t N, int *random)
; Target: AArch64, Mach-O/Darwin assembler syntax (';' starts a comment).
; In:    x0 = base address used for the byte loads
;        x2 = base address of an array of signed 32-bit indices
;        _M = 64-bit global; M + (M << 1) index entries are consumed,
;             three per step
;        (x1 is neither read nor written by this function)
; Out:   x0 = zero-extended 32-bit sum, over all index triples (a, b, c), of
;             the sign-extended low byte of
;             byte[x0 + a] ^ byte[x0 + b] ^ byte[x0 + c]
; Clobb: x8-x13, NZCV flags
; Leaf function: makes no calls and does not touch the stack.
;-----------------------------------------------------------------------
        .globl  _compute_three                  ; -- Begin function compute_three
        .p2align 2
_compute_three:                                 ; @compute_three
        .cfi_startproc
; %bb.0:
Lloh4:
        adrp    x8, _M@PAGE
Lloh5:
        ldr     x8, [x8, _M@PAGEOFF]            ; x8 = M
        adds    x9, x8, x8, lsl #1              ; x9 = M + (M << 1) = loop bound; sets Z if it is 0
        b.eq    LBB2_4                          ; no index entries to process: return 0
; %bb.1:
        mov     x10, #0                         ; x10 = current index entry
        mov     w8, #0                          ; w8 = running sum
; One index triple per iteration.
LBB2_2:                                         ; =>This Inner Loop Header: Depth=1
        add     x11, x2, x10, lsl #2            ; x11 = x2 + 4 * x10 (address of the triple)
        ldpsw   x12, x13, [x11]                 ; first two indices, sign-extended
        ldrb    w12, [x0, x12]
        ldrb    w13, [x0, x13]
        eor     w12, w13, w12
        ldrsw   x11, [x11, #8]                  ; third index, sign-extended
        ldrb    w11, [x0, x11]
        eor     w11, w12, w11                   ; xor of all three bytes
        add     w8, w8, w11, sxtb               ; add the sign-extended low byte
        add     x10, x10, #3                    ; =3
        cmp     x10, x9
        b.lo    LBB2_2                          ; unsigned compare against the bound in x9
; %bb.3:
        mov     x0, x8                          ; upper 32 bits are zero: x8 was only written via w8
        ret
LBB2_4:
        mov     w8, #0
        mov     x0, x8
        ret
        .loh AdrpLdr    Lloh4, Lloh5
        .cfi_endproc
                                                ; -- End function
; (GitHub page footer captured with the listing; not part of the assembly.)
; Sign up for free to join this conversation on GitHub.
; Already have an account? Sign in to comment