Created
October 2, 2020 20:49
-
-
Save abadams/5fee6cddd3dd23b8088d6d18de488a1f to your computer and use it in GitHub Desktop.
arm gemm with sdot
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Loop body fragment (AArch64, Advanced SIMD + dot-product extension).
// The branch at the bottom targets .LBB0_5; that label and all register setup
// lie outside this excerpt.
//
// What the visible instructions do on each pass:
//   * load eight 8-byte "rows" starting at x8, x17 bytes apart;
//   * pack the rows two per 128-bit register and permute each group of four
//     rows through TBL, using v0 and v1 as byte-index vectors (their contents
//     are set up elsewhere);
//   * broadcast 32-bit words with LD1R from eight base pointers
//     (x29, x14, x0, x3, x2, x16, x30, x28), each offset by x15, two words per
//     pointer;
//   * accumulate with SDOT into 16 accumulators: v2-v7 and v16-v25. Each pair
//     of accumulators receives one SDOT per row group (rows 0-3, rows 4-7).
//
// NOTE(review): several instructions end in a dangling comma
// (`sub x13, x13,`, `ld1r {...}, [x13],`, `add x15, x15,`, `cmp x15,`). Their
// final operand, most likely a `#imm`, appears to have been dropped when this
// text was extracted. The trailing `| |` on each line also looks like an
// extraction artifact. The listing will not assemble as-is; recover the
// operands from the original compiler output.
//
// NOTE(review): v8-v13 are written here, and x29/x30 are read as plain address
// bases. Under AAPCS64 the low halves of v8-v15 are callee-saved and x29/x30
// are FP/LR, so the enclosing function presumably saves and restores them;
// that code is not visible here, confirm.

// x9 walks the source rows: x8 + x17, x8 + 2*x17, ...
add x9, x8, x17 | |
// row 0 -> low 64 bits of v26
ldr d26, [x8] | |
// x13 = x29 + x15: address of the first broadcast word feeding v25/v24
add x13, x29, x15 | |
// row 1 -> d28
ldr d28, [x9] | |
add x9, x9, x17 | |
// v29 = 32-bit word at [x13] replicated into all four lanes
ld1r { v29.4s }, [x13] | |
// NOTE(review): last operand missing; presumably steps x13 back to the
// neighbouring word.
sub x13, x13, | |
// row 2 -> d27
ldr d27, [x9] | |
add x9, x9, x17 | |
// v30 = second broadcast word of the x29 stream
ld1r { v30.4s }, [x13] | |
// Same two-word pattern for base x14 (feeds v23/v22): v31 first, then v8.
add x13, x14, x15 | |
// row 3 -> d11
ldr d11, [x9] | |
ld1r { v31.4s }, [x13] | |
// NOTE(review): last operand missing.
sub x13, x13, | |
ld1r { v8.4s }, [x13] | |
// Same pattern for base x0 (feeds v21/v20): v9 first, then v10.
add x13, x0, x15 | |
add x9, x9, x17 | |
// v26 = row 0 (low half) | row 1 (high half)
mov v26.d[1], v28.d[0] | |
ld1r { v9.4s }, [x13] | |
// NOTE(review): last operand missing.
sub x13, x13, | |
// row 4 -> d12
ldr d12, [x9] | |
add x9, x9, x17 | |
// v27 = row 2 (low half) | row 3 (high half)
mov v27.d[1], v11.d[0] | |
ld1r { v10.4s }, [x13] | |
// x13 = x3 + x15: base for the stream feeding v19/v18
add x13, x3, x15 | |
// Permute the 32 bytes of rows 0-3 ({v26, v27} as one table) by the index
// vectors v0 and v1. Results go to v28 and v26; v26 is overwritten in place.
tbl v28.16b, { v26.16b, v27.16b }, v0.16b | |
tbl v26.16b, { v26.16b, v27.16b }, v1.16b | |
// row 5 -> d27 (v27 is free again now that both TBLs have consumed it)
ldr d27, [x9] | |
add x9, x9, x17 | |
// row 6 -> d13
ldr d13, [x9] | |
// SDOT: each 32-bit lane of the accumulator += the dot product of the four
// signed bytes in the matching lanes of the two source registers.
// x29 stream, rows 0-3:
sdot v25.4s, v30.16b, v26.16b | |
sdot v24.4s, v30.16b, v28.16b | |
// row 7 -> d30, addressed as x9 + x17 without advancing x9; v30's broadcast
// value was last used by the two SDOTs just above.
ldr d30, [x9, x17] | |
// x14 stream, rows 0-3:
sdot v23.4s, v8.16b, v26.16b | |
sdot v22.4s, v8.16b, v28.16b | |
// v8 reloaded with the first word of the x3 stream.
// NOTE(review): post-index operand missing after the comma.
ld1r { v8.4s }, [x13], | |
// x9 is done walking rows; reuse it as the x2-stream pointer (feeds v17/v16)
add x9, x2, x15 | |
// x0 stream, rows 0-3:
sdot v21.4s, v10.16b, v26.16b | |
sdot v20.4s, v10.16b, v28.16b | |
// v12 = row 4 (low half) | row 5 (high half)
mov v12.d[1], v27.d[0] | |
// v27 = second word of the x3 stream (x13 was post-indexed by the load above)
ld1r { v27.4s }, [x13] | |
// v10 = first word of the x2 stream.
// NOTE(review): post-index operand missing.
ld1r { v10.4s }, [x9], | |
// v13 = row 6 (low half) | row 7 (high half)
mov v13.d[1], v30.d[0] | |
// x3 stream, rows 0-3:
sdot v19.4s, v8.16b, v26.16b | |
sdot v18.4s, v8.16b, v28.16b | |
// x13 = x16 + x15: base for the stream feeding v7/v6
add x13, x16, x15 | |
// Permute rows 4-7 ({v12, v13}) with the same index vectors: v0 -> v30, v1 -> v8.
tbl v30.16b, { v12.16b, v13.16b }, v0.16b | |
tbl v8.16b, { v12.16b, v13.16b }, v1.16b | |
// Rows 4-7 contributions for the x29 and x14 streams:
sdot v25.4s, v29.16b, v8.16b | |
sdot v24.4s, v29.16b, v30.16b | |
sdot v23.4s, v31.16b, v8.16b | |
sdot v22.4s, v31.16b, v30.16b | |
// v29 = second word of the x2 stream
ld1r { v29.4s }, [x9] | |
// v31 = first word of the x16 stream.
// NOTE(review): post-index operand missing.
ld1r { v31.4s }, [x13], | |
// x9 = x30 + x15: base for the stream feeding v5/v4
add x9, x30, x15 | |
// Rows 4-7 contribution for the x0 stream:
sdot v21.4s, v9.16b, v8.16b | |
sdot v20.4s, v9.16b, v30.16b | |
// v9 = second word of the x16 stream
ld1r { v9.4s }, [x13] | |
// Rows 4-7 contribution for the x3 stream:
sdot v19.4s, v27.16b, v8.16b | |
sdot v18.4s, v27.16b, v30.16b | |
// v27 = first word of the x30 stream.
// NOTE(review): post-index operand missing.
ld1r { v27.4s }, [x9], | |
// x13 = x28 + x15: base for the stream feeding v3/v2
add x13, x28, x15 | |
// x2 stream: rows 0-3 (v10), then rows 4-7 (v29)
sdot v17.4s, v10.16b, v26.16b | |
sdot v16.4s, v10.16b, v28.16b | |
sdot v17.4s, v29.16b, v8.16b | |
sdot v16.4s, v29.16b, v30.16b | |
// x16 stream, rows 0-3:
sdot v7.4s, v31.16b, v26.16b | |
sdot v6.4s, v31.16b, v28.16b | |
// v29 = second word of the x30 stream
ld1r { v29.4s }, [x9] | |
// v31 = first word of the x28 stream.
// NOTE(review): post-index operand missing.
ld1r { v31.4s }, [x13], | |
// x30 stream, rows 0-3:
sdot v5.4s, v27.16b, v26.16b | |
sdot v4.4s, v27.16b, v28.16b | |
// Advance the shared offset x15.
// NOTE(review): increment operand missing.
// x13 was computed before this update, so the load below is unaffected.
add x15, x15, | |
// v27 = second word of the x28 stream
ld1r { v27.4s }, [x13] | |
// x28 stream, rows 0-3:
sdot v3.4s, v31.16b, v26.16b | |
sdot v2.4s, v31.16b, v28.16b | |
// Loop-exit test on x15.
// NOTE(review): comparand missing.
// Nothing between here and the branch writes the flags (only SDOT and a
// non-flag-setting ADD follow).
cmp x15, | |
// Rows 4-7 contributions for the x16, x30 and x28 streams:
sdot v7.4s, v9.16b, v8.16b | |
sdot v6.4s, v9.16b, v30.16b | |
sdot v5.4s, v29.16b, v8.16b | |
sdot v4.4s, v29.16b, v30.16b | |
sdot v3.4s, v27.16b, v8.16b | |
sdot v2.4s, v27.16b, v30.16b | |
// Advance the row base pointer by x20 for the next pass
add x8, x8, x20 | |
// Repeat while x15 differs from the compared value
b.ne .LBB0_5 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment