Skip to content

Instantly share code, notes, and snippets.

@abadams
Created October 2, 2020 20:49
Show Gist options
  • Save abadams/5fee6cddd3dd23b8088d6d18de488a1f to your computer and use it in GitHub Desktop.
Save abadams/5fee6cddd3dd23b8088d6d18de488a1f to your computer and use it in GitHub Desktop.
ARM GEMM inner loop with SDOT
// Inner-loop body of an AArch64 signed 8-bit GEMM kernel built on SDOT (per
// the gist title). This is an excerpt: the loop-head label .LBB0_5, the
// function prologue/epilogue and the setup of every input register are
// outside this view.
//
// What one pass through the body does:
//   * loads eight 8-byte chunks starting at x8 with a stride of x17 bytes,
//     pairs them into 128-bit registers and permutes them with TBL using
//     the index vectors in v0 and v1 (initialised outside this view);
//     the four resulting vectors are v28/v26 (chunks 0-3) and v30/v8
//     (chunks 4-7);
//   * broadcasts (LD1R) two 32-bit words from each of the eight base
//     pointers x29, x14, x0, x3, x2, x16, x30, x28, each offset by x15;
//   * issues 32 SDOTs: each of the 16 accumulators is updated twice, once
//     per broadcast word. SDOT adds, into every 32-bit lane of the
//     destination, the dot product of four signed bytes from each source.
//
// Accumulators by base pointer (v0-permuted operand / v1-permuted operand):
//   x29 -> v24/v25   x14 -> v22/v23   x0  -> v20/v21   x3  -> v18/v19
//   x2  -> v16/v17   x16 -> v6/v7     x30 -> v4/v5     x28 -> v2/v3
//
// NOTE(review): several immediates are missing from this copy of the
// listing (the lines ending in a bare comma: the three "sub x13, x13,",
// the five post-indexed ld1r forms, "add x15, x15," and "cmp x15,"). They
// look stripped by whatever extracted the text; the fragment will not
// assemble until they are restored from the original compiler output.
// For a post-indexed ld1r of a .4s element the only immediate the ISA
// accepts is #4; the other values cannot be recovered from this listing.
//
// NOTE(review): x29/x30 (frame pointer / link register) are used as plain
// data pointers and v8-v13 (callee-saved low halves under AAPCS64) are
// overwritten; presumably both are saved and restored outside this view -
// confirm against the full function.
//
// --- chunks 0-3 and the first broadcast words (loads interleaved) ---
add x9, x8, x17                       // x9 = x8 + stride: address of chunk 1
ldr d26, [x8]                         // chunk 0 (8 bytes)
add x13, x29, x15                     // x13 = x29 + x15
ldr d28, [x9]                         // chunk 1
add x9, x9, x17                       // step to chunk 2
ld1r { v29.4s }, [x13]                // broadcast the 32-bit word at x29 + x15
sub x13, x13,                         // NOTE(review): immediate missing in this listing
ldr d27, [x9]                         // chunk 2
add x9, x9, x17                       // step to chunk 3
ld1r { v30.4s }, [x13]                // broadcast the word at the decremented x29 address
add x13, x14, x15                     // x13 = x14 + x15
ldr d11, [x9]                         // chunk 3
ld1r { v31.4s }, [x13]                // broadcast the word at x14 + x15
sub x13, x13,                         // NOTE(review): immediate missing in this listing
ld1r { v8.4s }, [x13]                 // broadcast the word at the decremented x14 address
add x13, x0, x15                      // x13 = x0 + x15
add x9, x9, x17                       // step to chunk 4
mov v26.d[1], v28.d[0]                // v26 = chunk 0 (low half) : chunk 1 (high half)
ld1r { v9.4s }, [x13]                 // broadcast the word at x0 + x15
sub x13, x13,                         // NOTE(review): immediate missing in this listing
ldr d12, [x9]                         // chunk 4
add x9, x9, x17                       // step to chunk 5
mov v27.d[1], v11.d[0]                // v27 = chunk 2 (low half) : chunk 3 (high half)
ld1r { v10.4s }, [x13]                // broadcast the word at the decremented x0 address
add x13, x3, x15                      // x13 = x3 + x15
tbl v28.16b, { v26.16b, v27.16b }, v0.16b   // permute chunks 0-3 by the byte indices in v0
tbl v26.16b, { v26.16b, v27.16b }, v1.16b   // same 32 source bytes, indices from v1
ldr d27, [x9]                         // chunk 5 (v27 is free again after the two TBLs)
add x9, x9, x17                       // step to chunk 6
ldr d13, [x9]                         // chunk 6
// --- first broadcast word of x29/x14/x0/x3 against chunks 0-3 ---
sdot v25.4s, v30.16b, v26.16b
sdot v24.4s, v30.16b, v28.16b
ldr d30, [x9, x17]                    // chunk 7; the broadcast held in v30 was consumed just above
sdot v23.4s, v8.16b, v26.16b
sdot v22.4s, v8.16b, v28.16b
ld1r { v8.4s }, [x13],                // word at x3 + x15, post-indexed; NOTE(review): immediate missing
add x9, x2, x15                       // chunk loads are done: x9 becomes a broadcast pointer (x2 + x15)
sdot v21.4s, v10.16b, v26.16b
sdot v20.4s, v10.16b, v28.16b
mov v12.d[1], v27.d[0]                // v12 = chunk 4 (low half) : chunk 5 (high half)
ld1r { v27.4s }, [x13]                // second x3 word, read from the post-incremented address
ld1r { v10.4s }, [x9],                // word at x2 + x15, post-indexed; NOTE(review): immediate missing
mov v13.d[1], v30.d[0]                // v13 = chunk 6 (low half) : chunk 7 (high half)
sdot v19.4s, v8.16b, v26.16b
sdot v18.4s, v8.16b, v28.16b
add x13, x16, x15                     // x13 = x16 + x15
tbl v30.16b, { v12.16b, v13.16b }, v0.16b   // permute chunks 4-7 by the byte indices in v0
tbl v8.16b, { v12.16b, v13.16b }, v1.16b    // indices from v1; v8's broadcast was consumed just above
// --- other broadcast word of x29/x14/x0/x3 against chunks 4-7 ---
sdot v25.4s, v29.16b, v8.16b
sdot v24.4s, v29.16b, v30.16b
sdot v23.4s, v31.16b, v8.16b
sdot v22.4s, v31.16b, v30.16b
ld1r { v29.4s }, [x9]                 // second x2 word, read from the post-incremented address
ld1r { v31.4s }, [x13],               // word at x16 + x15, post-indexed; NOTE(review): immediate missing
add x9, x30, x15                      // x9 = x30 + x15
sdot v21.4s, v9.16b, v8.16b
sdot v20.4s, v9.16b, v30.16b
ld1r { v9.4s }, [x13]                 // second x16 word, read from the post-incremented address
sdot v19.4s, v27.16b, v8.16b
sdot v18.4s, v27.16b, v30.16b
ld1r { v27.4s }, [x9],                // word at x30 + x15, post-indexed; NOTE(review): immediate missing
add x13, x28, x15                     // x13 = x28 + x15
// --- x2/x16/x30/x28 rows: both broadcast words, same four permuted vectors ---
sdot v17.4s, v10.16b, v26.16b
sdot v16.4s, v10.16b, v28.16b
sdot v17.4s, v29.16b, v8.16b
sdot v16.4s, v29.16b, v30.16b
sdot v7.4s, v31.16b, v26.16b
sdot v6.4s, v31.16b, v28.16b
ld1r { v29.4s }, [x9]                 // second x30 word, read from the post-incremented address
ld1r { v31.4s }, [x13],               // word at x28 + x15, post-indexed; NOTE(review): immediate missing
sdot v5.4s, v27.16b, v26.16b
sdot v4.4s, v27.16b, v28.16b
add x15, x15,                         // advance the shared offset x15; NOTE(review): immediate missing
ld1r { v27.4s }, [x13]                // second x28 word, read from the post-incremented address
sdot v3.4s, v31.16b, v26.16b
sdot v2.4s, v31.16b, v28.16b
cmp x15,                              // loop-exit test on x15; NOTE(review): second operand missing
sdot v7.4s, v9.16b, v8.16b
sdot v6.4s, v9.16b, v30.16b
sdot v5.4s, v29.16b, v8.16b
sdot v4.4s, v29.16b, v30.16b
sdot v3.4s, v27.16b, v8.16b
sdot v2.4s, v27.16b, v30.16b
add x8, x8, x20                       // advance the chunk pointer by x20 for the next pass (plain add: flags from cmp survive)
b.ne .LBB0_5                          // repeat while the cmp above reported "not equal"; label is outside this view
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment