Skip to content

Instantly share code, notes, and snippets.

@cypriss
Created December 11, 2011 21:23
Show Gist options
  • Save cypriss/1462825 to your computer and use it in GitHub Desktop.
Save cypriss/1462825 to your computer and use it in GitHub Desktop.
my first assembly function
// gemm_full_block
//
// Hand-written x86-64 (AT&T syntax, SSE4.1) kernel that accumulates one block
// of C in 4x4 tiles. For every tile origin (ii, jj), with
//     ii = i, i+4, ... while ii < i + BLOCK_SIZE_M   (unsigned compare, do-while)
//     jj = j, j+4, ... while jj < j + BLOCK_SIZE_N   (unsigned compare, do-while)
// and every r, c in 0..3 it performs
//     C[(ii+r)*C_stride + jj + c] +=
//         sum over kk in [0, BLOCK_SIZE_K) of
//             A[(ii+r)*AB_stride + k + kk] * B[(jj+c)*AB_stride + k + kk]
// i.e. both A and B are walked along a row in the k direction
// (presumably B is stored transposed -- confirm against the caller).
//
// Parameters (register names are where the asm expects to find them):
//   A          rdi   base of A (doubles)
//   B          rsi   base of B (doubles)
//   C          rdx   base of C (doubles), updated in place
//   inner_k    ecx   not read by this function
//   AB_stride  r8d   row stride of A and B, in doubles
//   C_stride   r9d   row stride of C, in doubles
//   i, j, k          block origin; read from 16/24/32(%rbp)
//
// Requirements visible from the code:
//   * Every load/store uses movapd, which faults on addresses that are not
//     16-byte aligned, so all touched A/B/C addresses must be 16-byte aligned.
//   * dppd requires SSE4.1.
//   * The k loop advances by 2 and exits only on equality with BLOCK_SIZE_K,
//     so BLOCK_SIZE_K must be a positive even number.
//   * Offsets are computed with 32-bit arithmetic (imul/addl on r11d, r12d,
//     r15d), so row*stride products must fit in 32 bits.
//
// NOTE(review): the asm takes no operands. It relies on the arguments still
// being in their System V argument registers when the statement starts and on
// a conventional rbp frame so that the stack arguments sit at 16/24/32(%rbp).
// Nothing in the source enforces either; verify the build flags
// (frame pointer kept, no argument shuffling before the asm).
// NOTE(review): the two scratch slots are addressed at -8/-16(%rsp) *after*
// rsp has been lowered by 64, so they live below the stack pointer and the 64
// reserved bytes are never used. This only works where a red zone exists.
void gemm_full_block(double* restrict A, // rdi
double* restrict B, // rsi
double* restrict C, // rdx
const uint32_t inner_k, // ecx
const uint32_t AB_stride, // r8
const uint32_t C_stride, // r9
uint32_t i,
uint32_t j,
uint32_t k) {
__asm__(
// ---- Setup: fetch stack args, bias the base pointers, save loop bounds ----
"subq $64, %%rsp\n\t"
"movl 16(%%rbp), %%eax\n\t" // eax: i
"movl 24(%%rbp), %%ebx\n\t" // ebx: j
"movl 32(%%rbp), %%r10d\n\t" // r10d: k
"leaq (%%rdx, %%rbx, 8), %%rdx\n\t" // C = C + J*8
// Save max_i to -8 rsp. max_i = i + block_size_m
"movl %%eax, %%r15d\n\t"
"addl $" STRINGIFY(BLOCK_SIZE_M) ", %%r15d\n\t"
"movl %%r15d, -8(%%rsp)\n\t"
// Save max_j to -16 rsp. max_j = j + block_size_n
"movl %%ebx, %%r15d\n\t"
"addl $" STRINGIFY(BLOCK_SIZE_N) ", %%r15d\n\t"
"movl %%r15d, -16(%%rsp)\n\t"
// add k to A and B
"leaq (%%rdi, %%r10, 8), %%rdi\n\t"
"leaq (%%rsi, %%r10, 8), %%rsi\n\t"
// .p2align 4 = 16-byte alignment on every target. Plain ".align 4" means
// 4 bytes on x86 ELF but 16 bytes on Mach-O.
".p2align 4, 0x90\n\t"
// ---- Loop 1: over j (columns of C), 4 per pass; restarts i each pass ----
"1:\n\t"
"movl 16(%%rbp), %%eax\n\t" // i = arg(i)
".p2align 4, 0x90\n\t"
// ---- Loop 2: over i (rows of C), 4 per pass ----
"2:\n\t"
// Load the 4x4 C tile into xmm0..xmm7 (two doubles per register, row-major).
"movl %%eax, %%r11d\n\t" // r11: REG_OFFSET = i
"imul %%r9d, %%r11d\n\t" // REG_OFFSET = REG_OFFSET * C_stride
"movapd 0(%%rdx, %%r11, 8), %%xmm0\n\t" // xmm0(lo, high) <- (C0, C1)
"movapd 16(%%rdx, %%r11, 8), %%xmm1\n\t"
"addl %%r9d, %%r11d\n\t"
"movapd 0(%%rdx, %%r11, 8), %%xmm2\n\t"
"movapd 16(%%rdx, %%r11, 8), %%xmm3\n\t"
"addl %%r9d, %%r11d\n\t"
"movapd 0(%%rdx, %%r11, 8), %%xmm4\n\t"
"movapd 16(%%rdx, %%r11, 8), %%xmm5\n\t"
"addl %%r9d, %%r11d\n\t"
"movapd 0(%%rdx, %%r11, 8), %%xmm6\n\t"
"movapd 16(%%rdx, %%r11, 8), %%xmm7\n\t"
// Row base pointers for this tile: r14 -> A row i, r13 -> B row j.
"movl %%eax, %%r15d\n\t" //r15 = i
"imul %%r8d, %%r15d\n\t" //r15 = i * ab_stride
"leaq (%%rdi, %%r15, 8), %%r14\n\t" // r14 = A + (i*ab_stride)*8
"movl %%ebx, %%r15d\n\t" //r15 = j
"imul %%r8d, %%r15d\n\t" //r15 = j * ab_stride
"leaq (%%rsi, %%r15, 8), %%r13\n\t" // r13 = B + (j*ab_stride)*8
"xorl %%r11d, %%r11d\n\t" // k = 0
".p2align 4, 0x90\n\t"
// ---- Loop 3: over k, 2 elements per pass ----
// An = two doubles of A row i+n, Bn = two doubles of B row j+n, and
// Cn (n = 4*row + col) is the partial dot product for that tile element.
// dppd $0x31 puts the dot product in the low lane, $0x32 in the high lane
// (other lane zeroed), so a plain addpd "flushes" it into the matching half
// of the accumulator. Trailing columns track the contents of xmm8..xmm15.
"3:\n\t"
"movl %%r11d, %%r12d\n\t" // r12 = k
"movapd (%%r14, %%r12, 8), %%xmm8\n\t" // A0 8:A0 9:xx 10:xx 11:xx 12:xx 13:xx 14:xx 15:xx
"movapd (%%r13, %%r12, 8), %%xmm9\n\t" // B0 8:A0 9:B0 10:xx 11:xx 12:xx 13:xx 14:xx 15:xx
"movapd %%xmm9, %%xmm15\n\t" // copy B0 8:A0 9:B0 10:xx 11:xx 12:xx 13:xx 14:xx 15:B0
"dppd $0x31, %%xmm8, %%xmm9\n\t" // C0 = A0 * B0 8:A0 9:C0 10:xx 11:xx 12:xx 13:xx 14:xx 15:B0
"addl %%r8d, %%r12d\n\t" // r12 = k + AB_stride*1
"movapd (%%r13, %%r12, 8), %%xmm11\n\t" // B1 8:A0 9:C0 10:xx 11:B1 12:xx 13:xx 14:xx 15:B0
"movapd %%xmm11, %%xmm14\n\t" // copy B1 8:A0 9:C0 10:xx 11:B1 12:xx 13:xx 14:B1 15:B0
"dppd $0x32, %%xmm8, %%xmm11\n\t" // C1 = A0 * B1 8:A0 9:C0 10:xx 11:C1 12:xx 13:xx 14:B1 15:B0
"movapd (%%r14, %%r12, 8), %%xmm10\n\t" // A1 8:A0 9:C0 10:A1 11:C1 12:xx 13:xx 14:B1 15:B0
"dppd $0x31, %%xmm10, %%xmm15\n\t" // C4 = A1 * B0 8:A0 9:C0 10:A1 11:C1 12:xx 13:xx 14:B1 15:C4
"dppd $0x32, %%xmm10, %%xmm14\n\t" // C5 = A1 * B1 8:A0 9:C0 10:A1 11:C1 12:xx 13:xx 14:C5 15:C4
"addl %%r8d, %%r12d\n\t" // r12 = k + AB_stride*2
"movapd (%%r13, %%r12, 8), %%xmm12\n\t" // B2 8:A0 9:C0 10:A1 11:C1 12:B2 13:xx 14:C5 15:C4
"addpd %%xmm9, %%xmm0\n\t" // Flush C0 8:A0 9:xx 10:A1 11:C1 12:B2 13:xx 14:C5 15:C4
"movapd (%%r14, %%r12, 8), %%xmm13\n\t" // A2 8:A0 9:xx 10:A1 11:C1 12:B2 13:A2 14:C5 15:C4
"movapd %%xmm12, %%xmm9\n\t" // copy B2 8:A0 9:B2 10:A1 11:C1 12:B2 13:A2 14:C5 15:C4
"dppd $0x31, %%xmm8, %%xmm12\n\t" // C2 8:A0 9:B2 10:A1 11:C1 12:C2 13:A2 14:C5 15:C4
"addpd %%xmm11, %%xmm0\n\t" // Flush C1 8:A0 9:B2 10:A1 11:xx 12:C2 13:A2 14:C5 15:C4
"addpd %%xmm15, %%xmm2\n\t" // Flush C4 8:A0 9:B2 10:A1 11:xx 12:C2 13:A2 14:C5 15:xx
"movapd %%xmm9, %%xmm11\n\t" // copy B2 8:A0 9:B2 10:A1 11:B2 12:C2 13:A2 14:C5 15:xx
"dppd $0x31, %%xmm10, %%xmm9\n\t" // C6 8:A0 9:C6 10:A1 11:B2 12:C2 13:A2 14:C5 15:xx
"addpd %%xmm14, %%xmm2\n\t" // Flush C5 8:A0 9:C6 10:A1 11:B2 12:C2 13:A2 14:xx 15:xx
"addl %%r8d, %%r12d\n\t" // r12 = k + AB_stride*3
"movapd (%%r13, %%r12, 8), %%xmm14\n\t" // B3 8:A0 9:C6 10:A1 11:B2 12:C2 13:A2 14:B3 15:xx
"movapd (%%r14, %%r12, 8), %%xmm15\n\t" // A3 8:A0 9:C6 10:A1 11:B2 12:C2 13:A2 14:B3 15:A3
"dppd $0x32, %%xmm14, %%xmm8\n\t" // C3 8:C3 9:C6 10:A1 11:B2 12:C2 13:A2 14:B3 15:A3
"dppd $0x32, %%xmm14, %%xmm10\n\t" // C7 8:C3 9:C6 10:C7 11:B2 12:C2 13:A2 14:B3 15:A3
"addpd %%xmm12, %%xmm1\n\t" // Flush C2 8:C3 9:C6 10:C7 11:B2 12:xx 13:A2 14:B3 15:A3
"movapd %%xmm11, %%xmm12\n\t" // copy B2 8:C3 9:C6 10:C7 11:B2 12:B2 13:A2 14:B3 15:A3
"dppd $0x31, %%xmm13, %%xmm11\n\t" // C10 8:C3 9:C6 10:C7 11:C10 12:B2 13:A2 14:B3 15:A3
"dppd $0x31, %%xmm15, %%xmm12\n\t" // C14 8:C3 9:C6 10:C7 11:C10 12:C14 13:A2 14:B3 15:A3
"addpd %%xmm9, %%xmm3\n\t" // Flush C6 8:C3 9:xx 10:C7 11:C10 12:C14 13:A2 14:B3 15:A3
"movapd %%xmm14, %%xmm9\n\t" // copy B3 8:C3 9:B3 10:C7 11:C10 12:C14 13:A2 14:B3 15:A3
"dppd $0x32, %%xmm13, %%xmm14\n\t" // C11 8:C3 9:B3 10:C7 11:C10 12:C14 13:A2 14:C11 15:A3
"dppd $0x32, %%xmm15, %%xmm9\n\t" // C15 8:C3 9:C15 10:C7 11:C10 12:C14 13:A2 14:C11 15:A3
"addpd %%xmm8, %%xmm1\n\t" // Flush C3 8:xx 9:C15 10:C7 11:C10 12:C14 13:A2 14:C11 15:A3
"subl %%r8d, %%r12d\n\t" // r12 = k + AB_stride*2
"subl %%r8d, %%r12d\n\t" // r12 = k + AB_stride*1
"movapd (%%r13, %%r12, 8), %%xmm8\n\t" // B1 8:B1 9:C15 10:C7 11:C10 12:C14 13:A2 14:C11 15:A3
"addpd %%xmm10, %%xmm3\n\t" // Flush C7 8:B1 9:C15 10:xx 11:C10 12:C14 13:A2 14:C11 15:A3
"movapd %%xmm8, %%xmm10\n\t" // copy B1 8:B1 9:C15 10:B1 11:C10 12:C14 13:A2 14:C11 15:A3
"dppd $0x32, %%xmm13, %%xmm8\n\t" // C9 8:C9 9:C15 10:B1 11:C10 12:C14 13:A2 14:C11 15:A3
"dppd $0x32, %%xmm15, %%xmm10\n\t" // C13 8:C9 9:C15 10:C13 11:C10 12:C14 13:A2 14:C11 15:A3
"addpd %%xmm11, %%xmm5\n\t" // Flush C10 8:C9 9:C15 10:C13 11:xx 12:C14 13:A2 14:C11 15:A3
"addpd %%xmm12, %%xmm7\n\t" // Flush C14 8:C9 9:C15 10:C13 11:xx 12:xx 13:A2 14:C11 15:A3
"subl %%r8d, %%r12d\n\t" // r12 = k + AB_stride*0 TODO: make mov from k
"movapd (%%r13, %%r12, 8), %%xmm11\n\t" // B0 8:C9 9:C15 10:C13 11:B0 12:xx 13:A2 14:C11 15:A3
"addpd %%xmm14, %%xmm5\n\t" // Flush C11 8:C9 9:C15 10:C13 11:B0 12:xx 13:A2 14:xx 15:A3
"addpd %%xmm9, %%xmm7\n\t" // Flush C15 8:C9 9:xx 10:C13 11:B0 12:xx 13:A2 14:xx 15:A3
"movapd %%xmm11, %%xmm12\n\t" // copy B0 8:C9 9:xx 10:C13 11:B0 12:B0 13:A2 14:xx 15:A3
"dppd $0x31, %%xmm13, %%xmm11\n\t" // C8 8:C9 9:xx 10:C13 11:C8 12:B0 13:A2 14:xx 15:A3
"dppd $0x31, %%xmm15, %%xmm12\n\t" // C12 8:C9 9:xx 10:C13 11:C8 12:C12 13:A2 14:xx 15:A3
"addpd %%xmm8, %%xmm4\n\t" // Flush C9 8:xx 9:xx 10:C13 11:C8 12:C12 13:A2 14:xx 15:A3
"addpd %%xmm10, %%xmm6\n\t" // Flush C13 8:xx 9:xx 10:xx 11:C8 12:C12 13:A2 14:xx 15:A3
"addpd %%xmm11, %%xmm4\n\t" // Flush C8 8:xx 9:xx 10:xx 11:xx 12:C12 13:A2 14:xx 15:A3
"addpd %%xmm12, %%xmm6\n\t" // Flush C12 8:xx 9:xx 10:xx 11:xx 12:xx 13:A2 14:xx 15:A3
// Next pair of k elements; exit only when the counter equals BLOCK_SIZE_K.
"addl $2, %%r11d\n\t"
"cmpl $" STRINGIFY(BLOCK_SIZE_K) ", %%r11d\n\t"
"jne 3b\n\t"
// ---- Store the finished 4x4 tile back to C ----
"movl %%eax, %%r11d\n\t" // r11: REG_OFFSET = i * C_stride
"imul %%r9d, %%r11d\n\t"
"movapd %%xmm0, 0(%%rdx, %%r11, 8)\n\t"
"movapd %%xmm1, 16(%%rdx, %%r11, 8)\n\t"
"addl %%r9d, %%r11d\n\t"
"movapd %%xmm2, 0(%%rdx, %%r11, 8)\n\t"
"movapd %%xmm3, 16(%%rdx, %%r11, 8)\n\t"
"addl %%r9d, %%r11d\n\t"
"movapd %%xmm4, 0(%%rdx, %%r11, 8)\n\t"
"movapd %%xmm5, 16(%%rdx, %%r11, 8)\n\t"
"addl %%r9d, %%r11d\n\t"
"movapd %%xmm6, 0(%%rdx, %%r11, 8)\n\t"
"movapd %%xmm7, 16(%%rdx, %%r11, 8)\n\t"
// ---- Advance i by one tile (4 rows); unsigned compare against max_i ----
"movl -8(%%rsp), %%r15d\n\t" // r15 = max i
"addl $4, %%eax\n\t" // i = i + 4
"cmpl %%r15d, %%eax\n\t"
"jb 2b\n\t"
// ---- Advance j by one tile (4 columns = 32 bytes of C) ----
"addq $32, %%rdx\n\t" // C = C + register size * 8
"movl -16(%%rsp), %%r15d\n\t" // r15 = max j
"addl $4, %%ebx\n\t" // j = j + 4
"cmpl %%r15d, %%ebx\n\t"
"jb 1b\n\t"
"addq $64, %%rsp\n\t"
:
// Outputs
// none
:
// Inputs
// No inputs -- I'll gather them myself
:
// Clobbered
// General-purpose registers used as scratch or expected to hold arguments.
"%r15",
"%r14",
"%r13",
"%r12",
"%r11",
"%r10",
"%r9",
"%r8",
"%rax",
"%rbx",
"%rcx",
"%rdx",
"%rsi",
"%rdi",
// xmm0-xmm7 hold the C tile and xmm8-xmm15 are temporaries: the kernel
// overwrites all sixteen, so the compiler must not keep values in them.
"%xmm0",
"%xmm1",
"%xmm2",
"%xmm3",
"%xmm4",
"%xmm5",
"%xmm6",
"%xmm7",
"%xmm8",
"%xmm9",
"%xmm10",
"%xmm11",
"%xmm12",
"%xmm13",
"%xmm14",
"%xmm15",
// The asm reads A/B and writes C through pointers that are not operands,
// and it changes the flags.
"memory",
"cc"
);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment