Created
December 11, 2011 21:23
-
-
Save cypriss/1462825 to your computer and use it in GitHub Desktop.
my first assembly function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void gemm_full_block(double* restrict A, // rdi | |
double* restrict B, // rsi | |
double* restrict C, // rdx | |
const uint32_t inner_k, // ecx | |
const uint32_t AB_stride, // r8 | |
const uint32_t C_stride, // r9 | |
uint32_t i, | |
uint32_t j, | |
uint32_t k) { | |
__asm__( | |
"subq $64, %%rsp\n\t" | |
"movl 16(%%rbp), %%eax\n\t" // eax: i | |
"movl 24(%%rbp), %%ebx\n\t" // ebx: j | |
"movl 32(%%rbp), %%r10d\n\t" // r10d: k | |
"leaq (%%rdx, %%rbx, 8), %%rdx\n\t" // C = C + J*8 | |
// Save max_i to -8 rsp. max_i = i + block_size_m | |
"movl %%eax, %%r15d\n\t" | |
"addl $" STRINGIFY(BLOCK_SIZE_M) ", %%r15d\n\t" | |
"movl %%r15d, -8(%%rsp)\n\t" | |
// Save max_j to -16 rsp. max_j = j + block_size_n | |
"movl %%ebx, %%r15d\n\t" | |
"addl $" STRINGIFY(BLOCK_SIZE_N) ", %%r15d\n\t" | |
"movl %%r15d, -16(%%rsp)\n\t" | |
// add k to A and B | |
"leaq (%%rdi, %%r10, 8), %%rdi\n\t" | |
"leaq (%%rsi, %%r10, 8), %%rsi\n\t" | |
".align 4, 0x90\n\t" | |
"1:\n\t" | |
"movl 16(%%rbp), %%eax\n\t" // i = arg(i) | |
".align 4, 0x90\n\t" | |
"2:\n\t" | |
"movl %%eax, %%r11d\n\t" // r11: REG_OFFSET = i | |
"imul %%r9d, %%r11d\n\t" // REG_OFFSET = REG_OFFSET * C_stride | |
"movapd 0(%%rdx, %%r11, 8), %%xmm0\n\t" // xmm0(lo, high) <- (C0, C1) | |
"movapd 16(%%rdx, %%r11, 8), %%xmm1\n\t" | |
"addl %%r9d, %%r11d\n\t" | |
"movapd 0(%%rdx, %%r11, 8), %%xmm2\n\t" | |
"movapd 16(%%rdx, %%r11, 8), %%xmm3\n\t" | |
"addl %%r9d, %%r11d\n\t" | |
"movapd 0(%%rdx, %%r11, 8), %%xmm4\n\t" | |
"movapd 16(%%rdx, %%r11, 8), %%xmm5\n\t" | |
"addl %%r9d, %%r11d\n\t" | |
"movapd 0(%%rdx, %%r11, 8), %%xmm6\n\t" | |
"movapd 16(%%rdx, %%r11, 8), %%xmm7\n\t" | |
"movl %%eax, %%r15d\n\t" //r15 = i | |
"imul %%r8d, %%r15d\n\t" //r15 = i * ab_stride | |
"leaq (%%rdi, %%r15, 8), %%r14\n\t" // r14 = A + (i*ab_stride)*8 | |
"movl %%ebx, %%r15d\n\t" //r15 = j | |
"imul %%r8d, %%r15d\n\t" //r15 = j * ab_stride | |
"leaq (%%rsi, %%r15, 8), %%r13\n\t" // r13 = B + (j*ab_stride)*8 | |
"xorl %%r11d, %%r11d\n\t" // k = 0 | |
".align 4, 0x90\n\t" | |
"3:\n\t" | |
"movl %%r11d, %%r12d\n\t" // r12 = k | |
"movapd (%%r14, %%r12, 8), %%xmm8\n\t" // A0 8:A0 9:xx 10:xx 11:xx 12:xx 13:xx 14:xx 15:xx | |
"movapd (%%r13, %%r12, 8), %%xmm9\n\t" // B0 8:A0 9:B0 10:xx 11:xx 12:xx 13:xx 14:xx 15:xx | |
"movapd %%xmm9, %%xmm15\n\t" // copy B0 8:A0 9:B0 10:xx 11:xx 12:xx 13:xx 14:xx 15:B0 | |
"dppd $0x31, %%xmm8, %%xmm9\n\t" // C0 = A0 * B0 8:A0 9:C0 10:xx 11:xx 12:xx 13:xx 14:xx 15:B0 | |
"addl %%r8d, %%r12d\n\t" // r12 = k + AB_stride*1 | |
"movapd (%%r13, %%r12, 8), %%xmm11\n\t" // B1 8:A0 9:C0 10:xx 11:B1 12:xx 13:xx 14:xx 15:B0 | |
"movapd %%xmm11, %%xmm14\n\t" // copy B1 8:A0 9:C0 10:xx 11:B1 12:xx 13:xx 14:B1 15:B0 | |
"dppd $0x32, %%xmm8, %%xmm11\n\t" // C1 = A0 * B1 8:A0 9:C0 10:xx 11:C1 12:xx 13:xx 14:B1 15:B0 | |
"movapd (%%r14, %%r12, 8), %%xmm10\n\t" // A1 8:A0 9:C0 10:A1 11:C1 12:xx 13:xx 14:B1 15:B0 | |
"dppd $0x31, %%xmm10, %%xmm15\n\t" // C4 = A1 * B0 8:A0 9:C0 10:A1 11:C1 12:xx 13:xx 14:B1 15:C4 | |
"dppd $0x32, %%xmm10, %%xmm14\n\t" // C5 = A1 * B1 8:A0 9:C0 10:A1 11:C1 12:xx 13:xx 14:C5 15:C4 | |
"addl %%r8d, %%r12d\n\t" // r12 = k + AB_stride*2 | |
"movapd (%%r13, %%r12, 8), %%xmm12\n\t" // B2 8:A0 9:C0 10:A1 11:C1 12:B2 13:xx 14:C5 15:C4 | |
"addpd %%xmm9, %%xmm0\n\t" // Flush C0 8:A0 9:xx 10:A1 11:C1 12:B2 13:xx 14:C5 15:C4 | |
"movapd (%%r14, %%r12, 8), %%xmm13\n\t" // A2 8:A0 9:xx 10:A1 11:C1 12:B2 13:A2 14:C5 15:C4 | |
"movapd %%xmm12, %%xmm9\n\t" // copy B2 8:A0 9:B2 10:A1 11:C1 12:B2 13:A2 14:C5 15:C4 | |
"dppd $0x31, %%xmm8, %%xmm12\n\t" // C2 8:A0 9:B2 10:A1 11:C1 12:C2 13:A2 14:C5 15:C4 | |
"addpd %%xmm11, %%xmm0\n\t" // Flush C1 8:A0 9:B2 10:A1 11:xx 12:C2 13:A2 14:C5 15:C4 | |
"addpd %%xmm15, %%xmm2\n\t" // Flush C4 8:A0 9:B2 10:A1 11:xx 12:C2 13:A2 14:C5 15:xx | |
"movapd %%xmm9, %%xmm11\n\t" // copy B2 8:A0 9:B2 10:A1 11:B2 12:C2 13:A2 14:C5 15:xx | |
"dppd $0x31, %%xmm10, %%xmm9\n\t" // C6 8:A0 9:C6 10:A1 11:B2 12:C2 13:A2 14:C5 15:xx | |
"addpd %%xmm14, %%xmm2\n\t" // Flush C5 8:A0 9:C6 10:A1 11:B2 12:C2 13:A2 14:xx 15:xx | |
"addl %%r8d, %%r12d\n\t" // r12 = k + AB_stride*3 | |
"movapd (%%r13, %%r12, 8), %%xmm14\n\t" // B3 8:A0 9:C6 10:A1 11:B2 12:C2 13:A2 14:B3 15:xx | |
"movapd (%%r14, %%r12, 8), %%xmm15\n\t" // A3 8:A0 9:C6 10:A1 11:B2 12:C2 13:A2 14:B3 15:A3 | |
"dppd $0x32, %%xmm14, %%xmm8\n\t" // C3 8:C3 9:C6 10:A1 11:B2 12:C2 13:A2 14:B3 15:A3 | |
"dppd $0x32, %%xmm14, %%xmm10\n\t" // C7 8:C3 9:C6 10:C7 11:B2 12:C2 13:A2 14:B3 15:A3 | |
"addpd %%xmm12, %%xmm1\n\t" // Flush C2 8:C3 9:C6 10:C7 11:B2 12:xx 13:A2 14:B3 15:A3 | |
"movapd %%xmm11, %%xmm12\n\t" // copy B2 8:C3 9:C6 10:C7 11:B2 12:B2 13:A2 14:B3 15:A3 | |
"dppd $0x31, %%xmm13, %%xmm11\n\t" // C10 8:C3 9:C6 10:C7 11:C10 12:B2 13:A2 14:B3 15:A3 | |
"dppd $0x31, %%xmm15, %%xmm12\n\t" // C14 8:C3 9:C6 10:C7 11:C10 12:C14 13:A2 14:B3 15:A3 | |
"addpd %%xmm9, %%xmm3\n\t" // Flush C6 8:C3 9:xx 10:C7 11:C10 12:C14 13:A2 14:B3 15:A3 | |
"movapd %%xmm14, %%xmm9\n\t" // copy B3 8:C3 9:B3 10:C7 11:C10 12:C14 13:A2 14:B3 15:A3 | |
"dppd $0x32, %%xmm13, %%xmm14\n\t" // C11 8:C3 9:B3 10:C7 11:C10 12:C14 13:A2 14:C11 15:A3 | |
"dppd $0x32, %%xmm15, %%xmm9\n\t" // C15 8:C3 9:C15 10:C7 11:C10 12:C14 13:A2 14:C11 15:A3 | |
"addpd %%xmm8, %%xmm1\n\t" // Flush C3 8:xx 9:C15 10:C7 11:C10 12:C14 13:A2 14:C11 15:A3 | |
"subl %%r8d, %%r12d\n\t" // r12 = k + AB_stride*2 | |
"subl %%r8d, %%r12d\n\t" // r12 = k + AB_stride*1 | |
"movapd (%%r13, %%r12, 8), %%xmm8\n\t" // B1 8:B1 9:C15 10:C7 11:C10 12:C14 13:A2 14:C11 15:A3 | |
"addpd %%xmm10, %%xmm3\n\t" // Flush C7 8:B1 9:C15 10:xx 11:C10 12:C14 13:A2 14:C11 15:A3 | |
"movapd %%xmm8, %%xmm10\n\t" // copy B1 8:B1 9:C15 10:B1 11:C10 12:C14 13:A2 14:C11 15:A3 | |
"dppd $0x32, %%xmm13, %%xmm8\n\t" // C9 8:C9 9:C15 10:B1 11:C10 12:C14 13:A2 14:C11 15:A3 | |
"dppd $0x32, %%xmm15, %%xmm10\n\t" // C13 8:C9 9:C15 10:C13 11:C10 12:C14 13:A2 14:C11 15:A3 | |
"addpd %%xmm11, %%xmm5\n\t" // Flush C10 8:C9 9:C15 10:C13 11:xx 12:C14 13:A2 14:C11 15:A3 | |
"addpd %%xmm12, %%xmm7\n\t" // Flush C14 8:C9 9:C15 10:C13 11:xx 12:xx 13:A2 14:C11 15:A3 | |
"subl %%r8d, %%r12d\n\t" // r12 = k + AB_stride*0 TODO: make mov from k | |
"movapd (%%r13, %%r12, 8), %%xmm11\n\t" // B0 8:C9 9:C15 10:C13 11:B0 12:xx 13:A2 14:C11 15:A3 | |
"addpd %%xmm14, %%xmm5\n\t" // Flush C11 8:C9 9:C15 10:C13 11:B0 12:xx 13:A2 14:xx 15:A3 | |
"addpd %%xmm9, %%xmm7\n\t" // Flush C15 8:C9 9:xx 10:C13 11:B0 12:xx 13:A2 14:xx 15:A3 | |
"movapd %%xmm11, %%xmm12\n\t" // copy B0 8:C9 9:xx 10:C13 11:B0 12:B0 13:A2 14:xx 15:A3 | |
"dppd $0x31, %%xmm13, %%xmm11\n\t" // C8 8:C9 9:xx 10:C13 11:C8 12:B0 13:A2 14:xx 15:A3 | |
"dppd $0x31, %%xmm15, %%xmm12\n\t" // C12 8:C9 9:xx 10:C13 11:C8 12:C12 13:A2 14:xx 15:A3 | |
"addpd %%xmm8, %%xmm4\n\t" // Flush C9 8:xx 9:xx 10:C13 11:C8 12:C12 13:A2 14:xx 15:A3 | |
"addpd %%xmm10, %%xmm6\n\t" // Flush C13 8:xx 9:xx 10:xx 11:C8 12:C12 13:A2 14:xx 15:A3 | |
"addpd %%xmm11, %%xmm4\n\t" // Flush C8 8:xx 9:xx 10:xx 11:xx 12:C12 13:A2 14:xx 15:A3 | |
"addpd %%xmm12, %%xmm6\n\t" // Flush C12 8:xx 9:xx 10:xx 11:xx 12:xx 13:A2 14:xx 15:A3 | |
"addl $2, %%r11d\n\t" | |
"cmpl $" STRINGIFY(BLOCK_SIZE_K) ", %%r11d\n\t" | |
"jne 3b\n\t" | |
"movl %%eax, %%r11d\n\t" // r11: REG_OFFSET = i * C_stride | |
"imul %%r9d, %%r11d\n\t" | |
"movapd %%xmm0, 0(%%rdx, %%r11, 8)\n\t" | |
"movapd %%xmm1, 16(%%rdx, %%r11, 8)\n\t" | |
"addl %%r9d, %%r11d\n\t" | |
"movapd %%xmm2, 0(%%rdx, %%r11, 8)\n\t" | |
"movapd %%xmm3, 16(%%rdx, %%r11, 8)\n\t" | |
"addl %%r9d, %%r11d\n\t" | |
"movapd %%xmm4, 0(%%rdx, %%r11, 8)\n\t" | |
"movapd %%xmm5, 16(%%rdx, %%r11, 8)\n\t" | |
"addl %%r9d, %%r11d\n\t" | |
"movapd %%xmm6, 0(%%rdx, %%r11, 8)\n\t" | |
"movapd %%xmm7, 16(%%rdx, %%r11, 8)\n\t" | |
"movl -8(%%rsp), %%r15d\n\t" // r15 = max i | |
"addl $4, %%eax\n\t" // i = i + 4 | |
"cmpl %%r15d, %%eax\n\t" | |
"jb 2b\n\t" | |
"addq $32, %%rdx\n\t" // C = C + register size * 8 | |
"movl -16(%%rsp), %%r15d\n\t" // r15 = max j | |
"addl $4, %%ebx\n\t" // j = j + 4 | |
"cmpl %%r15d, %%ebx\n\t" | |
"jb 1b\n\t" | |
"addq $64, %%rsp\n\t" | |
: | |
// Outputs | |
// none | |
: | |
// Inputs | |
// No inputs -- I'll gather them myself | |
: | |
// Clobbered | |
"%r15", | |
"%r14", | |
"%r13", | |
"%r12", | |
"%r11", | |
"%r10", | |
"%r9", | |
"%r8", | |
"%rax", | |
"%rbx", | |
"%rcx", | |
"%rdx", | |
"%rsi", | |
"%rdi" | |
); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment