Skip to content

Instantly share code, notes, and snippets.

@engie
Created April 21, 2012 20:09
Show Gist options
  • Save engie/2439378 to your computer and use it in GitHub Desktop.
Save engie/2439378 to your computer and use it in GitHub Desktop.
Unrolled hand-crafted SSE matrix & vector product
const double* b_p = &(B.data()[0]);
double* w_p = &(W.data()[0]);
typedef double v2df __attribute__ ((vector_size (16)));
for( uint32 i = 0; i < rows; i++ ) //For each row
{
double* w_p_row = w_p;
double dp = 0;
/* Loop over 400 elements
* Sum into sum
* Leave the pointers munged
*/
asm (
"xorpd %%xmm4, %%xmm4\n\t" //Clear xmm4 (will sum up into xmm4)
"xorpd %%xmm5, %%xmm5\n\t" //Clear xmm5 (will sum up into xmm5)
//First run through - as normal loop, no deps
"movapd (%[va]), %%xmm0\n\t" //0 <- Va[0]
"movapd 16(%[va]), %%xmm1\n\t" //1 <- Va[2]
"mulpd (%[vb]), %%xmm0\n\t" //0 *= Vb[0]
"movapd 32(%[va]), %%xmm2\n\t" //2 <- Va[4]
"mulpd 16(%[vb]), %%xmm1\n\t" //1 *= Vb[2]
"addpd %%xmm0, %%xmm4\n\t" //4 += 0
"movapd 48(%[va]), %%xmm3\n\t" //3 <- Va[6]
"mulpd 32(%[vb]), %%xmm2\n\t" //2 *= Vb[4]
"addpd %%xmm1, %%xmm5\n\t" //5 <- 1
"lea 64( %[va] ), %[va]\n\t" //Increment the pointers
"lea 64( %[vb] ), %[vb]\n\t"
"mov $49, %%ecx\n\t" //Set ecx to 49 (8 per loop, above & below count as 1 iteration
"jmp bottom\n\t" //Go to the comparison at the bottom
"top:\n\t"
//Centre of the loop
"movapd (%[va]), %%xmm0\n\t" //0 <- Va[0]
"mulpd -16(%[vb]), %%xmm3\n\t" //3 *= Vb[-2]
"addpd %%xmm2, %%xmm4\n\t" //4 += 2
"movapd 16(%[va]), %%xmm1\n\t" //1 <- Va[2]
"mulpd (%[vb]), %%xmm0\n\t" //0 *= Vb[0]
"addpd %%xmm3, %%xmm5\n\t" //5 += 3
"movapd 32(%[va]), %%xmm2\n\t" //2 <- Va[4]
"mulpd 16(%[vb]), %%xmm1\n\t" //1 *= Vb[2]
"addpd %%xmm0, %%xmm4\n\t" //4 += 0
"movapd 48(%[va]), %%xmm3\n\t" //3 <- Va[6]
"mulpd 32(%[vb]), %%xmm2\n\t" //2 *= Vb[4]
"addpd %%xmm1, %%xmm5\n\t" //5 <- 1
"lea 64( %[va] ), %[va]\n\t" //Increment the pointers
"lea 64( %[vb] ), %[vb]\n\t"
"dec %%ecx\n\t" //Decrement ecx
"bottom:\n\t"
"cmp $0, %%ecx\n\t" //Jump back if necessary
"jne top\n\t"
"mulpd -16(%[vb]), %%xmm3\n\t" //3 *= Vb[-2]
"addpd %%xmm2, %%xmm4\n\t" //4 += 2
"addpd %%xmm3, %%xmm5\n\t" //5 += 3
//Combine the accumulators
"addpd %%xmm4, %%xmm5\n\t" //5 += 4
//Sum now in xmm5
"movapd %%xmm5, %%xmm0\n\t" //Copy the accumulator into xmm0
"shufpd $1, %%xmm0, %%xmm0\n\t" //Move the top of xmm0 into the bottom of xmm0
"addpd %%xmm0, %%xmm5\n\t" //Add the bottoms of xmm5 and xmm0
"movq %%xmm5, %[sum]\n\t" //Move that total out to a normal register
: [sum] "=r" (dp), [va] "+r" (b_p), [vb] "+r" (w_p_row)
:
: "ecx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" //list of clobbered registers
);
myX[i] = dp;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment