Skip to content

Instantly share code, notes, and snippets.

@magical
Created May 10, 2019 01:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save magical/3beed59f0ae92c157b6b1e1c4cfc7c58 to your computer and use it in GitHub Desktop.
Save magical/3beed59f0ae92c157b6b1e1c4cfc7c58 to your computer and use it in GitHub Desktop.
#include "simd.p4.h"
// July 13, 2017: mjb -- changed rbx register to r8
// The convention is that rbx needs to be saved by the callee (i.e., pushed and popped),
// but rcx, rdx, r8, and r9 do not
// This fixed the bug that showed up in cs 475/575 project #5 in SQ 2017
// Element-wise multiply: c[i] = a[i] * b[i] for 0 <= i < len.
//   a, b : input arrays, each holding at least len floats
//   c    : output array, room for at least len floats
//   len  : number of elements; values <= 0 leave c untouched
// The first ( len/SSE_WIDTH ) * SSE_WIDTH elements are processed by SSE inline
// assembly, one 16-byte XMM register at a time; whatever is left over is
// finished by a plain scalar loop. movups is the unaligned move, so the
// arrays do not have to be 16-byte aligned.
void
SimdMul( float *a, float *b, float *c, int len )
{
	// One XMM register's worth of data: movups always transfers 16 bytes (4 floats).
	// Used only to tell the compiler how much memory each asm statement touches.
	typedef float xmm_block[4];

	int limit = ( len/SSE_WIDTH ) * SSE_WIDTH;
	float *atmp = a, *btmp = b, *ctmp = c;
	for( int i = 0; i < limit; i += SSE_WIDTH )
	{
		__asm
		(
			".att_syntax\n\t"
			"movups	(%0), %%xmm0\n\t"	// load 4 floats from a
			"movups	(%1), %%xmm1\n\t"	// load 4 floats from b
			"mulps	%%xmm1, %%xmm0\n\t"	// xmm0 = a-lanes * b-lanes
			"movups	%%xmm0, (%2)\n\t"	// store 4 products to c
			"addq	$16, %0\n\t"		// advance all three pointers by 16 bytes
			"addq	$16, %1\n\t"
			"addq	$16, %2\n\t"
			// The memory operands below are never named in the template; they exist
			// so the compiler knows the full 16-byte regions that are read/written,
			// not just the first float of each.
			: /* outputs */ "+r" (atmp), "+r" (btmp), "+r" (ctmp),
			                "=m" (*(xmm_block *) ctmp)
			: /* inputs */ "m" (*(const xmm_block *) atmp),
			               "m" (*(const xmm_block *) btmp)
			: /* clobbers */ "xmm0", "xmm1", "cc"	// addq modifies the flags
		);
	}

	// scalar tail for the last len % SSE_WIDTH elements
	for( int i = limit; i < len; i++ )
	{
		c[i] = a[i] * b[i];
	}
}
// Multiply-and-sum: returns the sum over i of a[i] * b[i] for 0 <= i < len.
//   a, b : input arrays, each holding at least len floats
//   len  : number of elements; values <= 0 give 0
// The first ( len/SSE_WIDTH ) * SSE_WIDTH elements are accumulated in four
// parallel lanes of an XMM register using SSE inline assembly; leftover
// elements are added by a scalar loop, and the four lane totals are summed at
// the end. Because the additions happen in a different order than in a simple
// left-to-right loop, the result may differ from NonSimdMulSum in the last bits.
float
SimdMulSum( float *a, float *b, int len )
{
	// One XMM register's worth of data: movups always transfers 16 bytes (4 floats).
	// Used only to tell the compiler how much memory each asm statement reads.
	typedef float xmm_block[4];

	float sum[4] = { 0., 0., 0., 0. };
	int limit = ( len/SSE_WIDTH ) * SSE_WIDTH;
	{
		float *atmp = a, *btmp = b;
		// 128-bit accumulator pinned to xmm2; the all-zero bit pattern is four 0.0f lanes
		register __int128 tmp __asm ("xmm2") = 0;
		for( int i = 0; i < limit; i += SSE_WIDTH )
		{
			__asm
			(
				".att_syntax\n\t"
				"movups	(%0), %%xmm0\n\t"	// load 4 floats from a
				"movups	(%1), %%xmm1\n\t"	// load 4 floats from b
				"mulps	%%xmm1, %%xmm0\n\t"	// xmm0 = a-lanes * b-lanes
				"addps	%%xmm0, %2\n\t"		// accumulate the 4 products lane-wise
				"addq	$16, %0\n\t"		// advance both pointers by 16 bytes
				"addq	$16, %1\n\t"
				// The memory operands below are never named in the template; they
				// tell the compiler that all 16 bytes at each pointer are read,
				// not just the first float.
				: /* outputs */ "+r" (atmp), "+r" (btmp), "+x" (tmp)
				: /* inputs */ "m" (*(const xmm_block *) atmp),
				               "m" (*(const xmm_block *) btmp)
				: /* clobbers */ "xmm0", "xmm1", "cc"	// addq modifies the flags
			);
		}
		// spill the four lane totals into sum[0..3]
		__asm ("movups %1,%0" : "=m" (sum) : "x" (tmp) : /*no clobbers*/);
	}

	// scalar tail for the last len % SSE_WIDTH elements
	for( int i = limit; i < len; i++ )
	{
		sum[0] += a[i] * b[i];
	}

	return sum[0] + sum[1] + sum[2] + sum[3];
}
// Scalar reference version of the element-wise multiply:
// writes a[i] * b[i] into c[i] for every i in [0, len), front to back.
// A len of zero or less leaves c untouched.
void
NonSimdMul( float *a, float *b, float *c, int len )
{
	const float *lhs = a;
	const float *rhs = b;
	float *out = c;

	// walk all three arrays in lock-step, one float per pass
	for( int remaining = len; remaining > 0; remaining-- )
	{
		*out++ = *lhs++ * *rhs++;
	}
}
// Scalar reference version of multiply-and-sum:
// returns a[0]*b[0] + a[1]*b[1] + ... + a[len-1]*b[len-1], accumulated
// left to right in a single float. A len of zero or less yields 0.
float
NonSimdMulSum( float *a, float *b, int len )
{
	float total = 0.;
	int idx = 0;

	while( idx < len )
	{
		total += a[idx] * b[idx];
		idx++;
	}

	return total;
}
#ifndef SIMD_H
#define SIMD_H

// SSE stands for Streaming SIMD Extensions

// Number of floats handled per SSE step: the SIMD routines advance their
// loops by this amount, and one 16-byte XMM register holds 4 floats.
#define SSE_WIDTH	4

// Append to a declaration to request 16-byte alignment (GCC/Clang attribute syntax).
#define ALIGNED		__attribute__((aligned(16)))

// c[i] = a[i] * b[i] for i in [0, len), using SSE for the bulk of the work.
void	SimdMul( float *a, float *b, float *c, int len );

// Returns the sum of a[i] * b[i] for i in [0, len), using SSE for the bulk of the work.
float	SimdMulSum( float *a, float *b, int len );

// Scalar version of SimdMul.
void	NonSimdMul( float *a, float *b, float *c, int len );

// Scalar version of SimdMulSum.
float	NonSimdMulSum( float *a, float *b, int len );

#endif		// SIMD_H
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment