Skip to content

Instantly share code, notes, and snippets.

Created May 18, 2013 17:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save anonymous/5605201 to your computer and use it in GitHub Desktop.
Save anonymous/5605201 to your computer and use it in GitHub Desktop.
// clx /O2 1.cpp
#include <stdio.h>
#include <windows.h>
#include <tmmintrin.h> // SSSE3
#include <smmintrin.h> // SSE4.1
#include <math.h>
// Number of ints in each input array (16 Mi elements). It is a power of two,
// which the test loops rely on when wrapping an index with `& (BLOCK-1)`.
const int BLOCK = 16*1024*1024;
// Upper bound of the loop counter in each test pass.
const int RUNS = 100*1000*1000;
// Input operands, BLOCK ints each; main() fills them with pseudo-random values.
int * a = new int[BLOCK];
int * b = new int[BLOCK];
// Returns -1 for negative x and +1 otherwise (zero counts as positive).
int sign(int x)
{
    if (x < 0) {
        return -1;
    }
    return 1;
}
// Prints the label `n` followed by the four signed 32-bit lanes of `v`,
// lane 0 first, on a single line.
void dump(const char * n, __m128i v)
{
    const int lane0 = v.m128i_i32[0];
    const int lane1 = v.m128i_i32[1];
    const int lane2 = v.m128i_i32[2];
    const int lane3 = v.m128i_i32[3];
    printf("%s: %d, %d, %d, %d\n", n, lane0, lane1, lane2, lane3);
}
// Horizontal add: returns the sum of the four signed 32-bit lanes of `v`.
int sum(__m128i v)
{
    int total = v.m128i_i32[0];
    for (int lane = 1; lane < 4; ++lane) {
        total += v.m128i_i32[lane];
    }
    return total;
}
// Four-lane LLR combine: for each lane computes
//     sign(a) * sign(b) * min(|a|, |b|)
// on signed 32-bit integers.
//
// pa, pb : each points at 4 consecutive ints; no alignment is required.
// returns: the four results, lane i built from pa[i] and pb[i].
static inline __m128i LLR4(const int * pa, const int * pb)
{
    // Unaligned loads: a plain dereference of a __m128i* demands 16-byte
    // alignment, which the callers' buffers are not guaranteed to have.
    const __m128i va = _mm_loadu_si128((const __m128i *)pa);
    const __m128i vb = _mm_loadu_si128((const __m128i *)pb);

    // min(|a|, |b|). The absolute values are compared as unsigned so that
    // |INT_MIN| (bit pattern 0x80000000) is treated as the largest magnitude
    // instead of the smallest.
    const __m128i mag = _mm_min_epu32(_mm_abs_epi32(va), _mm_abs_epi32(vb));

    // Sign selector: a ^ b is negative exactly when the operand signs differ.
    // Unlike the low 32 bits of a*b it cannot be corrupted by overflow.
    // OR-ing in 1 keeps the selector non-zero, because _mm_sign_epi32 zeroes
    // any lane whose selector is 0; when an operand is 0 the magnitude is
    // already 0, so the result is still 0.
    const __m128i sel = _mm_or_si128(_mm_xor_si128(va, vb), _mm_set1_epi32(1));

    return _mm_sign_epi32(mag, sel);
}
// Scalar, branch-based LLR combine: returns
//     sign(a) * sign(b) * min(|a|, |b|)
// A zero operand yields 0. Written with plain comparisons so it does not rely
// on the non-standard `__min` macro.
static inline int LLR(int a, int b)
{
    if (a > 0) {
        if (b > 0) {
            return (a < b) ? a : b;
        }
        // a positive, b non-positive: the result is non-positive.
        const int nb = -b;
        return -((a < nb) ? a : nb);
    }

    const int na = -a;
    if (b > 0) {
        // a non-positive, b positive: the result is non-positive.
        return -((na < b) ? na : b);
    }
    // Both non-positive: the result is non-negative.
    const int nb = -b;
    return (na < nb) ? na : nb;
}
// Branch-free LLR combine using shifts and multiplies: returns
//     sign(a) * sign(b) * min(|a|, |b|)
static inline int LLR_2(int a, int b)
{
    const unsigned ua = (unsigned)a;
    const unsigned ub = (unsigned)b;

    // +1 when the sign bits agree, -1 when they differ.
    const int result_sign = (int)(1u - 2u * ((ua ^ ub) >> 31));

    // Absolute values: multiply by -1 when the sign bit is set, else by +1.
    // The arithmetic is carried out in unsigned and converted back to int.
    const int mag_a = (int)(ua * (1u - 2u * (ua >> 31)));
    const int mag_b = (int)(ub * (1u - 2u * (ub >> 31)));

    // Slot 0 holds |b| and slot 1 holds |a|; the sign bit of |a| - |b|
    // selects slot 1 exactly when |a| is the smaller magnitude.
    const int pick[2] = { mag_b, mag_a };
    const int diff = mag_a - mag_b;

    return result_sign * pick[((unsigned)diff) >> 31];
}
// Lookup tables indexed by a 0/1 flag. XOR-ing a value with mask[f] and then
// adding constant[f] leaves it unchanged for f == 0 and forms its two's
// complement negation for f == 1.
static unsigned int mask[2] = { 0u, ~0u };
static unsigned int constant[2] = { 0u, 1u };
// Branch-free LLR combine using rotates and table lookups: returns
//     sign(a) * sign(b) * min(|a|, |b|)
// `_rotl(x,1) & 1` extracts the sign bit of x; mask[]/constant[] then apply a
// conditional two's complement negation ((x ^ mask[f]) + constant[f]).
static inline int LLR_3(int a, int b)
{
    // 1 when the operands have different signs, i.e. the result is negated.
    const int negate = (_rotl(a ^ b, 1) & 1);

    // |a|: negate a when its sign bit is set.
    int bit = ((_rotl(a, 1) & 1));
    const int mag_a = (a ^ mask[bit]) + constant[bit];

    // |b|: same treatment.
    bit = ((_rotl(b, 1) & 1));
    const int mag_b = (b ^ mask[bit]) + constant[bit];

    // Slot 0 holds |b|, slot 1 holds |a|; the sign bit of |a| - |b| selects
    // slot 1 exactly when |a| is the smaller magnitude.
    const int pick[2] = { mag_b, mag_a };
    bit = ((_rotl(mag_a - mag_b, 1) & 1));

    return (pick[bit] ^ mask[negate]) + constant[negate];
}
// Benchmark kernel for LLR(): walks the loop counter up to RUNS, wrapping the
// array index into [0, BLOCK) with a bit mask, and returns the running sum of
// the results.
int Test1()
{
    int total = 0;
    for (int n = 0; n < RUNS; ++n)
    {
        const int idx = n & (BLOCK - 1);
        total += LLR(a[idx], b[idx]);
    }
    return total;
}
// Benchmark kernel for LLR_2(): walks the loop counter up to RUNS, wrapping
// the array index into [0, BLOCK) with a bit mask, and returns the running sum
// of the results.
int Test2()
{
    int total = 0;
    for (int n = 0; n < RUNS; ++n)
    {
        const int idx = n & (BLOCK - 1);
        total += LLR_2(a[idx], b[idx]);
    }
    return total;
}
// Benchmark kernel for LLR_3(): walks the loop counter up to RUNS, wrapping
// the array index into [0, BLOCK) with a bit mask, and returns the running sum
// of the results.
int Test3()
{
    int total = 0;
    for (int n = 0; n < RUNS; ++n)
    {
        const int idx = n & (BLOCK - 1);
        total += LLR_3(a[idx], b[idx]);
    }
    return total;
}
// Benchmark kernel for LLR4(): processes four elements per iteration,
// accumulating the per-lane results in a vector and returning the sum of the
// four lanes at the end. The element offset is wrapped into [0, BLOCK) with a
// bit mask.
int Test4()
{
    __m128i acc = _mm_setzero_si128();
    for (int base = 0; base < RUNS; base += 4)
    {
        const int off = base & (BLOCK - 1);
        acc = _mm_add_epi32(acc, LLR4(a + off, b + off));
    }
    return sum(acc);
}
void Bench(int (*fnTest)())
{
const int TRIALS = 10;
float mintime = 1000000.0f;
int x;
for (int i=0; i<TRIALS; i++)
{
__int64 t1, t2;
QueryPerformanceCounter((LARGE_INTEGER*)&t1);
x = fnTest();
QueryPerformanceCounter((LARGE_INTEGER*)&t2);
t2 -= t1;
QueryPerformanceFrequency((LARGE_INTEGER*)&t1);
mintime = min(mintime, t2/(t1*1.));
}
printf("%f sec, x=%d\n", mintime, x);
}
// Entry point: fills both input arrays with pseudo-random values, raises the
// process and thread priority, then benchmarks each LLR implementation in turn.
int main()
{
    // Fixed seed, so every run sees the same input data.
    srand(0);
    for ( int i=0; i<BLOCK; i++ )
    {
        // Shift rand()'s [0, RAND_MAX] range so values fall on both sides of 0.
        a[i] = rand() - RAND_MAX / 2;
        b[i] = rand() - RAND_MAX / 2;
    }

    // NOTE(review): presumably raised to reduce scheduling noise in the
    // timings; the return values are not checked.
    SetPriorityClass(GetCurrentProcess(),REALTIME_PRIORITY_CLASS);
    SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_TIME_CRITICAL);

    Bench(Test1);
    Bench(Test2);
    Bench(Test3);
    Bench(Test4);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment