Skip to content

Instantly share code, notes, and snippets.

@mmozeiko mmozeiko/floor.c
Last active Nov 7, 2019

Embed
What would you like to do?
various floor functions with SSE1 or SSE2
// various SSE1 and SSE2 floor functions for float
// assumes only normal input values
// does not handle NAN or INF - they may or may not work
// only guarantee is that 0, subnormal or normal float values will work
#include <emmintrin.h>
#ifdef _MSC_VER
#include <intrin.h>
#define NOINLINE __declspec(noinline)
#else
#define __rdtsc() __builtin_ia32_rdtsc()
#define NOINLINE __attribute__((noinline))
#endif
#include <stdio.h>
#include <math.h>
// no inlining prevents the compiler from vectorizing the benchmark loop
// Reference implementation: plain libm floorf.
// NOINLINE keeps it out of line so the benchmark loop cannot be
// auto-vectorized around it.
static float NOINLINE std_floor(float x)
{
    float result = floorf(x);
    return result;
}
// full float range
// floor() over the full float range using SSE1 only.
// Values with |x| > 2^23 carry no fraction bits and pass through unchanged.
static float sse_floor(float x)
{
    __m128 sign_mask = _mm_set1_ps(-0.f);
    __m128 one       = _mm_set1_ps(1.f);
    __m128 two23     = _mm_set1_ps(8388608.f); // 2^23
    __m128 v = _mm_set_ss(x);
    // Round to integer by pushing the value through 2^23 in both
    // directions (the second pass keeps negative inputs exact).
    __m128 rounded = _mm_add_ss(v, two23);
    rounded = _mm_sub_ss(rounded, two23);
    rounded = _mm_sub_ss(rounded, two23);
    rounded = _mm_add_ss(rounded, two23);
    // Rounding may have gone up; step down by one where v < rounded.
    __m128 went_up = _mm_cmplt_ss(v, rounded);
    rounded = _mm_sub_ss(rounded, _mm_and_ps(went_up, one));
    // For |x| > 2^23 the input is already integral: select v instead.
    __m128 huge = _mm_cmplt_ss(two23, _mm_andnot_ps(sign_mask, v));
    rounded = _mm_or_ps(_mm_and_ps(huge, v), _mm_andnot_ps(huge, rounded));
    return _mm_cvtss_f32(rounded);
}
// only non-negative floats (x >= 0.f)
// floor() for non-negative inputs only (x >= 0.f): a single 2^23
// round-trip suffices since the sign is known.
static float sse_floor_pos(float x)
{
    __m128 one   = _mm_set1_ps(1.f);
    __m128 two23 = _mm_set1_ps(8388608.f); // 2^23
    __m128 v = _mm_set_ss(x);
    // round to nearest integer via the 2^23 trick
    __m128 r = _mm_sub_ss(_mm_add_ss(v, two23), two23);
    // if rounding went up, subtract one to land on floor
    r = _mm_sub_ss(r, _mm_and_ps(_mm_cmplt_ss(v, r), one));
    // inputs above 2^23 are already integral: select them unchanged
    __m128 huge = _mm_cmplt_ss(two23, v);
    r = _mm_or_ps(_mm_and_ps(huge, v), _mm_andnot_ps(huge, r));
    return _mm_cvtss_f32(r);
}
// only non-positive floats (x <= 0.f)
// floor() for non-positive inputs only (x <= 0.f): mirrors the positive
// variant with a negated 2^23 constant.
static float sse_floor_neg(float x)
{
    __m128 one         = _mm_set1_ps(1.f);
    __m128 minus_two23 = _mm_set1_ps(-8388608.f); // -2^23
    __m128 v = _mm_set_ss(x);
    // round to nearest integer via the -2^23 round-trip
    __m128 r = _mm_sub_ss(_mm_add_ss(v, minus_two23), minus_two23);
    // if rounding went up, subtract one to land on floor
    r = _mm_sub_ss(r, _mm_and_ps(_mm_cmplt_ss(v, r), one));
    // below -2^23 the value is already integral: pass it through
    __m128 huge = _mm_cmplt_ss(v, minus_two23);
    r = _mm_or_ps(_mm_and_ps(huge, v), _mm_andnot_ps(huge, r));
    return _mm_cvtss_f32(r);
}
// only [-8388608 .. +8388608] range
// floor() restricted to the [-8388608 .. +8388608] range (SSE1 only).
// Fix: the original declared kSignBit but never used it (dead local,
// flagged by -Wunused-variable); removed.
static float sse_floor_small(float x)
{
    __m128 kOne        = _mm_set1_ps(1.f);
    __m128 kNoFraction = _mm_set1_ps(8388608.f); // 2^23
    __m128 f = _mm_set_ss(x);
    __m128 r = f;
    // Round to the nearest integer by a double round-trip through 2^23;
    // the second sub/add pair keeps negative inputs exact.
    r = _mm_add_ss(r, kNoFraction);
    r = _mm_sub_ss(r, kNoFraction);
    r = _mm_sub_ss(r, kNoFraction);
    r = _mm_add_ss(r, kNoFraction);
    // If rounding went up, step down by one to get floor.
    r = _mm_sub_ss(r, _mm_and_ps(kOne, _mm_cmplt_ss(f, r)));
    return _mm_cvtss_f32(r);
}
// only [0 .. +8388608] range
// floor() restricted to [0 .. +8388608]: one 2^23 round-trip, no range
// selection needed.
static float sse_floor_small_pos(float x)
{
    __m128 one   = _mm_set1_ps(1.f);
    __m128 two23 = _mm_set1_ps(8388608.f); // 2^23
    __m128 v = _mm_set_ss(x);
    // round to nearest integer via the 2^23 trick
    __m128 r = _mm_sub_ss(_mm_add_ss(v, two23), two23);
    // if rounding went up, subtract one to land on floor
    r = _mm_sub_ss(r, _mm_and_ps(_mm_cmplt_ss(v, r), one));
    return _mm_cvtss_f32(r);
}
// only [-8388608 .. 0] range
// floor() restricted to [-8388608 .. 0]: one -2^23 round-trip, no range
// selection needed.
static float sse_floor_small_neg(float x)
{
    __m128 one         = _mm_set1_ps(1.f);
    __m128 minus_two23 = _mm_set1_ps(-8388608.f); // -2^23
    __m128 v = _mm_set_ss(x);
    // round to nearest integer via the -2^23 round-trip
    __m128 r = _mm_sub_ss(_mm_add_ss(v, minus_two23), minus_two23);
    // if rounding went up, subtract one to land on floor
    r = _mm_sub_ss(r, _mm_and_ps(_mm_cmplt_ss(v, r), one));
    return _mm_cvtss_f32(r);
}
// full float range
// floor() over the full float range using the SSE2 int32 round-trip.
// Values with |x| >= 2^31 overflow the int32 path but are already
// integral, so they are passed through unchanged.
static float sse2_floor(float x)
{
    __m128 sign_mask = _mm_set1_ps(-0.f);
    __m128 one       = _mm_set1_ps(1.f);
    __m128 limit     = _mm_set1_ps(2147483648.f); // 2^31
    __m128 v = _mm_set_ss(x);
    // truncate toward zero via int32, then correct downward where the
    // truncation landed above the input (negative fractions)
    __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
    __m128 went_up = _mm_cmplt_ss(v, truncated);
    __m128 r = _mm_sub_ss(truncated, _mm_and_ps(went_up, one));
    // out-of-range magnitudes: select the original value
    __m128 out_of_range = _mm_cmple_ss(limit, _mm_andnot_ps(sign_mask, v));
    r = _mm_or_ps(_mm_and_ps(out_of_range, v), _mm_andnot_ps(out_of_range, r));
    return _mm_cvtss_f32(r);
}
// only [-2147483648 .. +2147483647) range (actual range interval endpoints are truncated to values that fits in float)
// or in other words: if floor(x) fits into int32_t, then result will be correct
// floor() for inputs whose floor fits in int32_t (SSE2 round-trip,
// no range guard).
static float sse2_floor_small(float x)
{
    __m128 one = _mm_set_ss(1.f);
    __m128 v = _mm_set_ss(x);
    // truncate toward zero via int32 conversion
    __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
    // floor = trunc, minus one when truncation rounded up (v < trunc)
    __m128 fixup = _mm_and_ps(_mm_cmplt_ss(v, truncated), one);
    return _mm_cvtss_f32(_mm_sub_ss(truncated, fixup));
}
// only [0 .. +2147483647) range (actual range interval endpoint is truncated to value that fits in float)
// same as above, but only for non-negative values
// floor() for non-negative inputs below 2^31: for x >= 0 truncation
// toward zero IS the floor, so a single int32 round-trip does it all.
static float sse2_floor_small_pos(float x)
{
    __m128 v = _mm_set_ss(x);
    return _mm_cvtss_f32(_mm_cvtepi32_ps(_mm_cvttps_epi32(v)));
}
// Reinterpret the bits of a 32-bit unsigned integer as a float.
// Union type punning is well-defined in C (unlike C++).
static float u2f(unsigned int u)
{
    union
    {
        float f;
        unsigned int u;
    } pun;
    pun.u = u;
    return pun.f;
}
// Reinterpret the bits of a float as a 32-bit unsigned integer.
// Union type punning is well-defined in C (unlike C++).
static unsigned int f2u(float f)
{
    union
    {
        float f;
        unsigned int u;
    } pun;
    pun.f = f;
    return pun.u;
}
// total benchmark array size = 1MB, reasonable value to fully fit in CPU cache?
#define BENCH_COUNT (256 * 1024)
// no inlining prevents the compiler from optimizing the benchmark loop to a NOP
// Returns the shared benchmark input array (static storage, so every
// element is zero-initialized: the benchmark floors the value 0.0f).
// NOINLINE hides the array's contents from the benchmark loop optimizer.
static NOINLINE float* bench_data()
{
    static float samples[BENCH_COUNT];
    return samples;
}
// Verifies func(f) == expected with a bit-exact float compare; on
// mismatch prints the input as both a float and its raw bit pattern,
// then returns 0 from the enclosing function.  Wrapped in do/while(0)
// so it behaves as a single statement.
#define CHECK(f, x, expected, func) do \
{ \
float actual = func(f); \
if (actual != expected) \
{ \
printf("ERROR: f=%1.8e (0x%08x) floorf=%1.8e %s=%1.8e\n", f, x, expected, #func, actual); \
return 0; \
} \
} while (0)
// Times func over 4096 passes of the benchmark array with rdtsc and
// prints the average cycle count per call.  The volatile round-trip on
// `tmp` forces the accumulated sum to be materialized, so the compiler
// cannot discard the loop; t2 is read only after the final store, so
// that store is included in the measurement.
#define BENCH(func) do \
{ \
volatile float tmp = 0.f; \
float f = tmp; \
unsigned long long t1 = __rdtsc(); \
for (int k=0; k<4096; k++) \
{ \
float* data = bench_data(); \
for (int i=0; i<BENCH_COUNT; i++) \
{ \
f += func(data[i]); \
} \
} \
tmp = f; \
unsigned long long t2 = __rdtsc(); \
printf("%-23s%5.1f\n", #func, (double)(t2 - t1) / BENCH_COUNT / 4096); \
} while (0)
// Part 1: exhaustively verifies every 32-bit pattern against floorf
// (each variant only on its documented input range, and only for
// zero/subnormal/normal values — NaN/Inf are excluded per the header).
// Part 2: warms the cache, then runs the rdtsc micro-benchmarks.
int main()
{
// volatile keeps the compiler from folding the exhaustive loop away
volatile unsigned int x = 0;
do
{
// progress indicator: one dot per 2^26 patterns tested
if ((x % (1<<26) == 0))
{
printf(".");
}
float f = u2f(x);
int c = fpclassify(f);
if (c == FP_ZERO || c == FP_NORMAL || c == FP_SUBNORMAL)
{
// std_floor (libm floorf) is the reference result
float expected = std_floor(f);
CHECK(f, x, expected, sse_floor);
// range-restricted variants are checked only on their valid ranges
if (f >= 0.f)
{
CHECK(f, x, expected, sse_floor_pos);
}
if (f <= 0.f)
{
CHECK(f, x, expected, sse_floor_neg);
}
if (f >= -8388608.f && f <= 8388608.f)
{
CHECK(f, x, expected, sse_floor_small);
}
if (f >= 0.f && f <= 8388608.f)
{
CHECK(f, x, expected, sse_floor_small_pos);
}
if (f >= -8388608.f && f <= 0.f)
{
CHECK(f, x, expected, sse_floor_small_neg);
}
CHECK(f, x, expected, sse2_floor);
if (f >= -2147483648.f && f < +2147483647.f)
{
CHECK(f, x, expected, sse2_floor_small);
}
if (f >= 0.f && f < +2147483647.f)
{
CHECK(f, x, expected, sse2_floor_small_pos);
}
}
x++;
}
while (x != 0); // unsigned wraparound to 0 after all 2^32 patterns
printf("\n");
// go over array just to warm up caches
{
volatile float tmp = 0.f;
float f = tmp;
float* data = bench_data();
for (int i=0; i<BENCH_COUNT; i++)
{
f += floorf(data[i]);
}
tmp = f;
}
// NOTE: don't fully trust benchmark results - verify with your own code and data;
// this benchmark only floors the value "0", which is repeated in a big array
BENCH(std_floor);
BENCH(sse_floor);
BENCH(sse_floor_pos);
BENCH(sse_floor_neg);
BENCH(sse_floor_small);
BENCH(sse_floor_small_pos);
BENCH(sse_floor_small_neg);
BENCH(sse2_floor);
BENCH(sse2_floor_small);
BENCH(sse2_floor_small_pos);
}
@mmozeiko

This comment has been minimized.

Copy link
Owner Author

mmozeiko commented Jun 16, 2018

micro benchmarking on i7-6700K

gcc 7.3.0 (with -O2 -ffast-math)

std_floor                6.2
sse_floor                6.2
sse_floor_pos            4.5
sse_floor_neg            4.7
sse_floor_small          4.2
sse_floor_small_pos      3.9
sse_floor_small_neg      3.9
sse2_floor               5.1
sse2_floor_small         3.9
sse2_floor_small_pos     3.9

msvc 2017, cl.exe v19.14.26430 (with /O2 /fp:fast)

std_floor                7.6
sse_floor                5.9
sse_floor_pos            4.4
sse_floor_neg            4.4
sse_floor_small          4.0
sse_floor_small_pos      3.9
sse_floor_small_neg      3.9
sse2_floor               4.5
sse2_floor_small         3.9
sse2_floor_small_pos     3.9

std_floor without "fast-math" options:

gcc  = 7.31
msvc = 7.73
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.