LebedevRI/cbrtf.c

## cbrtf.c
/*
 * $ rm a.out; gcc -Wall -lm -O3 -fopenmp main.c && time ./a.out
 */

#include <stdio.h>      // for printf
#include <string.h>
#include <math.h>
#include <xmmintrin.h>  // for __m128, _mm_mul_ps, _mm_set1_ps, _mm_add_ps

#define CLAMP(x, low, high)  (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x)))

#define _DATAPOINTS 1e9
#define STOPS ((size_t)(round(_DATAPOINTS)))

/*
 * interpolate values from p1 and p2 into out.
 */
static float interpolate(const float min, // the smaller tuning
                         const float max, // the larger tuning (can't be == p1)
                         const size_t i)
{
  // stupid linear interpolation.
  // to be confirmed.
  const double t = (((double)i) / ((double)(STOPS-1)));
	return (1.0 - t) * min + t * max;
}

static inline float cbrtf_fast(const float x)
{
	union convert
  {
    float f;
    int i;
  } data, dataout;

	data.f = x;

  // approximate cbrtf(x):
  dataout.i =
					(
						(
							(int)
							(
								(
									(float)
									(
										data.i
									)
								)
								/
								3.0f
							)
						)
						+
						709921077
					);

	return dataout.f;
}

static inline __m128 cbrtf_sse(const __m128 x)
{
  // approximate cbrtf(x):
  return _mm_castsi128_ps(
      _mm_add_epi32(_mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(_mm_castps_si128(x)), _mm_set1_ps(3.0f))),
                    _mm_set1_epi32(709921077)));
}

int main()
{
	const float min = -1.0f;
	const float max = 2.0f;

	printf("STOPS = %zu\n\n", STOPS);

	// go through all x
#ifdef _OPENMP
#pragma omp parallel for simd schedule(static)
#endif
	for(size_t i = 0; i < (STOPS); i++)
	{
		const float x = interpolate(min, max, i);

		const __m128 X = _mm_set1_ps(x);
		const __m128 c_SSE = cbrtf_sse(X);
		float c_float[4];
		_mm_store_ps(c_float, c_SSE);

		const float c = cbrtf_fast(x);

		if(c != c_float[0]) printf("%f: %f != %f, diff = %e\n", x, c, c_float[0], fmax((double)c, (double)c_float[0]) - fmin((double)c, (double)c_float[0]));
		//printf("\tref = %f, diff(sse) = %e, diff(plain) = %e\n", cbrtf(x), fmax((double)cbrtf(x), (double)c_float[0]) - fmin((double)cbrtf(x), (double)c_float[0]), fmax((double)cbrtf(x), (double)c) - fmin((double)cbrtf(x), (double)c));
	}
}
	/*
	* $ rm a.out; gcc -Wall -lm -O3 -fopenmp main.c && time ./a.out
	*/

	#include <stdio.h> // for printf
	#include <string.h>
	#include <math.h>
	#include <xmmintrin.h> // for __m128, _mm_mul_ps, _mm_set1_ps, _mm_add_ps

	#define CLAMP(x, low, high) (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x)))

	#define _DATAPOINTS 1e9
	#define STOPS ((size_t)(round(_DATAPOINTS)))

	/*
	* interpolate values from p1 and p2 into out.
	*/
	static float interpolate(const float min, // the smaller tuning
	const float max, // the larger tuning (can't be == p1)
	const size_t i)
	{
	// stupid linear interpolation.
	// to be confirmed.
	const double t = (((double)i) / ((double)(STOPS-1)));
	return (1.0 - t) * min + t * max;
	}

	static inline float cbrtf_fast(const float x)
	{
	union convert
	{
	float f;
	int i;
	} data, dataout;

	data.f = x;

	// approximate cbrtf(x):
	dataout.i =
	(
	(
	(int)
	(
	(
	(float)
	(
	data.i
	)
	)
	/
	3.0f
	)
	)
	+
	709921077
	);

	return dataout.f;
	}

	static inline __m128 cbrtf_sse(const __m128 x)
	{
	// approximate cbrtf(x):
	return _mm_castsi128_ps(
	_mm_add_epi32(_mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(_mm_castps_si128(x)), _mm_set1_ps(3.0f))),
	_mm_set1_epi32(709921077)));
	}

	int main()
	{
	const float min = -1.0f;
	const float max = 2.0f;

	printf("STOPS = %zu\n\n", STOPS);

	// go through all x
	#ifdef _OPENMP
	#pragma omp parallel for simd schedule(static)
	#endif
	for(size_t i = 0; i < (STOPS); i++)
	{
	const float x = interpolate(min, max, i);

	const __m128 X = _mm_set1_ps(x);
	const __m128 c_SSE = cbrtf_sse(X);
	float c_float[4];
	_mm_store_ps(c_float, c_SSE);

	const float c = cbrtf_fast(x);

	if(c != c_float[0]) printf("%f: %f != %f, diff = %e\n", x, c, c_float[0], fmax((double)c, (double)c_float[0]) - fmin((double)c, (double)c_float[0]));
	//printf("\tref = %f, diff(sse) = %e, diff(plain) = %e\n", cbrtf(x), fmax((double)cbrtf(x), (double)c_float[0]) - fmin((double)cbrtf(x), (double)c_float[0]), fmax((double)cbrtf(x), (double)c) - fmin((double)cbrtf(x), (double)c));
	}
	}