mikusp/accelerate_cuda_function.h

## accelerate_cuda_function.h
/* -----------------------------------------------------------------------------
 *
 * Module      : Function
 * Copyright   : [2008..2014] Manuel M T Chakravarty, Gabriele Keller
 *               [2009..2014] Trevor L. McDonell
 * License     : BSD3
 *
 * Maintainer  : Trevor L. McDonell <tmcdonell@cse.unsw.edu.au>
 * Stability   : experimental
 *
 * ---------------------------------------------------------------------------*/

#ifndef __ACCELERATE_CUDA_FUNCTION_H__
#define __ACCELERATE_CUDA_FUNCTION_H__

#ifndef __CUDACC_RTC__
#include <cuda_runtime.h>
#endif

#include "accelerate_cuda_type.h"

#ifdef __cplusplus

/* -----------------------------------------------------------------------------
 * Device functions required to support generated code
 * -------------------------------------------------------------------------- */

/*
 * Type coercion
 */
template <typename T>
static __inline__ __device__ Word32 reinterpret32(const T x)
{
    union { T a; Word32 b; } u;

    u.a = x;
    return u.b;
}

template <>
__inline__ __device__ Word32 reinterpret32(const Word32 x)
{
    return x;
}

template <>
__inline__ __device__ Word32 reinterpret32(const float x)
{
    return __float_as_int(x);
}

template <typename T>
static __inline__ __device__ Word64 reinterpret64(const T x)
{
    union { T a; Word64 b; } u;

    u.a = x;
    return u.b;
}

template <>
__inline__ __device__ Word64 reinterpret64(const Word64 x)
{
    return x;
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130
template <>
__inline__ __device__ Word64 reinterpret64(const double x)
{
    return __double_as_longlong(x);
}
#endif


/*
 * Atomic compare-and-swap, with type coercion
 */
template <typename T>
static __inline__ __device__ T atomicCAS32(T* address, T compare, T val)
{
    union { T a; Word32 b; } u;

    u.b = atomicCAS((Word32*) address, reinterpret32<T>(compare), reinterpret32<T>(val));
    return u.a;
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 110
template <>
__inline__ __device__ Int32 atomicCAS32(Int32* address, Int32 compare, Int32 val)
{
    return atomicCAS(address, compare, val);
}

template <>
__inline__ __device__ Word32 atomicCAS32(Word32* address, Word32 compare, Word32 val)
{
    return atomicCAS(address, compare, val);
}
#endif

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 120
template <typename T>
static __inline__ __device__ T atomicCAS64(T* address, T compare, T val)
{
    union { T a; Word64 b; } u;

    u.b = atomicCAS((Word64*) address, reinterpret64<T>(compare), reinterpret64<T>(val));
    return u.a;
}

template <>
__inline__ __device__ Word64 atomicCAS64(Word64* address, Word64 compare, Word64 val)
{
    return atomicCAS(address, compare, val);
}
#endif


#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
/*
 * Warp shuffle functions
 */
template <typename T>
static __inline__ __device__ T shfl_up32(T var, unsigned int delta, int width=warpSize)
{
    union { T a; Int32 b; } u, v;

    v.a = var;
    u.b = __shfl_up(v.b, delta, warpSize);

    return u.a;
}

template <>
__inline__ __device__ int shfl_up32(int var, unsigned int delta, int width)
{
    return __shfl_up(var, delta, width);
}

template <>
__inline__ __device__ float shfl_up32(float var, unsigned int delta, int width)
{
    return __shfl_up(var, delta, width);
}


template <typename T>
static __inline__ __device__ T shfl_up64(T var, unsigned int delta, int width=warpSize)
{
    union { T a; struct { Int32 lo; Int32 hi; }; } u, v;

    v.a  = var;
    u.lo = __shfl_up(v.lo, delta, warpSize);
    u.hi = __shfl_up(v.hi, delta, warpSize);

    return u.a;
}


template <typename T>
static __inline__ __device__ T shfl_xor32(T var, int laneMask, int width=warpSize)
{
    union { T a; Int32 b; } u, v;

    v.a = var;
    u.b = __shfl_xor(v.b, laneMask, warpSize);

    return u.a;
}

template <>
__inline__ __device__ int shfl_xor32(int var, int laneMask, int width)
{
    return __shfl_xor(var, laneMask, width);
}

template <>
__inline__ __device__ float shfl_xor32(float var, int laneMask, int width)
{
    return __shfl_xor(var, laneMask, width);
}


template <typename T>
static __inline__ __device__ T shfl_xor64(T var, int laneMask, int width=warpSize)
{
    union { T a; struct { Int32 lo; Int32 hi; }; } u, v;

    v.a  = var;
    u.lo = __shfl_xor(v.lo, laneMask, warpSize);
    u.hi = __shfl_xor(v.hi, laneMask, warpSize);

    return u.a;
}
#endif


#if 0
/* -----------------------------------------------------------------------------
 * Additional helper functions
 * -------------------------------------------------------------------------- */

/*
 * Determine if the input is a power of two
 */
template <typename T>
static __inline__ __host__ __device__ T isPow2(const T x)
{
    return ((x&(x-1)) == 0);
}

/*
 * Compute the next highest power of two
 */
template <typename T>
static __inline__ __host__ T ceilPow2(const T x)
{
#if 0
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
#endif

    return (isPow2(x)) ? x : 1u << (int) ceil(log2((double)x));
}

/*
 * Compute the next lowest power of two
 */
template <typename T>
static __inline__ __host__ T floorPow2(const T x)
{
#if 0
    float nf = (float) n;
    return 1 << (((*(int*)&nf) >> 23) - 127);
#endif

    int exp;
    frexp(x, &exp);
    return 1 << (exp - 1);
}

/*
 * computes next highest multiple of f from x
 */
template <typename T>
static __inline__ __host__ T multiple(const T x, const T f)
{
    return ((x + (f-1)) / f);
}

/*
 * MS Excel-style CEIL() function. Rounds x up to nearest multiple of f
 */
template <typename T>
static __inline__ __host__ T ceiling(const T x, const T f)
{
    return multiple(x, f) * f;
}
#endif

#endif  // __cplusplus
#endif  // __ACCELERATE_CUDA_FUNCTION_H__
	/* -----------------------------------------------------------------------------
	*
	* Module : Function
	* Copyright : [2008..2014] Manuel M T Chakravarty, Gabriele Keller
	* [2009..2014] Trevor L. McDonell
	* License : BSD3
	*
	* Maintainer : Trevor L. McDonell <tmcdonell@cse.unsw.edu.au>
	* Stability : experimental
	*
	* ---------------------------------------------------------------------------*/

	#ifndef __ACCELERATE_CUDA_FUNCTION_H__
	#define __ACCELERATE_CUDA_FUNCTION_H__

	#ifndef __CUDACC_RTC__
	#include <cuda_runtime.h>
	#endif

	#include "accelerate_cuda_type.h"

	#ifdef __cplusplus

	/* -----------------------------------------------------------------------------
	* Device functions required to support generated code
	* -------------------------------------------------------------------------- */

	/*
	* Type coercion
	*/
	template <typename T>
	static __inline__ __device__ Word32 reinterpret32(const T x)
	{
	union { T a; Word32 b; } u;

	u.a = x;
	return u.b;
	}

	template <>
	__inline__ __device__ Word32 reinterpret32(const Word32 x)
	{
	return x;
	}

	template <>
	__inline__ __device__ Word32 reinterpret32(const float x)
	{
	return __float_as_int(x);
	}

	template <typename T>
	static __inline__ __device__ Word64 reinterpret64(const T x)
	{
	union { T a; Word64 b; } u;

	u.a = x;
	return u.b;
	}

	template <>
	__inline__ __device__ Word64 reinterpret64(const Word64 x)
	{
	return x;
	}

	#if !defined(__CUDA_ARCH__) \|\| __CUDA_ARCH__ >= 130
	template <>
	__inline__ __device__ Word64 reinterpret64(const double x)
	{
	return __double_as_longlong(x);
	}
	#endif


	/*
	* Atomic compare-and-swap, with type coercion
	*/
	template <typename T>
	static __inline__ __device__ T atomicCAS32(T* address, T compare, T val)
	{
	union { T a; Word32 b; } u;

	u.b = atomicCAS((Word32*) address, reinterpret32<T>(compare), reinterpret32<T>(val));
	return u.a;
	}

	#if !defined(__CUDA_ARCH__) \|\| __CUDA_ARCH__ >= 110
	template <>
	__inline__ __device__ Int32 atomicCAS32(Int32* address, Int32 compare, Int32 val)
	{
	return atomicCAS(address, compare, val);
	}

	template <>
	__inline__ __device__ Word32 atomicCAS32(Word32* address, Word32 compare, Word32 val)
	{
	return atomicCAS(address, compare, val);
	}
	#endif

	#if !defined(__CUDA_ARCH__) \|\| __CUDA_ARCH__ >= 120
	template <typename T>
	static __inline__ __device__ T atomicCAS64(T* address, T compare, T val)
	{
	union { T a; Word64 b; } u;

	u.b = atomicCAS((Word64*) address, reinterpret64<T>(compare), reinterpret64<T>(val));
	return u.a;
	}

	template <>
	__inline__ __device__ Word64 atomicCAS64(Word64* address, Word64 compare, Word64 val)
	{
	return atomicCAS(address, compare, val);
	}
	#endif


	#if !defined(__CUDA_ARCH__) \|\| __CUDA_ARCH__ >= 300
	/*
	* Warp shuffle functions
	*/
	template <typename T>
	static __inline__ __device__ T shfl_up32(T var, unsigned int delta, int width=warpSize)
	{
	union { T a; Int32 b; } u, v;

	v.a = var;
	u.b = __shfl_up(v.b, delta, warpSize);

	return u.a;
	}

	template <>
	__inline__ __device__ int shfl_up32(int var, unsigned int delta, int width)
	{
	return __shfl_up(var, delta, width);
	}

	template <>
	__inline__ __device__ float shfl_up32(float var, unsigned int delta, int width)
	{
	return __shfl_up(var, delta, width);
	}


	template <typename T>
	static __inline__ __device__ T shfl_up64(T var, unsigned int delta, int width=warpSize)
	{
	union { T a; struct { Int32 lo; Int32 hi; }; } u, v;

	v.a = var;
	u.lo = __shfl_up(v.lo, delta, warpSize);
	u.hi = __shfl_up(v.hi, delta, warpSize);

	return u.a;
	}


	template <typename T>
	static __inline__ __device__ T shfl_xor32(T var, int laneMask, int width=warpSize)
	{
	union { T a; Int32 b; } u, v;

	v.a = var;
	u.b = __shfl_xor(v.b, laneMask, warpSize);

	return u.a;
	}

	template <>
	__inline__ __device__ int shfl_xor32(int var, int laneMask, int width)
	{
	return __shfl_xor(var, laneMask, width);
	}

	template <>
	__inline__ __device__ float shfl_xor32(float var, int laneMask, int width)
	{
	return __shfl_xor(var, laneMask, width);
	}


	template <typename T>
	static __inline__ __device__ T shfl_xor64(T var, int laneMask, int width=warpSize)
	{
	union { T a; struct { Int32 lo; Int32 hi; }; } u, v;

	v.a = var;
	u.lo = __shfl_xor(v.lo, laneMask, warpSize);
	u.hi = __shfl_xor(v.hi, laneMask, warpSize);

	return u.a;
	}
	#endif


	#if 0
	/* -----------------------------------------------------------------------------
	* Additional helper functions
	* -------------------------------------------------------------------------- */

	/*
	* Determine if the input is a power of two
	*/
	template <typename T>
	static __inline__ __host__ __device__ T isPow2(const T x)
	{
	return ((x&(x-1)) == 0);
	}

	/*
	* Compute the next highest power of two
	*/
	template <typename T>
	static __inline__ __host__ T ceilPow2(const T x)
	{
	#if 0
	--x;
	x \|= x >> 1;
	x \|= x >> 2;
	x \|= x >> 4;
	x \|= x >> 8;
	x \|= x >> 16;
	return ++x;
	#endif

	return (isPow2(x)) ? x : 1u << (int) ceil(log2((double)x));
	}

	/*
	* Compute the next lowest power of two
	*/
	template <typename T>
	static __inline__ __host__ T floorPow2(const T x)
	{
	#if 0
	float nf = (float) n;
	return 1 << ((((int)&nf) >> 23) - 127);
	#endif

	int exp;
	frexp(x, &exp);
	return 1 << (exp - 1);
	}

	/*
	* computes next highest multiple of f from x
	*/
	template <typename T>
	static __inline__ __host__ T multiple(const T x, const T f)
	{
	return ((x + (f-1)) / f);
	}

	/*
	* MS Excel-style CEIL() function. Rounds x up to nearest multiple of f
	*/
	template <typename T>
	static __inline__ __host__ T ceiling(const T x, const T f)
	{
	return multiple(x, f) * f;
	}
	#endif

	#endif // __cplusplus
	#endif // __ACCELERATE_CUDA_FUNCTION_H__