allanmac/warp_scan.inl

## warp_scan.inl
#pragma once

//
//
//

//
// PXL_WARP_SCAN_SHFL(_op,_vT,_opT,_regC,_exc,_exc0,_excP)
//
// Default definition: expands to nothing.  It is replaced below when
// compiling device code for sm_30 or newer.
//
#define PXL_WARP_SCAN_SHFL(_op,_vT,_opT,_regC,_exc,_exc0,_excP)

////////////////////////////////////////////////////////////////////////
//
// FOR KEPLER+
//
// Defines the function
//
//   _vT <_excP>_warp_scan_shuffled_<_op>(_vT v)
//
// which scans `v` across the lanes of a warp with the PTX "shfl.up"
// instruction and returns this lane's result.
//
//   _op    PTX opcode mnemonic; pasted into the function name and
//          stringized in front of _opT (add + ".u32" -> "add.u32")
//   _vT    C type of the scanned value
//   _opT   PTX type suffix as a string literal (".u32", ".f32", ...)
//   _regC  inline-asm constraint string for a _vT operand ("r", "f")
//   _exc   when true, the inclusive result is shifted up by one lane
//   _exc0  value given to lanes for which that final shift has no
//          in-range source lane (predicate false)
//   _excP  prefix of the generated function name
//
// Every asm statement opens and closes its own PTX scope and is
// marked volatile: the compiler is allowed to delete or move
// non-volatile asm statements, so the temporaries `t` and `p` must not
// be declared in one statement and used in another.
//
// NOTE(review): the non-".sync" form of shfl is documented as
// deprecated by the PTX ISA and unavailable on sm_70+ -- confirm the
// set of target architectures before relying on this path there.
//

#if (__CUDA_ARCH__ >= 300)

#undef  PXL_WARP_SCAN_SHFL
#define PXL_WARP_SCAN_SHFL(_op,_vT,_opT,_regC,_exc,_exc0,_excP) \
                                                                \
  DEVICE_FUNCTION_QUALIFIERS                                    \
  _vT                                                           \
  _excP ## _warp_scan_shuffled_ ## _op (_vT v)                  \
  {                                                             \
    /* distances 1,2,4,...: combine with the value d lanes   */ \
    /* below, but only when the shuffle predicate is set     */ \
    for (unsigned int d = 1; d < WARP_SIZE; d *= 2)             \
      asm volatile("{                               \n\t"       \
                   "  .reg " _opT " t;              \n\t"       \
                   "  .reg .pred p;                 \n\t"       \
                   "  shfl.up.b32 t|p, %0, %1, 0x0; \n\t"       \
                   "  @p " #_op _opT " %0, t, %0;   \n\t"       \
                   "}                               \n\t"       \
                   : "+" _regC (v) : "r"(d));                   \
                                                                \
    /* exclusive variant: shift up one lane, seed with _exc0 */ \
    if (_exc)                                                   \
      asm volatile("{                                 \n\t"     \
                   "  .reg .pred p;                   \n\t"     \
                   "  shfl.up.b32 %0|p, %0, 0x1, 0x0; \n\t"     \
                   "  @!p mov" _opT " %0, %1;         \n\t"     \
                   "}                                 \n\t"     \
                   : "+" _regC (v) : _regC (_exc0));            \
                                                                \
    return v;                                                   \
  }

#endif

////////////////////////////////////////////////////////////////////////
//
// FOR ALL ARCHITECTURES
//

//
// STORE_IF_LT_WARP_MINUS(l)
//
// Writes the local `v` to `v32[0]`.  When
// SQUELCH_REDUNDANT_SHARED_STORES is defined the store is only issued
// by lanes with `lid < WARP_SIZE - l`.
//
// The macro is not hygienic: it reads `lid`, `v32` and `v` from the
// scope in which it is expanded.  It is wrapped in do/while(0) so that
// it behaves as a single statement (safe in front of an `else`).
//

#ifdef SQUELCH_REDUNDANT_SHARED_STORES //////////

#define STORE_IF_LT_WARP_MINUS(l)               \
  do {                                          \
    if (lid < (WARP_SIZE) - (l))                \
      v32[0] = v;                               \
  } while (0)

#else ///////////////////////////////////////////

#define STORE_IF_LT_WARP_MINUS(l)               \
  do {                                          \
    v32[0] = v;                                 \
  } while (0)

#endif //////////////////////////////////////////

//
// PXL_WARP_SCAN_OP_<op>(a,b)
//
// C expression for the binary operator selected by the `_op` argument
// of PXL_WARP_SCAN_SHARED.  To scan with another operator, define a
// matching PXL_WARP_SCAN_OP_<op> before instantiating the scan.
//
// NOTE(review): for floating-point types the min/max ternaries return
// `b` when it is a NaN, which may differ from the PTX min/max
// instructions -- confirm if NaN inputs can occur.
//
#define PXL_WARP_SCAN_OP_add(a,b)  ((a) + (b))
#define PXL_WARP_SCAN_OP_mul(a,b)  ((a) * (b))
#define PXL_WARP_SCAN_OP_min(a,b)  (((b) < (a)) ? (b) : (a))
#define PXL_WARP_SCAN_OP_max(a,b)  (((a) < (b)) ? (b) : (a))
#define PXL_WARP_SCAN_OP_and(a,b)  ((a) & (b))
#define PXL_WARP_SCAN_OP_or(a,b)   ((a) | (b))
#define PXL_WARP_SCAN_OP_xor(a,b)  ((a) ^ (b))

//
// PXL_WARP_SCAN_SHARED(_op,_vT,_exc,_exc0,_excP,_incLd) defines
//
//   _vT <_excP>_warp_scan_shared_<_op>(_vT v, volatile _vT* const shared32)
//
// which scans across the lanes of a warp through the buffer
// `shared32` and returns this lane's result.  Lane `lid` uses slot
// shared32[lid]; slots 0 .. WARP_SIZE-1 are accessed.
//
//   _op     operator name; pasted into the function name and used to
//           select PXL_WARP_SCAN_OP_<_op>
//   _vT     C type of the scanned value
//   _exc    when true, every lane publishes `v` one slot higher and the
//           last lane writes _exc0 into slot 0 before the scan
//   _exc0   seed value placed in slot 0 when _exc is true
//   _excP   prefix of the generated function name
//   _incLd  when _exc is false: true means the incoming `v` is ignored
//           and the lane's value is loaded from shared32[lid]; false
//           means `v` is stored to shared32[lid] first
//
// The five nested steps (distances 1,2,4,8,16) cover a 32-lane warp.
//
// NOTE(review): no explicit barrier separates the volatile stores from
// the loads of neighbouring slots; this presumably relies on the lanes
// of a warp executing in lock-step -- confirm for the targeted
// architectures.
//
#define PXL_WARP_SCAN_SHARED(_op,_vT,_exc,_exc0,_excP,_incLd)   \
                                                                \
  DEVICE_FUNCTION_QUALIFIERS                                    \
  _vT                                                           \
  _excP ## _warp_scan_shared_ ## _op                            \
  (_vT v, volatile _vT* const shared32)                         \
  {                                                             \
    const unsigned int  lid = laneId();                         \
    volatile _vT* const v32 = shared32 + lid;                   \
                                                                \
    if (_exc)                                                   \
      {                                                         \
        /* shift by one slot; the last lane seeds slot 0 */     \
        if (lid == (WARP_SIZE-1))                               \
          shared32[0] = _exc0;                                  \
        else                                                    \
          v32[1] = v;                                           \
                                                                \
        v = v32[0];                                             \
      }                                                         \
    else if (_incLd)                                            \
      {                                                         \
        v = v32[0];                                             \
      }                                                         \
    else                                                        \
      {                                                         \
        v32[0] = v;                                             \
      }                                                         \
                                                                \
    if (lid >= 1)                                               \
      {                                                         \
        /* read the volatile slot once, then combine */         \
        const _vT lo1 = v32[-1];                                \
        v = PXL_WARP_SCAN_OP_ ## _op (v, lo1);                  \
                                                                \
        STORE_IF_LT_WARP_MINUS(2);                              \
                                                                \
        if (lid >= 2)                                           \
          {                                                     \
            const _vT lo2 = v32[-2];                            \
            v = PXL_WARP_SCAN_OP_ ## _op (v, lo2);              \
                                                                \
            STORE_IF_LT_WARP_MINUS(4);                          \
                                                                \
            if (lid >= 4)                                       \
              {                                                 \
                const _vT lo4 = v32[-4];                        \
                v = PXL_WARP_SCAN_OP_ ## _op (v, lo4);          \
                                                                \
                STORE_IF_LT_WARP_MINUS(8);                      \
                                                                \
                if (lid >= 8)                                   \
                  {                                             \
                    const _vT lo8 = v32[-8];                    \
                    v = PXL_WARP_SCAN_OP_ ## _op (v, lo8);      \
                                                                \
                    STORE_IF_LT_WARP_MINUS(16);                 \
                                                                \
                    if (lid >= 16)                              \
                      {                                         \
                        const _vT lo16 = v32[-16];              \
                        v = PXL_WARP_SCAN_OP_ ## _op (v, lo16); \
                      }                                         \
                  }                                             \
              }                                                 \
          }                                                     \
      }                                                         \
                                                                \
    return v;                                                   \
  }

//
// GENERATE BOTH SHFL AND SHARED WARP SCAN PRIMITIVES
//
// PXL_WARP_SCAN_SHFL expands to nothing (a no-op) unless compiling
// device code for sm_30+.
//

//
// PXL_WARP_SCAN_DECL forwards its arguments to PXL_WARP_SCAN_SHARED
// (which does not take _opT/_regC) and to PXL_WARP_SCAN_SHFL (which
// does not take _incLd).
//
// No ';' is placed between the two expansions: each one is either a
// complete function definition or empty, so a separator would only
// produce an empty declaration.
//
#define PXL_WARP_SCAN_DECL(_op,_vT,_opT,_regC,_exc,_exc0,_excP,_incLd)  \
  PXL_WARP_SCAN_SHARED(_op,_vT,           _exc,_exc0,_excP,_incLd)      \
  PXL_WARP_SCAN_SHFL  (_op,_vT,_opT,_regC,_exc,_exc0,_excP)

//
// DECLARE WARP SCAN PRIMITIVES IN YOUR .cu SOURCE FILE
//

/*

 Examples:

  // inclusive "add.u32" scan:
  //
  //  unsigned int
  //  inc_warp_scan_shuffled_add(unsigned int)
  //
  //  unsigned int
  //  inc_warp_scan_shared_add(unsigned int, volatile unsigned int* const)
  //
  PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", false, 0,    inc, true);

  // exclusive "add.u32" scan:
  //
  //  unsigned int
  //  exc_warp_scan_shuffled_add(unsigned int)
  //
  //  unsigned int
  //  exc_warp_scan_shared_add(unsigned int, volatile unsigned int* const)
  //
  PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", true,  0,    exc, false);

  // exclusive "max.f32" scan:
  //
  //  float
  //  exc_warp_scan_shuffled_max(float)
  //
  //  float
  //  exc_warp_scan_shared_max(float, volatile float* const)
  //
  PXL_WARP_SCAN_DECL(max, float,        ".f32", "f", true,  0.0f, exc, false);

*/
	#pragma once

	//
	//
	//

	//
	// PXL_WARP_SCAN_SHFL(_op,_vT,_opT,_regC,_exc,_exc0,_excP)
	//
	// Default definition: expands to nothing.  It is replaced below when
	// compiling device code for sm_30 or newer.  Any definition already
	// in effect is dropped first so the macro is never redefined with a
	// different body.
	//
	#undef PXL_WARP_SCAN_SHFL
	#define PXL_WARP_SCAN_SHFL(_op,_vT,_opT,_regC,_exc,_exc0,_excP)

	////////////////////////////////////////////////////////////////////////
	//
	// FOR KEPLER+
	//
	// Defines the function
	//
	//   _vT <_excP>_warp_scan_shuffled_<_op>(_vT v)
	//
	// which scans `v` across the lanes of a warp with the PTX "shfl.up"
	// instruction and returns this lane's result.
	//
	//   _op    PTX opcode mnemonic; pasted into the function name and
	//          stringized in front of _opT (add + ".u32" -> "add.u32")
	//   _vT    C type of the scanned value
	//   _opT   PTX type suffix as a string literal (".u32", ".f32", ...)
	//   _regC  inline-asm constraint string for a _vT operand ("r", "f")
	//   _exc   when true, the inclusive result is shifted up by one lane
	//   _exc0  value given to lanes for which that final shift has no
	//          in-range source lane (predicate false)
	//   _excP  prefix of the generated function name
	//
	// Every asm statement opens and closes its own PTX scope and is
	// marked volatile: the compiler is allowed to delete or move
	// non-volatile asm statements, so the temporaries `t` and `p` must not
	// be declared in one statement and used in another.
	//
	// NOTE(review): the non-".sync" form of shfl is documented as
	// deprecated by the PTX ISA and unavailable on sm_70+ -- confirm the
	// set of target architectures before relying on this path there.
	//

	#if (__CUDA_ARCH__ >= 300)

	#undef PXL_WARP_SCAN_SHFL
	#define PXL_WARP_SCAN_SHFL(_op,_vT,_opT,_regC,_exc,_exc0,_excP) \
	                                                                \
	  DEVICE_FUNCTION_QUALIFIERS                                    \
	  _vT                                                           \
	  _excP ## _warp_scan_shuffled_ ## _op (_vT v)                  \
	  {                                                             \
	    /* distances 1,2,4,...: combine with the value d lanes   */ \
	    /* below, but only when the shuffle predicate is set     */ \
	    for (unsigned int d = 1; d < WARP_SIZE; d *= 2)             \
	      asm volatile("{                               \n\t"       \
	                   "  .reg " _opT " t;              \n\t"       \
	                   "  .reg .pred p;                 \n\t"       \
	                   "  shfl.up.b32 t|p, %0, %1, 0x0; \n\t"       \
	                   "  @p " #_op _opT " %0, t, %0;   \n\t"       \
	                   "}                               \n\t"       \
	                   : "+" _regC (v) : "r"(d));                   \
	                                                                \
	    /* exclusive variant: shift up one lane, seed with _exc0 */ \
	    if (_exc)                                                   \
	      asm volatile("{                                 \n\t"     \
	                   "  .reg .pred p;                   \n\t"     \
	                   "  shfl.up.b32 %0|p, %0, 0x1, 0x0; \n\t"     \
	                   "  @!p mov" _opT " %0, %1;         \n\t"     \
	                   "}                                 \n\t"     \
	                   : "+" _regC (v) : _regC (_exc0));            \
	                                                                \
	    return v;                                                   \
	  }

	#endif

	////////////////////////////////////////////////////////////////////////
	//
	// FOR ALL ARCHITECTURES
	//

	//
	// STORE_IF_LT_WARP_MINUS(l)
	//
	// Writes the local `v` to `v32[0]`.  When
	// SQUELCH_REDUNDANT_SHARED_STORES is defined the store is only issued
	// by lanes with `lid < WARP_SIZE - l`.
	//
	// The macro is not hygienic: it reads `lid`, `v32` and `v` from the
	// scope in which it is expanded.  It is wrapped in do/while(0) so that
	// it behaves as a single statement (safe in front of an `else`).
	//

	#ifdef SQUELCH_REDUNDANT_SHARED_STORES //////////

	#define STORE_IF_LT_WARP_MINUS(l)               \
	  do {                                          \
	    if (lid < (WARP_SIZE) - (l))                \
	      v32[0] = v;                               \
	  } while (0)

	#else ///////////////////////////////////////////

	#define STORE_IF_LT_WARP_MINUS(l)               \
	  do {                                          \
	    v32[0] = v;                                 \
	  } while (0)

	#endif //////////////////////////////////////////

	//
	// PXL_WARP_SCAN_OP_<op>(a,b)
	//
	// C expression for the binary operator selected by the `_op` argument
	// of PXL_WARP_SCAN_SHARED.  To scan with another operator, define a
	// matching PXL_WARP_SCAN_OP_<op> before instantiating the scan.
	//
	// NOTE(review): for floating-point types the min/max ternaries return
	// `b` when it is a NaN, which may differ from the PTX min/max
	// instructions -- confirm if NaN inputs can occur.
	//
	#define PXL_WARP_SCAN_OP_add(a,b)  ((a) + (b))
	#define PXL_WARP_SCAN_OP_mul(a,b)  ((a) * (b))
	#define PXL_WARP_SCAN_OP_min(a,b)  (((b) < (a)) ? (b) : (a))
	#define PXL_WARP_SCAN_OP_max(a,b)  (((a) < (b)) ? (b) : (a))
	#define PXL_WARP_SCAN_OP_and(a,b)  ((a) & (b))
	#define PXL_WARP_SCAN_OP_or(a,b)   ((a) | (b))
	#define PXL_WARP_SCAN_OP_xor(a,b)  ((a) ^ (b))

	//
	// PXL_WARP_SCAN_SHARED(_op,_vT,_exc,_exc0,_excP,_incLd) defines
	//
	//   _vT <_excP>_warp_scan_shared_<_op>(_vT v, volatile _vT* const shared32)
	//
	// which scans across the lanes of a warp through the buffer
	// `shared32` and returns this lane's result.  Lane `lid` uses slot
	// shared32[lid]; slots 0 .. WARP_SIZE-1 are accessed.
	//
	//   _op     operator name; pasted into the function name and used to
	//           select PXL_WARP_SCAN_OP_<_op>
	//   _vT     C type of the scanned value
	//   _exc    when true, every lane publishes `v` one slot higher and the
	//           last lane writes _exc0 into slot 0 before the scan
	//   _exc0   seed value placed in slot 0 when _exc is true
	//   _excP   prefix of the generated function name
	//   _incLd  when _exc is false: true means the incoming `v` is ignored
	//           and the lane's value is loaded from shared32[lid]; false
	//           means `v` is stored to shared32[lid] first
	//
	// The five nested steps (distances 1,2,4,8,16) cover a 32-lane warp.
	//
	// NOTE(review): no explicit barrier separates the volatile stores from
	// the loads of neighbouring slots; this presumably relies on the lanes
	// of a warp executing in lock-step -- confirm for the targeted
	// architectures.
	//
	#define PXL_WARP_SCAN_SHARED(_op,_vT,_exc,_exc0,_excP,_incLd)   \
	                                                                \
	  DEVICE_FUNCTION_QUALIFIERS                                    \
	  _vT                                                           \
	  _excP ## _warp_scan_shared_ ## _op                            \
	  (_vT v, volatile _vT* const shared32)                         \
	  {                                                             \
	    const unsigned int  lid = laneId();                         \
	    volatile _vT* const v32 = shared32 + lid;                   \
	                                                                \
	    if (_exc)                                                   \
	      {                                                         \
	        /* shift by one slot; the last lane seeds slot 0 */     \
	        if (lid == (WARP_SIZE-1))                               \
	          shared32[0] = _exc0;                                  \
	        else                                                    \
	          v32[1] = v;                                           \
	                                                                \
	        v = v32[0];                                             \
	      }                                                         \
	    else if (_incLd)                                            \
	      {                                                         \
	        v = v32[0];                                             \
	      }                                                         \
	    else                                                        \
	      {                                                         \
	        v32[0] = v;                                             \
	      }                                                         \
	                                                                \
	    if (lid >= 1)                                               \
	      {                                                         \
	        /* read the volatile slot once, then combine */         \
	        const _vT lo1 = v32[-1];                                \
	        v = PXL_WARP_SCAN_OP_ ## _op (v, lo1);                  \
	                                                                \
	        STORE_IF_LT_WARP_MINUS(2);                              \
	                                                                \
	        if (lid >= 2)                                           \
	          {                                                     \
	            const _vT lo2 = v32[-2];                            \
	            v = PXL_WARP_SCAN_OP_ ## _op (v, lo2);              \
	                                                                \
	            STORE_IF_LT_WARP_MINUS(4);                          \
	                                                                \
	            if (lid >= 4)                                       \
	              {                                                 \
	                const _vT lo4 = v32[-4];                        \
	                v = PXL_WARP_SCAN_OP_ ## _op (v, lo4);          \
	                                                                \
	                STORE_IF_LT_WARP_MINUS(8);                      \
	                                                                \
	                if (lid >= 8)                                   \
	                  {                                             \
	                    const _vT lo8 = v32[-8];                    \
	                    v = PXL_WARP_SCAN_OP_ ## _op (v, lo8);      \
	                                                                \
	                    STORE_IF_LT_WARP_MINUS(16);                 \
	                                                                \
	                    if (lid >= 16)                              \
	                      {                                         \
	                        const _vT lo16 = v32[-16];              \
	                        v = PXL_WARP_SCAN_OP_ ## _op (v, lo16); \
	                      }                                         \
	                  }                                             \
	              }                                                 \
	          }                                                     \
	      }                                                         \
	                                                                \
	    return v;                                                   \
	  }

	//
	// GENERATE BOTH SHFL AND SHARED WARP SCAN PRIMITIVES
	//
	// PXL_WARP_SCAN_SHFL expands to nothing (a no-op) unless compiling
	// device code for sm_30+.
	//

	//
	// PXL_WARP_SCAN_DECL forwards its arguments to PXL_WARP_SCAN_SHARED
	// (which does not take _opT/_regC) and to PXL_WARP_SCAN_SHFL (which
	// does not take _incLd).
	//
	// No ';' is placed between the two expansions: each one is either a
	// complete function definition or empty, so a separator would only
	// produce an empty declaration.
	//
	#define PXL_WARP_SCAN_DECL(_op,_vT,_opT,_regC,_exc,_exc0,_excP,_incLd)  \
	  PXL_WARP_SCAN_SHARED(_op,_vT,           _exc,_exc0,_excP,_incLd)      \
	  PXL_WARP_SCAN_SHFL  (_op,_vT,_opT,_regC,_exc,_exc0,_excP)

	//
	// DECLARE WARP SCAN PRIMITIVES IN YOUR .cu SOURCE FILE
	//

	/*

	Examples:

	// inclusive "add.u32" scan:
	//
	// unsigned int
	// inc_warp_scan_shuffled_add(unsigned int)
	//
	// unsigned int
	// inc_warp_scan_shared_add(unsigned int, volatile unsigned int* const)
	//
	PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", false, 0, inc, true);

	// exclusive "add.u32" scan:
	//
	// unsigned int
	// exc_warp_scan_shuffled_add(unsigned int)
	//
	// unsigned int
	// exc_warp_scan_shared_add(unsigned int, volatile unsigned int* const)
	//
	PXL_WARP_SCAN_DECL(add, unsigned int, ".u32", "r", true, 0, exc, false);

	// exclusive "max.f32" scan:
	//
	// float
	// exc_warp_scan_shuffled_max(float)
	//
	// float
	// exc_warp_scan_shared_max(float, volatile float* const)
	//
	PXL_WARP_SCAN_DECL(max, float, ".f32", "f", true, 0.0f, exc, false);

	*/