Last active
December 14, 2015 01:29
-
-
Save allanmac/5006316 to your computer and use it in GitHub Desktop.
Example kernel used to observe Global Load/Store Efficiency metrics in the Visual Profiler.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#include <stdlib.h> // atoi()

//
// Kernel tuning parameters.
//
#define TYPE                  unsigned int
#define REPS                  1      // loads+stores per thread: must be 1, 2, 4 or 8
#define NUM_BLOCKS_PER_SM     1
#define NUM_THREADS_PER_BLOCK 256
#define RESTRICT              __restrict
#define WARP_SIZE             32

//
// LD(i)/ST(i): one load into / store from register r<i> per repetition.
// Each warp touches a contiguous, warp-aligned 32-element span, so the
// accesses coalesce unless the base pointer is deliberately offset.
//
#define LD(i) TYPE r##i = vin[idx+(i*WARP_SIZE)]

#define ST(i) vout[idx+(i*WARP_SIZE)] = r##i

//
// REP_k(): expand OP(0) ... OP(k-1) -- OP is bound to LD or ST
// at the point of use inside the kernel.
//
#define REP_1() OP(0)

#define REP_2() REP_1(); OP(1)

#define REP_4() REP_2(); OP(2); OP(3)

#define REP_8() REP_4(); OP(4); OP(5); OP(6); OP(7)

//
// Select the unrolled repetition count at compile time.
//
#if   REPS == 1
#define REP_N() REP_1()
#elif REPS == 2
#define REP_N() REP_2()
#elif REPS == 4
#define REP_N() REP_4()
#elif REPS == 8
#define REP_N() REP_8()
#else
#error INVALID REP
#endif
//
//
//
//
// Copy REPS elements per thread from vin to vout.
//
// Launch layout: 1-D grid of 1-D blocks of NUM_THREADS_PER_BLOCK threads.
// Each warp handles REPS consecutive warp-aligned 32-element spans, so
// every global load/store is fully coalesced and the Visual Profiler
// should report 100% load/store efficiency -- unless vin/vout were
// byte-offset by the caller (see LD_OFFSET/ST_OFFSET in main).
//
__global__
void
__launch_bounds__(NUM_THREADS_PER_BLOCK,NUM_BLOCKS_PER_SM)
geff(const TYPE* const RESTRICT vin, TYPE* const RESTRICT vout)
{
  // global warp index
  const unsigned int bidx = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
  // this thread's base element: warp base + lane id
  const unsigned int idx  = bidx * REPS * WARP_SIZE + (threadIdx.x & (WARP_SIZE-1));

  // issue all loads ...
#undef  OP
#define OP(i) LD(i)
  REP_N();

  // ... then all stores
#undef  OP
#define OP(i) ST(i)
  REP_N();
}
//
//
//
//
// Select a device (optional first argument, default 0), launch one
// resident wave of geff() blocks, and report any CUDA errors.
// Returns 0 on success, -1 on setup failure.
//
int main(int argc, char** argv)
{
  cudaError_t err;

  const int device = (argc == 1) ? 0 : atoi(argv[1]);

  cudaDeviceProp props;
  err = cudaGetDeviceProperties(&props,device);
  if (err != cudaSuccess)
    {
      fprintf(stderr,"cudaGetDeviceProperties: %s\n",cudaGetErrorString(err));
      return -1;
    }
  printf("%s (%2d)\n",props.name,props.multiProcessorCount);

  err = cudaSetDevice(device);
  if (err != cudaSuccess)
    {
      fprintf(stderr,"cudaSetDevice: %s\n",cudaGetErrorString(err));
      return -1;
    }

  //
  // Allocate one REPS-element span per thread.  The optional byte
  // offsets deliberately misalign the base pointers so the profiler
  // reports <100% global load/store efficiency.
  //
#define LD_OFFSET 0 // sizeof(TYPE) // uncomment for uncoalesced loads
#define ST_OFFSET 0 // sizeof(TYPE) // uncomment for uncoalesced stores

  const unsigned int numThreads = NUM_THREADS_PER_BLOCK;
  const unsigned int numBlocks  = NUM_BLOCKS_PER_SM * props.multiProcessorCount;
  const unsigned int vsize      = numBlocks * numThreads * REPS * sizeof(TYPE);

  TYPE* vin  = 0;
  TYPE* vout = 0;
  if ((cudaMalloc(&vin, vsize+LD_OFFSET) != cudaSuccess) ||
      (cudaMalloc(&vout,vsize+ST_OFFSET) != cudaSuccess))
    {
      fprintf(stderr,"cudaMalloc failed\n");
      return -1;
    }

  //
  // LAUNCH KERNEL
  //
  geff<<<numBlocks,numThreads>>>(vin +(LD_OFFSET/sizeof(TYPE)),
                                 vout+(ST_OFFSET/sizeof(TYPE)));

  err = cudaGetLastError();        // launch-configuration errors
  if (err == cudaSuccess)
    err = cudaDeviceSynchronize(); // asynchronous execution errors
  if (err != cudaSuccess)
    printf("Err = %d (%s)\n",err,cudaGetErrorString(err));

  //
  // CLEAN UP
  //
  cudaFree(vin);
  cudaFree(vout);
  cudaDeviceReset();

  return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Compile with (substitute your GPU's compute capability for sm_XX, e.g. sm_35):
nvcc -m32 -arch=sm_XX -Xptxas=-v geff.cu -o geff.exe