allanmac/shflrot.cu

## shflrot.cu
#include <stdio.h>

//
//
//

#define WARP_SIZE                     32
#define KERNEL_QUALIFIERS             extern "C" __global__
#define DEVICE_INTRINSIC_QUALIFIERS   __device__ __forceinline__

//
//
//

DEVICE_INTRINSIC_QUALIFIERS
unsigned int laneId()
{
  unsigned int id;

  asm("mov.u32 %0, %%laneid;" : "=r"(id));

  return id;
}

DEVICE_INTRINSIC_QUALIFIERS
unsigned int
shfl_rot(const unsigned int v, const int rot, bool* const flag)
{
  const unsigned int mask = 0 | 0x1F;

  unsigned int r;
  unsigned int f;

  asm(".reg .pred p;"
      "shfl.idx.b32 %0|p, %2, %3, %4;"
      "selp.u32 %1,1,0,p;"
      : "=r"(r), "=r"(f) : "r"(v), "r"(laneId()-rot), "r"(mask));

  *flag = f;

  return r;
}

DEVICE_INTRINSIC_QUALIFIERS
unsigned int
shfl_ror(const unsigned int v, const unsigned int ror, bool* const flag)
{
  const unsigned int mask = 0 | 0;

  unsigned int r;
  unsigned int f;

  asm(".reg .pred p;"
      "shfl.up.b32 %0|p, %2, %3, %4;"
      "selp.u32 %1,1,0,p;"
      : "=r"(r), "=r"(f) : "r"(v), "r"(ror), "r"(mask));

  *flag = f;

  return r;
}

DEVICE_INTRINSIC_QUALIFIERS
unsigned int
shfl_rol(const unsigned int v, const unsigned int rol, bool* const flag)
{
  const unsigned int mask = 0 | 0x1F;

  unsigned int r;
  unsigned int f;

  asm(".reg .pred p;"
      "shfl.down.b32 %0|p, %2, %3, %4;"
      "selp.u32 %1,1,0,p;"
      : "=r"(r), "=r"(f) : "r"(v), "r"(rol), "r"(mask));

  *flag = f;

  return r;
}

//
//
//

KERNEL_QUALIFIERS
void shflRotKernel(const unsigned int* const vin,
                   unsigned int*       const vout,
                   bool*               const pout,
                   const int                 rot)
{
  unsigned int v    = vin[threadIdx.x];
  v                 = shfl_rot(v,rot,pout+threadIdx.x);
  vout[threadIdx.x] = v;
}

KERNEL_QUALIFIERS
void shflRorKernel(const unsigned int* const vin,
                   unsigned int*       const vout,
                   bool*               const pout,
                   const int                 rot)
{
  unsigned int v    = vin[threadIdx.x];
  v                 = shfl_ror(v,rot,pout+threadIdx.x);
  vout[threadIdx.x] = v;
}

KERNEL_QUALIFIERS
void shflRolKernel(const unsigned int* const vin,
                   unsigned int*       const vout,
                   bool*               const pout,
                   const int                 rot)
{
  unsigned int v    = vin[threadIdx.x];
  v                 = shfl_rol(v,rot,pout+threadIdx.x);
  vout[threadIdx.x] = v;
}

//
//
//

void printRing(const unsigned int* const ring, const bool* const pred)
{
  for (int ii=0; ii<WARP_SIZE; ii++)
    printf("%2d ",ring[ii]);

  printf("\n");

  for (int ii=0; ii<WARP_SIZE; ii++)
    printf("%s ",pred[ii] ? " ." : " x");

  printf("\n");
}

//
//
//

int main(int argc, char** argv)
{
  int rot    = (argc == 1) ? 5 : atoi(argv[1]);
  int device = (argc == 3) ? atoi(argv[2]) : 0;

  cudaDeviceProp props;
  cudaGetDeviceProperties(&props,device);

  printf("%s (%2d)\n",props.name,props.multiProcessorCount);

  if (props.major < 3) // need SHFL
    {
      printf("requires sm_30\n");
      return -1;
    }

  cudaSetDevice(device);

  //
  // LAUNCH KERNEL
  //

  unsigned int* vin;
  unsigned int* vout;
  bool*         pout;

  const unsigned int ring[WARP_SIZE] =
    {
      0,   1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
    };

  const bool pred[WARP_SIZE] =
    {
      false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
      false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false
    };

  cudaMalloc(&vin, sizeof(ring));
  cudaMalloc(&vout,sizeof(ring));
  cudaMalloc(&pout,sizeof(pred));

  cudaMemcpy(vin, ring,sizeof(ring),cudaMemcpyHostToDevice);
  cudaMemcpy(pout,pred,sizeof(pred),cudaMemcpyHostToDevice);

  //
  //
  //

  unsigned int ringRot[WARP_SIZE];
  bool         predRot[WARP_SIZE];

  //
  //
  //

  shflRotKernel<<<1,WARP_SIZE>>>(vin,vout,pout,rot);

  cudaDeviceSynchronize();

  cudaMemcpy(ringRot,vout,sizeof(ringRot),cudaMemcpyDeviceToHost);
  cudaMemcpy(predRot,pout,sizeof(predRot),cudaMemcpyDeviceToHost);

  printf("\nshfl.idx  (rot): %d\n",rot);
  printRing(ringRot,predRot);

  //
  //
  //

  shflRorKernel<<<1,WARP_SIZE>>>(vin,vout,pout,rot);

  cudaDeviceSynchronize();

  cudaMemcpy(ringRot,vout,sizeof(ringRot),cudaMemcpyDeviceToHost);
  cudaMemcpy(predRot,pout,sizeof(predRot),cudaMemcpyDeviceToHost);

  printf("\nshfl.up   (ror): %d\n",rot);
  printRing(ringRot,predRot);

  //
  //
  //

  shflRolKernel<<<1,WARP_SIZE>>>(vin,vout,pout,rot);

  cudaDeviceSynchronize();

  cudaMemcpy(ringRot,vout,sizeof(ringRot),cudaMemcpyDeviceToHost);
  cudaMemcpy(predRot,pout,sizeof(predRot),cudaMemcpyDeviceToHost);

  printf("\nshfl.down (rol): %d\n",rot);
  printRing(ringRot,predRot);

  //
  //
  //

  cudaFree(vin);
  cudaFree(vout);
  cudaFree(pout);

  cudaDeviceReset();

  return 0;
}
	#include <stdio.h>

	//
	//
	//

	#define WARP_SIZE 32
	#define KERNEL_QUALIFIERS extern "C" __global__
	#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__

	//
	//
	//

	DEVICE_INTRINSIC_QUALIFIERS
	unsigned int laneId()
	{
	unsigned int id;

	asm("mov.u32 %0, %%laneid;" : "=r"(id));

	return id;
	}

	DEVICE_INTRINSIC_QUALIFIERS
	unsigned int
	shfl_rot(const unsigned int v, const int rot, bool* const flag)
	{
	const unsigned int mask = 0 \| 0x1F;

	unsigned int r;
	unsigned int f;

	asm(".reg .pred p;"
	"shfl.idx.b32 %0\|p, %2, %3, %4;"
	"selp.u32 %1,1,0,p;"
	: "=r"(r), "=r"(f) : "r"(v), "r"(laneId()-rot), "r"(mask));

	*flag = f;

	return r;
	}

	DEVICE_INTRINSIC_QUALIFIERS
	unsigned int
	shfl_ror(const unsigned int v, const unsigned int ror, bool* const flag)
	{
	const unsigned int mask = 0 \| 0;

	unsigned int r;
	unsigned int f;

	asm(".reg .pred p;"
	"shfl.up.b32 %0\|p, %2, %3, %4;"
	"selp.u32 %1,1,0,p;"
	: "=r"(r), "=r"(f) : "r"(v), "r"(ror), "r"(mask));

	*flag = f;

	return r;
	}

	DEVICE_INTRINSIC_QUALIFIERS
	unsigned int
	shfl_rol(const unsigned int v, const unsigned int rol, bool* const flag)
	{
	const unsigned int mask = 0 \| 0x1F;

	unsigned int r;
	unsigned int f;

	asm(".reg .pred p;"
	"shfl.down.b32 %0\|p, %2, %3, %4;"
	"selp.u32 %1,1,0,p;"
	: "=r"(r), "=r"(f) : "r"(v), "r"(rol), "r"(mask));

	*flag = f;

	return r;
	}

	//
	//
	//

	KERNEL_QUALIFIERS
	void shflRotKernel(const unsigned int* const vin,
	unsigned int* const vout,
	bool* const pout,
	const int rot)
	{
	unsigned int v = vin[threadIdx.x];
	v = shfl_rot(v,rot,pout+threadIdx.x);
	vout[threadIdx.x] = v;
	}

	KERNEL_QUALIFIERS
	void shflRorKernel(const unsigned int* const vin,
	unsigned int* const vout,
	bool* const pout,
	const int rot)
	{
	unsigned int v = vin[threadIdx.x];
	v = shfl_ror(v,rot,pout+threadIdx.x);
	vout[threadIdx.x] = v;
	}

	KERNEL_QUALIFIERS
	void shflRolKernel(const unsigned int* const vin,
	unsigned int* const vout,
	bool* const pout,
	const int rot)
	{
	unsigned int v = vin[threadIdx.x];
	v = shfl_rol(v,rot,pout+threadIdx.x);
	vout[threadIdx.x] = v;
	}

	//
	//
	//

	void printRing(const unsigned int* const ring, const bool* const pred)
	{
	for (int ii=0; ii<WARP_SIZE; ii++)
	printf("%2d ",ring[ii]);

	printf("\n");

	for (int ii=0; ii<WARP_SIZE; ii++)
	printf("%s ",pred[ii] ? " ." : " x");

	printf("\n");
	}

	//
	//
	//

	int main(int argc, char** argv)
	{
	int rot = (argc == 1) ? 5 : atoi(argv[1]);
	int device = (argc == 3) ? atoi(argv[2]) : 0;

	cudaDeviceProp props;
	cudaGetDeviceProperties(&props,device);

	printf("%s (%2d)\n",props.name,props.multiProcessorCount);

	if (props.major < 3) // need SHFL
	{
	printf("requires sm_30\n");
	return -1;
	}

	cudaSetDevice(device);

	//
	// LAUNCH KERNEL
	//

	unsigned int* vin;
	unsigned int* vout;
	bool* pout;

	const unsigned int ring[WARP_SIZE] =
	{
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
	16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
	};

	const bool pred[WARP_SIZE] =
	{
	false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
	false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false
	};

	cudaMalloc(&vin, sizeof(ring));
	cudaMalloc(&vout,sizeof(ring));
	cudaMalloc(&pout,sizeof(pred));

	cudaMemcpy(vin, ring,sizeof(ring),cudaMemcpyHostToDevice);
	cudaMemcpy(pout,pred,sizeof(pred),cudaMemcpyHostToDevice);

	//
	//
	//

	unsigned int ringRot[WARP_SIZE];
	bool predRot[WARP_SIZE];

	//
	//
	//

	shflRotKernel<<<1,WARP_SIZE>>>(vin,vout,pout,rot);

	cudaDeviceSynchronize();

	cudaMemcpy(ringRot,vout,sizeof(ringRot),cudaMemcpyDeviceToHost);
	cudaMemcpy(predRot,pout,sizeof(predRot),cudaMemcpyDeviceToHost);

	printf("\nshfl.idx (rot): %d\n",rot);
	printRing(ringRot,predRot);

	//
	//
	//

	shflRorKernel<<<1,WARP_SIZE>>>(vin,vout,pout,rot);

	cudaDeviceSynchronize();

	cudaMemcpy(ringRot,vout,sizeof(ringRot),cudaMemcpyDeviceToHost);
	cudaMemcpy(predRot,pout,sizeof(predRot),cudaMemcpyDeviceToHost);

	printf("\nshfl.up (ror): %d\n",rot);
	printRing(ringRot,predRot);

	//
	//
	//

	shflRolKernel<<<1,WARP_SIZE>>>(vin,vout,pout,rot);

	cudaDeviceSynchronize();

	cudaMemcpy(ringRot,vout,sizeof(ringRot),cudaMemcpyDeviceToHost);
	cudaMemcpy(predRot,pout,sizeof(predRot),cudaMemcpyDeviceToHost);

	printf("\nshfl.down (rol): %d\n",rot);
	printRing(ringRot,predRot);

	//
	//
	//

	cudaFree(vin);
	cudaFree(vout);
	cudaFree(pout);

	cudaDeviceReset();

	return 0;
	}