allanmac/kth.cu

## kth.cu
// -*- compile-command: "nvcc -m 32 -arch compute_20 -Xptxas=-v,-abi=no -cubin kth.cu"; -*-

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

//
//
//

// Short aliases for the fixed-width integer types from <stdint.h>:
// u8/u16/u32 are unsigned 8/16/32-bit, s32 is signed 32-bit.
typedef uint8_t   u8;
typedef uint16_t  u16;
typedef uint32_t  u32;
typedef int32_t   s32;

//
//
//

// Number of lanes per warp; also used as the launch block size and to size
// the scratch and output arrays in this file.
#define WARP_SIZE                           32
// Qualifiers for kernel entry points: C linkage plus __global__.
#define KERNEL_QUALIFIERS                   extern "C" __global__
// Qualifiers for the small device-only helpers: file-local and force-inlined.
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS  static __device__ __forceinline__
// Shorthand for the __restrict__ pointer qualifier.
#define RESTRICT                            __restrict__

//
//
//

// Returns the calling thread's lane index within its warp, read from the
// PTX special register %laneid.
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
warp_lane()
{
  u32 lane_idx;

  asm("mov.u32 %0, %%laneid;" : "=r"(lane_idx));

  return lane_idx;
}

// Returns a 32-bit mask in which only the bit at the calling lane's index
// is set. When __CUDA_ARCH__ >= 200 the value is read from the PTX special
// register %lanemask_eq; otherwise it is built from the lane index.
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
warp_lane_mask_eq()
{
#if __CUDA_ARCH__ < 200

  // fallback: shift a single bit up to this lane's position
  const u32 lane_idx = warp_lane();

  return 1u << lane_idx;

#else

  u32 mask;

  asm("mov.u32 %0, %%lanemask_eq;" : "=r"(mask));

  return mask;

#endif
}

// Returns a 32-bit mask with a bit set for every lane index strictly below
// the calling lane. When __CUDA_ARCH__ >= 200 the value is read from the PTX
// special register %lanemask_lt; otherwise it is built from the lane index.
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
warp_lane_mask_lt()
{
#if __CUDA_ARCH__ < 200

  // fallback: this lane's bit minus one sets every lower bit
  const u32 own_bit = 1u << warp_lane();

  return own_bit - 1u;

#else

  u32 mask;

  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));

  return mask;

#endif
}

// Returns a 32-bit mask with a bit set for every lane index less than or
// equal to the calling lane. When __CUDA_ARCH__ >= 200 the value is read from
// the PTX special register %lanemask_le; otherwise it is built from the lane
// index.
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
warp_lane_mask_lte()
{
#if __CUDA_ARCH__ < 200

  // fallback: the bit just above this lane, minus one, sets this lane's bit
  // and every lower one (for lane 31 the shift wraps to 0, giving all ones)
  const u32 next_bit = 2u << warp_lane();

  return next_bit - 1u;

#else

  u32 mask;

  asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));

  return mask;

#endif
}

// Returns a 32-bit mask with a bit set for every lane index strictly above
// the calling lane. When __CUDA_ARCH__ >= 200 the value is read from the PTX
// special register %lanemask_gt; otherwise it is built from the lane index.
//
// The return type is spelled u32 to match the other warp_lane_mask_*()
// helpers in this file.
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
warp_lane_mask_gt()
{
#if __CUDA_ARCH__ >= 200

  u32 mask;

  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));

  return mask;

#else

  // fallback: complement of the "this lane and below" mask
  // (for lane 31 the shift wraps to 0, so the result is 0)
  const u32 lte_mask = (2u << warp_lane()) - 1u;

  return ~lte_mask;

#endif
}

// Returns a 32-bit mask with a bit set for every lane index greater than or
// equal to the calling lane. When __CUDA_ARCH__ >= 200 the value is read from
// the PTX special register %lanemask_ge; otherwise it is built from the lane
// index.
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u32
warp_lane_mask_gte()
{
#if __CUDA_ARCH__ < 200

  // fallback: complement of the "strictly below this lane" mask
  const u32 lt_mask = (1u << warp_lane()) - 1u;

  return ~lt_mask;

#else

  u32 mask;

  asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));

  return mask;

#endif
}

//
//
//

// Element type of the scratch table and of the kernel's output array.
#define TYPE_SCRATCH u32

//
// Shared table filled in by kth_kernel(). Slots 0..WARP_SIZE-1 are read
// back by lane index; the extra slot at index WARP_SIZE only absorbs the
// clamped writes of lanes that have nothing to record.
//

__shared__ volatile TYPE_SCRATCH scratch[WARP_SIZE+1]; // volatile: every access is a real shared-memory load/store

//
// kth_kernel: builds a table that maps a rank k to the lane index of a set
// bit in `bits`, then copies the table to `vout`.
//
//   bits : 32-bit mask, the same value for every thread
//   vout : output array; vout[lane] is written by each lane, so it needs
//          at least WARP_SIZE elements. Slots no lane wrote to keep the
//          initial value UINT8_MAX (255).
//
// __CUDA_ARCH__ >= 500:
//   only lanes whose own bit is set write, each to the slot given by the
//   number of set bits below it, so vout[k] is the lane of the k-th set
//   bit counting up from bit 0.
//
// otherwise:
//   every lane writes to slot (number of set bits at or above it) - 1, so
//   several lanes may target the same slot. Per the original note this
//   relies on the hardware letting the highest lane win the colliding
//   store; under that assumption vout[k] is the lane of the k-th set bit
//   counting down from bit 31.
//
// Launch expectation: one warp per block (main() uses <<<1,WARP_SIZE>>>).
// scratch is indexed by lane id, so extra warps in a block would share slots.
//
// The two __syncthreads() calls order init -> scatter -> read-back across
// lanes instead of relying on implicit lock-step warp execution. Both are
// outside the divergent branch, so every thread reaches them.
//
KERNEL_QUALIFIERS
void
kth_kernel(const u32 bits, TYPE_SCRATCH* const RESTRICT vout)
{
  // init scratch
  scratch[warp_lane()] = UINT8_MAX;

  // every slot must be initialised before any lane scatters into it
  __syncthreads();

#if   (__CUDA_ARCH__ >= 500)
  // MAXWELL: IT APPEARS "LOW LANE WINS" BUT NOT ALWAYS ;)
  // this is a more reliable and portable implementation:
  if ((bits & warp_lane_mask_eq()) != 0)
    {
      scratch[__popc(bits & warp_lane_mask_lt())] = warp_lane();
    }
#else
  // FERMI/KEPLER: HIGH LANE WINS
  // count all bits from lane to MSB
  const u32 count = __popc(bits & warp_lane_mask_gte());

  // subtract 1 and clamp to last index in scratch
  // (count == 0 wraps to 0xFFFFFFFF and is clamped to the spare slot)
  const u32 idx = min(count-1u,WARP_SIZE);

  // save using "high/low lane wins" feature
  scratch[idx] = warp_lane();
#endif

  // all scattered writes must be visible before each lane reads its slot
  __syncthreads();

  // store
  vout[warp_lane()] = scratch[warp_lane()];
}

//
//
//

//
// Evaluates a CUDA runtime call and, if it did not return cudaSuccess,
// prints the failing expression with its error string to stderr and exits.
//
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                \
  do {                                                                  \
    const cudaError_t cuda_check_err_ = (call);                         \
    if (cuda_check_err_ != cudaSuccess)                                 \
      {                                                                 \
        fprintf(stderr,"%s:%d: %s failed: %s\n",__FILE__,__LINE__,      \
                #call,cudaGetErrorString(cuda_check_err_));             \
        exit(EXIT_FAILURE);                                             \
      }                                                                 \
  } while (0)
#endif

//
// Host driver: runs kth_kernel once with a single warp and prints the
// WARP_SIZE values it produced.
//
//   argv[1] : optional device index (default 0)
//   argv[2] : optional bit mask (default 0x55555555). Parsed with
//             strtoul(...,0), so decimal, 0x-prefixed hex and 0-prefixed
//             octal are accepted, and values with bit 31 set are
//             representable.
//
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
//
int
main(int argc, char** argv)
{
  const s32 device = (argc <= 1) ? 0          : atoi(argv[1]);
  const u32 bits   = (argc <= 2) ? 0x55555555 : (u32)strtoul(argv[2],NULL,0);

  cudaDeviceProp props;
  CUDA_CHECK(cudaGetDeviceProperties(&props,device));

  printf("%s (%2d)\n",props.name,props.multiProcessorCount);
  printf("0x%X\n",bits);

  CUDA_CHECK(cudaSetDevice(device));

  //
  // device output buffer: one element per lane
  //

  TYPE_SCRATCH* vout_d;

  CUDA_CHECK(cudaMalloc(&vout_d,WARP_SIZE * sizeof(TYPE_SCRATCH)));

  //
  // one block of exactly one warp
  //

  kth_kernel<<<1,WARP_SIZE>>>(bits,vout_d);

  CUDA_CHECK(cudaGetLastError());      // launch/configuration errors
  CUDA_CHECK(cudaDeviceSynchronize()); // errors raised while the kernel ran

  //
  // copy the result back
  //

  TYPE_SCRATCH vout_h[WARP_SIZE];

  CUDA_CHECK(cudaMemcpy(vout_h,vout_d,
                        WARP_SIZE*sizeof(TYPE_SCRATCH),
                        cudaMemcpyDeviceToHost));

  //
  // print one value per lane
  //

  for (u32 ii=0; ii<WARP_SIZE; ii++)
    printf("%3u ",(u32)vout_h[ii]);

  printf("\n");

  //
  // release device resources
  //

  CUDA_CHECK(cudaFree(vout_d));
  CUDA_CHECK(cudaDeviceReset());

  return 0;
}
	// -- compile-command: "nvcc -m 32 -arch compute_20 -Xptxas=-v,-abi=no -cubin kth.cu"; --

	#include <stdio.h>
	#include <stdint.h>

	//
	//
	//

	// Short aliases for the fixed-width integer types from <stdint.h>:
	// u8/u16/u32 are unsigned 8/16/32-bit, s32 is signed 32-bit.
	typedef uint8_t u8;
	typedef uint16_t u16;
	typedef uint32_t u32;
	typedef int32_t s32;

	//
	//
	//

	// Number of lanes per warp; also used as the launch block size and to size
	// the scratch and output arrays in this file.
	#define WARP_SIZE 32
	// Qualifiers for kernel entry points: C linkage plus __global__.
	#define KERNEL_QUALIFIERS extern "C" __global__
	// Qualifiers for the small device-only helpers: file-local and force-inlined.
	#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static __device__ __forceinline__
	// Shorthand for the __restrict__ pointer qualifier.
	#define RESTRICT __restrict__

	//
	//
	//

	// Returns the calling thread's lane index within its warp, read from the
	// PTX special register %laneid.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	u32
	warp_lane()
	{
	  u32 lane_idx;

	  asm("mov.u32 %0, %%laneid;" : "=r"(lane_idx));

	  return lane_idx;
	}

	// Returns a 32-bit mask in which only the bit at the calling lane's index
	// is set. When __CUDA_ARCH__ >= 200 the value is read from the PTX special
	// register %lanemask_eq; otherwise it is built from the lane index.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	u32
	warp_lane_mask_eq()
	{
	#if __CUDA_ARCH__ < 200

	  // fallback: shift a single bit up to this lane's position
	  const u32 lane_idx = warp_lane();

	  return 1u << lane_idx;

	#else

	  u32 mask;

	  asm("mov.u32 %0, %%lanemask_eq;" : "=r"(mask));

	  return mask;

	#endif
	}

	// Returns a 32-bit mask with a bit set for every lane index strictly below
	// the calling lane. When __CUDA_ARCH__ >= 200 the value is read from the PTX
	// special register %lanemask_lt; otherwise it is built from the lane index.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	u32
	warp_lane_mask_lt()
	{
	#if __CUDA_ARCH__ < 200

	  // fallback: this lane's bit minus one sets every lower bit
	  const u32 own_bit = 1u << warp_lane();

	  return own_bit - 1u;

	#else

	  u32 mask;

	  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));

	  return mask;

	#endif
	}

	// Returns a 32-bit mask with a bit set for every lane index less than or
	// equal to the calling lane. When __CUDA_ARCH__ >= 200 the value is read from
	// the PTX special register %lanemask_le; otherwise it is built from the lane
	// index.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	u32
	warp_lane_mask_lte()
	{
	#if __CUDA_ARCH__ < 200

	  // fallback: the bit just above this lane, minus one, sets this lane's bit
	  // and every lower one (for lane 31 the shift wraps to 0, giving all ones)
	  const u32 next_bit = 2u << warp_lane();

	  return next_bit - 1u;

	#else

	  u32 mask;

	  asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));

	  return mask;

	#endif
	}

	// Returns a 32-bit mask with a bit set for every lane index strictly above
	// the calling lane. When __CUDA_ARCH__ >= 200 the value is read from the PTX
	// special register %lanemask_gt; otherwise it is built from the lane index.
	//
	// The return type is spelled u32 to match the other warp_lane_mask_*()
	// helpers in this file.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	u32
	warp_lane_mask_gt()
	{
	#if __CUDA_ARCH__ >= 200

	  u32 mask;

	  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));

	  return mask;

	#else

	  // fallback: complement of the "this lane and below" mask
	  // (for lane 31 the shift wraps to 0, so the result is 0)
	  const u32 lte_mask = (2u << warp_lane()) - 1u;

	  return ~lte_mask;

	#endif
	}

	// Returns a 32-bit mask with a bit set for every lane index greater than or
	// equal to the calling lane. When __CUDA_ARCH__ >= 200 the value is read from
	// the PTX special register %lanemask_ge; otherwise it is built from the lane
	// index.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	u32
	warp_lane_mask_gte()
	{
	#if __CUDA_ARCH__ < 200

	  // fallback: complement of the "strictly below this lane" mask
	  const u32 lt_mask = (1u << warp_lane()) - 1u;

	  return ~lt_mask;

	#else

	  u32 mask;

	  asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));

	  return mask;

	#endif
	}

	//
	//
	//

	// Element type of the scratch table and of the kernel's output array.
	#define TYPE_SCRATCH u32

	//
	// Shared table filled in by kth_kernel(). Slots 0..WARP_SIZE-1 are read
	// back by lane index; the extra slot at index WARP_SIZE only absorbs the
	// clamped writes of lanes that have nothing to record.
	//

	__shared__ volatile TYPE_SCRATCH scratch[WARP_SIZE+1]; // volatile: every access is a real shared-memory load/store

	//
	// kth_kernel: builds a table that maps a rank k to the lane index of a set
	// bit in `bits`, then copies the table to `vout`.
	//
	//   bits : 32-bit mask, the same value for every thread
	//   vout : output array; vout[lane] is written by each lane, so it needs
	//          at least WARP_SIZE elements. Slots no lane wrote to keep the
	//          initial value UINT8_MAX (255).
	//
	// __CUDA_ARCH__ >= 500:
	//   only lanes whose own bit is set write, each to the slot given by the
	//   number of set bits below it, so vout[k] is the lane of the k-th set
	//   bit counting up from bit 0.
	//
	// otherwise:
	//   every lane writes to slot (number of set bits at or above it) - 1, so
	//   several lanes may target the same slot. Per the original note this
	//   relies on the hardware letting the highest lane win the colliding
	//   store; under that assumption vout[k] is the lane of the k-th set bit
	//   counting down from bit 31.
	//
	// Launch expectation: one warp per block (main() uses <<<1,WARP_SIZE>>>).
	// scratch is indexed by lane id, so extra warps in a block would share slots.
	//
	// The two __syncthreads() calls order init -> scatter -> read-back across
	// lanes instead of relying on implicit lock-step warp execution. Both are
	// outside the divergent branch, so every thread reaches them.
	//
	KERNEL_QUALIFIERS
	void
	kth_kernel(const u32 bits, TYPE_SCRATCH* const RESTRICT vout)
	{
	  // init scratch
	  scratch[warp_lane()] = UINT8_MAX;

	  // every slot must be initialised before any lane scatters into it
	  __syncthreads();

	#if (__CUDA_ARCH__ >= 500)
	  // MAXWELL: IT APPEARS "LOW LANE WINS" BUT NOT ALWAYS ;)
	  // this is a more reliable and portable implementation:
	  if ((bits & warp_lane_mask_eq()) != 0)
	    {
	      scratch[__popc(bits & warp_lane_mask_lt())] = warp_lane();
	    }
	#else
	  // FERMI/KEPLER: HIGH LANE WINS
	  // count all bits from lane to MSB
	  const u32 count = __popc(bits & warp_lane_mask_gte());

	  // subtract 1 and clamp to last index in scratch
	  // (count == 0 wraps to 0xFFFFFFFF and is clamped to the spare slot)
	  const u32 idx = min(count-1u,WARP_SIZE);

	  // save using "high/low lane wins" feature
	  scratch[idx] = warp_lane();
	#endif

	  // all scattered writes must be visible before each lane reads its slot
	  __syncthreads();

	  // store
	  vout[warp_lane()] = scratch[warp_lane()];
	}

	//
	//
	//

	//
	// Evaluates a CUDA runtime call and, if it did not return cudaSuccess,
	// prints the failing expression with its error string to stderr and exits.
	//
	#ifndef CUDA_CHECK
	#define CUDA_CHECK(call)                                                \
	  do {                                                                  \
	    const cudaError_t cuda_check_err_ = (call);                         \
	    if (cuda_check_err_ != cudaSuccess)                                 \
	      {                                                                 \
	        fprintf(stderr,"%s:%d: %s failed: %s\n",__FILE__,__LINE__,      \
	                #call,cudaGetErrorString(cuda_check_err_));             \
	        exit(EXIT_FAILURE);                                             \
	      }                                                                 \
	  } while (0)
	#endif

	//
	// Host driver: runs kth_kernel once with a single warp and prints the
	// WARP_SIZE values it produced.
	//
	//   argv[1] : optional device index (default 0)
	//   argv[2] : optional bit mask (default 0x55555555). Parsed with
	//             strtoul(...,0), so decimal, 0x-prefixed hex and 0-prefixed
	//             octal are accepted, and values with bit 31 set are
	//             representable.
	//
	// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
	//
	int
	main(int argc, char** argv)
	{
	  const s32 device = (argc <= 1) ? 0 : atoi(argv[1]);
	  const u32 bits = (argc <= 2) ? 0x55555555 : (u32)strtoul(argv[2],NULL,0);

	  cudaDeviceProp props;
	  CUDA_CHECK(cudaGetDeviceProperties(&props,device));

	  printf("%s (%2d)\n",props.name,props.multiProcessorCount);
	  printf("0x%X\n",bits);

	  CUDA_CHECK(cudaSetDevice(device));

	  //
	  // device output buffer: one element per lane
	  //

	  TYPE_SCRATCH* vout_d;

	  CUDA_CHECK(cudaMalloc(&vout_d,WARP_SIZE * sizeof(TYPE_SCRATCH)));

	  //
	  // one block of exactly one warp
	  //

	  kth_kernel<<<1,WARP_SIZE>>>(bits,vout_d);

	  CUDA_CHECK(cudaGetLastError());      // launch/configuration errors
	  CUDA_CHECK(cudaDeviceSynchronize()); // errors raised while the kernel ran

	  //
	  // copy the result back
	  //

	  TYPE_SCRATCH vout_h[WARP_SIZE];

	  CUDA_CHECK(cudaMemcpy(vout_h,vout_d,
	                        WARP_SIZE*sizeof(TYPE_SCRATCH),
	                        cudaMemcpyDeviceToHost));

	  //
	  // print one value per lane
	  //

	  for (u32 ii=0; ii<WARP_SIZE; ii++)
	    printf("%3u ",(u32)vout_h[ii]);

	  printf("\n");

	  //
	  // release device resources
	  //

	  CUDA_CHECK(cudaFree(vout_d));
	  CUDA_CHECK(cudaDeviceReset());

	  return 0;
	}