smartbitcoin/probe_bw.cu

## probe_bw.cu
// -*- compile-command: "nvcc -m 64 -arch compute_30 -Xptxas=-v -o probe_bw probe_bw.cu"; -*-

//

#include <stdio.h>
#include <stdint.h>

//
// choose a reasonable block size
//

#define PXL_WARPS_PER_BLOCK    4
#define PXL_THREADS_PER_BLOCK  (PXL_WARPS_PER_BLOCK * WARP_SIZE)

//
// launch the grid this many times and average
//

#define PXL_HOST_LOOPS         64

//
// select 512, 256 or 128 byte transactions
//

#if   0
#define PXL_SEGMENT_TYPE       uint4                                   // low of ~9 GB/s
#define PXL_SEGMENT_TEST(v)    ((v.x ^ v.y ^ v.z ^ v.w) == 0xDEADBEEF)
#elif 0
#define PXL_SEGMENT_TYPE       uint2                                   // low of ~5 GB/s
#define PXL_SEGMENT_TEST(v)    ((v.x ^ v.y) == 0xDEADBEEF)
#else
#define PXL_SEGMENT_TYPE       uint32_t                                // low of ~3 GB/s
#define PXL_SEGMENT_TEST(v)    (v == 0xDEADBEEF)
#endif

//
// macro expand to get multiple loads in flight
//

#define PXL_DEVICE_LOOPS       7
#define PXL_UNROLL()           PXL_UNROLL_7()

//
//
//

#define PXL_UNROLL_6()                          \
  PXL_REPEAT(0);                                \
  PXL_REPEAT(1);                                \
  PXL_REPEAT(2);                                \
  PXL_REPEAT(3);                                \
  PXL_REPEAT(4);                                \
  PXL_REPEAT(5)

#define PXL_UNROLL_7()                          \
  PXL_UNROLL_6();                               \
  PXL_REPEAT(6)

#define PXL_UNROLL_8()                          \
  PXL_UNROLL_7();                               \
  PXL_REPEAT(7)

//
//
//

#define WARP_SIZE              32
#define WARP_MASK              (WARP_SIZE-1)

#define PXL_SEGMENT_SIZE       (sizeof(PXL_SEGMENT_TYPE) * WARP_SIZE)

//
//
//

static
__device__ __host__
uint32_t
lcg_parkmiller(uint32_t seed)
{
  return seed * 48271u;
}

//
// optionally use LDG
//

#if 0 && (__CUDA_ARCH__ >= 350)
#define PXL_SEGMENT_LOAD(v,i)  __ldg(v+i)
#else
#define PXL_SEGMENT_LOAD(v,i)  v[i]
#endif

//
//
//

static
__global__
void
probe_kernel(const PXL_SEGMENT_TYPE* const vin, uint32_t* const vcounter, const uint32_t segments)
{
  const int32_t  lid   = threadIdx.x & WARP_MASK;
  const uint32_t gid_x = blockIdx.y * blockIdx.x * blockDim.x + threadIdx.x;
  const uint32_t seed  = lcg_parkmiller(clock() * ~(gid_x + 1)); // must not be zero

#if __CUDA_ARCH__ < 300
  __shared__ uint32_t vidx[PXL_THREADS_PER_BLOCK];
  vidx[threadIdx.x] = seed % segments;
#define PXL_VIDX_LOAD(ii) vidx[(threadIdx.x & ~WARP_MASK) + ii]
#else
  const uint32_t vidx = seed % segments;
#define PXL_VIDX_LOAD(ii) __shfl(vidx,ii)
#endif

#undef  PXL_REPEAT
#define PXL_REPEAT(ii)                                          \
  const PXL_SEGMENT_TYPE v##ii = PXL_SEGMENT_LOAD(vin,PXL_VIDX_LOAD(ii) * WARP_SIZE + lid)

  PXL_UNROLL();

#undef  PXL_REPEAT
#define PXL_REPEAT(ii)                                          \
  if (PXL_SEGMENT_TEST(v##ii))                                  \
    atomicInc(vcounter,UINT32_MAX)

  PXL_UNROLL();
}

//
//
//

#include <stdbool.h>

static
void
cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort)
{
  if (code != cudaSuccess)
    {
      fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line);

      if (abort)
        exit(code);
    }
}

#define cuda(...)  cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true);

//
//
//

static
void
probe_kernel_launcher(const size_t extent_mb, const size_t probe_mb)
{
  const size_t segments = extent_mb * 1024 * 1024 / PXL_SEGMENT_SIZE;
  const size_t elems    = segments  * WARP_SIZE;
  const size_t bytes    = elems     * sizeof(PXL_SEGMENT_TYPE);

  // alloc
  PXL_SEGMENT_TYPE* vin_d;

  cuda(Malloc(&vin_d, bytes));
  cuda(Memset(vin_d,0,bytes));

  // size the grid
  const size_t probes = probe_mb * 1024 * 1024 / PXL_SEGMENT_SIZE;
  const size_t blocks = (probes + PXL_WARPS_PER_BLOCK - 1) / PXL_WARPS_PER_BLOCK;

  // Fermi support -- 1 MB in dim.x and probe_mb in dim.y
  const dim3 block_dim(1024*1024/PXL_SEGMENT_SIZE/PXL_WARPS_PER_BLOCK,probe_mb);

  // warm-up
  probe_kernel<<<block_dim,PXL_THREADS_PER_BLOCK>>>(vin_d,(uint32_t*)vin_d,segments);

  // measure
  cudaEvent_t start, end;
  cuda(EventCreate(&start));
  cuda(EventCreate(&end));

  cuda(EventRecord(start));

  for (int ii=0; ii<PXL_HOST_LOOPS; ii++)
    probe_kernel<<<block_dim,PXL_THREADS_PER_BLOCK>>>(vin_d,(uint32_t*)vin_d,segments);

  cuda(EventRecord(end));
  cuda(EventSynchronize(end));

  float elapsed;
  cuda(EventElapsedTime(&elapsed,start,end));

  const size_t loads = PXL_DEVICE_LOOPS * PXL_WARPS_PER_BLOCK * blocks;

  printf(
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
         "%8llu, %8llu, %8.2f, %8.2f\n",
#else
         "%8u, %8u, %8.2f, %8.2f\n",
#endif
         extent_mb,
         loads * PXL_SEGMENT_SIZE * PXL_HOST_LOOPS / (1024 * 1024),
         elapsed,
         (1000.0 * PXL_HOST_LOOPS * PXL_SEGMENT_SIZE * loads) / (1024.0 * 1024.0 * 1024.0 * elapsed));

  //
  cuda(Free(vin_d));
  cuda(EventDestroy(start));
  cuda(EventDestroy(end));
}

//
//
//

int
main(int argc, char** argv)
{
  //
  if (argc < 4)
    {
      printf("%s: <device id> <lo mbytes> <hi mbytes> <step mbytes> <probe mbytes>\n",argv[0]);
      exit(0);
    }

  //
  const int32_t device    = atoi(argv[1]);
  size_t        lo_mb     = max(atoi(argv[2]),1);
  const size_t  hi_mb     = max(atoi(argv[3]),1);
  const size_t  step_mb   = max(atoi(argv[4]),1);
  const size_t  probe_mb  = max(atoi(argv[5]),1);

  struct cudaDeviceProp props;
  cuda(GetDeviceProperties(&props,device));

  printf("%s : %d SM : %d MB\n",
         props.name,
         props.multiProcessorCount,
         props.totalGlobalMem / (1024 * 1024));

  cuda(SetDevice(device));

  printf("Probing from: "
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
         "%llu - %llu MB ...\n",
#else
         "%u - %u MB ...\n",
#endif
         lo_mb,hi_mb);

  printf("alloc MB, probe MB,    msecs,     GB/s\n");

  for (; lo_mb <= hi_mb; lo_mb += step_mb)
    probe_kernel_launcher(lo_mb,probe_mb);

  //
  cuda(DeviceReset());

  return 0;
}
	// -- compile-command: "nvcc -m 64 -arch compute_30 -Xptxas=-v -o probe_bw probe_bw.cu"; --

	//

	#include <stdio.h>
	#include <stdint.h>

	//
	// choose a reasonable block size
	//

	#define PXL_WARPS_PER_BLOCK 4
	#define PXL_THREADS_PER_BLOCK (PXL_WARPS_PER_BLOCK * WARP_SIZE)

	//
	// launch the grid this many times and average
	//

	#define PXL_HOST_LOOPS 64

	//
	// select 512, 256 or 128 byte transactions
	//

	#if 0
	#define PXL_SEGMENT_TYPE uint4 // low of ~9 GB/s
	#define PXL_SEGMENT_TEST(v) ((v.x ^ v.y ^ v.z ^ v.w) == 0xDEADBEEF)
	#elif 0
	#define PXL_SEGMENT_TYPE uint2 // low of ~5 GB/s
	#define PXL_SEGMENT_TEST(v) ((v.x ^ v.y) == 0xDEADBEEF)
	#else
	#define PXL_SEGMENT_TYPE uint32_t // low of ~3 GB/s
	#define PXL_SEGMENT_TEST(v) (v == 0xDEADBEEF)
	#endif

	//
	// macro expand to get multiple loads in flight
	//

	#define PXL_DEVICE_LOOPS 7
	#define PXL_UNROLL() PXL_UNROLL_7()

	//
	//
	//

	#define PXL_UNROLL_6() \
	PXL_REPEAT(0); \
	PXL_REPEAT(1); \
	PXL_REPEAT(2); \
	PXL_REPEAT(3); \
	PXL_REPEAT(4); \
	PXL_REPEAT(5)

	#define PXL_UNROLL_7() \
	PXL_UNROLL_6(); \
	PXL_REPEAT(6)

	#define PXL_UNROLL_8() \
	PXL_UNROLL_7(); \
	PXL_REPEAT(7)

	//
	//
	//

	#define WARP_SIZE 32
	#define WARP_MASK (WARP_SIZE-1)

	#define PXL_SEGMENT_SIZE (sizeof(PXL_SEGMENT_TYPE) * WARP_SIZE)

	//
	//
	//

	static
	__device__ __host__
	uint32_t
	lcg_parkmiller(uint32_t seed)
	{
	return seed * 48271u;
	}

	//
	// optionally use LDG
	//

	#if 0 && (__CUDA_ARCH__ >= 350)
	#define PXL_SEGMENT_LOAD(v,i) __ldg(v+i)
	#else
	#define PXL_SEGMENT_LOAD(v,i) v[i]
	#endif

	//
	//
	//

	static
	__global__
	void
	probe_kernel(const PXL_SEGMENT_TYPE* const vin, uint32_t* const vcounter, const uint32_t segments)
	{
	const int32_t lid = threadIdx.x & WARP_MASK;
	const uint32_t gid_x = blockIdx.y * blockIdx.x * blockDim.x + threadIdx.x;
	const uint32_t seed = lcg_parkmiller(clock() * ~(gid_x + 1)); // must not be zero

	#if __CUDA_ARCH__ < 300
	__shared__ uint32_t vidx[PXL_THREADS_PER_BLOCK];
	vidx[threadIdx.x] = seed % segments;
	#define PXL_VIDX_LOAD(ii) vidx[(threadIdx.x & ~WARP_MASK) + ii]
	#else
	const uint32_t vidx = seed % segments;
	#define PXL_VIDX_LOAD(ii) __shfl(vidx,ii)
	#endif

	#undef PXL_REPEAT
	#define PXL_REPEAT(ii) \
	const PXL_SEGMENT_TYPE v##ii = PXL_SEGMENT_LOAD(vin,PXL_VIDX_LOAD(ii) * WARP_SIZE + lid)

	PXL_UNROLL();

	#undef PXL_REPEAT
	#define PXL_REPEAT(ii) \
	if (PXL_SEGMENT_TEST(v##ii)) \
	atomicInc(vcounter,UINT32_MAX)

	PXL_UNROLL();
	}

	//
	//
	//

	#include <stdbool.h>

	static
	void
	cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort)
	{
	if (code != cudaSuccess)
	{
	fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line);

	if (abort)
	exit(code);
	}
	}

	#define cuda(...) cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true);

	//
	//
	//

	static
	void
	probe_kernel_launcher(const size_t extent_mb, const size_t probe_mb)
	{
	const size_t segments = extent_mb * 1024 * 1024 / PXL_SEGMENT_SIZE;
	const size_t elems = segments * WARP_SIZE;
	const size_t bytes = elems * sizeof(PXL_SEGMENT_TYPE);

	// alloc
	PXL_SEGMENT_TYPE* vin_d;

	cuda(Malloc(&vin_d, bytes));
	cuda(Memset(vin_d,0,bytes));

	// size the grid
	const size_t probes = probe_mb * 1024 * 1024 / PXL_SEGMENT_SIZE;
	const size_t blocks = (probes + PXL_WARPS_PER_BLOCK - 1) / PXL_WARPS_PER_BLOCK;

	// Fermi support -- 1 MB in dim.x and probe_mb in dim.y
	const dim3 block_dim(1024*1024/PXL_SEGMENT_SIZE/PXL_WARPS_PER_BLOCK,probe_mb);

	// warm-up
	probe_kernel<<<block_dim,PXL_THREADS_PER_BLOCK>>>(vin_d,(uint32_t*)vin_d,segments);

	// measure
	cudaEvent_t start, end;
	cuda(EventCreate(&start));
	cuda(EventCreate(&end));

	cuda(EventRecord(start));

	for (int ii=0; ii<PXL_HOST_LOOPS; ii++)
	probe_kernel<<<block_dim,PXL_THREADS_PER_BLOCK>>>(vin_d,(uint32_t*)vin_d,segments);

	cuda(EventRecord(end));
	cuda(EventSynchronize(end));

	float elapsed;
	cuda(EventElapsedTime(&elapsed,start,end));

	const size_t loads = PXL_DEVICE_LOOPS * PXL_WARPS_PER_BLOCK * blocks;

	printf(
	#if defined(__x86_64) \|\| defined(AMD64) \|\| defined(_M_AMD64)
	"%8llu, %8llu, %8.2f, %8.2f\n",
	#else
	"%8u, %8u, %8.2f, %8.2f\n",
	#endif
	extent_mb,
	loads * PXL_SEGMENT_SIZE * PXL_HOST_LOOPS / (1024 * 1024),
	elapsed,
	(1000.0 * PXL_HOST_LOOPS * PXL_SEGMENT_SIZE * loads) / (1024.0 * 1024.0 * 1024.0 * elapsed));

	//
	cuda(Free(vin_d));
	cuda(EventDestroy(start));
	cuda(EventDestroy(end));
	}

	//
	//
	//

	int
	main(int argc, char** argv)
	{
	//
	if (argc < 4)
	{
	printf("%s: <device id> <lo mbytes> <hi mbytes> <step mbytes> <probe mbytes>\n",argv[0]);
	exit(0);
	}

	//
	const int32_t device = atoi(argv[1]);
	size_t lo_mb = max(atoi(argv[2]),1);
	const size_t hi_mb = max(atoi(argv[3]),1);
	const size_t step_mb = max(atoi(argv[4]),1);
	const size_t probe_mb = max(atoi(argv[5]),1);

	struct cudaDeviceProp props;
	cuda(GetDeviceProperties(&props,device));

	printf("%s : %d SM : %d MB\n",
	props.name,
	props.multiProcessorCount,
	props.totalGlobalMem / (1024 * 1024));

	cuda(SetDevice(device));

	printf("Probing from: "
	#if defined(__x86_64) \|\| defined(AMD64) \|\| defined(_M_AMD64)
	"%llu - %llu MB ...\n",
	#else
	"%u - %u MB ...\n",
	#endif
	lo_mb,hi_mb);

	printf("alloc MB, probe MB, msecs, GB/s\n");

	for (; lo_mb <= hi_mb; lo_mb += step_mb)
	probe_kernel_launcher(lo_mb,probe_mb);

	//
	cuda(DeviceReset());

	return 0;
	}