Skip to content

Instantly share code, notes, and snippets.

@allanmac
Created January 5, 2015 16:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allanmac/47d25cbe5e2cd9f3e8b4 to your computer and use it in GitHub Desktop.
Save allanmac/47d25cbe5e2cd9f3e8b4 to your computer and use it in GitHub Desktop.
Try to push a kernel's IPC metric as high as possible!
// -*- compile-command: "nvcc -m 32 -arch sm_50 -Xptxas=-v,-abi=no ipc.cu -o ipc" ; -*-
//
//
//
#define KERNEL_QUALIFIERS __global__
#define KERNEL_QUALIFIERS_EXTERN extern KERNEL_QUALIFIERS
#define KERNEL_QUALIFIERS_EXTERN_C extern "C" KERNEL_QUALIFIERS
//
//
//
#ifndef _DEBUG
#define DEVICE_FUNCTION_QUALIFIERS __device__ __forceinline__
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
#else
#define DEVICE_FUNCTION_QUALIFIERS __device__
#define DEVICE_INTRINSIC_QUALIFIERS __device__
#endif
//
//
//
#define DEVICE_STATIC_FUNCTION_QUALIFIERS static DEVICE_FUNCTION_QUALIFIERS
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS
//
//
//
#define RESTRICT __restrict__
//
//
//
#define WARP_SIZE 32
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
int
int_add_sat(const int a, const int b)
{
int d;
asm("add.sat.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
float
float_add_sat(const float a, const float b)
{
float d;
asm("add.sat.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
float
float_mul_sat(const float a, const float b)
{
float d;
asm("mul.rn.ftz.sat.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
float
float_fma_sat(const float a, const float b)
{
float d;
asm("fma.rn.ftz.sat.f32 %0, %1, %2, %2;" : "=f"(d) : "f"(a), "f"(b));
return d;
}
//
//
//
#define IPC_2(v) \
II(0,v); \
II(1,v);
#define IPC_4(v) \
IPC_2(v); \
II(2,v); \
II(3,v);
#define IPC_8(v) \
IPC_4(v); \
II(4,v); \
II(5,v); \
II(6,v); \
II(7,v);
#define IPC_16(v) \
IPC_8(v); \
II(8,v); \
II(9,v); \
II(10,v); \
II(11,v); \
II(12,v); \
II(13,v); \
II(14,v); \
II(15,v);
#define IPC_32(v) \
IPC_16(v); \
II(16,v); \
II(17,v); \
II(18,v); \
II(19,v); \
II(20,v); \
II(21,v); \
II(22,v); \
II(23,v); \
II(24,v); \
II(25,v); \
II(26,v); \
II(27,v); \
II(28,v); \
II(29,v); \
II(30,v); \
II(31,v);
//
//
//
#define REP_2(v) \
RR(0,v); \
RR(1,v); \
#define REP_4(v) \
REP_2(v); \
RR(2,v); \
RR(3,v);
#define REP_8(v) \
REP_4(v); \
RR(4,v); \
RR(5,v); \
RR(6,v); \
RR(7,v);
#define REP_16(v) \
REP_8(v); \
RR(8,v); \
RR(9,v); \
RR(10,v); \
RR(11,v); \
RR(12,v); \
RR(13,v); \
RR(14,v); \
RR(15,v);
#define REP_32(v) \
REP_16(v); \
RR(16,v); \
RR(17,v); \
RR(18,v); \
RR(19,v); \
RR(20,v); \
RR(21,v); \
RR(22,v); \
RR(23,v); \
RR(24,v); \
RR(25,v); \
RR(26,v); \
RR(27,v); \
RR(28,v); \
RR(29,v); \
RR(30,v); \
RR(31,v);
//
//
//
#define TYPE float
#define OP float_fma_sat
//
//
//
KERNEL_QUALIFIERS
void
ipc_kernel(const TYPE* const RESTRICT vin, TYPE* const RESTRICT vout)
{
//
// FIXME -- PLEASE TRY OTHER IPC LEVELS OTHER THAN POWER OF TWO THAT I HAVE HERE
//
#if __CUDA_ARCH__ >= 500 // MAXWELL LIKES THIS
#define IPC(v) IPC_8(v)
#define REP(v) REP_4(v)
#elif __CUDA_ARCH__ >= 350 // GK208
#define IPC(v) IPC_8(v)
#define REP(v) REP_4(v)
#elif __CUDA_ARCH__ >= 300 // GK104
#define IPC(v) IPC_16(v)
#define REP(v) REP_4(v)
#else // FERMI and below
#define IPC(v) IPC_8(v)
#define REP(v) REP_4(v)
#endif
//
//
//
const unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
//
// LOAD
//
#undef II
#define II(a,v) TYPE s##a = vin[tid*(a+1)];
IPC(0);
//
// PERFORM IPC * REP add_sat() ops
//
#undef II
#define II(a,v) s##a = OP(s##a,(v+1));
#undef RR
#define RR(a,v) IPC(a)
#pragma unroll
for (int ii=0; ii<16384; ii++) // spin a _lot_ of times
{
REP(0);
}
//
// STORE
//
#undef II
#define II(a,v) vout[tid*(a+1)] = s##a;
IPC(0);
}
//
//
//
#define IPC_MAIN
#ifdef IPC_MAIN
//
//
//
#include <stdio.h>
int
main(int argc, char** argv)
{
// ipc [device] [# of warps] [# of blocks]
const int device = (argc >= 2) ? atoi(argv[1]) : 0;
const int warps = (argc >= 3) ? atoi(argv[2]) : 16;
const int blocks = (argc >= 4) ? atoi(argv[3]) : 1;
cudaSetDevice(device);
//
//
//
cudaDeviceProp props;
cudaGetDeviceProperties(&props,device);
printf("%s (%2d)\n",props.name,props.multiProcessorCount);
//
//
//
const int threads = WARP_SIZE * warps;
printf("ipc_kernel<<<%d,%d>>>(...)\n",blocks,threads);
//
// ALLOCATE BUFFERS
//
TYPE* vin;
TYPE* vout;
cudaMalloc(&vin, sizeof(TYPE) * blocks * threads);
cudaMalloc(&vout,sizeof(TYPE) * blocks * threads);
//
// INIT VIN
//
// init vin[] if you want
//
// LAUNCH KERNEL
//
ipc_kernel<<<blocks,threads>>>(vin,vout);
cudaDeviceSynchronize();
//
// FREE BUFFERS
//
cudaFree(vin);
cudaFree(vout);
cudaDeviceReset();
return 0;
}
//
//
//
#endif // IPC_MAIN
//
//
//
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment