Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active August 29, 2015 14:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allanmac/ed033f20e566932680ff to your computer and use it in GitHub Desktop.
Save allanmac/ed033f20e566932680ff to your computer and use it in GitHub Desktop.
Try to push an integer math kernel's IPC metric as high as possible!
// -*- compile-command: "nvcc -m 32 -arch sm_50 -Xptxas=-v,-abi=no int_ipc.cu -o int_ipc" ; -*-
//
//
//
#define KERNEL_QUALIFIERS __global__
#define KERNEL_QUALIFIERS_EXTERN extern KERNEL_QUALIFIERS
#define KERNEL_QUALIFIERS_EXTERN_C extern "C" KERNEL_QUALIFIERS
//
//
//
#ifndef _DEBUG
#define DEVICE_FUNCTION_QUALIFIERS __device__ __forceinline__
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
#else
#define DEVICE_FUNCTION_QUALIFIERS __device__
#define DEVICE_INTRINSIC_QUALIFIERS __device__
#endif
//
//
//
#define DEVICE_STATIC_FUNCTION_QUALIFIERS static DEVICE_FUNCTION_QUALIFIERS
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS
//
//
//
#define RESTRICT __restrict__
//
//
//
#define WARP_SIZE 32
//
//
//
DEVICE_STATIC_INTRINSIC_QUALIFIERS
int
add(const int a, const int b)
{
int d;
asm("add.sat.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
return d;
}
DEVICE_STATIC_INTRINSIC_QUALIFIERS
int
mul(const int a, const int b)
{
int d;
// asm("mad.hi.sat.s32 %0, %1, %2, 7;" : "=r"(d) : "r"(a), "r"(b));
// asm("mul.lo.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
d = a * b;
return d;
}
//
//
//
#define IPC_2(v) \
II(0,v); \
II(1,v);
#define IPC_4(v) \
IPC_2(v); \
II(2,v); \
II(3,v);
#define IPC_8(v) \
IPC_4(v); \
II(4,v); \
II(5,v); \
II(6,v); \
II(7,v);
#define IPC_16(v) \
IPC_8(v); \
II(8,v); \
II(9,v); \
II(10,v); \
II(11,v); \
II(12,v); \
II(13,v); \
II(14,v); \
II(15,v);
#define IPC_32(v) \
IPC_16(v); \
II(16,v); \
II(17,v); \
II(18,v); \
II(19,v); \
II(20,v); \
II(21,v); \
II(22,v); \
II(23,v); \
II(24,v); \
II(25,v); \
II(26,v); \
II(27,v); \
II(28,v); \
II(29,v); \
II(30,v); \
II(31,v);
//
//
//
#define REP_2(v) \
RR(0,v); \
RR(1,v); \
#define REP_4(v) \
REP_2(v); \
RR(2,v); \
RR(3,v);
#define REP_8(v) \
REP_4(v); \
RR(4,v); \
RR(5,v); \
RR(6,v); \
RR(7,v);
#define REP_16(v) \
REP_8(v); \
RR(8,v); \
RR(9,v); \
RR(10,v); \
RR(11,v); \
RR(12,v); \
RR(13,v); \
RR(14,v); \
RR(15,v);
#define REP_32(v) \
REP_16(v); \
RR(16,v); \
RR(17,v); \
RR(18,v); \
RR(19,v); \
RR(20,v); \
RR(21,v); \
RR(22,v); \
RR(23,v); \
RR(24,v); \
RR(25,v); \
RR(26,v); \
RR(27,v); \
RR(28,v); \
RR(29,v); \
RR(30,v); \
RR(31,v);
//
//
//
#define TYPE int
//
//
//
KERNEL_QUALIFIERS
void
int_ipc_kernel(const TYPE* const RESTRICT vin, TYPE* const RESTRICT vout)
{
//
// FIXME -- PLEASE TRY OTHER IPC LEVELS OTHER THAN POWER OF TWO THAT I HAVE HERE
//
#if __CUDA_ARCH__ >= 500 // MAXWELL LIKES THIS
#define IPC(v) IPC_8(v)
#define REP(v) REP_4(v)
#elif __CUDA_ARCH__ >= 350 // GK208
#define IPC(v) IPC_8(v)
#define REP(v) REP_4(v)
#elif __CUDA_ARCH__ >= 300 // GK104
#define IPC(v) IPC_16(v)
#define REP(v) REP_4(v)
#else // FERMI and below
#define IPC(v) IPC_8(v)
#define REP(v) REP_4(v)
#endif
//
//
//
const unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
//
// LOAD
//
#undef II
#define II(a,v) TYPE s##a = vin[tid];
IPC(0);
//
// PERFORM IPC * REP add_sat() ops
//
#undef II
#define II(a,v) s##a = mul(s##a,s##v);
#undef RR
#define RR(a,v) IPC(a)
// #pragma unroll
for (int ii=0; ii<16384; ii++) // spin a _lot_ of times
{
REP(0);
}
//
// STORE
//
#undef II
#define II(a,v) vout[tid*(a+1)] = s##a;
IPC(0);
}
//
//
//
#define INT_IPC_MAIN
#ifdef INT_IPC_MAIN
//
//
//
#include <stdio.h>
int
main(int argc, char** argv)
{
// int_ipc [device] [# of warps] [# of blocks]
const int device = (argc >= 2) ? atoi(argv[1]) : 0;
const int warps = (argc >= 3) ? atoi(argv[2]) : 16;
const int blocks = (argc >= 4) ? atoi(argv[3]) : 1;
cudaSetDevice(device);
//
//
//
cudaDeviceProp props;
cudaGetDeviceProperties(&props,device);
printf("%s (%2d)\n",props.name,props.multiProcessorCount);
//
//
//
const int threads = WARP_SIZE * warps;
printf("int_ipc_kernel<<<%d,%d>>>(...)\n",blocks,threads);
//
// ALLOCATE BUFFERS
//
TYPE* vin;
TYPE* vout;
cudaMalloc(&vin, sizeof(TYPE) * blocks * threads);
cudaMalloc(&vout,sizeof(TYPE) * blocks * threads);
//
// INIT VIN
//
// init vin[] if you want
//
// LAUNCH KERNEL
//
int_ipc_kernel<<<blocks,threads>>>(vin,vout);
cudaDeviceSynchronize();
//
// FREE BUFFERS
//
cudaFree(vin);
cudaFree(vout);
cudaDeviceReset();
return 0;
}
//
//
//
#endif // INT_IPC_MAIN
//
//
//
@allanmac
Copy link
Author

Built with:

nvcc -m 32 -arch sm_50 -Xptxas=-v,-abi=no int_ipc.cu -o int_ipc

Run with:

int_ipc <device#> <# of warps> <# of blocks>

@allanmac
Copy link
Author

On a GTX 750 Ti (Maxwell) an IPC of 4.05 is achieved:

>nvprof -m ipc int_ipc.exe 0 32
==5188== NVPROF is profiling process 5188, command: int_ipc.exe 0 32
GeForce GTX 750 Ti ( 5)
int_ipc_kernel<<<1,1024>>>(...)
==5188== Profiling application: int_ipc.exe 0 32
==5188== Profiling result:
==5188== Metric result:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "GeForce GTX 750 Ti (0)"
        Kernel: int_ipc_kernel(int const *, int*)
          1                                       ipc                              Executed IPC    4.052755    4.052755    4.052755

@allanmac
Copy link
Author

On a GT 630 (GK208) an IPC of 3.53 is achieved:

>nvprof -m ipc int_ipc.exe 2 32
==8900== NVPROF is profiling process 8900, command: int_ipc.exe 2 32
GeForce GT 630 ( 2)
int_ipc_kernel<<<1,1024>>>(...)
==8900== Profiling application: int_ipc.exe 2 32
==8900== Profiling result:
==8900== Metric result:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "GeForce GT 630 (2)"
        Kernel: int_ipc_kernel(int const *, int*)
          1                                       ipc                              Executed IPC    3.533268    3.533268    3.533268

@allanmac
Copy link
Author

allanmac commented Jan 5, 2015

$ nvprof -m ipc int_ipc 0 32
==1656== NVPROF is profiling process 1656, command: int_ipc 0 32
GK20A ( 1)
int_ipc_kernel<<<1,1024>>>(...)
==1656== Profiling application: int_ipc 0 32
==1656== Profiling result:
==1656== Metric result:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "GK20A (0)"
        Kernel: int_ipc_kernel(int const *, int*)
          1                                       ipc                              Executed IPC    3.258458    3.258458    3.258458

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment