Skip to content

Instantly share code, notes, and snippets.

@allanmac
Last active December 17, 2015 23:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allanmac/5691524 to your computer and use it in GitHub Desktop.
Save allanmac/5691524 to your computer and use it in GitHub Desktop.
Demonstrate the impact of the resident block limit on grids with 32-thread "tinyblocks".
#include <stdio.h>
#include <stdint.h>
//
//
//
#define WARP_SIZE 32
//
//
//
__global__
void
pmFloatKernel()
{
  // Seed the accumulator from the PTX %clock special register so the
  // starting value is only known at run time.
  int seed;
  asm volatile("mov.u32 %0, %clock;" : "=r"(seed));

  float acc = __int_as_float(seed);

  // 4096 dependent single-precision adds per thread: this is the whole
  // workload of the kernel.
  for (int remaining = 4096; remaining > 0; --remaining)
    {
      acc = acc + acc;
    }

  // Consume the result so the loop above has an observable use; the
  // performance-monitor event fires only if the bit pattern matches.
  // NOTE(review): presumably this exists purely to keep the compiler from
  // discarding the arithmetic - confirm against the generated SASS.
  if (__float_as_int(acc) == 0xDEADBEEF)
    {
      asm volatile("pmevent 15;");
    }
}
//
//
//
/*
 * Greatest common divisor by the Euclidean algorithm.
 *
 * Returns b when a is 0; otherwise repeatedly replaces (a, b) with
 * (b % a, a) until a reaches 0 and returns the remaining b.
 */
int
gcd(int a, int b)
{
  while (a != 0)
    {
      const int remainder = b % a;
      b = a;
      a = remainder;
    }
  return b;
}
/*
 * Print a diagnostic for a failed CUDA runtime call.
 *
 * Returns 0 when err is cudaSuccess, otherwise writes "<what> failed: <reason>"
 * to stderr and returns 1.
 */
static int
reportCudaError(cudaError_t err, const char *what)
{
  if (err == cudaSuccess)
    return 0;

  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return 1;
}

/*
 * Times pmFloatKernel over a sweep of launch shapes on one device.
 *
 * Usage: blocks [device-ordinal]   (defaults to device 0)
 *
 * Starting from threads = gcd(maxThreadsPerMultiProcessor, maxThreadsPerBlock),
 * each pass halves the block size and doubles the block count, so the total
 * thread count stays constant, until the block size drops below WARP_SIZE.
 * Each pass prints one CSV row to stdout:
 *   name, SM count, max threads/SM, max threads/block, blocks, threads, ms
 *
 * Returns 0 on success and -1 if any CUDA call or kernel launch fails.
 */
int main(int argc, char **argv)
{
  int device = (argc == 1) ? 0 : atoi(argv[1]);

  cudaDeviceProp props;

  if (reportCudaError(cudaGetDeviceProperties(&props,device),"cudaGetDeviceProperties"))
    return -1;

  if (reportCudaError(cudaSetDevice(device),"cudaSetDevice"))
    return -1;

  //
  // loops  : back-to-back launches per timed measurement
  // mpLoad : how many times over each multiprocessor's thread capacity is launched
  //
  const unsigned int loops  = 100;
  const unsigned int mpLoad = 8;

  //
  // Largest block size that divides the per-SM thread capacity and fits the
  // per-block limit, plus the block count that keeps total threads fixed.
  //
  unsigned int threads = gcd(props.maxThreadsPerMultiProcessor, props.maxThreadsPerBlock);
  unsigned int blocks  = props.multiProcessorCount * mpLoad * props.maxThreadsPerMultiProcessor / threads;

  //
  //
  //
  cudaEvent_t start, stop;

  if (reportCudaError(cudaEventCreate(&start),"cudaEventCreate(start)"))
    return -1;

  if (reportCudaError(cudaEventCreate(&stop,cudaEventBlockingSync),"cudaEventCreate(stop)"))
    {
      cudaEventDestroy(start);
      return -1;
    }

  //
  //
  //
  int status = 0;

  while (threads >= WARP_SIZE)
    {
      cudaError_t err = cudaEventRecord(start);

      if (err == cudaSuccess)
        {
          for (unsigned int ii=0; ii<loops; ii++)
            pmFloatKernel<<<blocks,threads>>>();

          // Launch errors (bad configuration, no kernel image for this
          // device, ...) are only visible through cudaGetLastError().
          // Checking once after the loop keeps the timed region free of
          // extra host calls.
          err = cudaGetLastError();
        }

      if (err == cudaSuccess)
        err = cudaEventRecord(stop);

      if (err == cudaSuccess)
        err = cudaEventSynchronize(stop);

      float elapsedTime = 0.0f;

      if (err == cudaSuccess)
        err = cudaEventElapsedTime(&elapsedTime,start,stop);

      if (reportCudaError(err,"timed kernel run"))
        {
          status = -1;
          break;
        }

      // The cudaDeviceProp fields are ints, so print them with %d.
      printf("%s, %d, %d, %d, %u, %u, %.1f\n",
             props.name,
             props.multiProcessorCount,
             props.maxThreadsPerMultiProcessor,
             props.maxThreadsPerBlock,
             blocks,
             threads,
             elapsedTime);

      threads /= 2;
      blocks  *= 2;
    }

  cudaEventDestroy(stop);
  cudaEventDestroy(start);

  return status;
}
# Build the `blocks` executable from blocks.cu with nvcc.
#   -m 32                : request a 32-bit build
#   -Xptxas=-v,-abi=no   : forward "-v" and "-abi=no" to ptxas
#   -gencode=...         : embed device code for sm_11, sm_12, sm_21, sm_30 and sm_35
# NOTE(review): make requires the recipe below to start with a TAB; leading
# whitespace appears to have been lost in this copy - confirm before use.
# NOTE(review): these compute_1x/2x/3x targets are likely rejected by recent
# CUDA toolkits - verify against the installed nvcc.
all:
nvcc -m 32 -Xptxas=-v,-abi=no \
-gencode=arch=compute_11,code=sm_11 \
-gencode=arch=compute_12,code=sm_12 \
-gencode=arch=compute_20,code=sm_21 \
-gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_35,code=sm_35 \
blocks.cu -o blocks
@allanmac
Copy link
Author

allanmac commented Jun 1, 2013

> blocks 0
Tesla K20c, 13, 2048, 1024, 208, 1024, 70.4
Tesla K20c, 13, 2048, 1024, 416, 512, 73.2
Tesla K20c, 13, 2048, 1024, 832, 256, 72.5
Tesla K20c, 13, 2048, 1024, 1664, 128, 72.7
Tesla K20c, 13, 2048, 1024, 3328, 64, 85.6
Tesla K20c, 13, 2048, 1024, 6656, 32, 171.1

> blocks 2
GeForce GTX 680, 8, 2048, 1024, 128, 1024, 52.5
GeForce GTX 680, 8, 2048, 1024, 256, 512, 55.1
GeForce GTX 680, 8, 2048, 1024, 512, 256, 54.6
GeForce GTX 680, 8, 2048, 1024, 1024, 128, 51.1
GeForce GTX 680, 8, 2048, 1024, 2048, 64, 57.6
GeForce GTX 680, 8, 2048, 1024, 4096, 32, 111.3

> blocks 3
GeForce GT 545, 3, 1536, 1024, 72, 512, 124.8
GeForce GT 545, 3, 1536, 1024, 144, 256, 124.4
GeForce GT 545, 3, 1536, 1024, 288, 128, 123.1
GeForce GT 545, 3, 1536, 1024, 576, 64, 141.3
GeForce GT 545, 3, 1536, 1024, 1152, 32, 265.3

> blocks 1
GeForce GT 240, 12, 1024, 512, 192, 512, 1168.4
GeForce GT 240, 12, 1024, 512, 384, 256, 1154.2
GeForce GT 240, 12, 1024, 512, 768, 128, 1143.0
GeForce GT 240, 12, 1024, 512, 1536, 64, 1136.0
GeForce GT 240, 12, 1024, 512, 3072, 32, 1069.9

> blocks 4
GeForce 9400 GT, 4, 768, 512, 96, 256, 874.5
GeForce 9400 GT, 4, 768, 512, 192, 128, 874.5
GeForce 9400 GT, 4, 768, 512, 384, 64, 874.5
GeForce 9400 GT, 4, 768, 512, 768, 32, 878.4

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment