Last active
December 17, 2015 23:48
-
-
Save allanmac/5691524 to your computer and use it in GitHub Desktop.
Demonstrate the impact of the resident block limit on grids with 32-thread "tinyblocks".
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdint.h> | |
// | |
// | |
// | |
#define WARP_SIZE 32 | |
// | |
// | |
// | |
//
// Arithmetic-only kernel used as the timed workload. It takes no
// arguments and the visible code performs no loads or stores.
//
// Each thread reads the %clock special register, reinterprets those
// bits as a float, and adds the value to itself 4096 times. The result
// is then compared bit-for-bit against 0xDEADBEEF and, on a match, a
// "pmevent 15" instruction is issued.
//
// NOTE(review): seeding from %clock and consuming the result in the
// final comparison presumably exist to stop the compiler from folding
// or discarding the loop -- confirm against the generated SASS.
//
__global__
void
pmFloatKernel()
{
  int ticks;

  asm volatile("mov.u32 %0, %clock;" : "=r"(ticks));

  float acc = __int_as_float(ticks);

  for (int step = 0; step != 4096; ++step)
    {
      acc = acc + acc;
    }

  if (__float_as_int(acc) == 0xDEADBEEF)
    {
      asm volatile("pmevent 15;");
    }
}
// | |
// | |
// | |
//
// Greatest common divisor of a and b via Euclid's algorithm.
//
// The pair (a, b) is repeatedly replaced by (b % a, a) until a reaches
// zero; the value left in b is returned. gcd(0, b) therefore yields b.
//
int
gcd(int a, int b)
{
  while (a != 0)
    {
      const int rem = b % a;

      b = a;
      a = rem;
    }

  return b;
}
//
// Report a failed CUDA runtime call on stderr.
//
// Returns true when `err` is anything other than cudaSuccess, so call
// sites can write `if (cudaFailed(call, "what")) ...`.
//
static
bool
cudaFailed(cudaError_t err, const char* what)
{
  if (err == cudaSuccess)
    return false;

  fprintf(stderr,"%s failed: %s\n",what,cudaGetErrorString(err));
  return true;
}

//
// Time pmFloatKernel over a sweep of block sizes.
//
// Usage: blocks [device-ordinal]      (ordinal defaults to 0)
//
// The sweep starts at threads = gcd(maxThreadsPerMultiProcessor,
// maxThreadsPerBlock) and halves the block size (doubling the block
// count, so the total thread count stays fixed) until the block size
// drops below WARP_SIZE. Each configuration is launched `loops` times
// between a pair of events, and one CSV line is written to stdout:
//
//   name, SMs, maxThreadsPerSM, maxThreadsPerBlock, blocks, threads, ms
//
// Returns 0 on success and -1 if any CUDA call fails; diagnostics go to
// stderr so the CSV on stdout keeps its original format.
//
int main(int argc, char **argv)
{
  const int device = (argc == 1) ? 0 : atoi(argv[1]);

  cudaDeviceProp props;

  if (cudaFailed(cudaGetDeviceProperties(&props,device),"cudaGetDeviceProperties"))
    return -1;

  if (cudaFailed(cudaSetDevice(device),"cudaSetDevice"))
    return -1;

  //
  // loops:  launches per timed configuration
  // mpLoad: multiplier applied to one full multiprocessor's worth of
  //         threads when sizing the grid
  //
  const unsigned int loops  = 100;
  const unsigned int mpLoad = 8;

  unsigned int threads = gcd(props.maxThreadsPerMultiProcessor, props.maxThreadsPerBlock);
  unsigned int blocks  = props.multiProcessorCount * mpLoad * props.maxThreadsPerMultiProcessor / threads;

  //
  // The stop event is created with cudaEventBlockingSync, as in the
  // original code.
  //
  cudaEvent_t start, stop;

  if (cudaFailed(cudaEventCreate(&start),"cudaEventCreate(start)"))
    return -1;

  if (cudaFailed(cudaEventCreate(&stop,cudaEventBlockingSync),"cudaEventCreate(stop)"))
    {
      cudaEventDestroy(start);
      return -1;
    }

  int status = 0;

  while (threads >= WARP_SIZE)
    {
      printf("%s, %u, %u, %u, ",
             props.name,
             props.multiProcessorCount,
             props.maxThreadsPerMultiProcessor,
             props.maxThreadsPerBlock);

      if (cudaFailed(cudaEventRecord(start),"cudaEventRecord(start)"))
        {
          status = -1;
          break;
        }

      for (unsigned int ii=0; ii<loops; ii++)
        pmFloatKernel<<<blocks,threads>>>();

      //
      // A kernel launch returns nothing; a rejected configuration is
      // only visible through cudaGetLastError(). It is queried once
      // after the batch so no extra host work sits between launches.
      //
      if (cudaFailed(cudaGetLastError(),"pmFloatKernel launch"))
        {
          status = -1;
          break;
        }

      if (cudaFailed(cudaEventRecord(stop),"cudaEventRecord(stop)"))
        {
          status = -1;
          break;
        }

      // Errors raised while the kernels execute surface here.
      if (cudaFailed(cudaEventSynchronize(stop),"cudaEventSynchronize(stop)"))
        {
          status = -1;
          break;
        }

      float elapsedTime;

      if (cudaFailed(cudaEventElapsedTime(&elapsedTime,start,stop),"cudaEventElapsedTime"))
        {
          status = -1;
          break;
        }

      printf("%u, %u, %.1f\n",blocks,threads,elapsedTime);

      threads /= 2;
      blocks  *= 2;
    }

  cudaEventDestroy(stop);
  cudaEventDestroy(start);

  return status;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the "blocks" executable from blocks.cu.
#
# "all" is declared phony and depends on the real output file, so the
# binary is rebuilt only when blocks.cu is newer. The nvcc flags and the
# -gencode list are unchanged from the original rule.
# Recipe lines must begin with a tab character.
.PHONY: all

all: blocks

blocks: blocks.cu
	nvcc -m 32 -Xptxas=-v,-abi=no \
	-gencode=arch=compute_11,code=sm_11 \
	-gencode=arch=compute_12,code=sm_12 \
	-gencode=arch=compute_20,code=sm_21 \
	-gencode=arch=compute_30,code=sm_30 \
	-gencode=arch=compute_35,code=sm_35 \
	blocks.cu -o blocks
Author
allanmac
commented
Jun 1, 2013
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment