Skip to content

Instantly share code, notes, and snippets.

@Timer
Last active March 23, 2016 21:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Timer/d6b040465a1e15b67ee3 to your computer and use it in GitHub Desktop.
#include <stdio.h>
// Map a compute capability (major.minor) to CUDA cores ("stream processors")
// per multiprocessor. The runtime API does not expose this count, so it is
// derived from the architecture tables in NVIDIA's CUDA C Programming Guide.
// Returns 0 for architectures not in the table.
static int coresPerSM(int major, int minor) {
    switch (major) {
        case 2: return (minor == 1) ? 48 : 32;   // Fermi
        case 3: return 192;                      // Kepler
        case 5: return 128;                      // Maxwell
        case 6: return (minor == 0) ? 64 : 128;  // Pascal
        case 7: return 64;                       // Volta / Turing
        case 8: return (minor == 0) ? 64 : 128;  // Ampere / Ada
        case 9: return 128;                      // Hopper
        default: return 0;                       // unknown/future architecture
    }
}

// Enumerate every CUDA device and print its key execution-configuration
// limits (SM count, cores, warp size, block/grid dimension maxima).
int main() {
    int devices = 0;
    cudaError_t err = cudaGetDeviceCount(&devices);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    for (int d = 0; d < devices; ++d) {
        cudaDeviceProp p;
        err = cudaGetDeviceProperties(&p, d);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", d, cudaGetErrorString(err));
            return 1;
        }
        int mp = p.multiProcessorCount;
        int sp = coresPerSM(p.major, p.minor);
        printf("Device %d: %s\n", d, p.name);
        printf(" -> compute capability: %d.%d\n", p.major, p.minor);
        printf(" -> multiprocessor count: %d\n", mp);
        printf(" -> stream processor count: %d (total %d)\n", sp, sp * mp);
        printf(" -> warp size: %d\n", p.warpSize);
        printf(" -> max threads per block: %d\n", p.maxThreadsPerBlock);
        printf(" -> max block dimensions: %d x %d x %d\n", p.maxThreadsDim[0], p.maxThreadsDim[1], p.maxThreadsDim[2]);
        printf(" -> max grid dimensions: %d x %d x %d\n", p.maxGridSize[0], p.maxGridSize[1], p.maxGridSize[2]);
        puts("");
    }
    return 0;
}
#include <stdio.h>
const int ARRAY_SIZE = 200;
// Doubles each of the first `count` entries of `arr` in place.
// Launch as a 1-D grid with at least `count` total threads; surplus threads
// in the final block fall out via the bounds check.
__global__ void Dup(long count, int *arr) {
    // Cast BEFORE multiplying: blockIdx.x * blockDim.x is a 32-bit unsigned
    // product and would silently wrap for grids past 2^32 threads, even
    // though the result is stored into a long.
    long idx = (long) blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < count) {
        arr[idx] *= 2;
    }
}
// Forward declaration: getLaunchConfiguration is defined later in this file,
// and a template must be visible before its first use.
template <class T>
__host__ void getLaunchConfiguration(T t, int n, int *blocks, int *threads);

// Host driver: fills arr with 0..ARRAY_SIZE-1, doubles it on the GPU with
// Dup, copies the result back, and verifies every entry.
// Returns 0 on success ("PASS"), 1 on any allocation/CUDA/verification error.
int main() {
    int *arr = (int *) malloc(ARRAY_SIZE * sizeof(int));
    if (arr == NULL) {
        fprintf(stderr, "malloc failed!\n");
        return 1;
    }
    for (int i = 0; i < ARRAY_SIZE; ++i) arr[i] = i;

    int *c_arr = NULL;
    cudaError_t err = cudaMalloc((void **) &c_arr, ARRAY_SIZE * sizeof(int));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        free(arr);
        return 1;
    }
    err = cudaMemcpy(c_arr, arr, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        cudaFree(c_arr);
        free(arr);
        return 1;
    }

    int blocks, threads;
    getLaunchConfiguration(Dup, ARRAY_SIZE, &blocks, &threads);
    printf("Launch %d blocks of %d threads for %d elements.\n", blocks, threads, ARRAY_SIZE);
    Dup<<<blocks, threads>>>(ARRAY_SIZE, c_arr);

    // Kernel launches do not return errors directly: bad launch configs
    // surface via cudaGetLastError(), in-kernel faults at the next sync.
    err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(c_arr);
        free(arr);
        return 1;
    }

    err = cudaMemcpy(arr, c_arr, ARRAY_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        cudaFree(c_arr);
        free(arr);
        return 1;
    }
    cudaFree(c_arr);

    for (int i = 0; i < ARRAY_SIZE; ++i) {
        if (arr[i] != i * 2) {
            fprintf(stderr, "CUDA failed to double array entries (index %d) ...\n", i);
            free(arr);
            return 1;
        }
    }
    free(arr);
    puts("PASS");
    return 0;
}
// Declaration of the CUDA runtime occupancy helper (provided by
// cuda_runtime.h); repeated here so the snippet documents the exact
// signature used below.
template <class T>
cudaError_t cudaOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, T kernel, size_t dynamicSMemSize, int blockSizeLimit);

// Picks a 1-D launch configuration for kernel `t` over `n` elements:
// `*threads` is the occupancy-optimal block size (capped at n by the
// blockSizeLimit argument) and `*blocks` = ceil(n / *threads) so every
// element is covered by a thread.
// Falls back to 256 threads per block if the occupancy query fails.
template <class T>
__host__ void getLaunchConfiguration(T t, int n, int *blocks, int *threads) {
    int minGrid = 0;  // occupancy-minimizing grid size; not used by callers
    cudaError_t err = cudaOccupancyMaxPotentialBlockSize(&minGrid, threads, t, 0, n);
    if (err != cudaSuccess || *threads <= 0) {
        *threads = 256;  // safe, warp-aligned default
    }
    *blocks = (n + *threads - 1) / *threads;  // ceil-div covers the tail
}
//Define a kernel `k` ...
// Example kernel skeleton: derives a flat 1-D global thread index and
// bounds-checks it, since the grid rarely divides the data evenly.
__global__ void k(void *data, int count) {
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;
    if (gid >= count) return;  // surplus threads in the last block do nothing
    // Here be dragons
}
int blocks = 64, threads = 512;
//Given kernel `k`, we can invoke it by writing the following:
k<<<blocks, threads>>>(...);
//Variables `blocks` and `threads` are of type `dim3` or `int`
//Note: when launching with a multi-dimensional `dim3` configuration, the computation of `idx` inside the kernel must also be extended to account for the extra dimensions
//By default, passing an int to the kernel invocation creates a single dimension `dim3` (int x 1 x 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment