Last active
March 23, 2016 21:03
-
-
Save Timer/d6b040465a1e15b67ee3 to your computer and use it in GitHub Desktop.
Code for blog post http://www.invokestatic.com/2016/01/17/cuda-1/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
/**
 * Enumerates every CUDA device visible to the runtime and prints, for each
 * one, its name, multiprocessor count, an estimate of the stream processor
 * (CUDA core) count, the warp size, and the launch-dimension limits.
 *
 * The per-multiprocessor core count is not exposed by cudaDeviceProp, so it
 * is looked up from the compute capability. Unknown compute capabilities
 * are reported as 0 cores.
 *
 * Returns 0 on success, 1 if a CUDA runtime call fails.
 */
int main() {
    int devices = 0;
    cudaError_t err = cudaGetDeviceCount(&devices);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    for (int d = 0; d < devices; ++d) {
        cudaDeviceProp p;
        err = cudaGetDeviceProperties(&p, d);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties failed for device %d: %s\n", d, cudaGetErrorString(err));
            return 1;
        }
        int mp = p.multiProcessorCount, sp = 0;
        // Cores per multiprocessor, keyed by compute capability (major.minor).
        switch (p.major) {
            case 1: sp = 8; break;
            case 2: sp = (p.minor == 1) ? 48 : 32; break;
            case 3: sp = 192; break;
            case 5: sp = 128; break;
            case 6: sp = (p.minor == 0) ? 64 : 128; break;
            case 7: sp = 64; break;
            case 8: sp = (p.minor == 0) ? 64 : 128; break;
            case 9: sp = 128; break;
            default: break;  // unknown architecture: leave sp at 0
        }
        printf("Device %d: %s\n", d, p.name);
        printf(" -> multiprocessor count: %d\n", mp);
        printf(" -> stream processor count: %d (total %d)\n", sp, sp * mp);
        printf(" -> warp size: %d\n", p.warpSize);
        printf(" -> max threads per block: %d\n", p.maxThreadsPerBlock);
        printf(" -> max block dimensions: %d x %d x %d\n", p.maxThreadsDim[0], p.maxThreadsDim[1], p.maxThreadsDim[2]);
        printf(" -> max grid dimensions: %d x %d x %d\n", p.maxGridSize[0], p.maxGridSize[1], p.maxGridSize[2]);
        puts("");
    }
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
const int ARRAY_SIZE = 200; | |
/**
 * Doubles the first `count` elements of `arr` in place, one element per thread.
 *
 * The global index is derived from the x dimension only, so the kernel
 * handles an element only if the x-dimension launch supplies a thread for it;
 * threads whose index is >= count do nothing.
 *
 * @param count number of elements to double
 * @param arr   array dereferenced on the device; must hold at least `count` ints
 */
__global__ void Dup(long count, int *arr) {
    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise evaluated
    // in 32-bit unsigned arithmetic and can wrap on very large grids.
    const long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < count) {
        arr[idx] *= 2;
    }
}
/**
 * Runs the GPU round trip for the doubling test: copies `arr` to `c_arr`,
 * launches Dup over ARRAY_SIZE elements, waits for completion and copies the
 * result back into `arr`.
 *
 * @param arr   host buffer of ARRAY_SIZE ints (input and output)
 * @param c_arr device buffer of ARRAY_SIZE ints
 * @return 0 on success, 1 if any CUDA call or the kernel launch failed
 */
static int doubleOnDevice(int *arr, int *c_arr) {
    cudaError_t err = cudaMemcpy(c_arr, arr, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    int blocks = 0, threads = 0;
    getLaunchConfiguration(Dup, ARRAY_SIZE, &blocks, &threads);
    printf("Launch %d blocks of %d threads for %d elements.\n", blocks, threads, ARRAY_SIZE);
    Dup<<<blocks, threads>>>(ARRAY_SIZE, c_arr);
    // A kernel launch returns nothing; configuration errors only show up here.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Errors raised while the kernel executes are reported by the next sync.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    err = cudaMemcpy(arr, c_arr, ARRAY_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

/**
 * Fills a host array with 0..ARRAY_SIZE-1, doubles it on the GPU and checks
 * every entry. Prints "PASS" and returns 0 on success; prints a diagnostic to
 * stderr and returns 1 on any failure. Host and device buffers are released
 * on every path once they have been allocated.
 */
int main() {
    int *arr = (int *) malloc(ARRAY_SIZE * sizeof(int));
    if (arr == NULL) {
        fprintf(stderr, "malloc failed!\n");
        return 1;
    }
    for (int i = 0; i < ARRAY_SIZE; ++i) arr[i] = i;

    int *c_arr = NULL;
    cudaError_t err = cudaMalloc((void **) &c_arr, ARRAY_SIZE * sizeof(int));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        free(arr);
        return 1;
    }

    int status = doubleOnDevice(arr, c_arr);

    err = cudaFree(c_arr);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(err));
        status = 1;
    }

    if (status == 0) {
        for (int i = 0; i < ARRAY_SIZE; ++i) {
            if (arr[i] != i * 2) {
                fprintf(stderr, "CUDA failed to double array entries (index %d) ...\n", i);
                status = 1;
                break;
            }
        }
    }

    free(arr);
    if (status == 0) puts("PASS");
    return status;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Declaration of the CUDA runtime occupancy helper used to pick a launch
 * configuration for `kernel`. Per the CUDA Runtime API documentation
 * (confirm against your toolkit version):
 *
 * @param minGridSize     out: minimum grid size needed to reach the best potential occupancy
 * @param blockSize       out: suggested block size
 * @param kernel          the kernel the configuration is computed for
 * @param dynamicSMemSize dynamic shared memory the kernel will use per block, in bytes
 * @param blockSizeLimit  largest block size the kernel supports; 0 means no limit
 * @return a cudaError_t status code
 */
template <typename T>
cudaError_t cudaOccupancyMaxPotentialBlockSize(
    int *minGridSize,
    int *blockSize,
    T kernel,
    size_t dynamicSMemSize,
    int blockSizeLimit);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Chooses a 1D launch configuration that covers `n` elements with kernel `t`.
 *
 * The block size is the one suggested by cudaOccupancyMaxPotentialBlockSize,
 * capped at `n` so a small input does not launch a mostly idle block. The
 * block count is ceil(n / block size).
 *
 * If the occupancy query fails or suggests a non-positive block size, a
 * fallback of 256 threads (capped at `n`) is used so the outputs are always
 * defined. The error is not cleared here, so a later cudaGetLastError() in
 * the caller can still report it.
 *
 * @param t       kernel the configuration is computed for
 * @param n       number of elements to cover; values <= 0 yield 0 blocks
 * @param blocks  out: number of blocks to launch
 * @param threads out: number of threads per block (always > 0)
 */
template <class T>
__host__ void getLaunchConfiguration(T t, int n, int *blocks, int *threads) {
    const int kFallbackThreads = 256;
    // A limit of 0 tells the runtime not to cap the block size; never pass a
    // negative limit through.
    const int limit = n > 0 ? n : 0;
    int minGridSize = 0;
    int blockSize = 0;
    const cudaError_t err = cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, t, 0, limit);
    if (err != cudaSuccess || blockSize <= 0) {
        blockSize = (limit > 0 && limit < kFallbackThreads) ? limit : kFallbackThreads;
    }
    *threads = blockSize;
    // Ceil-divide in 64 bits so n + blockSize - 1 cannot overflow int.
    *blocks = n > 0
        ? static_cast<int>((static_cast<long long>(n) + blockSize - 1) / blockSize)
        : 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//Define a kernel `k` ... | |
// Skeleton kernel: derives a flat index from the x-dimension block and thread
// coordinates and does nothing for indices at or beyond `count`.
__global__ void k(void *data, int count) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= count) {
        return;
    }
    //Here be dragons
}
int blocks = 64, threads = 512; | |
//Given kernel `k`, we can invoke it by writing the following: | |
k<<<blocks, threads>>>(...); | |
//Variables `blocks` and `threads` are of type `dim3` or `int` | |
//Note: when launching with a multi-dimensional `dim3` grid or block, the computation of idx must be adjusted to account for the extra dimensions
//By default, passing an int to the kernel invocation creates a single dimension `dim3` (int x 1 x 1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment