Created
December 20, 2011 17:36
-
-
Save dpiponi/1502434 to your computer and use it in GitHub Desktop.
Minimal CUDA example (with helpful comments).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
// | |
// Nearly minimal CUDA example. | |
// Compile with: | |
// | |
// nvcc -o example example.cu | |
// | |
#define N 1000 | |
// | |
// A function marked __global__
// runs on the GPU but can be called from
// the CPU.
//
// This function multiplies the elements of an array
// of ints by 2: it reads a[i] and writes 2*a[i] to b[i]
// for every i below N.
//
// The entire computation can be thought of as running
// with one thread per array element. Each thread works
// out which element it owns from its block index, the
// block size and its index within the block, so the
// kernel gives the same result for any 1D launch that
// supplies at least N threads. When launched with one
// thread per block the index reduces to blockIdx.x.
//
// The comparison i<N is because often it isn't convenient
// to have an exact 1-1 correspondence between threads
// and array elements; surplus threads simply do nothing.
// The index is unsigned so that a very large launch
// cannot wrap around to a negative value that would
// slip past that comparison.
//
// Note how we're mixing GPU and CPU code in the same source
// file. An alternative way to use CUDA is to keep
// C/C++ code separate from CUDA code and dynamically
// compile and load the CUDA code at runtime, a little
// like how you compile and load OpenGL shaders from
// C/C++ code.
//
__global__
void add(const int *a, int *b) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        b[i] = 2 * a[i];
    }
}
//
// Report a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
//
static bool cudaOk(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

//
// Doubles the integers 0..N-1 on the GPU and prints the
// results, one per line.
//
// Returns 0 on success. If any CUDA call fails, a message
// is written to stderr, nothing is printed to stdout and
// 1 is returned.
//
int main() {
    //
    // Create int arrays on the CPU.
    // ('h' stands for "host".)
    //
    int ha[N], hb[N];
    const size_t bytes = N * sizeof(int);

    //
    // Pointers to the corresponding int arrays on the GPU.
    // ('d' stands for "device".)
    // They start out null so that the cudaFree calls at the
    // end are harmless if an allocation never happened.
    //
    int *da = NULL, *db = NULL;

    //
    // Initialise the input data on the CPU.
    //
    for (int i = 0; i < N; ++i) {
        ha[i] = i;
    }

    //
    // Allocate the GPU arrays and copy the input data to
    // the GPU. The && chain stops at the first failure.
    //
    bool ok = cudaOk(cudaMalloc((void **)&da, bytes), "cudaMalloc(da)")
           && cudaOk(cudaMalloc((void **)&db, bytes), "cudaMalloc(db)")
           && cudaOk(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(host to device)");

    if (ok) {
        //
        // Launch GPU code with N threads, one per
        // array element (N blocks of one thread each).
        //
        add<<<N, 1>>>(da, db);

        //
        // A kernel launch returns nothing, so a rejected
        // launch has to be picked up with cudaGetLastError.
        // The copy back from the GPU is also checked.
        //
        ok = cudaOk(cudaGetLastError(), "add kernel launch")
          && cudaOk(cudaMemcpy(hb, db, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");
    }

    //
    // Only print results that actually came back from the GPU.
    //
    if (ok) {
        for (int i = 0; i < N; ++i) {
            printf("%d\n", hb[i]);
        }
    }

    //
    // Free up the arrays on the GPU.
    //
    cudaFree(da);
    cudaFree(db);
    return ok ? 0 : 1;
}
FWIW I just tried following my own instructions with CUDA 11.4 running on WSL 2/Ubuntu and it worked. (Compiling with /usr/local/cuda/bin/nvcc.)
Thank you so much for this; truly helped a lot with my homework.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For me, this was solved by downgrading CUDA from 11.7 to 10.2