Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Minimal CUDA example (with helpful comments).
#include <stdio.h>
//
// Nearly minimal CUDA example.
// Compile with:
//
// nvcc -o example example.cu
//
#define N 1000
//
// A function marked __global__
// runs on the GPU but can be called from
// the CPU.
//
// This function multiplies the elements of an array
// of ints by 2.
//
// The entire computation can be thought of as running
// with one thread per array element with blockIdx.x
// identifying the thread.
//
// The comparison i<N is because often it isn't convenient
// to have an exact 1-1 correspondence between threads
// and array elements. Not strictly necessary here.
//
// Note how we're mixing GPU and CPU code in the same source
// file. An alternative way to use CUDA is to keep
// C/C++ code separate from CUDA code and dynamically
// compile and load the CUDA code at runtime, a little
// like how you compile and load OpenGL shaders from
// C/C++ code.
//
__global__
void add(int *a, int *b) {
int i = blockIdx.x;
if (i<N) {
b[i] = 2*a[i];
}
}
int main() {
//
// Create int arrays on the CPU.
// ('h' stands for "host".)
//
int ha[N], hb[N];
//
// Create corresponding int arrays on the GPU.
// ('d' stands for "device".)
//
int *da, *db;
cudaMalloc((void **)&da, N*sizeof(int));
cudaMalloc((void **)&db, N*sizeof(int));
//
// Initialise the input data on the CPU.
//
for (int i = 0; i<N; ++i) {
ha[i] = i;
}
//
// Copy input data to array on GPU.
//
cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice);
//
// Launch GPU code with N threads, one per
// array element.
//
add<<<N, 1>>>(da, db);
//
// Copy output array from GPU back to CPU.
//
cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i<N; ++i) {
printf("%d\n", hb[i]);
}
//
// Free up the arrays on the GPU.
//
cudaFree(da);
cudaFree(db);
return 0;
}
@alexsystemf
Copy link

alexsystemf commented Oct 5, 2018

It produces random scrambled results, as if the output values are pointer addresses. Running on Linux Mint 19 with GTX 1060.

Ryan, if you see random output numbers, are just out-of-bound memory contents. check the i and N values and re-compile and re-run it. also try to " #define N 32 " or a small number...

@tdard
Copy link

tdard commented Sep 18, 2019

Thank you!

@woshiyyya
Copy link

woshiyyya commented Feb 7, 2022

Super helpful!! Thx!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment