Instantly share code, notes, and snippets.

Embed
What would you like to do?
Minimal CUDA example (with helpful comments).
#include <stdio.h>
//
// Nearly minimal CUDA example.
// Compile with:
//
// nvcc -o example example.cu
//
#define N 1000
//
// A function marked __global__
// runs on the GPU but can be called from
// the CPU.
//
// This function multiplies the elements of an array
// of ints by 2.
//
// The launch in main() uses N blocks of one thread each, so
// there is one thread per array element and blockIdx.x alone
// identifies which element a thread handles.
//
// The comparison i<N is because often it isn't convenient
// to have an exact 1-1 correspondence between threads
// and array elements. Not strictly necessary here.
//
// Note how we're mixing GPU and CPU code in the same source
// file. An alternative way to use CUDA is to keep
// C/C++ code separate from CUDA code and dynamically
// compile and load the CUDA code at runtime, a little
// like how you compile and load OpenGL shaders from
// C/C++ code.
//
// Kernel: writes b[i] = 2*a[i] for every i in [0, N).
//
// a - input array, read for indices 0..N-1.
// b - output array, written for indices 0..N-1.
//
// Each thread handles the element at its global 1-D index
// (blockIdx.x * blockDim.x + threadIdx.x), so the result is the
// same for any 1-D block size as long as the grid supplies at
// least N threads. With one thread per block this index is
// simply blockIdx.x.
//
__global__
void add(int *a, int *b) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Surplus threads (global index >= N) must not touch memory.
    if (i < N) {
        b[i] = 2 * a[i];
    }
}
//
// Report a failed CUDA runtime call on stderr.
//
// status - the cudaError_t returned by the call.
// what   - short description of the call, used in the message.
//
// Returns true when status is cudaSuccess, false otherwise.
//
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}
//
// Fills an array with 0..N-1, doubles it on the GPU and prints
// the results, one value per line.
//
// Returns 0 on success and 1 if any CUDA call failed (the
// failure is reported on stderr and nothing is printed).
//
int main() {
    //
    // Create int arrays on the CPU.
    // ('h' stands for "host".)
    //
    int ha[N], hb[N];
    //
    // Initialise the input data on the CPU.
    //
    for (int i = 0; i < N; ++i) {
        ha[i] = i;
    }
    //
    // Create corresponding int arrays on the GPU.
    // ('d' stands for "device".)
    // The pointers start out NULL so that the cudaFree calls at
    // the end are harmless if an allocation fails.
    //
    int *da = NULL, *db = NULL;
    bool ok = cudaOk(cudaMalloc((void **)&da, N * sizeof(int)), "cudaMalloc(da)");
    ok = ok && cudaOk(cudaMalloc((void **)&db, N * sizeof(int)), "cudaMalloc(db)");
    //
    // Copy input data to array on GPU.
    //
    ok = ok && cudaOk(cudaMemcpy(da, ha, N * sizeof(int), cudaMemcpyHostToDevice),
                      "cudaMemcpy(host to device)");
    //
    // Launch GPU code with N blocks of one thread each, i.e.
    // one thread per array element.
    // A kernel launch returns no status itself; an invalid launch
    // configuration is only visible through cudaGetLastError().
    //
    if (ok) {
        add<<<N, 1>>>(da, db);
        ok = cudaOk(cudaGetLastError(), "kernel launch");
    }
    //
    // Copy output array from GPU back to CPU.
    // This blocking copy waits for the kernel to finish, so an
    // error raised while the kernel was running shows up here.
    //
    ok = ok && cudaOk(cudaMemcpy(hb, db, N * sizeof(int), cudaMemcpyDeviceToHost),
                      "cudaMemcpy(device to host)");
    if (ok) {
        for (int i = 0; i < N; ++i) {
            printf("%d\n", hb[i]);
        }
    }
    //
    // Free up the arrays on the GPU. Both frees are always
    // attempted, whatever happened above.
    //
    bool freedA = cudaOk(cudaFree(da), "cudaFree(da)");
    bool freedB = cudaOk(cudaFree(db), "cudaFree(db)");
    return (ok && freedA && freedB) ? 0 : 1;
}
@kylelk

This comment has been minimized.

Show comment
Hide comment
@kylelk

kylelk Mar 12, 2014

Thank you, this example is easy to understand

kylelk commented Mar 12, 2014

Thank you, this example is easy to understand

@YarnSeemannsgarn

This comment has been minimized.

Show comment
Hide comment
@YarnSeemannsgarn

YarnSeemannsgarn commented Sep 16, 2014

Thanks =)

@dpiponi

This comment has been minimized.

Show comment
Hide comment
@dpiponi

dpiponi May 11, 2017

Well this is convenient. I do a web search for a minimal CUDA example and find my own example. Still compiles and runs too :-)

Owner

dpiponi commented May 11, 2017

Well this is convenient. I do a web search for a minimal CUDA example and find my own example. Still compiles and runs too :-)

@john-danson

This comment has been minimized.

Show comment
Hide comment
@john-danson

john-danson commented Oct 25, 2017

Thanks :>)

@alexshi0000

This comment has been minimized.

Show comment
Hide comment
@alexshi0000

alexshi0000 commented Feb 6, 2018

woa nice

@Aswathym24

This comment has been minimized.

Show comment
Hide comment
@Aswathym24

Aswathym24 Jun 26, 2018

Thanks, it's working for N=100000, but anything larger (#define N 1000000 or more) results in 0s as output. Why so?

Aswathym24 commented Jun 26, 2018

Thanks, it's working for N=100000, but anything larger (#define N 1000000 or more) results in 0s as output. Why so?

@dpiponi

This comment has been minimized.

Show comment
Hide comment
@dpiponi

dpiponi Jul 24, 2018

I'm surprised it works for N=100000.
If you look here https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications you'll see various limits on the sizes of CUDA structures.
In particular the "Maximum x- or y-dimension of a block" bit.
I chose 1000 so it's less than the 1024 limit.
If you want to work with bigger arrays you'll need to work with multiple blocks and/or higher dimensional blocks.
But then the demo wouldn't be minimal any more :-)

Owner

dpiponi commented Jul 24, 2018

I'm surprised it works for N=100000.
If you look here https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications you'll see various limits on the sizes of CUDA structures.
In particular the "Maximum x- or y-dimension of a block" bit.
I chose 1000 so it's less than the 1024 limit.
If you want to work with bigger arrays you'll need to work with multiple blocks and/or higher dimensional blocks.
But then the demo wouldn't be minimal any more :-)

@austinRichards3

This comment has been minimized.

Show comment
Hide comment
@austinRichards3

austinRichards3 Aug 28, 2018

isn't the syntax for launching a kernel <<<blocks_per_grid, threads_per_block>>>

austinRichards3 commented Aug 28, 2018

isn't the syntax for launching a kernel <<<blocks_per_grid, threads_per_block>>>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment