Skip to content

Instantly share code, notes, and snippets.

Last active May 5, 2020 10:51
Show Gist options
  • Save oak-tree/f270a6cc5666de61ad81d84c3253d6eb to your computer and use it in GitHub Desktop.
Save oak-tree/f270a6cc5666de61ad81d84c3253d6eb to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <assert.h>
/*
 * Reports a failed CUDA runtime call.
 *
 * result: status code returned by a CUDA runtime API call.
 *
 * On any status other than cudaSuccess the error string is written to stderr
 * and an assertion fires. The assertion is compiled out when NDEBUG is
 * defined, so in that configuration only the message is printed.
 *
 * Returns `result` unchanged so the call can be wrapped inline and the caller
 * can still inspect the status.
 */
inline cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
  return result;
}
/*
 * Reports the error state recorded by the CUDA runtime, if any.
 *
 * Fetches the status with cudaGetLastError(); kernel launches do not return a
 * status themselves, so this is meant to be called right after a launch.
 *
 * On any status other than cudaSuccess the error string is written to stderr
 * and an assertion fires (compiled out when NDEBUG is defined).
 *
 * Returns the status that was fetched.
 */
inline cudaError_t checkLastCuda()
{
  cudaError_t result = cudaGetLastError();
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
  return result;
}
#include <stdio.h>
#include <assert.h>
/*
 * Reports a failed CUDA runtime call.
 *
 * result: status code returned by a CUDA runtime API call.
 *
 * On any status other than cudaSuccess the error string is written to stderr
 * and an assertion fires. The assertion is compiled out when NDEBUG is
 * defined, so in that configuration only the message is printed.
 *
 * Returns `result` unchanged so the call can be wrapped inline and the caller
 * can still inspect the status.
 */
inline cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
  return result;
}
/*
 * Reports the error state recorded by the CUDA runtime, if any.
 *
 * Fetches the status with cudaGetLastError(); kernel launches do not return a
 * status themselves, so this is meant to be called right after a launch.
 *
 * On any status other than cudaSuccess the error string is written to stderr
 * and an assertion fires (compiled out when NDEBUG is defined).
 *
 * Returns the status that was fetched.
 */
inline cudaError_t checkLastCuda()
{
  cudaError_t result = cudaGetLastError();
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
  return result;
}
/*
 * Fills the first N elements of `a` with `num` on the host.
 *
 * num: value to store.
 * a:   destination array; must be host-accessible and hold at least N floats.
 * N:   number of elements to write; nothing is written when N <= 0.
 */
void initWith(float num, float *a, int N)
{
  for(int i = 0; i < N; ++i)
  {
    a[i] = num;
  }
}
/*
 * Element-wise vector addition: result[i] = a[i] + b[i] for 0 <= i < N.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
 * and advances by the total number of launched threads, so any grid/block
 * size covers all N elements. No shared memory is used.
 *
 * result, a, b: device-accessible arrays of at least N floats.
 * N:            number of elements to process.
 */
__global__ void addVectorsInto(float *result, float *a, float*b, int N){
  // Index math is done in size_t so that neither the start index nor the
  // repeated `+= gridStride` can overflow a 32-bit int when N is large.
  const size_t indexWithinTheGrid = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  const size_t gridStride = (size_t)gridDim.x * blockDim.x;
  const size_t count = N > 0 ? (size_t)N : 0;

  for (size_t i = indexWithinTheGrid; i < count; i += gridStride)
  {
    result[i] = a[i] + b[i];
  }
}
/*
 * Verifies on the host that the first N elements of `array` equal `target`.
 *
 * target: expected value (compared with exact float equality).
 * array:  host-accessible array of at least N floats.
 * N:      number of elements to inspect.
 *
 * Prints a FAIL line for the first mismatching element and stops there;
 * the success message is printed only when every element matched.
 */
void checkElementsAre(float target, float *array, int N)
{
  for(int i = 0; i < N; i++)
  {
    if(array[i] != target)
    {
      printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
      // Stop here: falling through would report success after a failure.
      return;
    }
  }
  printf("SUCCESS! All values added correctly.\n");
}
/*
 * Adds two vectors of 2<<20 floats on the GPU using managed memory and
 * verifies on the host that every element of the result equals 7.
 *
 * Returns 0 when all CUDA calls succeeded, 1 otherwise.
 */
int main()
{
  const int N = 2<<20;
  size_t size = N * sizeof(float);

  // Start from NULL so the cleanup path can free unconditionally
  // (cudaFree on a null pointer performs no operation).
  float *a = NULL;
  float *b = NULL;
  float *c = NULL;

  if (checkCuda(cudaMallocManaged(&a, size)) != cudaSuccess ||
      checkCuda(cudaMallocManaged(&b, size)) != cudaSuccess ||
      checkCuda(cudaMallocManaged(&c, size)) != cudaSuccess)
  {
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 1;
  }

  initWith(3, a, N);
  initWith(4, b, N);
  initWith(0, c, N);

  // A warp-multiple block size with enough blocks to cover N once; the
  // kernel's stride loop keeps any other configuration correct as well.
  const int threads = 256;
  const int blocks = (N + threads - 1) / threads;

  addVectorsInto<<<blocks,threads>>>(c, a, b, N);

  // Launches are asynchronous: pick up launch-configuration errors first,
  // then wait for the kernel to finish before the host reads `c`.
  cudaError_t launchStatus = checkLastCuda();
  cudaError_t syncStatus = checkCuda(cudaDeviceSynchronize());

  bool ok = (launchStatus == cudaSuccess) && (syncStatus == cudaSuccess);
  if (ok)
  {
    checkElementsAre(7, c, N);
  }

  checkCuda(cudaFree(a));
  checkCuda(cudaFree(b));
  checkCuda(cudaFree(c));

  return ok ? 0 : 1;
}
#include <stdio.h>
#include <assert.h>
/*
 * Reports a failed CUDA runtime call.
 *
 * result: status code returned by a CUDA runtime API call.
 *
 * On any status other than cudaSuccess the error string is written to stderr
 * and an assertion fires. The assertion is compiled out when NDEBUG is
 * defined, so in that configuration only the message is printed.
 *
 * Returns `result` unchanged so the call can be wrapped inline and the caller
 * can still inspect the status.
 */
inline cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
  return result;
}
/*
 * Reports the error state recorded by the CUDA runtime, if any.
 *
 * Fetches the status with cudaGetLastError(); kernel launches do not return a
 * status themselves, so this is meant to be called right after a launch.
 *
 * On any status other than cudaSuccess the error string is written to stderr
 * and an assertion fires (compiled out when NDEBUG is defined).
 *
 * Returns the status that was fetched.
 */
inline cudaError_t checkLastCuda()
{
  cudaError_t result = cudaGetLastError();
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
  return result;
}
#define N 64
/*
 * Computes c = a * b for row-major N x N integer matrices, one output
 * element per thread.
 *
 * Launch layout: 2D grid of 2D blocks. The global x index selects the row and
 * the global y index selects the column. Threads that fall outside the
 * N x N matrix return without touching memory, so the grid may overshoot N.
 * No shared memory is used.
 *
 * a, b: device-accessible input matrices of N * N ints.
 * c:    device-accessible output matrix of N * N ints.
 */
__global__ void matrixMulGPU( int * a, int * b, int * c)
{
  int row = threadIdx.x + blockIdx.x * blockDim.x;
  int col = threadIdx.y + blockIdx.y * blockDim.y;

  // Out-of-range threads must leave before any load or store.
  if (row >= N || col >= N)
  {
    return;
  }

  // NOTE(review): `row` follows threadIdx.x, so threads adjacent in x access
  // `a` and `c` N ints apart. Mapping x to `col` instead would make the
  // stores to `c` contiguous; the mapping is left as-is here.
  int val = 0;
  for ( int k = 0; k < N; ++k )
  {
    val += a[row * N + k] * b[k * N + col];
  }
  c[row * N + col] = val;
}
/*
 * CPU reference for the N x N integer matrix product c = a * b, with all
 * matrices stored row-major.
 *
 * This CPU function already works, and will run to create a solution matrix
 * against which to verify your work building out the matrixMulGPU kernel.
 * (The exercise note "Build out this kernel" refers to matrixMulGPU.)
 *
 * a, b: input matrices of N * N ints.
 * c:    output matrix of N * N ints; every element is overwritten.
 */
void matrixMulCPU( int * a, int * b, int * c )
{
  for( int row = 0; row < N; ++row )
  {
    for( int col = 0; col < N; ++col )
    {
      // Dot product of row `row` of a with column `col` of b.
      int val = 0;
      for ( int k = 0; k < N; ++k )
      {
        val += a[row * N + k] * b[k * N + col];
      }
      c[row * N + col] = val;
    }
  }
}
/*
 * Multiplies two N x N integer matrices on the GPU and on the CPU using
 * managed memory, then compares the two results element by element.
 *
 * Returns 0 when all CUDA calls succeeded, 1 otherwise. A result mismatch is
 * reported on stdout and does not change the exit status.
 */
int main()
{
  // Allocate a solution matrix for both the CPU and the GPU operations.
  // Pointers start at NULL so cleanup can free unconditionally
  // (cudaFree on a null pointer performs no operation).
  int *a = NULL, *b = NULL, *c_cpu = NULL, *c_gpu = NULL;

  // Number of bytes of an N x N matrix
  const size_t size = (size_t)N * N * sizeof (int);

  // Allocate memory
  if (checkCuda(cudaMallocManaged (&a, size)) != cudaSuccess ||
      checkCuda(cudaMallocManaged (&b, size)) != cudaSuccess ||
      checkCuda(cudaMallocManaged (&c_cpu, size)) != cudaSuccess ||
      checkCuda(cudaMallocManaged (&c_gpu, size)) != cudaSuccess)
  {
    cudaFree(a); cudaFree(b);
    cudaFree( c_cpu ); cudaFree( c_gpu );
    return 1;
  }

  // Initialize memory; create 2D matrices
  for( int row = 0; row < N; ++row )
  {
    for( int col = 0; col < N; ++col )
    {
      a[row*N + col] = row;
      b[row*N + col] = col+2;
      c_cpu[row*N + col] = 0;
      c_gpu[row*N + col] = 0;
    }
  }

  /*
   * Assign `threads_per_block` and `number_of_blocks` 2D values
   * that can be used in matrixMulGPU above.
   * The block count is rounded up so the grid still covers the matrix when
   * N is not a multiple of the block dimensions.
   */
  dim3 threads_per_block(16,16,1);
  dim3 number_of_blocks((N + threads_per_block.x - 1)/threads_per_block.x,
                        (N + threads_per_block.y - 1)/threads_per_block.y,
                        1);

  matrixMulGPU <<< number_of_blocks, threads_per_block >>> ( a, b, c_gpu );

  // Launches are asynchronous: pick up launch-configuration errors, then
  // wait for the kernel before the host touches any managed allocation.
  cudaError_t launchStatus = checkLastCuda();
  cudaError_t syncStatus = checkCuda(cudaDeviceSynchronize());
  bool ok = (launchStatus == cudaSuccess) && (syncStatus == cudaSuccess);

  if (ok)
  {
    // Call the CPU version to check our work
    matrixMulCPU( a, b, c_cpu );

    // Compare the two answers to make sure they are equal
    bool error = false;
    for( int row = 0; row < N && !error; ++row )
    {
      for( int col = 0; col < N && !error; ++col )
      {
        if (c_cpu[row * N + col] != c_gpu[row * N + col])
        {
          printf("FOUND ERROR at c[%d][%d]\n", row, col);
          error = true;
        }
      }
    }
    if (!error)
    {
      printf("Success!\n");
    }
  }

  // Free all our allocated memory
  checkCuda(cudaFree(a)); checkCuda(cudaFree(b));
  checkCuda(cudaFree( c_cpu )); checkCuda(cudaFree( c_gpu ));

  return ok ? 0 : 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment