// Atomic vs. non-atomic increment timing demo (CUDA)
// Gist by @ilyakava, created July 30, 2015
#include <stdio.h>
#include <stdlib.h>
#define BLOCK_WIDTH 1000 // threads launched per block
void print_array(int *array, int size)
{
    printf("{ ");
    for (int i = 0; i < size; i++) { printf("%d ", array[i]); }
    printf("}\n");
}
__global__ void increment_naive(int *g, int array_size)
{
    // which thread is this?
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // each thread increments one element, wrapping at array_size
    i = i % array_size;
    // non-atomic read-modify-write: threads that map to the same i can read
    // the same old value, so increments are lost when they collide
    g[i] = g[i] + 1;
}
__global__ void increment_atomic(int *g, int array_size)
{
    // which thread is this?
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // each thread increments one element, wrapping at array_size
    i = i % array_size;
    // atomicAdd serializes the read-modify-write, so no increments are lost
    atomicAdd(&g[i], 1);
}
int main(int argc, char **argv)
{
    // run me with: ./time 10000000 100 1
    // discussion: https://discussions.udacity.com/t/atomic-vs-non-atomic/18205?u=edg
    // GpuTimer timer;
    if (argc < 4) {
        fprintf(stderr, "usage: %s NUM_THREADS ARRAY_SIZE RUN_ATOMIC\n", argv[0]);
        return 1;
    }
    int NUM_THREADS = atoi(argv[1]);
    int ARRAY_SIZE = atoi(argv[2]);
    int runAtomic = atoi(argv[3]);
    printf("mode %d (atomic if 1). %d total threads in %d blocks writing into %d array elements\n",
           runAtomic, NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
    // declare and allocate host memory
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
    int *h_array = (int *) malloc(ARRAY_BYTES);
    // declare, allocate, and zero out GPU memory
    int *d_array;
    cudaMalloc((void **) &d_array, ARRAY_BYTES);
    cudaMemset((void *) d_array, 0, ARRAY_BYTES);
    // launch the kernel: increment_naive measures the speed and accuracy of
    // non-atomic increments, increment_atomic of atomic increments
    // timer.Start();
    if (runAtomic) {
        increment_atomic<<<NUM_THREADS / BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array, ARRAY_SIZE);
    } else {
        increment_naive<<<NUM_THREADS / BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array, ARRAY_SIZE);
    }
    // timer.Stop();
    // copy back the array of sums from GPU and print
    cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    // print_array(h_array, ARRAY_SIZE);
    // printf("Time elapsed = %g ms\n", timer.Elapsed());
    // free GPU and host memory allocations and exit
    cudaFree(d_array);
    free(h_array);
    return 0;
}
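
// Build and run (a sketch, not part of the original gist; the file name
// atomics.cu is an assumption, the binary name "time" matches the comment in
// main):
//   nvcc -O2 -o time atomics.cu
//   ./time 10000000 100 0   # naive: final counts usually fall short of NUM_THREADS / ARRAY_SIZE
//   ./time 10000000 100 1   # atomic: every increment lands
//
// The commented-out GpuTimer presumably comes from the Udacity course
// utilities (gputimer.h), which is not included in this gist. A minimal
// cudaEvent-based equivalent, compatible with the commented-out calls
// (timer.Start(), timer.Stop(), timer.Elapsed()), is sketched below; to use
// it, define it above main or drop it into a gputimer.h and #include that.
struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()  { cudaEventCreate(&start); cudaEventCreate(&stop); }
    ~GpuTimer() { cudaEventDestroy(start); cudaEventDestroy(stop); }

    void Start() { cudaEventRecord(start, 0); }
    void Stop()  { cudaEventRecord(stop, 0); }

    // elapsed time between Start() and Stop() in milliseconds
    float Elapsed()
    {
        float elapsed_ms = 0.0f;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed_ms, start, stop);
        return elapsed_ms;
    }
};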