// Atomic vs. non-atomic increment timing demo (CUDA)
// Gist by @ilyakava, created July 30, 2015
#include <stdio.h>
#include <stdlib.h>
#define BLOCK_WIDTH 1000 // threads launched per block
void print_array(int *array, int size)
{
    printf("{ ");
    for (int i = 0; i < size; i++) { printf("%d ", array[i]); }
    printf("}\n");
}
__global__ void increment_naive(int *g, int array_size)
{
    // which thread is this?
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // each thread increments one element, wrapping at array_size
    i = i % array_size;
    // non-atomic read-modify-write: threads that map to the same i can read
    // the same old value, so increments are lost when they collide
    g[i] = g[i] + 1;
}
__global__ void increment_atomic(int *g, int array_size)
{
    // which thread is this?
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // each thread increments one element, wrapping at array_size
    i = i % array_size;
    // atomicAdd serializes the read-modify-write, so no increments are lost
    atomicAdd(&g[i], 1);
}
int main(int argc, char **argv)
{
    // run me with: ./time 10000000 100 1
    // discussion: https://discussions.udacity.com/t/atomic-vs-non-atomic/18205?u=edg
    // GpuTimer timer;
    if (argc < 4) {
        fprintf(stderr, "usage: %s NUM_THREADS ARRAY_SIZE RUN_ATOMIC\n", argv[0]);
        return 1;
    }
    int NUM_THREADS = atoi(argv[1]);
    int ARRAY_SIZE = atoi(argv[2]);
    int runAtomic = atoi(argv[3]);
    printf("mode %d (atomic if 1). %d total threads in %d blocks writing into %d array elements\n",
           runAtomic, NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
    // declare and allocate host memory
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
    int *h_array = (int *) malloc(ARRAY_BYTES);
    // declare, allocate, and zero out GPU memory
    int *d_array;
    cudaMalloc((void **) &d_array, ARRAY_BYTES);
    cudaMemset((void *) d_array, 0, ARRAY_BYTES);
    // launch the kernel: increment_naive measures the speed and accuracy of
    // non-atomic increments, increment_atomic of atomic increments
    // timer.Start();
    if (runAtomic) {
        increment_atomic<<<NUM_THREADS / BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array, ARRAY_SIZE);
    } else {
        increment_naive<<<NUM_THREADS / BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array, ARRAY_SIZE);
    }
    // timer.Stop();
    // copy back the array of sums from GPU and print
    cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    // print_array(h_array, ARRAY_SIZE);
    // printf("Time elapsed = %g ms\n", timer.Elapsed());
    // free GPU and host memory allocations and exit
    cudaFree(d_array);
    free(h_array);
    return 0;
}
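
// Build and run (a sketch, not part of the original gist; the file name
// atomics.cu is an assumption, the binary name "time" matches the comment in
// main):
//   nvcc -O2 -o time atomics.cu
//   ./time 10000000 100 0   # naive: final counts usually fall short of NUM_THREADS / ARRAY_SIZE
//   ./time 10000000 100 1   # atomic: every increment lands
//
// The commented-out GpuTimer presumably comes from the Udacity course
// utilities (gputimer.h), which is not included in this gist. A minimal
// cudaEvent-based equivalent, compatible with the commented-out calls
// (timer.Start(), timer.Stop(), timer.Elapsed()), is sketched below; to use
// it, define it above main or drop it into a gputimer.h and #include that.
struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()  { cudaEventCreate(&start); cudaEventCreate(&stop); }
    ~GpuTimer() { cudaEventDestroy(start); cudaEventDestroy(stop); }

    void Start() { cudaEventRecord(start, 0); }
    void Stop()  { cudaEventRecord(stop, 0); }

    // elapsed time between Start() and Stop() in milliseconds
    float Elapsed()
    {
        float elapsed_ms = 0.0f;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed_ms, start, stop);
        return elapsed_ms;
    }
};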