puzzle10_dotproduct floating point precision error
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include "common.h" // project helpers used below: ceil_div, cudaCheck, benchmark_kernel
/*
Implement a kernel that computes the dot product of a and b and stores it in out.
You have 1 thread per position.
You only need 2 global reads and 1 global write per thread.
Note: for this problem you don't need to worry about the number of shared reads.
We will handle that challenge later.
*/
// fastest
#define BLOCK_SIZE 1024
#define COARSE_FACTOR 16
// for n = 20480000 this kernel fails because of a floating-point precision
// issue: once the accumulator reaches 2^24 = 16777216, adding 1.0f no longer
// changes it (see the demonstration below)
__global__ void dotproduct(float *a, float *b, int n, float *out) {
    assert(blockDim.x == BLOCK_SIZE);
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x < n) {
        float val = a[x] * b[x];
        atomicAdd(&out[0], val);
    }
}
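
// Why the atomicAdd version gets stuck at 16777216: a float carries a 24-bit
// significand, so once the running sum reaches 2^24 = 16777216, adding 1.0f
// rounds back to the same value. A minimal host-side sketch of the failure
// mode (illustrative helper, not part of the original gist):
static float float_saturation_demo(int n) {
    float s = 0.0f;
    for (int i = 0; i < n; i++) {
        s += 1.0f; // becomes a no-op once s == 16777216.0f
    }
    return s; // for n = 20480000 this returns 16777216.0f, not 20480000.0f
}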
// store the elementwise products; a separate reduction pass sums them
__global__ void dotproduct1(float *a, float *b, int n, float *out) {
    assert(blockDim.x == BLOCK_SIZE);
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x < n) {
        out[x] = a[x] * b[x];
    }
}
__global__ void reduction(float *input, int n) {
    assert(blockDim.x == BLOCK_SIZE);
    // each block reduces COARSE_FACTOR * BLOCK_SIZE consecutive elements, so
    // the base index must step by blockDim.x * COARSE_FACTOR per block;
    // without that factor, neighboring blocks would re-read the same elements
    int x = blockIdx.x * blockDim.x * COARSE_FACTOR + threadIdx.x;
    __shared__ float s[BLOCK_SIZE];
    float val = 0;
    // thread coarsening: each thread accumulates up to COARSE_FACTOR elements
    for (int i = 0; i < COARSE_FACTOR && x + i * blockDim.x < n; i++) {
        val += input[x + i * blockDim.x];
    }
    s[threadIdx.x] = val;
    __syncthreads();
    // interleaved tree reduction in shared memory
    for (int i = 1; i < blockDim.x; i *= 2) {
        if (threadIdx.x % (2 * i) == 0) {
            s[threadIdx.x] += s[threadIdx.x + i];
        }
        __syncthreads();
    }
    // thread 0 writes the block's partial sum back in place
    if (threadIdx.x == 0) {
        input[blockIdx.x] = s[0];
    }
}
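
// The modulo-based tree reduction above leaves most lanes of a warp idle while
// the warp itself stays scheduled. A common alternative (a sketch of the
// standard "sequential addressing" pattern, not the gist author's code) keeps
// the active threads packed into the low-numbered lanes:
__global__ void reduction_sequential(float *input, int n) {
    assert(blockDim.x == BLOCK_SIZE);
    int x = blockIdx.x * blockDim.x * COARSE_FACTOR + threadIdx.x;
    __shared__ float s[BLOCK_SIZE];
    float val = 0;
    for (int i = 0; i < COARSE_FACTOR && x + i * blockDim.x < n; i++) {
        val += input[x + i * blockDim.x];
    }
    s[threadIdx.x] = val;
    __syncthreads();
    // halve the stride each step; only threads below the stride stay active
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (threadIdx.x < stride) {
            s[threadIdx.x] += s[threadIdx.x + stride];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        input[blockIdx.x] = s[0];
    }
}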
void __puzzle_part_2(float *input, int n) {
    int block_size = BLOCK_SIZE;
    // each launch collapses COARSE_FACTOR * block_size inputs into one partial
    // sum per block; taking ceil_div over the full product (rather than the
    // truncating n / COARSE_FACTOR) avoids dropping a tail for small n
    int grid_size = ceil_div(n, COARSE_FACTOR * block_size);
    while (grid_size > 0) {
        reduction<<<grid_size, block_size>>>(input, n);
        if (grid_size == 1) {
            break;
        }
        n = grid_size;
        grid_size = ceil_div(n, COARSE_FACTOR * block_size);
    }
}
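
// Worked launch arithmetic for n = 20480000: each pass collapses
// BLOCK_SIZE * COARSE_FACTOR = 1024 * 16 = 16384 elements into one partial sum.
//   pass 1: grid_size = ceil(20480000 / 16384) = 1250 -> 1250 partial sums
//   pass 2: grid_size = ceil(1250 / 16384)     = 1    -> final sum in input[0]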
float puzzle10_cpu(float *a, float *b, int n) {
    // accumulate in double so the reference does not hit the same
    // float-precision ceiling as the GPU atomicAdd version
    double out = 0;
    for (int i = 0; i < n; i++) {
        out += a[i] * b[i];
    }
    return (float)out;
}
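
// The CPU reference avoids the precision ceiling by accumulating in double.
// A variant that stays entirely in float is Kahan (compensated) summation:
// a sketch for comparison, with a hypothetical name, not part of the original gist.
float puzzle10_cpu_kahan(float *a, float *b, int n) {
    float sum = 0.0f;
    float c = 0.0f;            // running compensation for lost low-order bits
    for (int i = 0; i < n; i++) {
        float y = a[i] * b[i] - c;
        float t = sum + y;     // low-order bits of y may be lost here...
        c = (t - sum) - y;     // ...and are recovered into the compensation
        sum = t;
    }
    return sum;
}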
void _puzzle10(float *a, float *b, int n, float *output) {
    int block_size = BLOCK_SIZE;
    int grid_size = ceil_div(n, block_size);
    // dotproduct<<<grid_size, block_size>>>(a, b, n, output); // atomicAdd version: wrong past 2^24
    dotproduct1<<<grid_size, block_size>>>(a, b, n, output);
    __puzzle_part_2(output, n);
}
void puzzle10() {
    float *a, *b, *output;
    int n = 20480000;
    a = (float *)malloc(n * sizeof(float));
    b = (float *)malloc(n * sizeof(float));
    output = (float *)malloc(n * sizeof(float));
    for (int i = 0; i < n; i++) {
        a[i] = 1;
        b[i] = 1;
        output[i] = 0;
    }

    float *d_a, *d_b, *d_output;
    cudaCheck(cudaMalloc(&d_a, n * sizeof(float)));
    cudaCheck(cudaMalloc(&d_b, n * sizeof(float)));
    cudaCheck(cudaMalloc(&d_output, n * sizeof(float)));
    cudaCheck(cudaMemcpy(d_a, a, n * sizeof(float), cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_b, b, n * sizeof(float), cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_output, output, n * sizeof(float), cudaMemcpyHostToDevice));

    int repeat_times = 1;
    float elapsed_time = benchmark_kernel(repeat_times, _puzzle10, d_a, d_b, n, d_output);
    printf("time gpu %.8f ms\n", elapsed_time);

    // the reduction leaves the final sum in element 0, so one float suffices
    cudaCheck(cudaMemcpy(output, d_output, sizeof(float), cudaMemcpyDeviceToHost));

    float output_cpu = puzzle10_cpu(a, b, n);
    if (output_cpu != *output) {
        printf("Error: %f != %f\n", output_cpu, *output);
    } else {
        printf("Output is correct %f = %f\n", output_cpu, *output);
    }

    cudaCheck(cudaFree(d_a));
    cudaCheck(cudaFree(d_b));
    cudaCheck(cudaFree(d_output));
    free(a);
    free(b);
    free(output);
}
// with the atomicAdd dotproduct kernel:      Error: 20480000.000000 != 16777216.000000
// with dotproduct1 + reduction kernels:      Output is correct 20480000.000000 = 20480000.000000