puzzle10_dotproduct floating point precision error
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include "common.h" // project helpers used below: ceil_div, cudaCheck, benchmark_kernel
/*
Implement a kernel that computes the dot product of a and b and stores it in out.
You have 1 thread per position.
You only need 2 global reads and 1 global write per thread.
Note: for this problem you don't need to worry about the number of shared reads.
We will handle that challenge later.
*/
// fastest
#define BLOCK_SIZE 1024
#define COARSE_FACTOR 16
// for n = 20480000 this kernel fails because of a floating-point precision
// issue: once the accumulator reaches 2^24 = 16777216, adding 1.0f no longer
// changes it (see the demonstration below)
__global__ void dotproduct(float *a, float *b, int n, float *out) {
    assert(blockDim.x == BLOCK_SIZE);
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x < n) {
        float val = a[x] * b[x];
        atomicAdd(&out[0], val);
    }
}
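
// Why the atomicAdd version gets stuck at 16777216: a float carries a 24-bit
// significand, so once the running sum reaches 2^24 = 16777216, adding 1.0f
// rounds back to the same value. A minimal host-side sketch of the failure
// mode (illustrative helper, not part of the original gist):
static float float_saturation_demo(int n) {
    float s = 0.0f;
    for (int i = 0; i < n; i++) {
        s += 1.0f; // becomes a no-op once s == 16777216.0f
    }
    return s; // for n = 20480000 this returns 16777216.0f, not 20480000.0f
}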
// store the elementwise products; a separate reduction pass sums them
__global__ void dotproduct1(float *a, float *b, int n, float *out) {
    assert(blockDim.x == BLOCK_SIZE);
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x < n) {
        out[x] = a[x] * b[x];
    }
}
__global__ void reduction(float *input, int n) {
    assert(blockDim.x == BLOCK_SIZE);
    // each block reduces COARSE_FACTOR * BLOCK_SIZE consecutive elements, so
    // the base index must step by blockDim.x * COARSE_FACTOR per block;
    // without that factor, neighboring blocks would re-read the same elements
    int x = blockIdx.x * blockDim.x * COARSE_FACTOR + threadIdx.x;
    __shared__ float s[BLOCK_SIZE];
    float val = 0;
    // thread coarsening: each thread accumulates up to COARSE_FACTOR elements
    for (int i = 0; i < COARSE_FACTOR && x + i * blockDim.x < n; i++) {
        val += input[x + i * blockDim.x];
    }
    s[threadIdx.x] = val;
    __syncthreads();
    // interleaved tree reduction in shared memory
    for (int i = 1; i < blockDim.x; i *= 2) {
        if (threadIdx.x % (2 * i) == 0) {
            s[threadIdx.x] += s[threadIdx.x + i];
        }
        __syncthreads();
    }
    // thread 0 writes the block's partial sum back in place
    if (threadIdx.x == 0) {
        input[blockIdx.x] = s[0];
    }
}
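
// The modulo-based tree reduction above leaves most lanes of a warp idle while
// the warp itself stays scheduled. A common alternative (a sketch of the
// standard "sequential addressing" pattern, not the gist author's code) keeps
// the active threads packed into the low-numbered lanes:
__global__ void reduction_sequential(float *input, int n) {
    assert(blockDim.x == BLOCK_SIZE);
    int x = blockIdx.x * blockDim.x * COARSE_FACTOR + threadIdx.x;
    __shared__ float s[BLOCK_SIZE];
    float val = 0;
    for (int i = 0; i < COARSE_FACTOR && x + i * blockDim.x < n; i++) {
        val += input[x + i * blockDim.x];
    }
    s[threadIdx.x] = val;
    __syncthreads();
    // halve the stride each step; only threads below the stride stay active
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (threadIdx.x < stride) {
            s[threadIdx.x] += s[threadIdx.x + stride];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        input[blockIdx.x] = s[0];
    }
}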
void __puzzle_part_2(float *input, int n) {
    int block_size = BLOCK_SIZE;
    // each launch collapses COARSE_FACTOR * block_size inputs into one partial
    // sum per block; taking ceil_div over the full product (rather than the
    // truncating n / COARSE_FACTOR) avoids dropping a tail for small n
    int grid_size = ceil_div(n, COARSE_FACTOR * block_size);
    while (grid_size > 0) {
        reduction<<<grid_size, block_size>>>(input, n);
        if (grid_size == 1) {
            break;
        }
        n = grid_size;
        grid_size = ceil_div(n, COARSE_FACTOR * block_size);
    }
}
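
// Worked launch arithmetic for n = 20480000: each pass collapses
// BLOCK_SIZE * COARSE_FACTOR = 1024 * 16 = 16384 elements into one partial sum.
//   pass 1: grid_size = ceil(20480000 / 16384) = 1250 -> 1250 partial sums
//   pass 2: grid_size = ceil(1250 / 16384)     = 1    -> final sum in input[0]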
float puzzle10_cpu(float *a, float *b, int n) {
    // accumulate in double so the reference does not hit the same
    // float-precision ceiling as the GPU atomicAdd version
    double out = 0;
    for (int i = 0; i < n; i++) {
        out += a[i] * b[i];
    }
    return (float)out;
}
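
// The CPU reference avoids the precision ceiling by accumulating in double.
// A variant that stays entirely in float is Kahan (compensated) summation:
// a sketch for comparison, with a hypothetical name, not part of the original gist.
float puzzle10_cpu_kahan(float *a, float *b, int n) {
    float sum = 0.0f;
    float c = 0.0f;            // running compensation for lost low-order bits
    for (int i = 0; i < n; i++) {
        float y = a[i] * b[i] - c;
        float t = sum + y;     // low-order bits of y may be lost here...
        c = (t - sum) - y;     // ...and are recovered into the compensation
        sum = t;
    }
    return sum;
}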
void _puzzle10(float *a, float *b, int n, float *output) {
    int block_size = BLOCK_SIZE;
    int grid_size = ceil_div(n, block_size);
    // dotproduct<<<grid_size, block_size>>>(a, b, n, output); // atomicAdd version: wrong past 2^24
    dotproduct1<<<grid_size, block_size>>>(a, b, n, output);
    __puzzle_part_2(output, n);
}
void puzzle10() {
    float *a, *b, *output;
    int n = 20480000;
    a = (float *)malloc(n * sizeof(float));
    b = (float *)malloc(n * sizeof(float));
    output = (float *)malloc(n * sizeof(float));
    for (int i = 0; i < n; i++) {
        a[i] = 1;
        b[i] = 1;
        output[i] = 0;
    }

    float *d_a, *d_b, *d_output;
    cudaCheck(cudaMalloc(&d_a, n * sizeof(float)));
    cudaCheck(cudaMalloc(&d_b, n * sizeof(float)));
    cudaCheck(cudaMalloc(&d_output, n * sizeof(float)));
    cudaCheck(cudaMemcpy(d_a, a, n * sizeof(float), cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_b, b, n * sizeof(float), cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_output, output, n * sizeof(float), cudaMemcpyHostToDevice));

    int repeat_times = 1;
    float elapsed_time = benchmark_kernel(repeat_times, _puzzle10, d_a, d_b, n, d_output);
    printf("time gpu %.8f ms\n", elapsed_time);

    // the reduction leaves the final sum in element 0, so one float suffices
    cudaCheck(cudaMemcpy(output, d_output, sizeof(float), cudaMemcpyDeviceToHost));

    float output_cpu = puzzle10_cpu(a, b, n);
    if (output_cpu != *output) {
        printf("Error: %f != %f\n", output_cpu, *output);
    } else {
        printf("Output is correct %f = %f\n", output_cpu, *output);
    }

    cudaCheck(cudaFree(d_a));
    cudaCheck(cudaFree(d_b));
    cudaCheck(cudaFree(d_output));
    free(a);
    free(b);
    free(output);
}
// with the atomicAdd dotproduct kernel:      Error: 20480000.000000 != 16777216.000000
// with dotproduct1 + reduction kernels:      Output is correct 20480000.000000 = 20480000.000000