Skip to content

Instantly share code, notes, and snippets.

@dyigitpolat
Created June 13, 2017 09:57
Show Gist options
  • Save dyigitpolat/22e6a992773ea277c267c9f991dcde56 to your computer and use it in GitHub Desktop.
Save dyigitpolat/22e6a992773ea277c267c9f991dcde56 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// CUDA kernel. Each thread takes care of one element of c
// Element-wise product kernel: c[i] = a[i] * b[i], one thread per element.
// Expects a 1-D launch whose total thread count is at least n.
__global__ void elementWiseMult(double *a, double *b, double *c, int n)
{
    // Flat global index into the 1-D grid.
    int gid = blockDim.x * blockIdx.x + threadIdx.x;

    // Tail guard: the grid may supply more threads than elements.
    if (gid < n) {
        c[gid] = a[gid] * b[gid];
    }
}
// One halving step of a tree reduction. Precondition: c[0 .. 2n-1] holds
// valid partial sums. Each of the first n threads folds the upper half into
// the lower half, leaving the result in c[0 .. n-1]. Expects a 1-D launch
// with at least n threads; repeated host-side launches (n, n/2, ..., 1)
// accumulate the full sum into c[0].
__global__ void reduceAdd(double *c, int n)
{
    // Flat global index into the 1-D grid.
    int gid = blockDim.x * blockIdx.x + threadIdx.x;

    // Only the first n threads participate in this pass.
    if (gid < n) {
        c[gid] += c[gid + n];
    }
}
/*
 * Checks every CUDA API call and kernel launch; prints the failing call's
 * location and aborts on error (errors are otherwise silent and sticky).
 */
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                 \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/*
 * Computes the dot product of two length-1048576 vectors of ones on the GPU:
 * element-wise multiply into device_c, then a tree reduction that halves the
 * active range each pass until the sum sits in device_c[0]. Requires length
 * to be a power of two. Prints basic properties of every visible device,
 * then the result (expected: 1048576.0 for all-ones inputs).
 */
int main(int argc, char** argv)
{
    /* Report each device and its theoretical peak memory bandwidth. */
    int nDevices;
    CUDA_CHECK(cudaGetDeviceCount(&nDevices));
    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
        printf("Device Number: %d\n", i);
        printf(" Device name: %s\n", prop.name);
        printf(" Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
        printf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
        /* 2x for DDR; bus width in bits -> bytes; KHz -> GB/s via 1e6. */
        printf(" Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
    }

    const int length = 1048576;                 /* must be a power of two */
    const size_t vec_size = length * sizeof(double);

    double* vec_a = (double*) malloc(vec_size);
    double* vec_b = (double*) malloc(vec_size);
    if (vec_a == NULL || vec_b == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", vec_size);
        return EXIT_FAILURE;
    }

    /* All-ones inputs: the dot product should equal length. */
    for (int i = 0; i < length; i++) {
        vec_a[i] = 1;
        vec_b[i] = 1;
    }

    double* device_a;
    double* device_b;
    double* device_c;
    CUDA_CHECK(cudaMalloc(&device_a, vec_size));
    CUDA_CHECK(cudaMalloc(&device_b, vec_size));
    CUDA_CHECK(cudaMalloc(&device_c, vec_size));

    CUDA_CHECK(cudaMemcpy(device_a, vec_a, vec_size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(device_b, vec_b, vec_size, cudaMemcpyHostToDevice));

    const int block_size = 1024;
    const int grid_size = length / block_size;  /* exact: length is 2^20 */

    /* c[i] = a[i] * b[i] for every element. */
    elementWiseMult<<<grid_size, block_size>>>(device_a, device_b, device_c, length);
    CUDA_CHECK(cudaGetLastError());

    /*
     * Tree reduction: each pass folds c[n .. 2n-1] into c[0 .. n-1].
     * Same-stream launches serialize, so no explicit sync is needed
     * between passes. Grid is resized (ceil-div, minimum 1 block) each
     * pass instead of relaunching the full initial grid every time.
     */
    for (int n = length >> 1; n > 0; n >>= 1) {
        int blocks = (n + block_size - 1) / block_size;
        reduceAdd<<<blocks, block_size>>>(device_c, n);
        CUDA_CHECK(cudaGetLastError());
    }

    /* Blocking copy of the final sum; also synchronizes with the kernels. */
    double a_dot_b;
    CUDA_CHECK(cudaMemcpy(&a_dot_b, device_c, sizeof(double), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(device_a));
    CUDA_CHECK(cudaFree(device_b));
    CUDA_CHECK(cudaFree(device_c));
    free(vec_a);
    free(vec_b);

    printf("Dot product = %f\n", a_dot_b);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment