Created
June 13, 2017 09:57
-
-
Save dyigitpolat/22e6a992773ea277c267c9f991dcde56 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <math.h> | |
// CUDA kernel: element-wise product, c[i] = a[i] * b[i] for i in [0, n).
// Expects a 1-D grid of 1-D blocks covering at least n threads; surplus
// threads exit via the bounds check. a and b are never written, so they
// are marked const __restrict__ to let the compiler use the read-only
// data cache and reorder loads.
__global__ void elementWiseMult(const double* __restrict__ a,
                                const double* __restrict__ b,
                                double* __restrict__ c, int n)
{
    // Flat global thread index.
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    // Make sure we do not go out of bounds: the grid rarely divides n evenly.
    if (id < n)
        c[id] = a[id] * b[id];
}
// CUDA kernel: one halving step of a pairwise sum reduction.
// Each thread with index tid < n folds the upper half of the buffer into
// the lower half: c[tid] += c[tid + n]. Within a single launch, writers
// touch [0, n) and readers touch [n, 2n), so the ranges never overlap
// and no extra synchronization is required inside the step.
__global__ void reduceAdd(double *c, int n)
{
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= n)
        return;               // surplus threads past the active half do nothing
    c[tid] += c[tid + n];
}
// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Computes the dot product of two all-ones vectors on the GPU:
// element-wise multiply, then a log2(length) sequence of halving
// reductions that leaves the sum in c[0].
int main(int argc, char** argv)
{
    // Report every visible device and its theoretical peak bandwidth.
    int nDevices = 0;
    CUDA_CHECK(cudaGetDeviceCount(&nDevices));
    for (int d = 0; d < nDevices; d++) {
        cudaDeviceProp prop;
        CUDA_CHECK(cudaGetDeviceProperties(&prop, d));
        printf("Device Number: %d\n", d);
        printf("  Device name: %s\n", prop.name);
        printf("  Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
        printf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
    }

    // Problem size. Must be a power of two: the halving reduction below
    // only visits every element when length keeps dividing evenly by 2.
    int length = 1048576;
    size_t vec_size = (size_t)length * sizeof(double);

    double* vec_a = (double*) malloc(vec_size);
    double* vec_b = (double*) malloc(vec_size);
    if (vec_a == NULL || vec_b == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    // All-ones inputs, so the expected dot product equals `length`.
    for (int i = 0; i < length; i++) {
        vec_a[i] = 1;
        vec_b[i] = 1;
    }

    double* device_a;
    double* device_b;
    double* device_c;
    CUDA_CHECK(cudaMalloc(&device_a, vec_size));
    CUDA_CHECK(cudaMalloc(&device_b, vec_size));
    CUDA_CHECK(cudaMalloc(&device_c, vec_size));
    CUDA_CHECK(cudaMemcpy(device_a, vec_a, vec_size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(device_b, vec_b, vec_size, cudaMemcpyHostToDevice));

    int block_size = 1024;
    int grid_size = (length + block_size - 1) / block_size;  // ceil-div

    // Stage 1: c[i] = a[i] * b[i].
    elementWiseMult<<<grid_size, block_size>>>(device_a, device_b, device_c,
                                               length);
    CUDA_CHECK(cudaGetLastError());

    // Stage 2: repeatedly fold the upper half into the lower half until
    // the full sum sits in c[0]. Kernels queued on the default stream
    // serialize, so no explicit sync is needed between steps. The grid
    // is resized each step so idle blocks are not launched.
    while ((length >>= 1) > 0) {
        grid_size = (length + block_size - 1) / block_size;
        reduceAdd<<<grid_size, block_size>>>(device_c, length);
        CUDA_CHECK(cudaGetLastError());
    }

    // Blocking copy of the scalar result; this also synchronizes with
    // every kernel queued above.
    double a_dot_b = 0.0;
    CUDA_CHECK(cudaMemcpy(&a_dot_b, device_c, sizeof(double),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(device_a));
    CUDA_CHECK(cudaFree(device_b));
    CUDA_CHECK(cudaFree(device_c));
    free(vec_a);
    free(vec_b);

    printf("Dot product = %f\n", a_dot_b);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment