Skip to content

Instantly share code, notes, and snippets.

@ebraminio
Last active December 19, 2016 17:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ebraminio/b1ac55596aa6e91752889e50c25ec355 to your computer and use it in GitHub Desktop.
Save ebraminio/b1ac55596aa6e91752889e50c25ec355 to your computer and use it in GitHub Desktop.
All Sums
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define __CUDACC__
#include "device_functions.h"
#include <stdio.h>
#include <stdlib.h>
void print(char*, int[], int);
/*
 * Replaces the first `size` elements of A with their inclusive prefix sums
 * (A[k] becomes A[0] + A[1] + ... + A[k]), computed in shared memory.
 *
 * Launch requirements:
 *   - a single block: indexing uses threadIdx.x only, so extra blocks would
 *     all operate on the same elements;
 *   - blockDim.x >= size so that every element has a thread (threads with
 *     threadIdx.x >= size do no memory accesses but still reach every barrier);
 *   - at least size * sizeof(int) bytes of dynamic shared memory.
 *
 * A    : device pointer to at least `size` ints, updated in place.
 * size : number of elements to scan.
 */
__global__
void allSums(int* A, int size) {
    extern __shared__ int localA[];

    const int tid = (int)threadIdx.x;
    const bool inRange = tid < size;

    if (inRange)
        localA[tid] = A[tid];
    // Every element must be staged before any thread reads a neighbour's slot.
    __syncthreads();

    for (int i = 1; i < size; i <<= 1) {
        // Read phase: fetch the partial sum located i slots to the left into a
        // register while nobody is writing to shared memory.
        int addend = 0;
        if (inRange && tid >= i)
            addend = localA[tid - i];
        // All reads of this step finish before any slot is overwritten.
        __syncthreads();

        // Write phase: each thread updates only its own slot.
        if (inRange)
            localA[tid] += addend;
        // All writes of this step are visible before the next step's reads.
        __syncthreads();
    }

    if (inRange)
        A[tid] = localA[tid];
}
/* Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess. */
static bool cudaSucceeded(cudaError_t status, const char* what) {
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Demo driver: prints a small array, runs allSums on it with one block of
 * `size` threads, and prints the array copied back from the device.
 * Returns 0 on success and EXIT_FAILURE if any CUDA call fails.
 */
int main() {
    int arr[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    int numBytes = sizeof arr;
    int size = numBytes / sizeof(int);

    // print() is declared with a non-const char* parameter; writable arrays
    // avoid the string-literal-to-char* conversion that C++11 no longer permits.
    char inputLabel[] = "arr: ";
    char resultLabel[] = "\nResult of GPU processing (inclusive prefix sums): ";

    print(inputLabel, arr, size);

    int *arrGPU = NULL;
    if (!cudaSucceeded(cudaMalloc((void**)&arrGPU, numBytes), "cudaMalloc"))
        return EXIT_FAILURE;

    bool ok = cudaSucceeded(
        cudaMemcpy(arrGPU, arr, numBytes, cudaMemcpyHostToDevice),
        "cudaMemcpy (host to device)");

    if (ok) {
        int numBlocks = 1;
        dim3 threadsPerBlock(size, 1);
        // Third launch argument: one int of dynamic shared memory per element.
        allSums<<<numBlocks, threadsPerBlock, numBytes>>>(arrGPU, size);

        // Launch-configuration errors show up in cudaGetLastError(); faults
        // during execution show up at the synchronizing call.
        ok = cudaSucceeded(cudaGetLastError(), "allSums launch")
          && cudaSucceeded(cudaDeviceSynchronize(), "allSums execution")
          && cudaSucceeded(
                 cudaMemcpy(arr, arrGPU, numBytes, cudaMemcpyDeviceToHost),
                 "cudaMemcpy (device to host)");
    }

    // Release the device buffer on both the success and the failure path.
    ok = cudaSucceeded(cudaFree(arrGPU), "cudaFree") && ok;
    if (!ok)
        return EXIT_FAILURE;

    print(resultLabel, arr, size);
    getchar();  // wait for a key press before exiting
    return 0;
}
/*
 * Writes `str` to stdout followed by the first `n` elements of `arr` as a
 * brace-enclosed, comma-separated list and a trailing newline,
 * e.g. "label{ 1, 2, 3 }".
 */
void print(char* str, int arr[], int n) {
    const int last = n - 1;
    int idx = 0;

    printf("%s{ ", str);
    while (idx < n) {
        // Every element except the final one is followed by a separator.
        const char* separator = ", ";
        if (idx == last)
            separator = "";
        printf("%d%s", arr[idx], separator);
        ++idx;
    }
    printf(" }\n");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment