Created
December 3, 2018 17:01
-
-
Save Voltara/18e6c23df057a9f304d7b8103ba556b7 to your computer and use it in GitHub Desktop.
Advent of Code 2018 Day 3 Part 1, CUDA
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// nvcc -O3 day03.cu | |
#include <cstdio>
#include <cstdlib>
#include <vector>
struct input_t { int id, x0, y0, x1, y1; }; | |
// Block-wide sum reduction: adds in[0..1023] and stores the total in *out.
// Preconditions: blockDim.x == 1024, `in` points to shared memory, and every
// thread of the block calls this (it contains block-wide barriers).
//
// Fixes vs. the original:
//  - __syncthreads() was inside the divergent `if (tid < n)` branch, which is
//    undefined behavior (not all threads reach the barrier); it now sits
//    outside the branch so all 1024 threads hit it every iteration.
//  - The volatile warp-synchronous tail (tid < 32, no __syncwarp) assumed
//    implicit warp lockstep, which no longer holds under Volta+ independent
//    thread scheduling; the barriered tree now runs all the way down to n=1.
static __device__ void reduce(int *out, int *in) {
	int tid = threadIdx.x;
	__syncthreads();	// make every thread's element visible before summing
	for (int n = 512; n; n >>= 1) {
		if (tid < n) {
			in[tid] += in[tid + n];
		}
		__syncthreads();	// barrier reached by ALL threads, divergent or not
	}
	if (tid == 0) {
		*out = in[0];
	}
}
// Counts overlapped fabric pixels, one row per block.
// Launch layout: <<<1024, 1024>>> — blockIdx.x is the row (y), threadIdx.x
// the column (x). Each thread tallies how many of the n claims cover its
// pixel, then the block reduces the per-pixel "covered more than once"
// flags into out[blockIdx.x].
static __global__ void part1_kernel(int *out, input_t *in, size_t n) {
	__shared__ int overlap[1024];
	const int col = threadIdx.x;
	const int row = blockIdx.x;
	int claims = 0;
	for (size_t i = 0; i < n; i++) {
		const input_t *r = &in[i];
		// Branchless containment test against [x0,x1) x [y0,y1)
		claims += (col >= r->x0) & (col < r->x1) &
		          (row >= r->y0) & (row < r->y1);
	}
	overlap[col] = (claims > 1) ? 1 : 0;
	reduce(out + row, overlap);
}
// Final pass: sums the 1024 per-row counts in arr[0..1023] into arr[0].
// Launch layout: <<<1, 1024>>> (single block, one thread per element).
static __global__ void reduce_kernel(int *arr) {
	__shared__ int buf[1024];
	const int i = threadIdx.x;
	buf[i] = arr[i];	// stage in shared memory for the block reduction
	reduce(arr, buf);
}
// Abort with a diagnostic if a CUDA API call or kernel launch failed.
// The original checked nothing, so any failure silently printed garbage.
static void check(cudaError_t err, const char *what) {
	if (err != cudaSuccess) {
		fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
		exit(1);
	}
}

// Reads claims from stdin (lines like "#1 @ 1,3: 4x4"), then counts the
// fabric pixels covered by two or more claims on the GPU.
int main() {
	std::vector<input_t> v;
	int *g_out, out[1024];
	input_t *g_in;
	input_t tmp;
	while (scanf("#%d @ %d,%d: %dx%d\n", &tmp.id,
				&tmp.x0, &tmp.y0,
				&tmp.x1, &tmp.y1) == 5)
	{
		// Convert width/height into exclusive end coordinates
		tmp.x1 += tmp.x0;
		tmp.y1 += tmp.y0;
		v.push_back(tmp);
	}
	if (v.empty()) {
		// Guard: &v[0]/v.data() on an empty vector is UB, and a zero-claim
		// run trivially has no overlap.
		printf("Part 1: 0\n");
		return 0;
	}
	check(cudaSetDevice(0), "cudaSetDevice");
	check(cudaMalloc(&g_in, v.size() * sizeof(input_t)), "cudaMalloc g_in");
	check(cudaMemcpy(g_in, v.data(), v.size() * sizeof(input_t),
			cudaMemcpyHostToDevice), "cudaMemcpy g_in");
	check(cudaMalloc(&g_out, sizeof(out)), "cudaMalloc g_out");
	// Get counts per row (one block per row, one thread per column)
	part1_kernel<<<1024, 1024>>>(g_out, g_in, v.size());
	check(cudaGetLastError(), "part1_kernel launch");
	// Sum all the rows into g_out[0]
	reduce_kernel<<<1, 1024>>>(g_out);
	check(cudaGetLastError(), "reduce_kernel launch");
	// Surfaces any asynchronous execution error from the kernels
	check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
	check(cudaMemcpy(out, g_out, sizeof(out), cudaMemcpyDeviceToHost),
			"cudaMemcpy out");
	printf("Part 1: %d\n", out[0]);
	cudaFree(g_out);
	cudaFree(g_in);
	return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment