Created
December 3, 2018 17:01
-
-
Save Voltara/18e6c23df057a9f304d7b8103ba556b7 to your computer and use it in GitHub Desktop.
Advent of Code 2018 Day 3 Part 1, CUDA
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// nvcc -O3 day03.cu | |
#include <cstdio>
#include <cstdlib>
#include <vector>
struct input_t { int id, x0, y0, x1, y1; }; | |
// Block-wide sum reduction: adds in[0..1023] and stores the total in *out.
// Preconditions: blockDim.x == 1024, `in` points to shared memory, and every
// thread of the block calls this (it contains block-wide barriers).
//
// Fixes vs. the original:
//  - __syncthreads() was inside the divergent `if (tid < n)` branch, which is
//    undefined behavior (not all threads reach the barrier); it now sits
//    outside the branch so all 1024 threads hit it every iteration.
//  - The volatile warp-synchronous tail (tid < 32, no __syncwarp) assumed
//    implicit warp lockstep, which no longer holds under Volta+ independent
//    thread scheduling; the barriered tree now runs all the way down to n=1.
static __device__ void reduce(int *out, int *in) {
	int tid = threadIdx.x;
	__syncthreads();	// make every thread's element visible before summing
	for (int n = 512; n; n >>= 1) {
		if (tid < n) {
			in[tid] += in[tid + n];
		}
		__syncthreads();	// barrier reached by ALL threads, divergent or not
	}
	if (tid == 0) {
		*out = in[0];
	}
}
// Counts overlapped fabric pixels, one row per block.
// Launch layout: <<<1024, 1024>>> — blockIdx.x is the row (y), threadIdx.x
// the column (x). Each thread tallies how many of the n claims cover its
// pixel, then the block reduces the per-pixel "covered more than once"
// flags into out[blockIdx.x].
static __global__ void part1_kernel(int *out, input_t *in, size_t n) {
	__shared__ int overlap[1024];
	const int col = threadIdx.x;
	const int row = blockIdx.x;
	int claims = 0;
	for (size_t i = 0; i < n; i++) {
		const input_t *r = &in[i];
		// Branchless containment test against [x0,x1) x [y0,y1)
		claims += (col >= r->x0) & (col < r->x1) &
		          (row >= r->y0) & (row < r->y1);
	}
	overlap[col] = (claims > 1) ? 1 : 0;
	reduce(out + row, overlap);
}
// Final pass: sums the 1024 per-row counts in arr[0..1023] into arr[0].
// Launch layout: <<<1, 1024>>> (single block, one thread per element).
static __global__ void reduce_kernel(int *arr) {
	__shared__ int buf[1024];
	const int i = threadIdx.x;
	buf[i] = arr[i];	// stage in shared memory for the block reduction
	reduce(arr, buf);
}
// Abort with a diagnostic if a CUDA API call or kernel launch failed.
// The original checked nothing, so any failure silently printed garbage.
static void check(cudaError_t err, const char *what) {
	if (err != cudaSuccess) {
		fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
		exit(1);
	}
}

// Reads claims from stdin (lines like "#1 @ 1,3: 4x4"), then counts the
// fabric pixels covered by two or more claims on the GPU.
int main() {
	std::vector<input_t> v;
	int *g_out, out[1024];
	input_t *g_in;
	input_t tmp;
	while (scanf("#%d @ %d,%d: %dx%d\n", &tmp.id,
				&tmp.x0, &tmp.y0,
				&tmp.x1, &tmp.y1) == 5)
	{
		// Convert width/height into exclusive end coordinates
		tmp.x1 += tmp.x0;
		tmp.y1 += tmp.y0;
		v.push_back(tmp);
	}
	if (v.empty()) {
		// Guard: &v[0]/v.data() on an empty vector is UB, and a zero-claim
		// run trivially has no overlap.
		printf("Part 1: 0\n");
		return 0;
	}
	check(cudaSetDevice(0), "cudaSetDevice");
	check(cudaMalloc(&g_in, v.size() * sizeof(input_t)), "cudaMalloc g_in");
	check(cudaMemcpy(g_in, v.data(), v.size() * sizeof(input_t),
			cudaMemcpyHostToDevice), "cudaMemcpy g_in");
	check(cudaMalloc(&g_out, sizeof(out)), "cudaMalloc g_out");
	// Get counts per row (one block per row, one thread per column)
	part1_kernel<<<1024, 1024>>>(g_out, g_in, v.size());
	check(cudaGetLastError(), "part1_kernel launch");
	// Sum all the rows into g_out[0]
	reduce_kernel<<<1, 1024>>>(g_out);
	check(cudaGetLastError(), "reduce_kernel launch");
	// Surfaces any asynchronous execution error from the kernels
	check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
	check(cudaMemcpy(out, g_out, sizeof(out), cudaMemcpyDeviceToHost),
			"cudaMemcpy out");
	printf("Part 1: %d\n", out[0]);
	cudaFree(g_out);
	cudaFree(g_in);
	return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment