Earthcomputer/main.cu

## main.cu
#include <algorithm>
#include <chrono>
#include <iostream>
#include <sstream>
#include <thread>
#include <vector>

#ifdef _MSC_VER
#include <immintrin.h>
#endif

void doHandleError(cudaError_t errorCode, const char* file, int line) {
    if (errorCode != cudaSuccess) {
        std::cerr << file << ":" << line << ": CUDA error " << errorCode << " " << cudaGetErrorString(errorCode) << std::endl;
        exit(errorCode);
    }
}
#define handleError(x) doHandleError((x), __FILE__, __LINE__)

template<class F>
class EndScope {
    F f;
public:
    explicit EndScope(F f) : f(f) {}
    ~EndScope() {
        f();
    }
};
template<class F> EndScope<F> makeEndScope(F f) { return EndScope<F>(f); }
#define onEndScope(name, code) auto (name) = makeEndScope([&]{code;})

__host__ __device__ uint64_t rotateLeft(uint64_t n, int amt) {
    return (n << amt) | (n >> (64 - amt));
}

#ifdef __CUDA_ARCH__
__device__ double fastInverseSqrt(double x) {
    double d = 0.5 * x;
    int64_t l = __double_as_longlong(x);
    l = 6910469410427058090LL - (l >> 1);
    x = __longlong_as_double(l);
    return x * (1.5 - d * x * x);
}
#else
double fastInverseSqrt(double x) {
    double d = 0.5 * x;
    int64_t l;
    memcpy(&l, &x, sizeof(x));
    l = 6910469410427058090LL - (l >> 1);
    memcpy(&x, &l, sizeof(l));
    return x * (1.5 - d * x * x);
}
#endif

class XOR128 {
    uint64_t seedLo = 0;
    uint64_t seedHi = 0;

    __host__ __device__ static uint64_t mixStafford13(uint64_t seed) {
        seed = (seed ^ seed >> 30) * (uint64_t) -4658895280553007687LL;
        seed = (seed ^ seed >> 27) * (uint64_t) -7723592293110705685LL;
        return seed ^ seed >> 31;
    }

public:
    explicit __host__ __device__ XOR128(uint64_t seed) {
        setSeed(seed);
    }

    __host__ __device__ uint64_t next() {
        uint64_t l = seedLo;
        uint64_t m = seedHi;
        uint64_t n = rotateLeft(l + m, 17) + l;
        m ^= l;
        seedLo = rotateLeft(l, 49) ^ m ^ m << 21;
        seedHi = rotateLeft(m, 28);
        return n;
    }

private:
    __host__ __device__ int32_t next(int bits) {
        return (int32_t) (next() >> (64 - bits));
    }

public:
    __host__ __device__ int64_t nextLong() {
        int32_t i = next(32);
        int32_t j = next(32);
        int64_t l = (int64_t) i << 32;
        return l + (int64_t) j;
    }

    __host__ __device__ float nextFloat() {
        return (float) next(24) * 5.9604645E-8F;
    }

    __host__ __device__ double nextDouble() {
        int32_t i = next(26);
        int32_t j = next(27);
        int64_t l = ((int64_t)i << 27) + (int64_t)j;
        return (double)l * 1.110223E-16F;
    }

    __host__ __device__ int32_t nextInt(int32_t bound) {
        if ((bound & bound - 1) == 0) {
            return (int32_t)((int64_t)bound * (int64_t)next(31) >> 31);
        } else {
            int32_t i;
            int32_t j;
            do {
                i = next(31);
                j = i % bound;
            } while (i - j + (bound - 1) < 0);

            return j;
        }
    }

    __host__ __device__ void setSeed(uint64_t seed) {
        uint64_t l2 = seed ^ 7640891576956012809LL;
        uint64_t l3 = l2 - 7046029254386353131LL;
        seedLo = mixStafford13(l2);
        seedHi = mixStafford13(l3);
        if ((seedLo | seedHi) == 0) {
            seedLo = (uint64_t) -7046029254386353131LL;
            seedHi = 7640891576956012809LL;
        }
    }

    __host__ __device__ static uint64_t getPopulationSeed(uint64_t seed, uint64_t l, uint64_t m, int32_t blockX, int32_t blockZ) {
        return (uint64_t) (int64_t) blockX * l + (uint64_t) (int64_t) blockZ * m ^ seed;
    }

    __host__ __device__ static XOR128 withDecoratorSeed(uint64_t populationSeed, int32_t index, int32_t step) {
        uint64_t l = populationSeed + (uint64_t) index + (uint64_t) (10000 * step);
        return XOR128(l);
    }
};

class Random {
    uint64_t seed;

    __host__ __device__ int32_t next(int bits) {
        return (int32_t) ((seed = (seed * 0x5deece66dLL + 11) & 0xffffffffffffLL) >> (48 - bits));
    }

public:
    __host__ __device__ explicit Random(uint64_t seed) : seed(seed ^ 0x5deece66dLL) {}

    __host__ __device__ int64_t nextLong() {
        int32_t i = next(32);
        int32_t j = next(32);
        int64_t l = (int64_t) i << 32;
        return l + (int64_t) j;
    }

    __host__ __device__ int32_t nextInt(int32_t bound) {
        if ((bound & bound - 1) == 0) {
            return (int32_t)((int64_t)bound * (int64_t)next(31) >> 31);
        } else {
            int32_t i;
            int32_t j;
            do {
                i = next(31);
                j = i % bound;
            } while (i - j + (bound - 1) < 0);

            return j;
        }
    }

    __host__ __device__ double nextDouble() {
        int32_t i = next(26);
        int32_t j = next(27);
        int64_t l = ((int64_t)i << 27) + (int64_t)j;
        return (double)l * 1.110223E-16;
    }
};

class PerlinNoiseSampler {
    static constexpr double lacunarity = 1.0 / 16.0;
    static constexpr double persistence = 1;

    double originX = 0, originY = 0, originZ = 0;
    uint8_t permutation[256]{};

public:

    __host__ __device__ PerlinNoiseSampler() = default;

#pragma clang diagnostic push
#pragma ide diagnostic ignored "EndlessLoop"
    __host__ __device__ explicit PerlinNoiseSampler(Random& randomIn) {
        Random random(randomIn.nextLong() ^ 440898198LL); // "octave_-4".hashCode() = 440898198

        originX = random.nextDouble() * 256;
        originY = random.nextDouble() * 256;
        originZ = random.nextDouble() * 256;

        for (int32_t i = 0; i < 256; i++) {
            permutation[i] = (uint8_t) i;
        }

        for (int32_t i = 0; i < 256; i++) {
            int32_t j = random.nextInt(256 - i);
            std::swap(permutation[i], permutation[i + j]);
        }
    }
#pragma clang diagnostic pop
private:

    __host__ __device__ double doSample(double x, double y, double z, double yScale, double yMax) const {
        double d = x + originX;
        double e = y + originY;
        double f = z + originZ;
        int32_t i = ifloor(d);
        int32_t j = ifloor(e);
        int32_t k = ifloor(f);
        double g = d - i;
        double h = e - j;
        double l = f - k;
        double n;
        if (yScale != 0) {
            double m = yMax > 0 && yMax < h ? yMax : h;
            n = floor(m / yScale + 1.0E-7F) * yScale;
        } else {
            n = 0;
        }
        return doSample(i, j, k, g, h - n, l, h);
    }

    __host__ __device__ double doSample(int32_t sectionX, int32_t sectionY, int32_t sectionZ, double localX, double localY, double localZ, double fadeLocalY) const {
        int32_t i = map(sectionX);
        int32_t j = map(sectionX + 1);
        int32_t k = map(i + sectionY);
        int32_t l = map(i + sectionY + 1);
        int32_t m = map(j + sectionY);
        int32_t n = map(j + sectionY + 1);
        double d = grad(map(k + sectionZ), localX, localY, localZ);
        double e = grad(map(m + sectionZ), localX - 1, localY, localZ);
        double f = grad(map(l + sectionZ), localX, localY - 1, localZ);
        double g = grad(map(n + sectionZ), localX - 1, localY - 1, localZ);
        double h = grad(map(k + sectionZ + 1), localX, localY, localZ - 1);
        double o = grad(map(m + sectionZ + 1), localX - 1, localY, localZ - 1);
        double p = grad(map(l + sectionZ + 1), localX, localY - 1, localZ - 1);
        double q = grad(map(n + sectionZ + 1), localX - 1, localY - 1, localZ - 1);
        double r = perlinFade(localX);
        double s = perlinFade(fadeLocalY);
        double t = perlinFade(localZ);
        return lerp3(r, s, t, d, e, f, g, h, o, p, q);
    }

    __host__ __device__ uint8_t map(int32_t input) const {
        return permutation[input & 0xff];
    }

    __host__ __device__ static double grad(int hash, double x, double y, double z) {
        static constexpr double GRADIENTS[16][3] = {
            {1, 1, 0},
            {-1, 1, 0},
            {1, -1, 0},
            {-1, -1, 0},
            {1, 0, 1},
            {-1, 0, 1},
            {1, 0, -1},
            {-1, 0, -1},
            {0, 1, 1},
            {0, -1, 1},
            {0, 1, -1},
            {0, -1, -1},
            {1, 1, 0},
            {0, -1, 1},
            {-1, 1, 0},
            {0, -1, -1}
        };
        return dot(GRADIENTS[hash & 15], x, y, z);
    }

    __host__ __device__ static double dot(const double gradient[3], double x, double y, double z) {
        return gradient[0] * x + gradient[1] * y + gradient[2] * z;
    }

    __host__ __device__ static double perlinFade(double value) {
        return value * value * value * (value * (value * 6.0 - 15.0) + 10.0);
    }

    __host__ __device__ static double lerp3(
        double deltaX,
        double deltaY,
        double deltaZ,
        double x0y0z0,
        double x1y0z0,
        double x0y1z0,
        double x1y1z0,
        double x0y0z1,
        double x1y0z1,
        double x0y1z1,
        double x1y1z1
    ) {
        return lerp(deltaZ, lerp2(deltaX, deltaY, x0y0z0, x1y0z0, x0y1z0, x1y1z0), lerp2(deltaX, deltaY, x0y0z1, x1y0z1, x0y1z1, x1y1z1));
    }

    __host__ __device__ static double lerp2(double deltaX, double deltaY, double x0y0, double x1y0, double x0y1, double x1y1) {
        return lerp(deltaY, lerp(deltaX, x0y0, x1y0), lerp(deltaX, x0y1, x1y1));
    }

    __host__ __device__ static double lerp(double delta, double start, double end) {
        return start + delta * (end - start);
    }

    __host__ __device__ static double maintainPrecision(double value) {
        return value - (double) lfloor(value / 3.3554432E7 + 0.5) * 3.3554432E7;
    }

    __host__ __device__ static int32_t ifloor(double value) {
        auto i = (int32_t) value;
        return value < (double) i ? i - 1 : i;
    }

    __host__ __device__ static int64_t lfloor(double value) {
        auto l = (int64_t) value;
        return value < (double) l ? l - 1 : l;
    }

public:
    __host__ __device__ double sample(double x, double y, double z, double yScale, double yMax) const {
        double e = lacunarity;
        double f = persistence;

        double g = doSample(maintainPrecision(x * e), maintainPrecision(y * e), maintainPrecision(z * e), yScale * e, yMax * e);
        return g * f;
    }
};

class DoublePerlinNoiseSampler {
    PerlinNoiseSampler firstSampler{};
    PerlinNoiseSampler secondSampler{};

public:
    __host__ __device__ DoublePerlinNoiseSampler() = default;

    __host__ __device__ explicit DoublePerlinNoiseSampler(Random& random) {
        firstSampler = PerlinNoiseSampler(random);
        secondSampler = PerlinNoiseSampler(random);
    }

    __host__ __device__ double sample(double x, double y, double z) const {
        double d = x * 1.0181268882175227;
        double e = y * 1.0181268882175227;
        double f = z * 1.0181268882175227;
        double first = firstSampler.sample(x, y, z, 0, 0);
        double second = secondSampler.sample(d, e, f, 0, 0);
        return (first + second) * (5.0 / 6.0);
    }
};

class ByteArray {
    uint32_t* data;
public:
    __host__ __device__ explicit ByteArray(uint32_t* data) : data(data) {}

    __host__ __device__ uint8_t operator[](size_t index) {
        return (data[index >> 2] >> ((index & 3) << 3)) & 0xff;
    }

    __device__ uint8_t atomicInc(size_t index) {
        return (atomicAdd(data + (index >> 2), 1 << ((index & 3) << 3)) >> ((index & 3) << 3)) & 0xff;
    }
};

struct BlockPos {
    int32_t x, y, z;

    __host__ __device__ BlockPos operator+(const BlockPos& other) const {
        return {x + other.x, y + other.y, z + other.z};
    }

    __host__ __device__ double getSquaredDistance(const BlockPos& other) const {
        double d = (double) x - (double) other.x;
        double e = (double) y - (double) other.y;
        double f = (double) z - (double) other.z;
        return d * d + e * e + f * f;
    }
};

struct ChunkPos {
    int32_t x, z;
};
static_assert(sizeof(ChunkPos) == 8, "sizeof(ChunkPos) must be 8");

struct PosIntPair {
    BlockPos pos;
    int32_t val;
};

struct GeodeGeneratorStaticData {
    uint64_t seed = 0;
    DoublePerlinNoiseSampler doublePerlinNoiseSampler{};
    uint64_t nextA = 0;
    uint64_t nextB = 0;

    __host__ __device__ GeodeGeneratorStaticData() = default;

    __host__ __device__ explicit GeodeGeneratorStaticData(uint64_t seed) : seed(seed) {
        Random lcg(seed);
        doublePerlinNoiseSampler = DoublePerlinNoiseSampler(lcg);
        XOR128 rand(seed);
        nextA = rand.nextLong() | 1;
        nextB = rand.nextLong() | 1;
    }
};

__forceinline__ __host__ __device__ bool canGenerateGeode(const GeodeGeneratorStaticData& staticData, int chunkX, int chunkZ) {
    int32_t blockX = chunkX << 4;
    int32_t blockZ = chunkZ << 4;
    uint64_t decorationSeed = XOR128::getPopulationSeed(staticData.seed, staticData.nextA, staticData.nextB, blockX, blockZ);
    XOR128 rand = XOR128::withDecoratorSeed(decorationSeed, 2, 2);
    return rand.nextFloat() < 1.0 / 24;
}

template<class PlaceCrystalFunc>
__forceinline__ __host__ __device__ bool generateGeode(
    const GeodeGeneratorStaticData& staticData,
    int32_t chunkX,
    int32_t chunkZ,
    PlaceCrystalFunc placeCrystalFunc
) {
    int32_t blockX = chunkX << 4;
    int32_t blockZ = chunkZ << 4;
    uint64_t decorationSeed = XOR128::getPopulationSeed(staticData.seed, staticData.nextA, staticData.nextB, blockX, blockZ);
    XOR128 rand = XOR128::withDecoratorSeed(decorationSeed, 2, 2);
    rand.nextFloat(); // caller should use canGenerateGeode first

    blockX += rand.nextInt(16);
    blockZ += rand.nextInt(16);

    int32_t blockY = rand.nextInt(30-(-64+6)+1)+(-64+6);

    BlockPos blockPos{blockX, blockY, blockZ};

    int k = rand.nextInt((4-3+1))+3;
    double d = (double)k / 6.0;

    constexpr double e = 0.76696498884; // 1 / sqrt(1.7)
    double f = 1.0 / sqrt(2.2 + d);
    double h = 1.0 / sqrt(4.2 + d);
    double l = 1.0 / sqrt(2.0 + rand.nextDouble() / 2.0 + (k > 3 ? d : 0.0));
    bool bl = rand.nextFloat() < 0.95;

    PosIntPair list[4];
    int listSize = 0;
    for (int n = 0; n < k; n++) {
        int o = rand.nextInt(6 - 4 + 1) + 4;
        int p = rand.nextInt(6 - 4 + 1) + 4;
        int q = rand.nextInt(6 - 4 + 1) + 4;
        BlockPos blockPos2 = blockPos + BlockPos{o, p, q};

        list[listSize++] = PosIntPair{blockPos2, rand.nextInt(2-1+1)+1};
    }

    BlockPos list2[3];
    if (bl) {
        int n = rand.nextInt(4);
        int o = k * 2 + 1;
        switch (n) {
            case 0:
                list2[0] = blockPos + BlockPos{o, 7, 0};
                list2[1] = blockPos + BlockPos{o, 5, 0};
                list2[2] = blockPos + BlockPos{o, 1, 0};
                break;
            case 1:
                list2[0] = blockPos + BlockPos{0, 7, o};
                list2[1] = blockPos + BlockPos{0, 5, o};
                list2[2] = blockPos + BlockPos{0, 1, o};
                break;
            case 2:
                list2[0] = blockPos + BlockPos{o, 7, o};
                list2[1] = blockPos + BlockPos{o, 5, o};
                list2[2] = blockPos + BlockPos{o, 1, o};
                break;
            default:
                list2[0] = blockPos + BlockPos{0, 7, 0};
                list2[1] = blockPos + BlockPos{0, 5, 0};
                list2[2] = blockPos + BlockPos{0, 1, 0};
                break;
        }
    }

    for (int dz = -16; dz <= 16; dz++) {
        for (int dy = -16; dy <= 16; dy++) {
            for (int dx = -16; dx <= 16; dx++) {
                BlockPos blockPos3 = blockPos + BlockPos{dx, dy, dz};

                double s = 0;
                double t = 0;

                for (int i = 0; i < listSize; i++) {
                    s += fastInverseSqrt(blockPos3.getSquaredDistance(list[i].pos) + (double)list[i].val);
                }
                double esx = s + 0.05 * (double)k;
                double esn = s - 0.05 * (double)k;
                if (esn >= e || (!((esx >= h)&&(esx >= f)))) {
                    continue;
                }

                if (bl) {
                    for (int i = 0; i < 3; i++) {
                        t += fastInverseSqrt(blockPos3.getSquaredDistance(list2[i]) + 2);
                    }
                    if (t - 0.05 * 3 >= l) {
                        continue;
                    }
                }

                double r = staticData.doublePerlinNoiseSampler.sample((double)blockPos3.x, (double)blockPos3.y, (double)blockPos3.z) * 0.05;

                s += r * (double) k;
                if ((!((s >= h)&&(s >= f)))||(s >= e))
                    continue;
                if (bl) {
                    t += r * 3;
                    if (t >= l)
                        continue;
                }
                bool bl2 = rand.nextFloat() < 0.083;
                if (bl2) {
                    placeCrystalFunc(blockPos3);
                    rand.next();
                }
            }
        }
    }

    return true;
}

__constant__ GeodeGeneratorStaticData STATIC_DATA;

__global__ void filterCanGenerateGeode(int32_t minChunkX, int32_t minChunkZ, uint32_t squareSize, uint32_t* count, ChunkPos* outPositions) {
    uint64_t idx = (uint64_t)blockIdx.x * (uint64_t)blockDim.x + (uint64_t)threadIdx.x;
    if (idx > squareSize * squareSize) {
        return;
    }
    int32_t cx = minChunkX + (int32_t) (idx / squareSize);
    int32_t cz = minChunkZ + (int32_t) (idx % squareSize);
    if (canGenerateGeode(STATIC_DATA, cx, cz)) {
        uint32_t index = atomicAdd(count, 1);
        outPositions[index] = ChunkPos{cx, cz};
    }
}

__global__ void countCrystals(uint32_t count, ChunkPos* positions, size_t* progress, int32_t minChunkX, int32_t minChunkZ, uint32_t squareSize, ByteArray crystalCounts) {
    uint32_t idx = (uint32_t) blockIdx.x * (uint32_t) blockDim.x + (uint32_t) threadIdx.x;
    if (idx > count) {
        return;
    }
    *progress = max(*progress, (size_t) idx);
    ChunkPos position = positions[idx];
    generateGeode(STATIC_DATA, position.x, position.z, [minChunkX, minChunkZ, squareSize, &crystalCounts](const BlockPos& pos) {
        int32_t cx = pos.x >> 4;
        int32_t cz = pos.z >> 4;
        if (cx >= minChunkX && cz >= minChunkZ && cx < minChunkX + (int32_t)squareSize && cz < minChunkZ + (int32_t)squareSize) {
            uint32_t index = (uint32_t) (cx - minChunkX) + squareSize * (uint32_t) (cz - minChunkZ);
            crystalCounts.atomicInc(index);
        }
    });
}

__host__ __device__ int getCountInRange(ByteArray crystalCounts, int32_t cx, int32_t cz, uint32_t squareSize) {
    constexpr double playerPositions[8][2] = {
        { 1.0625, 4.125 },
        { 4.125, 1.0625 },
        { 14.9375, 4.125 },
        { 4.125, 14.9375 },
        { 1.0625, 11.875 },
        { 11.875, 1.0625 },
        { 14.9375, 11.875 },
        { 11.875, 14.9375 },
    };

    int maxCount = 0;

    for (int i = 0; i < 8; i++) {
        double playerX = playerPositions[i][0];
        double playerZ = playerPositions[i][1];
        int count = 0;
        for (int ox = -8; ox <= 8; ox++) {
            for (int oz = -8; oz <= 8; oz++) {
                int chunkX = ox * 16 + 8;
                int chunkZ = ox * 16 + 8;
                double dx = chunkX - playerX;
                double dz = chunkZ - playerZ;
                if (dx * dx + dz * dz < 128 * 128) {
                    int32_t actualChunkX = cx + ox;
                    int32_t actualChunkZ = cz + oz;
                    if (actualChunkX >= 0 && actualChunkZ >= 0 && (uint32_t)actualChunkX < squareSize && (uint32_t)actualChunkZ < squareSize) {
                        uint32_t index = (uint32_t) actualChunkX + squareSize * (uint32_t) actualChunkZ;
                        count += crystalCounts[index];
                    }
                }
            }
        }

        maxCount = max(count, maxCount);
    }

    return maxCount;
}

constexpr int ACCEPTABLE_BELOW_MAX = 100;

__global__ void processRenderDistanceRanges(size_t* progress, ByteArray crystalCounts, uint32_t squareSize, int* maxCount, uint32_t* outputCount, ChunkPos* output) {
    uint64_t idx = (uint64_t)blockIdx.x * (uint64_t)blockDim.x + (uint64_t)threadIdx.x;
    if (idx > squareSize * squareSize) {
        return;
    }
    *progress = max(*progress, (size_t) idx);

    auto cx = (int32_t) (idx / squareSize);
    auto cz = (int32_t) (idx % squareSize);

    int count = getCountInRange(crystalCounts, cx, cz, squareSize);
    int oldCount = atomicMax(maxCount, count);
    int currentMax = max(count, oldCount);
    if (currentMax - count <= ACCEPTABLE_BELOW_MAX) {
        uint32_t index = atomicAdd(outputCount, 1);
        output[index] = ChunkPos{cx, cz};
        return;
    }
}

template <class F, class L>
void executeKernelUnderProgress(const char* displayName, size_t workSize, F kernelFunction, L kernelLauncher) {
    int blockSize;
    int minGridSize;
    handleError(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, kernelFunction, 0, workSize));
    int gridSize = (int) ((workSize + (size_t) blockSize - 1) / (size_t) blockSize);
    std::cout << "Executing " << displayName << " with gridSize=" << gridSize << " blockSize=" << blockSize << std::endl;

    size_t* deviceProgress;
    handleError(cudaMalloc(&deviceProgress, sizeof(*deviceProgress)));
    size_t* hostProgress;
    handleError(cudaHostAlloc(&hostProgress, sizeof(*hostProgress), cudaHostAllocDefault));

    cudaEvent_t finishEvent;
    handleError(cudaEventCreate(&finishEvent));
    cudaStream_t executionStream;
    handleError(cudaStreamCreate(&executionStream));
    cudaStream_t progressStream;
    handleError(cudaStreamCreate(&progressStream));

    kernelLauncher(gridSize, blockSize, executionStream, deviceProgress);
    handleError(cudaEventRecord(finishEvent, executionStream));

    cudaError_t queryError;
    while ((queryError = cudaEventQuery(finishEvent)) == cudaErrorNotReady) {
        handleError(cudaMemcpyAsync(hostProgress, deviceProgress, sizeof(*hostProgress), cudaMemcpyDeviceToHost, progressStream));
        handleError(cudaStreamSynchronize(progressStream));
        std::cout << "... Progress " << displayName << ": " << *hostProgress << " / " << workSize << " (" << ((double) *hostProgress / (double) workSize * 100) << "%)" << std::endl;
        std::this_thread::sleep_for(std::chrono::seconds(1));
    }
    handleError(queryError);
    handleError(cudaEventDestroy(finishEvent));
    handleError(cudaStreamDestroy(progressStream));
    handleError(cudaStreamDestroy(executionStream));
    handleError(cudaFreeHost(hostProgress));
    handleError(cudaFree(deviceProgress));
}

template <class T>
bool string2int(const char* str, T& result) {
    std::stringstream ss(str);
    return !(ss >> result).fail() && ss.eof();
}

int main(int argc, char** argv) {
    if (argc < 5) {
        std::cerr << argv[0] << " <seed> <center-chunk-x> <center-chunk-z> <chunk-radius>" << std::endl;
        return 0;
    }
    int64_t seed;
    int32_t centerChunkX, centerChunkZ, chunkRadius;
    if (!string2int(argv[1], seed) || !string2int(argv[2], centerChunkX) || !string2int(argv[3], centerChunkZ) || !string2int(argv[4], chunkRadius)) {
        std::cerr << "Invalid integer inputs" << std::endl;
        return 0;
    }
    if (chunkRadius <= 0) {
        std::cerr << "Chunk radius must be positive" << std::endl;
        return 0;
    }
    int32_t minChunkX = centerChunkX - chunkRadius;
    int32_t minChunkZ = centerChunkZ - chunkRadius;
    uint32_t squareSize = chunkRadius * 2;

    size_t freeMemory;
    size_t totalMemory;
    handleError(cudaMemGetInfo(&freeMemory, &totalMemory));
    auto maxSquareSize = (size_t) (sqrt((double) freeMemory) * 0.5);
    if (maxSquareSize == 0) {
        std::cerr << "No free GPU memory" << std::endl;
        return 0;
    }
#ifdef _MSC_VER
    int leadingZeros = sizeof(size_t) == 8 ? _lzcnt_u64(maxSquareSize) : _lzcnt_u32(maxSquareSize);
#else
    int leadingZeros = __builtin_clz(maxSquareSize);
#endif
    maxSquareSize = 1 << ((sizeof(maxSquareSize) * 8 - 1) - leadingZeros);
    if (squareSize > maxSquareSize) {
        std::cerr << "Not enough GPU memory for that radius. Radius must be at most " << (maxSquareSize / 2) << " chunks for this available memory" << std::endl;
        return 0;
    }
    size_t workSize = squareSize * squareSize;

    std::cout << "Searching area " << minChunkX << ", " << minChunkZ << " to " << (minChunkX + squareSize - 1) << ", " << (minChunkZ + squareSize - 1) << std::endl;

    GeodeGeneratorStaticData staticData(seed);
    handleError(cudaMemcpyToSymbol(STATIC_DATA, &staticData, sizeof(staticData)));

    ChunkPos* filteredPositions;
    cudaMalloc(&filteredPositions, workSize); // allocate 8 times less than the capacity, okay because we'll never actually reach this
    uint32_t* countPtr;
    cudaMallocManaged(&countPtr, sizeof(*countPtr));
    *countPtr = 0;
    executeKernelUnderProgress("finding geodes", workSize, filterCanGenerateGeode, [=](int gridSize, int blockSize, cudaStream_t executionStream, size_t* deviceProgress){
        filterCanGenerateGeode<<<gridSize, blockSize, 0, executionStream>>>(minChunkX, minChunkZ, squareSize, countPtr, filteredPositions);
    });
    uint32_t count = *countPtr;
    handleError(cudaFree(countPtr));

    std::cout << "Found " << count << " geodes total in grid" << std::endl;

    // shrink filtered positions allocation
    // array is too large to keep on device, need to go via host
    auto temp = new ChunkPos[count];
    handleError(cudaMemcpy(temp, filteredPositions, count * sizeof(*filteredPositions), cudaMemcpyDeviceToHost));
    handleError(cudaFree(filteredPositions));
    handleError(cudaMalloc(&filteredPositions, count * sizeof(*filteredPositions)));
    handleError(cudaMemcpy(filteredPositions, temp, count * sizeof(*filteredPositions), cudaMemcpyHostToDevice));
    delete[] temp;

    uint32_t* crystalCountsRaw;
    size_t sizeOfCrystalCounts = (workSize + 3) / 4 * 4;
    handleError(cudaMalloc(&crystalCountsRaw, sizeOfCrystalCounts));
    handleError(cudaMemset(crystalCountsRaw, 0, sizeOfCrystalCounts));
    ByteArray crystalCounts(crystalCountsRaw);
    auto crystalCountsRawHost = new uint32_t[sizeOfCrystalCounts];
    ByteArray crystalCountsHost(crystalCountsRawHost);

    executeKernelUnderProgress("crystal counting", count, countCrystals, [=](int gridSize, int blockSize, cudaStream_t executionStream, size_t* deviceProgress){
        countCrystals<<<gridSize, blockSize, 0, executionStream>>>(count, filteredPositions, deviceProgress, minChunkX, minChunkZ, squareSize, crystalCounts);
    });

    handleError(cudaFree(filteredPositions));

    int* maxCount;
    handleError(cudaMallocManaged(&maxCount, sizeof(*maxCount)));
    uint32_t* outputCount;
    handleError(cudaMallocManaged(&outputCount, sizeof(*outputCount)));
    ChunkPos* output;
    handleError(cudaMallocManaged(&output, 10000 * sizeof(*output)));

    executeKernelUnderProgress("applying ticking ranges", workSize, processRenderDistanceRanges, [=](int gridSize, int blockSize, cudaStream_t executionStream, size_t* deviceProgress){
        processRenderDistanceRanges<<<gridSize, blockSize, 0, executionStream>>>(deviceProgress, crystalCounts, squareSize, maxCount, outputCount, output);
    });

    handleError(cudaMemcpy(crystalCountsRawHost, crystalCountsRaw, sizeOfCrystalCounts, cudaMemcpyDeviceToHost));

    std::vector<std::pair<ChunkPos, int>> filteredOutput;
    for (int i = 0; i < *outputCount; i++) {
        int inRange = getCountInRange(crystalCountsHost, output[i].x, output[i].z, squareSize);
        if (*maxCount - inRange <= ACCEPTABLE_BELOW_MAX) {
            filteredOutput.emplace_back(output[i], inRange);
        }
    }

    std::cout << "Max found: " << *maxCount << ", number of areas within " << ACCEPTABLE_BELOW_MAX << " of max: " << filteredOutput.size() << std::endl;

    std::sort(filteredOutput.begin(), filteredOutput.end(), [](const std::pair<ChunkPos, int>& a, const std::pair<ChunkPos, int>& b) { return a.second >= b.second; });
    for (const std::pair<ChunkPos, int>& pair : filteredOutput) {
        std::cout << "(" << (pair.first.x + minChunkX) << ", " << (pair.first.z + minChunkZ) << "): " << pair.second << std::endl;
    }

    return 0;
}