Skip to content

Instantly share code, notes, and snippets.

@Earthcomputer
Created January 15, 2024 21:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Earthcomputer/c2d20dd0b139f9f267eb34c760519313 to your computer and use it in GitHub Desktop.
Save Earthcomputer/c2d20dd0b139f9f267eb34c760519313 to your computer and use it in GitHub Desktop.
Amethyst cluster finder
#include <algorithm>
#include <chrono>
#include <iostream>
#include <sstream>
#include <thread>
#include <vector>
#ifdef _MSC_VER
#include <immintrin.h>
#endif
void doHandleError(cudaError_t errorCode, const char* file, int line) {
if (errorCode != cudaSuccess) {
std::cerr << file << ":" << line << ": CUDA error " << errorCode << " " << cudaGetErrorString(errorCode) << std::endl;
exit(errorCode);
}
}
#define handleError(x) doHandleError((x), __FILE__, __LINE__)
template<class F>
class EndScope {
F f;
public:
explicit EndScope(F f) : f(f) {}
~EndScope() {
f();
}
};
template<class F> EndScope<F> makeEndScope(F f) { return EndScope<F>(f); }
#define onEndScope(name, code) auto (name) = makeEndScope([&]{code;})
__host__ __device__ uint64_t rotateLeft(uint64_t n, int amt) {
return (n << amt) | (n >> (64 - amt));
}
#ifdef __CUDA_ARCH__
__device__ double fastInverseSqrt(double x) {
double d = 0.5 * x;
int64_t l = __double_as_longlong(x);
l = 6910469410427058090LL - (l >> 1);
x = __longlong_as_double(l);
return x * (1.5 - d * x * x);
}
#else
double fastInverseSqrt(double x) {
double d = 0.5 * x;
int64_t l;
memcpy(&l, &x, sizeof(x));
l = 6910469410427058090LL - (l >> 1);
memcpy(&x, &l, sizeof(l));
return x * (1.5 - d * x * x);
}
#endif
class XOR128 {
uint64_t seedLo = 0;
uint64_t seedHi = 0;
__host__ __device__ static uint64_t mixStafford13(uint64_t seed) {
seed = (seed ^ seed >> 30) * (uint64_t) -4658895280553007687LL;
seed = (seed ^ seed >> 27) * (uint64_t) -7723592293110705685LL;
return seed ^ seed >> 31;
}
public:
explicit __host__ __device__ XOR128(uint64_t seed) {
setSeed(seed);
}
__host__ __device__ uint64_t next() {
uint64_t l = seedLo;
uint64_t m = seedHi;
uint64_t n = rotateLeft(l + m, 17) + l;
m ^= l;
seedLo = rotateLeft(l, 49) ^ m ^ m << 21;
seedHi = rotateLeft(m, 28);
return n;
}
private:
__host__ __device__ int32_t next(int bits) {
return (int32_t) (next() >> (64 - bits));
}
public:
__host__ __device__ int64_t nextLong() {
int32_t i = next(32);
int32_t j = next(32);
int64_t l = (int64_t) i << 32;
return l + (int64_t) j;
}
__host__ __device__ float nextFloat() {
return (float) next(24) * 5.9604645E-8F;
}
__host__ __device__ double nextDouble() {
int32_t i = next(26);
int32_t j = next(27);
int64_t l = ((int64_t)i << 27) + (int64_t)j;
return (double)l * 1.110223E-16F;
}
__host__ __device__ int32_t nextInt(int32_t bound) {
if ((bound & bound - 1) == 0) {
return (int32_t)((int64_t)bound * (int64_t)next(31) >> 31);
} else {
int32_t i;
int32_t j;
do {
i = next(31);
j = i % bound;
} while (i - j + (bound - 1) < 0);
return j;
}
}
__host__ __device__ void setSeed(uint64_t seed) {
uint64_t l2 = seed ^ 7640891576956012809LL;
uint64_t l3 = l2 - 7046029254386353131LL;
seedLo = mixStafford13(l2);
seedHi = mixStafford13(l3);
if ((seedLo | seedHi) == 0) {
seedLo = (uint64_t) -7046029254386353131LL;
seedHi = 7640891576956012809LL;
}
}
__host__ __device__ static uint64_t getPopulationSeed(uint64_t seed, uint64_t l, uint64_t m, int32_t blockX, int32_t blockZ) {
return (uint64_t) (int64_t) blockX * l + (uint64_t) (int64_t) blockZ * m ^ seed;
}
__host__ __device__ static XOR128 withDecoratorSeed(uint64_t populationSeed, int32_t index, int32_t step) {
uint64_t l = populationSeed + (uint64_t) index + (uint64_t) (10000 * step);
return XOR128(l);
}
};
class Random {
uint64_t seed;
__host__ __device__ int32_t next(int bits) {
return (int32_t) ((seed = (seed * 0x5deece66dLL + 11) & 0xffffffffffffLL) >> (48 - bits));
}
public:
__host__ __device__ explicit Random(uint64_t seed) : seed(seed ^ 0x5deece66dLL) {}
__host__ __device__ int64_t nextLong() {
int32_t i = next(32);
int32_t j = next(32);
int64_t l = (int64_t) i << 32;
return l + (int64_t) j;
}
__host__ __device__ int32_t nextInt(int32_t bound) {
if ((bound & bound - 1) == 0) {
return (int32_t)((int64_t)bound * (int64_t)next(31) >> 31);
} else {
int32_t i;
int32_t j;
do {
i = next(31);
j = i % bound;
} while (i - j + (bound - 1) < 0);
return j;
}
}
__host__ __device__ double nextDouble() {
int32_t i = next(26);
int32_t j = next(27);
int64_t l = ((int64_t)i << 27) + (int64_t)j;
return (double)l * 1.110223E-16;
}
};
class PerlinNoiseSampler {
static constexpr double lacunarity = 1.0 / 16.0;
static constexpr double persistence = 1;
double originX = 0, originY = 0, originZ = 0;
uint8_t permutation[256]{};
public:
__host__ __device__ PerlinNoiseSampler() = default;
#pragma clang diagnostic push
#pragma ide diagnostic ignored "EndlessLoop"
__host__ __device__ explicit PerlinNoiseSampler(Random& randomIn) {
Random random(randomIn.nextLong() ^ 440898198LL); // "octave_-4".hashCode() = 440898198
originX = random.nextDouble() * 256;
originY = random.nextDouble() * 256;
originZ = random.nextDouble() * 256;
for (int32_t i = 0; i < 256; i++) {
permutation[i] = (uint8_t) i;
}
for (int32_t i = 0; i < 256; i++) {
int32_t j = random.nextInt(256 - i);
std::swap(permutation[i], permutation[i + j]);
}
}
#pragma clang diagnostic pop
private:
__host__ __device__ double doSample(double x, double y, double z, double yScale, double yMax) const {
double d = x + originX;
double e = y + originY;
double f = z + originZ;
int32_t i = ifloor(d);
int32_t j = ifloor(e);
int32_t k = ifloor(f);
double g = d - i;
double h = e - j;
double l = f - k;
double n;
if (yScale != 0) {
double m = yMax > 0 && yMax < h ? yMax : h;
n = floor(m / yScale + 1.0E-7F) * yScale;
} else {
n = 0;
}
return doSample(i, j, k, g, h - n, l, h);
}
__host__ __device__ double doSample(int32_t sectionX, int32_t sectionY, int32_t sectionZ, double localX, double localY, double localZ, double fadeLocalY) const {
int32_t i = map(sectionX);
int32_t j = map(sectionX + 1);
int32_t k = map(i + sectionY);
int32_t l = map(i + sectionY + 1);
int32_t m = map(j + sectionY);
int32_t n = map(j + sectionY + 1);
double d = grad(map(k + sectionZ), localX, localY, localZ);
double e = grad(map(m + sectionZ), localX - 1, localY, localZ);
double f = grad(map(l + sectionZ), localX, localY - 1, localZ);
double g = grad(map(n + sectionZ), localX - 1, localY - 1, localZ);
double h = grad(map(k + sectionZ + 1), localX, localY, localZ - 1);
double o = grad(map(m + sectionZ + 1), localX - 1, localY, localZ - 1);
double p = grad(map(l + sectionZ + 1), localX, localY - 1, localZ - 1);
double q = grad(map(n + sectionZ + 1), localX - 1, localY - 1, localZ - 1);
double r = perlinFade(localX);
double s = perlinFade(fadeLocalY);
double t = perlinFade(localZ);
return lerp3(r, s, t, d, e, f, g, h, o, p, q);
}
__host__ __device__ uint8_t map(int32_t input) const {
return permutation[input & 0xff];
}
__host__ __device__ static double grad(int hash, double x, double y, double z) {
static constexpr double GRADIENTS[16][3] = {
{1, 1, 0},
{-1, 1, 0},
{1, -1, 0},
{-1, -1, 0},
{1, 0, 1},
{-1, 0, 1},
{1, 0, -1},
{-1, 0, -1},
{0, 1, 1},
{0, -1, 1},
{0, 1, -1},
{0, -1, -1},
{1, 1, 0},
{0, -1, 1},
{-1, 1, 0},
{0, -1, -1}
};
return dot(GRADIENTS[hash & 15], x, y, z);
}
__host__ __device__ static double dot(const double gradient[3], double x, double y, double z) {
return gradient[0] * x + gradient[1] * y + gradient[2] * z;
}
__host__ __device__ static double perlinFade(double value) {
return value * value * value * (value * (value * 6.0 - 15.0) + 10.0);
}
__host__ __device__ static double lerp3(
double deltaX,
double deltaY,
double deltaZ,
double x0y0z0,
double x1y0z0,
double x0y1z0,
double x1y1z0,
double x0y0z1,
double x1y0z1,
double x0y1z1,
double x1y1z1
) {
return lerp(deltaZ, lerp2(deltaX, deltaY, x0y0z0, x1y0z0, x0y1z0, x1y1z0), lerp2(deltaX, deltaY, x0y0z1, x1y0z1, x0y1z1, x1y1z1));
}
__host__ __device__ static double lerp2(double deltaX, double deltaY, double x0y0, double x1y0, double x0y1, double x1y1) {
return lerp(deltaY, lerp(deltaX, x0y0, x1y0), lerp(deltaX, x0y1, x1y1));
}
__host__ __device__ static double lerp(double delta, double start, double end) {
return start + delta * (end - start);
}
__host__ __device__ static double maintainPrecision(double value) {
return value - (double) lfloor(value / 3.3554432E7 + 0.5) * 3.3554432E7;
}
__host__ __device__ static int32_t ifloor(double value) {
auto i = (int32_t) value;
return value < (double) i ? i - 1 : i;
}
__host__ __device__ static int64_t lfloor(double value) {
auto l = (int64_t) value;
return value < (double) l ? l - 1 : l;
}
public:
__host__ __device__ double sample(double x, double y, double z, double yScale, double yMax) const {
double e = lacunarity;
double f = persistence;
double g = doSample(maintainPrecision(x * e), maintainPrecision(y * e), maintainPrecision(z * e), yScale * e, yMax * e);
return g * f;
}
};
class DoublePerlinNoiseSampler {
PerlinNoiseSampler firstSampler{};
PerlinNoiseSampler secondSampler{};
public:
__host__ __device__ DoublePerlinNoiseSampler() = default;
__host__ __device__ explicit DoublePerlinNoiseSampler(Random& random) {
firstSampler = PerlinNoiseSampler(random);
secondSampler = PerlinNoiseSampler(random);
}
__host__ __device__ double sample(double x, double y, double z) const {
double d = x * 1.0181268882175227;
double e = y * 1.0181268882175227;
double f = z * 1.0181268882175227;
double first = firstSampler.sample(x, y, z, 0, 0);
double second = secondSampler.sample(d, e, f, 0, 0);
return (first + second) * (5.0 / 6.0);
}
};
class ByteArray {
uint32_t* data;
public:
__host__ __device__ explicit ByteArray(uint32_t* data) : data(data) {}
__host__ __device__ uint8_t operator[](size_t index) {
return (data[index >> 2] >> ((index & 3) << 3)) & 0xff;
}
__device__ uint8_t atomicInc(size_t index) {
return (atomicAdd(data + (index >> 2), 1 << ((index & 3) << 3)) >> ((index & 3) << 3)) & 0xff;
}
};
struct BlockPos {
int32_t x, y, z;
__host__ __device__ BlockPos operator+(const BlockPos& other) const {
return {x + other.x, y + other.y, z + other.z};
}
__host__ __device__ double getSquaredDistance(const BlockPos& other) const {
double d = (double) x - (double) other.x;
double e = (double) y - (double) other.y;
double f = (double) z - (double) other.z;
return d * d + e * e + f * f;
}
};
struct ChunkPos {
int32_t x, z;
};
static_assert(sizeof(ChunkPos) == 8, "sizeof(ChunkPos) must be 8");
struct PosIntPair {
BlockPos pos;
int32_t val;
};
struct GeodeGeneratorStaticData {
uint64_t seed = 0;
DoublePerlinNoiseSampler doublePerlinNoiseSampler{};
uint64_t nextA = 0;
uint64_t nextB = 0;
__host__ __device__ GeodeGeneratorStaticData() = default;
__host__ __device__ explicit GeodeGeneratorStaticData(uint64_t seed) : seed(seed) {
Random lcg(seed);
doublePerlinNoiseSampler = DoublePerlinNoiseSampler(lcg);
XOR128 rand(seed);
nextA = rand.nextLong() | 1;
nextB = rand.nextLong() | 1;
}
};
__forceinline__ __host__ __device__ bool canGenerateGeode(const GeodeGeneratorStaticData& staticData, int chunkX, int chunkZ) {
int32_t blockX = chunkX << 4;
int32_t blockZ = chunkZ << 4;
uint64_t decorationSeed = XOR128::getPopulationSeed(staticData.seed, staticData.nextA, staticData.nextB, blockX, blockZ);
XOR128 rand = XOR128::withDecoratorSeed(decorationSeed, 2, 2);
return rand.nextFloat() < 1.0 / 24;
}
template<class PlaceCrystalFunc>
__forceinline__ __host__ __device__ bool generateGeode(
const GeodeGeneratorStaticData& staticData,
int32_t chunkX,
int32_t chunkZ,
PlaceCrystalFunc placeCrystalFunc
) {
int32_t blockX = chunkX << 4;
int32_t blockZ = chunkZ << 4;
uint64_t decorationSeed = XOR128::getPopulationSeed(staticData.seed, staticData.nextA, staticData.nextB, blockX, blockZ);
XOR128 rand = XOR128::withDecoratorSeed(decorationSeed, 2, 2);
rand.nextFloat(); // caller should use canGenerateGeode first
blockX += rand.nextInt(16);
blockZ += rand.nextInt(16);
int32_t blockY = rand.nextInt(30-(-64+6)+1)+(-64+6);
BlockPos blockPos{blockX, blockY, blockZ};
int k = rand.nextInt((4-3+1))+3;
double d = (double)k / 6.0;
constexpr double e = 0.76696498884; // 1 / sqrt(1.7)
double f = 1.0 / sqrt(2.2 + d);
double h = 1.0 / sqrt(4.2 + d);
double l = 1.0 / sqrt(2.0 + rand.nextDouble() / 2.0 + (k > 3 ? d : 0.0));
bool bl = rand.nextFloat() < 0.95;
PosIntPair list[4];
int listSize = 0;
for (int n = 0; n < k; n++) {
int o = rand.nextInt(6 - 4 + 1) + 4;
int p = rand.nextInt(6 - 4 + 1) + 4;
int q = rand.nextInt(6 - 4 + 1) + 4;
BlockPos blockPos2 = blockPos + BlockPos{o, p, q};
list[listSize++] = PosIntPair{blockPos2, rand.nextInt(2-1+1)+1};
}
BlockPos list2[3];
if (bl) {
int n = rand.nextInt(4);
int o = k * 2 + 1;
switch (n) {
case 0:
list2[0] = blockPos + BlockPos{o, 7, 0};
list2[1] = blockPos + BlockPos{o, 5, 0};
list2[2] = blockPos + BlockPos{o, 1, 0};
break;
case 1:
list2[0] = blockPos + BlockPos{0, 7, o};
list2[1] = blockPos + BlockPos{0, 5, o};
list2[2] = blockPos + BlockPos{0, 1, o};
break;
case 2:
list2[0] = blockPos + BlockPos{o, 7, o};
list2[1] = blockPos + BlockPos{o, 5, o};
list2[2] = blockPos + BlockPos{o, 1, o};
break;
default:
list2[0] = blockPos + BlockPos{0, 7, 0};
list2[1] = blockPos + BlockPos{0, 5, 0};
list2[2] = blockPos + BlockPos{0, 1, 0};
break;
}
}
for (int dz = -16; dz <= 16; dz++) {
for (int dy = -16; dy <= 16; dy++) {
for (int dx = -16; dx <= 16; dx++) {
BlockPos blockPos3 = blockPos + BlockPos{dx, dy, dz};
double s = 0;
double t = 0;
for (int i = 0; i < listSize; i++) {
s += fastInverseSqrt(blockPos3.getSquaredDistance(list[i].pos) + (double)list[i].val);
}
double esx = s + 0.05 * (double)k;
double esn = s - 0.05 * (double)k;
if (esn >= e || (!((esx >= h)&&(esx >= f)))) {
continue;
}
if (bl) {
for (int i = 0; i < 3; i++) {
t += fastInverseSqrt(blockPos3.getSquaredDistance(list2[i]) + 2);
}
if (t - 0.05 * 3 >= l) {
continue;
}
}
double r = staticData.doublePerlinNoiseSampler.sample((double)blockPos3.x, (double)blockPos3.y, (double)blockPos3.z) * 0.05;
s += r * (double) k;
if ((!((s >= h)&&(s >= f)))||(s >= e))
continue;
if (bl) {
t += r * 3;
if (t >= l)
continue;
}
bool bl2 = rand.nextFloat() < 0.083;
if (bl2) {
placeCrystalFunc(blockPos3);
rand.next();
}
}
}
}
return true;
}
__constant__ GeodeGeneratorStaticData STATIC_DATA;
__global__ void filterCanGenerateGeode(int32_t minChunkX, int32_t minChunkZ, uint32_t squareSize, uint32_t* count, ChunkPos* outPositions) {
uint64_t idx = (uint64_t)blockIdx.x * (uint64_t)blockDim.x + (uint64_t)threadIdx.x;
if (idx > squareSize * squareSize) {
return;
}
int32_t cx = minChunkX + (int32_t) (idx / squareSize);
int32_t cz = minChunkZ + (int32_t) (idx % squareSize);
if (canGenerateGeode(STATIC_DATA, cx, cz)) {
uint32_t index = atomicAdd(count, 1);
outPositions[index] = ChunkPos{cx, cz};
}
}
__global__ void countCrystals(uint32_t count, ChunkPos* positions, size_t* progress, int32_t minChunkX, int32_t minChunkZ, uint32_t squareSize, ByteArray crystalCounts) {
uint32_t idx = (uint32_t) blockIdx.x * (uint32_t) blockDim.x + (uint32_t) threadIdx.x;
if (idx > count) {
return;
}
*progress = max(*progress, (size_t) idx);
ChunkPos position = positions[idx];
generateGeode(STATIC_DATA, position.x, position.z, [minChunkX, minChunkZ, squareSize, &crystalCounts](const BlockPos& pos) {
int32_t cx = pos.x >> 4;
int32_t cz = pos.z >> 4;
if (cx >= minChunkX && cz >= minChunkZ && cx < minChunkX + (int32_t)squareSize && cz < minChunkZ + (int32_t)squareSize) {
uint32_t index = (uint32_t) (cx - minChunkX) + squareSize * (uint32_t) (cz - minChunkZ);
crystalCounts.atomicInc(index);
}
});
}
__host__ __device__ int getCountInRange(ByteArray crystalCounts, int32_t cx, int32_t cz, uint32_t squareSize) {
constexpr double playerPositions[8][2] = {
{ 1.0625, 4.125 },
{ 4.125, 1.0625 },
{ 14.9375, 4.125 },
{ 4.125, 14.9375 },
{ 1.0625, 11.875 },
{ 11.875, 1.0625 },
{ 14.9375, 11.875 },
{ 11.875, 14.9375 },
};
int maxCount = 0;
for (int i = 0; i < 8; i++) {
double playerX = playerPositions[i][0];
double playerZ = playerPositions[i][1];
int count = 0;
for (int ox = -8; ox <= 8; ox++) {
for (int oz = -8; oz <= 8; oz++) {
int chunkX = ox * 16 + 8;
int chunkZ = ox * 16 + 8;
double dx = chunkX - playerX;
double dz = chunkZ - playerZ;
if (dx * dx + dz * dz < 128 * 128) {
int32_t actualChunkX = cx + ox;
int32_t actualChunkZ = cz + oz;
if (actualChunkX >= 0 && actualChunkZ >= 0 && (uint32_t)actualChunkX < squareSize && (uint32_t)actualChunkZ < squareSize) {
uint32_t index = (uint32_t) actualChunkX + squareSize * (uint32_t) actualChunkZ;
count += crystalCounts[index];
}
}
}
}
maxCount = max(count, maxCount);
}
return maxCount;
}
constexpr int ACCEPTABLE_BELOW_MAX = 100;
__global__ void processRenderDistanceRanges(size_t* progress, ByteArray crystalCounts, uint32_t squareSize, int* maxCount, uint32_t* outputCount, ChunkPos* output) {
uint64_t idx = (uint64_t)blockIdx.x * (uint64_t)blockDim.x + (uint64_t)threadIdx.x;
if (idx > squareSize * squareSize) {
return;
}
*progress = max(*progress, (size_t) idx);
auto cx = (int32_t) (idx / squareSize);
auto cz = (int32_t) (idx % squareSize);
int count = getCountInRange(crystalCounts, cx, cz, squareSize);
int oldCount = atomicMax(maxCount, count);
int currentMax = max(count, oldCount);
if (currentMax - count <= ACCEPTABLE_BELOW_MAX) {
uint32_t index = atomicAdd(outputCount, 1);
output[index] = ChunkPos{cx, cz};
return;
}
}
template <class F, class L>
void executeKernelUnderProgress(const char* displayName, size_t workSize, F kernelFunction, L kernelLauncher) {
int blockSize;
int minGridSize;
handleError(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, kernelFunction, 0, workSize));
int gridSize = (int) ((workSize + (size_t) blockSize - 1) / (size_t) blockSize);
std::cout << "Executing " << displayName << " with gridSize=" << gridSize << " blockSize=" << blockSize << std::endl;
size_t* deviceProgress;
handleError(cudaMalloc(&deviceProgress, sizeof(*deviceProgress)));
size_t* hostProgress;
handleError(cudaHostAlloc(&hostProgress, sizeof(*hostProgress), cudaHostAllocDefault));
cudaEvent_t finishEvent;
handleError(cudaEventCreate(&finishEvent));
cudaStream_t executionStream;
handleError(cudaStreamCreate(&executionStream));
cudaStream_t progressStream;
handleError(cudaStreamCreate(&progressStream));
kernelLauncher(gridSize, blockSize, executionStream, deviceProgress);
handleError(cudaEventRecord(finishEvent, executionStream));
cudaError_t queryError;
while ((queryError = cudaEventQuery(finishEvent)) == cudaErrorNotReady) {
handleError(cudaMemcpyAsync(hostProgress, deviceProgress, sizeof(*hostProgress), cudaMemcpyDeviceToHost, progressStream));
handleError(cudaStreamSynchronize(progressStream));
std::cout << "... Progress " << displayName << ": " << *hostProgress << " / " << workSize << " (" << ((double) *hostProgress / (double) workSize * 100) << "%)" << std::endl;
std::this_thread::sleep_for(std::chrono::seconds(1));
}
handleError(queryError);
handleError(cudaEventDestroy(finishEvent));
handleError(cudaStreamDestroy(progressStream));
handleError(cudaStreamDestroy(executionStream));
handleError(cudaFreeHost(hostProgress));
handleError(cudaFree(deviceProgress));
}
template <class T>
bool string2int(const char* str, T& result) {
std::stringstream ss(str);
return !(ss >> result).fail() && ss.eof();
}
int main(int argc, char** argv) {
if (argc < 5) {
std::cerr << argv[0] << " <seed> <center-chunk-x> <center-chunk-z> <chunk-radius>" << std::endl;
return 0;
}
int64_t seed;
int32_t centerChunkX, centerChunkZ, chunkRadius;
if (!string2int(argv[1], seed) || !string2int(argv[2], centerChunkX) || !string2int(argv[3], centerChunkZ) || !string2int(argv[4], chunkRadius)) {
std::cerr << "Invalid integer inputs" << std::endl;
return 0;
}
if (chunkRadius <= 0) {
std::cerr << "Chunk radius must be positive" << std::endl;
return 0;
}
int32_t minChunkX = centerChunkX - chunkRadius;
int32_t minChunkZ = centerChunkZ - chunkRadius;
uint32_t squareSize = chunkRadius * 2;
size_t freeMemory;
size_t totalMemory;
handleError(cudaMemGetInfo(&freeMemory, &totalMemory));
auto maxSquareSize = (size_t) (sqrt((double) freeMemory) * 0.5);
if (maxSquareSize == 0) {
std::cerr << "No free GPU memory" << std::endl;
return 0;
}
#ifdef _MSC_VER
int leadingZeros = sizeof(size_t) == 8 ? _lzcnt_u64(maxSquareSize) : _lzcnt_u32(maxSquareSize);
#else
int leadingZeros = __builtin_clz(maxSquareSize);
#endif
maxSquareSize = 1 << ((sizeof(maxSquareSize) * 8 - 1) - leadingZeros);
if (squareSize > maxSquareSize) {
std::cerr << "Not enough GPU memory for that radius. Radius must be at most " << (maxSquareSize / 2) << " chunks for this available memory" << std::endl;
return 0;
}
size_t workSize = squareSize * squareSize;
std::cout << "Searching area " << minChunkX << ", " << minChunkZ << " to " << (minChunkX + squareSize - 1) << ", " << (minChunkZ + squareSize - 1) << std::endl;
GeodeGeneratorStaticData staticData(seed);
handleError(cudaMemcpyToSymbol(STATIC_DATA, &staticData, sizeof(staticData)));
ChunkPos* filteredPositions;
cudaMalloc(&filteredPositions, workSize); // allocate 8 times less than the capacity, okay because we'll never actually reach this
uint32_t* countPtr;
cudaMallocManaged(&countPtr, sizeof(*countPtr));
*countPtr = 0;
executeKernelUnderProgress("finding geodes", workSize, filterCanGenerateGeode, [=](int gridSize, int blockSize, cudaStream_t executionStream, size_t* deviceProgress){
filterCanGenerateGeode<<<gridSize, blockSize, 0, executionStream>>>(minChunkX, minChunkZ, squareSize, countPtr, filteredPositions);
});
uint32_t count = *countPtr;
handleError(cudaFree(countPtr));
std::cout << "Found " << count << " geodes total in grid" << std::endl;
// shrink filtered positions allocation
// array is too large to keep on device, need to go via host
auto temp = new ChunkPos[count];
handleError(cudaMemcpy(temp, filteredPositions, count * sizeof(*filteredPositions), cudaMemcpyDeviceToHost));
handleError(cudaFree(filteredPositions));
handleError(cudaMalloc(&filteredPositions, count * sizeof(*filteredPositions)));
handleError(cudaMemcpy(filteredPositions, temp, count * sizeof(*filteredPositions), cudaMemcpyHostToDevice));
delete[] temp;
uint32_t* crystalCountsRaw;
size_t sizeOfCrystalCounts = (workSize + 3) / 4 * 4;
handleError(cudaMalloc(&crystalCountsRaw, sizeOfCrystalCounts));
handleError(cudaMemset(crystalCountsRaw, 0, sizeOfCrystalCounts));
ByteArray crystalCounts(crystalCountsRaw);
auto crystalCountsRawHost = new uint32_t[sizeOfCrystalCounts];
ByteArray crystalCountsHost(crystalCountsRawHost);
executeKernelUnderProgress("crystal counting", count, countCrystals, [=](int gridSize, int blockSize, cudaStream_t executionStream, size_t* deviceProgress){
countCrystals<<<gridSize, blockSize, 0, executionStream>>>(count, filteredPositions, deviceProgress, minChunkX, minChunkZ, squareSize, crystalCounts);
});
handleError(cudaFree(filteredPositions));
int* maxCount;
handleError(cudaMallocManaged(&maxCount, sizeof(*maxCount)));
uint32_t* outputCount;
handleError(cudaMallocManaged(&outputCount, sizeof(*outputCount)));
ChunkPos* output;
handleError(cudaMallocManaged(&output, 10000 * sizeof(*output)));
executeKernelUnderProgress("applying ticking ranges", workSize, processRenderDistanceRanges, [=](int gridSize, int blockSize, cudaStream_t executionStream, size_t* deviceProgress){
processRenderDistanceRanges<<<gridSize, blockSize, 0, executionStream>>>(deviceProgress, crystalCounts, squareSize, maxCount, outputCount, output);
});
handleError(cudaMemcpy(crystalCountsRawHost, crystalCountsRaw, sizeOfCrystalCounts, cudaMemcpyDeviceToHost));
std::vector<std::pair<ChunkPos, int>> filteredOutput;
for (int i = 0; i < *outputCount; i++) {
int inRange = getCountInRange(crystalCountsHost, output[i].x, output[i].z, squareSize);
if (*maxCount - inRange <= ACCEPTABLE_BELOW_MAX) {
filteredOutput.emplace_back(output[i], inRange);
}
}
std::cout << "Max found: " << *maxCount << ", number of areas within " << ACCEPTABLE_BELOW_MAX << " of max: " << filteredOutput.size() << std::endl;
std::sort(filteredOutput.begin(), filteredOutput.end(), [](const std::pair<ChunkPos, int>& a, const std::pair<ChunkPos, int>& b) { return a.second >= b.second; });
for (const std::pair<ChunkPos, int>& pair : filteredOutput) {
std::cout << "(" << (pair.first.x + minChunkX) << ", " << (pair.first.z + minChunkZ) << "): " << pair.second << std::endl;
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment