rbnelr/df_init.glsl

## df_init.glsl
#version 460 core
#extension GL_NV_gpu_shader5: enable // for the 8-bit integer type int8_t
#extension GL_NV_shader_thread_shuffle: enable // for shuffleUpNV / shuffleDownNV

// one thread per cell of a REGION^3 block (must match REGION below)
layout(local_size_x = 8, local_size_y = 8, local_size_z = 8) in;

// distance field image that this shader initializes
layout(r8i, binding = 4) restrict uniform iimage3D df_img;

#include "gpu_voxels.glsl"

// per-chunk offset that converts a position inside the chunk into a world (texture) coordinate
uniform ivec3 offsets[32];

// Initializes the DF such that a cell gets the value:
//   0 if a neighbouring voxel is solid (3x3x3 region)
//  -1 if the voxel itself is solid (so that the tracing can avoid checking voxel block ids)
// 127 (max DF dist) otherwise

// This code is using a few compute/gpu shader tricks like shared mem, barriers and lane intrinsics
// but in essence it's just computing this logic on the 3x3x3 region centered on the voxel
// but without requiring 27 texture reads per result voxel

const int REGION = 8;         // cells per axis handled by one work group (including the border)
const int CORE = REGION -2;   // cells per axis that a work group actually writes (REGION minus a 1-cell border on each side)
const int CHUNK_WGROUPS = (CHUNK_SIZE + CORE-1) / CORE; // round up

// per-work-group scratch used to exchange the solid flags between threads
shared int8_t buf[REGION][REGION][REGION];

void main () {
	ivec3 wgroupid = ivec3(gl_WorkGroupID);

	// the work groups of all chunks of a dispatch are stacked along z:
	// split z into the chunk index and the work group z inside of that chunk
	int chunkid = wgroupid.z / CHUNK_WGROUPS;
	wgroupid.z  = wgroupid.z % CHUNK_WGROUPS;

	// work groups advance by CORE cells but cover REGION cells,
	// the -1 shifts the region so that it has a 1-cell border on every side of its core
	ivec3 pos = wgroupid * CORE + ivec3(gl_LocalInvocationID)-1; // pos in chunk

	// Only the upper bound is tested: border threads at pos == -1 still count as in_chunk and read the voxel
	// just outside of the low chunk faces, while cells past the high chunk faces are treated as air
	// NOTE(review): this is asymmetric between the low and high faces, confirm that it is intended
	bool in_chunk = all(lessThan(pos, ivec3(CHUNK_SIZE)));

	pos += offsets[chunkid]; // to world coord

	int x = int(gl_LocalInvocationID.x);
	int y = int(gl_LocalInvocationID.y);
	int z = int(gl_LocalInvocationID.z);

	uint bid = in_chunk ? texelFetch(voxel_tex, pos, 0).r : 0;
	int8_t val = bid > B_AIR ? int8_t(1u) : int8_t(0u); // solid voxels get a 1, air gets a 0

	// Propagate 1s to X-neighbours
#if 0
	// reference version of the X pass via shared memory (disabled)
	buf[z][y][x] = val;
	barrier(); // make write visible to X pass reads

	// X pass
	if (x >         0) val |= buf[z][y][x-1];
	if (x < REGION -1) val |= buf[z][y][x+1];

	barrier(); // finish reads so we can write X pass result
#else
	// X pass (with lane intrinsics)
	// NOTE: if (x > 0) range checks are not needed since out of bounds reads return this threads value, which can safely be ORed in
	int8_t val0 = int8_t(shuffleDownNV(val, 1u, REGION));
	int8_t val1 = int8_t(shuffleUpNV  (val, 1u, REGION));
	val |= val0;
	val |= val1;
#endif
	buf[z][y][x] = val;
	barrier(); // make write visible to Y pass reads

	{ // Y pass
		if (y >         0) val |= buf[z][y-1][x];
		if (y < REGION -1) val |= buf[z][y+1][x];
	}
	barrier(); // finish reads so we can write Y pass result
	buf[z][y][x] = val;

	// make write visible to Z pass reads
	// This barrier has to be executed by every thread of the work group: barrier() is only defined
	// in uniform control flow, so it must not be placed inside of the thread-dependent branch below
	barrier();

	// only the core threads produce a result, the border threads only supplied neighbour data
	if ( x > 0 && x < REGION-1 &&
		 y > 0 && y < REGION-1 &&
		 z > 0 && z < REGION-1 && in_chunk ) {

		// Z pass
		val |= buf[z-1][y][x];
		val |= buf[z+1][y][x];

		// make DF -1 for solid block and the 1-voxel border 0
		// to let us directly DDA the DF data without touching the voxel data until the final hit computation
		int df = 127;
		if (val != 0)
			df = bid > B_AIR ? -1 : 0;

		imageStore(df_img, pos, ivec4(df, 0,0,0));
	}
}

## dt_gen_pass.glsl
#version 460 core
// GROUPSZ and PASS are not defined in this file and are presumably injected by the host when compiling
// (one shader variant per PASS value 0, 1, 2) -- TODO confirm
layout(local_size_x = GROUPSZ, local_size_y = GROUPSZ) in;

// distance field image, swept in place
layout(r8i, binding = 4) restrict uniform iimage3D df_img;

#include "gpu_voxels.glsl" // CHUNK_SIZE = 64
#define SIZE CHUNK_SIZE

// per-chunk offset that converts a position inside the chunk into a world (texture) coordinate
uniform ivec3 offsets[32];

// One pass of the distance field generation:
// every thread owns one line of SIZE cells along the axis selected by PASS (0: x, 1: y, 2: z)
// and sweeps it forwards and then backwards, lowering each cell to (previous cell + 1) where that is smaller.
// gl_WorkGroupID.z selects the chunk via offsets[].
void main () {
	// The two thread coordinates address the line inside of a SIZE x SIZE face of the chunk.
	// Threads beyond that face exist whenever the dispatch rounds the work group count up
	// (GROUPSZ not dividing SIZE), they must not touch cells that belong to other chunks
	if (any(greaterThanEqual(gl_GlobalInvocationID.xy, uvec2(SIZE))))
		return;

#if PASS == 0
	ivec3 pos = ivec3(0, gl_GlobalInvocationID.xy);
	#define GETPOS(I) ivec3(pos.x + (I), pos.y, pos.z)
#elif PASS == 1
	ivec3 pos = ivec3(gl_GlobalInvocationID.x, 0, gl_GlobalInvocationID.y);
	#define GETPOS(I) ivec3(pos.x, pos.y + (I), pos.z)
#else
	ivec3 pos = ivec3(gl_GlobalInvocationID.xy, 0);
	#define GETPOS(I) ivec3(pos.x, pos.y, pos.z + (I))
#endif

	pos += offsets[gl_WorkGroupID.z]; // to world coord

	// forward sweep
	// prev starts at the max DF value so that the first cell is never lowered
	int prev = 127;
	for (int i=0; i<SIZE; ++i) {
		ivec3 p = GETPOS(i);
		int cur = imageLoad(df_img, p).r;

		prev += 1; // one cell further away than the previous cell
		if (prev < cur) imageStore(df_img, p, ivec4(prev, 0,0,0)); // propagated distance is smaller -> lower the cell
		else            prev = cur;                                // cell is already closer -> continue from its value
	}

	// backward sweep, same logic in the opposite direction
	prev = 127;
	for (int i=SIZE-1; i>=0; --i) {
		ivec3 p = GETPOS(i);
		int cur = imageLoad(df_img, p).r;

		prev += 1;
		if (prev < cur) imageStore(df_img, p, ivec4(prev, 0,0,0));
		else            prev = cur;
	}
}

## gl_tracer.cpp
void Raytracer::upload_changes (OpenglRenderer& r, Game& game) {
		// Uploads the voxels of all chunks listed in game.chunks.upload_voxels into voxel_tex
		// and then regenerates the distance field (df_tex) for exactly those chunks:
		// one init dispatch followed by three sweep passes (shad_pass[0..2]).
		ZoneScoped;

		// position (in chunks, relative to the GPU world texture) of every chunk uploaded in this call
		std::vector<int3> chunks;

		if (!game.chunks.upload_voxels.empty()) {
			OGL_TRACE("raytracer upload changes");

			// temp buffer to 'decompress' my sparse subchunks and enable uploading them in a single glTextureSubImage3D per chunk
			// allocated on the heap: CHUNK_SIZE^3 block ids are too large to safely put on the stack
			std::vector<block_id> buffer(CHUNK_SIZE * CHUNK_SIZE * CHUNK_SIZE);

			// pointer to the voxel (x,y,z) inside of buffer, layout is [z][y][x] with x being contiguous
			auto voxel_ptr = [&buffer] (int x, int y, int z) -> block_id* {
				return buffer.data() + ((z * CHUNK_SIZE + y) * CHUNK_SIZE + x);
			};

			for (auto cid : game.chunks.upload_voxels) {
				auto& chunk = game.chunks.chunks[cid];
				auto& vox = game.chunks.chunk_voxels[cid];

				// shift the chunk position so that the GPU world texture is centered on chunk (0,0,0)
				int3 pos = chunk.pos + GPU_WORLD_SIZE_CHUNKS/2;
				if ( (unsigned)(pos.x) < GPU_WORLD_SIZE_CHUNKS && // use 3 unsigned comparisons instead of 6 signed ones
					 (unsigned)(pos.y) < GPU_WORLD_SIZE_CHUNKS &&
					 (unsigned)(pos.z) < GPU_WORLD_SIZE_CHUNKS ) {
					//OGL_TRACE("upload chunk data");

					{
						ZoneScopedN("decompress");

						for (int sz=0; sz<SUBCHUNK_COUNT; ++sz)
						for (int sy=0; sy<SUBCHUNK_COUNT; ++sy)
						for (int sx=0; sx<SUBCHUNK_COUNT; ++sx) {

							// NOTE(review): sx,sy,sz iterate over SUBCHUNK_COUNT subchunks per axis but the index is
							// computed with SUBCHUNK_SIZE, this is only correct if both are equal or if IDX3D
							// expects this argument -- confirm against the definition of IDX3D
							auto subc = vox.subchunks[IDX3D(sx,sy,sz, SUBCHUNK_SIZE)];
							if (subc & SUBC_SPARSE_BIT) {
								// sparse subchunk: the remaining bits are the single block id that fills the whole subchunk
								block_id val = (block_id)(subc & ~SUBC_SPARSE_BIT);

								for (int z=0; z<SUBCHUNK_SIZE; ++z)
								for (int y=0; y<SUBCHUNK_SIZE; ++y) {
									block_id* dst = voxel_ptr(sx*SUBCHUNK_SIZE, sy*SUBCHUNK_SIZE + y, sz*SUBCHUNK_SIZE + z);
									for (int x=0; x<SUBCHUNK_SIZE; ++x)
										dst[x] = val;
								}

							} else {
								// dense subchunk: subc indexes the voxel storage, copy it one x-row at a time
								auto* data = game.chunks.subchunks[subc].voxels;

								for (int z=0; z<SUBCHUNK_SIZE; ++z)
								for (int y=0; y<SUBCHUNK_SIZE; ++y) {
									block_id* dst = voxel_ptr(sx*SUBCHUNK_SIZE, sy*SUBCHUNK_SIZE + y, sz*SUBCHUNK_SIZE + z);
									auto* src = &data[IDX3D(0,y,z, SUBCHUNK_SIZE)];
									memcpy(dst, src, sizeof(block_id)*SUBCHUNK_SIZE);
								}
							}
						}
					}

					{
						ZoneScopedN("glTextureSubImage3D");

						glTextureSubImage3D(voxel_tex.tex, 0,
							pos.x*CHUNK_SIZE, pos.y*CHUNK_SIZE, pos.z*CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE,
							GL_RED_INTEGER, GL_UNSIGNED_SHORT, buffer.data());
					}

					chunks.push_back(pos);
				}
			}
			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);

		}
		{
			//chunks.clear(); // for profiling df gen
			//for (int y=0; y<4; ++y)
			//for (int x=0; x<4; ++x) {
			//	chunks.push_back({x,y,7});
			//}

			if (!chunks.empty()) {
				ZoneScopedN("rt_df_gen");
				OGL_TRACE("rt_df_gen");

				int count = (int)chunks.size();

				// the DF shaders access the distance field as image unit 4
				glBindImageTexture(4, df_tex.tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I);

				{ // init pass: marks solid voxels and their direct neighbours, everything else gets the max distance
					OGL_TIMER_ZONE(timer_df_init.timer);

					glUseProgram(df_tex.shad_init->prog);

					r.state.bind_textures(df_tex.shad_init, {
						{"voxel_tex", voxel_tex.tex},
					});

					// chunks are processed in batches since the shader only has room for 32 offsets (uniform ivec3 offsets[32])
					static constexpr int BATCHSIZE = 32;
					for (int i=0; i<count; i+=BATCHSIZE) {
						int subcount = min(count - i, BATCHSIZE);

						// unused entries of the last batch stay zero
						int3 offsets[BATCHSIZE] = {};
						for (int j=0; j<subcount; ++j)
							offsets[j] = chunks[i+j] * CHUNK_SIZE; // chunk position -> voxel offset

						df_tex.shad_init->set_uniform_array("offsets[0]", offsets, BATCHSIZE);

						// these have to mirror the constants of the same name in df_init.glsl
						constexpr int REGION = 8;
						constexpr int CORE = REGION -2;
						constexpr int CHUNK_WGROUPS = (CHUNK_SIZE + CORE-1) / CORE; // round up

						// the work groups of all chunks of the batch are stacked along z
						glDispatchCompute(CHUNK_WGROUPS, CHUNK_WGROUPS, CHUNK_WGROUPS * subcount);
					}

					// the sweep passes read what the init pass wrote
					glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);
				}
				// sweep passes, one shader per pass, each pass reads the result of the previous one
				for (int pass=0; pass<3; ++pass) {
					Shader* shad = df_tex.shad_pass[pass];

					glUseProgram(shad->prog);
					r.state.bind_textures(shad, {});

					static constexpr int BATCHSIZE = 32;
					for (int i=0; i<count; i+=BATCHSIZE) {
						int subcount = min(count - i, BATCHSIZE);

						int3 offsets[BATCHSIZE] = {};
						for (int j=0; j<subcount; ++j)
							offsets[j] = chunks[i+j] * CHUNK_SIZE;

						shad->set_uniform_array("offsets[0]", offsets, BATCHSIZE);

						// one thread per line of the chunk, one z layer of work groups per chunk
						int dispatch_size = (CHUNK_SIZE + DFTexture::COMPUTE_GROUPSZ -1) / DFTexture::COMPUTE_GROUPSZ;
						glDispatchCompute(dispatch_size, dispatch_size, subcount);
					}

					glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);
				}
			}

			glBindImageTexture(4, 0, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8); // unbind
		}
	}

	void Raytracer::draw (OpenglRenderer& r, Game& game) {
		// Runs the raytracing compute shader (rt_forward) once over the whole framebuffer,
		// the result is written into r.framebuffer.color through image unit 0.
		ZoneScoped;

		// without a program object for the tracing shader there is nothing that could be dispatched
		if (!rt_forward->prog)
			return;

		{
			ZoneScopedN("rt_gbufgen");
			OGL_TRACE("rt_gbufgen");
			OGL_TIMER_ZONE(timer_rt.timer);

			// enough work groups to cover the framebuffer, rounded up on both axes
			int2 group_count;
			group_count.x = (r.framebuffer.size.x + rt_groupsz.size.x -1) / rt_groupsz.size.x;
			group_count.y = (r.framebuffer.size.y + rt_groupsz.size.y -1) / rt_groupsz.size.y;

			glUseProgram(rt_forward->prog);

			rt_forward->set_uniform("max_iterations", max_iterations);
			rt_forward->set_uniform("update_debugdraw", r.debug_draw.update_indirect);
			rt_forward->set_uniform("framebuf_size", r.framebuffer.size);

			// output image of the shader
			glBindImageTexture(0, r.framebuffer.color, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA16F);

			r.state.bind_textures(rt_forward, {
				{"voxel_tex", voxel_tex.tex},
				{"df_tex", df_tex.tex},

				//{"gbuf_pos" , gbuf.pos },
				//{"gbuf_col" , gbuf.col },
				//{"gbuf_norm", gbuf.norm},

				{"tile_textures", r.tile_textures, r.tile_sampler},

				{"heat_gradient", r.gradient, r.normal_sampler},
			});

			glDispatchCompute(group_count.x, group_count.y, 1);
		}
		// make the image writes of the dispatch visible to later image and texture accesses
		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);

		//{
		//	glBindFramebuffer(GL_FRAMEBUFFER, gbuf.fbo);
		//	glClear(GL_DEPTH_BUFFER_BIT);
		//
		//	test_renderer.draw(r);
		//}

		// release image unit 0 again
		glBindImageTexture(0, 0, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA32F);

	}

## raytrace.glsl
// Traces a ray through the voxel world texture.
// Empty space is skipped by raymarching the distance field (df_tex), close to solid voxels
// the ray steps voxel by voxel (DDA) until it lands in a cell with a negative DF value.
//   ray_pos, ray_dir: ray origin and direction in voxel coordinates of the world texture
//                     (coordinates are used directly for texelFetch)
//   max_dist:         NOTE(review): the passed value is overwritten below with the distance at which the ray
//                     exits the world cube, so a limit supplied by the caller currently has no effect -- confirm that this is intended
//   hit:              only written if true is returned
// Returns true on a hit, false if the ray misses the world cube, leaves it, or max_iterations is reached.
bool trace_ray (vec3 ray_pos, vec3 ray_dir, float max_dist, out Hit hit) {
	// per axis: does the ray travel in positive direction (0 counts as positive)
	bvec3 dir_pos = greaterThanEqual(ray_dir, vec3(0.0));

	// offset from a cell coord to the cell face through which the ray exits, per axis
	ivec3 vox_exit = mix(ivec3(0), ivec3(1), dir_pos);
	// cell step direction per axis
	ivec3 step_dir = mix(ivec3(-1), ivec3(+1), dir_pos);

	// precompute part of plane projection equation
	// prefer  'pos * inv_dir + bias'  over  'inv_dir * (pos - ray_pos)'
	// due to madd instruction
	vec3 inv_dir = 1.0 / ray_dir;
	vec3 bias = inv_dir * -ray_pos;

	// current distance along the ray
	float dist;
	{ // allow ray to start outside world texture cube for nice debugging views
		float epsilon = 0.0001; // stop the raymarching from sometimes sampling outside the world textures
		vec3 world_min = vec3(        0.0) + epsilon;
		vec3 world_max = vec3(WORLD_SIZEf) - epsilon;

		// calculate entry and exit coords into whole world cube
		vec3 t0v = mix(world_max, world_min, dir_pos) * inv_dir + bias;
		vec3 t1v = mix(world_min, world_max, dir_pos) * inv_dir + bias;
		float t0 = max(max(t0v.x, t0v.y), t0v.z);
		float t1 = min(min(t1v.x, t1v.y), t1v.z);

		// clamp to the ray origin, a ray that starts inside of the cube gets t0 = 0
		t0 = max(t0, 0.0);
		t1 = max(t1, 0.0);

		// ray misses world texture
		if (t1 <= t0)
			return false;

		// adjust ray to start where it hits cube initially
		dist = t0;
		max_dist = t1; // replaces the value passed by the caller
	}

	// converts a DF value into a distance along the ray
	// (dividing by the sum of the absolute direction components, the factor 0.999 keeps the step slightly shorter)
	float manhattan_fac = 0.999 / (abs(ray_dir.x) + abs(ray_dir.y) + abs(ray_dir.z));

	vec3 pos = dist * ray_dir + ray_pos;
	ivec3 coord = ivec3(pos); // cell that contains pos

	int iter = 0;

	// leaves via break on a hit (then coord is the hit voxel), via return false on a miss
	for (;;) {
		float df = float(texelFetch(df_tex, coord, 0).r);
		df *= manhattan_fac;

		if (df > 1.0) {
			// DF tells us that we can still step by <df> before we could possibly hit a voxel
			// step via DF raymarching

			// step up to exit of current cell, since DF is safe up until its bounds
			// seems to give a little bit of perf, as this reduces iteration count
			// of course iteration now has more instructions, so could hurt as well
			vec3 t1v = inv_dir * vec3(coord + vox_exit) + bias;
			dist = min(min(t1v.x, t1v.y), t1v.z);

			// compute chunk exit, since DF is not valid for things outside of the chunk it is generated for
			// (chunk size of 64 is hard-coded here: & ~63 rounds down to the chunk origin)
			vec3 chunk_exit = vec3((coord & ~63) + vox_exit*64);

			vec3 chunk_t1v = inv_dir * chunk_exit + bias;
			float chunk_t1 = min(min(chunk_t1v.x, chunk_t1v.y), chunk_t1v.z);

			// limit step to exactly on the exit face of the chunk
			dist = min(dist + df, chunk_t1);

			// update pos for next iteration
			pos = dist * ray_dir + ray_pos;

			// fix precision issues with coord calculation when limiting step to on chunk face
			// note: prefer this to adding epsilon to chunk_t1, since that can miss voxels through the diagonals
			// (the exact == only matches if dist was taken from chunk_t1, then pos is pushed half a cell into the next chunk on that axis)
			if      (chunk_t1v.x == dist) pos.x += float(step_dir.x) * 0.5;
			else if (chunk_t1v.y == dist) pos.y += float(step_dir.y) * 0.5;
			else if (chunk_t1v.z == dist) pos.z += float(step_dir.z) * 0.5;

			// update coord for next iteration
			coord = ivec3(pos);

		} else {
			// we need to check individual voxels by DDA now

			// -1 marks solid voxels (they have 1-voxel border of 0s around them)
			// this avoids one memory read
			// and should eliminate all empty block id reads and thus help improve caching for the DF values by a bit
			if (df < 0.0)
				break;

			// distance at which the ray leaves the current cell
			vec3 t1v = inv_dir * vec3(coord + vox_exit) + bias;
			dist = min(min(t1v.x, t1v.y), t1v.z);

			// step on axis where exit distance is lowest
			if      (t1v.x == dist) coord.x += step_dir.x;
			else if (t1v.y == dist) coord.y += step_dir.y;
			else                    coord.z += step_dir.z;
		}

		iter++;
		if (iter >= max_iterations || dist >= max_dist)
			return false; // miss
	}

	{ // calc hit info

		// snap ray to voxel entry in case we landed inside a voxel when raymarching
		vec3 vox_entry = vec3(coord) + mix(vec3(1.0), vec3(0.0), dir_pos);

		vec3 t0v = inv_dir * vox_entry + bias;
		dist = max(max(t0v.x, t0v.y), max(t0v.z, 0.0)); // max(, 0.0) to not count faces behind ray

		hit.bid = texelFetch(voxel_tex, coord, 0).r;
		hit.dist = dist;
		hit.pos = dist * ray_dir + ray_pos;

		vec2 uv;
		int face; // 0/1: -x/+x  2/3: -y/+y  4/5: -z/+z
		{ // calc hit face, uv and normal
			vec3 hit_center = vec3(coord) + 0.5;

			// the axis on which the hit pos is furthest away from the voxel center is the axis of the hit face
			vec3 offs = (hit.pos - hit_center);
			vec3 abs_offs = abs(offs);

			hit.normal = vec3(0.0);

			if (abs_offs.x >= abs_offs.y && abs_offs.x >= abs_offs.z) {
				hit.normal.x = sign(offs.x);
				face = offs.x < 0.0 ? 0 : 1;
				uv = hit.pos.yz;
				if (offs.x < 0.0) uv.x = 1.0 - uv.x; // mirror u on this side of the voxel
			} else if (abs_offs.y >= abs_offs.z) {
				hit.normal.y = sign(offs.y);
				face = offs.y < 0.0 ? 2 : 3;
				uv = hit.pos.xz;
				if (offs.y >= 0.0) uv.x = 1.0 - uv.x;
			} else {
				hit.normal.z = sign(offs.z);
				face = offs.z < 0.0 ? 4 : 5;
				uv = hit.pos.xy;
				if (offs.z < 0.0) uv.y = 1.0 - uv.y;
			}
		}

		// texture array layer for this face of this block type
		float texid = float(block_tiles[hit.bid].sides[face]);

		// mip level is chosen from the hit distance
		hit.col = textureLod(tile_textures, vec3(uv, texid), log2(dist)*0.20 - 1.0).rgba;
	}

	return true; // hit
}
	#version 460 core
	#extension GL_NV_gpu_shader5: enable // for the 8-bit integer type int8_t
	#extension GL_NV_shader_thread_shuffle: enable // for shuffleUpNV / shuffleDownNV

	// one thread per cell of a REGION^3 block (must match REGION below)
	layout(local_size_x = 8, local_size_y = 8, local_size_z = 8) in;

	// distance field image that this shader initializes
	layout(r8i, binding = 4) restrict uniform iimage3D df_img;

	#include "gpu_voxels.glsl"

	// per-chunk offset that converts a position inside the chunk into a world (texture) coordinate
	uniform ivec3 offsets[32];

	// Initializes the DF such that a cell gets the value:
	//   0 if a neighbouring voxel is solid (3x3x3 region)
	//  -1 if the voxel itself is solid (so that the tracing can avoid checking voxel block ids)
	// 127 (max DF dist) otherwise

	// This code is using a few compute/gpu shader tricks like shared mem, barriers and lane intrinsics
	// but in essence it's just computing this logic on the 3x3x3 region centered on the voxel
	// but without requiring 27 texture reads per result voxel

	const int REGION = 8;         // cells per axis handled by one work group (including the border)
	const int CORE = REGION -2;   // cells per axis that a work group actually writes (REGION minus a 1-cell border on each side)
	const int CHUNK_WGROUPS = (CHUNK_SIZE + CORE-1) / CORE; // round up

	// per-work-group scratch used to exchange the solid flags between threads
	shared int8_t buf[REGION][REGION][REGION];

	void main () {
		ivec3 wgroupid = ivec3(gl_WorkGroupID);

		// the work groups of all chunks of a dispatch are stacked along z:
		// split z into the chunk index and the work group z inside of that chunk
		int chunkid = wgroupid.z / CHUNK_WGROUPS;
		wgroupid.z  = wgroupid.z % CHUNK_WGROUPS;

		// work groups advance by CORE cells but cover REGION cells,
		// the -1 shifts the region so that it has a 1-cell border on every side of its core
		ivec3 pos = wgroupid * CORE + ivec3(gl_LocalInvocationID)-1; // pos in chunk

		// Only the upper bound is tested: border threads at pos == -1 still count as in_chunk and read the voxel
		// just outside of the low chunk faces, while cells past the high chunk faces are treated as air
		// NOTE(review): this is asymmetric between the low and high faces, confirm that it is intended
		bool in_chunk = all(lessThan(pos, ivec3(CHUNK_SIZE)));

		pos += offsets[chunkid]; // to world coord

		int x = int(gl_LocalInvocationID.x);
		int y = int(gl_LocalInvocationID.y);
		int z = int(gl_LocalInvocationID.z);

		uint bid = in_chunk ? texelFetch(voxel_tex, pos, 0).r : 0;
		int8_t val = bid > B_AIR ? int8_t(1u) : int8_t(0u); // solid voxels get a 1, air gets a 0

		// Propagate 1s to X-neighbours
	#if 0
		// reference version of the X pass via shared memory (disabled)
		buf[z][y][x] = val;
		barrier(); // make write visible to X pass reads

		// X pass
		if (x >         0) val |= buf[z][y][x-1];
		if (x < REGION -1) val |= buf[z][y][x+1];

		barrier(); // finish reads so we can write X pass result
	#else
		// X pass (with lane intrinsics)
		// NOTE: if (x > 0) range checks are not needed since out of bounds reads return this threads value, which can safely be ORed in
		int8_t val0 = int8_t(shuffleDownNV(val, 1u, REGION));
		int8_t val1 = int8_t(shuffleUpNV  (val, 1u, REGION));
		val |= val0;
		val |= val1;
	#endif
		buf[z][y][x] = val;
		barrier(); // make write visible to Y pass reads

		{ // Y pass
			if (y >         0) val |= buf[z][y-1][x];
			if (y < REGION -1) val |= buf[z][y+1][x];
		}
		barrier(); // finish reads so we can write Y pass result
		buf[z][y][x] = val;

		// make write visible to Z pass reads
		// This barrier has to be executed by every thread of the work group: barrier() is only defined
		// in uniform control flow, so it must not be placed inside of the thread-dependent branch below
		barrier();

		// only the core threads produce a result, the border threads only supplied neighbour data
		if ( x > 0 && x < REGION-1 &&
			 y > 0 && y < REGION-1 &&
			 z > 0 && z < REGION-1 && in_chunk ) {

			// Z pass
			val |= buf[z-1][y][x];
			val |= buf[z+1][y][x];

			// make DF -1 for solid block and the 1-voxel border 0
			// to let us directly DDA the DF data without touching the voxel data until the final hit computation
			int df = 127;
			if (val != 0)
				df = bid > B_AIR ? -1 : 0;

			imageStore(df_img, pos, ivec4(df, 0,0,0));
		}
	}
	#version 460 core
	// GROUPSZ and PASS are not defined in this file and are presumably injected by the host when compiling
	// (one shader variant per PASS value 0, 1, 2) -- TODO confirm
	layout(local_size_x = GROUPSZ, local_size_y = GROUPSZ) in;

	// distance field image, swept in place
	layout(r8i, binding = 4) restrict uniform iimage3D df_img;

	#include "gpu_voxels.glsl" // CHUNK_SIZE = 64
	#define SIZE CHUNK_SIZE

	// per-chunk offset that converts a position inside the chunk into a world (texture) coordinate
	uniform ivec3 offsets[32];

	// One pass of the distance field generation:
	// every thread owns one line of SIZE cells along the axis selected by PASS (0: x, 1: y, 2: z)
	// and sweeps it forwards and then backwards, lowering each cell to (previous cell + 1) where that is smaller.
	// gl_WorkGroupID.z selects the chunk via offsets[].
	void main () {
		// The two thread coordinates address the line inside of a SIZE x SIZE face of the chunk.
		// Threads beyond that face exist whenever the dispatch rounds the work group count up
		// (GROUPSZ not dividing SIZE), they must not touch cells that belong to other chunks
		if (any(greaterThanEqual(gl_GlobalInvocationID.xy, uvec2(SIZE))))
			return;

	#if PASS == 0
		ivec3 pos = ivec3(0, gl_GlobalInvocationID.xy);
		#define GETPOS(I) ivec3(pos.x + (I), pos.y, pos.z)
	#elif PASS == 1
		ivec3 pos = ivec3(gl_GlobalInvocationID.x, 0, gl_GlobalInvocationID.y);
		#define GETPOS(I) ivec3(pos.x, pos.y + (I), pos.z)
	#else
		ivec3 pos = ivec3(gl_GlobalInvocationID.xy, 0);
		#define GETPOS(I) ivec3(pos.x, pos.y, pos.z + (I))
	#endif

		pos += offsets[gl_WorkGroupID.z]; // to world coord

		// forward sweep
		// prev starts at the max DF value so that the first cell is never lowered
		int prev = 127;
		for (int i=0; i<SIZE; ++i) {
			ivec3 p = GETPOS(i);
			int cur = imageLoad(df_img, p).r;

			prev += 1; // one cell further away than the previous cell
			if (prev < cur) imageStore(df_img, p, ivec4(prev, 0,0,0)); // propagated distance is smaller -> lower the cell
			else            prev = cur;                                // cell is already closer -> continue from its value
		}

		// backward sweep, same logic in the opposite direction
		prev = 127;
		for (int i=SIZE-1; i>=0; --i) {
			ivec3 p = GETPOS(i);
			int cur = imageLoad(df_img, p).r;

			prev += 1;
			if (prev < cur) imageStore(df_img, p, ivec4(prev, 0,0,0));
			else            prev = cur;
		}
	}
	void Raytracer::upload_changes (OpenglRenderer& r, Game& game) {
		// Uploads the voxels of all chunks listed in game.chunks.upload_voxels into voxel_tex
		// and then regenerates the distance field (df_tex) for exactly those chunks:
		// one init dispatch followed by three sweep passes (shad_pass[0..2]).
		ZoneScoped;

		// position (in chunks, relative to the GPU world texture) of every chunk uploaded in this call
		std::vector<int3> chunks;

		if (!game.chunks.upload_voxels.empty()) {
			OGL_TRACE("raytracer upload changes");

			// temp buffer to 'decompress' my sparse subchunks and enable uploading them in a single glTextureSubImage3D per chunk
			// allocated on the heap: CHUNK_SIZE^3 block ids are too large to safely put on the stack
			std::vector<block_id> buffer(CHUNK_SIZE * CHUNK_SIZE * CHUNK_SIZE);

			// pointer to the voxel (x,y,z) inside of buffer, layout is [z][y][x] with x being contiguous
			auto voxel_ptr = [&buffer] (int x, int y, int z) -> block_id* {
				return buffer.data() + ((z * CHUNK_SIZE + y) * CHUNK_SIZE + x);
			};

			for (auto cid : game.chunks.upload_voxels) {
				auto& chunk = game.chunks.chunks[cid];
				auto& vox = game.chunks.chunk_voxels[cid];

				// shift the chunk position so that the GPU world texture is centered on chunk (0,0,0)
				int3 pos = chunk.pos + GPU_WORLD_SIZE_CHUNKS/2;
				if ( (unsigned)(pos.x) < GPU_WORLD_SIZE_CHUNKS && // use 3 unsigned comparisons instead of 6 signed ones
					 (unsigned)(pos.y) < GPU_WORLD_SIZE_CHUNKS &&
					 (unsigned)(pos.z) < GPU_WORLD_SIZE_CHUNKS ) {
					//OGL_TRACE("upload chunk data");

					{
						ZoneScopedN("decompress");

						for (int sz=0; sz<SUBCHUNK_COUNT; ++sz)
						for (int sy=0; sy<SUBCHUNK_COUNT; ++sy)
						for (int sx=0; sx<SUBCHUNK_COUNT; ++sx) {

							// NOTE(review): sx,sy,sz iterate over SUBCHUNK_COUNT subchunks per axis but the index is
							// computed with SUBCHUNK_SIZE, this is only correct if both are equal or if IDX3D
							// expects this argument -- confirm against the definition of IDX3D
							auto subc = vox.subchunks[IDX3D(sx,sy,sz, SUBCHUNK_SIZE)];
							if (subc & SUBC_SPARSE_BIT) {
								// sparse subchunk: the remaining bits are the single block id that fills the whole subchunk
								block_id val = (block_id)(subc & ~SUBC_SPARSE_BIT);

								for (int z=0; z<SUBCHUNK_SIZE; ++z)
								for (int y=0; y<SUBCHUNK_SIZE; ++y) {
									block_id* dst = voxel_ptr(sx*SUBCHUNK_SIZE, sy*SUBCHUNK_SIZE + y, sz*SUBCHUNK_SIZE + z);
									for (int x=0; x<SUBCHUNK_SIZE; ++x)
										dst[x] = val;
								}

							} else {
								// dense subchunk: subc indexes the voxel storage, copy it one x-row at a time
								auto* data = game.chunks.subchunks[subc].voxels;

								for (int z=0; z<SUBCHUNK_SIZE; ++z)
								for (int y=0; y<SUBCHUNK_SIZE; ++y) {
									block_id* dst = voxel_ptr(sx*SUBCHUNK_SIZE, sy*SUBCHUNK_SIZE + y, sz*SUBCHUNK_SIZE + z);
									auto* src = &data[IDX3D(0,y,z, SUBCHUNK_SIZE)];
									memcpy(dst, src, sizeof(block_id)*SUBCHUNK_SIZE);
								}
							}
						}
					}

					{
						ZoneScopedN("glTextureSubImage3D");

						glTextureSubImage3D(voxel_tex.tex, 0,
							pos.x*CHUNK_SIZE, pos.y*CHUNK_SIZE, pos.z*CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE,
							GL_RED_INTEGER, GL_UNSIGNED_SHORT, buffer.data());
					}

					chunks.push_back(pos);
				}
			}
			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);

		}
		{
			//chunks.clear(); // for profiling df gen
			//for (int y=0; y<4; ++y)
			//for (int x=0; x<4; ++x) {
			//	chunks.push_back({x,y,7});
			//}

			if (!chunks.empty()) {
				ZoneScopedN("rt_df_gen");
				OGL_TRACE("rt_df_gen");

				int count = (int)chunks.size();

				// the DF shaders access the distance field as image unit 4
				glBindImageTexture(4, df_tex.tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I);

				{ // init pass: marks solid voxels and their direct neighbours, everything else gets the max distance
					OGL_TIMER_ZONE(timer_df_init.timer);

					glUseProgram(df_tex.shad_init->prog);

					r.state.bind_textures(df_tex.shad_init, {
						{"voxel_tex", voxel_tex.tex},
					});

					// chunks are processed in batches since the shader only has room for 32 offsets (uniform ivec3 offsets[32])
					static constexpr int BATCHSIZE = 32;
					for (int i=0; i<count; i+=BATCHSIZE) {
						int subcount = min(count - i, BATCHSIZE);

						// unused entries of the last batch stay zero
						int3 offsets[BATCHSIZE] = {};
						for (int j=0; j<subcount; ++j)
							offsets[j] = chunks[i+j] * CHUNK_SIZE; // chunk position -> voxel offset

						df_tex.shad_init->set_uniform_array("offsets[0]", offsets, BATCHSIZE);

						// these have to mirror the constants of the same name in df_init.glsl
						constexpr int REGION = 8;
						constexpr int CORE = REGION -2;
						constexpr int CHUNK_WGROUPS = (CHUNK_SIZE + CORE-1) / CORE; // round up

						// the work groups of all chunks of the batch are stacked along z
						glDispatchCompute(CHUNK_WGROUPS, CHUNK_WGROUPS, CHUNK_WGROUPS * subcount);
					}

					// the sweep passes read what the init pass wrote
					glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);
				}
				// sweep passes, one shader per pass, each pass reads the result of the previous one
				for (int pass=0; pass<3; ++pass) {
					Shader* shad = df_tex.shad_pass[pass];

					glUseProgram(shad->prog);
					r.state.bind_textures(shad, {});

					static constexpr int BATCHSIZE = 32;
					for (int i=0; i<count; i+=BATCHSIZE) {
						int subcount = min(count - i, BATCHSIZE);

						int3 offsets[BATCHSIZE] = {};
						for (int j=0; j<subcount; ++j)
							offsets[j] = chunks[i+j] * CHUNK_SIZE;

						shad->set_uniform_array("offsets[0]", offsets, BATCHSIZE);

						// one thread per line of the chunk, one z layer of work groups per chunk
						int dispatch_size = (CHUNK_SIZE + DFTexture::COMPUTE_GROUPSZ -1) / DFTexture::COMPUTE_GROUPSZ;
						glDispatchCompute(dispatch_size, dispatch_size, subcount);
					}

					glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);
				}
			}

			glBindImageTexture(4, 0, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8); // unbind
		}
	}

	void Raytracer::draw (OpenglRenderer& r, Game& game) {
		// Runs the raytracing compute shader (rt_forward) once over the whole framebuffer,
		// the result is written into r.framebuffer.color through image unit 0.
		ZoneScoped;

		// without a program object for the tracing shader there is nothing that could be dispatched
		if (!rt_forward->prog) return;

		{
			ZoneScopedN("rt_gbufgen");
			OGL_TRACE("rt_gbufgen");
			OGL_TIMER_ZONE(timer_rt.timer);

			glUseProgram(rt_forward->prog);

			rt_forward->set_uniform("framebuf_size", r.framebuffer.size);
			rt_forward->set_uniform("update_debugdraw", r.debug_draw.update_indirect);

			rt_forward->set_uniform("max_iterations", max_iterations);

			r.state.bind_textures(rt_forward, {
				{"voxel_tex", voxel_tex.tex},
				{"df_tex", df_tex.tex},

				//{"gbuf_pos" , gbuf.pos },
				//{"gbuf_col" , gbuf.col },
				//{"gbuf_norm", gbuf.norm},

				{"tile_textures", r.tile_textures, r.tile_sampler},

				{"heat_gradient", r.gradient, r.normal_sampler},
			});

			// output image of the shader
			glBindImageTexture(0, r.framebuffer.color, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA16F);

			// enough work groups to cover the framebuffer, rounded up on both axes
			int2 dispatch_size;
			dispatch_size.x = (r.framebuffer.size.x + rt_groupsz.size.x -1) / rt_groupsz.size.x;
			dispatch_size.y = (r.framebuffer.size.y + rt_groupsz.size.y -1) / rt_groupsz.size.y;

			glDispatchCompute(dispatch_size.x, dispatch_size.y, 1);
		}
		// make the image writes of the dispatch visible to later image and texture accesses
		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);

		//{
		//	glBindFramebuffer(GL_FRAMEBUFFER, gbuf.fbo);
		//	glClear(GL_DEPTH_BUFFER_BIT);
		//
		//	test_renderer.draw(r);
		//}

		// unbind
		glBindImageTexture(0, 0, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA32F);

	}
	// Traces a ray through the voxel world texture.
	// Empty space is skipped by raymarching the distance field (df_tex), close to solid voxels
	// the ray steps voxel by voxel (DDA) until it lands in a cell with a negative DF value.
	//   ray_pos, ray_dir: ray origin and direction in voxel coordinates of the world texture
	//                     (coordinates are used directly for texelFetch)
	//   max_dist:         NOTE(review): the passed value is overwritten below with the distance at which the ray
	//                     exits the world cube, so a limit supplied by the caller currently has no effect -- confirm that this is intended
	//   hit:              only written if true is returned
	// Returns true on a hit, false if the ray misses the world cube, leaves it, or max_iterations is reached.
	bool trace_ray (vec3 ray_pos, vec3 ray_dir, float max_dist, out Hit hit) {
		// per axis: does the ray travel in positive direction (0 counts as positive)
		bvec3 dir_pos = greaterThanEqual(ray_dir, vec3(0.0));

		// offset from a cell coord to the cell face through which the ray exits, per axis
		ivec3 vox_exit = mix(ivec3(0), ivec3(1), dir_pos);
		// cell step direction per axis
		ivec3 step_dir = mix(ivec3(-1), ivec3(+1), dir_pos);

		// precompute part of plane projection equation
		// prefer  'pos * inv_dir + bias'  over  'inv_dir * (pos - ray_pos)'
		// due to madd instruction
		vec3 inv_dir = 1.0 / ray_dir;
		vec3 bias = inv_dir * -ray_pos;

		// current distance along the ray
		float dist;
		{ // allow ray to start outside world texture cube for nice debugging views
			float epsilon = 0.0001; // stop the raymarching from sometimes sampling outside the world textures
			vec3 world_min = vec3(        0.0) + epsilon;
			vec3 world_max = vec3(WORLD_SIZEf) - epsilon;

			// calculate entry and exit coords into whole world cube
			vec3 t0v = mix(world_max, world_min, dir_pos) * inv_dir + bias;
			vec3 t1v = mix(world_min, world_max, dir_pos) * inv_dir + bias;
			float t0 = max(max(t0v.x, t0v.y), t0v.z);
			float t1 = min(min(t1v.x, t1v.y), t1v.z);

			// clamp to the ray origin, a ray that starts inside of the cube gets t0 = 0
			t0 = max(t0, 0.0);
			t1 = max(t1, 0.0);

			// ray misses world texture
			if (t1 <= t0)
				return false;

			// adjust ray to start where it hits cube initially
			dist = t0;
			max_dist = t1; // replaces the value passed by the caller
		}

		// converts a DF value into a distance along the ray
		// (dividing by the sum of the absolute direction components, the factor 0.999 keeps the step slightly shorter)
		float manhattan_fac = 0.999 / (abs(ray_dir.x) + abs(ray_dir.y) + abs(ray_dir.z));

		vec3 pos = dist * ray_dir + ray_pos;
		ivec3 coord = ivec3(pos); // cell that contains pos

		int iter = 0;

		// leaves via break on a hit (then coord is the hit voxel), via return false on a miss
		for (;;) {
			float df = float(texelFetch(df_tex, coord, 0).r);
			df *= manhattan_fac;

			if (df > 1.0) {
				// DF tells us that we can still step by <df> before we could possibly hit a voxel
				// step via DF raymarching

				// step up to exit of current cell, since DF is safe up until its bounds
				// seems to give a little bit of perf, as this reduces iteration count
				// of course iteration now has more instructions, so could hurt as well
				vec3 t1v = inv_dir * vec3(coord + vox_exit) + bias;
				dist = min(min(t1v.x, t1v.y), t1v.z);

				// compute chunk exit, since DF is not valid for things outside of the chunk it is generated for
				// (chunk size of 64 is hard-coded here: & ~63 rounds down to the chunk origin)
				vec3 chunk_exit = vec3((coord & ~63) + vox_exit*64);

				vec3 chunk_t1v = inv_dir * chunk_exit + bias;
				float chunk_t1 = min(min(chunk_t1v.x, chunk_t1v.y), chunk_t1v.z);

				// limit step to exactly on the exit face of the chunk
				dist = min(dist + df, chunk_t1);

				// update pos for next iteration
				pos = dist * ray_dir + ray_pos;

				// fix precision issues with coord calculation when limiting step to on chunk face
				// note: prefer this to adding epsilon to chunk_t1, since that can miss voxels through the diagonals
				// (the exact == only matches if dist was taken from chunk_t1, then pos is pushed half a cell into the next chunk on that axis)
				if      (chunk_t1v.x == dist) pos.x += float(step_dir.x) * 0.5;
				else if (chunk_t1v.y == dist) pos.y += float(step_dir.y) * 0.5;
				else if (chunk_t1v.z == dist) pos.z += float(step_dir.z) * 0.5;

				// update coord for next iteration
				coord = ivec3(pos);

			} else {
				// we need to check individual voxels by DDA now

				// -1 marks solid voxels (they have 1-voxel border of 0s around them)
				// this avoids one memory read
				// and should eliminate all empty block id reads and thus help improve caching for the DF values by a bit
				if (df < 0.0)
					break;

				// distance at which the ray leaves the current cell
				vec3 t1v = inv_dir * vec3(coord + vox_exit) + bias;
				dist = min(min(t1v.x, t1v.y), t1v.z);

				// step on axis where exit distance is lowest
				if      (t1v.x == dist) coord.x += step_dir.x;
				else if (t1v.y == dist) coord.y += step_dir.y;
				else                    coord.z += step_dir.z;
			}

			iter++;
			if (iter >= max_iterations || dist >= max_dist)
				return false; // miss
		}

		{ // calc hit info

			// snap ray to voxel entry in case we landed inside a voxel when raymarching
			vec3 vox_entry = vec3(coord) + mix(vec3(1.0), vec3(0.0), dir_pos);

			vec3 t0v = inv_dir * vox_entry + bias;
			dist = max(max(t0v.x, t0v.y), max(t0v.z, 0.0)); // max(, 0.0) to not count faces behind ray

			hit.bid = texelFetch(voxel_tex, coord, 0).r;
			hit.dist = dist;
			hit.pos = dist * ray_dir + ray_pos;

			vec2 uv;
			int face; // 0/1: -x/+x  2/3: -y/+y  4/5: -z/+z
			{ // calc hit face, uv and normal
				vec3 hit_center = vec3(coord) + 0.5;

				// the axis on which the hit pos is furthest away from the voxel center is the axis of the hit face
				vec3 offs = (hit.pos - hit_center);
				vec3 abs_offs = abs(offs);

				hit.normal = vec3(0.0);

				if (abs_offs.x >= abs_offs.y && abs_offs.x >= abs_offs.z) {
					hit.normal.x = sign(offs.x);
					face = offs.x < 0.0 ? 0 : 1;
					uv = hit.pos.yz;
					if (offs.x < 0.0) uv.x = 1.0 - uv.x; // mirror u on this side of the voxel
				} else if (abs_offs.y >= abs_offs.z) {
					hit.normal.y = sign(offs.y);
					face = offs.y < 0.0 ? 2 : 3;
					uv = hit.pos.xz;
					if (offs.y >= 0.0) uv.x = 1.0 - uv.x;
				} else {
					hit.normal.z = sign(offs.z);
					face = offs.z < 0.0 ? 4 : 5;
					uv = hit.pos.xy;
					if (offs.z < 0.0) uv.y = 1.0 - uv.y;
				}
			}

			// texture array layer for this face of this block type
			float texid = float(block_tiles[hit.bid].sides[face]);

			// mip level is chosen from the hit distance
			hit.col = textureLod(tile_textures, vec3(uv, texid), log2(dist)*0.20 - 1.0).rgba;
		}

		return true; // hit
	}