Skip to content

Instantly share code, notes, and snippets.

@rbnelr
Created August 26, 2021 10:37
Show Gist options
  • Save rbnelr/5b594eceec7562524a126eef950deffc to your computer and use it in GitHub Desktop.
Save rbnelr/5b594eceec7562524a126eef950deffc to your computer and use it in GitHub Desktop.
#version 460 core
#extension GL_NV_gpu_shader5: enable // for the explicitly sized int8_t type used below
#extension GL_NV_shader_thread_shuffle: enable // for shuffleUpNV / shuffleDownNV in the X pass
// one invocation per cell of a REGION^3 tile; the local size has to match REGION below
layout(local_size_x = 8, local_size_y = 8, local_size_z = 8) in;
// distance field (DF) image written by this shader, one signed 8-bit value per voxel
layout(r8i, binding = 4) restrict uniform iimage3D df_img;
#include "gpu_voxels.glsl"
// per-chunk offset added to the chunk-local position to get the world voxel coordinate,
// indexed by the chunk id that main() derives from gl_WorkGroupID.z
uniform ivec3 offsets[32];
// Initializes the DF such that a cell gets the value:
//  -1 if the voxel itself is solid (so that the tracing can avoid checking voxel block ids)
//   0 if the voxel is not solid but a voxel in the 3x3x3 region centered on it is
// 127 (the value main() stores as 'no solid voxel nearby') otherwise
// This code is using a few compute/gpu shader tricks like shared mem, barriers and lane intrinsics
// but in essence it's just computing this logic on the 3x3x3 region centered on the voxel
// without requiring 27 texture reads per result voxel
const int REGION = 8; // side length of the tile handled by one workgroup, including a 1-cell border
const int CORE = REGION -2; // side length of the tile interior; only interior invocations store a result
const int CHUNK_WGROUPS = (CHUNK_SIZE + CORE-1) / CORE; // workgroups per chunk axis, rounded up
// solid flags of the tile, exchanged between invocations for the Y and Z neighbour passes
shared int8_t buf[REGION][REGION][REGION];
// Seeds the distance field for one REGION^3 tile of a chunk.
// Every invocation fetches one voxel (the tile overlaps its neighbours by a 1-cell border),
// then a separable X/Y/Z OR-pass tells each interior invocation whether any voxel in the
// 3x3x3 region around it is solid. Interior invocations inside the chunk store:
//   -1 for a solid voxel, 0 for an air voxel with a solid voxel in its 3x3x3 region, 127 otherwise.
void main () {
	// the workgroups of all chunks in this dispatch are stacked along z
	ivec3 wgroupid = ivec3(gl_WorkGroupID);
	int chunkid = wgroupid.z / CHUNK_WGROUPS;
	wgroupid.z = wgroupid.z % CHUNK_WGROUPS;

	// -1 because local invocation 0 is the border cell in front of the tile interior
	ivec3 pos = wgroupid * CORE + ivec3(gl_LocalInvocationID)-1; // pos in chunk
	bool in_chunk = all(lessThan(pos, ivec3(CHUNK_SIZE)));
	pos += offsets[chunkid]; // to world coord

	int x = int(gl_LocalInvocationID.x);
	int y = int(gl_LocalInvocationID.y);
	int z = int(gl_LocalInvocationID.z);

	uint bid = in_chunk ? texelFetch(voxel_tex, pos, 0).r : 0;
	int8_t val = bid > B_AIR ? int8_t(1u) : int8_t(0u); // solid voxels get a 1, air gets a 0

	// Propagate 1s to X-neighbours
#if 0
	buf[z][y][x] = val;
	barrier(); // make write visible to X pass reads

	// X pass
	if (x > 0) val |= buf[z][y][x-1];
	if (x < REGION -1) val |= buf[z][y][x+1];
	barrier(); // finish reads so we can write X pass result
#else
	// X pass (with lane intrinsics)
	// NOTE: if (x > 0) range checks are not needed since out of bounds reads return this threads value, which can safely be ORed in
	int8_t val0 = int8_t(shuffleDownNV(val, 1u, REGION));
	int8_t val1 = int8_t(shuffleUpNV (val, 1u, REGION));
	val |= val0;
	val |= val1;
#endif

	buf[z][y][x] = val;
	barrier(); // make write visible to Y pass reads

	{ // Y pass
		if (y > 0) val |= buf[z][y-1][x];
		if (y < REGION -1) val |= buf[z][y+1][x];
	}
	barrier(); // finish reads so we can write Y pass result

	buf[z][y][x] = val;

	// make write visible to Z pass reads
	// This barrier has to be executed by ALL invocations of the workgroup: barrier() is only
	// defined in uniform control flow, and the border invocations (which skip the branch below)
	// are exactly the ones whose writes the z-1 / z+1 reads depend on.
	barrier();

	if (	x > 0 && x < REGION-1 &&
			y > 0 && y < REGION-1 &&
			z > 0 && z < REGION-1 && in_chunk	) {
		// Z pass
		val |= buf[z-1][y][x];
		val |= buf[z+1][y][x];

		// make DF -1 for solid block and the 1-voxel border 0
		// to let us directly DDA the DF data without touching the voxel data until the final hit computation
		int df = 127;
		if (val != 0)
			df = bid > B_AIR ? -1 : 0;

		imageStore(df_img, pos, ivec4(df, 0,0,0));
	}
}
#version 460 core
// NOTE(review): GROUPSZ and PASS are not defined in the visible source; presumably the host
// injects them as macros when compiling the three pass variants - confirm
layout(local_size_x = GROUPSZ, local_size_y = GROUPSZ) in;
// distance field image; main() reads it and updates it in place
layout(r8i, binding = 4) restrict uniform iimage3D df_img;
#include "gpu_voxels.glsl" // CHUNK_SIZE = 64
// number of voxels visited by each sweep along the pass axis
#define SIZE CHUNK_SIZE
// offset added to the chunk-local position of each chunk in this dispatch, indexed by gl_WorkGroupID.z
uniform ivec3 offsets[32];
// One axis of the distance field propagation (PASS 0 = X, 1 = Y, anything else = Z).
// Each invocation owns one line of SIZE voxels along the pass axis and sweeps it twice,
// forwards and backwards, lowering every cell to (value of the previous cell + 1) whenever
// that is smaller than what the cell already holds.
void main () {
	// The host rounds the dispatch up to whole workgroups, so when SIZE is not a multiple of
	// GROUPSZ there are surplus invocations; without this guard they would sweep lines that
	// lie outside the chunk.
	if (any(greaterThanEqual(gl_GlobalInvocationID.xy, uvec2(SIZE))))
		return;

	// pos is the first voxel of this invocation's line, GETPOS(I) the I-th voxel along it
#if PASS == 0
	ivec3 pos = ivec3(0, gl_GlobalInvocationID.xy);
	#define GETPOS(I) ivec3(pos.x + (I), pos.y, pos.z)
#elif PASS == 1
	ivec3 pos = ivec3(gl_GlobalInvocationID.x, 0, gl_GlobalInvocationID.y);
	#define GETPOS(I) ivec3(pos.x, pos.y + (I), pos.z)
#else
	ivec3 pos = ivec3(gl_GlobalInvocationID.xy, 0);
	#define GETPOS(I) ivec3(pos.x, pos.y, pos.z + (I))
#endif

	pos += offsets[gl_WorkGroupID.z]; // one chunk per workgroup z-slice

	// forward sweep
	// prev starts at 127 so that the first comparison (128 < cur) can never pass
	int prev = 127;
	for (int i=0; i<SIZE; ++i) {
		ivec3 p = GETPOS(i);
		int cur = imageLoad(df_img, p).r;

		prev += 1; // one voxel further away than the previous cell
		if (prev < cur) imageStore(df_img, p, ivec4(prev, 0,0,0));
		else prev = cur; // this cell is at least as close already, continue from its value
	}

	// backward sweep, same logic from the other end of the line
	prev = 127;
	for (int i=SIZE-1; i>=0; --i) {
		ivec3 p = GETPOS(i);
		int cur = imageLoad(df_img, p).r;

		prev += 1;
		if (prev < cur) imageStore(df_img, p, ivec4(prev, 0,0,0));
		else prev = cur;
	}
}
/// Uploads the voxel data of every chunk listed in game.chunks.upload_voxels into voxel_tex
/// and then regenerates the distance field (df_tex) for exactly those chunks.
///
/// Chunks whose position falls outside the GPU world window are skipped. The distance field
/// is rebuilt by one init dispatch followed by three sweep passes, each issued in batches of
/// at most 32 chunks (the size of the shaders' offsets[] uniform array).
///
/// @param r    renderer, used to bind textures for the compute shaders
/// @param game source of the chunk and subchunk voxel data
void Raytracer::upload_changes (OpenglRenderer& r, Game& game) {
	ZoneScoped;

	// chunk coordinates (in GPU world chunks) of everything uploaded by this call
	std::vector<int3> chunks;

	if (!game.chunks.upload_voxels.empty()) {
		OGL_TRACE("raytracer upload changes");

		// temp buffer to 'decompress' the sparse subchunks and enable uploading them in a single glTextureSubImage3D per chunk
		// Lives on the heap: CHUNK_SIZE^3 block ids are far too large to put on the stack safely.
		std::vector<block_id> buffer(static_cast<size_t>(CHUNK_SIZE) * CHUNK_SIZE * CHUNK_SIZE);

		// pointer to voxel (x, y, z) of the dense buffer, laid out as [z][y][x]
		auto row_at = [&buffer] (int x, int y, int z) -> block_id* {
			size_t idx = (static_cast<size_t>(z) * CHUNK_SIZE + static_cast<size_t>(y)) * CHUNK_SIZE + static_cast<size_t>(x);
			return buffer.data() + idx;
		};

		for (auto cid : game.chunks.upload_voxels) {
			auto& chunk = game.chunks.chunks[cid];
			auto& vox = game.chunks.chunk_voxels[cid];

			int3 pos = chunk.pos + GPU_WORLD_SIZE_CHUNKS/2;
			if (	static_cast<unsigned>(pos.x) < GPU_WORLD_SIZE_CHUNKS && // use 3 unsigned comparisons instead of 6 signed ones
					static_cast<unsigned>(pos.y) < GPU_WORLD_SIZE_CHUNKS &&
					static_cast<unsigned>(pos.z) < GPU_WORLD_SIZE_CHUNKS	) {
				//OGL_TRACE("upload chunk data");

				{
					ZoneScopedN("decompress");

					for (int sz=0; sz<SUBCHUNK_COUNT; ++sz)
					for (int sy=0; sy<SUBCHUNK_COUNT; ++sy)
					for (int sx=0; sx<SUBCHUNK_COUNT; ++sx) {
						// The subchunk grid has SUBCHUNK_COUNT entries per axis, so that is the
						// stride to index it with (SUBCHUNK_SIZE is the voxel count per subchunk axis).
						auto subc = vox.subchunks[IDX3D(sx,sy,sz, SUBCHUNK_COUNT)];

						if (subc & SUBC_SPARSE_BIT) {
							// sparse subchunk: a single block id fills the whole subchunk
							block_id val = (block_id)(subc & ~SUBC_SPARSE_BIT);

							block_id val_packed[SUBCHUNK_SIZE];
							for (int i=0; i<SUBCHUNK_SIZE; ++i)
								val_packed[i] = val;

							for (int z=0; z<SUBCHUNK_SIZE; ++z)
							for (int y=0; y<SUBCHUNK_SIZE; ++y) {
								auto* dst = row_at(sx*SUBCHUNK_SIZE, sy*SUBCHUNK_SIZE + y, sz*SUBCHUNK_SIZE + z);
								memcpy(dst, val_packed, sizeof(block_id)*SUBCHUNK_SIZE);
							}
						} else {
							// dense subchunk: copy its voxels one x-row at a time
							auto* data = game.chunks.subchunks[subc].voxels;

							for (int z=0; z<SUBCHUNK_SIZE; ++z)
							for (int y=0; y<SUBCHUNK_SIZE; ++y) {
								auto* dst = row_at(sx*SUBCHUNK_SIZE, sy*SUBCHUNK_SIZE + y, sz*SUBCHUNK_SIZE + z);
								auto* src = &data[IDX3D(0,y,z, SUBCHUNK_SIZE)];
								memcpy(dst, src, sizeof(block_id)*SUBCHUNK_SIZE);
							}
						}
					}
				}

				{
					ZoneScopedN("glTextureSubImage3D");

					glTextureSubImage3D(voxel_tex.tex, 0,
						pos.x*CHUNK_SIZE, pos.y*CHUNK_SIZE, pos.z*CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE,
						GL_RED_INTEGER, GL_UNSIGNED_SHORT, buffer.data());
				}

				chunks.push_back(pos);
			}
		}

		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);
	}

	{
		//chunks.clear(); // for profiling df gen
		//for (int y=0; y<4; ++y)
		//for (int x=0; x<4; ++x) {
		//	chunks.push_back({x,y,7});
		//}

		if (!chunks.empty()) {
			ZoneScopedN("rt_df_gen");
			OGL_TRACE("rt_df_gen");

			int count = (int)chunks.size();

			// has to match the length of the 'offsets' uniform array in the DF shaders
			static constexpr int BATCHSIZE = 32;

			// Runs the currently bound compute program over all uploaded chunks, BATCHSIZE chunks
			// per dispatch. Chunks of a batch are stacked along the z axis of the dispatch and
			// the shader finds its chunk's voxel offset through the offsets[] uniform.
			auto dispatch_batched = [&] (const auto& shad, int groups_x, int groups_y, int groups_z_per_chunk) {
				for (int i=0; i<count; i+=BATCHSIZE) {
					int subcount = min(count - i, BATCHSIZE);

					int3 offsets[BATCHSIZE] = {};
					for (int j=0; j<subcount; ++j)
						offsets[j] = chunks[i+j] * CHUNK_SIZE;

					shad->set_uniform_array("offsets[0]", offsets, BATCHSIZE);

					glDispatchCompute(groups_x, groups_y, groups_z_per_chunk * subcount);
				}
			};

			glBindImageTexture(4, df_tex.tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I);

			{ // init pass: seed the DF from the voxel data
				OGL_TIMER_ZONE(timer_df_init.timer);

				glUseProgram(df_tex.shad_init->prog);

				r.state.bind_textures(df_tex.shad_init, {
					{"voxel_tex", voxel_tex.tex},
				});

				// mirrors the constants of the same name in the init shader
				constexpr int REGION = 8;
				constexpr int CORE = REGION -2;
				constexpr int CHUNK_WGROUPS = (CHUNK_SIZE + CORE-1) / CORE; // round up

				dispatch_batched(df_tex.shad_init, CHUNK_WGROUPS, CHUNK_WGROUPS, CHUNK_WGROUPS);

				glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);
			}

			// sweep passes: one per axis, each has to see the finished result of the previous one
			for (int pass=0; pass<3; ++pass) {
				Shader* shad = df_tex.shad_pass[pass];
				glUseProgram(shad->prog);

				r.state.bind_textures(shad, {});

				int dispatch_size = (CHUNK_SIZE + DFTexture::COMPUTE_GROUPSZ -1) / DFTexture::COMPUTE_GROUPSZ;
				dispatch_batched(shad, dispatch_size, dispatch_size, 1);

				glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);
			}
		}

		glBindImageTexture(4, 0, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8); // unbind
	}
}
/// Runs the rt_forward compute shader over the whole framebuffer, writing into
/// r.framebuffer.color through image unit 0.
/// Does nothing if the rt_forward program is not available.
void Raytracer::draw (OpenglRenderer& r, Game& game) {
	ZoneScoped;

	if (!rt_forward->prog)
		return;

	{ // scope of the profiling zones: ends before the memory barrier below
		ZoneScopedN("rt_gbufgen");
		OGL_TRACE("rt_gbufgen");
		OGL_TIMER_ZONE(timer_rt.timer);

		glUseProgram(rt_forward->prog);

		rt_forward->set_uniform("framebuf_size", r.framebuffer.size);
		rt_forward->set_uniform("update_debugdraw", r.debug_draw.update_indirect);
		rt_forward->set_uniform("max_iterations", max_iterations);

		// gbuf_pos / gbuf_col / gbuf_norm bindings are currently disabled
		r.state.bind_textures(rt_forward, {
			{"voxel_tex", voxel_tex.tex},
			{"df_tex", df_tex.tex},
			{"tile_textures", r.tile_textures, r.tile_sampler},
			{"heat_gradient", r.gradient, r.normal_sampler},
		});

		// output image of the shader
		glBindImageTexture(0, r.framebuffer.color, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA16F);

		// enough workgroups to cover the framebuffer, rounding up on both axes
		const int groups_x = (r.framebuffer.size.x + rt_groupsz.size.x -1) / rt_groupsz.size.x;
		const int groups_y = (r.framebuffer.size.y + rt_groupsz.size.y -1) / rt_groupsz.size.y;

		glDispatchCompute(groups_x, groups_y, 1);
	}

	glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);

	// (disabled) rasterized pass into the gbuffer:
	//   glBindFramebuffer(GL_FRAMEBUFFER, gbuf.fbo);
	//   glClear(GL_DEPTH_BUFFER_BIT);
	//   test_renderer.draw(r);

	// release image unit 0 again
	glBindImageTexture(0, 0, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA32F);
}
// Traces a ray through the voxel world using the distance field (df_tex) to skip empty space
// and a per-voxel DDA close to solid voxels.
//   ray_pos, ray_dir: ray origin and direction in world voxel coordinates
//   max_dist:         the ray is treated as a miss once it has travelled this far
//                     (it is additionally clipped to where the ray leaves the world cube)
//   hit:              on a hit receives bid, dist, pos, normal and col; left untouched on a miss
// Returns true on a hit, false if the ray misses the world cube, exceeds max_dist
// or runs out of max_iterations.
bool trace_ray (vec3 ray_pos, vec3 ray_dir, float max_dist, out Hit hit) {
	bvec3 dir_pos = greaterThanEqual(ray_dir, vec3(0.0));
	ivec3 vox_exit = mix(ivec3(0), ivec3(1), dir_pos);    // offset from a voxel's min corner to its exit planes
	ivec3 step_dir = mix(ivec3(-1), ivec3(+1), dir_pos);

	// precompute part of plane projection equation
	// prefer 'pos * inv_dir + bias' over 'inv_dir * (pos - ray_pos)'
	// due to madd instruction
	vec3 inv_dir = 1.0 / ray_dir;
	vec3 bias = inv_dir * -ray_pos;

	float dist;
	{ // allow ray to start outside world texture cube for nice debugging views
		float epsilon = 0.0001; // stop the raymarching from sometimes sampling outside the world textures
		vec3 world_min = vec3(        0.0) + epsilon;
		vec3 world_max = vec3(WORLD_SIZEf) - epsilon;

		// calculate entry and exit coords into whole world cube
		vec3 t0v = mix(world_max, world_min, dir_pos) * inv_dir + bias;
		vec3 t1v = mix(world_min, world_max, dir_pos) * inv_dir + bias;
		float t0 = max(max(t0v.x, t0v.y), t0v.z);
		float t1 = min(min(t1v.x, t1v.y), t1v.z);

		t0 = max(t0, 0.0);
		t1 = max(t1, 0.0);

		// stop at the world exit or at the caller's limit, whichever comes first
		// (the caller's max_dist must not be discarded here)
		max_dist = min(max_dist, t1);

		// ray misses world texture, or only reaches it beyond max_dist
		if (max_dist <= t0)
			return false;

		// adjust ray to start where it hits cube initally
		dist = t0;
	}

	// converts a DF value into a distance along the ray that is safe to step
	// 0.999 keeps the step slightly conservative
	float manhattan_fac = 0.999 / (abs(ray_dir.x) + abs(ray_dir.y) + abs(ray_dir.z));

	vec3 pos = dist * ray_dir + ray_pos;
	ivec3 coord = ivec3(pos);

	int iter = 0;

	for (;;) {
		float df = float(texelFetch(df_tex, coord, 0).r);
		df *= manhattan_fac;

		if (df > 1.0) {
			// DF tells us that we can still step by <df> before we could possibly hit a voxel
			// step via DF raymarching

			// step up to exit of current cell, since DF is safe up until its bounds
			// seems to give a little bit of perf, as this reduces iteration count
			// of course iteration now has more instructions, so could hurt as well
			vec3 t1v = inv_dir * vec3(coord + vox_exit) + bias;
			dist = min(min(t1v.x, t1v.y), t1v.z);

			// compute chunk exit, since DF is not valid for things outside of the chunk it is generated for
			// NOTE(review): 64 is hard-coded here; it has to equal the chunk size the DF was generated with
			vec3 chunk_exit = vec3((coord & ~63) + vox_exit*64);

			vec3 chunk_t1v = inv_dir * chunk_exit + bias;
			float chunk_t1 = min(min(chunk_t1v.x, chunk_t1v.y), chunk_t1v.z);

			// limit step to exactly on the exit face of the chunk
			dist = min(dist + df, chunk_t1);

			// update pos for next iteration
			pos = dist * ray_dir + ray_pos;

			// fix precision issues with coord calculation when limiting step to on chunk face
			// note: prefer this to adding epsilon to chunk_t1, since that can miss voxels through the diagonals
			if      (chunk_t1v.x == dist) pos.x += float(step_dir.x) * 0.5;
			else if (chunk_t1v.y == dist) pos.y += float(step_dir.y) * 0.5;
			else if (chunk_t1v.z == dist) pos.z += float(step_dir.z) * 0.5;

			// update coord for next iteration
			coord = ivec3(pos);
		} else {
			// we need to check individual voxels by DDA now

			// -1 marks solid voxels (they have 1-voxel border of 0s around them)
			// this avoids one memory read
			// and should eliminate all empty block id reads and thus help improve caching for the DF values by a bit
			if (df < 0.0)
				break;

			vec3 t1v = inv_dir * vec3(coord + vox_exit) + bias;
			dist = min(min(t1v.x, t1v.y), t1v.z);

			// step on axis where exit distance is lowest
			if      (t1v.x == dist) coord.x += step_dir.x;
			else if (t1v.y == dist) coord.y += step_dir.y;
			else                    coord.z += step_dir.z;
		}

		iter++;
		if (iter >= max_iterations || dist >= max_dist)
			return false; // miss
	}

	{ // calc hit info

		// snap ray to voxel entry in case we landed inside a voxel when raymarching
		vec3 vox_entry = vec3(coord) + mix(vec3(1.0), vec3(0.0), dir_pos);
		vec3 t0v = inv_dir * vox_entry + bias;
		dist = max(max(t0v.x, t0v.y), max(t0v.z, 0.0)); // max(, 0.0) to not count faces behind ray

		hit.bid = texelFetch(voxel_tex, coord, 0).r;
		hit.dist = dist;
		hit.pos = dist * ray_dir + ray_pos;

		vec2 uv;
		int face; // 0/1 = -X/+X, 2/3 = -Y/+Y, 4/5 = -Z/+Z
		{ // calc hit face, uv and normal
			vec3 hit_center = vec3(coord) + 0.5;

			// the face that was hit is the one on the axis where the hit point is furthest from the voxel center
			vec3 offs = (hit.pos - hit_center);
			vec3 abs_offs = abs(offs);

			hit.normal = vec3(0.0);

			if (abs_offs.x >= abs_offs.y && abs_offs.x >= abs_offs.z) {
				hit.normal.x = sign(offs.x);
				face = offs.x < 0.0 ? 0 : 1;
				uv = hit.pos.yz;
				if (offs.x < 0.0) uv.x = 1.0 - uv.x;
			} else if (abs_offs.y >= abs_offs.z) {
				hit.normal.y = sign(offs.y);
				face = offs.y < 0.0 ? 2 : 3;
				uv = hit.pos.xz;
				if (offs.y >= 0.0) uv.x = 1.0 - uv.x;
			} else {
				hit.normal.z = sign(offs.z);
				face = offs.z < 0.0 ? 4 : 5;
				uv = hit.pos.xy;
				if (offs.z < 0.0) uv.y = 1.0 - uv.y;
			}
		}

		float texid = float(block_tiles[hit.bid].sides[face]);

		// distance based LOD
		// dist is exactly 0 when the ray starts inside a solid voxel and log2(0) is undefined,
		// so keep the argument positive; the resulting negative LOD selects the base level either way
		float lod = log2(max(dist, 0.000001))*0.20 - 1.0;
		hit.col = textureLod(tile_textures, vec3(uv, texid), lod).rgba;
	}

	return true; // hit
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment