Created
August 26, 2021 10:37
-
-
Save rbnelr/5b594eceec7562524a126eef950deffc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#version 460 core | |
#extension GL_NV_gpu_shader5: enable // for uint8_t | |
#extension GL_NV_shader_thread_shuffle: enable | |
layout(local_size_x = 8, local_size_y = 8, local_size_z = 8) in; | |
layout(r8i, binding = 4) restrict uniform iimage3D df_img; | |
#include "gpu_voxels.glsl" | |
uniform ivec3 offsets[32]; | |
// Initialize the DF such that each cell gets the value:
// 0 if a neighbouring voxel is solid (3x3x3 region)
// -1 if the voxel itself is solid (so that I avoid checking voxel block ids in the tracing)
// 127 (max DF dist, largest value that fits the r8i image) otherwise
// This code is using a few compute/gpu shader tricks like shared mem, barriers and lane intrinsics | |
// but in essence it's just computing this logic on the 3x3 region centered on the voxel | |
// but without requiring 27 texture reads per result voxel | |
const int REGION = 8;         // edge length of the voxel block one workgroup loads (matches local_size)
const int CORE   = REGION -2; // edge length of the cells a workgroup outputs (region minus a 1-cell shell)
const int CHUNK_WGROUPS = (CHUNK_SIZE + CORE-1) / CORE; // round up

// per-workgroup scratch used to exchange the "solid nearby" flags between the passes
shared int8_t buf[REGION][REGION][REGION];

// Seeds the distance field of one chunk:
//   -1  for solid voxels,
//    0  for air voxels with a solid voxel in their 3x3x3 neighbourhood,
//   127 for all other voxels.
// The 3x3x3 OR is computed separably (X, then Y, then Z) so that each invocation
// only performs a single voxel fetch.
void main () {
	// gl_WorkGroupID.z encodes both the chunk index within the batch and the workgroup z inside that chunk
	ivec3 wgroupid = ivec3(gl_WorkGroupID);
	int chunkid = wgroupid.z / CHUNK_WGROUPS;
	wgroupid.z  = wgroupid.z % CHUNK_WGROUPS;

	// the -1 makes local id 0 the shell cell in front of the first core cell
	ivec3 pos = wgroupid * CORE + ivec3(gl_LocalInvocationID)-1; // pos in chunk
	// NOTE(review): only the upper bound is tested, so shell cells at pos == -1 still fetch
	// from voxel_tex while cells at pos >= CHUNK_SIZE count as air - confirm this asymmetry is intended
	bool in_chunk = all(lessThan(pos, ivec3(CHUNK_SIZE)));
	pos += offsets[chunkid]; // to world coord

	int x = int(gl_LocalInvocationID.x);
	int y = int(gl_LocalInvocationID.y);
	int z = int(gl_LocalInvocationID.z);

	uint bid = in_chunk ? texelFetch(voxel_tex, pos, 0).r : 0;
	int8_t val = bid > B_AIR ? int8_t(1u) : int8_t(0u); // solid voxels get a 1, air gets a 0

	// Propagate 1s to X-neighbours
#if 0
	buf[z][y][x] = val;
	barrier(); // make write visible to X pass reads

	// X pass
	if (x > 0)         val |= buf[z][y][x-1];
	if (x < REGION -1) val |= buf[z][y][x+1];
	barrier(); // finish reads so we can write X pass result
#else
	// X pass (with lane intrinsics)
	// NOTE: if (x > 0) range checks are not needed since out of bounds reads return this threads value, which can safely be ORed in
	int8_t val0 = int8_t(shuffleDownNV(val, 1u, REGION));
	int8_t val1 = int8_t(shuffleUpNV  (val, 1u, REGION));
	val |= val0;
	val |= val1;
#endif

	buf[z][y][x] = val;
	barrier(); // make write visible to Y pass reads

	{ // Y pass
		if (y > 0)         val |= buf[z][y-1][x];
		if (y < REGION -1) val |= buf[z][y+1][x];
	}
	barrier(); // finish reads so we can write Y pass result

	buf[z][y][x] = val;
	// make write visible to Z pass reads
	// This barrier has to be executed by every invocation of the workgroup: barrier() is only
	// defined in uniform control flow, and the shell invocations that skip the branch below
	// are the ones that wrote the buf[z-1] / buf[z+1] values the core invocations read.
	barrier();

	// only core cells that lie inside the chunk produce an output
	if (	x > 0 && x < REGION-1 &&
			y > 0 && y < REGION-1 &&
			z > 0 && z < REGION-1 && in_chunk ) {
		// Z pass
		val |= buf[z-1][y][x];
		val |= buf[z+1][y][x];

		// make DF -1 for solid block and the 1-voxel border 0
		// to let us directly DDA the DF data without touching the voxel data until the final hit computation
		int df = 127;
		if (val != 0)
			df = bid > B_AIR ? -1 : 0;

		imageStore(df_img, pos, ivec4(df, 0,0,0));
	}
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#version 460 core | |
layout(local_size_x = GROUPSZ, local_size_y = GROUPSZ) in; | |
layout(r8i, binding = 4) restrict uniform iimage3D df_img; | |
#include "gpu_voxels.glsl" // CHUNK_SIZE = 64 | |
#define SIZE CHUNK_SIZE | |
uniform ivec3 offsets[32]; | |
// Relaxes one DF cell against the distance carried along the sweep.
// The carried distance grows by one per cell; if it is smaller than the stored value the cell
// is overwritten, otherwise the stored value becomes the new carried distance.
// Returns the distance to carry to the next cell.
int relax_cell (ivec3 p, int prev) {
	int cur = imageLoad(df_img, p).r;
	prev += 1;
	if (prev < cur) imageStore(df_img, p, ivec4(prev, 0,0,0));
	else            prev = cur;
	return prev;
}

// One separable pass of the distance field propagation: every invocation owns one row of the
// chunk along the PASS axis (0 = x, 1 = y, 2 = z) and sweeps it forwards, then backwards.
void main () {
	// The host rounds the dispatch up to whole workgroups; drop invocations beyond the chunk
	// so they never sweep rows that belong to a neighbouring chunk.
	if (any(greaterThanEqual(gl_GlobalInvocationID.xy, uvec2(SIZE))))
		return;

#if PASS == 0
	ivec3 pos = ivec3(0, gl_GlobalInvocationID.xy);
	#define GETPOS(I) ivec3(pos.x + (I), pos.y, pos.z)
#elif PASS == 1
	ivec3 pos = ivec3(gl_GlobalInvocationID.x, 0, gl_GlobalInvocationID.y);
	#define GETPOS(I) ivec3(pos.x, pos.y + (I), pos.z)
#else
	ivec3 pos = ivec3(gl_GlobalInvocationID.xy, 0);
	#define GETPOS(I) ivec3(pos.x, pos.y, pos.z + (I))
#endif

	// gl_WorkGroupID.z selects the chunk of the current batch
	pos += offsets[gl_WorkGroupID.z];

	// forward sweep; starting at 127 means nothing is written until a smaller stored value is met
	int prev = 127;
	for (int i=0; i<SIZE; ++i)
		prev = relax_cell(GETPOS(i), prev);

	// backward sweep
	prev = 127;
	for (int i=SIZE-1; i>=0; --i)
		prev = relax_cell(GETPOS(i), prev);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Uploads the voxels of every chunk listed in game.chunks.upload_voxels into voxel_tex
// and regenerates the distance field (df_tex) for the chunks that were uploaded.
// Chunks whose position falls outside the GPU world volume are skipped.
void Raytracer::upload_changes (OpenglRenderer& r, Game& game) {
	ZoneScoped;

	// GPU-world chunk coordinates of every chunk uploaded by this call
	std::vector<int3> chunks;

	if (!game.chunks.upload_voxels.empty()) {
		OGL_TRACE("raytracer upload changes");

		// temp buffer to 'decompress' my sparse subchunks and enable uploading them in a single glTextureSubImage3D per chunk
		// heap allocated: CHUNK_SIZE^3 block ids are too large to place on the stack safely
		std::vector<block_id> buffer(CHUNK_SIZE * CHUNK_SIZE * CHUNK_SIZE);

		// address of voxel (x,y,z) in the dense chunk buffer (x is the fastest varying axis)
		auto voxel_ptr = [&buffer] (int x, int y, int z) {
			return &buffer[(z * CHUNK_SIZE + y) * CHUNK_SIZE + x];
		};

		for (auto cid : game.chunks.upload_voxels) {
			auto& chunk = game.chunks.chunks[cid];
			auto& vox = game.chunks.chunk_voxels[cid];

			int3 pos = chunk.pos + GPU_WORLD_SIZE_CHUNKS/2;
			if (	(unsigned)(pos.x) < GPU_WORLD_SIZE_CHUNKS && // use 3 unsigned comparisons instead of 6 signed ones
					(unsigned)(pos.y) < GPU_WORLD_SIZE_CHUNKS &&
					(unsigned)(pos.z) < GPU_WORLD_SIZE_CHUNKS ) {
				//OGL_TRACE("upload chunk data");

				{
					ZoneScopedN("decompress");

					for (int sz=0; sz<SUBCHUNK_COUNT; ++sz)
					for (int sy=0; sy<SUBCHUNK_COUNT; ++sy)
					for (int sx=0; sx<SUBCHUNK_COUNT; ++sx) {
						// subchunks are indexed per axis in [0, SUBCHUNK_COUNT), so that is the stride
						// (the previous SUBCHUNK_SIZE stride was only right when both constants are equal)
						auto subc = vox.subchunks[IDX3D(sx,sy,sz, SUBCHUNK_COUNT)];

						if (subc & SUBC_SPARSE_BIT) {
							// sparse subchunk: a single block id fills the whole subchunk
							block_id val = (block_id)(subc & ~SUBC_SPARSE_BIT);

							block_id val_packed[SUBCHUNK_SIZE];
							for (int i=0; i<SUBCHUNK_SIZE; ++i)
								val_packed[i] = val;

							for (int z=0; z<SUBCHUNK_SIZE; ++z)
							for (int y=0; y<SUBCHUNK_SIZE; ++y) {
								auto* dst = voxel_ptr(sx*SUBCHUNK_SIZE + 0, sy*SUBCHUNK_SIZE + y, sz*SUBCHUNK_SIZE + z);
								memcpy(dst, val_packed, sizeof(block_id)*SUBCHUNK_SIZE);
							}
						} else {
							// dense subchunk: copy its voxel rows one by one
							auto* data = game.chunks.subchunks[subc].voxels;

							for (int z=0; z<SUBCHUNK_SIZE; ++z)
							for (int y=0; y<SUBCHUNK_SIZE; ++y) {
								auto* dst = voxel_ptr(sx*SUBCHUNK_SIZE + 0, sy*SUBCHUNK_SIZE + y, sz*SUBCHUNK_SIZE + z);
								auto* src = &data[IDX3D(0,y,z, SUBCHUNK_SIZE)];
								memcpy(dst, src, sizeof(block_id)*SUBCHUNK_SIZE);
							}
						}
					}
				}

				{
					ZoneScopedN("glTextureSubImage3D");

					glTextureSubImage3D(voxel_tex.tex, 0,
						pos.x*CHUNK_SIZE, pos.y*CHUNK_SIZE, pos.z*CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE,
						GL_RED_INTEGER, GL_UNSIGNED_SHORT, buffer.data());
				}

				chunks.push_back(pos);
			}
		}

		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);
	}

	{
		//chunks.clear(); // for profiling df gen
		//for (int y=0; y<4; ++y)
		//for (int x=0; x<4; ++x) {
		//	chunks.push_back({x,y,7});
		//}

		if (!chunks.empty()) {
			ZoneScopedN("rt_df_gen");
			OGL_TRACE("rt_df_gen");

			// the shaders declare 'uniform ivec3 offsets[32]', so chunks are processed in batches of 32
			static constexpr int BATCHSIZE = 32;

			int count = static_cast<int>(chunks.size());

			// uploads the voxel-space origins of chunks [first, first+subcount) to the shader's offsets array
			auto upload_offsets = [&] (auto& shad, int first, int subcount) {
				int3 offsets[BATCHSIZE] = {};
				for (int j=0; j<subcount; ++j)
					offsets[j] = chunks[first+j] * CHUNK_SIZE;

				shad->set_uniform_array("offsets[0]", offsets, BATCHSIZE);
			};

			glBindImageTexture(4, df_tex.tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I);

			{ // seed the DF from the voxel data
				OGL_TIMER_ZONE(timer_df_init.timer);

				glUseProgram(df_tex.shad_init->prog);

				r.state.bind_textures(df_tex.shad_init, {
					{"voxel_tex", voxel_tex.tex},
				});

				for (int i=0; i<count; i+=BATCHSIZE) {
					int subcount = min(count - i, BATCHSIZE);

					upload_offsets(df_tex.shad_init, i, subcount);

					// must match the constants of the init shader
					constexpr int REGION = 8;
					constexpr int CORE = REGION -2;
					constexpr int CHUNK_WGROUPS = (CHUNK_SIZE + CORE-1) / CORE; // round up

					glDispatchCompute(CHUNK_WGROUPS, CHUNK_WGROUPS, CHUNK_WGROUPS * subcount);
				}

				glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);
			}

			// one propagation pass per shader in df_tex.shad_pass, each followed by a barrier
			// so the next pass sees the previous results
			for (int pass=0; pass<3; ++pass) {
				Shader* shad = df_tex.shad_pass[pass];

				glUseProgram(shad->prog);

				r.state.bind_textures(shad, {});

				for (int i=0; i<count; i+=BATCHSIZE) {
					int subcount = min(count - i, BATCHSIZE);

					upload_offsets(shad, i, subcount);

					int dispatch_size = (CHUNK_SIZE + DFTexture::COMPUTE_GROUPSZ -1) / DFTexture::COMPUTE_GROUPSZ;
					glDispatchCompute(dispatch_size, dispatch_size, subcount);
				}

				glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);
			}
		}

		glBindImageTexture(4, 0, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8); // unbind
	}
}
// Dispatches the rt_forward compute shader over the whole framebuffer,
// writing into r.framebuffer.color through image unit 0.
void Raytracer::draw (OpenglRenderer& r, Game& game) {
	ZoneScoped;

	// skip drawing while the rt_forward shader has no program object
	if (!rt_forward->prog)
		return;

	{
		ZoneScopedN("rt_gbufgen");
		OGL_TRACE("rt_gbufgen");
		OGL_TIMER_ZONE(timer_rt.timer);

		auto& fb    = r.framebuffer;
		auto& group = rt_groupsz.size;

		glUseProgram(rt_forward->prog);

		rt_forward->set_uniform("framebuf_size",    fb.size);
		rt_forward->set_uniform("update_debugdraw", r.debug_draw.update_indirect);
		rt_forward->set_uniform("max_iterations",   max_iterations);

		r.state.bind_textures(rt_forward, {
			{"voxel_tex", voxel_tex.tex},
			{"df_tex", df_tex.tex},

			//{"gbuf_pos" , gbuf.pos },
			//{"gbuf_col" , gbuf.col },
			//{"gbuf_norm", gbuf.norm},

			{"tile_textures", r.tile_textures, r.tile_sampler},
			{"heat_gradient", r.gradient, r.normal_sampler},
		});

		glBindImageTexture(0, fb.color, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA16F);

		// number of workgroups per axis, rounded up so partial tiles are covered too
		int2 dispatch_size;
		dispatch_size.x = (fb.size.x + group.x -1) / group.x;
		dispatch_size.y = (fb.size.y + group.y -1) / group.y;

		glDispatchCompute(dispatch_size.x, dispatch_size.y, 1);
	}

	glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT|GL_TEXTURE_FETCH_BARRIER_BIT);

	//{
	//	glBindFramebuffer(GL_FRAMEBUFFER, gbuf.fbo);
	//	glClear(GL_DEPTH_BUFFER_BIT);
	//
	//	test_renderer.draw(r);
	//}

	// unbind
	glBindImageTexture(0, 0, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA32F);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Traces a ray through the voxel world, using the distance field (df_tex) to skip empty space
// and a per-voxel DDA next to solid voxels.
//   ray_pos, ray_dir : ray origin and direction
//   max_dist         : maximum distance along the ray to search
//   hit              : filled with block id, distance, position, normal and color on a hit
// Returns true on a hit; false if the ray misses the world cube, exceeds max_dist
// or runs out of max_iterations (hit is not written in that case).
bool trace_ray (vec3 ray_pos, vec3 ray_dir, float max_dist, out Hit hit) {
	bvec3 dir_pos = greaterThanEqual(ray_dir, vec3(0.0));

	ivec3 vox_exit = mix(ivec3(0), ivec3(1), dir_pos);
	ivec3 step_dir = mix(ivec3(-1), ivec3(+1), dir_pos);

	// precompute part of plane projection equation
	// prefer 'pos * inv_dir + bias' over 'inv_dir * (pos - ray_pos)'
	// due to madd instruction
	vec3 inv_dir = 1.0 / ray_dir;
	vec3 bias = inv_dir * -ray_pos;

	float dist;
	{ // allow ray to start outside world texture cube for nice debugging views
		float epsilon = 0.0001; // stop the raymarching from sometimes sampling outside the world textures
		vec3 world_min = vec3(        0.0) + epsilon;
		vec3 world_max = vec3(WORLD_SIZEf) - epsilon;

		// calculate entry and exit coords into whole world cube
		vec3 t0v = mix(world_max, world_min, dir_pos) * inv_dir + bias;
		vec3 t1v = mix(world_min, world_max, dir_pos) * inv_dir + bias;
		float t0 = max(max(t0v.x, t0v.y), t0v.z);
		float t1 = min(min(t1v.x, t1v.y), t1v.z);

		t0 = max(t0, 0.0);
		t1 = max(t1, 0.0);

		// never search further than the caller asked for
		// (previously max_dist was simply overwritten by the world exit distance)
		t1 = min(t1, max_dist);

		// ray misses world texture (or the world lies beyond max_dist)
		if (t1 <= t0)
			return false;

		// adjust ray to start where it hits cube initally
		dist = t0;
		max_dist = t1;
	}

	// the DF value is multiplied by this factor to get a step length along the ray;
	// the 0.999 keeps the step slightly short
	float manhattan_fac = 0.999 / (abs(ray_dir.x) + abs(ray_dir.y) + abs(ray_dir.z));

	vec3 pos = dist * ray_dir + ray_pos;
	ivec3 coord = ivec3(pos);

	int iter = 0;
	for (;;) {
		float df = float(texelFetch(df_tex, coord, 0).r);
		df *= manhattan_fac;

		if (df > 1.0) {
			// DF tells us that we can still step by <df> before we could possibly hit a voxel
			// step via DF raymarching

			// step up to exit of current cell, since DF is safe up until its bounds
			// seems to give a little bit of perf, as this reduces iteration count
			// of course iteration now has more instructions, so could hurt as well
			vec3 t1v = inv_dir * vec3(coord + vox_exit) + bias;
			dist = min(min(t1v.x, t1v.y), t1v.z);

			// compute chunk exit, since DF is not valid for things outside of the chunk it is generated for
			// NOTE(review): 63 / 64 presumably have to match CHUNK_SIZE (power of two) - confirm
			vec3 chunk_exit = vec3((coord & ~63) + vox_exit*64);

			vec3 chunk_t1v = inv_dir * chunk_exit + bias;
			float chunk_t1 = min(min(chunk_t1v.x, chunk_t1v.y), chunk_t1v.z);

			// limit step to exactly on the exit face of the chunk
			dist = min(dist + df, chunk_t1);

			// update pos for next iteration
			pos = dist * ray_dir + ray_pos;

			// fix precision issues with coord calculation when limiting step to on chunk face
			// note: prefer this to adding epsilon to chunk_t1, since that can miss voxels through the diagonals
			if      (chunk_t1v.x == dist) pos.x += float(step_dir.x) * 0.5;
			else if (chunk_t1v.y == dist) pos.y += float(step_dir.y) * 0.5;
			else if (chunk_t1v.z == dist) pos.z += float(step_dir.z) * 0.5;

			// update coord for next iteration
			coord = ivec3(pos);
		} else {
			// we need to check individual voxels by DDA now

			// -1 marks solid voxels (they have 1-voxel border of 0s around them)
			// this avoids one memory read
			// and should eliminate all empty block id reads and thus help improve caching for the DF values by a bit
			if (df < 0.0)
				break;

			vec3 t1v = inv_dir * vec3(coord + vox_exit) + bias;
			dist = min(min(t1v.x, t1v.y), t1v.z);

			// step on axis where exit distance is lowest
			if      (t1v.x == dist) coord.x += step_dir.x;
			else if (t1v.y == dist) coord.y += step_dir.y;
			else                    coord.z += step_dir.z;
		}

		iter++;
		if (iter >= max_iterations || dist >= max_dist)
			return false; // miss
	}

	{ // calc hit info
		// snap ray to voxel entry in case we landed inside a voxel when raymarching
		vec3 vox_entry = vec3(coord) + mix(vec3(1.0), vec3(0.0), dir_pos);
		vec3 t0v = inv_dir * vox_entry + bias;
		dist = max(max(t0v.x, t0v.y), max(t0v.z, 0.0)); // max(, 0.0) to not count faces behind ray

		hit.bid = texelFetch(voxel_tex, coord, 0).r;
		hit.dist = dist;
		hit.pos = dist * ray_dir + ray_pos;

		vec2 uv;
		int face;
		{ // calc hit face, uv and normal
			// the hit face is the one along the axis with the largest offset from the voxel center
			vec3 hit_center = vec3(coord) + 0.5;
			vec3 offs = (hit.pos - hit_center);
			vec3 abs_offs = abs(offs);

			hit.normal = vec3(0.0);

			if (abs_offs.x >= abs_offs.y && abs_offs.x >= abs_offs.z) {
				hit.normal.x = sign(offs.x);
				face = offs.x < 0.0 ? 0 : 1;
				uv = hit.pos.yz;
				if (offs.x < 0.0) uv.x = 1.0 - uv.x;
			} else if (abs_offs.y >= abs_offs.z) {
				hit.normal.y = sign(offs.y);
				face = offs.y < 0.0 ? 2 : 3;
				uv = hit.pos.xz;
				if (offs.y >= 0.0) uv.x = 1.0 - uv.x;
			} else {
				hit.normal.z = sign(offs.z);
				face = offs.z < 0.0 ? 4 : 5;
				uv = hit.pos.xy;
				if (offs.z < 0.0) uv.y = 1.0 - uv.y;
			}
		}

		float texid = float(block_tiles[hit.bid].sides[face]);

		// mip level is chosen from the hit distance
		hit.col = textureLod(tile_textures, vec3(uv, texid), log2(dist)*0.20 - 1.0).rgba;
	}

	return true; // hit
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment