@futureengine2
Last active April 29, 2024 01:51
Radiance Cascades 2d GI implementation
static void gi_on_gpu(u8* in_bitmap, int w, int h) {
    #define num_cascades 7
    static bool initialized;
    static gpu_bindgroup_t texture_bindgroup[2];
    static gpu_bindgroup_t cascade_uniform_bindgroup[num_cascades];
    static gpu_bindgroup_t render_uniform_bindgroup;
    static gpu_buffer_t vertex_buffer;
    static gpu_buffer_t uniform_buffer;
    static gpu_pipeline_t pipeline;
    static gpu_bindgroup_layout_t uniform_bindgroup_layout;
    static gpu_bindgroup_layout_t texture_bindgroup_layout;
    static lifetime_t texture_lifetime;
    static gpu_texture_t textures[2];
    static gpu_texture_t input_texture;
    lifetime_t* lifetime = g_platform->lifetime;
    f32 d0 = 1.f;                 // distance between probes in cascade 0
    int r0 = 4;                   // number of rays in cascade 0
    int n0 = (int)floorf(2*w/d0); // number of probes in cascade 0 per dimension
    int cn = num_cascades;
    typedef struct {
        f32 d0;
        int r0;
        int n0;
        int ci;
        int cn;
        int do_render;
        int add_sky_light;
        int padding;
        v2 resolution;
        v2 padding2;
    } uniform_t;
    if (!initialized) {
        lifetime_t temp_lifetime = {0};
        initialized = true;
        // create bindgroup layouts
        uniform_bindgroup_layout = gpu_bindgroup_layout_make(lifetime, &(gpu_bindgroup_layout_desc_t){
            .name = "gi uniform bgl",
            .entries = {
                {
                    .visibility = gpu_visibility_fragment,
                    .type = gpu_binding_type_buffer,
                    .buffer.type = gpu_buffer_binding_type_uniform,
                },
            },
        });
        texture_bindgroup_layout = gpu_bindgroup_layout_make(lifetime, &(gpu_bindgroup_layout_desc_t){
            .name = "gi texture bgl",
            .entries = {
                {
                    .visibility = gpu_visibility_fragment,
                    .type = gpu_binding_type_sampler,
                },
                {
                    .visibility = gpu_visibility_fragment,
                    .type = gpu_binding_type_sampler,
                },
            },
        });
        // create pipeline
        pipeline = gpu_pipeline_make(lifetime, &(gpu_pipeline_desc_t){
            .name = "gi render shader",
            .code = file_read("shaders/gi.glsl", &temp_lifetime).bytes,
            .bgls = {
                uniform_bindgroup_layout,
                texture_bindgroup_layout,
            },
        });
        // create uniform buffer (we pack all our different uniforms in one buffer), one per cascade and one for rendering
        {
            gpu_uniform_packer_t p = gpu_uniform_packer_begin(sizeof(uniform_t), num_cascades+1, lifetime);
            uniform_buffer = p.handle;
            // set cascade uniforms
            for (int i = 0; i < num_cascades; ++i) {
                *(uniform_t*)p.data = (uniform_t){
                    .d0 = d0,
                    .r0 = r0,
                    .n0 = n0,
                    .ci = i,
                    .cn = num_cascades,
                    .add_sky_light = 1,
                    .resolution = {(f32)w,(f32)h},
                };
                cascade_uniform_bindgroup[i] = gpu_bindgroup_make(lifetime, &(gpu_bindgroup_desc_t){
                    .name = "gi",
                    .layout = uniform_bindgroup_layout,
                    .entries = {gpu_uniform_packer_bindgroup_entry(&p)},
                });
                gpu_uniform_packer_next(&p);
            }
            // set render uniform
            *(uniform_t*)p.data = (uniform_t){
                .d0 = d0,
                .r0 = r0,
                .n0 = n0,
                .ci = 0,
                .cn = num_cascades,
                .do_render = 1,
                .resolution = {(f32)w,(f32)h},
            };
            render_uniform_bindgroup = gpu_bindgroup_make(lifetime, &(gpu_bindgroup_desc_t){
                .name = "gi",
                .layout = uniform_bindgroup_layout,
                .entries = {gpu_uniform_packer_bindgroup_entry(&p)},
            });
            gpu_uniform_packer_end(&p);
        }
        // create textures
        input_texture = gpu_texture_make(w, h, gpu_texture_format_rgb8, filter_type_nearest, false, lifetime);
        gpu_texture_set_border(input_texture, (color_t){1,1,1,1});
        textures[0] = gpu_texture_make(r0*n0, n0, gpu_texture_format_rgba8, filter_type_nearest, false, lifetime);
        textures[1] = gpu_texture_make(r0*n0, n0, gpu_texture_format_rgba8, filter_type_nearest, false, lifetime);
        texture_bindgroup[0] = gpu_bindgroup_make(lifetime, &(gpu_bindgroup_desc_t){
            .name = "gi",
            .layout = texture_bindgroup_layout,
            .entries = {
                {.sampler = {input_texture}},
                {.sampler = {textures[0]}},
            },
        });
        texture_bindgroup[1] = gpu_bindgroup_make(lifetime, &(gpu_bindgroup_desc_t){
            .name = "gi",
            .layout = texture_bindgroup_layout,
            .entries = {
                {.sampler = {input_texture}},
                {.sampler = {textures[1]}},
            },
        });
        lifetime_destroy(&temp_lifetime);
    }
    // update input texture
    gpu_texture_set_data(input_texture, in_bitmap);
    // clear texture for pingponging
    gpu_texture_clear(textures[(cn-1)%2], (color_t){0});
    // build cascades
    for (int i = cn-1; i >= 0; --i) {
        drawcall_render(&(drawcall_t){
            .pipeline = pipeline,
            .last_vertex = 6,
            .bindgroups = {cascade_uniform_bindgroup[i], texture_bindgroup[i%2]},
            .outputs = {textures[(i+1)%2]},
        });
    }
    // render
    drawcall_render(&(drawcall_t){
        .pipeline = pipeline,
        .last_vertex = 6,
        .bindgroups = {render_uniform_bindgroup, texture_bindgroup[cn%2]},
    });
    #undef num_cascades
}
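// A rough sketch of how gi_on_gpu might be driven per frame (kept as a comment since it is not
// part of the original gist). The bitmap format follows the scheme the author describes in the
// comments below: white = air, black = occluder, any other color = emitter. paint_world_into()
// and the WORLD_W/WORLD_H constants are made-up placeholders, not part of the real codebase:
//
//   static u8 world_bitmap[WORLD_W * WORLD_H * 3];
//   void frame(void) {
//       paint_world_into(world_bitmap, WORLD_W, WORLD_H); // update occluders/emitters
//       gi_on_gpu(world_bitmap, WORLD_W, WORLD_H);        // build cascades and render the result
//   }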
#ifdef VERTEX_SHADER
out vec2 fuv;
// a vertex shader that spits out a screen-size quad
// call with vertex count = 6
void main(void) {
    vec2[] positions = vec2[](
        vec2(-1,-1),
        vec2(1,-1),
        vec2(1,1),
        vec2(-1,-1),
        vec2(1,1),
        vec2(-1,1)
    );
    vec2 vpos = positions[gl_VertexID%6];
    gl_Position = vec4(vpos, 0, 1);
    fuv = vpos*0.5+0.5;
    fuv.y = 1 - fuv.y;
}
#endif /* VERTEX_SHADER */
#ifdef FRAGMENT_SHADER
layout (std140, binding = 0) uniform Uniform
{
    float d0;             // distance between probes in cascade 0
    int r0;               // number of rays in cascade 0
    int n0;               // number of probes in cascade 0 (per dimension)
    int ci;               // cascade number
    int cn;               // total number of cascades
    int should_do_render; // we switch on this to render instead of building the cascades
    int add_sky_light;    // set to 1 to add sky lighting to uppermost cascade
    int padding;
    vec2 u_resolution;    // resolution of the input texture
    vec2 padding4;
};
layout(binding = 1) uniform sampler2D u_input; // world data that we raytrace through
layout(binding = 2) uniform sampler2D u_prev;  // previous cascade (ping-pong this and the output texture)
in vec2 fuv;
layout(location = 0) out vec4 ocolor;
const float PI = 3.1415927;

// raymarch2d: Implementation of Amanatides & Woo voxel marching algo
struct raymarch2d_t {
    int x;
    int y;
    int sx;
    int sy;
    int ex;
    int ey;
    float tmx;
    float tmy;
    float tdx;
    float tdy;
};
raymarch2d_t raymarch2d_make(float x0, float y0, float x1, float y1) {
    raymarch2d_t res;
    res.x = int(floor(x0));
    res.y = int(floor(y0));
    res.sx = x0 < x1 ? 1 : x1 < x0 ? -1 : 0;
    res.sy = y0 < y1 ? 1 : y1 < y0 ? -1 : 0;
    res.ex = int(floor(x1)) + 2*res.sx;
    res.ey = int(floor(y1)) + 2*res.sy;
    float dx = x1 - x0;
    float dy = y1 - y0;
    float l = 1.f/sqrt(dx*dx + dy*dy);
    dx *= l;
    dy *= l;
    res.tmx = dx == 0 ? 10000000 : (x0 - res.x)/dx;
    res.tmy = dy == 0 ? 10000000 : (y0 - res.y)/dy;
    res.tdx = dx == 0 ? 0 : res.sx/dx;
    res.tdy = dy == 0 ? 0 : res.sy/dy;
    return res;
}
bool raymarch2d_next(inout raymarch2d_t r) {
    if (r.tmx < r.tmy) {
        r.tmx += r.tdx;
        r.x += r.sx;
        return r.x != r.ex;
    }
    else {
        r.tmy += r.tdy;
        r.y += r.sy;
        return r.y != r.ey;
    }
}
vec3 tonemap_aces(vec3 color) {
    const float slope = 12.0;
    vec4 x = vec4(
        color.r, color.g, color.b,
        (color.r * 0.299) + (color.g * 0.587) + (color.b * 0.114)
    );
    const float a = 2.51f;
    const float b = 0.03f;
    const float c = 2.43f;
    const float d = 0.59f;
    const float e = 0.14f;
    vec4 tonemap = clamp((x * (a * x + b)) / (x * (c * x + d) + e), 0.0, 1.0);
    float t = x.a;
    t = t * t / (slope + t);
    return mix(tonemap.rgb, tonemap.aaa, t);
}
vec3 sky_(vec2 angle) {
    float a1 = angle[1];
    float a0 = angle[0];
    // Sky integral formula taken from
    // Analytic Direct Illumination - Mathis
    // https://www.shadertoy.com/view/NttSW7
    const vec3 SkyColor = vec3(0.2,0.5,1.);
    const vec3 SunColor = vec3(1.,0.7,0.1)*10.;
    const float SunA = 2.0;
    const float SunS = 64.0;
    const float SSunS = sqrt(SunS);
    const float ISSunS = 1./SSunS;
    vec3 SI = SkyColor*(a1-a0-0.5*(cos(a1)-cos(a0)));
    SI += SunColor*(atan(SSunS*(SunA-a0))-atan(SSunS*(SunA-a1)))*ISSunS;
    return SI / 6.0;
}
vec3 sky(vec2 angle) {
    // Integrate the radiance from the sky over an interval of directions
    if (angle[1] < 2.0 * PI)
        return sky_(angle);
    return
        sky_(vec2(angle[0], 2.0 * PI)) +
        sky_(vec2(0.0, angle[1] - 2.0 * PI));
}
void main(void) {
    if (should_do_render == 1) {
        // sample probe in cascade 0
        float x = fuv.x * u_resolution.x;
        float y = fuv.y * u_resolution.y;
        float xi = round(x/d0);
        float yi = round(y/d0);
        vec3 c = vec3(0,0,0);
        for (int r = 0; r < r0; ++r) {
            vec2 pixelcoord = floor(vec2(xi*r0 + r, yi)) + 0.5;
            c += texture(u_prev, pixelcoord / textureSize(u_prev, 0)).rgb;
        }
        ocolor = vec4(tonemap_aces(c/r0),1);
    }
    else {
        // build cascade
        int u = int(gl_FragCoord.x);
        int v = int(gl_FragCoord.y);
        int lm = 2; // ray distance branching factor. ray distance = 2^(lm*ci)
        int rm = 1; // ray count branching factor. Num rays for cascade ci = r0*2^(rm*ci) = r0*(1 << rm*ci).
                    // NOTE: increasing this removes the property that the total size of all cascades converges
                    // to 2x the size of cascade 0, and instead leads to a linear size increase
        int n = n0 >> ci;       // number of probes in one dimension
        float d = d0*(1 << ci); // distance between probes
        int rn = r0 << (rm*ci); // number of pixels/rays per probe
        int yi = v;             // probe index
        int xi = u/rn;          // probe index
        int r = u - xi*rn;      // ray index
        float dx = d0*0.5f*(1 << ci);
        float x = xi * d + dx;  // probe pos
        float y = yi * d + dx;  // probe pos
        float l = 0.5 * d0;     // length of ray
        float intensity = 1.0;
        if (xi >= n || xi < 0 || yi >= n || yi < 0) {
            ocolor = vec4(0,0,0,0);
            return;
        }
        float ra = ci == 0 ? 0 : l*(1 << ((ci-1)*lm)); // start of ray length interval
        float rb = l*(1 << (ci*lm));                   // end of ray length interval
        float alpha = 2*PI*(float(r)+0.5)/rn;
        vec2 rot = vec2(cos(alpha), sin(alpha));
        vec2 a = vec2(x,y) + rot*ra; // start of ray
        vec2 b = vec2(x,y) + rot*rb; // end of ray
        raymarch2d_t raym = raymarch2d_make(a.x, a.y, b.x, b.y);
        vec4 col = vec4(0,0,0,0);
        while (raymarch2d_next(raym)) {
            vec3 smp = texture(u_input, vec2((raym.x+0.5)/u_resolution.x, (raym.y+0.5)/u_resolution.y)).rgb;
            if (smp != vec3(1,1,1)) {
                col = vec4(smp*intensity,1);
                break;
            }
        }
        // if no hit, get from upper cascade
        // TODO: do proper alpha blending to support transparent materials. Since we're only dealing with
        // opaque materials for now it's fine
        if (col.a == 0) {
            if (ci == cn-1) {
                if (add_sky_light != 0)
                    col = vec4(sky(vec2(alpha, alpha + 2*PI/rn)) / (2*PI/rn), 1);
                else
                    col = vec4(0,0,0,0);
            }
            else {
                int xi2 = (xi+1)/2; // probe index in upper
                int yi2 = (yi+1)/2; // probe index in upper
                int r2 = r << rm;   // ray index in upper
                int rn2 = rn << rm; // num rays in upper
                int n2 = n >> 1;    // num probes in upper
                // weighting of upper cascade. we can do this magic because we know how the probes are laid out in the grid
                float tx = 0.75 - 0.5*float(xi%2);
                float ty = 0.75 - 0.5*float(yi%2);
                // loop through all the nearby rays in the upper cascade
                // TODO: in the case where there are >2 rays in the upper cascade for each ray in this cascade (i.e. rm > 1),
                // we should choose a better weighting than just treating them all equally
                vec4 upper = vec4(0,0,0,0);
                float frac = 1.0 / (1 << rm);
                for (int ri = 0; ri < (1 << rm); ++ri) {
                    // pixel coordinates of the four nearest upper probes for ray r2+ri
                    vec2 pc1 = floor(vec2(clamp(xi2-1, 0, n2-1)*rn2 + r2 + ri, clamp(yi2-1, 0, n2-1))) + 0.5;
                    vec2 pc2 = floor(vec2(clamp(xi2,   0, n2-1)*rn2 + r2 + ri, clamp(yi2-1, 0, n2-1))) + 0.5;
                    vec2 pc3 = floor(vec2(clamp(xi2-1, 0, n2-1)*rn2 + r2 + ri, clamp(yi2,   0, n2-1))) + 0.5;
                    vec2 pc4 = floor(vec2(clamp(xi2,   0, n2-1)*rn2 + r2 + ri, clamp(yi2,   0, n2-1))) + 0.5;
                    vec4 c = mix(
                        mix(texture(u_prev, pc1 / textureSize(u_prev, 0)), texture(u_prev, pc2 / textureSize(u_prev, 0)), tx),
                        mix(texture(u_prev, pc3 / textureSize(u_prev, 0)), texture(u_prev, pc4 / textureSize(u_prev, 0)), tx),
                        ty
                    );
                    upper += c*frac;
                }
                col = upper;
            }
        }
        ocolor = vec4(col.rgb, 1);
    }
}
#endif /* FRAGMENT_SHADER */
@prime31 commented Nov 17, 2023

Is the base renderer open source (or is there any other example code in the wild)? It looks interesting and I'd love to give it a read to see how it's all put together.

@futureengine2 (Author) commented Nov 17, 2023

I'm afraid it's not publicly available; if there's any particular part you're curious about, I can go into more detail about it :) But for the most part it's just a mish-mash of pre-existing ideas:

  • The main set of graphics primitives (pipelines, bindgroups, etc.) is very similar to the native WebGPU API.
  • Handles (like gpu_texture_t) are (id, generation) pairs to simulate weak references. This is similar to HypeHype's renderer (see the sketch after these structs for a minimal example).
  • Specifying a whole drawcall as a single struct (drawcall_t here) is something I haven't seen too many people do. I believe HypeHype did it in the beginning but ultimately scrapped it for the command-queue approach (like many other APIs) because it's more performant; we cared more about ease of use. The struct looks something like
typedef struct {
    gpu_pipeline_t      pipeline;
    u32                 first_vertex;
    u32                 last_vertex;
    gpu_buffer_t        vertex_buffers[3];
    gpu_buffer_t        index_buffer;
    gpu_texture_t       outputs[3];
    gpu_texture_t       depth;
    gpu_bindgroup_t     bindgroups[3];
    rectu16_t           scissor;
    rectu16_t           viewport;
    u32                 num_instances;
} drawcall_t;
  • Just to make the API a bit easier to use, we use fixed-size arrays in structs for lists, just like sokol does. So gpu_pipeline_info_t looks something like
typedef struct {
    gpu_bindgroup_layout_t bgls[3];
    gpu_pipeline_buffer_desc_t buffers[3];
    u32                       primitive           : 2;
    u32                       blend_function_src  : 4;
    u32                       blend_function_dest : 4;
    ... etc
} gpu_pipeline_info_t;
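For reference, the (id, generation) handle idea from the second bullet above boils down to something like this (a simplified sketch with made-up names and sizes, not the actual implementation):

#include <stdbool.h>
#include <stdint.h>
typedef uint32_t u32;

#define MAX_TEXTURES 1024 // hypothetical pool size

// hypothetical handle layout: a slot index plus a generation counter
typedef struct { u32 id : 24; u32 generation : 8; } gpu_texture_t;

typedef struct {
    u32 generations[MAX_TEXTURES]; // bumped whenever a slot is freed and reused
    // ... per-slot texture data lives here too
} texture_pool_t;

// a handle is only valid while its generation still matches the slot's generation,
// so a stale handle to a destroyed-and-reused slot fails this check instead of aliasing
static bool gpu_texture_is_alive(const texture_pool_t* pool, gpu_texture_t h) {
    return pool->generations[h.id] == h.generation;
}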

@prime31 commented Nov 17, 2023

Many thanks for the response and explanation! I now know why it instantly felt so familiar to me. I'm currently building a new renderer and hit on a lot of the exact same inspirational sources as you: resource handles (like Sokol and Seb), descriptor structs, fixed-size arrays for ease of use, etc.

Your API looks really nice and easy to use, whittled down to just what it needs. I think I might take a few of the API ideas from this gist and work them into my own renderer. Really nice work on it!

@futureengine2 (Author) commented Nov 18, 2023

Sweet, good luck! Let me know if you run into any issues and/or have questions, I'm curious how it goes.

With regard to this, it's still a work in progress, which is why we're hesitant to release it.
For example, the sampling method (nearest/linear, etc.) should really be set on the sampler, not on the texture.

WebGPU gets this right, so I highly recommend looking at it if you haven't already - it's quite readable.
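What that separation might look like, sketched with hypothetical types (not the real ones in this renderer):

/* hypothetical: sampling state lives on its own object instead of on the texture */
typedef enum { filter_nearest, filter_linear } filter_type_t;
typedef enum { wrap_clamp, wrap_repeat, wrap_border } wrap_mode_t;

typedef struct {
    filter_type_t min_filter;
    filter_type_t mag_filter;
    wrap_mode_t   wrap_u, wrap_v;
} gpu_sampler_desc_t;

/* the same texture can then be sampled with different filtering without duplicating it:
   a bindgroup entry pairs a texture handle with a sampler handle, which is how WebGPU models it */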

@futureengine2 (Author)

Oh yeah, the lifetime_t thing is just a stack allocator with support for registering destructor functions. It lets you bundle up the lifetime of multiple objects together and destroy them all at once, which is pretty handy.
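Roughly like this (a simplified sketch that only shows the destructor-registration part; the real thing allocates out of a stack allocator rather than calling malloc per node):

#include <stdlib.h>

typedef void (*destructor_fn)(void* object);

typedef struct destructor_node_t {
    destructor_fn fn;
    void* object;
    struct destructor_node_t* next;
} destructor_node_t;

typedef struct {
    destructor_node_t* head; // destructors run in reverse registration order
} lifetime_t;

static void lifetime_add(lifetime_t* lt, void* object, destructor_fn fn) {
    destructor_node_t* node = malloc(sizeof *node);
    node->fn = fn;
    node->object = object;
    node->next = lt->head;
    lt->head = node;
}

static void lifetime_destroy(lifetime_t* lt) {
    for (destructor_node_t* n = lt->head; n;) {
        destructor_node_t* next = n->next;
        n->fn(n->object); // destroy the registered object
        free(n);
        n = next;
    }
    lt->head = NULL;
}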

@prime31 commented Nov 18, 2023

Thanks for the explanation! I totally figured the lifetime_t was an arena of some sort. Like I said, it was all so clear and obvious just from reading this small bit of it. If you ever release even just the headers, nerds like me would love to read them!

I’m actually using WebGPU for my renderer as well so I am familiar with it.

@ShadedLamp

I was wondering, do you have a complete implementation of this? In particular, how you defined the opaque occluder geometry.

@ShadedLamp

Also, what is the minimum geometry size?

@futureengine2 (Author) commented Mar 14, 2024

I was wondering, do you have a complete implementation of this? Particularly how you defined the opaque occluder geometry for this.

Hey Shaded, this is my recreation of Alexander's 2d implementation here, where he draws on the screen to create occluders and emitters.
I did the same, and just put that info in a texture (called u_input in the shader) using the following scheme:
  • black = an occluder
  • white = transparent (just air)
  • any other color = an emitter

But the algorithm is kind of agnostic to how you do the raytracing. You can do cone tracing through a voxel hierarchy, sphere-trace through an SDF, or whatever.
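For concreteness, filling an input bitmap with that scheme could look roughly like this (the RGB8 layout matches the gpu_texture_format_rgb8 texture above; the helper itself is just an illustration, not engine code):

#include <stdint.h>
#include <string.h>
typedef uint8_t u8;

// Fill a w*h RGB8 bitmap: white = air, black = occluder, any other color = emitter.
static void example_fill_input(u8* bitmap, int w, int h) {
    memset(bitmap, 0xFF, (size_t)w * h * 3);              // everything starts as air (white)
    for (int y = h/2 - 2; y < h/2 + 2; ++y)               // a horizontal occluder bar
        for (int x = w/4; x < 3*w/4; ++x)
            memset(&bitmap[(y*w + x) * 3], 0x00, 3);      // black = occluder
    int ex = w/2, ey = h/4;                               // a single orange emitter pixel
    u8* p = &bitmap[(ey*w + ex) * 3];
    p[0] = 255; p[1] = 160; p[2] = 40;
}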

@futureengine2 (Author) commented Mar 14, 2024

Also, what is the minimum geometry size?

Depending on your raytracing method, the algorithm's performance can be independent of the amount of geometry.
I believe that in Path of Exile 2, Alexander uses something like a G-buffer for raytracing (which works since it's top-down, so-called 2.5D), so as long as you can render your geometry to a G-buffer you're good.
I've seen other solutions that precompute a voxelization of the scene and cone-trace through that, which also makes performance independent of the geometry.

@octanejohn

Can this idea merge with, or at least use parts of, what Godot's HDDAGI does?

@futureengine2 (Author) commented Mar 24, 2024

I'm not super familiar with HDDAGI, but if I understand correctly it's a data structure that allows for efficient raytracing. So yeah, you could use this method to decide which rays to sample and how to combine the results, and then use HDDAGI to trace those rays through the scene.

@octanejohn

Does this idea have to be screen space, or can it go to world space so that there aren't visual artifacts when merging?

@futureengine2 (Author) commented Mar 24, 2024

Yes, you can definitely do this in 3D world space. I haven't done it myself, but Alexander has a world-space demo on his YouTube.

I can't think of any reason the artefacts should be bad (in fact I believe some artefacts, like ringing in 2D, don't appear in 3D). I don't know of anyone who has implemented a production-ready 3D version though.

There's still research going on to reduce artefacts over on the Graphics Programming Discord; I recommend you check it out.

@octanejohn

I saw it, but to me it looks like it can't produce data on its own outside the camera view (turning the camera away from a light loses the data on the wall). That's why I'm trying to see if it can merge with HDDAGI to help it with world space, like AMD's Brixelizer caching idea does.

@futureengine2 (Author)

Looks like he's got two 3D videos, one in screen space and one in world space.
Here's an example of the world-space one; you can see that he gets light from the models to the left and right, outside of the view frustum:

https://youtu.be/5Ua-h1pg6yM?si=c6wdsT-LzlQTPC_l&t=37

There are some other artifacts going on that probably come from things like a low number of cascades or a low ray multiplication factor; it probably could've used more parameter tweaking.
This is a neat website tmpvar made that lets you play with some of the parameters (screen space only though): https://tmpvar.com/poc/radiance-cascades/#flatland-2d

@futureengine2 (Author) commented Mar 25, 2024

Btw this method is literally just a cleverer way of laying out and combining the results of your probes. How you calculate the value of your rays is entirely up to you.
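To make that layout concrete with the defaults in the code above (r0 = 4, rm = 1, lm = 2): cascade ci stores r0 * 2^ci rays per probe over (n0 / 2^ci)^2 probes, so the region it occupies is (r0 * n0) x (n0 / 2^ci) pixels (constant width, halving height), and the total storage converges to at most twice that of cascade 0. Meanwhile the ray interval for cascade ci > 0 is [0.5 * d0 * 4^(ci-1), 0.5 * d0 * 4^ci], so each cascade traces rays 4x as long with twice the angular resolution of the one below it.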

@octanejohn

Awesome, thanks for chatting!
