Skip to content

Instantly share code, notes, and snippets.

@nyorain
Created November 2, 2019 20:19
Show Gist options
  • Save nyorain/afad58f105844e8f642838cc706c6413 to your computer and use it in GitHub Desktop.
Save nyorain/afad58f105844e8f642838cc706c6413 to your computer and use it in GitHub Desktop.
Alternative luminance mipmapping using SIMD
diff --git a/src/deferred/luminance.cpp b/src/deferred/luminance.cpp
index 92351e1..7e2e922 100644
--- a/src/deferred/luminance.cpp
+++ b/src/deferred/luminance.cpp
@@ -256,7 +256,7 @@ void LuminancePass::initBuffers(InitBufferData& data, vk::ImageView light,
dsu.apply();
// mip levels
- const auto mf = (mipGroupDimSize * 4); // minification factor
+ const auto mf = (mipGroupDimSize * 8); // minification factor
const u32 shift = std::log2(mf); // mf is always a power of two
auto i = shift;
@@ -363,7 +363,7 @@ void LuminancePass::record(vk::CommandBuffer cb, vk::Extent2D size) {
tkn::cmdBindComputeDescriptors(cb, extract_.pipeLayout, 0, {extract_.ds});
vk::cmdDispatch(cb, cx, cy, 1);
- const auto mf = (mipGroupDimSize * 4); // minification factor
+ const auto mf = (mipGroupDimSize * 8); // minification factor
auto prevLevel = 0u;
auto i = mip_.target0;
vk::cmdBindPipeline(cb, vk::PipelineBindPoint::compute, mip_.pipe);
diff --git a/src/deferred/luminanceMip.comp b/src/deferred/luminanceMip.comp
index 5d3711e..82b9038 100644
--- a/src/deferred/luminanceMip.comp
+++ b/src/deferred/luminanceMip.comp
@@ -33,8 +33,16 @@ layout(push_constant) uniform PCR {
const uint size = gl_WorkGroupSize.x; // == gl_WorkGroupSize.y
vec2 pixelSize = 1.f / textureSize(inLum, 0);
+// TODO: optimization
+// make groups smaller and shared memory larger (locally reduce a whole vec4 per invocation)
+// bandwidth is probably our bottleneck anyways, no gpu has that many
+// memory lanes.
+// even more? half the invocations are not being used even in the first iteration
+// currently
+
// contains the current summed-up luminance
-shared float lum[size][size];
+// we use vec4s to be able to use simd when adding them up
+shared vec4 lum[size][size];
float load(vec2 pixel) {
vec2 dist = clamp(pcr.inSize - (pixel - 0.5), 0, 1);
@@ -42,20 +50,45 @@ float load(vec2 pixel) {
return fac * texture(inLum, min(pixel, pcr.inSize - 0.5) * pixelSize).r;
}
-// no early returns due to all the barriers. We use a sampler
-// with a black border and clampToBorder instead.
+float load4(vec2 pixel) {
+ return load(pixel + vec2(0, 0))
+ + load(pixel + vec2(2, 0))
+ + load(pixel + vec2(0, 2))
+ + load(pixel + vec2(2, 2));
+}
+
+// no early returns due to all the barriers.
void main() {
uvec2 l = gl_LocalInvocationID.xy;
- vec2 pixel = 4 * gl_GlobalInvocationID.xy; // top-left of sampled pixels
- pixel += 1; // for linear sampling
+ vec2 pixel = 8 * gl_GlobalInvocationID.xy; // top-left of sampled pixels
+ pixel += 1; // for linear sampling, allows us to load 4 pixels per texture call
// first reduction locally
- float val = 0.0;
- val += load(pixel + vec2(0, 0));
- val += load(pixel + vec2(2, 0));
- val += load(pixel + vec2(0, 2));
- val += load(pixel + vec2(2, 2));
- lum[l.x][l.y] = val;
+ // float val = 0.0;
+ // val += load(pixel + vec2(0, 0));
+ // val += load(pixel + vec2(2, 0));
+ // val += load(pixel + vec2(0, 2));
+ // val += load(pixel + vec2(2, 2));
+
+ // uint id;
+ // if(l.x % 2 == 0 && l.y % 2 == 0) lum[l.x / 2][l.y / 2][0] = val;
+ // else if(l.x % 2 == 1 && l.y % 2 == 0) lum[l.x / 2][l.y / 2][1] = val;
+ // else if(l.x % 2 == 0 && l.y % 2 == 1) lum[l.x / 2][l.y / 2][2] = val;
+ // else if(l.x % 2 == 1 && l.y % 2 == 1) lum[l.x / 2][l.y / 2][3] = val;
+
+ // uint id = 2 * (l.y % 2) + l.x % 2; // unique mapping {0, 1}^2 -> {0, 1, 2, 3}
+ // lum[l.x / 2][l.y / 2][id] = val;
+
+ lum[l.x][l.y] = vec4(
+ load4(pixel + vec2(0, 0)),
+ load4(pixel + vec2(4, 0)),
+ load4(pixel + vec2(0, 4)),
+ load4(pixel + vec2(4, 4)));
+ // lum[l.x][l.y] = vec4(
+ // load(pixel + vec2(0, 0)),
+ // load(pixel + vec2(2, 0)),
+ // load(pixel + vec2(0, 2)),
+ // load(pixel + vec2(2, 2)));
for(uint isize = size / 2; isize > 0; isize /= 2) {
// one barrier is enough, memoryBarrierShared is not needed.
// See GL_KHR_vulkan_glsl and spirv spec
@@ -70,7 +103,8 @@ void main() {
}
if(l.x == 0 && l.y == 0) {
- float avg = lum[0][0] / (4 * size * size);
+ float avg = dot(lum[0][0], vec4(1.0)) / (16 * size * size);
+ // float avg = dot(lum[0][0], vec4(1.0)) / (4 * size * size);
imageStore(outLum, ivec2(gl_WorkGroupID.xy), vec4(avg));
}
}
@nyorain
Copy link
Author

nyorain commented Nov 2, 2019

Doesn't seem to be faster though (rather 0.1 ms slower). Not sure why; in theory, I'd guess it should be faster.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment