-
-
Save sebbbi/6cfbec7ab343924dad9b7ee48ef3ba6c to your computer and use it in GitHub Desktop.
// HLSL has no .mips member on RWTexture2D (only SRVs expose .mips), so every
// mip level must be bound as its own single-mip UAV — eight views in total.
// The globallycoherent attribute is required: without it, UAV writes are only
// guaranteed visible inside the writing thread group, not to the other groups
// that consume this data when processing the next mip level.
globallycoherent RWTexture2D<float> MipTextures[8];
// One atomic counter per 2x2 tile of groups (per mip), used below to elect the
// last group in each tile to finish.
RWTexture2D<uint> Counters[8];
// Broadcast slot: the group's thread 0 stores the InterlockedAdd return value
// here so the entire group can branch on it uniformly.
groupshared uint CounterReturnLDS;
[numthreads(16, 16, 1)]
// Builds a full mip pyramid in a single dispatch. Each loop iteration
// downsamples one mip; after each mip, three out of every four groups retire
// and the last group of each 2x2 tile continues to the next level.
// Assumes Counters[] is cleared to zero before the dispatch — TODO confirm.
// NOTE: SV_GroupThreadID was added as an extra input; system-value semantics
// are supplied by the hardware, so existing dispatch code is unaffected.
void GenerateMipPyramid(uint3 Tid : SV_DispatchThreadID, uint3 Group : SV_GroupId, uint Gix : SV_GroupIndex, uint3 GTid : SV_GroupThreadID)
{
    [unroll]
    for (int Mip = 0; Mip < 8-1; ++Mip)
    {
        // 2x2 box-filter downsample: average four source texels into one.
        float Sum =
            MipTextures[Mip][Tid.xy * 2 + uint2(0, 0)] +
            MipTextures[Mip][Tid.xy * 2 + uint2(1, 0)] +
            MipTextures[Mip][Tid.xy * 2 + uint2(0, 1)] +
            MipTextures[Mip][Tid.xy * 2 + uint2(1, 1)];
        MipTextures[Mip+1][Tid.xy] = Sum * 0.25;
        // The four groups forming a 2x2 tile increment the same counter. The
        // pre-increment return value (0..3) tells this group how many of its
        // siblings had already finished.
        if (Gix == 0)
        {
            InterlockedAdd(Counters[Mip][Group.xy / 2], 1, CounterReturnLDS);
        }
        // Full memory barrier. On the next mip the surviving thread group will
        // read data generated by 3 other thread groups; that data must be
        // visible (globallycoherent UAV + this barrier makes it so).
        AllMemoryBarrierWithGroupSync();
        // Kill all groups except the last one to finish in the 2x2 tile. This
        // branch is allowed because CounterReturnLDS is group invariant.
        if (CounterReturnLDS < 3)
        {
            return;
        }
        // BUGFIX: was GroupMemoryBarrierWithGroup(), which is not a valid HLSL
        // intrinsic. The sync ensures all threads have read CounterReturnLDS
        // before the next iteration overwrites it.
        GroupMemoryBarrierWithGroupSync();
        // BUGFIX: was "Tid.xy /= 2". The surviving group can be ANY of the four
        // groups in the tile, so halving its dispatch thread id would alias
        // four threads onto each texel and land in the wrong quadrant for odd
        // group coordinates. Rebuild the thread id from the halved group
        // coordinate so the group covers a full 16x16 region of the next mip.
        Group.xy /= 2;
        Tid.xy = Group.xy * 16 + GTid.xy;
    }
}
There's probably a lot about compute shaders I don't know, but I wonder why this can't be done with just:
layout(local_size_x = 16, local_size_y = 16, local_size_z = 1) in;
// One image view per mip level (up to 12 mips = 4096x4096 base).
layout(rgba8) uniform image2D u_texmip[12];
uniform uint u_mipcount;
// Proposed simpler GLSL variant: every invocation walks down the pyramid,
// retiring once its coordinate falls outside the current mip.
// NOTE(review): memoryBarrierImage() only orders THIS invocation's writes; it
// is not an execution barrier across work groups, so a surviving group may
// read texels another group has not yet written — this snippet has no
// last-group-in-tile election like the HLSL version above. TODO confirm.
void main()
{
uvec2 pos = gl_GlobalInvocationID.xy;
ivec2 pos1 = ivec2(pos);
ivec2 pos2 = ivec2(pos)*2;
// Half the dispatch extent = size of the next (destination) mip.
uvec2 siz = (gl_NumWorkGroups.xy*gl_WorkGroupSize.xy) >> 1;
for (uint i = 0; i < u_mipcount-1; i++)
{
// 2x2 box downsample of mip i into mip i+1.
vec4 c1 = imageLoad(u_texmip[i], pos2+ivec2(0,0));
vec4 c2 = imageLoad(u_texmip[i], pos2+ivec2(1,0));
vec4 c3 = imageLoad(u_texmip[i], pos2+ivec2(0,1));
vec4 c4 = imageLoad(u_texmip[i], pos2+ivec2(1,1));
vec4 cc = (c1+c2+c3+c4)*0.25;
imageStore(u_texmip[i+1], pos1,cc);
// Retire invocations outside the next mip.
// NOTE(review): presumably this should be greaterThanEqual — with
// greaterThan, pos == siz survives one level too long. Verify.
if (any(greaterThan(pos, siz)))
return;
siz >>= 1;
memoryBarrierImage();
}
}
Removing that return changes the timing from 6 ms to 23 ms, so it clearly affects how the thread groups execute. Why is the LDS necessary?
Ah, of course it's necessary because there's no other way to know when 2x2 tile of groups has finished. But if those groups finish in random order, how does that tid.xy /= 2; produce correct coordinates for next iteration?
Any updates on getting a fixed version of this? :)
Hey, we have tried this version of the gist and it is definitely slower on a
Radeon RX 580 and an NVIDIA RTX 2060 than the version in the DirectX MiniEngine.
The MiniEngine version uses LDS for now; if it used wave intrinsics it would be faster.
Can you comment?
On 4096x4096 on RX 580
Gist : 1806240 ns
Miniengine: 875680 ns
Do we do something wrong?
Measured with PIX 1908.02.
Fixed both bugs. See note below