Skip to content

Instantly share code, notes, and snippets.

@reinsteam
Created April 2, 2018 15:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save reinsteam/5c5f5a4e41c0f13f08c75688d563b222 to your computer and use it in GitHub Desktop.
Save reinsteam/5c5f5a4e41c0f13f08c75688d563b222 to your computer and use it in GitHub Desktop.
Profiling stats of a simple triangle-filtering shader, generated with [Pyramid](https://github.com/jbarczak/Pyramid)
/*-----------------------------------------------------------------------------------------------------------------------
* Output from Pyramid:
*
* SGPRs: 30 / 102
* VGPRs: 20 / 256
* LDS bytes/tg 32 / 32768
* Waves/Group: 4
* Occupancy:
* S: 10 waves/SIMD
* V: 10 waves/SIMD
* L: 2048 groups/CU
* 8192 waves/CU
* 2048.00 waves/SIMD
* Ops:
* VALU: 88
* S: 35
* VMEM: 5
*
*--------------------------------------------------------------------------------------------------------------------*/
// Source index buffer, read as raw bytes; MainCS decodes it as packed 16-bit indices.
ByteAddressBuffer IdxBuffer;
// Assume position buffer contains clip space X, Y and view space Z (clip space W) for simplicity.
// MainCS addresses it with a 12-byte stride per vertex (three 32-bit floats).
ByteAddressBuffer PosBuffer;
// Output buffer; MainCS stores the per-group mask words of non-culled triangles into it.
RWByteAddressBuffer OutIdxBuffer;
// Culling option bits; MainCS performs backface culling when bit 0 is set.
uint cbCullingFlags;
// Returns the smallest of the three components of `v`.
float Min3(float3 v)
{
    const float lowestXY = min(v.x, v.y);
    return min(lowestXY, v.z);
}
// Returns the largest of the three components of `v`.
float Max3(float3 v)
{
    const float highestXY = max(v.x, v.y);
    return max(highestXY, v.z);
}
// Group-shared result mask: 8 x 32 bits, matching the 256 threads per group declared on MainCS.
// MainCS clears it, ORs in one bit for each triangle that is not culled, then writes it to OutIdxBuffer.
groupshared uint Mask[8];
/*
 * Triangle filtering entry point: one thread tests one triangle and the group
 * publishes a 256-bit mask (bit set = triangle survived) to OutIdxBuffer.
 *
 * ThreadId - global thread index; selects the triangle to test.
 * LocalId  - index of the thread inside its 256-thread group. It selects the
 *            word/bit inside Mask, so it has to be bound to SV_GroupThreadID
 *            (SV_GroupID is the index of the whole group and is identical for
 *            every thread in it).
 */
[numthreads(256, 1, 1)]
void MainCS(uint ThreadId : SV_DispatchThreadID, uint LocalId : SV_GroupThreadID)
{
    // The first eight threads clear the eight mask words before anyone ORs into them.
    [branch] if (LocalId < 8)
    {
        Mask[LocalId] = 0;
    }
    GroupMemoryBarrierWithGroupSync();

    // Assume 16-bit indices (32-bit are obvious)
    // Two consecutive triangles (six 16-bit indices) occupy three dwords. Each thread
    // loads two of those dwords: dwords 0-1 for the even triangle, 1-2 for the odd one.
    const uint Uint3Idx = ThreadId >> 1;
    const uint Uint3Ofs = ThreadId & 1;
    const uint2 Packet = IdxBuffer.Load2((Uint3Idx * 3 + Uint3Ofs) << 2);

    uint Idx0, Idx1, Idx2;
    if (Uint3Ofs)
    {
        // Odd triangle: starts in the high half of the first loaded dword.
        Idx0 = Packet.x >> 16;
        Idx1 = Packet.y & 0xffff;
        Idx2 = Packet.y >> 16;
    }
    else
    {
        // Even triangle: starts in the low half of the first loaded dword.
        Idx0 = Packet.x & 0xffff;
        Idx1 = Packet.x >> 16;
        Idx2 = Packet.y & 0xffff;
    }

    /* Load vertices */
    // Vertices are 12 bytes apart, so fetch exactly three dwords per vertex.
    const float3 Vtx0 = asfloat(PosBuffer.Load3(Idx0 * 12));
    const float3 Vtx1 = asfloat(PosBuffer.Load3(Idx1 * 12));
    const float3 Vtx2 = asfloat(PosBuffer.Load3(Idx2 * 12));

    /* Cull degenerate triangles */
    bool culled = (Idx0 == Idx1) || (Idx1 == Idx2) || (Idx2 == Idx0);

    /* Backface Culling */
    [flatten] if (cbCullingFlags & 0x1)
    {
        // Sign of the determinant of the (X, Y, W) rows gives the facing.
        culled = culled || determinant(float3x3(Vtx0, Vtx1, Vtx2)) > 0.0;
    }

    float3 VtxX = float3(Vtx0.x, Vtx1.x, Vtx2.x);
    float3 VtxY = float3(Vtx0.y, Vtx1.y, Vtx2.y);
    float3 VtxW = float3(Vtx0.z, Vtx1.z, Vtx2.z);

    /* Near Plane Culling */
    culled = culled || all(VtxW < 0.0);

    // A triangle with some (but not all) vertices at W <= 0 cannot be bounded by
    // projecting its vertices: dividing by |W| mirrors the vertices behind the
    // camera and yields a bounding box unrelated to the visible part. Such
    // triangles are kept and left to the hardware clipper.
    const bool CrossesNearPlane = any(VtxW <= 0.0);

    /* Clip Space Culling */
    // Convert from clip space to NDC and then to normalized [0, 1] screen space (Y flipped)
    VtxW = rcp(abs(VtxW));
    VtxX *= VtxW * 0.5;
    VtxY *= VtxW * -0.5;
    VtxX += 0.5;
    VtxY += 0.5;

    float2 Min = float2(Min3(VtxX), Min3(VtxY));
    float2 Max = float2(Max3(VtxX), Max3(VtxY));

    // Bounding box collapses to the same rounded coordinate on either axis.
    // NOTE(review): Min/Max are in normalized [0, 1] units here, not pixels, so round()
    // only distinguishes the two halves of the screen. A render-target size is presumably
    // meant to be applied before rounding; none is available in this file - confirm.
    const bool SameRoundedBounds = any(round(Min) == round(Max));

    /* Frustum Culling */
    const bool OffScreen = any(Max < 0.0) || any(Min > 1.0);

    culled = culled || (!CrossesNearPlane && (SameRoundedBounds || OffScreen));

    /* Update local mask */
    const uint SlotIdx = LocalId >> 5;   // which of the eight 32-bit words
    const uint MaskBit = LocalId & 0x1f; // which bit inside that word
    InterlockedOr(Mask[SlotIdx], culled ? 0u : (1u << MaskBit));
    GroupMemoryBarrierWithGroupSync();

    // Output global mask:
    // Two threads per group each write four of the eight mask words.
    [branch] if (LocalId < 2)
    {
        const uint GroupId = ThreadId >> 8;
        // NOTE(review): the resulting byte address advances by 64 bytes per writer while
        // each Store4 covers 16 bytes; confirm the consumer expects this sparse layout.
        const uint MemSlotId = ((GroupId << 1) + LocalId) << 4;
        const uint LdsSlotId = LocalId << 2;
        OutIdxBuffer.Store4(MemSlotId << 2, uint4(Mask[LdsSlotId], Mask[LdsSlotId + 1], Mask[LdsSlotId + 2], Mask[LdsSlotId + 3]));
    }

    // Output indices:
    //OutIdxBuffer.Store2((ThreadId * 3) << 2, uint2(Idx0, Idx1));
    //OutIdxBuffer.Store((ThreadId * 3 + 2) << 2, Idx2);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment