reinsteam/TriangleFilteringCS.hlsl

## TriangleFilteringCS.hlsl
/*-----------------------------------------------------------------------------------------------------------------------
 * Output from Pyramid:
 *
 * SGPRs:           30 / 102
 * VGPRs:           20 / 256
 * LDS bytes/tg     32 / 32768
 * Waves/Group:    4
 * Occupancy:
 *    S: 10 waves/SIMD
 *    V: 10 waves/SIMD
 *    L: 2048 groups/CU
 *       8192 waves/CU
 *        2048.00 waves/SIMD
 * Ops:
 *    VALU: 88
 *    S:    35
 *    VMEM: 5
 *
 *--------------------------------------------------------------------------------------------------------------------*/

// Index buffer, read as raw 32-bit words; MainCS unpacks 16-bit indices from the low and high
// halves of each word.
ByteAddressBuffer IdxBuffer;

// Assume position buffer contains clip space X, Y and view space Z (clip space W) for simplicity
// MainCS addresses it at byte offset (index * 12), i.e. three 32-bit values per vertex.
ByteAddressBuffer PosBuffer;

// Output buffer; MainCS stores the per-group visibility mask words here.
RWByteAddressBuffer OutIdxBuffer;

// Culling option bits; MainCS tests bit 0 (0x1) to enable backface culling.
uint cbCullingFlags;

// Returns the smallest of the three components of v.
float Min3(float3 v)
{
    const float lowestXY = min(v.x, v.y);
    return min(lowestXY, v.z);
}

// Returns the largest of the three components of v.
float Max3(float3 v)
{
    const float highestXY = max(v.x, v.y);
    return max(highestXY, v.z);
}

// Group-shared visibility bits: 8 words x 32 bits, one bit per thread of the 256-thread group
// (MainCS selects the word with LocalId >> 5 and the bit with LocalId & 0x1f).
groupshared uint Mask[8];

/*
 * Per-triangle culling pass: one thread per triangle, 256 triangles per thread group.
 *
 * Each thread unpacks the three 16-bit indices of its triangle from IdxBuffer, loads the three
 * vertices from PosBuffer, runs the culling tests below and ORs one bit (1 = triangle survives)
 * into the group-shared Mask. Threads 0 and 1 of the group then store the eight mask words to
 * OutIdxBuffer.
 *
 * ThreadId - global thread index (SV_DispatchThreadID.x); doubles as the triangle index.
 * LocalId  - index of the thread inside its group (SV_GroupThreadID.x), 0..255.
 */
[numthreads(256, 1, 1)]
void MainCS(uint ThreadId : SV_DispatchThreadID, uint LocalId : SV_GroupThreadID)
{
    // LocalId has to be the thread's index within the group: it picks the threads that clear and
    // flush Mask and selects each thread's own word/bit. SV_GroupID would hand the same value to
    // all 256 threads of a group.

    // Clear the shared mask before any thread ORs its bit in.
    [branch] if (LocalId < 8)
    {
        Mask[LocalId] = 0;
    }
    GroupMemoryBarrierWithGroupSync();

    // Assume 16-bit indices (32-bit are obvious)
    // Two triangles (six 16-bit indices) occupy three 32-bit words. Each pair of threads shares
    // one such triple; a thread loads the two words that contain its own triangle.
    const uint Uint3Idx = ThreadId >> 1;
    const uint Uint3Ofs = ThreadId  & 1;

    const uint2 Packet = IdxBuffer.Load2((Uint3Idx * 3 + Uint3Ofs) << 2);

    uint Idx0, Idx1, Idx2;
    if (Uint3Ofs)
    {
        // Odd triangle: starts in the high half of the first loaded word.
        Idx0 = Packet.x >> 16;
        Idx1 = Packet.y & 0xffff;
        Idx2 = Packet.y >> 16;
    }
    else
    {
        // Even triangle: starts in the low half of the first loaded word.
        Idx0 = Packet.x & 0xffff;
        Idx1 = Packet.x >> 16;
        Idx2 = Packet.y & 0xffff;
    }

    /* Load vertices */
    // A vertex is 12 bytes, so Load3 fetches exactly one vertex and never touches the 4 bytes
    // that follow it (a 16-byte load would run past the end of the buffer for the last vertex).
    const float3 Vtx0 = asfloat(PosBuffer.Load3(Idx0 * 12));
    const float3 Vtx1 = asfloat(PosBuffer.Load3(Idx1 * 12));
    const float3 Vtx2 = asfloat(PosBuffer.Load3(Idx2 * 12));

    /* Cull degenerate triangles */
    bool culled = (Idx0 == Idx1) || (Idx1 == Idx2) || (Idx2 == Idx0);

    /* Backface Culling */
    // Enabled by bit 0 of cbCullingFlags; the sign of the determinant of the (X, Y, W) rows
    // decides the facing.
    [flatten] if (cbCullingFlags & 0x1)
    {
        culled = culled || determinant(float3x3(Vtx0, Vtx1, Vtx2)) > 0.0;
    }

    // Regroup per component: X of all three vertices, Y of all three, W of all three.
    float3 VtxX = float3(Vtx0.x, Vtx1.x, Vtx2.x);
    float3 VtxY = float3(Vtx0.y, Vtx1.y, Vtx2.y);
    float3 VtxW = float3(Vtx0.z, Vtx1.z, Vtx2.z);

    /* Near Plane Culling */
    // Only rejects triangles whose three W values are all negative.
    culled = culled || all(VtxW < 0.0);

    /* Clip Space Culling */

    // Convert from clip space to NDC and then to screen space
    // (divide by |W|, then map [-1, 1] to [0, 1] with Y flipped).
    VtxW = rcp(abs(VtxW));
    VtxX *= VtxW *  0.5;
    VtxY *= VtxW * -0.5;

    VtxX += 0.5;
    VtxY += 0.5;

    // Bounding box of the projected triangle.
    float2 Min = float2(Min3(VtxX), Min3(VtxY));
    float2 Max = float2(Max3(VtxX), Max3(VtxY));

    // Reject when the box rounds to the same value on either axis.
    // NOTE(review): Min/Max are in normalized [0, 1] units here, not pixels - confirm that
    // rounding at this scale is the intended small-primitive test.
    culled = culled || any(round(Min) == round(Max));

    /* Frustum Culling */
    // Reject when the box lies entirely outside the [0, 1] range on either axis.
    culled = culled || any(Max < 0.0) || any(Min > 1.0);

    /* Update local mask */
    const uint SlotIdx = LocalId >> 5;   // which of the 8 mask words
    const uint MaskBit = LocalId & 0x1f; // which bit inside that word

    InterlockedOr(Mask[SlotIdx], (!culled) << MaskBit);
    GroupMemoryBarrierWithGroupSync();

    // Output global mask:
    // Threads 0 and 1 each store four of the eight mask words.
    [branch] if (LocalId < 2)
    {
        const uint GroupId = ThreadId >> 8; // 256 threads per group
        const uint MemSlotId = ((GroupId << 1) + LocalId) << 4;
        const uint LdsSlotId = LocalId << 2;

        // The byte offset is (GroupId * 2 + LocalId) * 64 while each store writes 16 bytes.
        // NOTE(review): confirm the consumer of OutIdxBuffer expects this 64-byte stride.
        OutIdxBuffer.Store4(MemSlotId << 2, uint4(Mask[LdsSlotId], Mask[LdsSlotId + 1], Mask[LdsSlotId + 2], Mask[LdsSlotId + 3]));
    }

    // Output indices:
    //OutIdxBuffer.Store2((ThreadId * 3) << 2, uint2(Idx0, Idx1));
    //OutIdxBuffer.Store((ThreadId * 3 + 2) << 2, Idx2);

}
	/*-----------------------------------------------------------------------------------------------------------------------
	* Output from Pyramid:
	*
	* SGPRs: 30 / 102
	* VGPRs: 20 / 256
	* LDS bytes/tg 32 / 32768
	* Waves/Group: 4
	* Occupancy:
	* S: 10 waves/SIMD
	* V: 10 waves/SIMD
	* L: 2048 groups/CU
	* 8192 waves/CU
	* 2048.00 waves/SIMD
	* Ops:
	* VALU: 88
	* S: 35
	* VMEM: 5
	*
	*--------------------------------------------------------------------------------------------------------------------*/

	// Index buffer, read as raw 32-bit words; MainCS unpacks 16-bit indices from the low and high
	// halves of each word.
	ByteAddressBuffer IdxBuffer;

	// Assume position buffer contains clip space X, Y and view space Z (clip space W) for simplicity
	// MainCS addresses it at byte offset (index * 12), i.e. three 32-bit values per vertex.
	ByteAddressBuffer PosBuffer;

	// Output buffer; MainCS stores the per-group visibility mask words here.
	RWByteAddressBuffer OutIdxBuffer;

	// Culling option bits; MainCS tests bit 0 (0x1) to enable backface culling.
	uint cbCullingFlags;

	// Returns the smallest of the three components of v.
	float Min3(float3 v)
	{
		const float lowestXY = min(v.x, v.y);
		return min(lowestXY, v.z);
	}

	// Returns the largest of the three components of v.
	float Max3(float3 v)
	{
		const float highestXY = max(v.x, v.y);
		return max(highestXY, v.z);
	}

	// Group-shared visibility bits: 8 words x 32 bits, one bit per thread of the 256-thread group
	// (MainCS selects the word with LocalId >> 5 and the bit with LocalId & 0x1f).
	groupshared uint Mask[8];

	/*
	 * Per-triangle culling pass: one thread per triangle, 256 triangles per thread group.
	 *
	 * Each thread unpacks the three 16-bit indices of its triangle from IdxBuffer, loads the three
	 * vertices from PosBuffer, runs the culling tests below and ORs one bit (1 = triangle survives)
	 * into the group-shared Mask. Threads 0 and 1 of the group then store the eight mask words to
	 * OutIdxBuffer.
	 *
	 * ThreadId - global thread index (SV_DispatchThreadID.x); doubles as the triangle index.
	 * LocalId  - index of the thread inside its group (SV_GroupThreadID.x), 0..255.
	 */
	[numthreads(256, 1, 1)]
	void MainCS(uint ThreadId : SV_DispatchThreadID, uint LocalId : SV_GroupThreadID)
	{
		// LocalId has to be the thread's index within the group: it picks the threads that clear and
		// flush Mask and selects each thread's own word/bit. SV_GroupID would hand the same value to
		// all 256 threads of a group.

		// Clear the shared mask before any thread ORs its bit in.
		[branch] if (LocalId < 8)
		{
			Mask[LocalId] = 0;
		}
		GroupMemoryBarrierWithGroupSync();

		// Assume 16-bit indices (32-bit are obvious)
		// Two triangles (six 16-bit indices) occupy three 32-bit words. Each pair of threads shares
		// one such triple; a thread loads the two words that contain its own triangle.
		const uint Uint3Idx = ThreadId >> 1;
		const uint Uint3Ofs = ThreadId & 1;

		const uint2 Packet = IdxBuffer.Load2((Uint3Idx * 3 + Uint3Ofs) << 2);

		uint Idx0, Idx1, Idx2;
		if (Uint3Ofs)
		{
			// Odd triangle: starts in the high half of the first loaded word.
			Idx0 = Packet.x >> 16;
			Idx1 = Packet.y & 0xffff;
			Idx2 = Packet.y >> 16;
		}
		else
		{
			// Even triangle: starts in the low half of the first loaded word.
			Idx0 = Packet.x & 0xffff;
			Idx1 = Packet.x >> 16;
			Idx2 = Packet.y & 0xffff;
		}

		/* Load vertices */
		// A vertex is 12 bytes, so Load3 fetches exactly one vertex and never touches the 4 bytes
		// that follow it (a 16-byte load would run past the end of the buffer for the last vertex).
		const float3 Vtx0 = asfloat(PosBuffer.Load3(Idx0 * 12));
		const float3 Vtx1 = asfloat(PosBuffer.Load3(Idx1 * 12));
		const float3 Vtx2 = asfloat(PosBuffer.Load3(Idx2 * 12));

		/* Cull degenerate triangles */
		bool culled = (Idx0 == Idx1) || (Idx1 == Idx2) || (Idx2 == Idx0);

		/* Backface Culling */
		// Enabled by bit 0 of cbCullingFlags; the sign of the determinant of the (X, Y, W) rows
		// decides the facing.
		[flatten] if (cbCullingFlags & 0x1)
		{
			culled = culled || determinant(float3x3(Vtx0, Vtx1, Vtx2)) > 0.0;
		}

		// Regroup per component: X of all three vertices, Y of all three, W of all three.
		float3 VtxX = float3(Vtx0.x, Vtx1.x, Vtx2.x);
		float3 VtxY = float3(Vtx0.y, Vtx1.y, Vtx2.y);
		float3 VtxW = float3(Vtx0.z, Vtx1.z, Vtx2.z);

		/* Near Plane Culling */
		// Only rejects triangles whose three W values are all negative.
		culled = culled || all(VtxW < 0.0);

		/* Clip Space Culling */

		// Convert from clip space to NDC and then to screen space
		// (divide by |W|, then map [-1, 1] to [0, 1] with Y flipped).
		VtxW = rcp(abs(VtxW));
		VtxX *= VtxW *  0.5;
		VtxY *= VtxW * -0.5;

		VtxX += 0.5;
		VtxY += 0.5;

		// Bounding box of the projected triangle.
		float2 Min = float2(Min3(VtxX), Min3(VtxY));
		float2 Max = float2(Max3(VtxX), Max3(VtxY));

		// Reject when the box rounds to the same value on either axis.
		// NOTE(review): Min/Max are in normalized [0, 1] units here, not pixels - confirm that
		// rounding at this scale is the intended small-primitive test.
		culled = culled || any(round(Min) == round(Max));

		/* Frustum Culling */
		// Reject when the box lies entirely outside the [0, 1] range on either axis.
		culled = culled || any(Max < 0.0) || any(Min > 1.0);

		/* Update local mask */
		const uint SlotIdx = LocalId >> 5;   // which of the 8 mask words
		const uint MaskBit = LocalId & 0x1f; // which bit inside that word

		InterlockedOr(Mask[SlotIdx], (!culled) << MaskBit);
		GroupMemoryBarrierWithGroupSync();

		// Output global mask:
		// Threads 0 and 1 each store four of the eight mask words.
		[branch] if (LocalId < 2)
		{
			const uint GroupId = ThreadId >> 8; // 256 threads per group
			const uint MemSlotId = ((GroupId << 1) + LocalId) << 4;
			const uint LdsSlotId = LocalId << 2;

			// The byte offset is (GroupId * 2 + LocalId) * 64 while each store writes 16 bytes.
			// NOTE(review): confirm the consumer of OutIdxBuffer expects this 64-byte stride.
			OutIdxBuffer.Store4(MemSlotId << 2, uint4(Mask[LdsSlotId], Mask[LdsSlotId + 1], Mask[LdsSlotId + 2], Mask[LdsSlotId + 3]));
		}

		// Output indices:
		//OutIdxBuffer.Store2((ThreadId * 3) << 2, uint2(Idx0, Idx1));
		//OutIdxBuffer.Store((ThreadId * 3 + 2) << 2, Idx2);

	}