reinsteam

## amd_intrinsics_test.hlsl
/*
    set isa_file=%~1.isa
    set analysis_file=%~1.a
    set isa_file
    rga --define COMPILER_AMD_RGA=1 --source-kind hlsl --asic Pitcairn --profile cs_5_0 --function %2 --intrinsics --isa %isa_file% %1
 */
#if COMPILER_AMD_RGA
    #include "ags_shader_intrinsics_dx11.hlsl"

    uint2 ballot(bool pred)

## iaca_output.txt
Intel(R) Architecture Code Analyzer Version - 2.3 build:c151d5a (Thu, 6 Jul 2017 09:41:36 +0300)
Analyzed File - aosoa_packet.obj
Binary Format - 64Bit
Architecture  - HSW
Analysis Type - Throughput

*******************************************************************
Intel(R) Architecture Code Analyzer Mark Number 1
*******************************************************************

## gist:00c60b62dff4df38b61048677831e0af
Sigma = 1.0638460811;
X0 = 0.0;
X1 = 0.9580110968;
X2 = 2.01388028375;
G(X0) = 0.3750 = 6.0 / 16.0
G(X1) = 0.2500 = 4.0 / 16.0
G(X2) = 0.0625 = 1.0 / 16.0

## LdsMinMax.hlsl
groupshared float ldsMin[64];
groupshared float ldsMax[64];

void LdsMinMax(uint Idx, uint Ofs)
{
    [branch] if (Idx < Ofs)
    {
        ldsMin[Idx] = min(ldsMin[Idx], ldsMin[Idx + Ofs]);
        ldsMax[Idx] = max(ldsMax[Idx], ldsMax[Idx + Ofs]);
    }

## TriangleFilteringCS.hlsl
/*-----------------------------------------------------------------------------------------------------------------------
 * Output from Pyramid:
 *
 * SGPRs:           30 / 102
 * VGPRs:           20 / 256
 * LDS bytes/tg     32 / 32768
 * Waves/Group:    4
 * Occupancy:
 *    S: 10 waves/SIMD
 *    V: 10 waves/SIMD

## fp_accumulation_limits.c
/*------------------------------------------------------------------------------------------------------------------
 * A sample that demonstrates 32-bit floating point precision
 *
 *  'compute_upper_bound_f32' finds, for a given 'x', a floating-point number 'upper_bound' such that
 *      upper_bound + x == upper_bound
 *
 *  'compute_lower_bound_f32' finds, for a given 'x', a floating-point number 'lower_bound' such that
 *      x + lower_bound == x
 *----------------------------------------------------------------------------------------------------------------*/
#include <stdio.h>

## gist:ab7c62dc49de29974bfcf37b89dd1d9d
// construct SunDir from the cosine of the angle between the vector and the zenith (MuS)
SunDir.x = 0.0;
SunDir.y = MuS;
SunDir.z = sqrt(1.0 - MuS * MuS);

// construct EyeDir from the cosine of the angle between the vector and the zenith (Mu)
EyeDir.x = 0.0;
EyeDir.y = Mu;
EyeDir.z = sqrt(1.0 - Mu * Mu);

## ComputeNormals.c
/* Three-component single-precision vector with members x, y and z. */
typedef struct float3
{
    float x;
    float y;
    float z;
} float3;

/* Shorthand alias for `unsigned int`; the name suggests 32 bits, but the C standard does not guarantee that width. */
typedef unsigned int u32;

/*----------------------------------------------------------------------------------------------------------------------
 * input parameters:
 * `vertices` - an array storing vertex positions

## EncodeMorton3Bit.c
#include <stdio.h>

/*
 * Spreads the three lowest bits of `x` apart so that a zero bit sits between them:
 * bit 0 stays at position 0, bit 1 moves to position 2 and bit 2 moves to position 4.
 * All higher bits of `x` are discarded.
 */
int MortonShuffle3Bit(int x)
{
    const int bit0 = x & 0x1;
    const int bit1 = (x & 0x2) << 1;
    const int bit2 = (x & 0x4) << 2;

    return bit0 | bit1 | bit2;
}

int EncodeMorton3Bit(int x, int y)
{
    return (MortonShuffle3Bit(y) << 1) | MortonShuffle3Bit(x);

## SphericalLayerDistance.hlsl
/*----------------------------------------------------------------------------------------------------------------------
 * Imagine two spheres with a common center and different radii. This function computes the distance from a point
 * on the sphere with the smaller radius (RMin) to the surface of the sphere with the bigger radius (RMax) in the
 * direction defined by Mu (the cosine of the angle between the direction from the point on the smaller sphere to
 * the point on the bigger sphere and the direction from the point on the smaller sphere to the sphere center).
 *
 * Complexity :  3 mad, 1 sqrt
 *--------------------------------------------------------------------------------------------------------------------*/
float DistanceToSphericalLayer(float RMin, float RMinSq, float RMaxSq, float Mu)
{
	/*
	set isa_file=%~1.isa
	set analysis_file=%~1.a
	set isa_file
	rga --define COMPILER_AMD_RGA=1 --source-kind hlsl --asic Pitcairn --profile cs_5_0 --function %2 --intrinsics --isa %isa_file% %1
	*/
	#if COMPILER_AMD_RGA
	#include "ags_shader_intrinsics_dx11.hlsl"

	uint2 ballot(bool pred)
	Intel(R) Architecture Code Analyzer Version - 2.3 build:c151d5a (Thu, 6 Jul 2017 09:41:36 +0300)
	Analyzed File - aosoa_packet.obj
	Binary Format - 64Bit
	Architecture - HSW
	Analysis Type - Throughput

	*******************************************************************
	Intel(R) Architecture Code Analyzer Mark Number 1
	*******************************************************************
	Sigma = 1.0638460811;
	X0 = 0.0;
	X1 = 0.9580110968;
	X2 = 2.01388028375;
	G(X0) = 0.3750 = 6.0 / 16.0
	G(X1) = 0.2500 = 4.0 / 16.0
	G(X2) = 0.0625 = 1.0 / 16.0
	groupshared float ldsMin[64];
	groupshared float ldsMax[64];

	void LdsMinMax(uint Idx, uint Ofs)
	{
	[branch] if (Idx < Ofs)
	{
	ldsMin[Idx] = min(ldsMin[Idx], ldsMin[Idx + Ofs]);
	ldsMax[Idx] = max(ldsMax[Idx], ldsMax[Idx + Ofs]);
	}
	/*-----------------------------------------------------------------------------------------------------------------------
	* Output from Pyramid:
	*
	* SGPRs: 30 / 102
	* VGPRs: 20 / 256
	* LDS bytes/tg 32 / 32768
	* Waves/Group: 4
	* Occupancy:
	* S: 10 waves/SIMD
	* V: 10 waves/SIMD
	/*------------------------------------------------------------------------------------------------------------------
	* A sample that demonstrates 32-bit floating point precision
	*
	* 'compute_upper_bound_f32' finds, for a given 'x', a floating-point number 'upper_bound' such that
	* upper_bound + x == upper_bound
	*
	* 'compute_lower_bound_f32' finds, for a given 'x', a floating-point number 'lower_bound' such that
	* x + lower_bound == x
	*----------------------------------------------------------------------------------------------------------------*/
	#include <stdio.h>
	// construct SunDir from the cosine of the angle between the vector and the zenith (MuS)
	SunDir.x = 0.0;
	SunDir.y = MuS;
	SunDir.z = sqrt(1.0 - MuS * MuS);

	// construct EyeDir from the cosine of the angle between the vector and the zenith (Mu)
	EyeDir.x = 0.0;
	EyeDir.y = Mu;
	EyeDir.z = sqrt(1.0 - Mu * Mu);
	typedef struct float3
	{
	float x, y, z;
	} float3;

	typedef unsigned int u32;

	/*----------------------------------------------------------------------------------------------------------------------
	* input parameters:
	* `vertices` - an array storing vertex positions
	#include <stdio.h>

	/*
	 * Spreads the three lowest bits of `x` apart so that a zero bit sits between them:
	 * bit 0 stays at position 0, bit 1 moves to position 2 and bit 2 moves to position 4.
	 * All higher bits of `x` are discarded.
	 *
	 * The bitwise OR operators had been written as `\|` (a markdown table escape),
	 * which is not valid C; they are restored to plain `|` here.
	 */
	int MortonShuffle3Bit(int x)
	{
		return (x & 0x1) | ((x & 0x2) << 1) | ((x & 0x4) << 2);
	}

	int EncodeMorton3Bit(int x, int y)
	{
	return (MortonShuffle3Bit(y) << 1) \| MortonShuffle3Bit(x);
	/*----------------------------------------------------------------------------------------------------------------------
	* Imagine two spheres with a common center and different radii. This function computes the distance from a point
	* on the sphere with the smaller radius (RMin) to the surface of the sphere with the bigger radius (RMax) in the
	* direction defined by Mu (the cosine of the angle between the direction from the point on the smaller sphere to
	* the point on the bigger sphere and the direction from the point on the smaller sphere to the sphere center).
	*
	* Complexity : 3 mad, 1 sqrt
	*--------------------------------------------------------------------------------------------------------------------*/
	float DistanceToSphericalLayer(float RMin, float RMinSq, float RMaxSq, float Mu)
	{