Skip to content

Instantly share code, notes, and snippets.

/*
set isa_file=%~1.isa
set analysis_file=%~1.a
set isa_file
rga --define COMPILER_AMD_RGA=1 --source-kind hlsl --asic Pitcairn --profile cs_5_0 --function %2 --intrinsics --isa %isa_file% %1
*/
#if COMPILER_AMD_RGA
#include "ags_shader_intrinsics_dx11.hlsl"
uint2 ballot(bool pred)
@reinsteam
reinsteam / iaca_output.txt
Created February 23, 2019 04:57
Throughput analysis dump from IACA 2.3
Intel(R) Architecture Code Analyzer Version - 2.3 build:c151d5a (Thu, 6 Jul 2017 09:41:36 +0300)
Analyzed File - aosoa_packet.obj
Binary Format - 64Bit
Architecture - HSW
Analysis Type - Throughput
*******************************************************************
Intel(R) Architecture Code Analyzer Mark Number 1
*******************************************************************
Sigma = 1.0638460811;
X0 = 0.0;
X1 = 0.9580110968;
X2 = 2.01388028375;
G(X0) = 0.3750 = 6.0 / 16.0
G(X1) = 0.2500 = 4.0 / 16.0
G(X2) = 0.0625 = 1.0 / 16.0
@reinsteam
reinsteam / LdsMinMax.hlsl
Created April 6, 2018 15:41
Min/Max reduction example
groupshared float ldsMin[64];
groupshared float ldsMax[64];
void LdsMinMax(uint Idx, uint Ofs)
{
[branch] if (Idx < Ofs)
{
ldsMin[Idx] = min(ldsMin[Idx], ldsMin[Idx + Ofs]);
ldsMax[Idx] = max(ldsMax[Idx], ldsMax[Idx + Ofs]);
}
@reinsteam
reinsteam / TriangleFilteringCS.hlsl
Created April 2, 2018 15:23
Profiling stats of simple triangle filtering shader from [Pyramid](https://github.com/jbarczak/Pyramid)
/*-----------------------------------------------------------------------------------------------------------------------
* Output from Pyramid:
*
* SGPRs: 30 / 102
* VGPRs: 20 / 256
* LDS bytes/tg 32 / 32768
* Waves/Group: 4
* Occupancy:
* S: 10 waves/SIMD
* V: 10 waves/SIMD
@reinsteam
reinsteam / fp_accumulation_limits.c
Created January 5, 2018 13:37
A sample showing an example of finding limits of floating point accumulation
/*------------------------------------------------------------------------------------------------------------------
* A sample that demonstrates 32-bit floating point precision
*
* 'compute_upper_bound_f32' finds, for a given 'x', a floating point number 'upper_bound' such that
* upper_bound + x == upper_bound
*
* 'compute_lower_bound_f32' finds, for a given 'x', a floating point number 'lower_bound' such that
* x + lower_bound == x
*----------------------------------------------------------------------------------------------------------------*/
#include <stdio.h>
// construct SunDir from cosine of an angle between the vector and zenith (MuS)
// The result lies in the YZ plane (x is zero) and has unit length: y holds the cosine, z the matching
// non-negative sine. NOTE(review): presumably the zenith is the +Y axis, since MuS is stored in y - confirm.
SunDir.x = 0.0;
SunDir.y = MuS;
// assumes |MuS| <= 1; otherwise the sqrt argument is negative - TODO confirm the caller clamps MuS
SunDir.z = sqrt(1.0 - MuS * MuS);
// construct EyeDir from cosine of an angle between the vector and zenith (Mu)
// Same construction as for SunDir: unit vector in the YZ plane with y = Mu and z = sqrt(1 - Mu^2).
EyeDir.x = 0.0;
EyeDir.y = Mu;
// assumes |Mu| <= 1; otherwise the sqrt argument is negative - TODO confirm the caller clamps Mu
EyeDir.z = sqrt(1.0 - Mu * Mu);
@reinsteam
reinsteam / ComputeNormals.c
Created August 24, 2017 17:10
Example of mesh normals computation
/* Three-component single-precision vector; components are stored in x, y, z order. */
typedef struct float3
{
    float x;
    float y;
    float z;
} float3;
/* Shorthand for unsigned int. NOTE(review): the name suggests 32 bits, but the width of
 * unsigned int is platform-dependent - confirm on the target. */
typedef unsigned int u32;
/*----------------------------------------------------------------------------------------------------------------------
* input parameters:
* `vertices` - an array storing vertex positions
@reinsteam
reinsteam / EncodeMorton3Bit.c
Created April 15, 2017 12:51
Simplified version of creating Morton codes from two numbers in the range [0; 8). Useful for converting a local thread index in a compute shader to a flattened one for downsampling.
#include <stdio.h>
/*
 * Spreads the three low bits of `x` apart so that a zero bit sits between each of them:
 * bit 0 stays at position 0, bit 1 moves to position 2, bit 2 moves to position 4.
 * All higher bits of `x` are discarded.
 */
int MortonShuffle3Bit(int x)
{
    const int bit0 = x & 0x1;
    const int bit1 = x & 0x2;
    const int bit2 = x & 0x4;

    /* bit1 is already at position 1, so one more step puts it at 2; bit2 needs two steps to reach 4 */
    return bit0 | (bit1 << 1) | (bit2 << 2);
}
int EncodeMorton3Bit(int x, int y)
{
return (MortonShuffle3Bit(y) << 1) | MortonShuffle3Bit(x);
/*----------------------------------------------------------------------------------------------------------------------
* Let's imagine two spheres with a common center and different radii. This function computes the distance from a point
* on the sphere with the smaller radius (RMin) to the surface of the sphere with the bigger radius (RMax) in the direction
* defined by Mu (cosine between the direction from the point on the smaller sphere to the point on the bigger sphere and
* the direction from the point on the smaller sphere to the sphere center)
*
* Complexity : 3 mad, 1 sqrt
*--------------------------------------------------------------------------------------------------------------------*/
float DistanceToSphericalLayer(float RMin, float RMinSq, float RMaxSq, float Mu)
{