Skip to content

Instantly share code, notes, and snippets.

@Reedbeta
Created July 7, 2016 06:19
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Reedbeta/6633fc9a67e64377c2cd6eaccd1e8c4c to your computer and use it in GitHub Desktop.
Save Reedbeta/6633fc9a67e64377c2cd6eaccd1e8c4c to your computer and use it in GitHub Desktop.
GPU PRNG & hash-function testbed
// COM pointer - wraps a COM object and automatically calls AddRef() / Release() as necessary
#pragma once
// Smart pointer for COM-style objects (any type exposing AddRef() / Release()).
//
// Ownership rules:
//  - Constructing from, or assigning, a raw pointer ADOPTS the caller's reference
//    (no AddRef() is performed).
//  - Copy-constructing from, or assigning, another comptr SHARES the object
//    (AddRef() is performed).
//  - The destructor and release() drop the held reference, if any.
template <typename T>
class comptr
{
    // The converting constructor / assignment need to read m_pT of a comptr<U>,
    // which is a different class with a protected member.
    template <typename U> friend class comptr;

public:
    // Creates an empty (NULL) pointer.
    comptr ()
    :   m_pT(NULL) {}

    // Adopts pT without calling AddRef(); the caller hands over its reference.
    explicit comptr (T * pT)
    :   m_pT(pT) {}

    // Shares the object held by comptrOther (AddRef() if non-NULL).
    comptr (const comptr<T> & comptrOther)
    :   m_pT(comptrOther.m_pT)
    {
        if (m_pT)
            m_pT->AddRef();
    }

    // Converting copy: U * must be implicitly convertible to T *.
    template <typename U>
    comptr (const comptr<U> & comptrOther)
    :   m_pT(comptrOther.m_pT)
    {
        if (m_pT)
            m_pT->AddRef();
    }

    // Same-type copy assignment. This must be spelled out: a member template never
    // counts as the copy assignment operator, so without it the compiler-generated
    // one would copy the raw pointer without AddRef() and cause a double Release().
    comptr<T> & operator = (const comptr<T> & comptrOther)
    {
        share(comptrOther.m_pT);
        return *this;
    }

    // Converting copy assignment: U * must be implicitly convertible to T *.
    template <typename U>
    comptr<T> & operator = (const comptr<U> & comptrOther)
    {
        share(comptrOther.m_pT);
        return *this;
    }

    // Drops the current reference and adopts pU without calling AddRef().
    template <typename U>
    comptr<T> & operator = (U * pU)
    {
        release();
        m_pT = pU;
        return *this;
    }

    ~comptr ()
    {
        release();
    }

    // True when a non-NULL object is held.
    operator bool () const
    {
        return (m_pT != NULL);
    }

    // Implicit access to the raw pointer (no reference count change).
    operator T * () const
    {
        return m_pT;
    }

    T * operator -> () const
    {
        return m_pT;
    }

    // Note: returns the raw pointer, not a reference to the object.
    T * operator * () const
    {
        return m_pT;
    }

    // Address of the internal pointer, e.g. for out-parameters or for passing a
    // one-element array of interfaces. Does NOT release the current object, so
    // only use it as an out-parameter while the comptr is empty.
    T ** address ()
    {
        return &m_pT;
    }

    // Drops the held reference (if any) and resets to NULL.
    void release ()
    {
        if (m_pT)
        {
            m_pT->Release();
            m_pT = NULL;
        }
    }

protected:
    // Replaces the held object with pNew, taking a new reference on it.
    // AddRef() happens before Release() so self-assignment can never destroy
    // the object that is about to be kept.
    void share (T * pNew)
    {
        if (pNew)
            pNew->AddRef();
        T * pOld = m_pT;
        m_pT = pNew;
        if (pOld)
            pOld->Release();
    }

    T * m_pT;
};
// GPU pseudorandom number generator demo
// Written by Nathan Reed, January 2013
#include <stdio.h>
#include <time.h>
#include <d3d11.h>
#include <d3dcompiler.h>
#include <vector>
#include "comptr.hpp"
#if defined(_MSC_VER) && _MSC_VER <= 1500 // MSVC 2008 and earlier don't have stdint.h
typedef unsigned int uint32_t;
#else
#include <stdint.h>
#endif
// Forward declarations; definitions follow main().
// Prints the command-line usage text to stdout.
void PrintHelp ();
// Creates the D3D11 device, buffers, timing queries and compute shader.
// Returns false (after printing a diagnostic to stderr) if any step fails.
bool InitD3D ();
// Releases every D3D object held in the g_p* globals.
void ShutdownD3D ();
// Dispatches the compute shader, reports GPU timing, and copies the results into
// g_data when an output file was requested.
void GenerateGPU ();
// Runs one of the "deep" generators on the CPU, filling g_data, and reports timing.
void GenerateCPU ();
// Writes g_data as a 1-bit-per-pixel BMP if -obmp was given; otherwise does nothing.
void WriteBmp ();
// Writes g_data as raw binary if -oraw was given; otherwise does nothing.
void WriteRaw ();
// Generated data. Filled by GenerateCPU(), or by GenerateGPU() when an output
// file was requested; consumed by WriteBmp() and WriteRaw().
std::vector<uint32_t> g_data;
// Graphics state. All of these are created in InitD3D() and released in ShutdownD3D().
comptr<ID3D11Device> g_pDevice;
comptr<ID3D11DeviceContext> g_pContext;
// Structured buffer (4-byte elements) the compute shader writes into, and its UAV.
comptr<ID3D11Buffer> g_pBufferOut;
comptr<ID3D11UnorderedAccessView> g_pUAVOut;
// Compute shader compiled from the g_apChzShader template selected by g_prng.
comptr<ID3D11ComputeShader> g_pCS;
// Timestamp queries bracketing the dispatches, plus the disjoint query that
// supplies the timestamp frequency and validity.
comptr<ID3D11Query> g_pQueryBegin, g_pQueryEnd, g_pQueryDisjoint;
// CPU-readable staging copy of g_pBufferOut, used to read the results back.
comptr<ID3D11Buffer> g_pBufferStaging;
// Command-line settings
// Selects which generator / hash function to run.
// The enumerator order must match the entries of g_apChzPrng and g_apChzShader,
// both of which are indexed by this enum. main() runs every value below
// PRNG_LCGDeep on the GPU and the remaining "deep" values on the CPU.
enum PRNG
{
PRNG_LCG,
PRNG_MWC,
PRNG_Xorshift,
PRNG_JenkinsHash,
PRNG_JenkinsHash2,
PRNG_WangHash,
PRNG_FNVHash,
PRNG_WangThenLCG,
PRNG_WangThenXorshift,
PRNG_LCGFloat,
PRNG_Noise,
// CPU-only generators start here (see the g_prng < PRNG_LCGDeep test in main)
PRNG_LCGDeep,
PRNG_XorshiftDeep,
PRNG_WangDeep,
// Number of generators; used to size the lookup tables
PRNG_Max,
};
// Human-readable generator names, indexed by PRNG; printed in the timing reports.
const char * g_apChzPrng[PRNG_Max] =
{
"LCG", // PRNG_LCG
"MWC", // PRNG_MWC
"Xorshift", // PRNG_Xorshift
"Jenkins lookup3 hash", // PRNG_JenkinsHash
"Jenkins integer hash", // PRNG_JenkinsHash2
"Wang integer hash", // PRNG_WangHash
"FNV hash", // PRNG_FNVHash
"Wang hash init, then LCG", // PRNG_WangThenLCG
"Wang hash init, then Xorshift", // PRNG_WangThenXorshift
"LCG float", // PRNG_LCGFloat
"High-frequency noise", // PRNG_Noise
"LCG deep", // PRNG_LCGDeep
"Xorshift deep", // PRNG_XorshiftDeep
"Wang hash deep", // PRNG_WangDeep
};
// Generator to run; set by the generator-selection switches (-lcg, -wang, ...).
PRNG g_prng = PRNG_WangHash;
// Number of 32-bit values to generate (-n).
uint32_t g_cIntGenerate = 512 * 512;
// Threads per compute-shader thread group (-t); substituted into [numthreads(...)].
uint32_t g_cThreadsPerGroup = 256;
// Generator iterations per output value (-s); substituted into the shader's loop
// bound. GenerateCPU() does not read this setting.
uint32_t g_cRngSteps = 1;
// Number of times the whole generation pass is repeated for timing (-r).
uint32_t g_cTimingReps = 1;
// Output file names (-obmp / -oraw); NULL means that output is not written.
const char * g_pChzBmpOut = NULL;
const char * g_pChzRawOut = NULL;
// When true (-dis), InitD3D() prints the compiled shader's disassembly.
bool g_fDisassemble = false;
// Shader source (embedded, at end of file)
extern const char * g_apChzShader[PRNG_Max];
int main (int cArg, const char ** apChzArg)
{
// Parse command-line arguments
for (int i = 1; i < cArg; ++i)
{
if (_stricmp(apChzArg[i], "-h") == 0)
{
PrintHelp();
return 0;
}
else if (_stricmp(apChzArg[i], "-lcg") == 0)
{
g_prng = PRNG_LCG;
}
else if (_stricmp(apChzArg[i], "-mwc") == 0)
{
g_prng = PRNG_MWC;
}
else if (_stricmp(apChzArg[i], "-xorshift") == 0)
{
g_prng = PRNG_Xorshift;
}
else if (_stricmp(apChzArg[i], "-jenkins") == 0)
{
g_prng = PRNG_JenkinsHash;
}
else if (_stricmp(apChzArg[i], "-jenkins2") == 0)
{
g_prng = PRNG_JenkinsHash2;
}
else if (_stricmp(apChzArg[i], "-wang") == 0)
{
g_prng = PRNG_WangHash;
}
else if (_stricmp(apChzArg[i], "-fnv") == 0)
{
g_prng = PRNG_FNVHash;
}
else if (_stricmp(apChzArg[i], "-wang-then-lcg") == 0)
{
g_prng = PRNG_WangThenLCG;
}
else if (_stricmp(apChzArg[i], "-wang-then-xorshift") == 0)
{
g_prng = PRNG_WangThenXorshift;
}
else if (_stricmp(apChzArg[i], "-lcg-float") == 0)
{
g_prng = PRNG_LCGFloat;
}
else if (_stricmp(apChzArg[i], "-noise") == 0)
{
g_prng = PRNG_Noise;
}
else if (_stricmp(apChzArg[i], "-lcg-deep") == 0)
{
g_prng = PRNG_LCGDeep;
}
else if (_stricmp(apChzArg[i], "-xorshift-deep") == 0)
{
g_prng = PRNG_XorshiftDeep;
}
else if (_stricmp(apChzArg[i], "-wang-deep") == 0)
{
g_prng = PRNG_WangDeep;
}
else if (_stricmp(apChzArg[i], "-dis") == 0)
{
g_fDisassemble = true;
}
else if (_stricmp(apChzArg[i], "-n") == 0)
{
if (sscanf(apChzArg[++i], "%u", &g_cIntGenerate) != 1)
fprintf(stderr, "Invalid number of integers \"%s\"; ignoring\n", apChzArg[i]);
}
else if (_stricmp(apChzArg[i], "-obmp") == 0)
{
g_pChzBmpOut = apChzArg[++i];
}
else if (_stricmp(apChzArg[i], "-oraw") == 0)
{
g_pChzRawOut = apChzArg[++i];
}
else if (_stricmp(apChzArg[i], "-r") == 0)
{
if (sscanf(apChzArg[++i], "%u", &g_cTimingReps) != 1)
fprintf(stderr, "Invalid number of timing reps \"%s\"; ignoring\n", apChzArg[i]);
}
else if (_stricmp(apChzArg[i], "-s") == 0)
{
if (sscanf(apChzArg[++i], "%u", &g_cRngSteps) != 1)
fprintf(stderr, "Invalid number of RNG steps \"%s\"; ignoring\n", apChzArg[i]);
}
else if (_stricmp(apChzArg[i], "-t") == 0)
{
if (sscanf(apChzArg[++i], "%u", &g_cThreadsPerGroup) != 1)
fprintf(stderr, "Invalid number of threads \"%s\"; ignoring\n", apChzArg[i]);
}
else
{
fprintf(stderr, "Unrecognized command-line parameter \"%s\"; ignoring\n", apChzArg[i]);
}
}
if (g_prng < PRNG_LCGDeep)
{
if (!InitD3D())
{
ShutdownD3D();
return 1;
}
GenerateGPU();
ShutdownD3D();
}
else
{
GenerateCPU();
}
WriteBmp();
WriteRaw();
return 0;
}
// Prints the usage text to stdout. Every switch accepted by main() is listed
// here, including -noise, which was previously accepted but undocumented.
void PrintHelp ()
{
    printf(
        "gpu-prng written by Nathan Reed, January 2013.\n"
        "Usage: gpu-prng [options]\n"
        "Available options:\n"
        " -h Print this message\n"
        "\n"
        " -lcg Linear congruential generator\n"
        " -mwc Multiply-with-carry generator\n"
        " -xorshift Xorshift generator\n"
        " -jenkins Jenkins \"lookup3\" hash\n"
        " -jenkins2 Another Jenkins hash, from his \"Integer Hashing\" page\n"
        " -wang Thomas Wang's integer hash\n"
        " -fnv FNV hash\n"
        " -wang-then-lcg Wang hash init, then LCG\n"
        " -wang-then-xorshift Wang hash init, then Xorshift\n"
        " -lcg-float Linear congruential generator, converted to float\n"
        " -noise High-frequency noise\n"
        " -lcg-deep Linear congruential generator, deep instead of wide\n"
        " -xorshift-deep Xorshift generator, deep instead of wide\n"
        " -wang-deep Wang hash, deep instead of wide\n"
        "\n"
        " -dis Print disassembly of compute shader\n"
        " -n NUM Number of values to generate\n"
        " -obmp FILENAME Write generated values to BMP file (one bit per pixel, max dim 512x512)\n"
        " -oraw FILENAME Write generated values to raw binary file\n"
        " -r NUM Number of repetitions of whole generation process (for timing)\n"
        " -s NUM Number of RNG steps to run for each generated value\n"
        " -t NUM Number of threads per group for compute shader\n"
        );
}
bool InitD3D ()
{
// Initialize the device
D3D_FEATURE_LEVEL featureLevel = D3D_FEATURE_LEVEL(0);
#ifdef _DEBUG
UINT flags = D3D11_CREATE_DEVICE_DEBUG;
#else
UINT flags = 0;
#endif
if (FAILED(D3D11CreateDevice(
NULL,
D3D_DRIVER_TYPE_HARDWARE,
NULL,
flags,
NULL,
0,
D3D11_SDK_VERSION,
g_pDevice.address(),
&featureLevel,
g_pContext.address())))
{
fprintf(stderr, "Couldn't create D3D11 device\n");
return false;
}
if (featureLevel < D3D_FEATURE_LEVEL_11_0)
{
fprintf(stderr, "Minimum feature level required is D3D11!\n");
return false;
}
// Allocate memory to store generated output integers
uint32_t cB = g_cIntGenerate * 4;
D3D11_BUFFER_DESC bufferDesc =
{
cB,
D3D11_USAGE_DEFAULT,
D3D11_BIND_UNORDERED_ACCESS,
0,
D3D11_RESOURCE_MISC_BUFFER_STRUCTURED,
4,
};
if (FAILED(g_pDevice->CreateBuffer(&bufferDesc, NULL, g_pBufferOut.address())))
{
fprintf(stderr, "Couldn't create output buffer\n");
return false;
}
D3D11_UNORDERED_ACCESS_VIEW_DESC uavDesc =
{
DXGI_FORMAT_UNKNOWN,
D3D11_UAV_DIMENSION_BUFFER,
{
{ 0, g_cIntGenerate, 0 },
},
};
if (FAILED(g_pDevice->CreateUnorderedAccessView(g_pBufferOut, &uavDesc, g_pUAVOut.address())))
{
fprintf(stderr, "Couldn't create UAV\n");
return false;
}
// Create staging buffer to allow transfer to the CPU
D3D11_BUFFER_DESC bufferDescStaging =
{
cB,
D3D11_USAGE_STAGING,
0,
D3D11_CPU_ACCESS_READ,
0,
0,
};
if (FAILED(g_pDevice->CreateBuffer(&bufferDescStaging, NULL, g_pBufferStaging.address())))
{
fprintf(stderr, "Couldn't create staging buffer\n");
return false;
}
// Create the queries to time the operation
D3D11_QUERY_DESC queryDescTs = { D3D11_QUERY_TIMESTAMP, 0 };
D3D11_QUERY_DESC queryDescDisjoint = { D3D11_QUERY_TIMESTAMP_DISJOINT, 0 };
if (FAILED(g_pDevice->CreateQuery(&queryDescTs, g_pQueryBegin.address())) ||
FAILED(g_pDevice->CreateQuery(&queryDescTs, g_pQueryEnd.address())) ||
FAILED(g_pDevice->CreateQuery(&queryDescDisjoint, g_pQueryDisjoint.address())))
{
fprintf(stderr, "Couldn't create timing queries\n");
return false;
}
// Generate code for the compute shader
char aChzShader[1024];
if (_snprintf(aChzShader, 1024, g_apChzShader[g_prng], g_cThreadsPerGroup, g_cRngSteps) < 0)
{
fprintf(stderr, "Shader code too long\n");
return false;
}
// Compile the compute shader
comptr<ID3DBlob> pBlobIL;
comptr<ID3DBlob> pBlobErrors;
if (FAILED(D3DCompile(
aChzShader,
strlen(aChzShader),
NULL,
NULL,
NULL,
"cs_main",
"cs_5_0",
D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_OPTIMIZATION_LEVEL3 | D3DCOMPILE_WARNINGS_ARE_ERRORS,
0,
pBlobIL.address(),
pBlobErrors.address())))
{
if (pBlobErrors)
{
fprintf(stderr, "Couldn't compile compute shader (stage 1):\n\n%s\n", pBlobErrors->GetBufferPointer());
return false;
}
else
{
fprintf(stderr, "Couldn't compile compute shader (stage 1)\n");
return false;
}
}
if (FAILED(g_pDevice->CreateComputeShader(
pBlobIL->GetBufferPointer(),
pBlobIL->GetBufferSize(),
NULL,
g_pCS.address())))
{
fprintf(stderr, "Couldn't compile compute shader (stage 2)\n");
return false;
}
if (g_fDisassemble)
{
// Disassemble the shader
comptr<ID3DBlob> pBlobDisassembly;
if (SUCCEEDED(D3DDisassemble(
pBlobIL->GetBufferPointer(),
pBlobIL->GetBufferSize(),
0,
NULL,
pBlobDisassembly.address())))
{
printf("Compute shader disassembly:\n%s\n", pBlobDisassembly->GetBufferPointer());
}
else
{
fprintf(stderr, "Couldn't disassemble compute shader\n");
}
}
return true;
}
// Releases every D3D object held in the g_p* globals. comptr::release() does
// nothing for a pointer that is already NULL, so this can be called after a
// failed or partial InitD3D() (as main() does).
void ShutdownD3D ()
{
// Objects created from the device go first...
g_pQueryBegin.release();
g_pQueryEnd.release();
g_pQueryDisjoint.release();
g_pCS.release();
g_pUAVOut.release();
g_pBufferOut.release();
g_pBufferStaging.release();
// ...then the context, and the device itself last.
g_pContext.release();
g_pDevice.release();
}
void GenerateGPU ()
{
// Set up GPU state
g_pContext->CSSetShader(g_pCS, NULL, 0);
g_pContext->CSSetUnorderedAccessViews(0, 1, g_pUAVOut.address(), NULL);
// Timestamp before generation
g_pContext->Begin(g_pQueryDisjoint);
g_pContext->End(g_pQueryBegin);
// Dispatch compute shader to generate the numbers. Do it many times for timing purposes.
uint32_t cThreadGroup = g_cIntGenerate / g_cThreadsPerGroup;
if (cThreadGroup > 65535)
{
fprintf(stderr, "Too many threadgroups (%d; maximum is 65535).\n", cThreadGroup);
cThreadGroup = 65535;
}
for (uint32_t i = 0; i < g_cTimingReps; ++i)
g_pContext->Dispatch(cThreadGroup, 1, 1);
// Timestamp after generation
g_pContext->End(g_pQueryEnd);
g_pContext->End(g_pQueryDisjoint);
// Wait for GPU to finish work
g_pContext->Flush();
while (g_pContext->GetData(g_pQueryDisjoint, NULL, 0, 0) == S_FALSE)
{
Sleep(0);
}
// Calculate time taken
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT timestampDisjoint;
if (g_pContext->GetData(g_pQueryDisjoint, &timestampDisjoint, sizeof(timestampDisjoint), 0) != S_OK)
{
fprintf(stderr, "Couldn't retrieve timestamp disjoint query data\n");
return;
}
if (timestampDisjoint.Disjoint)
{
fprintf(stderr, "Timestamps reportedly disjoint; can't retrieve GPU timing data\n");
return;
}
unsigned long long timestampBegin, timestampEnd;
if (g_pContext->GetData(g_pQueryBegin, &timestampBegin, sizeof(timestampBegin), 0) != S_OK ||
g_pContext->GetData(g_pQueryEnd, &timestampEnd, sizeof(timestampEnd), 0) != S_OK)
{
fprintf(stderr, "Couldn't retrieve timestamp query data\n");
return;
}
float timeInSeconds = float(timestampEnd - timestampBegin) / float(timestampDisjoint.Frequency);
printf(
"%s, %u * %d ints, %d steps, %d threads/group - generated on GPU in %0.2f ms, average rate %0.2f G/sec\n",
g_apChzPrng[g_prng],
g_cIntGenerate,
g_cTimingReps,
g_cRngSteps,
g_cThreadsPerGroup,
1000.0f * timeInSeconds,
float(g_cTimingReps) * float(g_cIntGenerate) / timeInSeconds * 1e-9f);
if (g_pChzBmpOut || g_pChzRawOut)
{
// Copy the results into CPU memory space. Unfortunately, we can't just map the UAV;
// we have to copy the data to a staging buffer and map that.
g_pContext->CopyResource(g_pBufferStaging, g_pBufferOut);
D3D11_MAPPED_SUBRESOURCE mappedSubresource;
if (FAILED(g_pContext->Map(g_pBufferStaging, 0, D3D11_MAP_READ, 0, &mappedSubresource)))
{
fprintf(stderr, "Couldn't map staging buffer for writing output\n");
return;
}
g_data.resize(g_cIntGenerate);
memcpy(&g_data[0], mappedSubresource.pData, g_cIntGenerate * 4);
g_pContext->Unmap(g_pBufferStaging, 0);
}
}
// Fills g_data with g_cIntGenerate values from the selected "deep" generator,
// where each value is produced from the previous one starting at a fixed seed.
// The whole pass is repeated g_cTimingReps times (each pass restarts from the
// seed and overwrites g_data), then the CPU time and rate are printed to stdout.
void GenerateCPU ()
{
    g_data.resize(g_cIntGenerate);

    const clock_t clockStart = clock();
    const uint32_t seedInitial = 47;

    for (uint32_t cRepDone = 0; cRepDone != g_cTimingReps; ++cRepDone)
    {
        // Every pass restarts the sequence from the same seed
        uint32_t x = seedInitial;

        if (g_prng == PRNG_LCGDeep)
        {
            // Linear congruential generator
            for (uint32_t iOut = 0; iOut != g_cIntGenerate; ++iOut)
            {
                x = x * 1664525u + 1013904223u;
                g_data[iOut] = x;
            }
        }
        else if (g_prng == PRNG_XorshiftDeep)
        {
            // Xorshift with the 13 / 17 / 5 shift triple
            for (uint32_t iOut = 0; iOut != g_cIntGenerate; ++iOut)
            {
                x ^= x << 13;
                x ^= x >> 17;
                x ^= x << 5;
                g_data[iOut] = x;
            }
        }
        else if (g_prng == PRNG_WangDeep)
        {
            // Wang integer hash applied to its own previous output
            for (uint32_t iOut = 0; iOut != g_cIntGenerate; ++iOut)
            {
                x = x ^ 61 ^ (x >> 16);
                x += x << 3;
                x ^= x >> 4;
                x *= 0x27d4eb2d;
                x ^= x >> 15;
                g_data[iOut] = x;
            }
        }
    }

    const clock_t clockStop = clock();
    const float timeInSeconds = float(clockStop - clockStart) / float(CLOCKS_PER_SEC);
    const float cIntTotal = float(g_cTimingReps) * float(g_cIntGenerate);

    printf(
        "%s, %u * %d ints - generated on CPU in %0.2f ms, average rate %0.2f M/sec\n",
        g_apChzPrng[g_prng],
        g_cIntGenerate,
        g_cTimingReps,
        1000.0f * timeInSeconds,
        cIntTotal / timeInSeconds * 1e-6f);
}
void WriteBmp ()
{
if (!g_pChzBmpOut)
return;
FILE * pFile = fopen(g_pChzBmpOut, "wt");
if (!pFile)
{
fprintf(stderr, "Couldn't open output file \"%s\" for writing\n", g_pChzBmpOut);
return;
}
// Calculate bitmap size - always 512 wide, one bit per pixel, max 512 high
uint32_t dXBitmap = 512;
uint32_t cIntPerRow = dXBitmap / 32;
uint32_t dYBitmap = min(512, g_cIntGenerate / cIntPerRow);
// Write bitmap header
BITMAPFILEHEADER bmfh =
{
0x4d42, // "BM"
sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER) + 2 * sizeof(RGBQUAD) + dXBitmap * dYBitmap / 8,
0,
0,
sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER) + 2 * sizeof(RGBQUAD),
};
BITMAPINFOHEADER bmih =
{
sizeof(BITMAPINFOHEADER),
dXBitmap,
dYBitmap,
1,
1,
BI_RGB,
0,
0,
0,
2,
0,
};
RGBQUAD aRgbPalette[2] =
{
{ 0, 0, 0, 0 },
{ 255, 255, 255, 0 },
};
fwrite(&bmfh, sizeof(bmfh), 1, pFile);
fwrite(&bmih, sizeof(bmih), 1, pFile);
fwrite(aRgbPalette, sizeof(RGBQUAD), 2, pFile);
// Write bitmap data
fwrite(&g_data[0], dXBitmap * dYBitmap / 8, 1, pFile);
fclose(pFile);
printf("Output written to \"%s\"\n", g_pChzBmpOut);
}
// Writes the generated values to the file named by -oraw as raw binary
// (the in-memory bytes of g_data, 4 bytes per value).
// Does nothing when no raw output was requested. Prints a diagnostic to stderr
// if there is no data, or if the file can't be opened or fully written.
void WriteRaw ()
{
    if (!g_pChzRawOut)
        return;

    // g_data stays empty if generation failed; indexing it would be undefined
    if (g_data.empty())
    {
        fprintf(stderr, "No generated data to write to \"%s\"\n", g_pChzRawOut);
        return;
    }

    FILE * pFile = fopen(g_pChzRawOut, "wb");
    if (!pFile)
    {
        // Nothing is mapped at this point and the D3D context has already been
        // released (or was never created on the CPU path), so there is nothing
        // to unmap here - just report the error.
        fprintf(stderr, "Couldn't open output file \"%s\" for writing\n", g_pChzRawOut);
        return;
    }

    const size_t cIntWritten = fwrite(&g_data[0], sizeof(g_data[0]), g_data.size(), pFile);
    bool fOk = (cIntWritten == g_data.size());
    if (fclose(pFile) != 0)
        fOk = false;

    if (!fOk)
    {
        fprintf(stderr, "Error writing output file \"%s\"\n", g_pChzRawOut);
        return;
    }

    printf("Raw output written to \"%s\"\n", g_pChzRawOut);
}
// Shader source
//
// HLSL compute-shader templates, indexed by PRNG. Each template is a printf-style
// format string with exactly two %d placeholders that InitD3D() fills in, in order:
//   1. the thread-group size (g_cThreadsPerGroup), inside [numthreads(...)]
//   2. the number of RNG steps (g_cRngSteps), as the loop bound
// InitD3D() formats the template into a 1024-character buffer, so the expanded
// source must stay shorter than that. Every shader seeds its state from the
// dispatch thread ID and writes one result per thread to g_aData[threadId.x].
extern const char * g_apChzShader[PRNG_Max] =
{
// PRNG_LCG
"RWStructuredBuffer<uint> g_aData : register(u0);\n"
"[numthreads(%d, 1, 1)]\n"
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n"
"{\n"
" uint seed = threadId.x;\n"
" uint state = seed;\n"
" [unroll]for (uint i = 0; i < %d; ++i)\n"
" state = 1664525 * state + 1013904223;\n"
" g_aData[threadId.x] = state;\n"
"}\n",
// PRNG_MWC
// (the shader's own comment flags that the carry-out of the low 32 bits is not propagated)
"RWStructuredBuffer<uint> g_aData : register(u0);\n"
"[numthreads(%d, 1, 1)]\n"
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n"
"{\n"
" uint seed = threadId.x;\n"
" uint state = seed;\n"
" [unroll]for (uint i = 0; i < %d; ++i)\n"
" {\n"
" // 64-bit multiply state by 0xffffda61\n"
" uint highhigh = (state >> 16) * 0xffff;\n"
" uint highlow = (state >> 16) * 0xda61;\n"
" uint lowhigh = (state & 0xffff) * 0xffff;\n"
" uint lowlow = (state & 0xffff) * 0xda61;\n"
" uint resultLow = lowlow + ((highlow + lowhigh) << 16);\n"
" uint resultHigh = highhigh + (highlow >> 16) + (lowhigh >> 16); // !!! missing carry-out from low 32\n"
" state = resultLow + resultHigh;\n"
" }\n"
" g_aData[threadId.x] = state;\n"
"}\n",
// PRNG_Xorshift
"RWStructuredBuffer<uint> g_aData : register(u0);\n"
"[numthreads(%d, 1, 1)]\n"
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n"
"{\n"
" uint seed = threadId.x;\n"
" uint state = seed;\n"
" [unroll]for (uint i = 0; i < %d; ++i)\n"
" {\n"
" state ^= (state << 13);\n"
" state ^= (state >> 17);\n"
" state ^= (state << 5);\n"
" }\n"
" g_aData[threadId.x] = state;\n"
"}\n",
// PRNG_JenkinsHash
"RWStructuredBuffer<uint> g_aData : register(u0);\n"
"uint rot(uint x, uint k) { return ((x << k) | (x >> (32 - k))); }\n"
"[numthreads(%d, 1, 1)]\n"
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n"
"{\n"
" uint seed = threadId.x;\n"
" uint state = seed;\n"
" [unroll]for (uint i = 0; i < %d; ++i)\n"
" {\n"
" uint a, b, c;\n"
" a = b = c = 0xdeadbeef + state;\n"
" c ^= b; c -= rot(b, 14);\n"
" a ^= c; a -= rot(c, 11);\n"
" b ^= a; b -= rot(a, 25);\n"
" c ^= b; c -= rot(b, 16);\n"
" a ^= c; a -= rot(c, 4);\n"
" b ^= a; b -= rot(a, 14);\n"
" c ^= b; c -= rot(b, 24);\n"
" state = c;\n"
" }\n"
" g_aData[threadId.x] = state;\n"
"}\n",
// PRNG_JenkinsHash2
"RWStructuredBuffer<uint> g_aData : register(u0);\n"
"[numthreads(%d, 1, 1)]\n"
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n"
"{\n"
" uint seed = threadId.x;\n"
" uint state = seed;\n"
" [unroll]for (uint i = 0; i < %d; ++i)\n"
" {\n"
" state -= (state << 6);\n"
" state ^= (state >> 17);\n"
" state -= (state << 9);\n"
" state ^= (state << 4);\n"
" state -= (state << 3);\n"
" state ^= (state << 10);\n"
" state ^= (state >> 15);\n"
" }\n"
" g_aData[threadId.x] = state;\n"
"}\n",
// PRNG_WangHash
"RWStructuredBuffer<uint> g_aData : register(u0);\n"
"[numthreads(%d, 1, 1)]\n"
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n"
"{\n"
" uint seed = threadId.x;\n"
" uint state = seed;\n"
" [unroll]for (uint i = 0; i < %d; ++i)\n"
" {\n"
" state = (state ^ 61) ^ (state >> 16);\n"
" state *= 9;\n"
" state = state ^ (state >> 4);\n"
" state *= 0x27d4eb2d;\n"
" state = state ^ (state >> 15);\n"
" }\n"
" g_aData[threadId.x] = state;\n"
"}\n",
// PRNG_FNVHash
"RWStructuredBuffer<uint> g_aData : register(u0);\n"
"[numthreads(%d, 1, 1)]\n"
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n"
"{\n"
" uint seed = threadId.x;\n"
" uint state = seed;\n"
" [unroll]for (uint i = 0; i < %d; ++i)\n"
" {\n"
" uint hash = 2166136261UL;\n"
" hash = (hash ^ (state & 0xff)) * 16777619;\n"
" hash = (hash ^ ((state >> 8) & 0xff)) * 16777619;\n"
" hash = (hash ^ ((state >> 16) & 0xff)) * 16777619;\n"
" hash = (hash ^ ((state >> 24) & 0xff)) * 16777619;\n"
" state = hash;\n"
" }\n"
" g_aData[threadId.x] = state;\n"
"}\n",
// PRNG_WangThenLCG
// (the LCG loop starts at i = 1, so the Wang hash pass takes the place of the first step)
"RWStructuredBuffer<uint> g_aData : register(u0);\n"
"[numthreads(%d, 1, 1)]\n"
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n"
"{\n"
" uint seed = threadId.x;\n"
" uint state = seed;\n"
" // Wang hash init\n"
" state = (state ^ 61) ^ (state >> 16);\n"
" state *= 9;\n"
" state = state ^ (state >> 4);\n"
" state *= 0x27d4eb2d;\n"
" state = state ^ (state >> 15);\n"
" // LCG\n"
" [unroll]for (uint i = 1; i < %d; ++i)\n"
" state = 1664525 * state + 1013904223;\n"
" g_aData[threadId.x] = state;\n"
"}\n",
// PRNG_WangThenXorshift
// (the Xorshift loop starts at i = 1, so the Wang hash pass takes the place of the first step)
"RWStructuredBuffer<uint> g_aData : register(u0);\n"
"[numthreads(%d, 1, 1)]\n"
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n"
"{\n"
" uint seed = threadId.x;\n"
" uint state = seed;\n"
" // Wang hash init\n"
" state = (state ^ 61) ^ (state >> 16);\n"
" state *= 9;\n"
" state = state ^ (state >> 4);\n"
" state *= 0x27d4eb2d;\n"
" state = state ^ (state >> 15);\n"
" // Xorshift\n"
" [unroll]for (uint i = 1; i < %d; ++i)\n"
" {\n"
" state ^= (state << 13);\n"
" state ^= (state >> 17);\n"
" state ^= (state << 5);\n"
" }\n"
" g_aData[threadId.x] = state;\n"
"}\n",
// PRNG_LCGFloat
// (declares the output buffer as float and stores the LCG state scaled by 1 / 4294967295)
"RWStructuredBuffer<float> g_aData : register(u0);\n"
"[numthreads(%d, 1, 1)]\n"
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n"
"{\n"
" uint seed = threadId.x;\n"
" uint state = seed;\n"
" [unroll]for (uint i = 0; i < %d; ++i)\n"
" state = 1664525 * state + 1013904223;\n"
" g_aData[threadId.x] = float(state) * (1.0 / 4294967295.0);\n"
"}\n",
// PRNG_Noise
// (float state and float output buffer; iterates frac(sin(x * 12.9898) * 43758.5453))
"RWStructuredBuffer<float> g_aData : register(u0);\n"
"[numthreads(%d, 1, 1)]\n"
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n"
"{\n"
" float seed = threadId.x;\n"
" float state = seed;\n"
" [unroll]for (uint i = 0; i < %d; ++i)\n"
" state = frac(sin(state * 12.9898) * 43758.5453);\n"
" g_aData[threadId.x] = state;\n"
"}\n",
// Deep RNGs - implemented on CPU
// (PRNG_LCGDeep, PRNG_XorshiftDeep, PRNG_WangDeep: no shader, handled by GenerateCPU)
NULL,
NULL,
NULL,
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment