Created
July 7, 2016 06:19
-
-
Save Reedbeta/6633fc9a67e64377c2cd6eaccd1e8c4c to your computer and use it in GitHub Desktop.
GPU PRNG & hash-function testbed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// COM pointer - wraps a COM object and automatically calls AddRef() / Release() as necessary | |
#pragma once | |
// Reference-holding smart pointer for COM-style objects, i.e. any T that provides
// AddRef() and Release(). Copying a comptr calls AddRef(); destroying or reassigning
// one calls Release() on the object it previously held.
//
// Ownership rules:
//  - Constructing or assigning from a raw pointer ADOPTS the caller's reference
//    (no AddRef() is made).
//  - Constructing or assigning from another comptr SHARES the object (AddRef() is made).
template <typename T>
class comptr
{
public:
    // Creates an empty (null) pointer.
    comptr ()
    :   m_pT(NULL) {}

    // Adopts pT without calling AddRef(); the comptr will Release() it later.
    explicit comptr (T * pT)
    :   m_pT(pT) {}

    // Shares the object held by comptrOther, taking an additional reference.
    comptr (const comptr<T> & comptrOther)
    :   m_pT(comptrOther.m_pT)
    {
        if (m_pT)
            m_pT->AddRef();
    }

    // Shares the object held by a comptr of a pointer-convertible type (e.g. derived-to-base).
    template <typename U>
    comptr (const comptr<U> & comptrOther)
    :   m_pT(comptrOther.m_pT)
    {
        if (m_pT)
            m_pT->AddRef();
    }

    // Copy assignment. This must be written out explicitly: a templated operator= is never
    // used as the copy assignment operator, so without it the compiler-generated one would
    // copy the raw pointer without AddRef() and both comptrs would later Release() it.
    comptr<T> & operator = (const comptr<T> & comptrOther)
    {
        assign(comptrOther.m_pT);
        return *this;
    }

    // Shares the object held by a comptr of a pointer-convertible type.
    template <typename U>
    comptr<T> & operator = (const comptr<U> & comptrOther)
    {
        assign(comptrOther.m_pT);
        return *this;
    }

    // Releases the current object and adopts pU without calling AddRef().
    template <typename U>
    comptr<T> & operator = (U * pU)
    {
        release();
        m_pT = pU;
        return *this;
    }

    ~comptr ()
    {
        release();
    }

    // True when a non-null object is held.
    operator bool () const
    {
        return (m_pT != NULL);
    }

    // Implicit access to the raw pointer; no reference is added.
    operator T * () const
    {
        return m_pT;
    }

    T * operator -> () const
    {
        return m_pT;
    }

    // Note: returns the raw pointer itself, not a reference to the pointee.
    T * operator * () const
    {
        return m_pT;
    }

    // Address of the internal pointer, for APIs that return an object through a T**
    // or that read an array of T*. The current object is NOT released first, so only
    // pass this as an output parameter when the comptr is empty.
    T ** address ()
    {
        return &m_pT;
    }

    // Drops the held reference (if any) and resets to null.
    void release ()
    {
        if (m_pT)
        {
            m_pT->Release();
            m_pT = NULL;
        }
    }

protected:
    // Replaces the held object with pT, taking a new reference on it. The new reference is
    // taken before the old one is dropped so that self-assignment, or releasing an object
    // that indirectly keeps pT alive, cannot destroy pT in between.
    void assign (T * pT)
    {
        if (pT)
            pT->AddRef();
        release();
        m_pT = pT;
    }

    // Other specializations need access to m_pT for the converting constructor/assignment.
    template <typename U> friend class comptr;

    T * m_pT;
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// GPU pseudorandom number generator demo | |
// Written by Nathan Reed, January 2013 | |
#include <stdio.h> | |
#include <time.h> | |
#include <d3d11.h> | |
#include <d3dcompiler.h> | |
#include <vector> | |
#include "comptr.hpp" | |
#if defined(_MSC_VER) && _MSC_VER <= 1500 // MSVC 2008 and earlier don't have stdint.h | |
typedef unsigned int uint32_t; | |
#else | |
#include <stdint.h> | |
#endif | |
// Forward declarations for the functions defined after main()

// Prints command-line usage
void PrintHelp ();

// Direct3D setup (returns false on failure) and teardown
bool InitD3D ();
void ShutdownD3D ();

// Generators: one runs the selected compute shader, the other runs the "deep" generators on the CPU
void GenerateGPU ();
void GenerateCPU ();

// Output writers; each does nothing unless its output filename was given
void WriteBmp ();
void WriteRaw ();
// Generated data | |
std::vector<uint32_t> g_data; | |
// Graphics state | |
comptr<ID3D11Device> g_pDevice; | |
comptr<ID3D11DeviceContext> g_pContext; | |
comptr<ID3D11Buffer> g_pBufferOut; | |
comptr<ID3D11UnorderedAccessView> g_pUAVOut; | |
comptr<ID3D11ComputeShader> g_pCS; | |
comptr<ID3D11Query> g_pQueryBegin, g_pQueryEnd, g_pQueryDisjoint; | |
comptr<ID3D11Buffer> g_pBufferStaging; | |
// Command-line settings | |
// Generator selected on the command line. The values index g_apChzPrng and g_apChzShader,
// so the order here must match the order of those tables.
enum PRNG
{
    // Generators run as a compute shader (main() sends everything below PRNG_LCGDeep to the GPU path)
    PRNG_LCG = 0,
    PRNG_MWC,
    PRNG_Xorshift,
    PRNG_JenkinsHash,
    PRNG_JenkinsHash2,
    PRNG_WangHash,
    PRNG_FNVHash,
    PRNG_WangThenLCG,
    PRNG_WangThenXorshift,
    PRNG_LCGFloat,
    PRNG_Noise,

    // "Deep" generators, run on the CPU by GenerateCPU
    PRNG_LCGDeep,
    PRNG_XorshiftDeep,
    PRNG_WangDeep,

    // Number of generators; used to size the tables
    PRNG_Max,
};
// Display name of each generator, indexed by PRNG; printed in the timing reports
const char * g_apChzPrng[PRNG_Max] =
{
    /* PRNG_LCG              */ "LCG",
    /* PRNG_MWC              */ "MWC",
    /* PRNG_Xorshift         */ "Xorshift",
    /* PRNG_JenkinsHash      */ "Jenkins lookup3 hash",
    /* PRNG_JenkinsHash2     */ "Jenkins integer hash",
    /* PRNG_WangHash         */ "Wang integer hash",
    /* PRNG_FNVHash          */ "FNV hash",
    /* PRNG_WangThenLCG      */ "Wang hash init, then LCG",
    /* PRNG_WangThenXorshift */ "Wang hash init, then Xorshift",
    /* PRNG_LCGFloat         */ "LCG float",
    /* PRNG_Noise            */ "High-frequency noise",
    /* PRNG_LCGDeep          */ "LCG deep",
    /* PRNG_XorshiftDeep     */ "Xorshift deep",
    /* PRNG_WangDeep         */ "Wang hash deep",
};
PRNG g_prng = PRNG_WangHash; | |
uint32_t g_cIntGenerate = 512 * 512; | |
uint32_t g_cThreadsPerGroup = 256; | |
uint32_t g_cRngSteps = 1; | |
uint32_t g_cTimingReps = 1; | |
const char * g_pChzBmpOut = NULL; | |
const char * g_pChzRawOut = NULL; | |
bool g_fDisassemble = false; | |
// Shader source (embedded, at end of file) | |
extern const char * g_apChzShader[PRNG_Max]; | |
int main (int cArg, const char ** apChzArg) | |
{ | |
// Parse command-line arguments | |
for (int i = 1; i < cArg; ++i) | |
{ | |
if (_stricmp(apChzArg[i], "-h") == 0) | |
{ | |
PrintHelp(); | |
return 0; | |
} | |
else if (_stricmp(apChzArg[i], "-lcg") == 0) | |
{ | |
g_prng = PRNG_LCG; | |
} | |
else if (_stricmp(apChzArg[i], "-mwc") == 0) | |
{ | |
g_prng = PRNG_MWC; | |
} | |
else if (_stricmp(apChzArg[i], "-xorshift") == 0) | |
{ | |
g_prng = PRNG_Xorshift; | |
} | |
else if (_stricmp(apChzArg[i], "-jenkins") == 0) | |
{ | |
g_prng = PRNG_JenkinsHash; | |
} | |
else if (_stricmp(apChzArg[i], "-jenkins2") == 0) | |
{ | |
g_prng = PRNG_JenkinsHash2; | |
} | |
else if (_stricmp(apChzArg[i], "-wang") == 0) | |
{ | |
g_prng = PRNG_WangHash; | |
} | |
else if (_stricmp(apChzArg[i], "-fnv") == 0) | |
{ | |
g_prng = PRNG_FNVHash; | |
} | |
else if (_stricmp(apChzArg[i], "-wang-then-lcg") == 0) | |
{ | |
g_prng = PRNG_WangThenLCG; | |
} | |
else if (_stricmp(apChzArg[i], "-wang-then-xorshift") == 0) | |
{ | |
g_prng = PRNG_WangThenXorshift; | |
} | |
else if (_stricmp(apChzArg[i], "-lcg-float") == 0) | |
{ | |
g_prng = PRNG_LCGFloat; | |
} | |
else if (_stricmp(apChzArg[i], "-noise") == 0) | |
{ | |
g_prng = PRNG_Noise; | |
} | |
else if (_stricmp(apChzArg[i], "-lcg-deep") == 0) | |
{ | |
g_prng = PRNG_LCGDeep; | |
} | |
else if (_stricmp(apChzArg[i], "-xorshift-deep") == 0) | |
{ | |
g_prng = PRNG_XorshiftDeep; | |
} | |
else if (_stricmp(apChzArg[i], "-wang-deep") == 0) | |
{ | |
g_prng = PRNG_WangDeep; | |
} | |
else if (_stricmp(apChzArg[i], "-dis") == 0) | |
{ | |
g_fDisassemble = true; | |
} | |
else if (_stricmp(apChzArg[i], "-n") == 0) | |
{ | |
if (sscanf(apChzArg[++i], "%u", &g_cIntGenerate) != 1) | |
fprintf(stderr, "Invalid number of integers \"%s\"; ignoring\n", apChzArg[i]); | |
} | |
else if (_stricmp(apChzArg[i], "-obmp") == 0) | |
{ | |
g_pChzBmpOut = apChzArg[++i]; | |
} | |
else if (_stricmp(apChzArg[i], "-oraw") == 0) | |
{ | |
g_pChzRawOut = apChzArg[++i]; | |
} | |
else if (_stricmp(apChzArg[i], "-r") == 0) | |
{ | |
if (sscanf(apChzArg[++i], "%u", &g_cTimingReps) != 1) | |
fprintf(stderr, "Invalid number of timing reps \"%s\"; ignoring\n", apChzArg[i]); | |
} | |
else if (_stricmp(apChzArg[i], "-s") == 0) | |
{ | |
if (sscanf(apChzArg[++i], "%u", &g_cRngSteps) != 1) | |
fprintf(stderr, "Invalid number of RNG steps \"%s\"; ignoring\n", apChzArg[i]); | |
} | |
else if (_stricmp(apChzArg[i], "-t") == 0) | |
{ | |
if (sscanf(apChzArg[++i], "%u", &g_cThreadsPerGroup) != 1) | |
fprintf(stderr, "Invalid number of threads \"%s\"; ignoring\n", apChzArg[i]); | |
} | |
else | |
{ | |
fprintf(stderr, "Unrecognized command-line parameter \"%s\"; ignoring\n", apChzArg[i]); | |
} | |
} | |
if (g_prng < PRNG_LCGDeep) | |
{ | |
if (!InitD3D()) | |
{ | |
ShutdownD3D(); | |
return 1; | |
} | |
GenerateGPU(); | |
ShutdownD3D(); | |
} | |
else | |
{ | |
GenerateCPU(); | |
} | |
WriteBmp(); | |
WriteRaw(); | |
return 0; | |
} | |
// Prints usage information to stdout: one line per command-line option that main() accepts.
void PrintHelp ()
{
    printf(
        "gpu-prng written by Nathan Reed, January 2013.\n"
        "Usage: gpu-prng [options]\n"
        "Available options:\n"
        " -h Print this message\n"
        "\n"
        " -lcg Linear congruential generator\n"
        " -mwc Multiply-with-carry generator\n"
        " -xorshift Xorshift generator\n"
        " -jenkins Jenkins \"lookup3\" hash\n"
        " -jenkins2 Another Jenkins hash, from his \"Integer Hashing\" page\n"
        " -wang Thomas Wang's integer hash\n"
        " -fnv FNV hash\n"
        " -wang-then-lcg Wang hash init, then LCG\n"
        " -wang-then-xorshift Wang hash init, then Xorshift\n"
        " -lcg-float Linear congruential generator, converted to float\n"
        // -noise is accepted by main() but was previously missing from this list
        " -noise High-frequency noise\n"
        " -lcg-deep Linear congruential generator, deep instead of wide\n"
        " -xorshift-deep Xorshift generator, deep instead of wide\n"
        " -wang-deep Wang hash, deep instead of wide\n"
        "\n"
        " -dis Print disassembly of compute shader\n"
        " -n NUM Number of values to generate\n"
        " -obmp FILENAME Write generated values to BMP file (one bit per pixel, max dim 512x512)\n"
        " -oraw FILENAME Write generated values to raw binary file\n"
        " -r NUM Number of repetitions of whole generation process (for timing)\n"
        " -s NUM Number of RNG steps to run for each generated value\n"
        " -t NUM Number of threads per group for compute shader\n"
    );
}
bool InitD3D () | |
{ | |
// Initialize the device | |
D3D_FEATURE_LEVEL featureLevel = D3D_FEATURE_LEVEL(0); | |
#ifdef _DEBUG | |
UINT flags = D3D11_CREATE_DEVICE_DEBUG; | |
#else | |
UINT flags = 0; | |
#endif | |
if (FAILED(D3D11CreateDevice( | |
NULL, | |
D3D_DRIVER_TYPE_HARDWARE, | |
NULL, | |
flags, | |
NULL, | |
0, | |
D3D11_SDK_VERSION, | |
g_pDevice.address(), | |
&featureLevel, | |
g_pContext.address()))) | |
{ | |
fprintf(stderr, "Couldn't create D3D11 device\n"); | |
return false; | |
} | |
if (featureLevel < D3D_FEATURE_LEVEL_11_0) | |
{ | |
fprintf(stderr, "Minimum feature level required is D3D11!\n"); | |
return false; | |
} | |
// Allocate memory to store generated output integers | |
uint32_t cB = g_cIntGenerate * 4; | |
D3D11_BUFFER_DESC bufferDesc = | |
{ | |
cB, | |
D3D11_USAGE_DEFAULT, | |
D3D11_BIND_UNORDERED_ACCESS, | |
0, | |
D3D11_RESOURCE_MISC_BUFFER_STRUCTURED, | |
4, | |
}; | |
if (FAILED(g_pDevice->CreateBuffer(&bufferDesc, NULL, g_pBufferOut.address()))) | |
{ | |
fprintf(stderr, "Couldn't create output buffer\n"); | |
return false; | |
} | |
D3D11_UNORDERED_ACCESS_VIEW_DESC uavDesc = | |
{ | |
DXGI_FORMAT_UNKNOWN, | |
D3D11_UAV_DIMENSION_BUFFER, | |
{ | |
{ 0, g_cIntGenerate, 0 }, | |
}, | |
}; | |
if (FAILED(g_pDevice->CreateUnorderedAccessView(g_pBufferOut, &uavDesc, g_pUAVOut.address()))) | |
{ | |
fprintf(stderr, "Couldn't create UAV\n"); | |
return false; | |
} | |
// Create staging buffer to allow transfer to the CPU | |
D3D11_BUFFER_DESC bufferDescStaging = | |
{ | |
cB, | |
D3D11_USAGE_STAGING, | |
0, | |
D3D11_CPU_ACCESS_READ, | |
0, | |
0, | |
}; | |
if (FAILED(g_pDevice->CreateBuffer(&bufferDescStaging, NULL, g_pBufferStaging.address()))) | |
{ | |
fprintf(stderr, "Couldn't create staging buffer\n"); | |
return false; | |
} | |
// Create the queries to time the operation | |
D3D11_QUERY_DESC queryDescTs = { D3D11_QUERY_TIMESTAMP, 0 }; | |
D3D11_QUERY_DESC queryDescDisjoint = { D3D11_QUERY_TIMESTAMP_DISJOINT, 0 }; | |
if (FAILED(g_pDevice->CreateQuery(&queryDescTs, g_pQueryBegin.address())) || | |
FAILED(g_pDevice->CreateQuery(&queryDescTs, g_pQueryEnd.address())) || | |
FAILED(g_pDevice->CreateQuery(&queryDescDisjoint, g_pQueryDisjoint.address()))) | |
{ | |
fprintf(stderr, "Couldn't create timing queries\n"); | |
return false; | |
} | |
// Generate code for the compute shader | |
char aChzShader[1024]; | |
if (_snprintf(aChzShader, 1024, g_apChzShader[g_prng], g_cThreadsPerGroup, g_cRngSteps) < 0) | |
{ | |
fprintf(stderr, "Shader code too long\n"); | |
return false; | |
} | |
// Compile the compute shader | |
comptr<ID3DBlob> pBlobIL; | |
comptr<ID3DBlob> pBlobErrors; | |
if (FAILED(D3DCompile( | |
aChzShader, | |
strlen(aChzShader), | |
NULL, | |
NULL, | |
NULL, | |
"cs_main", | |
"cs_5_0", | |
D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_OPTIMIZATION_LEVEL3 | D3DCOMPILE_WARNINGS_ARE_ERRORS, | |
0, | |
pBlobIL.address(), | |
pBlobErrors.address()))) | |
{ | |
if (pBlobErrors) | |
{ | |
fprintf(stderr, "Couldn't compile compute shader (stage 1):\n\n%s\n", pBlobErrors->GetBufferPointer()); | |
return false; | |
} | |
else | |
{ | |
fprintf(stderr, "Couldn't compile compute shader (stage 1)\n"); | |
return false; | |
} | |
} | |
if (FAILED(g_pDevice->CreateComputeShader( | |
pBlobIL->GetBufferPointer(), | |
pBlobIL->GetBufferSize(), | |
NULL, | |
g_pCS.address()))) | |
{ | |
fprintf(stderr, "Couldn't compile compute shader (stage 2)\n"); | |
return false; | |
} | |
if (g_fDisassemble) | |
{ | |
// Disassemble the shader | |
comptr<ID3DBlob> pBlobDisassembly; | |
if (SUCCEEDED(D3DDisassemble( | |
pBlobIL->GetBufferPointer(), | |
pBlobIL->GetBufferSize(), | |
0, | |
NULL, | |
pBlobDisassembly.address()))) | |
{ | |
printf("Compute shader disassembly:\n%s\n", pBlobDisassembly->GetBufferPointer()); | |
} | |
else | |
{ | |
fprintf(stderr, "Couldn't disassemble compute shader\n"); | |
} | |
} | |
return true; | |
} | |
void ShutdownD3D () | |
{ | |
g_pQueryBegin.release(); | |
g_pQueryEnd.release(); | |
g_pQueryDisjoint.release(); | |
g_pCS.release(); | |
g_pUAVOut.release(); | |
g_pBufferOut.release(); | |
g_pBufferStaging.release(); | |
g_pContext.release(); | |
g_pDevice.release(); | |
} | |
void GenerateGPU () | |
{ | |
// Set up GPU state | |
g_pContext->CSSetShader(g_pCS, NULL, 0); | |
g_pContext->CSSetUnorderedAccessViews(0, 1, g_pUAVOut.address(), NULL); | |
// Timestamp before generation | |
g_pContext->Begin(g_pQueryDisjoint); | |
g_pContext->End(g_pQueryBegin); | |
// Dispatch compute shader to generate the numbers. Do it many times for timing purposes. | |
uint32_t cThreadGroup = g_cIntGenerate / g_cThreadsPerGroup; | |
if (cThreadGroup > 65535) | |
{ | |
fprintf(stderr, "Too many threadgroups (%d; maximum is 65535).\n", cThreadGroup); | |
cThreadGroup = 65535; | |
} | |
for (uint32_t i = 0; i < g_cTimingReps; ++i) | |
g_pContext->Dispatch(cThreadGroup, 1, 1); | |
// Timestamp after generation | |
g_pContext->End(g_pQueryEnd); | |
g_pContext->End(g_pQueryDisjoint); | |
// Wait for GPU to finish work | |
g_pContext->Flush(); | |
while (g_pContext->GetData(g_pQueryDisjoint, NULL, 0, 0) == S_FALSE) | |
{ | |
Sleep(0); | |
} | |
// Calculate time taken | |
D3D11_QUERY_DATA_TIMESTAMP_DISJOINT timestampDisjoint; | |
if (g_pContext->GetData(g_pQueryDisjoint, ×tampDisjoint, sizeof(timestampDisjoint), 0) != S_OK) | |
{ | |
fprintf(stderr, "Couldn't retrieve timestamp disjoint query data\n"); | |
return; | |
} | |
if (timestampDisjoint.Disjoint) | |
{ | |
fprintf(stderr, "Timestamps reportedly disjoint; can't retrieve GPU timing data\n"); | |
return; | |
} | |
unsigned long long timestampBegin, timestampEnd; | |
if (g_pContext->GetData(g_pQueryBegin, ×tampBegin, sizeof(timestampBegin), 0) != S_OK || | |
g_pContext->GetData(g_pQueryEnd, ×tampEnd, sizeof(timestampEnd), 0) != S_OK) | |
{ | |
fprintf(stderr, "Couldn't retrieve timestamp query data\n"); | |
return; | |
} | |
float timeInSeconds = float(timestampEnd - timestampBegin) / float(timestampDisjoint.Frequency); | |
printf( | |
"%s, %u * %d ints, %d steps, %d threads/group - generated on GPU in %0.2f ms, average rate %0.2f G/sec\n", | |
g_apChzPrng[g_prng], | |
g_cIntGenerate, | |
g_cTimingReps, | |
g_cRngSteps, | |
g_cThreadsPerGroup, | |
1000.0f * timeInSeconds, | |
float(g_cTimingReps) * float(g_cIntGenerate) / timeInSeconds * 1e-9f); | |
if (g_pChzBmpOut || g_pChzRawOut) | |
{ | |
// Copy the results into CPU memory space. Unfortunately, we can't just map the UAV; | |
// we have to copy the data to a staging buffer and map that. | |
g_pContext->CopyResource(g_pBufferStaging, g_pBufferOut); | |
D3D11_MAPPED_SUBRESOURCE mappedSubresource; | |
if (FAILED(g_pContext->Map(g_pBufferStaging, 0, D3D11_MAP_READ, 0, &mappedSubresource))) | |
{ | |
fprintf(stderr, "Couldn't map staging buffer for writing output\n"); | |
return; | |
} | |
g_data.resize(g_cIntGenerate); | |
memcpy(&g_data[0], mappedSubresource.pData, g_cIntGenerate * 4); | |
g_pContext->Unmap(g_pBufferStaging, 0); | |
} | |
} | |
void GenerateCPU () | |
{ | |
g_data.resize(g_cIntGenerate); | |
clock_t clockBegin = clock(); | |
uint32_t seed = 47; | |
for (uint32_t iRep = 0; iRep < g_cTimingReps; ++iRep) | |
{ | |
switch (g_prng) | |
{ | |
case PRNG_LCGDeep: | |
{ | |
uint32_t state = seed; | |
for (uint32_t i = 0; i < g_cIntGenerate; ++i) | |
{ | |
state = 1664525 * state + 1013904223; | |
g_data[i] = state; | |
} | |
break; | |
} | |
case PRNG_XorshiftDeep: | |
{ | |
uint32_t state = seed; | |
for (uint32_t i = 0; i < g_cIntGenerate; ++i) | |
{ | |
state ^= (state << 13); | |
state ^= (state >> 17); | |
state ^= (state << 5); | |
g_data[i] = state; | |
} | |
break; | |
} | |
case PRNG_WangDeep: | |
{ | |
uint32_t state = seed; | |
for (uint32_t i = 0; i < g_cIntGenerate; ++i) | |
{ | |
state = (state ^ 61) ^ (state >> 16); | |
state = state + (state << 3); | |
state = state ^ (state >> 4); | |
state = state * 0x27d4eb2d; | |
state = state ^ (state >> 15); | |
g_data[i] = state; | |
} | |
break; | |
} | |
} | |
} | |
clock_t clockEnd = clock(); | |
float timeInSeconds = float(clockEnd - clockBegin) / float(CLOCKS_PER_SEC); | |
printf( | |
"%s, %u * %d ints - generated on CPU in %0.2f ms, average rate %0.2f M/sec\n", | |
g_apChzPrng[g_prng], | |
g_cIntGenerate, | |
g_cTimingReps, | |
1000.0f * timeInSeconds, | |
float(g_cTimingReps) * float(g_cIntGenerate) / timeInSeconds * 1e-6f); | |
} | |
void WriteBmp () | |
{ | |
if (!g_pChzBmpOut) | |
return; | |
FILE * pFile = fopen(g_pChzBmpOut, "wt"); | |
if (!pFile) | |
{ | |
fprintf(stderr, "Couldn't open output file \"%s\" for writing\n", g_pChzBmpOut); | |
return; | |
} | |
// Calculate bitmap size - always 512 wide, one bit per pixel, max 512 high | |
uint32_t dXBitmap = 512; | |
uint32_t cIntPerRow = dXBitmap / 32; | |
uint32_t dYBitmap = min(512, g_cIntGenerate / cIntPerRow); | |
// Write bitmap header | |
BITMAPFILEHEADER bmfh = | |
{ | |
0x4d42, // "BM" | |
sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER) + 2 * sizeof(RGBQUAD) + dXBitmap * dYBitmap / 8, | |
0, | |
0, | |
sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER) + 2 * sizeof(RGBQUAD), | |
}; | |
BITMAPINFOHEADER bmih = | |
{ | |
sizeof(BITMAPINFOHEADER), | |
dXBitmap, | |
dYBitmap, | |
1, | |
1, | |
BI_RGB, | |
0, | |
0, | |
0, | |
2, | |
0, | |
}; | |
RGBQUAD aRgbPalette[2] = | |
{ | |
{ 0, 0, 0, 0 }, | |
{ 255, 255, 255, 0 }, | |
}; | |
fwrite(&bmfh, sizeof(bmfh), 1, pFile); | |
fwrite(&bmih, sizeof(bmih), 1, pFile); | |
fwrite(aRgbPalette, sizeof(RGBQUAD), 2, pFile); | |
// Write bitmap data | |
fwrite(&g_data[0], dXBitmap * dYBitmap / 8, 1, pFile); | |
fclose(pFile); | |
printf("Output written to \"%s\"\n", g_pChzBmpOut); | |
} | |
void WriteRaw () | |
{ | |
if (!g_pChzRawOut) | |
return; | |
FILE * pFile = fopen(g_pChzRawOut, "wb"); | |
if (!pFile) | |
{ | |
fprintf(stderr, "Couldn't open output file \"%s\" for writing\n", g_pChzRawOut); | |
g_pContext->Unmap(g_pBufferStaging, 0); | |
return; | |
} | |
fwrite(&g_data[0], g_cIntGenerate * 4, 1, pFile); | |
fclose(pFile); | |
printf("Raw output written to \"%s\"\n", g_pChzRawOut); | |
} | |
// Shader source
//
// One HLSL compute shader per generator, indexed by PRNG. Each entry is used by InitD3D as
// a printf-style format string with two integer arguments, in this order:
//   1st %d - g_cThreadsPerGroup (the numthreads X dimension)
//   2nd %d - g_cRngSteps (the bound of the [unroll]ed stepping loop)
// so every entry must contain exactly those two conversions and no other '%' characters.
// InitD3D formats the result into a 1024-character buffer, which limits the shader length.
// Each shader seeds itself with its dispatch thread ID and writes one value to
// g_aData[threadId.x].
extern const char * g_apChzShader[PRNG_Max] = | |
{ | |
// PRNG_LCG - linear congruential generator
"RWStructuredBuffer<uint> g_aData : register(u0);\n" | |
"[numthreads(%d, 1, 1)]\n" | |
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n" | |
"{\n" | |
" uint seed = threadId.x;\n" | |
" uint state = seed;\n" | |
" [unroll]for (uint i = 0; i < %d; ++i)\n" | |
" state = 1664525 * state + 1013904223;\n" | |
" g_aData[threadId.x] = state;\n" | |
"}\n", | |
// PRNG_MWC - multiply-with-carry; the shader's own comment flags a missing carry-out
// from the low 32 bits of the product
"RWStructuredBuffer<uint> g_aData : register(u0);\n" | |
"[numthreads(%d, 1, 1)]\n" | |
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n" | |
"{\n" | |
" uint seed = threadId.x;\n" | |
" uint state = seed;\n" | |
" [unroll]for (uint i = 0; i < %d; ++i)\n" | |
" {\n" | |
" // 64-bit multiply state by 0xffffda61\n" | |
" uint highhigh = (state >> 16) * 0xffff;\n" | |
" uint highlow = (state >> 16) * 0xda61;\n" | |
" uint lowhigh = (state & 0xffff) * 0xffff;\n" | |
" uint lowlow = (state & 0xffff) * 0xda61;\n" | |
" uint resultLow = lowlow + ((highlow + lowhigh) << 16);\n" | |
" uint resultHigh = highhigh + (highlow >> 16) + (lowhigh >> 16); // !!! missing carry-out from low 32\n" | |
" state = resultLow + resultHigh;\n" | |
" }\n" | |
" g_aData[threadId.x] = state;\n" | |
"}\n", | |
// PRNG_Xorshift - same shift constants (13, 17, 5) as the CPU version in GenerateCPU
"RWStructuredBuffer<uint> g_aData : register(u0);\n" | |
"[numthreads(%d, 1, 1)]\n" | |
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n" | |
"{\n" | |
" uint seed = threadId.x;\n" | |
" uint state = seed;\n" | |
" [unroll]for (uint i = 0; i < %d; ++i)\n" | |
" {\n" | |
" state ^= (state << 13);\n" | |
" state ^= (state >> 17);\n" | |
" state ^= (state << 5);\n" | |
" }\n" | |
" g_aData[threadId.x] = state;\n" | |
"}\n", | |
// PRNG_JenkinsHash - Jenkins "lookup3" hash (see PrintHelp); defines a rot() helper
"RWStructuredBuffer<uint> g_aData : register(u0);\n" | |
"uint rot(uint x, uint k) { return ((x << k) | (x >> (32 - k))); }\n" | |
"[numthreads(%d, 1, 1)]\n" | |
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n" | |
"{\n" | |
" uint seed = threadId.x;\n" | |
" uint state = seed;\n" | |
" [unroll]for (uint i = 0; i < %d; ++i)\n" | |
" {\n" | |
" uint a, b, c;\n" | |
" a = b = c = 0xdeadbeef + state;\n" | |
" c ^= b; c -= rot(b, 14);\n" | |
" a ^= c; a -= rot(c, 11);\n" | |
" b ^= a; b -= rot(a, 25);\n" | |
" c ^= b; c -= rot(b, 16);\n" | |
" a ^= c; a -= rot(c, 4);\n" | |
" b ^= a; b -= rot(a, 14);\n" | |
" c ^= b; c -= rot(b, 24);\n" | |
" state = c;\n" | |
" }\n" | |
" g_aData[threadId.x] = state;\n" | |
"}\n", | |
// PRNG_JenkinsHash2 - Jenkins integer hash (see PrintHelp)
"RWStructuredBuffer<uint> g_aData : register(u0);\n" | |
"[numthreads(%d, 1, 1)]\n" | |
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n" | |
"{\n" | |
" uint seed = threadId.x;\n" | |
" uint state = seed;\n" | |
" [unroll]for (uint i = 0; i < %d; ++i)\n" | |
" {\n" | |
" state -= (state << 6);\n" | |
" state ^= (state >> 17);\n" | |
" state -= (state << 9);\n" | |
" state ^= (state << 4);\n" | |
" state -= (state << 3);\n" | |
" state ^= (state << 10);\n" | |
" state ^= (state >> 15);\n" | |
" }\n" | |
" g_aData[threadId.x] = state;\n" | |
"}\n", | |
// PRNG_WangHash - Wang integer hash; "state *= 9" is the same operation as the
// "state + (state << 3)" step of the CPU version in GenerateCPU
"RWStructuredBuffer<uint> g_aData : register(u0);\n" | |
"[numthreads(%d, 1, 1)]\n" | |
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n" | |
"{\n" | |
" uint seed = threadId.x;\n" | |
" uint state = seed;\n" | |
" [unroll]for (uint i = 0; i < %d; ++i)\n" | |
" {\n" | |
" state = (state ^ 61) ^ (state >> 16);\n" | |
" state *= 9;\n" | |
" state = state ^ (state >> 4);\n" | |
" state *= 0x27d4eb2d;\n" | |
" state = state ^ (state >> 15);\n" | |
" }\n" | |
" g_aData[threadId.x] = state;\n" | |
"}\n", | |
// PRNG_FNVHash - FNV hash over the four bytes of the state, lowest byte first
"RWStructuredBuffer<uint> g_aData : register(u0);\n" | |
"[numthreads(%d, 1, 1)]\n" | |
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n" | |
"{\n" | |
" uint seed = threadId.x;\n" | |
" uint state = seed;\n" | |
" [unroll]for (uint i = 0; i < %d; ++i)\n" | |
" {\n" | |
" uint hash = 2166136261UL;\n" | |
" hash = (hash ^ (state & 0xff)) * 16777619;\n" | |
" hash = (hash ^ ((state >> 8) & 0xff)) * 16777619;\n" | |
" hash = (hash ^ ((state >> 16) & 0xff)) * 16777619;\n" | |
" hash = (hash ^ ((state >> 24) & 0xff)) * 16777619;\n" | |
" state = hash;\n" | |
" }\n" | |
" g_aData[threadId.x] = state;\n" | |
"}\n", | |
// PRNG_WangThenLCG - one Wang hash of the thread ID, then LCG steps. The stepping loop
// starts at i = 1, so the hash presumably counts as the first of the requested steps
// - NOTE(review): confirm intent
"RWStructuredBuffer<uint> g_aData : register(u0);\n" | |
"[numthreads(%d, 1, 1)]\n" | |
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n" | |
"{\n" | |
" uint seed = threadId.x;\n" | |
" uint state = seed;\n" | |
" // Wang hash init\n" | |
" state = (state ^ 61) ^ (state >> 16);\n" | |
" state *= 9;\n" | |
" state = state ^ (state >> 4);\n" | |
" state *= 0x27d4eb2d;\n" | |
" state = state ^ (state >> 15);\n" | |
" // LCG\n" | |
" [unroll]for (uint i = 1; i < %d; ++i)\n" | |
" state = 1664525 * state + 1013904223;\n" | |
" g_aData[threadId.x] = state;\n" | |
"}\n", | |
// PRNG_WangThenXorshift - one Wang hash of the thread ID, then Xorshift steps; the
// stepping loop starts at i = 1 here too
"RWStructuredBuffer<uint> g_aData : register(u0);\n" | |
"[numthreads(%d, 1, 1)]\n" | |
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n" | |
"{\n" | |
" uint seed = threadId.x;\n" | |
" uint state = seed;\n" | |
" // Wang hash init\n" | |
" state = (state ^ 61) ^ (state >> 16);\n" | |
" state *= 9;\n" | |
" state = state ^ (state >> 4);\n" | |
" state *= 0x27d4eb2d;\n" | |
" state = state ^ (state >> 15);\n" | |
" // Xorshift\n" | |
" [unroll]for (uint i = 1; i < %d; ++i)\n" | |
" {\n" | |
" state ^= (state << 13);\n" | |
" state ^= (state >> 17);\n" | |
" state ^= (state << 5);\n" | |
" }\n" | |
" g_aData[threadId.x] = state;\n" | |
"}\n", | |
// PRNG_LCGFloat - LCG whose final state is scaled by 1/4294967295 and stored as a float
// (this shader declares the buffer as RWStructuredBuffer<float>)
"RWStructuredBuffer<float> g_aData : register(u0);\n" | |
"[numthreads(%d, 1, 1)]\n" | |
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n" | |
"{\n" | |
" uint seed = threadId.x;\n" | |
" uint state = seed;\n" | |
" [unroll]for (uint i = 0; i < %d; ++i)\n" | |
" state = 1664525 * state + 1013904223;\n" | |
" g_aData[threadId.x] = float(state) * (1.0 / 4294967295.0);\n" | |
"}\n", | |
// PRNG_Noise - frac(sin(x * 12.9898) * 43758.5453) iterated on a float state; writes floats
"RWStructuredBuffer<float> g_aData : register(u0);\n" | |
"[numthreads(%d, 1, 1)]\n" | |
"void cs_main(uint3 threadId : SV_DispatchThreadID)\n" | |
"{\n" | |
" float seed = threadId.x;\n" | |
" float state = seed;\n" | |
" [unroll]for (uint i = 0; i < %d; ++i)\n" | |
" state = frac(sin(state * 12.9898) * 43758.5453);\n" | |
" g_aData[threadId.x] = state;\n" | |
"}\n", | |
// Deep RNGs - implemented on CPU (GenerateCPU), so they have no shader source:
// PRNG_LCGDeep, PRNG_XorshiftDeep, PRNG_WangDeep
NULL, | |
NULL, | |
NULL, | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment