// 64-bit rotate-left built from two 32-bit funnel shifts (shf.l.wrap). The
// instruction only sees the shift count modulo 32, so the offset >= 32 case
// swaps which half of the word feeds each funnel shift.
__device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) {
    uint2 result;
    if (offset >= 32) {
        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
    }
    else {
        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
    }
    return __double_as_longlong(__hiloint2double(result.y, result.x));
}

// 64-bit rotate-right, mirroring ROTL64 with shf.r.wrap.
__device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) {
    uint2 result;
    if (offset < 32) {
        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
    }
    else {
        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
    }
    return __double_as_longlong(__hiloint2double(result.y, result.x));
}
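
// Hedged reference sketch (added for illustration, not part of the original
// gist): portable host-side rotates that the PTX funnel-shift versions above
// should match bit-for-bit for offsets 0..63. ROTL64_ref/ROTR64_ref are
// hypothetical names introduced here.
static inline uint64_t ROTL64_ref(uint64_t v, int n)
{
    n &= 63;                                   // reduce the count mod 64
    return n ? (v << n) | (v >> (64 - n)) : v; // guard the undefined shift by 64
}
static inline uint64_t ROTR64_ref(uint64_t v, int n)
{
    n &= 63;
    return n ? (v >> n) | (v << (64 - n)) : v;
}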
// Swap the two 32-bit halves of a 64-bit word; on sm_32+ this compiles to a
// pair of register moves, otherwise it falls back to ROTL64(value, 32).
__device__ __forceinline__
uint64_t SWAPDWORDS(uint64_t value)
{
#if __CUDA_ARCH__ >= 320
    uint2 temp;
    asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value));
    asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x));
    return value;
#else
    return ROTL64(value, 32);
#endif
}
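
// Illustrative sanity check (an assumption added here, not part of the
// original gist): swapping the 32-bit halves is exactly a rotate by 32, so
// both branches of SWAPDWORDS must agree.
static_assert(((0x1122334455667788ULL << 32) | (0x1122334455667788ULL >> 32))
              == 0x5566778811223344ULL, "half swap == rotate by 32");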
// vBlake's G mixing function: a BLAKE2b-style G with rotation constants
// 60/43/5/18, per-word round constants XORed into the message words, and two
// extra mask lines applied to v[d] at the end.
#define B2B_G(v,a,b,c,d,x,y,c1,c2) { \
    v[a] = v[a] + v[b] + (x ^ c1); \
    v[d] ^= v[a]; \
    v[d] = ROTR64(v[d], 60); \
    v[c] = v[c] + v[d]; \
    v[b] = ROTR64(v[b] ^ v[c], 43); \
    v[a] = v[a] + v[b] + (y ^ c2); \
    v[d] = ROTR64(v[d] ^ v[a], 5); \
    v[c] = v[c] + v[d]; \
    v[b] = ROTR64(v[b] ^ v[c], 18); \
    v[d] ^= ~(v[a] | v[b] | v[c]) | (~v[a] & v[b] & v[c]) | (v[a] & ~v[b] & v[c]) | (v[a] & v[b] & ~v[c]); \
    v[d] ^= (~v[a] & ~v[b] & v[c]) | (~v[a] & v[b] & ~v[c]) | (v[a] & ~v[b] & ~v[c]) | (v[a] & v[b] & v[c]); \
}
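
// Derivation note (added, not from the original): the two trailing mask lines
// in B2B_G partition the eight truth assignments of (a, b, c) by parity. The
// first mask is set wherever an even number of the three bits are set, i.e.
// ~(a ^ b ^ c); the second wherever an odd number are set, i.e. (a ^ b ^ c).
// XORing v[d] with both masks XORs it with ~P ^ P == ~0, so the pair of lines
// reduces to v[d] = ~v[d]. The hypothetical helper below checks this identity
// on full 64-bit words; it is illustration only, not part of the miner.
static inline bool b2b_tail_is_bitwise_not(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
    const uint64_t even = ~(a | b | c) | (~a & b & c) | (a & ~b & c) | (a & b & ~c);
    const uint64_t odd  = (~a & ~b & c) | (~a & b & ~c) | (a & ~b & ~c) | (a & b & c);
    return ((d ^ even) ^ odd) == ~d; // holds for all inputs
}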
// Host-side stream and device-buffer handles used by the miner's driver code.
cudaStream_t cudastream;
uint32_t *blockHeadermobj = nullptr;
uint32_t *midStatemobj = nullptr;
uint32_t *nonceOutmobj = nullptr;

// Host-side entry point that searches a range of nonces (defined elsewhere).
cudaError_t grindNonces(uint32_t *nonceResult, uint64_t *hashStart, const uint64_t *header);
// Word-permutation schedule: rows 10-15 repeat rows 0-5 so the table covers
// all 16 vBlake rounds.
__device__ __constant__
static const uint8_t sigma[16][16] = {
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
    { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
    { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
    { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
    { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
    { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
    { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
    { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
    { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
    { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
    { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
    { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
    { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
    { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
    { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
// Per-word round constants XORed into the message words inside B2B_G.
__device__ __constant__
static const uint64_t u512[16] =
{
    0xA51B6A89D489E800ULL, 0xD35B2E0E0B723800ULL,
    0xA47B39A2AE9F9000ULL, 0x0C0EFA33E77E6488ULL,
    0x4F452FEC309911EBULL, 0x3CFCC66F74E1022CULL,
    0x4606AD364DC879DDULL, 0xBBA055B53D47C800ULL,
    0x531655D90C59EB1BULL, 0xD1A00BA6DAE5B800ULL,
    0x2FE452DA9632463EULL, 0x98A7B5496226F800ULL,
    0xBAFCD004F92CA000ULL, 0x64A39957839525E7ULL,
    0xD859E6F081AAE000ULL, 0x63D980597B560E6BULL
};
// Initialization vector for the 16-word working state v[].
__device__ __constant__
static const uint64_t vBlake_iv[16] = {
    0x4BBF42C1F107AD85ULL, 0x5D11A8C3B5AEB12EULL,
    0xA64AB78DC2774652ULL, 0xC67595724658F253ULL,
    0xB8864E79CB891E56ULL, 0x12ED593E29FB41A1ULL,
    0xB1DA3AB63C60BAA8ULL, 0x6D20E50C1F954DEDULL,
    0x4BBF42C1F006AD9DULL, 0x5D11A8C3B5AEB12EULL,
    0xA64AB78DC2774652ULL, 0xC67595724658F253ULL,
    0xB8864E79CB891E16ULL, 0x12ED593E29FB41A1ULL,
    0x4E25C549C39F4557ULL, 0x6D20E50C1F954DEDULL
};
// Core compression: seed v[] from the IV, run 16 rounds of eight G calls,
// then fold the state down to the single 64-bit vBlake result.
__device__ __forceinline__ uint64_t vblake512_compress(const uint64_t * __restrict__ m)
{
    uint64_t v[16];
    for (int i = 0; i < 16; i++)
    {
        v[i] = vBlake_iv[i];
    }
    //#pragma unroll 16
    for (int i = 0; i < 16; i++) {
        // Column steps
        B2B_G(v, 0, 4, 8, 12, m[sigma[i][1]], m[sigma[i][0]],
            u512[sigma[i][1]], u512[sigma[i][0]]);
        B2B_G(v, 1, 5, 9, 13, m[sigma[i][3]], m[sigma[i][2]],
            u512[sigma[i][3]], u512[sigma[i][2]]);
        B2B_G(v, 2, 6, 10, 14, m[sigma[i][5]], m[sigma[i][4]],
            u512[sigma[i][5]], u512[sigma[i][4]]);
        B2B_G(v, 3, 7, 11, 15, m[sigma[i][7]], m[sigma[i][6]],
            u512[sigma[i][7]], u512[sigma[i][6]]);
        // Diagonal steps
        B2B_G(v, 0, 5, 10, 15, m[sigma[i][9]], m[sigma[i][8]],
            u512[sigma[i][9]], u512[sigma[i][8]]);
        B2B_G(v, 1, 6, 11, 12, m[sigma[i][11]], m[sigma[i][10]],
            u512[sigma[i][11]], u512[sigma[i][10]]);
        B2B_G(v, 2, 7, 8, 13, m[sigma[i][13]], m[sigma[i][12]],
            u512[sigma[i][13]], u512[sigma[i][12]]);
        B2B_G(v, 3, 4, 9, 14, m[sigma[i][15]], m[sigma[i][14]],
            u512[sigma[i][15]], u512[sigma[i][14]]);
    }
    return 0x3C10ED058B3FE57EULL ^ v[0] ^ v[8] ^ v[3] ^ v[11] ^ v[6] ^ v[14];
}
// Assemble the 64-byte message block: the first seven 64-bit words come from
// the header prototype, the eighth word carries the nonce.
__device__ __forceinline__ uint64_t vBlake2(const uint64_t * __restrict__ h0, const uint64_t h7)
{
    uint64_t b[8];
    b[0] = h0[0];
    b[1] = h0[1];
    b[2] = h0[2];
    b[3] = h0[3];
    b[4] = h0[4];
    b[5] = h0[5];
    b[6] = h0[6];
    b[7] = h7;
    return vblake512_compress(b);
}
// Nonces searched per thread. Note that both branches currently define the
// same value; the CPU_SHARES split is kept so the two cases can be tuned
// independently.
#if CPU_SHARES
#define WORK_PER_THREAD 256
#else
#define WORK_PER_THREAD 256
#endif

// Launch geometry; HIGH_RESOURCE doubles the threads per block.
#if HIGH_RESOURCE
#define DEFAULT_BLOCKSIZE 512
#define DEFAULT_THREADS_PER_BLOCK 1024
#else
#define DEFAULT_BLOCKSIZE 512
#define DEFAULT_THREADS_PER_BLOCK 512
#endif
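
// Coverage arithmetic (derived from the defaults above, added as a note): one
// kernel launch scans blocksize * threadsPerBlock * WORK_PER_THREAD nonces,
// i.e. 512 * 512 * 256 = 2^26 by default, or 512 * 1024 * 256 = 2^27 with
// HIGH_RESOURCE.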
int blocksize = DEFAULT_BLOCKSIZE;
int threadsPerBlock = DEFAULT_THREADS_PER_BLOCK;
bool verboseOutput = false;
/*
* Kernel function to search a range of nonces for a solution falling under the macro-configured difficulty (CPU=2^24, GPU=2^32).
*/
// NOTE: __launch_bounds__ caps the legal block size for this kernel at 256
// threads, so the DEFAULT_THREADS_PER_BLOCK values above (512/1024) would need
// to be reconciled with this bound before launching.
__launch_bounds__(256, 1)
__global__ void vblakeHasher(uint32_t *nonceStart, uint32_t *nonceOut, uint64_t *hashStartOut, uint64_t const *headerIn)
{
    // Generate a unique starting nonce for each thread so that no thread's
    // range overlaps another's
    const uint32_t workStart = ((blockDim.x * blockIdx.x + threadIdx.x) * WORK_PER_THREAD) + nonceStart[0];
    uint64_t nonceHeaderSection = headerIn[7];

    // Run the hash WORK_PER_THREAD times
    for (unsigned int nonce = workStart; nonce < workStart + WORK_PER_THREAD; nonce++) {
        // Zero out the nonce position and write the new nonce into the upper
        // 32 bits of the last header word (the final four bytes of the
        // little-endian header prototype)
        nonceHeaderSection &= 0x00000000FFFFFFFFu;
        nonceHeaderSection |= (((uint64_t)nonce) << 32);

        uint64_t hashStart = vBlake2(headerIn, nonceHeaderSection);
        if ((hashStart & 0x00000000FFFFFFFFu) == 0) {
            // Keep only the best solution found on this run of the kernel
            // (lowest hash start wins). Threads may race on these writes;
            // the code accepts that and always sends back high-quality work.
            if (hashStartOut[0] > hashStart || hashStartOut[0] == 0) {
                nonceOut[0] = nonce;
                hashStartOut[0] = hashStart;
            }
            break; // exit loop early
        }
    }
}
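
// Hedged sketch (an assumption, not the gist author's grindNonces): one way a
// host driver could run the kernel end to end. The device buffers, zeroing,
// and cleanup below are hypothetical; only the kernel signature and the
// blocksize/threadsPerBlock globals come from the code above. Error handling
// is kept minimal for brevity.
cudaError_t grindNoncesSketch(uint32_t *nonceResult, uint64_t *hashStart,
                              const uint64_t *header, uint32_t startNonce)
{
    uint32_t *d_nonceStart = nullptr, *d_nonceOut = nullptr;
    uint64_t *d_hashStartOut = nullptr, *d_header = nullptr;
    const uint32_t zero32 = 0;
    const uint64_t zero64 = 0;

    cudaMalloc(&d_nonceStart, sizeof(uint32_t));
    cudaMalloc(&d_nonceOut, sizeof(uint32_t));
    cudaMalloc(&d_hashStartOut, sizeof(uint64_t));
    cudaMalloc(&d_header, 8 * sizeof(uint64_t));

    // Upload the 64-byte header prototype and reset the result slots.
    cudaMemcpy(d_header, header, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice);
    cudaMemcpy(d_nonceStart, &startNonce, sizeof(uint32_t), cudaMemcpyHostToDevice);
    cudaMemcpy(d_nonceOut, &zero32, sizeof(uint32_t), cudaMemcpyHostToDevice);
    cudaMemcpy(d_hashStartOut, &zero64, sizeof(uint64_t), cudaMemcpyHostToDevice);

    // Respect the kernel's 256-thread launch bound regardless of the default.
    const int threads = threadsPerBlock > 256 ? 256 : threadsPerBlock;
    vblakeHasher<<<blocksize, threads>>>(d_nonceStart, d_nonceOut,
                                         d_hashStartOut, d_header);

    // Retrieve the best nonce/hash pair found this launch (0 means no hit).
    cudaMemcpy(nonceResult, d_nonceOut, sizeof(uint32_t), cudaMemcpyDeviceToHost);
    cudaMemcpy(hashStart, d_hashStartOut, sizeof(uint64_t), cudaMemcpyDeviceToHost);

    cudaFree(d_nonceStart);
    cudaFree(d_nonceOut);
    cudaFree(d_hashStartOut);
    cudaFree(d_header);
    return cudaGetLastError();
}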