Last active
January 30, 2019 21:21
-
-
Save monkins1010/bc9d159a0e0e08a816bfe30b2b60bc86 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) { | |
uint2 result; | |
if (offset >= 32) { | |
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); | |
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); | |
} | |
else { | |
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); | |
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); | |
} | |
return __double_as_longlong(__hiloint2double(result.y, result.x)); | |
} | |
__device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) { | |
uint2 result; | |
if (offset < 32) { | |
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); | |
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); | |
} | |
else { | |
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); | |
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); | |
} | |
return __double_as_longlong(__hiloint2double(result.y, result.x)); | |
} | |
__device__ __forceinline__ | |
uint64_t SWAPDWORDS(uint64_t value) | |
{ | |
#if __CUDA_ARCH__ >= 320 | |
uint2 temp; | |
asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value)); | |
asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x)); | |
return value; | |
#else | |
return ROTL64(value, 32); | |
#endif | |
} | |
#define B2B_G(v,a,b,c,d,x,y,c1,c2) { \ | |
v[a] = v[a] + v[b] + (x ^ c1); \ | |
v[d] ^= v[a]; \ | |
v[d] = ROTR64(v[d], 60); \ | |
v[c] = v[c] + v[d]; \ | |
v[b] = ROTR64(v[b] ^ v[c], 43); \ | |
v[a] = v[a] + v[b] + (y ^ c2); \ | |
v[d] = ROTR64(v[d] ^ v[a], 5); \ | |
v[c] = v[c] + v[d]; \ | |
v[b] = ROTR64(v[b] ^ v[c], 18); \ | |
v[d] ^= ~(v[a] | v[b] | v[c]) | (~v[a] & v[b] & v[c]) | (v[a] & ~v[b] & v[c]) | (v[a] & v[b] & ~v[c]); \ | |
v[d] ^= (~v[a] & ~v[b] & v[c]) | (~v[a] & v[b] & ~v[c]) | (v[a] & ~v[b] & ~v[c]) | (v[a] & v[b] & v[c]); \ | |
} | |
cudaStream_t cudastream; | |
uint32_t *blockHeadermobj = nullptr; | |
uint32_t *midStatemobj = nullptr; | |
uint32_t *nonceOutmobj = nullptr; | |
cudaError_t grindNonces(uint32_t *nonceResult, uint64_t *hashStart, const uint64_t *header); | |
__device__ __constant__ | |
static const uint8_t sigma[16][16] = { | |
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, | |
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, | |
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, | |
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, | |
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, | |
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, | |
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, | |
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, | |
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, | |
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, | |
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, | |
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, | |
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, | |
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, | |
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, | |
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } | |
}; | |
__device__ __constant__ | |
static const uint64_t u512[16] = | |
{ | |
0xA51B6A89D489E800ULL, 0xD35B2E0E0B723800ULL, | |
0xA47B39A2AE9F9000ULL, 0x0C0EFA33E77E6488ULL, | |
0x4F452FEC309911EBULL, 0x3CFCC66F74E1022CULL, | |
0x4606AD364DC879DDULL, 0xBBA055B53D47C800ULL, | |
0x531655D90C59EB1BULL, 0xD1A00BA6DAE5B800ULL, | |
0x2FE452DA9632463EULL, 0x98A7B5496226F800ULL, | |
0xBAFCD004F92CA000ULL, 0x64A39957839525E7ULL, | |
0xD859E6F081AAE000ULL, 0x63D980597B560E6BULL | |
}; | |
__device__ __constant__ | |
static const uint64_t vBlake_iv[16] = { | |
0x4bbf42c1f107ad85ull, 0x5D11A8C3B5AEB12Eull, | |
0xA64AB78DC2774652ull, 0xC67595724658F253ull, | |
0xB8864E79CB891E56ull, 0x12ED593E29FB41A1ull, | |
0xB1DA3AB63C60BAA8ull, 0x6D20E50C1F954DEDull, | |
0x4BBF42C1F006AD9Dull, 0x5D11A8C3B5AEB12Eull, | |
0xA64AB78DC2774652ull, 0xC67595724658F253ull, | |
0xb8864e79cb891e16ull, 0x12ED593E29FB41A1ull, | |
0x4e25c549c39f4557ull, 0x6D20E50C1F954DEDull | |
}; | |
__device__ __forceinline__ uint64_t vblake512_compress(uint64_t * __restrict__ m) | |
{ | |
uint64_t v[16]; | |
for (int i = 0; i < 16; i++) | |
{ | |
v[i] = vBlake_iv[i]; | |
} | |
//#pragma unroll 16 | |
for (int i = 0; i < 16; i++) { | |
B2B_G(v, 0, 4, 8, 12, m[sigma[i][1]], m[sigma[i][0]], | |
u512[sigma[i][1]], u512[sigma[i][0]]); | |
B2B_G(v, 1, 5, 9, 13, m[sigma[i][3]], m[sigma[i][2]], | |
u512[sigma[i][3]], u512[sigma[i][2]]); | |
B2B_G(v, 2, 6, 10, 14, m[sigma[i][5]], m[sigma[i][4]], | |
u512[sigma[i][5]], u512[sigma[i][4]]); | |
B2B_G(v, 3, 7, 11, 15, m[sigma[i][7]], m[sigma[i][6]], | |
u512[sigma[i][7]], u512[sigma[i][6]]); | |
B2B_G(v, 0, 5, 10, 15, m[sigma[i][9]], m[sigma[i][8]], | |
u512[sigma[i][9]], u512[sigma[i][8]]); | |
B2B_G(v, 1, 6, 11, 12, m[sigma[i][11]], m[sigma[i][10]], | |
u512[sigma[i][11]], u512[sigma[i][10]]); | |
B2B_G(v, 2, 7, 8, 13, m[sigma[i][13]], m[sigma[i][12]], | |
u512[sigma[i][13]], u512[sigma[i][12]]); | |
B2B_G(v, 3, 4, 9, 14, m[sigma[i][15]], m[sigma[i][14]], | |
u512[sigma[i][15]], u512[sigma[i][14]]); | |
} | |
return 0x3C10ED058B3FE57E ^ v[0] ^ v[8] ^ v[3] ^ v[11] ^ v[6] ^ v[14]; | |
} | |
__device__ __forceinline__ uint64_t vBlake2(const uint64_t * __restrict__ h0, const uint64_t h7) | |
{ | |
uint64_t b[8]; | |
b[0] = h0[0]; | |
b[1] = h0[1]; | |
b[2] = h0[2]; | |
b[3] = h0[3]; | |
b[4] = h0[4]; | |
b[5] = h0[5]; | |
b[6] = h0[6]; | |
b[7] = h7; | |
return vblake512_compress(b); | |
} | |
#if CPU_SHARES | |
#define WORK_PER_THREAD 256 | |
#else | |
#define WORK_PER_THREAD 256 | |
#endif | |
#if HIGH_RESOURCE | |
#define DEFAULT_BLOCKSIZE 512 | |
#define DEFAULT_THREADS_PER_BLOCK 1024 | |
#else | |
#define DEFAULT_BLOCKSIZE 512 | |
#define DEFAULT_THREADS_PER_BLOCK 512 | |
#endif | |
int blocksize = DEFAULT_BLOCKSIZE; | |
int threadsPerBlock = DEFAULT_THREADS_PER_BLOCK; | |
bool verboseOutput = false; | |
/* | |
* Kernel function to search a range of nonces for a solution falling under the macro-configured difficulty (CPU=2^24, GPU=2^32). | |
*/ | |
__launch_bounds__(256, 1) | |
__global__ void vblakeHasher(uint32_t *nonceStart, uint32_t *nonceOut, uint64_t *hashStartOut, uint64_t const *headerIn) | |
{ | |
// Generate a unique starting nonce for each thread that doesn't overlap with the work of any other thread | |
const uint32_t workStart = ((blockDim.x * blockIdx.x + threadIdx.x) * WORK_PER_THREAD) + nonceStart[0]; | |
uint64_t nonceHeaderSection = headerIn[7]; | |
// Run the hash WORK_PER_THREAD times | |
for (unsigned int nonce = workStart; nonce < workStart + WORK_PER_THREAD; nonce++) { | |
// Zero out nonce position and write new nonce to last 32 bits of prototype header | |
nonceHeaderSection &= 0x00000000FFFFFFFFu; | |
nonceHeaderSection |= (((uint64_t)nonce) << 32); | |
uint64_t hashStart = vBlake2(headerIn, nonceHeaderSection); | |
if ((hashStart & 0x00000000FFFFFFFFu) == 0) { | |
// Check that found solution is better than existing solution if one has already been found on this run of the kernel (always send back highest-quality work) | |
if (hashStartOut[0] > hashStart || hashStartOut[0] == 0) { | |
nonceOut[0] = nonce; | |
hashStartOut[0] = hashStart; | |
} | |
// exit loop early | |
nonce = workStart + WORK_PER_THREAD; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment