// 64-bit rotate-left built from two 32-bit funnel shifts (shf.l.wrap). The
// instruction only sees the shift count modulo 32, so the offset >= 32 case
// swaps which half of the word feeds each funnel shift.
__device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) {
    uint2 result;
    if (offset >= 32) {
        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
    }
    else {
        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
    }
    return __double_as_longlong(__hiloint2double(result.y, result.x));
}

// 64-bit rotate-right, mirroring ROTL64 with shf.r.wrap.
__device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) {
    uint2 result;
    if (offset < 32) {
        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
    }
    else {
        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
    }
    return __double_as_longlong(__hiloint2double(result.y, result.x));
}
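
// Hedged reference sketch (added for illustration, not part of the original
// gist): portable host-side rotates that the PTX funnel-shift versions above
// should match bit-for-bit for offsets 0..63. ROTL64_ref/ROTR64_ref are
// hypothetical names introduced here.
static inline uint64_t ROTL64_ref(uint64_t v, int n)
{
    n &= 63;                                   // reduce the count mod 64
    return n ? (v << n) | (v >> (64 - n)) : v; // guard the undefined shift by 64
}
static inline uint64_t ROTR64_ref(uint64_t v, int n)
{
    n &= 63;
    return n ? (v >> n) | (v << (64 - n)) : v;
}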
// Swap the two 32-bit halves of a 64-bit word; on sm_32+ this compiles to a
// pair of register moves, otherwise it falls back to ROTL64(value, 32).
__device__ __forceinline__
uint64_t SWAPDWORDS(uint64_t value)
{
#if __CUDA_ARCH__ >= 320
    uint2 temp;
    asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value));
    asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x));
    return value;
#else
    return ROTL64(value, 32);
#endif
}
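
// Illustrative sanity check (an assumption added here, not part of the
// original gist): swapping the 32-bit halves is exactly a rotate by 32, so
// both branches of SWAPDWORDS must agree.
static_assert(((0x1122334455667788ULL << 32) | (0x1122334455667788ULL >> 32))
              == 0x5566778811223344ULL, "half swap == rotate by 32");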
// vBlake's G mixing function: a BLAKE2b-style G with rotation constants
// 60/43/5/18, per-word round constants XORed into the message words, and two
// extra mask lines applied to v[d] at the end.
#define B2B_G(v,a,b,c,d,x,y,c1,c2) { \
    v[a] = v[a] + v[b] + (x ^ c1); \
    v[d] ^= v[a]; \
    v[d] = ROTR64(v[d], 60); \
    v[c] = v[c] + v[d]; \
    v[b] = ROTR64(v[b] ^ v[c], 43); \
    v[a] = v[a] + v[b] + (y ^ c2); \
    v[d] = ROTR64(v[d] ^ v[a], 5); \
    v[c] = v[c] + v[d]; \
    v[b] = ROTR64(v[b] ^ v[c], 18); \
    v[d] ^= ~(v[a] | v[b] | v[c]) | (~v[a] & v[b] & v[c]) | (v[a] & ~v[b] & v[c]) | (v[a] & v[b] & ~v[c]); \
    v[d] ^= (~v[a] & ~v[b] & v[c]) | (~v[a] & v[b] & ~v[c]) | (v[a] & ~v[b] & ~v[c]) | (v[a] & v[b] & v[c]); \
}
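
// Derivation note (added, not from the original): the two trailing mask lines
// in B2B_G partition the eight truth assignments of (a, b, c) by parity. The
// first mask is set wherever an even number of the three bits are set, i.e.
// ~(a ^ b ^ c); the second wherever an odd number are set, i.e. (a ^ b ^ c).
// XORing v[d] with both masks XORs it with ~P ^ P == ~0, so the pair of lines
// reduces to v[d] = ~v[d]. The hypothetical helper below checks this identity
// on full 64-bit words; it is illustration only, not part of the miner.
static inline bool b2b_tail_is_bitwise_not(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
    const uint64_t even = ~(a | b | c) | (~a & b & c) | (a & ~b & c) | (a & b & ~c);
    const uint64_t odd  = (~a & ~b & c) | (~a & b & ~c) | (a & ~b & ~c) | (a & b & c);
    return ((d ^ even) ^ odd) == ~d; // holds for all inputs
}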
// Host-side stream and device-buffer handles used by the miner's driver code.
cudaStream_t cudastream;
uint32_t *blockHeadermobj = nullptr;
uint32_t *midStatemobj = nullptr;
uint32_t *nonceOutmobj = nullptr;

// Host-side entry point that searches a range of nonces (defined elsewhere).
cudaError_t grindNonces(uint32_t *nonceResult, uint64_t *hashStart, const uint64_t *header);
// Word-permutation schedule: rows 10-15 repeat rows 0-5 so the table covers
// all 16 vBlake rounds.
__device__ __constant__
static const uint8_t sigma[16][16] = {
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
    { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
    { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
    { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
    { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
    { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
    { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
    { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
    { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
    { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
    { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
    { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
    { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
    { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
    { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
// Per-word round constants XORed into the message words inside B2B_G.
__device__ __constant__
static const uint64_t u512[16] =
{
    0xA51B6A89D489E800ULL, 0xD35B2E0E0B723800ULL,
    0xA47B39A2AE9F9000ULL, 0x0C0EFA33E77E6488ULL,
    0x4F452FEC309911EBULL, 0x3CFCC66F74E1022CULL,
    0x4606AD364DC879DDULL, 0xBBA055B53D47C800ULL,
    0x531655D90C59EB1BULL, 0xD1A00BA6DAE5B800ULL,
    0x2FE452DA9632463EULL, 0x98A7B5496226F800ULL,
    0xBAFCD004F92CA000ULL, 0x64A39957839525E7ULL,
    0xD859E6F081AAE000ULL, 0x63D980597B560E6BULL
};
// Initialization vector for the 16-word working state v[].
__device__ __constant__
static const uint64_t vBlake_iv[16] = {
    0x4BBF42C1F107AD85ULL, 0x5D11A8C3B5AEB12EULL,
    0xA64AB78DC2774652ULL, 0xC67595724658F253ULL,
    0xB8864E79CB891E56ULL, 0x12ED593E29FB41A1ULL,
    0xB1DA3AB63C60BAA8ULL, 0x6D20E50C1F954DEDULL,
    0x4BBF42C1F006AD9DULL, 0x5D11A8C3B5AEB12EULL,
    0xA64AB78DC2774652ULL, 0xC67595724658F253ULL,
    0xB8864E79CB891E16ULL, 0x12ED593E29FB41A1ULL,
    0x4E25C549C39F4557ULL, 0x6D20E50C1F954DEDULL
};
// Core compression: seed v[] from the IV, run 16 rounds of eight G calls,
// then fold the state down to the single 64-bit vBlake result.
__device__ __forceinline__ uint64_t vblake512_compress(const uint64_t * __restrict__ m)
{
    uint64_t v[16];
    for (int i = 0; i < 16; i++)
    {
        v[i] = vBlake_iv[i];
    }
    //#pragma unroll 16
    for (int i = 0; i < 16; i++) {
        // Column steps
        B2B_G(v, 0, 4, 8, 12, m[sigma[i][1]], m[sigma[i][0]],
            u512[sigma[i][1]], u512[sigma[i][0]]);
        B2B_G(v, 1, 5, 9, 13, m[sigma[i][3]], m[sigma[i][2]],
            u512[sigma[i][3]], u512[sigma[i][2]]);
        B2B_G(v, 2, 6, 10, 14, m[sigma[i][5]], m[sigma[i][4]],
            u512[sigma[i][5]], u512[sigma[i][4]]);
        B2B_G(v, 3, 7, 11, 15, m[sigma[i][7]], m[sigma[i][6]],
            u512[sigma[i][7]], u512[sigma[i][6]]);
        // Diagonal steps
        B2B_G(v, 0, 5, 10, 15, m[sigma[i][9]], m[sigma[i][8]],
            u512[sigma[i][9]], u512[sigma[i][8]]);
        B2B_G(v, 1, 6, 11, 12, m[sigma[i][11]], m[sigma[i][10]],
            u512[sigma[i][11]], u512[sigma[i][10]]);
        B2B_G(v, 2, 7, 8, 13, m[sigma[i][13]], m[sigma[i][12]],
            u512[sigma[i][13]], u512[sigma[i][12]]);
        B2B_G(v, 3, 4, 9, 14, m[sigma[i][15]], m[sigma[i][14]],
            u512[sigma[i][15]], u512[sigma[i][14]]);
    }
    return 0x3C10ED058B3FE57EULL ^ v[0] ^ v[8] ^ v[3] ^ v[11] ^ v[6] ^ v[14];
}
// Assemble the 64-byte message block: the first seven 64-bit words come from
// the header prototype, the eighth word carries the nonce.
__device__ __forceinline__ uint64_t vBlake2(const uint64_t * __restrict__ h0, const uint64_t h7)
{
    uint64_t b[8];
    b[0] = h0[0];
    b[1] = h0[1];
    b[2] = h0[2];
    b[3] = h0[3];
    b[4] = h0[4];
    b[5] = h0[5];
    b[6] = h0[6];
    b[7] = h7;
    return vblake512_compress(b);
}
// Nonces searched per thread. Note that both branches currently define the
// same value; the CPU_SHARES split is kept so the two cases can be tuned
// independently.
#if CPU_SHARES
#define WORK_PER_THREAD 256
#else
#define WORK_PER_THREAD 256
#endif

// Launch geometry; HIGH_RESOURCE doubles the threads per block.
#if HIGH_RESOURCE
#define DEFAULT_BLOCKSIZE 512
#define DEFAULT_THREADS_PER_BLOCK 1024
#else
#define DEFAULT_BLOCKSIZE 512
#define DEFAULT_THREADS_PER_BLOCK 512
#endif
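
// Coverage arithmetic (derived from the defaults above, added as a note): one
// kernel launch scans blocksize * threadsPerBlock * WORK_PER_THREAD nonces,
// i.e. 512 * 512 * 256 = 2^26 by default, or 512 * 1024 * 256 = 2^27 with
// HIGH_RESOURCE.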
int blocksize = DEFAULT_BLOCKSIZE;
int threadsPerBlock = DEFAULT_THREADS_PER_BLOCK;
bool verboseOutput = false;
/*
* Kernel function to search a range of nonces for a solution falling under the macro-configured difficulty (CPU=2^24, GPU=2^32).
*/
// NOTE: __launch_bounds__ caps the legal block size for this kernel at 256
// threads, so the DEFAULT_THREADS_PER_BLOCK values above (512/1024) would need
// to be reconciled with this bound before launching.
__launch_bounds__(256, 1)
__global__ void vblakeHasher(uint32_t *nonceStart, uint32_t *nonceOut, uint64_t *hashStartOut, uint64_t const *headerIn)
{
    // Generate a unique starting nonce for each thread so that no thread's
    // range overlaps another's
    const uint32_t workStart = ((blockDim.x * blockIdx.x + threadIdx.x) * WORK_PER_THREAD) + nonceStart[0];
    uint64_t nonceHeaderSection = headerIn[7];

    // Run the hash WORK_PER_THREAD times
    for (unsigned int nonce = workStart; nonce < workStart + WORK_PER_THREAD; nonce++) {
        // Zero out the nonce position and write the new nonce into the upper
        // 32 bits of the last header word (the final four bytes of the
        // little-endian header prototype)
        nonceHeaderSection &= 0x00000000FFFFFFFFu;
        nonceHeaderSection |= (((uint64_t)nonce) << 32);

        uint64_t hashStart = vBlake2(headerIn, nonceHeaderSection);
        if ((hashStart & 0x00000000FFFFFFFFu) == 0) {
            // Keep only the best solution found on this run of the kernel
            // (lowest hash start wins). Threads may race on these writes;
            // the code accepts that and always sends back high-quality work.
            if (hashStartOut[0] > hashStart || hashStartOut[0] == 0) {
                nonceOut[0] = nonce;
                hashStartOut[0] = hashStart;
            }
            break; // exit loop early
        }
    }
}
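
// Hedged sketch (an assumption, not the gist author's grindNonces): one way a
// host driver could run the kernel end to end. The device buffers, zeroing,
// and cleanup below are hypothetical; only the kernel signature and the
// blocksize/threadsPerBlock globals come from the code above. Error handling
// is kept minimal for brevity.
cudaError_t grindNoncesSketch(uint32_t *nonceResult, uint64_t *hashStart,
                              const uint64_t *header, uint32_t startNonce)
{
    uint32_t *d_nonceStart = nullptr, *d_nonceOut = nullptr;
    uint64_t *d_hashStartOut = nullptr, *d_header = nullptr;
    const uint32_t zero32 = 0;
    const uint64_t zero64 = 0;

    cudaMalloc(&d_nonceStart, sizeof(uint32_t));
    cudaMalloc(&d_nonceOut, sizeof(uint32_t));
    cudaMalloc(&d_hashStartOut, sizeof(uint64_t));
    cudaMalloc(&d_header, 8 * sizeof(uint64_t));

    // Upload the 64-byte header prototype and reset the result slots.
    cudaMemcpy(d_header, header, 8 * sizeof(uint64_t), cudaMemcpyHostToDevice);
    cudaMemcpy(d_nonceStart, &startNonce, sizeof(uint32_t), cudaMemcpyHostToDevice);
    cudaMemcpy(d_nonceOut, &zero32, sizeof(uint32_t), cudaMemcpyHostToDevice);
    cudaMemcpy(d_hashStartOut, &zero64, sizeof(uint64_t), cudaMemcpyHostToDevice);

    // Respect the kernel's 256-thread launch bound regardless of the default.
    const int threads = threadsPerBlock > 256 ? 256 : threadsPerBlock;
    vblakeHasher<<<blocksize, threads>>>(d_nonceStart, d_nonceOut,
                                         d_hashStartOut, d_header);

    // Retrieve the best nonce/hash pair found this launch (0 means no hit).
    cudaMemcpy(nonceResult, d_nonceOut, sizeof(uint32_t), cudaMemcpyDeviceToHost);
    cudaMemcpy(hashStart, d_hashStartOut, sizeof(uint64_t), cudaMemcpyDeviceToHost);

    cudaFree(d_nonceStart);
    cudaFree(d_nonceOut);
    cudaFree(d_hashStartOut);
    cudaFree(d_header);
    return cudaGetLastError();
}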