monkins1010/GPU code for monero hash phase 2

## GPU code for monero hash phase 2
template<size_t ITERATIONS, uint32_t MEM, uint32_t MASK, xmrig::Algo ALGO, uint8_t VARIANT>
#ifdef XMR_STAK_THREADS
__launch_bounds__( XMR_STAK_THREADS * 4 )
#endif
__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
        uint64_t startNonce, uint32_t * __restrict__ d_input )
{
    __shared__ uint32_t sharedMemory[1024];

    cn_aes_gpu_init( sharedMemory );

    __syncthreads( );

    const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2;
    const uint64_t nonce = startNonce + thread;
    const int sub = threadIdx.x & 3;
    const int sub2 = sub & 2;

#if( __CUDA_ARCH__ < 300 )
        extern __shared__ uint32_t shuffleMem[];
        volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC));
#else
        volatile uint32_t* sPtr = NULL;
#endif
    if ( thread >= threads )
        return;

    int i, k;
    uint32_t j;
    const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor );
    const int start = partidx * batchsize;
    const int end = start + batchsize;
    uint32_t * long_state = &d_long_state[(IndexType) thread * MEM];
    uint32_t a, d[2], idx0, jf;
    uint32_t t1[2], t2[2], res, res2;

    uint32_t tweak1_2[2];
   /* if (VARIANT > 0)
    {
        uint32_t * state = d_ctx_state + thread * 50;
        tweak1_2[0] = (d_input[8] >> 24) | (d_input[9] << 8);
        tweak1_2[0] ^= state[48];
        tweak1_2[1] = nonce;
        tweak1_2[1] ^= state[49];
    } */

    a = (d_ctx_a + thread * 4)[sub];
    idx0 = shuffle<4>(sPtr,sub, a, 0);

    d[1] = (d_ctx_b + thread * 4)[sub];

    #pragma unroll 2
    for ( i = start; i < end; ++i )
    {
        #pragma unroll 2
		for (int x = 0; x < 2; ++x)
		{
			j = ((idx0 & MASK) >> 2) + sub;

			const uint32_t x_0 = loadGlobal32<uint32_t>(long_state + j);
			const uint32_t x_1 = shuffle<4>(sPtr, sub, x_0, sub + 1);
			const uint32_t x_2 = shuffle<4>(sPtr, sub, x_0, sub + 2);
			const uint32_t x_3 = shuffle<4>(sPtr, sub, x_0, sub + 3);
			d[x] = a ^
				t_fn0(x_0 & 0xff) ^
				t_fn1((x_1 >> 8) & 0xff) ^
				t_fn2((x_2 >> 16) & 0xff) ^
				t_fn3((x_3 >> 24));


			//XOR_BLOCKS_DST(c, b, &long_state[j]);
			t1[0] = shuffle<4>(sPtr, sub, d[x], 0);

///******************  VARIANT THAT NWEEDS TO BE COPIED AND PASTED DOWN ON LINE 100 and adapted to work on al0 (from CPU code)*******

			const uint32_t z = d[0] ^ d[1];
			if (VARIANT > 0)
			{
				const uint32_t table = 0x86420U;
				const uint32_t index = ((z >> 26) & 12) | ((z >> 23) & 2);
				const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
				storeGlobal32(long_state + j, sub == 2 ? fork_7 : z);
			}
			else
			storeGlobal32(long_state + j, z);

///******************  VARIANT THAT NWEEDS TO BE COPIED END *******

			//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]);
			j = ((*t1 & MASK) >> 2) + sub;

			uint32_t yy[2];
			*((uint64_t*)yy) = loadGlobal64<uint64_t>(((uint64_t *)long_state) + (j >> 1));
			uint32_t zz[2];
			zz[0] = shuffle<4>(sPtr, sub, yy[0], 0);
			zz[1] = shuffle<4>(sPtr, sub, yy[1], 0);

			t1[1] = shuffle<4>(sPtr, sub, d[x], 1);


//*************code that needs to be fixed start****************************
	#pragma unroll
	for (k = 0; k < 2; k++)
	t2[k] = shuffle<4>(sPtr, sub, a, k + sub2);

	*((uint64_t *)t2) += sub2 ? (*((uint64_t *)t1) * *((uint64_t*)zz)) : __umul64hi(*((uint64_t *)t1), *((uint64_t*)zz));

	res = *((uint64_t *)t2) >> (sub & 1 ? 32 : 0);

	   if(VARIANT > 0)
           {
             const uint32_t tweaked_res = tweak1_2[sub & 1] ^ res;
             const uint32_t long_state_update = sub2 ? tweaked_res : res;
             storeGlobal32( long_state + j, long_state_update );
           }
          else
          storeGlobal32( long_state + j, res );

          a = ( sub & 1 ? yy[1] : yy[0] ) ^ res;
	  idx0 = shuffle<4>(sPtr,sub, a, 0);

//*************code that needs to be fixed finish****************************
        }
    }

    if ( bfactor > 0 )
    {
        (d_ctx_a + thread * 4)[sub] = a;
        (d_ctx_b + thread * 4)[sub] = d[1];

    }
}
	template<size_t ITERATIONS, uint32_t MEM, uint32_t MASK, xmrig::Algo ALGO, uint8_t VARIANT>
	#ifdef XMR_STAK_THREADS
	__launch_bounds__( XMR_STAK_THREADS * 4 )
	#endif
	__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
	uint64_t startNonce, uint32_t * __restrict__ d_input )
	{
	__shared__ uint32_t sharedMemory[1024];

	cn_aes_gpu_init( sharedMemory );

	__syncthreads( );

	const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2;
	const uint64_t nonce = startNonce + thread;
	const int sub = threadIdx.x & 3;
	const int sub2 = sub & 2;

	#if( __CUDA_ARCH__ < 300 )
	extern __shared__ uint32_t shuffleMem[];
	volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC));
	#else
	volatile uint32_t* sPtr = NULL;
	#endif
	if ( thread >= threads )
	return;

	int i, k;
	uint32_t j;
	const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor );
	const int start = partidx * batchsize;
	const int end = start + batchsize;
	uint32_t * long_state = &d_long_state[(IndexType) thread * MEM];
	uint32_t a, d[2], idx0, jf;
	uint32_t t1[2], t2[2], res, res2;

	uint32_t tweak1_2[2];
	/* if (VARIANT > 0)
	{
	uint32_t * state = d_ctx_state + thread * 50;
	tweak1_2[0] = (d_input[8] >> 24) \| (d_input[9] << 8);
	tweak1_2[0] ^= state[48];
	tweak1_2[1] = nonce;
	tweak1_2[1] ^= state[49];
	} */

	a = (d_ctx_a + thread * 4)[sub];
	idx0 = shuffle<4>(sPtr,sub, a, 0);

	d[1] = (d_ctx_b + thread * 4)[sub];

	#pragma unroll 2
	for ( i = start; i < end; ++i )
	{
	#pragma unroll 2
	for (int x = 0; x < 2; ++x)
	{
	j = ((idx0 & MASK) >> 2) + sub;

	const uint32_t x_0 = loadGlobal32<uint32_t>(long_state + j);
	const uint32_t x_1 = shuffle<4>(sPtr, sub, x_0, sub + 1);
	const uint32_t x_2 = shuffle<4>(sPtr, sub, x_0, sub + 2);
	const uint32_t x_3 = shuffle<4>(sPtr, sub, x_0, sub + 3);
	d[x] = a ^
	t_fn0(x_0 & 0xff) ^
	t_fn1((x_1 >> 8) & 0xff) ^
	t_fn2((x_2 >> 16) & 0xff) ^
	t_fn3((x_3 >> 24));


	//XOR_BLOCKS_DST(c, b, &long_state[j]);
	t1[0] = shuffle<4>(sPtr, sub, d[x], 0);

	///**************** VARIANT THAT NWEEDS TO BE COPIED AND PASTED DOWN ON LINE 100 and adapted to work on al0 (from CPU code)*****

	const uint32_t z = d[0] ^ d[1];
	if (VARIANT > 0)
	{
	const uint32_t table = 0x86420U;
	const uint32_t index = ((z >> 26) & 12) \| ((z >> 23) & 2);
	const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
	storeGlobal32(long_state + j, sub == 2 ? fork_7 : z);
	}
	else
	storeGlobal32(long_state + j, z);

	///**************** VARIANT THAT NWEEDS TO BE COPIED END *****

	//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]);
	j = ((*t1 & MASK) >> 2) + sub;

	uint32_t yy[2];
	((uint64_t)yy) = loadGlobal64<uint64_t>(((uint64_t *)long_state) + (j >> 1));
	uint32_t zz[2];
	zz[0] = shuffle<4>(sPtr, sub, yy[0], 0);
	zz[1] = shuffle<4>(sPtr, sub, yy[1], 0);

	t1[1] = shuffle<4>(sPtr, sub, d[x], 1);


	//***********code that needs to be fixed start**************************
	#pragma unroll
	for (k = 0; k < 2; k++)
	t2[k] = shuffle<4>(sPtr, sub, a, k + sub2);

	((uint64_t )t2) += sub2 ? (((uint64_t )t1) * ((uint64_t)zz)) : __umul64hi(((uint64_t )t1), ((uint64_t)zz));

	res = ((uint64_t )t2) >> (sub & 1 ? 32 : 0);

	if(VARIANT > 0)
	{
	const uint32_t tweaked_res = tweak1_2[sub & 1] ^ res;
	const uint32_t long_state_update = sub2 ? tweaked_res : res;
	storeGlobal32( long_state + j, long_state_update );
	}
	else
	storeGlobal32( long_state + j, res );

	a = ( sub & 1 ? yy[1] : yy[0] ) ^ res;
	idx0 = shuffle<4>(sPtr,sub, a, 0);

	//***********code that needs to be fixed finish**************************
	}
	}

	if ( bfactor > 0 )
	{
	(d_ctx_a + thread * 4)[sub] = a;
	(d_ctx_b + thread * 4)[sub] = d[1];

	}
	}