Last active
July 21, 2018 09:01
-
-
Save monkins1010/ada8792804e5dd58191d9debc811a6ca to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
template<size_t ITERATIONS, uint32_t MEM, uint32_t MASK, xmrig::Algo ALGO, uint8_t VARIANT> | |
#ifdef XMR_STAK_THREADS | |
__launch_bounds__( XMR_STAK_THREADS * 4 ) | |
#endif | |
__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, | |
uint64_t startNonce, uint32_t * __restrict__ d_input ) | |
{ | |
__shared__ uint32_t sharedMemory[1024]; | |
cn_aes_gpu_init( sharedMemory ); | |
__syncthreads( ); | |
const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2; | |
const uint64_t nonce = startNonce + thread; | |
const int sub = threadIdx.x & 3; | |
const int sub2 = sub & 2; | |
#if( __CUDA_ARCH__ < 300 ) | |
extern __shared__ uint32_t shuffleMem[]; | |
volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC)); | |
#else | |
volatile uint32_t* sPtr = NULL; | |
#endif | |
if ( thread >= threads ) | |
return; | |
int i, k; | |
uint32_t j; | |
const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor ); | |
const int start = partidx * batchsize; | |
const int end = start + batchsize; | |
uint32_t * long_state = &d_long_state[(IndexType) thread * MEM]; | |
uint32_t a, d[2], idx0, jf; | |
uint32_t t1[2], t2[2], res, res2; | |
uint32_t tweak1_2[2]; | |
/* if (VARIANT > 0) | |
{ | |
uint32_t * state = d_ctx_state + thread * 50; | |
tweak1_2[0] = (d_input[8] >> 24) | (d_input[9] << 8); | |
tweak1_2[0] ^= state[48]; | |
tweak1_2[1] = nonce; | |
tweak1_2[1] ^= state[49]; | |
} */ | |
a = (d_ctx_a + thread * 4)[sub]; | |
idx0 = shuffle<4>(sPtr,sub, a, 0); | |
d[1] = (d_ctx_b + thread * 4)[sub]; | |
#pragma unroll 2 | |
for ( i = start; i < end; ++i ) | |
{ | |
#pragma unroll 2 | |
for (int x = 0; x < 2; ++x) | |
{ | |
j = ((idx0 & MASK) >> 2) + sub; | |
const uint32_t x_0 = loadGlobal32<uint32_t>(long_state + j); | |
const uint32_t x_1 = shuffle<4>(sPtr, sub, x_0, sub + 1); | |
const uint32_t x_2 = shuffle<4>(sPtr, sub, x_0, sub + 2); | |
const uint32_t x_3 = shuffle<4>(sPtr, sub, x_0, sub + 3); | |
d[x] = a ^ | |
t_fn0(x_0 & 0xff) ^ | |
t_fn1((x_1 >> 8) & 0xff) ^ | |
t_fn2((x_2 >> 16) & 0xff) ^ | |
t_fn3((x_3 >> 24)); | |
//XOR_BLOCKS_DST(c, b, &long_state[j]); | |
t1[0] = shuffle<4>(sPtr, sub, d[x], 0); | |
///****************** VARIANT THAT NWEEDS TO BE COPIED AND PASTED DOWN ON LINE 100 and adapted to work on al0 (from CPU code)******* | |
const uint32_t z = d[0] ^ d[1]; | |
if (VARIANT > 0) | |
{ | |
const uint32_t table = 0x86420U; | |
const uint32_t index = ((z >> 26) & 12) | ((z >> 23) & 2); | |
const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24; | |
storeGlobal32(long_state + j, sub == 2 ? fork_7 : z); | |
} | |
else | |
storeGlobal32(long_state + j, z); | |
///****************** VARIANT THAT NWEEDS TO BE COPIED END ******* | |
//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]); | |
j = ((*t1 & MASK) >> 2) + sub; | |
uint32_t yy[2]; | |
*((uint64_t*)yy) = loadGlobal64<uint64_t>(((uint64_t *)long_state) + (j >> 1)); | |
uint32_t zz[2]; | |
zz[0] = shuffle<4>(sPtr, sub, yy[0], 0); | |
zz[1] = shuffle<4>(sPtr, sub, yy[1], 0); | |
t1[1] = shuffle<4>(sPtr, sub, d[x], 1); | |
//*************code that needs to be fixed start**************************** | |
#pragma unroll | |
for (k = 0; k < 2; k++) | |
t2[k] = shuffle<4>(sPtr, sub, a, k + sub2); | |
*((uint64_t *)t2) += sub2 ? (*((uint64_t *)t1) * *((uint64_t*)zz)) : __umul64hi(*((uint64_t *)t1), *((uint64_t*)zz)); | |
res = *((uint64_t *)t2) >> (sub & 1 ? 32 : 0); | |
if(VARIANT > 0) | |
{ | |
const uint32_t tweaked_res = tweak1_2[sub & 1] ^ res; | |
const uint32_t long_state_update = sub2 ? tweaked_res : res; | |
storeGlobal32( long_state + j, long_state_update ); | |
} | |
else | |
storeGlobal32( long_state + j, res ); | |
a = ( sub & 1 ? yy[1] : yy[0] ) ^ res; | |
idx0 = shuffle<4>(sPtr,sub, a, 0); | |
//*************code that needs to be fixed finish**************************** | |
} | |
} | |
if ( bfactor > 0 ) | |
{ | |
(d_ctx_a + thread * 4)[sub] = a; | |
(d_ctx_b + thread * 4)[sub] = d[1]; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment