Skip to content

Instantly share code, notes, and snippets.

@monkins1010
Last active July 21, 2018 09:01
Show Gist options
  • Save monkins1010/ada8792804e5dd58191d9debc811a6ca to your computer and use it in GitHub Desktop.
Save monkins1010/ada8792804e5dd58191d9debc811a6ca to your computer and use it in GitHub Desktop.
template<size_t ITERATIONS, uint32_t MEM, uint32_t MASK, xmrig::Algo ALGO, uint8_t VARIANT>
#ifdef XMR_STAK_THREADS
__launch_bounds__( XMR_STAK_THREADS * 4 )
#endif
__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
uint64_t startNonce, uint32_t * __restrict__ d_input )
{
__shared__ uint32_t sharedMemory[1024];
cn_aes_gpu_init( sharedMemory );
__syncthreads( );
const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2;
const uint64_t nonce = startNonce + thread;
const int sub = threadIdx.x & 3;
const int sub2 = sub & 2;
#if( __CUDA_ARCH__ < 300 )
extern __shared__ uint32_t shuffleMem[];
volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC));
#else
volatile uint32_t* sPtr = NULL;
#endif
if ( thread >= threads )
return;
int i, k;
uint32_t j;
const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor );
const int start = partidx * batchsize;
const int end = start + batchsize;
uint32_t * long_state = &d_long_state[(IndexType) thread * MEM];
uint32_t a, d[2], idx0, jf;
uint32_t t1[2], t2[2], res, res2;
uint32_t tweak1_2[2];
/* if (VARIANT > 0)
{
uint32_t * state = d_ctx_state + thread * 50;
tweak1_2[0] = (d_input[8] >> 24) | (d_input[9] << 8);
tweak1_2[0] ^= state[48];
tweak1_2[1] = nonce;
tweak1_2[1] ^= state[49];
} */
a = (d_ctx_a + thread * 4)[sub];
idx0 = shuffle<4>(sPtr,sub, a, 0);
d[1] = (d_ctx_b + thread * 4)[sub];
#pragma unroll 2
for ( i = start; i < end; ++i )
{
#pragma unroll 2
for (int x = 0; x < 2; ++x)
{
j = ((idx0 & MASK) >> 2) + sub;
const uint32_t x_0 = loadGlobal32<uint32_t>(long_state + j);
const uint32_t x_1 = shuffle<4>(sPtr, sub, x_0, sub + 1);
const uint32_t x_2 = shuffle<4>(sPtr, sub, x_0, sub + 2);
const uint32_t x_3 = shuffle<4>(sPtr, sub, x_0, sub + 3);
d[x] = a ^
t_fn0(x_0 & 0xff) ^
t_fn1((x_1 >> 8) & 0xff) ^
t_fn2((x_2 >> 16) & 0xff) ^
t_fn3((x_3 >> 24));
//XOR_BLOCKS_DST(c, b, &long_state[j]);
t1[0] = shuffle<4>(sPtr, sub, d[x], 0);
///****************** VARIANT THAT NWEEDS TO BE COPIED AND PASTED DOWN ON LINE 100 and adapted to work on al0 (from CPU code)*******
const uint32_t z = d[0] ^ d[1];
if (VARIANT > 0)
{
const uint32_t table = 0x86420U;
const uint32_t index = ((z >> 26) & 12) | ((z >> 23) & 2);
const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
storeGlobal32(long_state + j, sub == 2 ? fork_7 : z);
}
else
storeGlobal32(long_state + j, z);
///****************** VARIANT THAT NWEEDS TO BE COPIED END *******
//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]);
j = ((*t1 & MASK) >> 2) + sub;
uint32_t yy[2];
*((uint64_t*)yy) = loadGlobal64<uint64_t>(((uint64_t *)long_state) + (j >> 1));
uint32_t zz[2];
zz[0] = shuffle<4>(sPtr, sub, yy[0], 0);
zz[1] = shuffle<4>(sPtr, sub, yy[1], 0);
t1[1] = shuffle<4>(sPtr, sub, d[x], 1);
//*************code that needs to be fixed start****************************
#pragma unroll
for (k = 0; k < 2; k++)
t2[k] = shuffle<4>(sPtr, sub, a, k + sub2);
*((uint64_t *)t2) += sub2 ? (*((uint64_t *)t1) * *((uint64_t*)zz)) : __umul64hi(*((uint64_t *)t1), *((uint64_t*)zz));
res = *((uint64_t *)t2) >> (sub & 1 ? 32 : 0);
if(VARIANT > 0)
{
const uint32_t tweaked_res = tweak1_2[sub & 1] ^ res;
const uint32_t long_state_update = sub2 ? tweaked_res : res;
storeGlobal32( long_state + j, long_state_update );
}
else
storeGlobal32( long_state + j, res );
a = ( sub & 1 ? yy[1] : yy[0] ) ^ res;
idx0 = shuffle<4>(sPtr,sub, a, 0);
//*************code that needs to be fixed finish****************************
}
}
if ( bfactor > 0 )
{
(d_ctx_a + thread * 4)[sub] = a;
(d_ctx_b + thread * 4)[sub] = d[1];
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment