Skip to content

Instantly share code, notes, and snippets.

@Griffitsj
Last active August 29, 2015 14:03
Show Gist options
  • Save Griffitsj/55349ccee17bbf87ffd5 to your computer and use it in GitHub Desktop.
Save Griffitsj/55349ccee17bbf87ffd5 to your computer and use it in GitHub Desktop.
Even more optimised version of blake256.cl from https://github.com/kR105/cgminer
// (c) 2013 originally written by smolen, modified by kr105
// (c) 2014 additional optimisations by griffitsj
// Right-rotate every 32-bit lane of a uint4 by n bits, expressed as a
// left-rotate by (32 - n) so the OpenCL built-in rotate() can be used
// (rotate() only rotates left).
#define SPH_ROTR32(v,n) rotate((uint4)(v),(uint)(32-(n)))
// Aliases for the 16 words of the BLAKE internal state v[0..15], which are
// packed into four uint4 vectors (v0to3, v4to7, v8toB, vCtoF) so that a
// whole row or diagonal of the state can be updated per vector instruction.
#define V0 v0to3.x
#define V1 v0to3.y
#define V2 v0to3.z
#define V3 v0to3.w
#define V4 v4to7.x
#define V5 v4to7.y
#define V6 v4to7.z
#define V7 v4to7.w
#define V8 v8toB.x
#define V9 v8toB.y
#define VA v8toB.z
#define VB v8toB.w
#define VC vCtoF.x
#define VD vCtoF.y
#define VE vCtoF.z
#define VF vCtoF.w
// BLAKE-256 nonce-search kernel: each work-item hashes one nonce candidate
// (taken from get_global_id(0)) and records it in `output` when the final
// chaining word h7' comes out zero.
// NOTE(review): only 8 of the canonical 14 BLAKE-256 rounds appear here, so
// this presumably targets a blake256r8-style coin — confirm against the host
// code that enqueues this kernel.
//
// All message-word permutations and round constants of BLAKE's G function
// (plus the padding/counter words of the final block) have been pre-folded
// into the literal constants below by the original authors; that is why many
// literals differ slightly from the canonical 0x243F6A88... c[] table.
// Do NOT "normalize" these literals without re-deriving them from the spec.
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(
// found-nonce buffer; output[0xFF] is the running count of results
volatile __global uint * restrict output,
// precalc hash (midstate) from first part of message
const uint h0,
const uint h1,
const uint h2,
const uint h3,
const uint h4,
const uint h5,
const uint h6,
const uint h7,
// last 12 bytes of original message
const uint in16,
const uint in17,
const uint in18
)
{
// Message words m16..m19 of the final block; M3 carries the nonce under test.
uint M0, M1, M2, M3;
// Input chaining word h7, saved for the feed-forward/target test at the end.
uint pre7 = h7;
// BLAKE state v[0..15] packed as four uint4 vectors (aliased by V0..VF).
uint4 v0to3, v4to7, v8toB, vCtoF;
uint nonce = get_global_id(0);
M0 = in16;
M1 = in17;
M2 = in18;
M3 = nonce;
// Each round below is four vectorized G steps: the first two statements per
// half-round act column-wise on the state, the shuffled (V1,V2,V3,V0)-style
// operands implement the diagonal half of the round.
// Round 1
v0to3 = (uint4)(h0, h1, h2, h3) + (uint4)(h4, h5, h6, h7) + (uint4)(M0 ^ 0x85A308D3UL, M2 ^ 0x03707344UL, 0xA99F31D0UL, 0xEC4E6C89UL);
vCtoF = SPH_ROTR32((uint4)(0xA4093AA2UL ^ V0, 0x299F3350UL ^ V1, 0x082EFA98UL ^ V2, 0xEC4E6C89UL ^ V3), 16);
v8toB = (uint4)(0x243F6A88UL, 0x85A308D3UL, 0x13198A2EUL, 0x03707344UL) + vCtoF;
v4to7 = SPH_ROTR32((uint4)(h4, h5, h6, h7) ^ v8toB, 12);
v0to3 = v0to3 + v4to7 + (uint4)(M1 ^ 0x243F6A88UL, M3 ^ 0x13198A2EUL, 0xA4093822UL, 0x082EFA98UL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x38D01377UL, 0x34E90C6CUL, 0xC97C50DDUL, 0xB5470917UL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 12);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x452821E6UL, 0xBE5466CFUL, 0xC0AC29B6UL, 0x3F84D735UL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
// Round 2
v0to3 = v0to3 + v4to7 + (uint4)(0xBE5466CFUL, 0xC52821E6UL, 0xB5470917UL, 0x82EFA99UL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
v0to3 = v0to3 + v4to7 + (uint4)(0x3F84D5B5UL, 0xA4093822UL, 0x38D011F7UL, 0xC97C50DDUL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(M1 ^ 0xC0AC29B7UL, M0 ^ 0x13198A2EUL, 0xEC4E6C89UL, 0x03707344UL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 12);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x85A308D3UL, M2 ^ 0x243F6A88UL, 0x34E90C6CUL, M3 ^ 0x299F31D0UL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
// Round 3
v0to3 = v0to3 + v4to7 + (uint4)(0x452821E6UL, 0x243F6A88UL, 0x13198A2EUL, 0xC97C525DUL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
v0to3 = v0to3 + v4to7 + (uint4)(0x34E90C6CUL, M0 ^ 0xC0AC29B7UL, M2 ^ 0x299F31D0UL, 0xB5470916UL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x3F84D5B5UL, M3 ^ 0x082EFA98UL, 0x85A308D3UL, 0xA4093822UL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 12);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0xBE5466CFUL, 0x03707344UL, M1 ^ 0xEC4E6C89UL, 0xB8D01377UL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
// Round 4
v0to3 = v0to3 + v4to7 + (uint4)(0x38D01377UL, M3 ^ 0x85A308D3UL, 0xC0AC29B6UL, 0x3F84D5B5UL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
v0to3 = v0to3 + v4to7 + (uint4)(0xEC4E6C89UL, M1 ^ 0x03707344UL, 0xC97C50DDUL, 0x34E90C6CUL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(M2 ^ 0x082EFA98UL, 0xBE5466CFUL, 0xA43F6A88UL, 0x45282366UL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB,V8), 12);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x13198A2EUL, 0x299F31D0UL, M0 ^ 0xA4093822UL, 0xB5470917UL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
// Round 5
v0to3 = v0to3 + v4to7 + (uint4)(0x243F6A88UL, 0xEC4E6C89UL, M2 ^ 0xA4093822UL, 0xB5470917UL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
v0to3 = v0to3 + v4to7 + (uint4)(M0 ^ 0x38D01377UL, 0x299F31D0UL, 0x93198A2EUL, 0xBE54644FUL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x85A308D3UL, 0xC0AC29B7UL, 0x452821E6UL, M3 ^ 0xC97C50DDUL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB,V8), 12);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(M1 ^ 0x3F84D5B5UL, 0x34E90C6CUL, 0x082EFA98UL, 0x3707345UL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
// Round 6
v0to3 = v0to3 + v4to7 + (uint4)(M2 ^ 0xC0AC29B7UL, 0xBE5466CFUL, M0 ^ 0x34E90C6CUL, 0x03707344UL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
v0to3 = v0to3 + v4to7 + (uint4)(0x13198A2EUL, 0x082EFA98UL, 0x243F6A88UL, M3 ^ 0x452821E6UL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x497C50DDUL, 0x299F31D0UL, 0x3F84D735UL, M1 ^ 0x38D01377UL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB,V8), 12);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0xA4093823UL, 0xEC4E6C89UL, 0xB5470917UL, 0x85A308D3UL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
// Round 7
v0to3 = v0to3 + v4to7 + (uint4)(0x299F31D0UL, M1 ^ 0xB5470917UL, 0xC97C50DDUL, 0x3E5466CFUL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
v0to3 = v0to3 + v4to7 + (uint4)(0xC0AC29B7UL, 0x85A30A53UL, 0x3F84D5B4UL, 0xA4093822UL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(M0 ^ 0xEC4E6C89UL, 0x03707344UL, 0x13198A2EUL, 0x34E90C6CUL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB,V8), 12);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x243F6A88UL, M3 ^ 0x082EFA98UL, M2 ^ 0x38D01377UL, 0x452821E6UL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
// Round 8
v0to3 = v0to3 + v4to7 + (uint4)(0x34E90C6DUL, 0x3F84D5B5UL, 0x85A308D3UL, M3 ^ 0x38D01377UL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
v0to3 = v0to3 + v4to7 + (uint4)(0xC97C50DDUL, 0xEC4E6C89UL, M1 ^ 0xC0AC29B7UL, 0x03707344UL);
vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
v8toB = v8toB + vCtoF;
v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x243F6A88UL, 0xA4093AA2UL, 0x082EFA98UL, M2 ^ 0xBE5466CFUL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB,V8), 12);
v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(M0 ^ 0x299F31D0UL, 0x35470917UL, 0x452821E6UL, 0x13198A2EUL);
vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
v8toB = v8toB + (uint4)(VD, VE, VF, VC);
v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
// BLAKE feed-forward (with zero salt): the finalized chaining word is
// h7' = h7 ^ v7 ^ v15 = pre7 ^ V7 ^ VF. A nonzero value means the hash
// misses the implicit 32-bit-zero target, so the nonce is discarded;
// only the other seven output words are never needed and are skipped.
if(pre7 ^ V7 ^ VF)
return;
// NOTE(review): non-atomic read-modify-write of the shared hit counter at
// output[0xFF]; two work-items finding nonces simultaneously could race.
// Presumably tolerated because hits are rare (common in cgminer-style
// kernels) — confirm against the host code before relying on every hit.
output[output[0xFF]++] = nonce;
}
@cqtenq
Copy link

cqtenq commented Aug 4, 2014

How is the development of neoscrypt going?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment