Last active
August 29, 2015 14:03
-
-
Save Griffitsj/55349ccee17bbf87ffd5 to your computer and use it in GitHub Desktop.
Even more optimised version of blake256.cl from https://github.com/kR105/cgminer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// (c) 2013 originally written by smolen, modified by kr105
// (c) 2014 additional optimisations by griffitsj

// Lane-wise 32-bit rotate-right on a uint4, expressed via OpenCL's
// rotate() builtin (which rotates left): ROTR(v,n) == ROTL(v,32-n).
#define SPH_ROTR32(v,n) rotate((uint4)(v),(uint)(32-(n)))

// Aliases for the 16 words of the BLAKE internal state v[0..15],
// packed into four uint4 vectors so each G step operates on four
// columns (or diagonals) at once.
#define V0 v0to3.x
#define V1 v0to3.y
#define V2 v0to3.z
#define V3 v0to3.w
#define V4 v4to7.x
#define V5 v4to7.y
#define V6 v4to7.z
#define V7 v4to7.w
#define V8 v8toB.x
#define V9 v8toB.y
#define VA v8toB.z
#define VB v8toB.w
#define VC vCtoF.x
#define VD vCtoF.y
#define VE vCtoF.z
#define VF vCtoF.w
// BLAKE-256 mining search kernel: one work-item tests one nonce.
//
// The host has already compressed the first 64 bytes of the 80-byte
// block header; h0..h7 is that midstate.  in16..in18 are message
// words 16..18 (the last 12 bytes of the header) and word 19 is the
// per-work-item nonce.  Eight unrolled rounds follow (this appears to
// be the 8-round Blakecoin variant of BLAKE-256 -- confirm against the
// host code).  The round constants (digits of pi) and the fixed
// padding words are pre-folded into the hexadecimal literals, so only
// terms involving the variable words M0..M3 keep an explicit XOR.
//
// A candidate is reported when word 7 of the finalized state is zero,
// i.e. pre7 ^ V7 ^ VF == 0.
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(
	volatile __global uint * restrict output,
	// precalc hash from first part of message
	const uint h0,
	const uint h1,
	const uint h2,
	const uint h3,
	const uint h4,
	const uint h5,
	const uint h6,
	const uint h7,
	// last 12 bytes of original message
	const uint in16,
	const uint in17,
	const uint in18
)
{
	uint M0, M1, M2, M3;
	uint pre7 = h7;                     // saved for the final h7 ^ v7 ^ v15 test
	uint4 v0to3, v4to7, v8toB, vCtoF;   // the 16-word BLAKE state, 4 words per vector
	uint nonce = get_global_id(0);
	M0 = in16;
	M1 = in17;
	M2 = in18;
	M3 = nonce;                         // message word 19 is the nonce being tested
	// Round 1 -- initialization is folded into the first column step
	v0to3 = (uint4)(h0, h1, h2, h3) + (uint4)(h4, h5, h6, h7) + (uint4)(M0 ^ 0x85A308D3UL, M2 ^ 0x03707344UL, 0xA99F31D0UL, 0xEC4E6C89UL);
	vCtoF = SPH_ROTR32((uint4)(0xA4093AA2UL ^ V0, 0x299F3350UL ^ V1, 0x082EFA98UL ^ V2, 0xEC4E6C89UL ^ V3), 16);
	v8toB = (uint4)(0x243F6A88UL, 0x85A308D3UL, 0x13198A2EUL, 0x03707344UL) + vCtoF;
	v4to7 = SPH_ROTR32((uint4)(h4, h5, h6, h7) ^ v8toB, 12);
	v0to3 = v0to3 + v4to7 + (uint4)(M1 ^ 0x243F6A88UL, M3 ^ 0x13198A2EUL, 0xA4093822UL, 0x082EFA98UL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
	// diagonal step: operands rotated across lanes via the V* swizzles
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x38D01377UL, 0x34E90C6CUL, 0xC97C50DDUL, 0xB5470917UL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 12);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x452821E6UL, 0xBE5466CFUL, 0xC0AC29B6UL, 0x3F84D735UL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
	// Round 2
	v0to3 = v0to3 + v4to7 + (uint4)(0xBE5466CFUL, 0xC52821E6UL, 0xB5470917UL, 0x82EFA99UL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
	v0to3 = v0to3 + v4to7 + (uint4)(0x3F84D5B5UL, 0xA4093822UL, 0x38D011F7UL, 0xC97C50DDUL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(M1 ^ 0xC0AC29B7UL, M0 ^ 0x13198A2EUL, 0xEC4E6C89UL, 0x03707344UL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 12);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x85A308D3UL, M2 ^ 0x243F6A88UL, 0x34E90C6CUL, M3 ^ 0x299F31D0UL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
	// Round 3
	v0to3 = v0to3 + v4to7 + (uint4)(0x452821E6UL, 0x243F6A88UL, 0x13198A2EUL, 0xC97C525DUL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
	v0to3 = v0to3 + v4to7 + (uint4)(0x34E90C6CUL, M0 ^ 0xC0AC29B7UL, M2 ^ 0x299F31D0UL, 0xB5470916UL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x3F84D5B5UL, M3 ^ 0x082EFA98UL, 0x85A308D3UL, 0xA4093822UL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 12);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0xBE5466CFUL, 0x03707344UL, M1 ^ 0xEC4E6C89UL, 0xB8D01377UL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
	// Round 4
	v0to3 = v0to3 + v4to7 + (uint4)(0x38D01377UL, M3 ^ 0x85A308D3UL, 0xC0AC29B6UL, 0x3F84D5B5UL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
	v0to3 = v0to3 + v4to7 + (uint4)(0xEC4E6C89UL, M1 ^ 0x03707344UL, 0xC97C50DDUL, 0x34E90C6CUL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(M2 ^ 0x082EFA98UL, 0xBE5466CFUL, 0xA43F6A88UL, 0x45282366UL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 12);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x13198A2EUL, 0x299F31D0UL, M0 ^ 0xA4093822UL, 0xB5470917UL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
	// Round 5
	v0to3 = v0to3 + v4to7 + (uint4)(0x243F6A88UL, 0xEC4E6C89UL, M2 ^ 0xA4093822UL, 0xB5470917UL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
	v0to3 = v0to3 + v4to7 + (uint4)(M0 ^ 0x38D01377UL, 0x299F31D0UL, 0x93198A2EUL, 0xBE54644FUL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x85A308D3UL, 0xC0AC29B7UL, 0x452821E6UL, M3 ^ 0xC97C50DDUL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 12);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(M1 ^ 0x3F84D5B5UL, 0x34E90C6CUL, 0x082EFA98UL, 0x3707345UL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
	// Round 6
	v0to3 = v0to3 + v4to7 + (uint4)(M2 ^ 0xC0AC29B7UL, 0xBE5466CFUL, M0 ^ 0x34E90C6CUL, 0x03707344UL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
	v0to3 = v0to3 + v4to7 + (uint4)(0x13198A2EUL, 0x082EFA98UL, 0x243F6A88UL, M3 ^ 0x452821E6UL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x497C50DDUL, 0x299F31D0UL, 0x3F84D735UL, M1 ^ 0x38D01377UL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 12);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0xA4093823UL, 0xEC4E6C89UL, 0xB5470917UL, 0x85A308D3UL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
	// Round 7
	v0to3 = v0to3 + v4to7 + (uint4)(0x299F31D0UL, M1 ^ 0xB5470917UL, 0xC97C50DDUL, 0x3E5466CFUL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
	v0to3 = v0to3 + v4to7 + (uint4)(0xC0AC29B7UL, 0x85A30A53UL, 0x3F84D5B4UL, 0xA4093822UL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(M0 ^ 0xEC4E6C89UL, 0x03707344UL, 0x13198A2EUL, 0x34E90C6CUL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 12);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x243F6A88UL, M3 ^ 0x082EFA98UL, M2 ^ 0x38D01377UL, 0x452821E6UL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
	// Round 8
	v0to3 = v0to3 + v4to7 + (uint4)(0x34E90C6DUL, 0x3F84D5B5UL, 0x85A308D3UL, M3 ^ 0x38D01377UL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 16);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 12);
	v0to3 = v0to3 + v4to7 + (uint4)(0xC97C50DDUL, 0xEC4E6C89UL, M1 ^ 0xC0AC29B7UL, 0x03707344UL);
	vCtoF = SPH_ROTR32(vCtoF ^ v0to3, 8);
	v8toB = v8toB + vCtoF;
	v4to7 = SPH_ROTR32(v4to7 ^ v8toB, 7);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(0x243F6A88UL, 0xA4093AA2UL, 0x082EFA98UL, M2 ^ 0xBE5466CFUL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 16);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 12);
	v0to3 = v0to3 + (uint4)(V5, V6, V7, V4) + (uint4)(M0 ^ 0x299F31D0UL, 0x35470917UL, 0x452821E6UL, 0x13198A2EUL);
	vCtoF = SPH_ROTR32(vCtoF ^ (uint4)(V1, V2, V3, V0), 8);
	v8toB = v8toB + (uint4)(VD, VE, VF, VC);
	v4to7 = SPH_ROTR32(v4to7 ^ (uint4)(V9, VA, VB, V8), 7);
	// Finalization short-cut: only hash word 7 matters for the share
	// test, and it is zero iff pre7 ^ V7 ^ VF == 0.
	if(pre7 ^ V7 ^ VF)
		return;
	// output[0xFF] is the result count; append this nonce.
	// NOTE(review): the read-modify-write on output[0xFF] is not atomic,
	// so two work-items finding shares in the same launch can race --
	// confirm the host side tolerates this (or switch to atomic_inc).
	output[output[0xFF]++] = nonce;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How is the development of neoscrypt going?