Skip to content

Instantly share code, notes, and snippets.

@danny8376
Last active September 26, 2023 08:34
Show Gist options
  • Save danny8376/a2e3ea9e9ef42eeca4cb364ac82f975e to your computer and use it in GitHub Desktop.
Save danny8376/a2e3ea9e9ef42eeca4cb364ac82f975e to your computer and use it in GitHub Desktop.
sha256_12 for mii lfcs hash
/* sha256_16/12
* again specialized to only take 16/12 bytes input and spit out the first 16/last 8 bytes
* again code dug out from mbed TLS
* https://github.com/ARMmbed/mbedtls/blob/development/library/sha256.c
*/
// adapted from: https://github.com/Jimmy-Z/bfCL/blob/master/cl/sha256_16.cl
#include <stdint.h>
/* SHA-256 round constants; entry i is added in compression round i. */
const uint32_t K[] =
{
    /* rounds  0..15 */
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
    /* rounds 16..31 */
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
    /* rounds 32..47 */
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
    /* rounds 48..63 */
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};
/*
 * SHA-256 word primitives.  Operands are expected to be 32-bit unsigned
 * words (uint32_t); ROTR does not mask the left-shifted half itself.
 *
 * Every macro parameter is parenthesised so that arbitrary expressions
 * (e.g. ROTR(a | b, n + 1)) expand with the intended precedence.
 */
#define SHR(x,n) (((x) & 0xFFFFFFFF) >> (n))
#define ROTR(x,n) (SHR(x,n) | ((x) << (32 - (n))))

/* Message-schedule functions (used by R). */
#define S0(x) (ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3))
#define S1(x) (ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10))

/* Compression-round functions (used by P). */
#define S2(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))
#define S3(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))

/* F0: bitwise majority of x, y, z.  F1: bitwise choice (x ? y : z). */
#define F0(x,y,z) (((x) & (y)) | ((z) & ((x) | (y))))
#define F1(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))

/*
 * R(t): compute message-schedule word W[t] from earlier words, store it
 * and yield its value.  Requires an array named W in the calling scope.
 */
#define R(t) \
( \
    W[t] = S1(W[(t) - 2]) + W[(t) - 7] + \
           S0(W[(t) - 15]) + W[(t) - 16] \
)

/*
 * P(...): one compression round.  Updates d and h in place; the caller
 * rotates the argument order between rounds instead of moving registers.
 * Requires uint32_t temp1, temp2 in the calling scope.  Wrapped in
 * do { } while (0) so that "P(...);" is a single statement everywhere,
 * including an unbraced if/else.
 */
#define P(a,b,c,d,e,f,g,h,x,k) \
do { \
    temp1 = (h) + S3(e) + F1(e,f,g) + (k) + (x); \
    temp2 = S2(a) + F0(a,b,c); \
    (d) += temp1; (h) = temp1 + temp2; \
} while (0)
/*
 * SHA-256 specialised for a single 12-byte message; only the last two
 * state words (the final 8 digest bytes) are produced.
 *
 * src  - three 32-bit message words, copied verbatim into W[0..2]
 * hash - receives two 32-bit words: final state word 6, then word 7
 */
void sha256_12(uint32_t *src, uint32_t *hash) // uint32_t[3] src, uint32_t[2] hash
{
    /* W, temp1 and temp2 are referenced by name from the R() and P() macros. */
    uint32_t W[64];
    uint32_t temp1, temp2;
    /* Working state, seeded with the SHA-256 initial hash values. */
    uint32_t st[8] = {
        0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
        0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
    };
    unsigned int r;

    /* Single padded block: 3 data words, the 0x80 marker, zeros, bit length. */
    for (r = 0; r < 3; r++)
        W[r] = src[r];
    W[3] = 0x80000000u;
    for (r = 4; r < 15; r++)
        W[r] = 0;
    W[15] = 0x60u; /* 12 bytes = 96 bits */

    /* Rounds 0..15 consume the block words directly. */
    for (r = 0; r < 16; r += 8)
    {
        P(st[0], st[1], st[2], st[3], st[4], st[5], st[6], st[7], W[r + 0], K[r + 0]);
        P(st[7], st[0], st[1], st[2], st[3], st[4], st[5], st[6], W[r + 1], K[r + 1]);
        P(st[6], st[7], st[0], st[1], st[2], st[3], st[4], st[5], W[r + 2], K[r + 2]);
        P(st[5], st[6], st[7], st[0], st[1], st[2], st[3], st[4], W[r + 3], K[r + 3]);
        P(st[4], st[5], st[6], st[7], st[0], st[1], st[2], st[3], W[r + 4], K[r + 4]);
        P(st[3], st[4], st[5], st[6], st[7], st[0], st[1], st[2], W[r + 5], K[r + 5]);
        P(st[2], st[3], st[4], st[5], st[6], st[7], st[0], st[1], W[r + 6], K[r + 6]);
        P(st[1], st[2], st[3], st[4], st[5], st[6], st[7], st[0], W[r + 7], K[r + 7]);
    }

    /* Rounds 16..63 extend the message schedule on the fly through R(). */
    for (r = 16; r < 64; r += 8)
    {
        P(st[0], st[1], st[2], st[3], st[4], st[5], st[6], st[7], R(r + 0), K[r + 0]);
        P(st[7], st[0], st[1], st[2], st[3], st[4], st[5], st[6], R(r + 1), K[r + 1]);
        P(st[6], st[7], st[0], st[1], st[2], st[3], st[4], st[5], R(r + 2), K[r + 2]);
        P(st[5], st[6], st[7], st[0], st[1], st[2], st[3], st[4], R(r + 3), K[r + 3]);
        P(st[4], st[5], st[6], st[7], st[0], st[1], st[2], st[3], R(r + 4), K[r + 4]);
        P(st[3], st[4], st[5], st[6], st[7], st[0], st[1], st[2], R(r + 5), K[r + 5]);
        P(st[2], st[3], st[4], st[5], st[6], st[7], st[0], st[1], R(r + 6), K[r + 6]);
        P(st[1], st[2], st[3], st[4], st[5], st[6], st[7], st[0], R(r + 7), K[r + 7]);
    }

    /* Add the initial values of state words 6 and 7 to finish those words. */
    hash[0] = st[6] + 0x1F83D9AB;
    hash[1] = st[7] + 0x5BE0CD19;
}
from functools import reduce
# ref: https://github.com/Jimmy-Z/bfCL/blob/master/cl/sha256_16.cl
# SHA-256 initial hash values; unpacked as the starting a..h working state.
A = [
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
]
# SHA-256 round constants; K[i] is mixed into compression round i.
K = [
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
]
def mod32add(*vals):
    """Add all arguments together and wrap the sum to 32 bits."""
    total = reduce(lambda acc, term: acc + term, vals)
    return total & 0xFFFFFFFF
def shr(v, n):
    """Shift the low 32 bits of ``v`` right by ``n`` bits (logical shift)."""
    low32 = v & 0xFFFFFFFF
    return low32 >> n
def ror(v, n):
    """Rotate the low 32 bits of ``v`` right by ``n`` bit positions."""
    mask = 0xFFFFFFFF
    # Bits that stay in the word after moving right ...
    kept = (v & mask) >> n
    # ... and the bits that fall off the bottom, re-entering at the top.
    wrapped = (v << (32 - n)) & mask
    return kept | wrapped
def f0(x, y, z):
    """Bitwise majority: each result bit is set when at least two inputs have it set."""
    return (x & y) | (x & z) | (y & z)
def f1(x, y, z):
    """Bitwise choice: take each bit from ``y`` where ``x`` is set, else from ``z``."""
    from_y = x & y
    from_z = ~x & z
    return from_y ^ from_z
def s0(x):
    """Message-schedule mix: ror 7 XOR ror 18 XOR shr 3 of ``x``."""
    rot7 = ror(x, 7)
    rot18 = ror(x, 18)
    return rot7 ^ rot18 ^ shr(x, 3)
def s1(x):
    """Message-schedule mix: ror 17 XOR ror 19 XOR shr 10 of ``x``."""
    rot17 = ror(x, 17)
    rot19 = ror(x, 19)
    return rot17 ^ rot19 ^ shr(x, 10)
def s2(x):
    """Round mix applied to working variable ``a``: ror 2 XOR ror 13 XOR ror 22."""
    rot2 = ror(x, 2)
    rot13 = ror(x, 13)
    return rot2 ^ rot13 ^ ror(x, 22)
def s3(x):
    """Round mix applied to working variable ``e``: ror 6 XOR ror 11 XOR ror 25."""
    rot6 = ror(x, 6)
    rot11 = ror(x, 11)
    return rot6 ^ rot11 ^ ror(x, 25)
def sha256_12(input):
    """Hash the first 12 bytes of ``input`` with SHA-256, keeping the tail.

    Only the last two state words (the final 8 digest bytes) are computed
    and returned, as a 16-character lowercase hex string.

    Bytes beyond the first 12 are ignored; slices that come up short are
    read as shorter (possibly zero) big-endian integers.
    """
    # One padded block: three big-endian data words, the 0x80 marker word,
    # eleven zero words, then the message length in bits (12 * 8 = 0x60).
    w = [int.from_bytes(input[off:off + 4], "big") for off in (0, 4, 8)]
    w.extend([0x80000000] + [0] * 11 + [0x60])

    a, b, c, d, e, f, g, h = A
    for i in range(64):
        # From round 16 on, the schedule word is derived from earlier ones.
        if i >= 16:
            w.append(mod32add(s1(w[i - 2]), w[i - 7], s0(w[i - 15]), w[i - 16]))
        t1 = mod32add(h, s3(e), f1(e, f, g), K[i], w[i])
        t2 = mod32add(s2(a), f0(a, b, c))
        a, b, c, d, e, f, g, h = (
            mod32add(t1, t2), a, b, c, mod32add(d, t1), e, f, g
        )

    # Fold in the initial values of state words 6 and 7 only.
    tail_hi = mod32add(0x1F83D9AB, g)
    tail_lo = mod32add(0x5BE0CD19, h)
    return "{0:08x}{1:08x}".format(tail_hi, tail_lo)
/* sha256-x86.c - Intel SHA extensions using C intrinsics */
/* Written and placed in public domain by Jeffrey Walton */
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */
// adopted from: https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-x86.c
// ref: https://github.com/Jimmy-Z/bfCL/blob/master/cl/sha256_16.cl
// gcc -DTEST_MAIN -msse4.1 -msha sha256-x86.c -o sha256.exe
// Include the GCC super header
#if defined(__GNUC__)
# include <stdint.h>
# include <x86intrin.h>
#endif
// Microsoft supports Intel SHA ACLE extensions as of Visual Studio 2015
#if defined(_MSC_VER)
# include <immintrin.h>
# define WIN32_LEAN_AND_MEAN
# include <Windows.h>
typedef UINT32 uint32_t;
typedef UINT8 uint8_t;
#endif
#include <string.h>
// Message words 4..11 of the padded block: all zero. Loaded for rounds 4-7 and 8-11.
static const uint8_t DATA12[16] __attribute__ ((aligned (16))) = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
// Message words 12..15: zeros followed by 0x60 in the low byte of the last dword
// (byte 12 of this little-endian vector), i.e. the 96-bit message length.
static const uint8_t DATA3[16] __attribute__ ((aligned (16))) = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00
};
// SHA-256 initial state, stored pre-shuffled into the register layout used by
// sha256rnds2. Read as little-endian dwords (lowest first):
//   INIT[0] = 0x9B05688C, 0x510E527F, 0xBB67AE85, 0x6A09E667
//   INIT[1] = 0x5BE0CD19, 0x1F83D9AB, 0xA54FF53A, 0x3C6EF372
static const uint8_t INIT[][16] __attribute__ ((aligned (16))) = {
{ 0x8C, 0x68, 0x05, 0x9B, 0x7F, 0x52, 0x0E, 0x51, 0x85, 0xAE, 0x67, 0xBB, 0x67, 0xE6, 0x09, 0x6A },
{ 0x19, 0xCD, 0xE0, 0x5B, 0xAB, 0xD9, 0x83, 0x1F, 0x3A, 0xF5, 0x4F, 0xA5, 0x72, 0xF3, 0x6E, 0x3C }
};
// SHA-256 round constants as little-endian bytes, four constants per 16-byte row.
// Row r is added to the message words for rounds 4*r .. 4*r+3.
static const uint8_t SHAC[][16] __attribute__ ((aligned (16))) = {
{ 0x98, 0x2F, 0x8A, 0x42, 0x91, 0x44, 0x37, 0x71, 0xCF, 0xFB, 0xC0, 0xB5, 0xA5, 0xDB, 0xB5, 0xE9 },
{ 0x5B, 0xC2, 0x56, 0x39, 0xF1, 0x11, 0xF1, 0x59, 0xA4, 0x82, 0x3F, 0x92, 0xD5, 0x5E, 0x1C, 0xAB },
{ 0x98, 0xAA, 0x07, 0xD8, 0x01, 0x5B, 0x83, 0x12, 0xBE, 0x85, 0x31, 0x24, 0xC3, 0x7D, 0x0C, 0x55 },
{ 0x74, 0x5D, 0xBE, 0x72, 0xFE, 0xB1, 0xDE, 0x80, 0xA7, 0x06, 0xDC, 0x9B, 0x74, 0xF1, 0x9B, 0xC1 },
{ 0xC1, 0x69, 0x9B, 0xE4, 0x86, 0x47, 0xBE, 0xEF, 0xC6, 0x9D, 0xC1, 0x0F, 0xCC, 0xA1, 0x0C, 0x24 },
{ 0x6F, 0x2C, 0xE9, 0x2D, 0xAA, 0x84, 0x74, 0x4A, 0xDC, 0xA9, 0xB0, 0x5C, 0xDA, 0x88, 0xF9, 0x76 },
{ 0x52, 0x51, 0x3E, 0x98, 0x6D, 0xC6, 0x31, 0xA8, 0xC8, 0x27, 0x03, 0xB0, 0xC7, 0x7F, 0x59, 0xBF },
{ 0xF3, 0x0B, 0xE0, 0xC6, 0x47, 0x91, 0xA7, 0xD5, 0x51, 0x63, 0xCA, 0x06, 0x67, 0x29, 0x29, 0x14 },
{ 0x85, 0x0A, 0xB7, 0x27, 0x38, 0x21, 0x1B, 0x2E, 0xFC, 0x6D, 0x2C, 0x4D, 0x13, 0x0D, 0x38, 0x53 },
{ 0x54, 0x73, 0x0A, 0x65, 0xBB, 0x0A, 0x6A, 0x76, 0x2E, 0xC9, 0xC2, 0x81, 0x85, 0x2C, 0x72, 0x92 },
{ 0xA1, 0xE8, 0xBF, 0xA2, 0x4B, 0x66, 0x1A, 0xA8, 0x70, 0x8B, 0x4B, 0xC2, 0xA3, 0x51, 0x6C, 0xC7 },
{ 0x19, 0xE8, 0x92, 0xD1, 0x24, 0x06, 0x99, 0xD6, 0x85, 0x35, 0x0E, 0xF4, 0x70, 0xA0, 0x6A, 0x10 },
{ 0x16, 0xC1, 0xA4, 0x19, 0x08, 0x6C, 0x37, 0x1E, 0x4C, 0x77, 0x48, 0x27, 0xB5, 0xBC, 0xB0, 0x34 },
{ 0xB3, 0x0C, 0x1C, 0x39, 0x4A, 0xAA, 0xD8, 0x4E, 0x4F, 0xCA, 0x9C, 0x5B, 0xF3, 0x6F, 0x2E, 0x68 },
{ 0xEE, 0x82, 0x8F, 0x74, 0x6F, 0x63, 0xA5, 0x78, 0x14, 0x78, 0xC8, 0x84, 0x08, 0x02, 0xC7, 0x8C },
{ 0xFA, 0xFF, 0xBE, 0x90, 0xEB, 0x6C, 0x50, 0xA4, 0xF7, 0xA3, 0xF9, 0xBE, 0xF2, 0x78, 0x71, 0xC6 }
};
// Value added to STATE1 after the last round. The bytes are the same as INIT[1],
// i.e. the initial values of the state half that STATE1 carries.
// NOTE(review): declared as a one-row 2-D array but initialised with a flat list
// (brace elision); it is only ever read through a cast of &SWAPC.
static const uint8_t SWAPC[][16] __attribute__ ((aligned (16))) = {
0x19, 0xCD, 0xE0, 0x5B, 0xAB, 0xD9, 0x83, 0x1F, 0x3A, 0xF5, 0x4F, 0xA5, 0x72, 0xf3, 0x6E, 0x3C
};
/*
 * SHA-256 of a 12-byte message using the x86 SHA extensions, producing only
 * the last 8 bytes of the digest.
 *
 * seed - 12 input bytes. They are copied into the first three dwords of the
 *        message block and loaded without a byte swap, so each 4-byte group
 *        is interpreted as a little-endian dword holding one message word.
 * hash - receives 8 bytes: state word G then state word H, each written
 *        most-significant byte first.
 *
 * Requires a CPU with SHA and SSE4.1 support (see the build line at the top
 * of the file). The register variables pin the working set to fixed xmm
 * registers, which needs a GCC-compatible x86-64 compiler.
 */
void sha256_12_shaext(const uint8_t *seed, uint8_t *hash) {
    // First 16 bytes of the padded block: 12 seed bytes, then dword 3 = 0x80000000
    // (the padding marker). Must be 16-byte aligned: it is read below through an
    // __m128i dereference, which the compiler may turn into an aligned load.
    uint8_t DATA0[16] __attribute__ ((aligned (16))) = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
    };
    memcpy(DATA0, seed, 12);

    register __m128i STATE0 asm ("xmm3"), STATE1 asm ("xmm4");
    register __m128i MSG asm ("xmm5"), TMP asm ("xmm6");
    register __m128i MSG0 asm ("xmm7"), MSG1 asm ("xmm8"), MSG2 asm ("xmm9"), MSG3 asm ("xmm10");

    // Load initial values, pre shuffled
    STATE0 = *((__m128i*)INIT[0]);
    STATE1 = *((__m128i*)INIT[1]);

    // Rounds 0-3
    MSG0 = *((__m128i*)&DATA0);
    MSG = _mm_add_epi32(MSG0, *((__m128i*)SHAC[0]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Rounds 4-7
    MSG1 = *((__m128i*)&DATA12);
    MSG = _mm_add_epi32(MSG1, *((__m128i*)SHAC[1]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    // Rounds 8-11
    MSG2 = *((__m128i*)&DATA12);
    MSG = _mm_add_epi32(MSG2, *((__m128i*)SHAC[2]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    // Rounds 12-15
    MSG3 = *((__m128i*)&DATA3);
    MSG = _mm_add_epi32(MSG3, *((__m128i*)SHAC[3]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    // Rounds 16-19
    MSG = _mm_add_epi32(MSG0, *((__m128i*)SHAC[4]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    // Rounds 20-23
    MSG = _mm_add_epi32(MSG1, *((__m128i*)SHAC[5]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    // Rounds 24-27
    MSG = _mm_add_epi32(MSG2, *((__m128i*)SHAC[6]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    // Rounds 28-31
    MSG = _mm_add_epi32(MSG3, *((__m128i*)SHAC[7]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    // Rounds 32-35
    MSG = _mm_add_epi32(MSG0, *((__m128i*)SHAC[8]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    // Rounds 36-39
    MSG = _mm_add_epi32(MSG1, *((__m128i*)SHAC[9]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);

    // Rounds 40-43
    MSG = _mm_add_epi32(MSG2, *((__m128i*)SHAC[10]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);

    // Rounds 44-47
    MSG = _mm_add_epi32(MSG3, *((__m128i*)SHAC[11]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
    MSG0 = _mm_add_epi32(MSG0, TMP);
    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);

    // Rounds 48-51
    MSG = _mm_add_epi32(MSG0, *((__m128i*)SHAC[12]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
    MSG1 = _mm_add_epi32(MSG1, TMP);
    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);

    // Rounds 52-55
    MSG = _mm_add_epi32(MSG1, *((__m128i*)SHAC[13]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
    MSG2 = _mm_add_epi32(MSG2, TMP);
    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Rounds 56-59
    MSG = _mm_add_epi32(MSG2, *((__m128i*)SHAC[14]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
    MSG3 = _mm_add_epi32(MSG3, TMP);
    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Rounds 60-63
    MSG = _mm_add_epi32(MSG3, *((__m128i*)SHAC[15]));
    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
    MSG = _mm_shuffle_epi32(MSG, 0x0E);
    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);

    // Combine state. Only STATE1 (C, D, G, H) gets its initial values added back,
    // because only G and H are output; STATE0 is merely reshuffled.
    STATE1 = _mm_add_epi32(STATE1, *((__m128i*)&SWAPC));
    TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // HGFE

    // Output: the upper 64 bits of STATE1 hold G in the low dword and H in the
    // high dword. Byte-swap each so it is stored most-significant byte first,
    // and copy with memcpy so no alignment or aliasing assumption is made on hash.
    const uint64_t gh = (uint64_t)_mm_extract_epi64(STATE1, 1);
    const uint32_t g_be = (uint32_t)__builtin_bswap32((uint32_t)gh);
    const uint32_t h_be = (uint32_t)__builtin_bswap32((uint32_t)(gh >> 32));
    memcpy(hash, &g_be, sizeof g_be);
    memcpy(hash + 4, &h_be, sizeof h_be);
}
.intel_syntax noprefix
# adopted from: https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-x86.c
# ref: https://github.com/Jimmy-Z/bfCL/blob/master/cl/sha256_16.cl
# The _func macro exports a symbol and opens its label in one step.
# On Apple targets the exported symbol gets a leading underscore;
# elsewhere the name is used unchanged.
#ifdef __APPLE__
#define _func(name) \
.global _##name ; \
_##name:
#else
#define _func(name) \
.global name ; \
name:
#endif
# Calling-convention aliases for the integer argument registers.
# rpN, epN and pN are the 64-, 32- and 16-bit views of argument N (N = 1..4).
# The _WIN32 branch uses rcx, rdx, r8, r9; the other branch uses rdi, rsi, rdx, rcx.
# NOTE(review): p5ptr is not referenced anywhere in this file; its operands
# presumably locate a fifth argument for some other routine - confirm before use.
#ifdef _WIN32
#define rp1 rcx
#define rp2 rdx
#define rp3 r8
#define rp4 r9
#define ep1 ecx
#define ep2 edx
#define ep3 r8d
#define ep4 r9d
#define p1 cx
#define p2 dx
#define p3 r8w
#define p4 r9w
#define p5ptr [rbp+0x48]
#else
#define rp1 rdi
#define rp2 rsi
#define rp3 rdx
#define rp4 rcx
#define ep1 edi
#define ep2 esi
#define ep3 edx
#define ep4 ecx
#define p1 di
#define p2 si
#define p3 dx
#define p4 cx
#define p5ptr [r8]
#endif
# RIP-relative memory operands for a named data label, by access width:
# 128-bit, 64-bit and 32-bit respectively.
#define _xmmd(name) xmmword ptr [rip+name]
#define _qdat(name) qword ptr [rip+name]
#define _ddat(name) dword ptr [rip+name]
# Spill one xmm register to 16-byte stack slot number offset (relative to rsp).
# Uses movdqa, so rsp must be 16-byte aligned at that point.
#define _xmm2sp(offset, regn) \
movdqa [rsp+16*offset], xmm##regn ;
# Reload one xmm register from stack slot number offset.
# Note the argument order is reversed compared with the spill macro above.
#define _sp2xmm(regn, offset) \
movdqa xmm##regn, [rsp+16*offset] ;
# Four SHA-256 rounds with no message-schedule work.
#   s0, s1 : the two state registers
#   tmpr   : accepted but not used by the macro body
#   regt   : scratch register receiving msg + k; callers pass xmm0, which is
#            the implicit source operand of sha256rnds2
#   msg    : register holding the four message words of these rounds
#   k      : label of the 16-byte round-constant row
#define _shasr1(s0, s1, tmpr, regt, msg, k) \
movdqa regt, msg ; \
paddd regt, _xmmd(k) ; \
sha256rnds2 s1, s0, regt ; \
pshufd regt, regt, 0xE ; \
sha256rnds2 s0, s1, regt ;
# Same four rounds, followed by the first schedule step (sha256msg1) that
# starts updating the previous message register pmsg using msg.
#define _shasr2(s0, s1, tmpr, regt, msg, pmsg, k) \
movdqa regt, msg ; \
paddd regt, _xmmd(k) ; \
sha256rnds2 s1, s0, regt ; \
pshufd regt, regt, 0xE ; \
sha256rnds2 s0, s1, regt ; \
sha256msg1 pmsg, msg ;
# Four rounds plus the full schedule update:
#   regt2 : second scratch register, receives the 4-byte-shifted pair msg:pmsg
#   nmsg  : register being completed into the next four message words
#           (nmsg += regt2, then sha256msg2 with msg)
#   pmsg  : previous message register, advanced by sha256msg1 at the end
#define _shasr3(s0, s1, tmpr, regt, regt2, msg, pmsg, nmsg, k) \
movdqa regt, msg ; \
paddd regt, _xmmd(k) ; \
sha256rnds2 s1, s0, regt ; \
movdqa regt2, msg ; \
palignr regt2, pmsg, 4 ; \
paddd nmsg, regt2 ; \
sha256msg2 nmsg, msg ; \
pshufd regt, regt, 0xE ; \
sha256rnds2 s0, s1, regt ; \
sha256msg1 pmsg, msg ;
.data
# Alignment is requested with .p2align (power-of-two exponent) because the
# meaning of a plain .align operand differs between object formats: it is a
# byte count on x86 ELF but an exponent on Mach-O, which this file targets too.
.p2align 3
# High 8 bytes of the first message vector: dword 2 is zero and dword 3 is
# 0x80000000, the padding marker that follows the data.
CD: .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80
.p2align 4
# Message words 4..11 (all zero), loaded for rounds 4-7 and 8-11.
D12: .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
# Message words 12..15: zeros, then 0x60 in the last dword (message length in bits).
D3: .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00
# Initial state, pre-shuffled into the two-register layout used by sha256rnds2.
I0: .byte 0x8C, 0x68, 0x05, 0x9B, 0x7F, 0x52, 0x0E, 0x51, 0x85, 0xAE, 0x67, 0xBB, 0x67, 0xE6, 0x09, 0x6A
I1: .byte 0x19, 0xCD, 0xE0, 0x5B, 0xAB, 0xD9, 0x83, 0x1F, 0x3A, 0xF5, 0x4F, 0xA5, 0x72, 0xF3, 0x6E, 0x3C
# Round constants as little-endian bytes, four per row; row Cn serves rounds 4n..4n+3.
C0: .byte 0x98, 0x2F, 0x8A, 0x42, 0x91, 0x44, 0x37, 0x71, 0xCF, 0xFB, 0xC0, 0xB5, 0xA5, 0xDB, 0xB5, 0xE9
C1: .byte 0x5B, 0xC2, 0x56, 0x39, 0xF1, 0x11, 0xF1, 0x59, 0xA4, 0x82, 0x3F, 0x92, 0xD5, 0x5E, 0x1C, 0xAB
C2: .byte 0x98, 0xAA, 0x07, 0xD8, 0x01, 0x5B, 0x83, 0x12, 0xBE, 0x85, 0x31, 0x24, 0xC3, 0x7D, 0x0C, 0x55
C3: .byte 0x74, 0x5D, 0xBE, 0x72, 0xFE, 0xB1, 0xDE, 0x80, 0xA7, 0x06, 0xDC, 0x9B, 0x74, 0xF1, 0x9B, 0xC1
C4: .byte 0xC1, 0x69, 0x9B, 0xE4, 0x86, 0x47, 0xBE, 0xEF, 0xC6, 0x9D, 0xC1, 0x0F, 0xCC, 0xA1, 0x0C, 0x24
C5: .byte 0x6F, 0x2C, 0xE9, 0x2D, 0xAA, 0x84, 0x74, 0x4A, 0xDC, 0xA9, 0xB0, 0x5C, 0xDA, 0x88, 0xF9, 0x76
C6: .byte 0x52, 0x51, 0x3E, 0x98, 0x6D, 0xC6, 0x31, 0xA8, 0xC8, 0x27, 0x03, 0xB0, 0xC7, 0x7F, 0x59, 0xBF
C7: .byte 0xF3, 0x0B, 0xE0, 0xC6, 0x47, 0x91, 0xA7, 0xD5, 0x51, 0x63, 0xCA, 0x06, 0x67, 0x29, 0x29, 0x14
C8: .byte 0x85, 0x0A, 0xB7, 0x27, 0x38, 0x21, 0x1B, 0x2E, 0xFC, 0x6D, 0x2C, 0x4D, 0x13, 0x0D, 0x38, 0x53
C9: .byte 0x54, 0x73, 0x0A, 0x65, 0xBB, 0x0A, 0x6A, 0x76, 0x2E, 0xC9, 0xC2, 0x81, 0x85, 0x2C, 0x72, 0x92
C10: .byte 0xA1, 0xE8, 0xBF, 0xA2, 0x4B, 0x66, 0x1A, 0xA8, 0x70, 0x8B, 0x4B, 0xC2, 0xA3, 0x51, 0x6C, 0xC7
C11: .byte 0x19, 0xE8, 0x92, 0xD1, 0x24, 0x06, 0x99, 0xD6, 0x85, 0x35, 0x0E, 0xF4, 0x70, 0xA0, 0x6A, 0x10
C12: .byte 0x16, 0xC1, 0xA4, 0x19, 0x08, 0x6C, 0x37, 0x1E, 0x4C, 0x77, 0x48, 0x27, 0xB5, 0xBC, 0xB0, 0x34
C13: .byte 0xB3, 0x0C, 0x1C, 0x39, 0x4A, 0xAA, 0xD8, 0x4E, 0x4F, 0xCA, 0x9C, 0x5B, 0xF3, 0x6F, 0x2E, 0x68
C14: .byte 0xEE, 0x82, 0x8F, 0x74, 0x6F, 0x63, 0xA5, 0x78, 0x14, 0x78, 0xC8, 0x84, 0x08, 0x02, 0xC7, 0x8C
C15: .byte 0xFA, 0xFF, 0xBE, 0x90, 0xEB, 0x6C, 0x50, 0xA4, 0xF7, 0xA3, 0xF9, 0xBE, 0xF2, 0x78, 0x71, 0xC6
# Added to STATE1 after the last round; same bytes as I1.
S: .byte 0x19, 0xCD, 0xE0, 0x5B, 0xAB, 0xD9, 0x83, 0x1F, 0x3A, 0xF5, 0x4F, 0xA5, 0x72, 0xF3, 0x6E, 0x3C
.text
# Register assignment for the routine below. The volatile / non-volatile
# labels are the grouping chosen by this file: the routine spills and
# restores xmm10-xmm15, and uses xmm0, xmm1 and xmm3 without saving them.
# ---- volatile ----
#define DAT0 xmm3
# ---- non-volatile ----
#define STATE0 xmm10
#define STATE1 xmm11
#define MSG0 xmm12
#define MSG1 xmm13
#define MSG2 xmm14
#define MSG3 xmm15
# sha256_12: SHA-256 over a 12-byte message whose first 8 bytes come from the
# first integer argument and whose last 4 bytes are zero (see CD).
# Returns in rax the upper 64 bits of the final combined STATE1 register:
# one state dword in the low half and the next in the high half, not byte-swapped.
# Clobbers xmm0, xmm1 and xmm3; xmm10-xmm15 are preserved.
_func(sha256_12) # uint64_t data => uint64_t hash
push rbp
mov rbp, rsp
# store xmm10-15 x6
# The spills use movdqa, which needs rsp to be 16-byte aligned here.
# NOTE(review): that holds when the caller kept the stack 16-byte aligned at
# the call, as the standard x86-64 conventions require - confirm for callers.
lea rsp, [rsp-16*6]
_xmm2sp(0, 10)
_xmm2sp(1, 11)
_xmm2sp(2, 12)
_xmm2sp(3, 13)
_xmm2sp(4, 14)
_xmm2sp(5, 15)
# --------------------------------
# | prepare data
# DAT0
# low qword = argument, high qword = CD (zero dword, then the 0x80000000 marker)
movq DAT0, rp1
pinsrq DAT0, _qdat(CD), 1
# --------------------------------
# | actual sha256_12 hashing
# init state, pre shuffled
movdqa STATE0, _xmmd(I0)
movdqa STATE1, _xmmd(I1)
# rounds 0-3
movdqa MSG0, DAT0
_shasr1 (STATE0, STATE1, rax, xmm0, MSG0, C0)
# rounds 4-7
movdqa MSG1, _xmmd(D12)
_shasr2 (STATE0, STATE1, rax, xmm0, MSG1, MSG0, C1)
# rounds 8-11
movdqa MSG2, _xmmd(D12)
_shasr2 (STATE0, STATE1, rax, xmm0, MSG2, MSG1, C2)
# rounds 12-15
movdqa MSG3, _xmmd(D3)
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG3, MSG2, MSG0, C3)
# rounds 16-19
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG0, MSG3, MSG1, C4)
# rounds 20-23
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG1, MSG0, MSG2, C5)
# rounds 24-27
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG2, MSG1, MSG3, C6)
# rounds 28-31
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG3, MSG2, MSG0, C7)
# rounds 32-35
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG0, MSG3, MSG1, C8)
# rounds 36-39
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG1, MSG0, MSG2, C9)
# rounds 40-43
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG2, MSG1, MSG3, C10)
# rounds 44-47
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG3, MSG2, MSG0, C11)
# rounds 48-51
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG0, MSG3, MSG1, C12)
# rounds 52-55
# (the trailing sha256msg1 of this and the next step updates a register
# that is not read again afterwards)
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG1, MSG0, MSG2, C13)
# rounds 56-59
_shasr3 (STATE0, STATE1, rax, xmm0, xmm1, MSG2, MSG1, MSG3, C14)
# rounds 60-63
_shasr1 (STATE0, STATE1, rax, xmm0, MSG3, C15)
# combine state
# only STATE1 gets the initial values (S) added back; STATE0 is just reshuffled
paddd STATE1, _xmmd(S)
pshufd xmm0, STATE0, 0x1B
pshufd STATE1, STATE1, 0xB1
movdqa STATE0, xmm0
pblendw STATE0, STATE1, 0xF0
palignr STATE1, xmm0, 8
# result hash in high 64bit of STATE1 (swapped double dword)
pextrq rax, STATE1, 1
# restore xmm10-15 from the slots filled in the prologue, then drop the frame
_sp2xmm(10, 0)
_sp2xmm(11, 1)
_sp2xmm(12, 2)
_sp2xmm(13, 3)
_sp2xmm(14, 4)
_sp2xmm(15, 5)
mov rsp, rbp
pop rbp
ret
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment