Created
April 10, 2013 08:27
-
-
Save jandk/5352820 to your computer and use it in GitHub Desktop.
Calculate 4 MD5 hashes at the same time. ~2.5x speedup.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <emmintrin.h>
// Short integer aliases used by this file.
// NOTE(review): the widths in the names are assumed (8/32/64 bits on the
// target ABI); nothing here enforces them.
typedef unsigned char uint8;        // raw byte access into the message buffer
typedef unsigned int uint32;        // message lengths passed to md5()
typedef unsigned long long uint64;  // not referenced by the code visible here
// In/out buffer for one md5() call: four 16-byte slots, h0..h3.
// md5() reads slot i as message i and overwrites it with the result
// for that message.
typedef struct MD5Data
{
    __m128i h0, h1, h2, h3;
} MD5Data;
// Transpose, in place, the 4x4 matrix of 32-bit lanes whose rows are the
// four __m128i lvalues r0..r3: afterwards lane j of row i holds what used
// to be lane i of row j.
// Each argument is read twice and then assigned, so pass plain variables.
// Expands to exactly one statement (safe inside an unbraced if/else);
// the caller supplies the trailing ';'.
#define _MM_TRANSPOSE4_PI(r0, r1, r2, r3) \
    do { \
        __m128i tp4_lo01_ = _mm_unpacklo_epi32((r0), (r1)); \
        __m128i tp4_lo23_ = _mm_unpacklo_epi32((r2), (r3)); \
        __m128i tp4_hi01_ = _mm_unpackhi_epi32((r0), (r1)); \
        __m128i tp4_hi23_ = _mm_unpackhi_epi32((r2), (r3)); \
        \
        (r0) = _mm_unpacklo_epi64(tp4_lo01_, tp4_lo23_); \
        (r1) = _mm_unpackhi_epi64(tp4_lo01_, tp4_lo23_); \
        (r2) = _mm_unpacklo_epi64(tp4_hi01_, tp4_hi23_); \
        (r3) = _mm_unpackhi_epi64(tp4_hi01_, tp4_hi23_); \
    } while (0)
// Broadcast the 32-bit value x into all four lanes of an __m128i.
#define EXPAND128(x) (_mm_set1_epi32((int)(x)))
// Vector with every bit set (-1 in each lane, so no out-of-range
// unsigned-to-int conversion is involved).
#define ALL1 (_mm_set1_epi32(-1))
// Vector with every bit clear.
#define ALL0 (_mm_setzero_si128())
// Rotate every 32-bit lane of x left by r bits.
// Expands to an expression (no trailing ';'), so it can be nested inside
// other intrinsic calls. x is evaluated twice.
#define MD5_ROL(x, r) \
    _mm_or_si128( \
        _mm_slli_epi32((x), (r)), \
        _mm_srli_epi32((x), 32 - (r)) \
    )

// Tail shared by every MD5 step: a = rotl(a, s) + b, lane-wise.
// a must be an __m128i lvalue. Expands to exactly one statement; the
// caller supplies the trailing ';'.
#define MD5_RnA(a, b, s) \
    do { \
        (a) = MD5_ROL((a), (s)); \
        (a) = _mm_add_epi32((a), (b)); \
    } while (0)
// One MD5 round-1 step on four lanes at once:
//     a = b + rotl(a + F(b, c, d) + Xk + Ti, s)
//     F(b, c, d) = (b & c) | (~b & d)
// a must be an __m128i lvalue; b, c, d and Xk are __m128i values; Ti is a
// 32-bit scalar that gets broadcast; s is the rotation count.
// b is evaluated more than once. Expands to exactly one statement; the
// caller supplies the trailing ';'.
#define MD5_ROUND1(a, b, c, d, Xk, s, Ti) \
    do { \
        (a) = _mm_add_epi32((a), \
            _mm_add_epi32( \
                _mm_or_si128(_mm_and_si128((c), (b)), _mm_andnot_si128((b), (d))), \
                _mm_add_epi32((Xk), EXPAND128(Ti)) \
            ) \
        ); \
        MD5_RnA(a, b, s); \
    } while (0)
// One MD5 round-2 step on four lanes at once:
//     a = b + rotl(a + G(b, c, d) + Xk + Ti, s)
//     G(b, c, d) = (b & d) | (c & ~d)
// a must be an __m128i lvalue; b, c, d and Xk are __m128i values; Ti is a
// 32-bit scalar that gets broadcast; s is the rotation count.
// b and d are evaluated more than once. Expands to exactly one statement;
// the caller supplies the trailing ';'.
#define MD5_ROUND2(a, b, c, d, Xk, s, Ti) \
    do { \
        (a) = _mm_add_epi32((a), \
            _mm_add_epi32( \
                _mm_or_si128(_mm_and_si128((d), (b)), _mm_andnot_si128((d), (c))), \
                _mm_add_epi32((Xk), EXPAND128(Ti)) \
            ) \
        ); \
        MD5_RnA(a, b, s); \
    } while (0)
// One MD5 round-3 step on four lanes at once:
//     a = b + rotl(a + H(b, c, d) + Xk + Ti, s)
//     H(b, c, d) = b ^ c ^ d
// a must be an __m128i lvalue; b, c, d and Xk are __m128i values; Ti is a
// 32-bit scalar that gets broadcast; s is the rotation count.
// b is evaluated more than once. Expands to exactly one statement; the
// caller supplies the trailing ';'.
#define MD5_ROUND3(a, b, c, d, Xk, s, Ti) \
    do { \
        (a) = _mm_add_epi32((a), \
            _mm_add_epi32( \
                _mm_xor_si128((b), _mm_xor_si128((c), (d))), \
                _mm_add_epi32((Xk), EXPAND128(Ti)) \
            ) \
        ); \
        MD5_RnA(a, b, s); \
    } while (0)
// One MD5 round-4 step on four lanes at once:
//     a = b + rotl(a + I(b, c, d) + Xk + Ti, s)
//     I(b, c, d) = c ^ (b | ~d)      (~d is formed as d ^ ALL1)
// a must be an __m128i lvalue; b, c, d and Xk are __m128i values; Ti is a
// 32-bit scalar that gets broadcast; s is the rotation count.
// b is evaluated more than once. Expands to exactly one statement; the
// caller supplies the trailing ';'.
#define MD5_ROUND4(a, b, c, d, Xk, s, Ti) \
    do { \
        (a) = _mm_add_epi32((a), \
            _mm_add_epi32( \
                _mm_xor_si128((c), _mm_or_si128((b), _mm_xor_si128((d), ALL1))), \
                _mm_add_epi32((Xk), EXPAND128(Ti)) \
            ) \
        ); \
        MD5_RnA(a, b, s); \
    } while (0)
// MD5 initial state words A, B, C and D, each broadcast to all four lanes.
// md5() starts every message from these and adds them back to the final
// state.
#define MD5_AA EXPAND128(0x67452301)
#define MD5_BB EXPAND128(0xefcdab89)
#define MD5_CC EXPAND128(0x98badcfe)
#define MD5_DD EXPAND128(0x10325476)
// Compute the MD5 digests of four short messages at once, one per SSE lane.
//
// data  In:  four 16-byte slots (h0..h3); slot i holds message i in its
//            first len[i] bytes. The remaining bytes of each slot are
//            not cleared here, so the caller must have zeroed them.
//       Out: slot i holds the 16-byte digest of message i.
// len   Four message lengths in bytes. Each must be at most 15, because
//       the 0x80 padding byte has to land inside the message's own slot.
//
// If any length is larger than 15 the function returns without touching
// *data, instead of writing the padding byte into a neighbouring slot or
// past the end of the structure.
//
// Only message words 0..3 and the length word (14) of the single 64-byte
// MD5 block can be non-zero, which is why every other word is passed as
// ALL0 below.
void md5(MD5Data *data, const uint32 *len)
{
    __m128i a, b, c, d;      // running state, one message per lane
    __m128i w0, w1, w2, w3;  // message words 0..3, one message per lane
    __m128i bitlen;          // message lengths in bits (block word 14)
    uint8 *bytes = (uint8 *)data;
    int i;

    // Reject lengths whose padding byte would fall outside the slot.
    for (i = 0; i < 4; i++)
    {
        if (len[i] > 15)
        {
            return;
        }
    }

    // Append the mandatory 0x80 padding byte to each message.
    for (i = 0; i < 4; i++)
    {
        bytes[16 * i + len[i]] = 0x80;
    }

    bitlen = _mm_set_epi32((int)len[3], (int)len[2], (int)len[1], (int)len[0]);
    bitlen = _mm_slli_epi32(bitlen, 3);

    w0 = data->h0;
    w1 = data->h1;
    w2 = data->h2;
    w3 = data->h3;

    // Slot i holds the four words of message i; transpose so that wN holds
    // word N of all four messages.
    _MM_TRANSPOSE4_PI(w0, w1, w2, w3);

    // Steps 1-4 start from the fixed initial state, so the constant part of
    // each sum is folded in advance:
    //   0xd76aa477 = AA + F(BB, CC, DD) + 0xd76aa478
    //   0xf8fa0bcc = DD + 0xe8c7b756
    //   0xbcdb4dd9 = CC + 0x242070db
    //   0xb18b7a77 = BB + 0xc1bdceee
    a = _mm_add_epi32(EXPAND128(0xd76aa477), w0);
    MD5_RnA(a, MD5_BB, 7);

    d = _mm_add_epi32(EXPAND128(0xf8fa0bcc), w1);
    d = _mm_add_epi32(d, _mm_or_si128(_mm_and_si128(a, MD5_BB), _mm_andnot_si128(a, MD5_CC)));
    MD5_RnA(d, a, 12);

    c = _mm_add_epi32(EXPAND128(0xbcdb4dd9), w2);
    c = _mm_add_epi32(c, _mm_or_si128(_mm_and_si128(d, a), _mm_andnot_si128(d, MD5_BB)));
    MD5_RnA(c, d, 17);

    b = _mm_add_epi32(EXPAND128(0xb18b7a77), w3);
    b = _mm_add_epi32(b, _mm_or_si128(_mm_and_si128(d, c), _mm_andnot_si128(c, a)));
    MD5_RnA(b, c, 22);

    // Round 1, steps 5-16 (words 4..15; only word 14 is non-zero)
    MD5_ROUND1(a, b, c, d, ALL0, 7, 0xf57c0faf);
    MD5_ROUND1(d, a, b, c, ALL0, 12, 0x4787c62a);
    MD5_ROUND1(c, d, a, b, ALL0, 17, 0xa8304613);
    MD5_ROUND1(b, c, d, a, ALL0, 22, 0xfd469501);
    MD5_ROUND1(a, b, c, d, ALL0, 7, 0x698098d8);
    MD5_ROUND1(d, a, b, c, ALL0, 12, 0x8b44f7af);
    MD5_ROUND1(c, d, a, b, ALL0, 17, 0xffff5bb1);
    MD5_ROUND1(b, c, d, a, ALL0, 22, 0x895cd7be);
    MD5_ROUND1(a, b, c, d, ALL0, 7, 0x6b901122);
    MD5_ROUND1(d, a, b, c, ALL0, 12, 0xfd987193);
    MD5_ROUND1(c, d, a, b, bitlen, 17, 0xa679438e);
    MD5_ROUND1(b, c, d, a, ALL0, 22, 0x49b40821);

    // Round 2 (word order 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12)
    MD5_ROUND2(a, b, c, d, w1, 5, 0xf61e2562);
    MD5_ROUND2(d, a, b, c, ALL0, 9, 0xc040b340);
    MD5_ROUND2(c, d, a, b, ALL0, 14, 0x265e5a51);
    MD5_ROUND2(b, c, d, a, w0, 20, 0xe9b6c7aa);
    MD5_ROUND2(a, b, c, d, ALL0, 5, 0xd62f105d);
    MD5_ROUND2(d, a, b, c, ALL0, 9, 0x02441453);
    MD5_ROUND2(c, d, a, b, ALL0, 14, 0xd8a1e681);
    MD5_ROUND2(b, c, d, a, ALL0, 20, 0xe7d3fbc8);
    MD5_ROUND2(a, b, c, d, ALL0, 5, 0x21e1cde6);
    MD5_ROUND2(d, a, b, c, bitlen, 9, 0xc33707d6);
    MD5_ROUND2(c, d, a, b, w3, 14, 0xf4d50d87);
    MD5_ROUND2(b, c, d, a, ALL0, 20, 0x455a14ed);
    MD5_ROUND2(a, b, c, d, ALL0, 5, 0xa9e3e905);
    MD5_ROUND2(d, a, b, c, w2, 9, 0xfcefa3f8);
    MD5_ROUND2(c, d, a, b, ALL0, 14, 0x676f02d9);
    MD5_ROUND2(b, c, d, a, ALL0, 20, 0x8d2a4c8a);

    // Round 3 (word order 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2)
    MD5_ROUND3(a, b, c, d, ALL0, 4, 0xfffa3942);
    MD5_ROUND3(d, a, b, c, ALL0, 11, 0x8771f681);
    MD5_ROUND3(c, d, a, b, ALL0, 16, 0x6d9d6122);
    MD5_ROUND3(b, c, d, a, bitlen, 23, 0xfde5380c);
    MD5_ROUND3(a, b, c, d, w1, 4, 0xa4beea44);
    MD5_ROUND3(d, a, b, c, ALL0, 11, 0x4bdecfa9);
    MD5_ROUND3(c, d, a, b, ALL0, 16, 0xf6bb4b60);
    MD5_ROUND3(b, c, d, a, ALL0, 23, 0xbebfbc70);
    MD5_ROUND3(a, b, c, d, ALL0, 4, 0x289b7ec6);
    MD5_ROUND3(d, a, b, c, w0, 11, 0xeaa127fa);
    MD5_ROUND3(c, d, a, b, w3, 16, 0xd4ef3085);
    MD5_ROUND3(b, c, d, a, ALL0, 23, 0x04881d05);
    MD5_ROUND3(a, b, c, d, ALL0, 4, 0xd9d4d039);
    MD5_ROUND3(d, a, b, c, ALL0, 11, 0xe6db99e5);
    MD5_ROUND3(c, d, a, b, ALL0, 16, 0x1fa27cf8);
    MD5_ROUND3(b, c, d, a, w2, 23, 0xc4ac5665);

    // Round 4 (word order 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9)
    MD5_ROUND4(a, b, c, d, w0, 6, 0xf4292244);
    MD5_ROUND4(d, a, b, c, ALL0, 10, 0x432aff97);
    MD5_ROUND4(c, d, a, b, bitlen, 15, 0xab9423a7);
    MD5_ROUND4(b, c, d, a, ALL0, 21, 0xfc93a039);
    MD5_ROUND4(a, b, c, d, ALL0, 6, 0x655b59c3);
    MD5_ROUND4(d, a, b, c, w3, 10, 0x8f0ccc92);
    MD5_ROUND4(c, d, a, b, ALL0, 15, 0xffeff47d);
    MD5_ROUND4(b, c, d, a, w1, 21, 0x85845dd1);
    MD5_ROUND4(a, b, c, d, ALL0, 6, 0x6fa87e4f);
    MD5_ROUND4(d, a, b, c, ALL0, 10, 0xfe2ce6e0);
    MD5_ROUND4(c, d, a, b, ALL0, 15, 0xa3014314);
    MD5_ROUND4(b, c, d, a, ALL0, 21, 0x4e0811a1);
    MD5_ROUND4(a, b, c, d, ALL0, 6, 0xf7537e82);
    MD5_ROUND4(d, a, b, c, ALL0, 10, 0xbd3af235);
    MD5_ROUND4(c, d, a, b, w2, 15, 0x2ad7d2bb);
    MD5_ROUND4(b, c, d, a, ALL0, 21, 0xeb86d391);

    // Add the initial state back in.
    a = _mm_add_epi32(a, MD5_AA);
    b = _mm_add_epi32(b, MD5_BB);
    c = _mm_add_epi32(c, MD5_CC);
    d = _mm_add_epi32(d, MD5_DD);

    // Transpose back so that each slot receives one message's four state
    // words.
    _MM_TRANSPOSE4_PI(a, b, c, d);

    data->h0 = a;
    data->h1 = b;
    data->h2 = c;
    data->h3 = d;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment