Skip to content

Instantly share code, notes, and snippets.

@mniip
Last active August 29, 2015 14:01
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mniip/220ba4aa1907c7887a34 to your computer and use it in GitHub Desktop.
Save mniip/220ba4aa1907c7887a34 to your computer and use it in GitHub Desktop.
8-at-a-time MD5 hasher using AVX2
#include <stddef.h>
/*
 * Global register declarations binding the names A, B, C and D to xmm0..xmm3.
 * NOTE(review): no type specifier is given (this relies on pre-C99 implicit
 * int), and nothing else visible in this file refers to A, B, C or D; the
 * inline asm in encrypt_short_8 names %ymm0..%ymm3 directly. Presumably these
 * were meant to keep the compiler away from those registers -- confirm they
 * compile with the target toolchain and whether they are still needed.
 */
register A asm("xmm0");
register B asm("xmm1");
register C asm("xmm2");
register D asm("xmm3");
/*
 * Starting values of the four 32-bit hash state words (these are the MD5
 * initial values from RFC 1321). The inline asm in encrypt_short_8 broadcasts
 * A0..D0 into %ymm0..%ymm3 before calling block_round_8, so the objects must
 * stay addressable under these names.
 */
const unsigned int A0 = 0x67452301;
const unsigned int B0 = 0xEFCDAB89;
const unsigned int C0 = 0x98BADCFE;
const unsigned int D0 = 0x10325476;
/*
 * encrypt_short_8 - compute the MD5 digests of eight short messages at once.
 *
 * digest: eight 16-byte output buffers; digest[b] receives the hash of data[b].
 * data:   eight input byte strings.
 * length: eight byte counts. Each must be at most 55 so that the message, the
 *         0x80 terminator and the 64-bit bit count all fit in a single 64-byte
 *         MD5 block.
 *
 * If any length exceeds 55 the function returns without writing to digest
 * (the signature offers no way to report the error).
 *
 * The eight padded blocks are stored word-interleaved: block[w][b] is 32-bit
 * message word w of message b, which is the layout block_round_8 reads through
 * %rdi. The resulting state comes back in %ymm0..%ymm3 (one 32-bit lane per
 * message) and is transposed into the per-message digests.
 */
void encrypt_short_8(char (*digest)[16], const char **data, const size_t *length)
{
	/* Automatic storage: nothing to free and no allocation failure to handle. */
	unsigned int block[16][8] = {{0}};
	unsigned int trans_digest[4][8];
	size_t i;
	int b, w, k;

	/* Validate everything first so a bad lane cannot leave partial output. */
	for(b = 0; b < 8; b++)
		if(length[b] > 55)
			return;

	for(b = 0; b < 8; b++)
	{
		const size_t len = length[b];

		/* MD5 words are little-endian: byte i lands in bits 8*(i%4) of word i/4. */
		for(i = 0; i < len; i++)
			block[i >> 2][b] |= (unsigned int)(unsigned char)data[b][i] << (8 * (i & 3));
		/* Mandatory 0x80 terminator directly after the message. */
		block[len >> 2][b] |= 0x80u << (8 * (len & 3));
		/* Message length in bits, low word then high word. */
		block[14][b] = (unsigned int)(len << 3);
		block[15][b] = (unsigned int)(len >> 29);
	}

	asm volatile
	(
		"vpbroadcastd %[a0], %%ymm0\n\t"
		"vpbroadcastd %[b0], %%ymm1\n\t"
		"vpbroadcastd %[c0], %%ymm2\n\t"
		"vpbroadcastd %[d0], %%ymm3\n\t"
		/* The compiler may keep locals in the 128-byte red zone below %rsp;
		   step over it so the call's return address cannot overwrite them. */
		"subq $128, %%rsp\n\t"
		"call block_round_8\n\t"
		"addq $128, %%rsp\n\t"
		"vmovdqu %%ymm0, 0x00(%[out])\n\t"
		"vmovdqu %%ymm1, 0x20(%[out])\n\t"
		"vmovdqu %%ymm2, 0x40(%[out])\n\t"
		"vmovdqu %%ymm3, 0x60(%[out])\n\t"
		/* Leave the upper YMM halves clean before returning to compiled code. */
		"vzeroupper\n\t"
		:
		: [out] "r"(trans_digest),
		  "D"(block),		/* block_round_8 takes the data pointer in %rdi */
		  [a0] "m"(A0), [b0] "m"(B0), [c0] "m"(C0), [d0] "m"(D0)
		: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
		  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
		  "cc", "memory"	/* asm reads block[] and writes trans_digest[] */
	);

	/* Transpose lane-major state into per-message digests, little-endian bytes. */
	for(b = 0; b < 8; b++)
		for(w = 0; w < 4; w++)
			for(k = 0; k < 4; k++)
				digest[b][4 * w + k] = (char)(trans_digest[w][b] >> (8 * k));
}
# ---------------------------------------------------------------------------
# block_round_8 - one MD5 block transform applied to eight messages in parallel
#
# Syntax: GNU as, AT&T operand order.   Target: x86-64 with AVX2.
#
# C-side view of the data argument:
#     extern void block_round_8(const unsigned int (*data)[8]);
# This is NOT a standard-ABI function: the hash state is passed and returned
# in vector registers, so it can only be called from assembly / inline asm.
#
# In:    %rdi  = data; 32-bit message word w (0..15) of all eight messages is
#                the 32-byte row at offset w*32
#        %ymm0 = A, %ymm1 = B, %ymm2 = C, %ymm3 = D  (one 32-bit lane per message)
# Out:   %ymm0..%ymm3 = incoming state plus the transformed state, per lane
# Clobb: %ymm4..%ymm9, flags
# Keeps: %rdi and every other general-purpose register
# Stack: saved %rbp plus a scratch slot at (%rsp) used to broadcast constants
# ---------------------------------------------------------------------------

# One MD5 step:  M = N + ((M + K + F(N, O, P) + block[B]) <<< S)
#   F        name of the round-function macro; it must leave its result in %ymm8
#   M,N,O,P  state registers in their rotated roles for this step
#   K        32-bit additive constant
#   B        message word index (0..15)
#   S        left-rotate count
# Uses %ymm8 and %ymm9 as temporaries and the dword at (%rsp) as scratch.
.macro iteration F, M, N, O, P, K, B, S
	\F \N, \O, \P
	movl $\K, (%rsp)
	vpbroadcastd (%rsp), %ymm9
	vpaddd %ymm9, %ymm8, %ymm8
	vpaddd \M, %ymm8, %ymm8
	vpaddd \B*4*8(%rdi), %ymm8, %ymm8
	vpslld $\S, %ymm8, %ymm9
	vpsrld $32-\S, %ymm8, %ymm8
	vpor %ymm9, %ymm8, %ymm8
	vpaddd \N, %ymm8, \M
.endm

# Round 1 function: (N & O) | (~N & P)  -> %ymm8   (clobbers %ymm9)
.macro func1 N, O, P
	vpandn \P, \N, %ymm9
	vpand \N, \O, %ymm8
	vpor %ymm9, %ymm8, %ymm8
.endm

# Round 2 function: (P & N) | (~P & O)  -> %ymm8   (func1 with rotated arguments)
.macro func2 N, O, P
	func1 \P, \N, \O
.endm

# Round 3 function: (N ^ O ^ P)  -> %ymm8
.macro func3 N, O, P
	vpxor \N, \O, %ymm8
	vpxor \P, %ymm8, %ymm8
.endm

# Round 4 function: (O ^ (N | ~P))  -> %ymm8
# vpcmpeqd of a register with itself yields all-ones, so the xor computes ~P.
.macro func4 N, O, P
	vpcmpeqd %ymm8, %ymm8, %ymm8
	vpxor \P, %ymm8, %ymm8
	vpor \N, %ymm8, %ymm8
	vpxor \O, %ymm8, %ymm8
.endm

	.text
	.global block_round_8
	.type block_round_8, @function
block_round_8:
	pushq %rbp
	movq %rsp, %rbp
	subq $16, %rsp			# scratch dword at (%rsp) for constant broadcasts

	# Work on copies (%ymm4..%ymm7) so the incoming state in %ymm0..%ymm3
	# is still available for the final addition.
	vmovdqa %ymm0, %ymm4
	vmovdqa %ymm1, %ymm5
	vmovdqa %ymm2, %ymm6
	vmovdqa %ymm3, %ymm7

	# Round 1
	iteration func1, %ymm4, %ymm5, %ymm6, %ymm7, 0xD76AA478, 0, 7
	iteration func1, %ymm7, %ymm4, %ymm5, %ymm6, 0xE8C7B756, 1, 12
	iteration func1, %ymm6, %ymm7, %ymm4, %ymm5, 0x242070DB, 2, 17
	iteration func1, %ymm5, %ymm6, %ymm7, %ymm4, 0xC1BDCEEE, 3, 22
	iteration func1, %ymm4, %ymm5, %ymm6, %ymm7, 0xF57C0FAF, 4, 7
	iteration func1, %ymm7, %ymm4, %ymm5, %ymm6, 0x4787C62A, 5, 12
	iteration func1, %ymm6, %ymm7, %ymm4, %ymm5, 0xA8304613, 6, 17
	iteration func1, %ymm5, %ymm6, %ymm7, %ymm4, 0xFD469501, 7, 22
	iteration func1, %ymm4, %ymm5, %ymm6, %ymm7, 0x698098D8, 8, 7
	iteration func1, %ymm7, %ymm4, %ymm5, %ymm6, 0x8B44F7AF, 9, 12
	iteration func1, %ymm6, %ymm7, %ymm4, %ymm5, 0xFFFF5BB1, 10, 17
	iteration func1, %ymm5, %ymm6, %ymm7, %ymm4, 0x895CD7BE, 11, 22
	iteration func1, %ymm4, %ymm5, %ymm6, %ymm7, 0x6B901122, 12, 7
	iteration func1, %ymm7, %ymm4, %ymm5, %ymm6, 0xFD987193, 13, 12
	iteration func1, %ymm6, %ymm7, %ymm4, %ymm5, 0xA679438E, 14, 17
	iteration func1, %ymm5, %ymm6, %ymm7, %ymm4, 0x49B40821, 15, 22

	# Round 2
	iteration func2, %ymm4, %ymm5, %ymm6, %ymm7, 0xF61E2562, 1, 5
	iteration func2, %ymm7, %ymm4, %ymm5, %ymm6, 0xC040B340, 6, 9
	iteration func2, %ymm6, %ymm7, %ymm4, %ymm5, 0x265E5A51, 11, 14
	iteration func2, %ymm5, %ymm6, %ymm7, %ymm4, 0xE9B6C7AA, 0, 20
	iteration func2, %ymm4, %ymm5, %ymm6, %ymm7, 0xD62F105D, 5, 5
	iteration func2, %ymm7, %ymm4, %ymm5, %ymm6, 0x02441453, 10, 9
	iteration func2, %ymm6, %ymm7, %ymm4, %ymm5, 0xD8A1E681, 15, 14
	iteration func2, %ymm5, %ymm6, %ymm7, %ymm4, 0xE7D3FBC8, 4, 20
	iteration func2, %ymm4, %ymm5, %ymm6, %ymm7, 0x21E1CDE6, 9, 5
	iteration func2, %ymm7, %ymm4, %ymm5, %ymm6, 0xC33707D6, 14, 9
	iteration func2, %ymm6, %ymm7, %ymm4, %ymm5, 0xF4D50D87, 3, 14
	iteration func2, %ymm5, %ymm6, %ymm7, %ymm4, 0x455A14ED, 8, 20
	iteration func2, %ymm4, %ymm5, %ymm6, %ymm7, 0xA9E3E905, 13, 5
	iteration func2, %ymm7, %ymm4, %ymm5, %ymm6, 0xFCEFA3F8, 2, 9
	iteration func2, %ymm6, %ymm7, %ymm4, %ymm5, 0x676F02D9, 7, 14
	iteration func2, %ymm5, %ymm6, %ymm7, %ymm4, 0x8D2A4C8A, 12, 20

	# Round 3
	iteration func3, %ymm4, %ymm5, %ymm6, %ymm7, 0xFFFA3942, 5, 4
	iteration func3, %ymm7, %ymm4, %ymm5, %ymm6, 0x8771F681, 8, 11
	iteration func3, %ymm6, %ymm7, %ymm4, %ymm5, 0x6D9D6122, 11, 16
	iteration func3, %ymm5, %ymm6, %ymm7, %ymm4, 0xFDE5380C, 14, 23
	iteration func3, %ymm4, %ymm5, %ymm6, %ymm7, 0xA4BEEA44, 1, 4
	iteration func3, %ymm7, %ymm4, %ymm5, %ymm6, 0x4BDECFA9, 4, 11
	iteration func3, %ymm6, %ymm7, %ymm4, %ymm5, 0xF6BB4B60, 7, 16
	iteration func3, %ymm5, %ymm6, %ymm7, %ymm4, 0xBEBFBC70, 10, 23
	iteration func3, %ymm4, %ymm5, %ymm6, %ymm7, 0x289B7EC6, 13, 4
	iteration func3, %ymm7, %ymm4, %ymm5, %ymm6, 0xEAA127FA, 0, 11
	iteration func3, %ymm6, %ymm7, %ymm4, %ymm5, 0xD4EF3085, 3, 16
	iteration func3, %ymm5, %ymm6, %ymm7, %ymm4, 0x04881D05, 6, 23
	iteration func3, %ymm4, %ymm5, %ymm6, %ymm7, 0xD9D4D039, 9, 4
	iteration func3, %ymm7, %ymm4, %ymm5, %ymm6, 0xE6DB99E5, 12, 11
	iteration func3, %ymm6, %ymm7, %ymm4, %ymm5, 0x1FA27CF8, 15, 16
	iteration func3, %ymm5, %ymm6, %ymm7, %ymm4, 0xC4AC5665, 2, 23

	# Round 4
	iteration func4, %ymm4, %ymm5, %ymm6, %ymm7, 0xF4292244, 0, 6
	iteration func4, %ymm7, %ymm4, %ymm5, %ymm6, 0x432AFF97, 7, 10
	iteration func4, %ymm6, %ymm7, %ymm4, %ymm5, 0xAB9423A7, 14, 15
	iteration func4, %ymm5, %ymm6, %ymm7, %ymm4, 0xFC93A039, 5, 21
	iteration func4, %ymm4, %ymm5, %ymm6, %ymm7, 0x655B59C3, 12, 6
	iteration func4, %ymm7, %ymm4, %ymm5, %ymm6, 0x8F0CCC92, 3, 10
	iteration func4, %ymm6, %ymm7, %ymm4, %ymm5, 0xFFEFF47D, 10, 15
	iteration func4, %ymm5, %ymm6, %ymm7, %ymm4, 0x85845DD1, 1, 21
	iteration func4, %ymm4, %ymm5, %ymm6, %ymm7, 0x6FA87E4F, 8, 6
	iteration func4, %ymm7, %ymm4, %ymm5, %ymm6, 0xFE2CE6E0, 15, 10
	iteration func4, %ymm6, %ymm7, %ymm4, %ymm5, 0xA3014314, 6, 15
	iteration func4, %ymm5, %ymm6, %ymm7, %ymm4, 0x4E0811A1, 13, 21
	iteration func4, %ymm4, %ymm5, %ymm6, %ymm7, 0xF7537E82, 4, 6
	iteration func4, %ymm7, %ymm4, %ymm5, %ymm6, 0xBD3AF235, 11, 10
	iteration func4, %ymm6, %ymm7, %ymm4, %ymm5, 0x2AD7D2BB, 2, 15
	iteration func4, %ymm5, %ymm6, %ymm7, %ymm4, 0xEB86D391, 9, 21

	# Fold the transformed copy back into the incoming state.
	vpaddd %ymm4, %ymm0, %ymm0
	vpaddd %ymm5, %ymm1, %ymm1
	vpaddd %ymm6, %ymm2, %ymm2
	vpaddd %ymm7, %ymm3, %ymm3

	# No vzeroupper here: the results are returned in the full %ymm0..%ymm3.
	leaveq
	retq
	.size block_round_8, .-block_round_8

# Declare that this object does not need an executable stack.
	.pushsection .note.GNU-stack,"",@progbits
	.popsection
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment