Skip to content

Instantly share code, notes, and snippets.

@tarcieri
Created April 3, 2021 16:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tarcieri/414a3300072160f372b5d93ccfce280b to your computer and use it in GitHub Desktop.
core::simd-based SHA-256 implementation using ARMv8 Cryptography Extensions
#![feature(stdsimd)]
// Based on the following C intrinsics implementation:
// <https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c>
//
// Original C written and placed in public domain by Jeffrey Walton.
// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
// Barry O'Rourke for the mbedTLS project.
use core::arch::aarch64::*;
/// SHA-256 round constants `K[0..64]` (FIPS 180-4, §4.2.2): the first 32 bits
/// of the fractional parts of the cube roots of the first 64 primes.
const K: [u32; 64] = [
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
];

/// Compress one or more 64-byte message blocks into the SHA-256 hash state
/// using the ARMv8 Cryptography Extensions (SHA-2 instructions).
///
/// `state` holds the eight 32-bit hash words `H0..H7` and is updated in
/// place; each element of `blocks` is one full 64-byte message block
/// (padding is the caller's responsibility).
///
/// NOTE(review): this relies on the CPU providing the `sha2` target feature;
/// callers are expected to have checked for it (e.g. via
/// `is_aarch64_feature_detected!("sha2")`) before calling — confirm against
/// the call sites.
pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
    // SAFETY: all intrinsics below operate on values we own or on in-bounds
    // pointers derived from `state` / `block` (`state` has 8 u32s, each
    // `block` has 64 bytes, and every offset used is within those bounds).
    // The remaining precondition is hardware SHA-2 support (see note above).
    // The original code called these `unsafe fn` intrinsics outside any
    // `unsafe` block and passed a `*const` pointer to `vst1q_u32` (which
    // takes `*mut u32`); both are fixed here.
    unsafe {
        // Load the state as two 4-lane vectors: {H0..H3} and {H4..H7}.
        let mut state0 = vld1q_u32(state.as_ptr());
        let mut state1 = vld1q_u32(state.as_ptr().add(4));

        for block in blocks {
            // Remember the state entering this block; it is added back in
            // after the 64 rounds (the Davies–Meyer feed-forward).
            let state0_save = state0;
            let state1_save = state1;

            // Load the 16 big-endian message words. `vrev32q_u8` reverses
            // the bytes within each 32-bit lane, converting to native
            // little-endian order (replaces the pointer-transmute hack the
            // original used in place of `vreinterpretq_u8_u32`).
            let mut w = [
                vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block.as_ptr()))),
                vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block.as_ptr().add(16)))),
                vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block.as_ptr().add(32)))),
                vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block.as_ptr().add(48)))),
            ];

            // 64 rounds, four at a time. Round group `t` consumes
            // `w[t % 4] + K[4t..4t+4]` (the pre-schedule-update value, as in
            // the unrolled original); the first 12 groups also extend the
            // message schedule from the sliding window `w[t..t+4]` so the
            // slot is ready again four groups later.
            for t in 0..16 {
                let wk = vaddq_u32(w[t % 4], vld1q_u32(&K[4 * t]));
                if t < 12 {
                    let s = vsha256su0q_u32(w[t % 4], w[(t + 1) % 4]);
                    w[t % 4] = vsha256su1q_u32(s, w[(t + 2) % 4], w[(t + 3) % 4]);
                }
                let prev = state0;
                state0 = vsha256hq_u32(state0, state1, wk);
                state1 = vsha256h2q_u32(state1, prev, wk);
            }

            // Feed-forward: add the saved input state back in.
            state0 = vaddq_u32(state0, state0_save);
            state1 = vaddq_u32(state1, state1_save);
        }

        // Store the updated state (mutable pointers, as vst1q_u32 requires).
        vst1q_u32(state.as_mut_ptr(), state0);
        vst1q_u32(state.as_mut_ptr().add(4), state1);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment