-
-
Save dougallj/7766e068779d2bfa11bab9f337762d59 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Demo of technique for handling unaligned start/end chunks | |
// without reading out-of-bounds - does not build. | |
template <uint32_t Poly> | |
inline constexpr poly64x2_t fold128_constant(int byte_distance) { | |
return poly64x2_t{k_shift<Poly>(byte_distance * 8 + 32), | |
k_shift<Poly>(byte_distance * 8 - 32)}; | |
} | |
template <uint32_t Poly, int NumChains> | |
uint32_t generic_crc32(uint32_t crc, uint8_t *p, size_t size) { | |
assert(size >= 48); | |
constexpr uint8_t shift_indices_data[] __attribute__((aligned(64))) = { | |
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, | |
252, 253, 254, 255, 0, 1, 2, 3, 4, 5, 6, 7, | |
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, | |
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; | |
const uint8_t *shift_indices = shift_indices_data + 16; | |
constexpr poly64x2_t end_table[] = { | |
fold128_constant<Poly>(16), fold128_constant<Poly>(17), | |
fold128_constant<Poly>(18), fold128_constant<Poly>(19), | |
fold128_constant<Poly>(20), fold128_constant<Poly>(21), | |
fold128_constant<Poly>(22), fold128_constant<Poly>(23), | |
fold128_constant<Poly>(24), fold128_constant<Poly>(25), | |
fold128_constant<Poly>(26), fold128_constant<Poly>(27), | |
fold128_constant<Poly>(28), fold128_constant<Poly>(29), | |
fold128_constant<Poly>(30), fold128_constant<Poly>(31), | |
}; | |
poly64x2_t k3k4_2 = fold128_constant<Poly>(2 * sizeof(uint8x16_t)); | |
poly64x2_t k3k4 = fold128_constant<Poly>(sizeof(uint8x16_t)); | |
int start_align = (uintptr_t)bytes & 0xF; | |
uint8x16_t unaligned_header = vld1q_u8(bytes); | |
uint8x16_t aligned_header = vld1q_u8(bytes + 16 - start_align); | |
uint8x16_t final_word = vld1q_u8(end - 16); | |
unaligned_header = veorq_u8(unaligned_header, init); | |
// shift in duplicate data from vals[0] | |
aligned_header = | |
vqtbx1q_u8(aligned_header, unaligned_header, | |
vld1q_u8(shift_indices - (start_align - 16)); | |
// shift out duplicate data from vals[0] | |
unaligned_header = vqtbl1q_u8( | |
unaligned_header, vld1q_u8(shift_indices - start_align)); | |
message128 = fold128(unaligned_header, aligned_header, k3k4); | |
uint8_t *p = bytes + 32 - start_align; | |
int end_align = (uintptr_t)end & 0xF; | |
uint8_t *aligned_end = end - 16 - end_align; | |
uint8x16_t penultimate = | |
vqtbl1q_u8(vld1q_u8(aligned_end), | |
vld1q_u8(shift_indices + (end_align - 16))); | |
final_word = fold128(penultimate, final_word, k3k4); | |
if (aligned_end - p >= (NumChains - 1) * sizeof(uint8x16_t)) { | |
// load first 16 * NumChains chunk | |
uint8x16_t vals[NumChains]; | |
constexpr poly64x2_t k1k2 = | |
fold128_constant<Poly>(NumChains * sizeof(uint8x16_t)); | |
uint8_t *start_p = p + (16 * (NumChains - 1)); | |
size = (uintptr_t)aligned_end - (uintptr_t)start_p; | |
size_t fast_size = size / (16 * NumChains) * (16 * NumChains); | |
uint8_t *fast_end = start_p + fast_size; | |
p -= 0x10; | |
for (int i = 0; i < NumChains; i++) { | |
vals[i] = i == 0 ? message128 : vld1q_u8(p); | |
p += 0x10; | |
} | |
while (p != fast_end) { | |
#pragma unroll | |
for (int i = 0; i < NumChains; i++) { | |
vals[i] = fold128(vals[i], vld1q_u8(p), k1k2); | |
p += 0x10; | |
} | |
} | |
constexpr poly64x2_t k3k4 = fold128_constant<Poly>(sizeof(uint8x16_t)); | |
if constexpr (NumChains % 4 == 0) { | |
constexpr poly64x2_t k1 = | |
fold128_constant<Poly>((NumChains / 2) * sizeof(uint8x16_t)); | |
for (int i = 0; i < NumChains / 2; i++) | |
vals[i] = fold128(vals[i], vals[i + (NumChains / 2)], k1); | |
constexpr poly64x2_t k2 = | |
fold128_constant<Poly>((NumChains / 4) * sizeof(uint8x16_t)); | |
for (int i = 0; i < NumChains / 4; i++) | |
vals[i] = fold128(vals[i], vals[i + (NumChains / 4)], k2); | |
message128 = vals[0]; | |
for (int i = 1; i < NumChains / 4; i++) | |
message128 = fold128(message128, vals[i], k3k4); | |
} else { | |
// Warning: very serial if NumChains is large | |
message128 = vals[0]; | |
for (int i = 1; i < NumChains; i++) | |
message128 = fold128(message128, vals[i], k3k4); | |
} | |
} | |
while (p != aligned_end) { | |
message128 = fold128(message128, vld1q_u8(p), k3k4); | |
p += 0x10; | |
} | |
message128 = fold128(message128, final_word, end_table[end_align]); | |
// etc. | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment