Skip to content

Instantly share code, notes, and snippets.

@dougallj
Last active Jun 6, 2022
Embed
What would you like to do?
// Demo of technique for handling unaligned start/end chunks
// without reading out-of-bounds - does not build.
// Builds the pair of carry-less-multiply folding constants needed to fold a
// 128-bit chunk forward by `byte_distance` bytes: one constant for the high
// 64-bit half (distance + 32 bits) and one for the low half (distance - 32).
template <uint32_t Poly>
inline constexpr poly64x2_t fold128_constant(int byte_distance) {
  const int bit_distance = byte_distance * 8;
  return poly64x2_t{k_shift<Poly>(bit_distance + 32),
                    k_shift<Poly>(bit_distance - 32)};
}
// Computes a CRC-32 (polynomial Poly) over [p, p+size) using PMULL 128-bit
// folding, with NumChains independent accumulator chains to hide multiply
// latency. The point of the demo: both unaligned boundary chunks are handled
// with in-bounds 16-byte loads plus TBL/TBX shuffles that discard the
// duplicated lanes, so nothing outside the buffer is ever read.
//
// Requires size >= 48 so the header and footer handling cannot overlap.
//
// NOTE(review): the original gist is a non-building demo. Fixed here:
// missing ')' on the vqtbx1q_u8 call, the illegal redeclaration of the
// parameter `p`, and the undeclared `bytes`/`end`/`init`/`message128`.
// The final reduction of message128 to a 32-bit CRC is still omitted.
template <uint32_t Poly, int NumChains>
uint32_t generic_crc32(uint32_t crc, uint8_t *p, size_t size) {
  assert(size >= 48);
  uint8_t *bytes = p;           // start of the message
  uint8_t *end = bytes + size;  // one past the last byte
  // Seed vector: incoming CRC in the low 32 bits, XORed into the first
  // 16-byte block below.
  // NOTE(review): whether the caller pre-inverts `crc` (and whether it must
  // be bit-reflected) depends on the surrounding convention — confirm
  // against the full gist before relying on this seeding.
  uint8x16_t init =
      vreinterpretq_u8_u32(vsetq_lane_u32(crc, vdupq_n_u32(0), 0));

  // TBL index table. vqtbl1q/vqtbx1q treat indices >= 16 as out of range
  // (producing 0, or leaving the destination lane untouched for TBX), so
  // the 240..255 entries mean "ignore this lane". Taking a window at
  // shift_indices +/- k keeps or drops exactly k boundary bytes.
  constexpr uint8_t shift_indices_data[] __attribute__((aligned(64))) = {
      240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
      252, 253, 254, 255, 0,   1,   2,   3,   4,   5,   6,   7,
      8,   9,   10,  11,  12,  13,  14,  15,  16,  17,  18,  19,
      20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31};
  const uint8_t *shift_indices = shift_indices_data + 16;

  // Folding constants for a final-chunk distance of 16..31 bytes,
  // indexed by end_align (0..15).
  constexpr poly64x2_t end_table[] = {
      fold128_constant<Poly>(16), fold128_constant<Poly>(17),
      fold128_constant<Poly>(18), fold128_constant<Poly>(19),
      fold128_constant<Poly>(20), fold128_constant<Poly>(21),
      fold128_constant<Poly>(22), fold128_constant<Poly>(23),
      fold128_constant<Poly>(24), fold128_constant<Poly>(25),
      fold128_constant<Poly>(26), fold128_constant<Poly>(27),
      fold128_constant<Poly>(28), fold128_constant<Poly>(29),
      fold128_constant<Poly>(30), fold128_constant<Poly>(31),
  };
  poly64x2_t k3k4_2 = fold128_constant<Poly>(2 * sizeof(uint8x16_t));
  (void)k3k4_2;  // unused in this excerpt; presumably used by the omitted tail
  poly64x2_t k3k4 = fold128_constant<Poly>(sizeof(uint8x16_t));

  // --- Unaligned head ------------------------------------------------------
  int start_align = (uintptr_t)bytes & 0xF;
  uint8x16_t unaligned_header = vld1q_u8(bytes);
  uint8x16_t aligned_header = vld1q_u8(bytes + 16 - start_align);
  uint8x16_t final_word = vld1q_u8(end - 16);
  unaligned_header = veorq_u8(unaligned_header, init);
  // shift in duplicate data from vals[0]
  aligned_header =
      vqtbx1q_u8(aligned_header, unaligned_header,
                 vld1q_u8(shift_indices - (start_align - 16)));
  // shift out duplicate data from vals[0]
  unaligned_header =
      vqtbl1q_u8(unaligned_header, vld1q_u8(shift_indices - start_align));
  uint8x16_t message128 = fold128(unaligned_header, aligned_header, k3k4);
  p = bytes + 32 - start_align;  // first fully-aligned 16-byte chunk

  // --- Unaligned tail ------------------------------------------------------
  int end_align = (uintptr_t)end & 0xF;
  uint8_t *aligned_end = end - 16 - end_align;
  // Last aligned chunk, with the bytes that overlap final_word masked off.
  uint8x16_t penultimate =
      vqtbl1q_u8(vld1q_u8(aligned_end),
                 vld1q_u8(shift_indices + (end_align - 16)));
  final_word = fold128(penultimate, final_word, k3k4);

  if (aligned_end - p >= (NumChains - 1) * sizeof(uint8x16_t)) {
    // Multi-chain main loop: NumChains independent folds per iteration.
    uint8x16_t vals[NumChains];
    constexpr poly64x2_t k1k2 =
        fold128_constant<Poly>(NumChains * sizeof(uint8x16_t));
    uint8_t *start_p = p + (16 * (NumChains - 1));
    size = (uintptr_t)aligned_end - (uintptr_t)start_p;
    // Round down to a whole number of 16*NumChains-byte super-chunks.
    size_t fast_size = size / (16 * NumChains) * (16 * NumChains);
    uint8_t *fast_end = start_p + fast_size;
    p -= 0x10;
    // Prime the chains; chain 0 starts from the folded header.
    for (int i = 0; i < NumChains; i++) {
      vals[i] = i == 0 ? message128 : vld1q_u8(p);
      p += 0x10;
    }
    while (p != fast_end) {
#pragma unroll
      for (int i = 0; i < NumChains; i++) {
        vals[i] = fold128(vals[i], vld1q_u8(p), k1k2);
        p += 0x10;
      }
    }
    // Shadows the outer k3k4 with an identical compile-time constant.
    constexpr poly64x2_t k3k4 = fold128_constant<Poly>(sizeof(uint8x16_t));
    if constexpr (NumChains % 4 == 0) {
      // Tree reduction: halve, quarter, then fold the survivors serially.
      constexpr poly64x2_t k1 =
          fold128_constant<Poly>((NumChains / 2) * sizeof(uint8x16_t));
      for (int i = 0; i < NumChains / 2; i++)
        vals[i] = fold128(vals[i], vals[i + (NumChains / 2)], k1);
      constexpr poly64x2_t k2 =
          fold128_constant<Poly>((NumChains / 4) * sizeof(uint8x16_t));
      for (int i = 0; i < NumChains / 4; i++)
        vals[i] = fold128(vals[i], vals[i + (NumChains / 4)], k2);
      message128 = vals[0];
      for (int i = 1; i < NumChains / 4; i++)
        message128 = fold128(message128, vals[i], k3k4);
    } else {
      // Warning: very serial if NumChains is large
      message128 = vals[0];
      for (int i = 1; i < NumChains; i++)
        message128 = fold128(message128, vals[i], k3k4);
    }
  }
  // Single-chain cleanup of any remaining aligned chunks.
  while (p != aligned_end) {
    message128 = fold128(message128, vld1q_u8(p), k3k4);
    p += 0x10;
  }
  message128 = fold128(message128, final_word, end_table[end_align]);
  // etc. — fold message128 to 64 bits and Barrett-reduce to the final CRC.
  (void)message128;
  return crc;  // TODO: return the reduced CRC; placeholder avoids UB from
               // falling off the end of a non-void function.
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment