Skip to content

Instantly share code, notes, and snippets.

@dougallj
Last active Jun 6, 2022
Embed
What would you like to do?
// Demo of technique for handling unaligned start/end chunks
// without reading out-of-bounds - does not build.
// Builds the pair of carry-less-multiply folding constants needed to fold a
// 128-bit chunk forward by `byte_distance` bytes: one constant for the high
// 64-bit half (distance + 32 bits) and one for the low half (distance - 32).
template <uint32_t Poly>
inline constexpr poly64x2_t fold128_constant(int byte_distance) {
  const int bit_distance = byte_distance * 8;
  return poly64x2_t{k_shift<Poly>(bit_distance + 32),
                    k_shift<Poly>(bit_distance - 32)};
}
// Computes a CRC-32 (polynomial Poly) over [p, p+size) using PMULL 128-bit
// folding, with NumChains independent accumulator chains to hide multiply
// latency. The point of the demo: both unaligned boundary chunks are handled
// with in-bounds 16-byte loads plus TBL/TBX shuffles that discard the
// duplicated lanes, so nothing outside the buffer is ever read.
//
// Requires size >= 48 so the header and footer handling cannot overlap.
//
// NOTE(review): the original gist is a non-building demo. Fixed here:
// missing ')' on the vqtbx1q_u8 call, the illegal redeclaration of the
// parameter `p`, and the undeclared `bytes`/`end`/`init`/`message128`.
// The final reduction of message128 to a 32-bit CRC is still omitted.
template <uint32_t Poly, int NumChains>
uint32_t generic_crc32(uint32_t crc, uint8_t *p, size_t size) {
  assert(size >= 48);
  uint8_t *bytes = p;           // start of the message
  uint8_t *end = bytes + size;  // one past the last byte
  // Seed vector: incoming CRC in the low 32 bits, XORed into the first
  // 16-byte block below.
  // NOTE(review): whether the caller pre-inverts `crc` (and whether it must
  // be bit-reflected) depends on the surrounding convention — confirm
  // against the full gist before relying on this seeding.
  uint8x16_t init =
      vreinterpretq_u8_u32(vsetq_lane_u32(crc, vdupq_n_u32(0), 0));

  // TBL index table. vqtbl1q/vqtbx1q treat indices >= 16 as out of range
  // (producing 0, or leaving the destination lane untouched for TBX), so
  // the 240..255 entries mean "ignore this lane". Taking a window at
  // shift_indices +/- k keeps or drops exactly k boundary bytes.
  constexpr uint8_t shift_indices_data[] __attribute__((aligned(64))) = {
      240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
      252, 253, 254, 255, 0,   1,   2,   3,   4,   5,   6,   7,
      8,   9,   10,  11,  12,  13,  14,  15,  16,  17,  18,  19,
      20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31};
  const uint8_t *shift_indices = shift_indices_data + 16;

  // Folding constants for a final-chunk distance of 16..31 bytes,
  // indexed by end_align (0..15).
  constexpr poly64x2_t end_table[] = {
      fold128_constant<Poly>(16), fold128_constant<Poly>(17),
      fold128_constant<Poly>(18), fold128_constant<Poly>(19),
      fold128_constant<Poly>(20), fold128_constant<Poly>(21),
      fold128_constant<Poly>(22), fold128_constant<Poly>(23),
      fold128_constant<Poly>(24), fold128_constant<Poly>(25),
      fold128_constant<Poly>(26), fold128_constant<Poly>(27),
      fold128_constant<Poly>(28), fold128_constant<Poly>(29),
      fold128_constant<Poly>(30), fold128_constant<Poly>(31),
  };
  poly64x2_t k3k4_2 = fold128_constant<Poly>(2 * sizeof(uint8x16_t));
  (void)k3k4_2;  // unused in this excerpt; presumably used by the omitted tail
  poly64x2_t k3k4 = fold128_constant<Poly>(sizeof(uint8x16_t));

  // --- Unaligned head ------------------------------------------------------
  int start_align = (uintptr_t)bytes & 0xF;
  uint8x16_t unaligned_header = vld1q_u8(bytes);
  uint8x16_t aligned_header = vld1q_u8(bytes + 16 - start_align);
  uint8x16_t final_word = vld1q_u8(end - 16);
  unaligned_header = veorq_u8(unaligned_header, init);
  // shift in duplicate data from vals[0]
  aligned_header =
      vqtbx1q_u8(aligned_header, unaligned_header,
                 vld1q_u8(shift_indices - (start_align - 16)));
  // shift out duplicate data from vals[0]
  unaligned_header =
      vqtbl1q_u8(unaligned_header, vld1q_u8(shift_indices - start_align));
  uint8x16_t message128 = fold128(unaligned_header, aligned_header, k3k4);
  p = bytes + 32 - start_align;  // first fully-aligned 16-byte chunk

  // --- Unaligned tail ------------------------------------------------------
  int end_align = (uintptr_t)end & 0xF;
  uint8_t *aligned_end = end - 16 - end_align;
  // Last aligned chunk, with the bytes that overlap final_word masked off.
  uint8x16_t penultimate =
      vqtbl1q_u8(vld1q_u8(aligned_end),
                 vld1q_u8(shift_indices + (end_align - 16)));
  final_word = fold128(penultimate, final_word, k3k4);

  if (aligned_end - p >= (NumChains - 1) * sizeof(uint8x16_t)) {
    // Multi-chain main loop: NumChains independent folds per iteration.
    uint8x16_t vals[NumChains];
    constexpr poly64x2_t k1k2 =
        fold128_constant<Poly>(NumChains * sizeof(uint8x16_t));
    uint8_t *start_p = p + (16 * (NumChains - 1));
    size = (uintptr_t)aligned_end - (uintptr_t)start_p;
    // Round down to a whole number of 16*NumChains-byte super-chunks.
    size_t fast_size = size / (16 * NumChains) * (16 * NumChains);
    uint8_t *fast_end = start_p + fast_size;
    p -= 0x10;
    // Prime the chains; chain 0 starts from the folded header.
    for (int i = 0; i < NumChains; i++) {
      vals[i] = i == 0 ? message128 : vld1q_u8(p);
      p += 0x10;
    }
    while (p != fast_end) {
#pragma unroll
      for (int i = 0; i < NumChains; i++) {
        vals[i] = fold128(vals[i], vld1q_u8(p), k1k2);
        p += 0x10;
      }
    }
    // Shadows the outer k3k4 with an identical compile-time constant.
    constexpr poly64x2_t k3k4 = fold128_constant<Poly>(sizeof(uint8x16_t));
    if constexpr (NumChains % 4 == 0) {
      // Tree reduction: halve, quarter, then fold the survivors serially.
      constexpr poly64x2_t k1 =
          fold128_constant<Poly>((NumChains / 2) * sizeof(uint8x16_t));
      for (int i = 0; i < NumChains / 2; i++)
        vals[i] = fold128(vals[i], vals[i + (NumChains / 2)], k1);
      constexpr poly64x2_t k2 =
          fold128_constant<Poly>((NumChains / 4) * sizeof(uint8x16_t));
      for (int i = 0; i < NumChains / 4; i++)
        vals[i] = fold128(vals[i], vals[i + (NumChains / 4)], k2);
      message128 = vals[0];
      for (int i = 1; i < NumChains / 4; i++)
        message128 = fold128(message128, vals[i], k3k4);
    } else {
      // Warning: very serial if NumChains is large
      message128 = vals[0];
      for (int i = 1; i < NumChains; i++)
        message128 = fold128(message128, vals[i], k3k4);
    }
  }
  // Single-chain cleanup of any remaining aligned chunks.
  while (p != aligned_end) {
    message128 = fold128(message128, vld1q_u8(p), k3k4);
    p += 0x10;
  }
  message128 = fold128(message128, final_word, end_table[end_align]);
  // etc. — fold message128 to 64 bits and Barrett-reduce to the final CRC.
  (void)message128;
  return crc;  // TODO: return the reduced CRC; placeholder avoids UB from
               // falling off the end of a non-void function.
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment