Created: June 6, 2023 20:37
Save danlark1/7912195855fb7087ba2de6e81803beed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Advances the decompression cursor past the current tag's trailing bytes and
// speculatively loads the next tag byte.
//
// Invariant (holds on entry and is re-established on exit): *tag is the value
// of a tag byte and ip (== *ip_p) points one byte past that tag byte. In both
// branches below the new *tag is exactly new_ip[-1], so the invariant is
// self-sustaining across iterations.
//
// Returns the 2-bit type of the *current* tag (low two bits of *tag);
// 0 means "literal", non-zero means "copy".
//
// Requires at least 64 slop bytes readable past ip so the two speculative
// loads below are always in bounds (see the note above the loads).
//
// NOTE(review): for copy tags the next tag is read at ip[tag_type], i.e. a
// type-3 tag is assumed to be followed by 3 bytes here; presumably 4-byte
// offset copies are diverted before reaching this fast path — confirm against
// the caller's decompression loop.
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
inline size_t AdvanceToNextTagX86Optimized(const uint8_t** ip_p, size_t* tag) {
  const uint8_t*& ip = *ip_p;
  // This section is crucial for the throughput of the decompression loop.
  // The latency of an iteration is fundamentally constrained by the
  // following data chain on ip.
  // ip -> c = Load(ip) -> ip1 = ip + 1 + (c & 3) -> ip = ip1 or ip2
  //                       ip2 = ip + 2 + (c >> 2)
  // This amounts to 8 cycles.
  // 5 (load) + 1 (c & 3) + 1 (lea ip1, [ip + (c & 3) + 1]) + 1 (cmov)
  size_t literal_len = *tag >> 2;  // upper 6 bits of the tag byte
  size_t tag_type = *tag;          // low 2 bits isolated just below
  bool is_literal;
#if defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(__x86_64__)
  // TODO clang misses the fact that the (c & 3) already correctly
  // sets the zero flag.
  asm("and $3, %k[tag_type]\n\t"
      : [tag_type] "+r"(tag_type), "=@ccz"(is_literal)
      :: "cc");
#else
  // Portable fallback: mask the type bits; type 0 is a literal.
  tag_type &= 3;
  is_literal = (tag_type == 0);
#endif
  // TODO
  // This code is subtle. Loading the values first and then cmov-ing has less
  // latency than cmov-ing ip and then loading. However clang would move the
  // loads in an optimization phase; volatile prevents this transformation.
  // Note that we have enough slop bytes (64) that the loads are always valid.
  size_t tag_literal =
      static_cast<const volatile uint8_t*>(ip)[1 + literal_len];
  size_t tag_copy = static_cast<const volatile uint8_t*>(ip)[tag_type];
  // Branchless select (cmov) of the prefetched next tag byte.
  *tag = is_literal ? tag_literal : tag_copy;
  // Candidate next positions: past a copy's extra bytes, or past a literal's
  // payload; the select below is the cmov on the critical ip chain.
  const uint8_t* ip_copy = ip + 1 + tag_type;
  const uint8_t* ip_literal = ip + 2 + literal_len;
  ip = is_literal ? ip_literal : ip_copy;
#if defined(__GNUC__) && defined(__x86_64__)
  // TODO Clang is "optimizing" zero-extension (a totally free
  // operation) this means that after the cmov of tag, it emits another movzb
  // tag, byte(tag). It really matters as it's on the core chain. This dummy
  // asm, persuades clang to do the zero-extension at the load (it's automatic)
  // removing the expensive movzb.
  asm("" ::"r"(tag_copy));
#endif
  return tag_type;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.