Skip to content

Instantly share code, notes, and snippets.

@Nexuapex
Created October 1, 2018 01:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Nexuapex/b837cfa19e6db51be0952d7f654e8a9d to your computer and use it in GitHub Desktop.
// Abseil absl::flat_hash_map
// Analysis of the critical path for perfectly predicted load hit.
// https://godbolt.org/z/P_ksaa
//
// x86_64
// Instruction latencies have been entirely ignored.
//
// Annotation legend: [xNN] names a dependency chain (letter) and the earliest
// step at which the instruction can issue given its inputs (number).
// Chains: "a" = hash/probe (critical path), "b" = metadata pointer,
// "c" = capacity mask, "d" = H2 byte splat, "e" = slot-array pointer.
// Two instructions sharing a step number are independent and can issue
// in parallel (e.g. test+bsf at a09 both depend only on a08's result).
# rdi = desired key
mov rax, qword ptr [rip + kHashSeed] # [a00] rax = per-process seed (with entropy)
add rax, rdi # [a01] rax = hash so far
movabs rcx, 0x9DDFEA08EB382D69 # rcx = CityHash magic number
mul rcx # [a02] 128-bit multiply rax * rcx...
xor rdx, rax # [a03] ...and mix to 64-bits (rdx = hash of key)
mov r10, qword ptr [rip + g_map] # [b00] r10 = pointer to metadata bytes
mov rax, rdx # copy hash; rdx keeps the low bits for the H2 extraction below
shr rax, 7 # [a04] rax = hash bits 7-63
mov rcx, r10 # copy metadata pointer so r10 survives for the probe load below
shr rcx, 12 # [b01] rcx = page address (ASLR entropy)
xor rcx, rax # [a05] rcx = H1 (with added entropy) -- n.b. also b02
mov rsi, qword ptr [rip + g_map+24] # [c00] rsi = power-of-two-minus-one capacity mask
and dl, 127 # [d04] dl = H2 (hash bits 0-6) -- n.b. branched from "a" dep chain
movzx eax, dl # eax = H2 (movzx avoids a partial-register merge on rax)
vmovd xmm0, eax # [d05] xmm0[byte 0] = H2
vpxor xmm1, xmm1, xmm1 # xmm1 = all-zero shuffle indices (select byte 0 in every lane)
vpshufb xmm0, xmm0, xmm1 # [d06] splat byte 0 across all lanes of xmm0
mov r9, qword ptr [rip + g_map+8] # [e00] r9 = pointer to interleaved keys and values
xor r8d, r8d # r8 = 0 -- presumably the probe sequence counter/offset; TODO confirm
and rcx, rsi # [a06] rcx = bucket index (or slot index?) -- n.b. also c01
vpcmpeqb xmm1, xmm0, xmmword ptr [r10 + rcx] # [a07] xmm1 = packed byte compare w/ metadata bytes -- n.b. also d07
vpmovmskb edx, xmm1 # [a08] edx = mask of byte compare results
test edx, edx # [a09] did I find any candidates?
jz .probe_next_bucket # no candidates, go to next bucket (perfectly predicted, so off the latency path)
bsf eax, edx # [a09] eax = relative index of first candidate (parallel with test: both need only edx)
add rax, rcx # [a10] rax = absolute index of first candidate
and rax, rsi # [a11] handle wrap-around
shl rax, 4 # [a12] rax = byte offset to key (after mul by 16; slot = 8B key + 8B value)
cmp qword ptr [r9 + rax], rdi # [a13] does the key match desired key?
jne .check_next_slot # no match, go to next candidate (predicted not-taken)
mov rax, qword ptr [r9 + rax + 8] # [a13] rax = value (issues in parallel with the key compare)
ret
00: LD  load seed, metadata ptr, slots ptr
01: ALU seed + key
02: ALU tmp1 * hash_magic (128-bit)
03: ALU tmp2.hi ^ tmp2.lo
04: ALU hash >> 7            | ALU hash & 127
05: ALU tmp3 ^ ASLR entropy  | FLT move metadata byte to floating point unit
06: ALU tmp4 & capacity_mask | FLT splat metadata byte across all lanes
07: FLT [load-op] packed byte compare
08: FLT extract/move mask back to gpr
09: ALU bitscan to find first candidate
10: ALU candidate += bucket index
11: ALU candidate &= capacity_mask
12: ALU candidate *= 16
13: LD  [load-op] compare key | LD load value
The critical path is 14 dependency steps long (stages 00-13 above); the most expensive step is the 128-bit MUL during hashing.
It contains 2 dependent loads, although the second (key + value) could possibly be prefetched.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment