Skip to content

Instantly share code, notes, and snippets.

@Nexuapex
Created October 1, 2018 01:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Nexuapex/b837cfa19e6db51be0952d7f654e8a9d to your computer and use it in GitHub Desktop.
// Abseil absl::flat_hash_map
// Analysis of the critical path for perfectly predicted load hit.
// https://godbolt.org/z/P_ksaa
//
// x86_64
// Instruction latencies have been entirely ignored.
//
// Annotation legend: [xNN] names a dependency chain (letter) and the earliest
// step at which the instruction can issue given its inputs (number).
// Chains: "a" = hash/probe (critical path), "b" = metadata pointer,
// "c" = capacity mask, "d" = H2 byte splat, "e" = slot-array pointer.
// Two instructions sharing a step number are independent and can issue
// in parallel (e.g. test+bsf at a09 both depend only on a08's result).
# rdi = desired key
mov rax, qword ptr [rip + kHashSeed] # [a00] rax = per-process seed (with entropy)
add rax, rdi # [a01] rax = hash so far
movabs rcx, 0x9DDFEA08EB382D69 # rcx = CityHash magic number
mul rcx # [a02] 128-bit multiply rax * rcx...
xor rdx, rax # [a03] ...and mix to 64-bits (rdx = hash of key)
mov r10, qword ptr [rip + g_map] # [b00] r10 = pointer to metadata bytes
mov rax, rdx # copy hash; rdx keeps the low bits for the H2 extraction below
shr rax, 7 # [a04] rax = hash bits 7-63
mov rcx, r10 # copy metadata pointer so r10 survives for the probe load below
shr rcx, 12 # [b01] rcx = page address (ASLR entropy)
xor rcx, rax # [a05] rcx = H1 (with added entropy) -- n.b. also b02
mov rsi, qword ptr [rip + g_map+24] # [c00] rsi = power-of-two-minus-one capacity mask
and dl, 127 # [d04] dl = H2 (hash bits 0-6) -- n.b. branched from "a" dep chain
movzx eax, dl # eax = H2 (movzx avoids a partial-register merge on rax)
vmovd xmm0, eax # [d05] xmm0[byte 0] = H2
vpxor xmm1, xmm1, xmm1 # xmm1 = all-zero shuffle indices (select byte 0 in every lane)
vpshufb xmm0, xmm0, xmm1 # [d06] splat byte 0 across all lanes of xmm0
mov r9, qword ptr [rip + g_map+8] # [e00] r9 = pointer to interleaved keys and values
xor r8d, r8d # r8 = 0 -- presumably the probe sequence counter/offset; TODO confirm
and rcx, rsi # [a06] rcx = bucket index (or slot index?) -- n.b. also c01
vpcmpeqb xmm1, xmm0, xmmword ptr [r10 + rcx] # [a07] xmm1 = packed byte compare w/ metadata bytes -- n.b. also d07
vpmovmskb edx, xmm1 # [a08] edx = mask of byte compare results
test edx, edx # [a09] did I find any candidates?
jz .probe_next_bucket # no candidates, go to next bucket (perfectly predicted, so off the latency path)
bsf eax, edx # [a09] eax = relative index of first candidate (parallel with test: both need only edx)
add rax, rcx # [a10] rax = absolute index of first candidate
and rax, rsi # [a11] handle wrap-around
shl rax, 4 # [a12] rax = byte offset to key (after mul by 16; slot = 8B key + 8B value)
cmp qword ptr [r9 + rax], rdi # [a13] does the key match desired key?
jne .check_next_slot # no match, go to next candidate (predicted not-taken)
mov rax, qword ptr [r9 + rax + 8] # [a13] rax = value (issues in parallel with the key compare)
ret
00: LD  load seed, metadata ptr, slots ptr
01: ALU seed + key
02: ALU tmp1 * hash_magic (128-bit)
03: ALU tmp2.hi ^ tmp2.lo
04: ALU hash >> 7            | ALU hash & 127
05: ALU tmp3 ^ ASLR entropy  | FLT move metadata byte to floating point unit
06: ALU tmp4 & capacity_mask | FLT splat metadata byte across all lanes
07: FLT [load-op] packed byte compare
08: FLT extract/move mask back to gpr
09: ALU bitscan to find first candidate
10: ALU candidate += bucket index
11: ALU candidate &= capacity_mask
12: ALU candidate *= 16
13: LD  [load-op] compare key | LD load value
The critical path is 14 dependency steps long (stages 00-13 above); the most expensive step is the 128-bit MUL during hashing.
It contains 2 dependent loads, although the second (key + value) could possibly be prefetched.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment