rygorous/gist:5c8aad95ef36c9cab3c9f2d6cfeedd8d

## gistfile1.txt
; Append one group of code words to the output bit buffer (BMI2 sketch).
;   [codewords]  64 bits holding this group's code words
;   [masks]      64-bit mask of the bits in [codewords] that are real code bits
;   [total]      number of bits this group adds to the output
;   rOut         output bit accumulator
;   rDestBitPos  bit offset in rOut where this group is written
; NOTE: flushing rOut / rDestBitPos running past 64 is not shown in this sketch.
mov  rax, [codewords]
pext rax, rax, [masks]          ; coalesce code words (pext takes 3 operands: dst, src, mask)
shlx rax, rax, rDestBitPos      ; still need something like this (move packed bits to the write position)
or   rOut, rax                  ; and this (merge into the output)
add  rDestBitPos, [total]       ; this and make SIMD code emit it, or load masks to a reg then do popcnt on it?

; so 5-6 insns per 4 codewords = 1.25-1.5 insns per codeword
; leaves 2.5-2.75 insns per codeword to assemble codewords SIMD (out of an implied budget of 4 insns per codeword)
; our best bet here is, I guess, 8 16-bit codewords at once (AVX2)
;
; so 20-22 insns to set up 8 codewords and break even
;
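; given individual codeword lens as vector of U16s, computing the totals within each
; group of 4 is literally all done using a single VPSADBW (against 0), then store, so
; totals are _very_ cheap (VPSADBW against 0 sums the 8 bytes of each 64-bit lane, which
; equals the sum of the 4 U16 lens as long as every len is < 256, i.e. its high byte is 0)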
;
; that leaves about 18-20 insns left to set up 8 lens and masks and still break even
;
; -> yeah if you have a very limited alphabet and can do it all using VPSHUFBs, that's
; totally practical to beat for sure
;
; For our (generally) 256-symbol alphabets, I doubt it.
	; Append one group of code words to the output bit buffer (BMI2 sketch).
	;   [codewords]  64 bits holding this group's code words
	;   [masks]      64-bit mask of the bits in [codewords] that are real code bits
	;   [total]      number of bits this group adds to the output
	;   rOut         output bit accumulator
	;   rDestBitPos  bit offset in rOut where this group is written
	; NOTE: flushing rOut / rDestBitPos running past 64 is not shown in this sketch.
	mov rax, [codewords]
	pext rax, rax, [masks] ; coalesce code words (pext takes 3 operands: dst, src, mask)
	shlx rax, rax, rDestBitPos ; still need something like this (move packed bits to the write position)
	or rOut, rax ; and this (merge into the output)
	add rDestBitPos, [total] ; this and make SIMD code emit it, or load masks to a reg then do popcnt on it?

	; so 5-6 insns per 4 codewords = 1.25-1.5 insns per codeword
	; leaves 2.5-2.75 insns per codeword to assemble codewords SIMD (out of an implied budget of 4 insns per codeword)
	; our best bet here is, I guess, 8 16-bit codewords at once (AVX2)
	;
	; so 20-22 insns to set up 8 codewords and break even
	;
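	; given individual codeword lens as vector of U16s, computing the totals within each
	; group of 4 is literally all done using a single VPSADBW (against 0), then store, so
	; totals are _very_ cheap (VPSADBW against 0 sums the 8 bytes of each 64-bit lane, which
	; equals the sum of the 4 U16 lens as long as every len is < 256, i.e. its high byte is 0)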
	;
	; that leaves about 18-20 insns left to set up 8 lens and masks and still break even
	;
	; -> yeah if you have a very limited alphabet and can do it all using VPSHUFBs, that's
	; totally practical to beat for sure
	;
	; For our (generally) 256-symbol alphabets, I doubt it.