Skip to content

Instantly share code, notes, and snippets.

@pcordes
Created April 11, 2017 20:47
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save pcordes/304c70cf8b83e5e4e3825b0cae7bf58e to your computer and use it in GitHub Desktop.
copy + bswap in-place vs. copy + swap on the fly, to simulate mmap vs. read
;; 3 versions, only one enabled with %if 1 NASM/YASM preprocessor stuff, like C #if 0 / #if 1
;; shuffle on the fly
;; copy + in-place
;; read-only up/down/bidir
;;; ~/bin/asm-link loop-up-down.asm && disas loop-up-down && ocperf.py stat -etask-clock,page-faults,cycles,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses,instructions,dtlb_store_misses.miss_causes_a_walk -r4 ./loop-up-down
;;;~/bin/asm-link loop-up-down.asm && rm loop-up-down.o && disas loop-up-down && nice ocperf.py stat -etask-clock,page-faults,cycles,L1-dcache-loads,LLC-loads,LLC-load-misses,instructions,dtlb_store_misses.miss_causes_a_walk,dtlb_load_misses.stlb_hit,dtlb_load_misses.miss_causes_a_walk -r3 ./loop-up-down
default rel
section .data
;; 2MiB alignment: presumably so the kernel can back the buffer with transparent
;; hugepages (see the "Hugepages probably make TLB effects insignificant" note below).
align 1024*1024*2
;; Padding puts srcbuf one page (plus an optional misalignment) past the 2M boundary.
;; NOTE(review): SRC_MISALIGN and BUFKB are EQU'd *later*, inside whichever %if
;; variant is enabled.  TIMES/RESB operands are critical expressions in NASM
;; (no forward references allowed), so this presumably assembles with YASM,
;; which resolves them -- confirm with the asm-link script.
times 4096+SRC_MISALIGN db 1 ; 4096-151
srcbuf: times 1024*BUFKB db 0 ; not BSS, so it's private memory-mapped, not to the zero-page
section .bss
align 1024*1024*8
dstbuf: resb 1024*BUFKB ; initially all mapped to the same zero page, which fits in L1. Unless it's a hugepage
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Copy-and-shuffle on the fly ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; (like mmap but the page-fault penalty only happens on the first iteration):
%if 1
section .text
global _start
_start:
;; ymm15 = all-zero shuffle control for vpshufb: every output byte selects byte 0
;; of its 16-byte lane.  Not a real byte-swap mask, but it costs exactly the same
;; as one, which is all this throughput benchmark needs.
;; (Two-operand AVX shorthand; the assembler supplies dest as first source.)
vpxor ymm15, ymm15
; on-the-fly copy+shuffle without an infloop running:
; buf=128M : up: 330M/s @2.9GHz. src is misaligned, dst is aligned
; buf=512M unrolled by 8 : 277M/s @ 2.45GHz. aligned. With NT stores: 302M/s @2.3GHz
; on-the-fly copy with infloop running: somewhat faster than memcpy + process in-place
; buf=32M : up: 385M/s @3.75GHz. src and dst are aligned
; buf=32M : up: 385M/s @3.75GHz. src is 13B misaligned, dst is aligned = 12320 MByte/s
; buf=128M: up: 350M/s @3.6GHz. src is 13B misaligned, dst is aligned. Other misalignments, e.g. 4096-40, are the same. Can't find a 4k problem
; buf=128M: up: 350M/s @3.6GHz. src is 13B misaligned, dst is aligned
; buf=512M unroll=8 : 300M/s @ 3.14GHz. aligned. NT stores: 340M/s @3.0GHz. No change for unroll by 4
; buf=1M aligned, unroll=4 up: 957M+-2
; buf=3M aligned, unroll=4 up: 883-906M/s. unroll=8: 880-913M/s.
; buf=3M aligned unroll=4 up: 920-940M/s (a few minutes later). interleaved (load+shuffle+store, repeat): 903-940M/s
;;; Can't reproduce: buf=3M up: misaligned by -151: 940M/s. aligned: 975M/s.
; buf=3M aligned unroll=4 NT 1013-1028M/s. NT interleaved (load1+shuffle1+store1): 1006-1024M/s. unroll=1,2,3, or 8: same
; buf=3M NT stores, mislign -71 (worst for u=4): u=4: 987-1000M. u=3: 973-983. u=2: 985 u=1: 969-980, one outlier at 1000
;;NT interleaved : 1006-1024M/s. unroll=1,2,3, or 8: same
; buf=256k up: src misaligned by -171: 970M/s. aligned 1005M/s (dst on a 4k boundary, src 171B before a boundary)
; buf=32k up: src misaligned by -171: 1180M/s. aligned 1605M/s. by -371: 1430M/s: 4k aliasing is much less of a problem
; buf=16k up: src misaligned by -171: 1298M/s. aligned 3500M/s. by -371: 1850M/s
;; misalignment from -131 to -251: ~1285 to 1300M/s. -121: 1526M/s -261: 1774M/s. Worst case: -131 to -151: 1285M/s.
;; ---- tuning knobs for this run ----
BUFKB equ 1024*32 ; buffer size in KiB (here 32MiB); also sizes srcbuf/dstbuf in .data via forward ref
SRC_MISALIGN equ 0 ; extra bytes of padding before srcbuf (see the TIMES line in .data)
%define STRIDE 32 ; one YMM vector per step
%define STORE vmovdqu ;vmovntdq
%define UNROLL 4
;; rbx = outer repeat count, scaled so the total number of 32B vectors processed
;; (~200M) stays roughly constant regardless of buffer size.
mov rbx, 200000000 / (1024*BUFKB/32)
.repeatloop:
;; esi/edi = src/dst cursors.  32-bit absolute addresses: fine for a static
;; non-PIE executable (mov-immediate can't be RIP-relative, despite default rel).
mov esi, srcbuf
mov edi, dstbuf
ALIGN 16
.bufloop:
;; Grouped (not interleaved) schedule: UNROLL loads, then UNROLL shuffles,
;; then UNROLL stores, generated by the preprocessor into ymm0..ymm(UNROLL-1).
%assign i 0
%rep UNROLL
vmovdqu ymm %+ i, [rsi+STRIDE*i]
; vpshufb ymm %+ i, ymm15
; STORE [rdi+STRIDE*i], ymm %+ i
%assign i i+1
%endrep
%assign i 0
%rep UNROLL
vpshufb ymm %+ i, ymm15
%assign i i+1
%endrep
%assign i 0
%rep UNROLL
STORE [rdi+STRIDE*i], ymm %+ i
%assign i i+1
%endrep
;; Hand-unrolled equivalent kept for reference / easy editing:
;; vmovdqu ymm0, [rsi+STRIDE*0]
;; vmovdqu ymm1, [rsi+STRIDE*1]
;; vmovdqu ymm2, [rsi+STRIDE*2]
;; vmovdqu ymm3, [rsi+STRIDE*3]
;; ;; vmovdqu ymm4, [rsi+STRIDE*4]
;; ;; vmovdqu ymm5, [rsi+STRIDE*5]
;; ;; vmovdqu ymm6, [rsi+STRIDE*6]
;; ;; vmovdqu ymm7, [rsi+STRIDE*7]
;; vpshufb ymm0, ymm15
;; vpshufb ymm1, ymm15
;; vpshufb ymm2, ymm15
;; vpshufb ymm3, ymm15
;; ;; vpshufb ymm4, ymm15
;; ;; vpshufb ymm5, ymm15
;; ;; vpshufb ymm6, ymm15
;; ;; vpshufb ymm7, ymm15
;; STORE [rdi+STRIDE*0], ymm0
;; STORE [rdi+STRIDE*1], ymm1
;; STORE [rdi+STRIDE*2], ymm2
;; STORE [rdi+STRIDE*3], ymm3
;; ;; STORE [rdi+STRIDE*4], ymm4
;; ;; STORE [rdi+STRIDE*5], ymm5
;; ;; STORE [rdi+STRIDE*6], ymm6
;; ;; STORE [rdi+STRIDE*7], ymm7
add esi, STRIDE*UNROLL ;128
add edi, STRIDE*UNROLL ;128
cmp edi, dstbuf + 1024*BUFKB - STRIDE*UNROLL ; continue while one more full unrolled chunk fits
jbe .bufloop
dec rbx ; outer repeat counter
jg .repeatloop
;; exit_group(0): syscall 231, status in edi
xor edi,edi
mov eax,231
syscall
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; read() and process in-place ;;;;;;;;;;;;;;;;;;;;;;;;
;; Simulates read()+bswap-in-place: rep movsb plays the role of the kernel's
;; copy_to_user from pagecache, then the vector loop byte-"swaps" the block in
;; place (using the same all-zero vpshufb mask as above -- same cost as a real
;; bswap mask).  BLOCK controls cache-blocking granularity.
%if 0
section .text
global _start
_start:
;; ymm15 = all-zero vpshufb control (stand-in for a bswap shuffle mask).
vpxor ymm15, ymm15
;; SKL i7-6700k @ 3.9GHz(max) with DDR4-2666:
;24k just in-place bswap: up or down: ~3850M/s L1d loads, i.e. vectors loaded+shuffled+stored / sec
;128k just in-place bswap: down: ~2330M up: ~2330M
;128+64k down: 2320M/s up: 2300M/s
;256k: down: 2325 M/s up: 2300 M/s
;1M : down: 1590 M/s up: 1607 M/s
;4M : down: 1550 up: 1576
;8M : same 1550
;8M+32k : 1500
;8M+64k : up 1450
;8M+128k : up 1370
;8M+256k : up 1250
;10M : down: 735M up: 870M
;; Actually doing the rep movs:
;buf=8MB block=128kB ; down: ~340Mload/s
;; with no infloop
;buf=16M block=4M : down: 251M/s up: 250M vectors copied+swapped / sec (from L1d loads / sec, which doesn't count rep movsb), @2.67GHz
;buf=16M block=128k : down: 264M/s up: 261M/s (both @ ~2.67GHz)
; without cache-blocking
;buf=10M block=10M : down: 242M (@2.67GHz) up: 240M (@2.67GHz). no infloop
;buf=10M block=10M : down: 290M (@3.8GHz) up: 290M (@3.8GHz). infloop running on another core to keep speed up
;buf=32M block=32M : down: 244M (@3.8GHz) up: 240M (@3.8GHz)
;; with an infloop running to keep clock speed up.
;; src+dst hot in L3:
;buf=3M block=256k ; down: 610M/s up: 610M/s @ 3.89GHz. Without infloop: 605M/s to 615M/s
;buf=3M block=128k ; down: 620M/s up: 598M/s @ 3.82GHz
;buf=3M block=64k ; down: 637M/s up: 610M/s @ 3.88GHz. movntdq: 280M/s up/down
;buf=3M block=48k ; down: 655M/s up: 610M/s @ 3.88GHz
;buf=3M block=32k ; down: 660 to 688M/s up: 624M/s @ 3.88GHz
;buf=3M block=16k ; down: 777M/s up: 756 to 767M/s @ 3.89GHz. movntdq: 225M/s up/down
;; Larger than L3:
;buf=32M block=1M : down: 304M/s up: 305M @3.75GHz
;buf=32M block=512k : down: 311M/s up: 305M @3.75GHz
;buf=32M block=256k : down: 312M/s up: 311M @3.75GHz
;buf=32M block=128k : down: 318M/s up: 315M @3.75GHz not sensitive to src misalignment
;buf=32M block=64k : down: 325M up: 315M/s vectors/s @ 3.75GHz. up NT: 235M. down NT: 227M
;buf=32M block=32k : down: 330M up: 320M/s @3.75GHz
;buf=32M block=24k : down: 337M up: 331M/s @3.75GHz
;buf=32M block=16k : down: 341M up: 351M/s @3.75GHz ; mostly this is from overlapping rep movsb with the loop, not L1 hits
;;buf=32M block=8k : down: 325M up: 346M/s @3.75GHz
;;buf=32M block=4k : down: 300M up: 335M/s @3.75GHz
;; STRIDE's sign selects processing direction at preprocessor time:
;; +32 = walk each block ascending, -32 = walk each block descending.
%define STRIDE -32
%define STORE vmovdqu
;%define STORE vmovntdq
SRC_MISALIGN equ 0
BLOCK equ 512 ; cache-block size in KiB: bswap each block right after copying it
BUFKB equ 1024*32 ; total buffer size in KiB
;; rbx = outer repeat count, scaled to keep total work roughly constant.
mov rbx, 200000000 / (1024*BUFKB/128) / 4
.repeatloop:
mov esi, srcbuf ; rep movsb source: the simulated pagecache
mov edi, dstbuf ; rep movsb dest: advances block by block across dstbuf
.bufloop:
; read() a block
;; ecx = min(1024*BLOCK, bytes remaining in srcbuf)
mov ecx, 1024*BLOCK
mov eax, srcbuf+1024*BUFKB
sub eax, esi
cmp ecx, eax
cmovg ecx, eax ; bytes = min(BLOCK, bytes left)
%if STRIDE > 0 ; always-ascending memcpy from "pagecache"
mov eax, edi ; start (block base, before rep movsb advances rdi)
rep movsb
mov edx, edi ; end (one past the block; blockloop stops when rax == rdx)
;; add edi, ecx ;; skip the copy
;; mov eax, dstbuf
;; mov edx, dstbuf+1024*BLOCK
%else
;; Descending walk: with negative STRIDE, bias both bounds by STRIDE so that
;; [rax+STRIDE*0] is the block's LAST 32B vector and the loop walks downward;
;; the loop terminates when rax reaches edx = blockstart+STRIDE.
lea edx, [rdi+STRIDE] ; end sentinel = block start + STRIDE
rep movsb
lea eax, [rdi+STRIDE] ; start = address of the block's last vector
;; add edi, ecx
;; mov eax, dstbuf+1024*BLOCK + STRIDE
;; mov edx, dstbuf + STRIDE
%endif
; and bswap it
ALIGN 16
.blockloop:
;; vmovdqa needs 32B alignment: dstbuf is 8MiB-aligned and blocks advance in
;; multiples of 128B, so every address here is 32B-aligned.
vmovdqa ymm0, [rax+STRIDE*0]
vmovdqa ymm1, [rax+STRIDE*1]
vmovdqa ymm2, [rax+STRIDE*2]
vmovdqa ymm3, [rax+STRIDE*3]
vpshufb ymm0, ymm15
vpshufb ymm1, ymm15
vpshufb ymm2, ymm15
vpshufb ymm3, ymm15
STORE [rax+STRIDE*0], ymm0
STORE [rax+STRIDE*1], ymm1
STORE [rax+STRIDE*2], ymm2
STORE [rax+STRIDE*3], ymm3
add eax, STRIDE*4 ;128
cmp eax, edx
jne .blockloop ; exact-match end test: relies on block sizes being multiples of 128B
cmp edi, dstbuf + 1024*BUFKB ; more blocks left to copy?
jb .bufloop
dec rbx ; outer repeat counter
jg .repeatloop
;; exit_group(0): syscall 231, status in edi
xor edi,edi
mov eax,231
syscall
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;; Read-only loop ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%if 0
;;; Reading memory in ascending order is faster on Skylake, for read-only loops
;;; But alternating up/down loops are a big win when the buffer size is >= cache size. (For L1, L2, and L3).
;;; Hugepages probably make TLB effects insignificant.
section .bss
align 1024*1024*8
;; NOTE(review): KB is EQU'd below; like the TIMES lines above, this forward
;; reference in a RESB operand presumably relies on YASM -- confirm.
buf: resb 1024*KB ; initially all mapped to the same zero page, which fits in L1. Unless it's a hugepage
section .text
default rel
global _start
_start:
;; Write the whole buffer once so each page gets its own private frame;
;; otherwise the untouched .bss would all read from the shared zero page.
mov edi, buf
mov eax, 0
mov ecx, 1024 * KB / 8
rep stosq ; dirty the buffer
%define DOWNONLY 0 ; 1 = descending-only (skip the ascending workloop)
%define STRIDE 32 ; one YMM vector per load
;; for large buffers (memory-bound), SKL downclocks if there aren't threads on other cores. Without that, ironically vsqrtps runs faster than vmovaps because it keeps the core clock high
;;; For very large buffers, down is slower than up. Even though this does trivial work, so demand-loads outpace HW prefetching.
;KB equ 1024*10 ; down: 895+-1% up: 1010+-2% bidir: 1210+-1% (vxorps with infloop active)
;KB equ 1024*8 - 1 ; up: ~1300M/s+-2% down: ~1350+-10%. bidir: 1525+-3%
;;; L3 hits
;KB equ 1024 * 7 ; downonly: ~1920Mloads/s +-50. uponly: 1980M/s +-50. bidir: 2000M/s +-40?. With vxorps or vaddps, with a busy-loop on another core to keep the clock at 3.9GHz, which it should do anyway with an L3 working set
; at 2MiB: hard to measure any difference
;KB equ 1024 * 1 ; downonly and uponly: ~2000Mload/s or maybe 2100, bidir: ~2300M/s (vxorps)
;KB equ 1024 / 2 ; down or up: 2140M/s bidir: 2650 (vxorps)
;KB equ 256+128 ; down ~2300 up: 2340 bidir: 2940
;;; L2 hits
;;;;KB equ 1024 / 4 = L2; up: 2670 to 2930. down: 2620 to 2860, but usually only ~2700. bidir: 3340 to 3450. L1 miss rate ~ 44% (down from 50%)
;;KB equ 128 ; up: 3250 to 3600. down: ~3250. bidir: 3960 to 4215 (better than one per clock with vxorps). (vaddps=~3620M/s, nearly hitting its latency bottleneck)
;;KB equ 48 ; up: 3650, still 50% L1 miss rate. down: 3450-3650. bidir: ~5600Mloads/sec. L1 miss rate ~= 17%
KB equ 256+128 ; buffer size in KiB for this run
%define INSN vxorps ; cheap op that just consumes the load (2-operand AVX shorthand)
%define DO_DOWNLOOP 0 ; 1 = bidirectional: ascending pass then descending pass
%if DO_DOWNLOOP
%define DOWNONLY 0 ; bidir implies the up-loop runs, so force DOWNONLY off
%endif
; mov rcx, 5000000000 / (1024*KB/128) / 8
;; rcx = repeat count, normalized so the total number of loads stays roughly
;; constant across buffer sizes and across up-only/down-only/bidir configs.
mov rcx, 1000000000 / (1024*KB/(STRIDE*4)) / (4*(1+DO_DOWNLOOP-DOWNONLY))
ALIGN 32
.repeatloop:
mov esi, buf ; esi = read cursor (32-bit absolute address: non-PIE only)
%if DOWNONLY != 1
.workloop:
INSN ymm0, [rsi+STRIDE*0]
INSN ymm1, [rsi+STRIDE*1]
INSN ymm2, [rsi+STRIDE*2]
INSN ymm3, [rsi+STRIDE*3]
add esi, STRIDE*4 ;128
cmp esi, buf-STRIDE*3 + 1024*KB ; -STRIDE*3 slack so a final partial chunk of odd KB sizes can't overrun
jb .workloop
%else
mov esi, buf + 1024*KB ; down-only: start the descending pass at the top
%endif
%if DO_DOWNLOOP
.downloop:
sub esi, STRIDE*4 ;128
INSN ymm3, [rsi+STRIDE*3]
INSN ymm2, [rsi+STRIDE*2]
INSN ymm1, [rsi+STRIDE*1]
INSN ymm0, [rsi+STRIDE*0]
cmp esi, buf
ja .downloop
%endif
dec rcx ; outer repeat counter
jnz .repeatloop
;; exit_group(0): syscall 231, status in edi
xor edi,edi
mov eax,231
syscall
%endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment