Skip to content

Instantly share code, notes, and snippets.

@pcordes
Created April 11, 2017 20:47
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save pcordes/304c70cf8b83e5e4e3825b0cae7bf58e to your computer and use it in GitHub Desktop.
copy + bswap in-place vs. copy + swap on the fly, to simulate mmap vs. read
;; 3 versions, only one enabled with %if 1 NASM/YASM preprocessor stuff, like C #if 0 / #if 1
;; shuffle on the fly
;; copy + in-place
;; read-only up/down/bidir
;;; ~/bin/asm-link loop-up-down.asm && disas loop-up-down && ocperf.py stat -etask-clock,page-faults,cycles,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses,instructions,dtlb_store_misses.miss_causes_a_walk -r4 ./loop-up-down
;;;~/bin/asm-link loop-up-down.asm && rm loop-up-down.o && disas loop-up-down && nice ocperf.py stat -etask-clock,page-faults,cycles,L1-dcache-loads,LLC-loads,LLC-load-misses,instructions,dtlb_store_misses.miss_causes_a_walk,dtlb_load_misses.stlb_hit,dtlb_load_misses.miss_causes_a_walk -r3 ./loop-up-down
default rel
section .data
;; 2MiB alignment: presumably so the kernel can back the buffer with transparent
;; hugepages (see the "Hugepages probably make TLB effects insignificant" note below).
align 1024*1024*2
;; Padding puts srcbuf one page (plus an optional misalignment) past the 2M boundary.
;; NOTE(review): SRC_MISALIGN and BUFKB are EQU'd *later*, inside whichever %if
;; variant is enabled.  TIMES/RESB operands are critical expressions in NASM
;; (no forward references allowed), so this presumably assembles with YASM,
;; which resolves them -- confirm with the asm-link script.
times 4096+SRC_MISALIGN db 1 ; 4096-151
srcbuf: times 1024*BUFKB db 0 ; not BSS, so it's private memory-mapped, not to the zero-page
section .bss
align 1024*1024*8
dstbuf: resb 1024*BUFKB ; initially all mapped to the same zero page, which fits in L1. Unless it's a hugepage
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Copy-and-shuffle on the fly ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; (like mmap but the page-fault penalty only happens on the first iteration):
%if 1
section .text
global _start
_start:
;; ymm15 = all-zero shuffle control for vpshufb: every output byte selects byte 0
;; of its 16-byte lane.  Not a real byte-swap mask, but it costs exactly the same
;; as one, which is all this throughput benchmark needs.
;; (Two-operand AVX shorthand; the assembler supplies dest as first source.)
vpxor ymm15, ymm15
; on-the-fly copy+shuffle without an infloop running:
; buf=128M : up: 330M/s @2.9GHz. src is misaligned, dst is aligned
; buf=512M unrolled by 8 : 277M/s @ 2.45GHz. aligned. With NT stores: 302M/s @2.3GHz
; on-the-fly copy with infloop running: somewhat faster than memcpy + process in-place
; buf=32M : up: 385M/s @3.75GHz. src and dst are aligned
; buf=32M : up: 385M/s @3.75GHz. src is 13B misaligned, dst is aligned = 12320 MByte/s
; buf=128M: up: 350M/s @3.6GHz. src is 13B misaligned, dst is aligned. Other misalignments, e.g. 4096-40, are the same. Can't find a 4k problem
; buf=128M: up: 350M/s @3.6GHz. src is 13B misaligned, dst is aligned
; buf=512M unroll=8 : 300M/s @ 3.14GHz. aligned. NT stores: 340M/s @3.0GHz. No change for unroll by 4
; buf=1M aligned, unroll=4 up: 957M+-2
; buf=3M aligned, unroll=4 up: 883-906M/s. unroll=8: 880-913M/s.
; buf=3M aligned unroll=4 up: 920-940M/s (a few minutes later). interleaved (load+shuffle+store, repeat): 903-940M/s
;;; Can't reproduce: buf=3M up: misaligned by -151: 940M/s. aligned: 975M/s.
; buf=3M aligned unroll=4 NT 1013-1028M/s. NT interleaved (load1+shuffle1+store1): 1006-1024M/s. unroll=1,2,3, or 8: same
; buf=3M NT stores, mislign -71 (worst for u=4): u=4: 987-1000M. u=3: 973-983. u=2: 985 u=1: 969-980, one outlier at 1000
;;NT interleaved : 1006-1024M/s. unroll=1,2,3, or 8: same
; buf=256k up: src misaligned by -171: 970M/s. aligned 1005M/s (dst on a 4k boundary, src 171B before a boundary)
; buf=32k up: src misaligned by -171: 1180M/s. aligned 1605M/s. by -371: 1430M/s: 4k aliasing is much less of a problem
; buf=16k up: src misaligned by -171: 1298M/s. aligned 3500M/s. by -371: 1850M/s
;; misalignment from -131 to -251: ~1285 to 1300M/s. -121: 1526M/s -261: 1774M/s. Worst case: -131 to -151: 1285M/s.
;; ---- tuning knobs for this run ----
BUFKB equ 1024*32 ; buffer size in KiB (here 32MiB); also sizes srcbuf/dstbuf in .data via forward ref
SRC_MISALIGN equ 0 ; extra bytes of padding before srcbuf (see the TIMES line in .data)
%define STRIDE 32 ; one YMM vector per step
%define STORE vmovdqu ;vmovntdq
%define UNROLL 4
;; rbx = outer repeat count, scaled so the total number of 32B vectors processed
;; (~200M) stays roughly constant regardless of buffer size.
mov rbx, 200000000 / (1024*BUFKB/32)
.repeatloop:
;; esi/edi = src/dst cursors.  32-bit absolute addresses: fine for a static
;; non-PIE executable (mov-immediate can't be RIP-relative, despite default rel).
mov esi, srcbuf
mov edi, dstbuf
ALIGN 16
.bufloop:
;; Grouped (not interleaved) schedule: UNROLL loads, then UNROLL shuffles,
;; then UNROLL stores, generated by the preprocessor into ymm0..ymm(UNROLL-1).
%assign i 0
%rep UNROLL
vmovdqu ymm %+ i, [rsi+STRIDE*i]
; vpshufb ymm %+ i, ymm15
; STORE [rdi+STRIDE*i], ymm %+ i
%assign i i+1
%endrep
%assign i 0
%rep UNROLL
vpshufb ymm %+ i, ymm15
%assign i i+1
%endrep
%assign i 0
%rep UNROLL
STORE [rdi+STRIDE*i], ymm %+ i
%assign i i+1
%endrep
;; Hand-unrolled equivalent kept for reference / easy editing:
;; vmovdqu ymm0, [rsi+STRIDE*0]
;; vmovdqu ymm1, [rsi+STRIDE*1]
;; vmovdqu ymm2, [rsi+STRIDE*2]
;; vmovdqu ymm3, [rsi+STRIDE*3]
;; ;; vmovdqu ymm4, [rsi+STRIDE*4]
;; ;; vmovdqu ymm5, [rsi+STRIDE*5]
;; ;; vmovdqu ymm6, [rsi+STRIDE*6]
;; ;; vmovdqu ymm7, [rsi+STRIDE*7]
;; vpshufb ymm0, ymm15
;; vpshufb ymm1, ymm15
;; vpshufb ymm2, ymm15
;; vpshufb ymm3, ymm15
;; ;; vpshufb ymm4, ymm15
;; ;; vpshufb ymm5, ymm15
;; ;; vpshufb ymm6, ymm15
;; ;; vpshufb ymm7, ymm15
;; STORE [rdi+STRIDE*0], ymm0
;; STORE [rdi+STRIDE*1], ymm1
;; STORE [rdi+STRIDE*2], ymm2
;; STORE [rdi+STRIDE*3], ymm3
;; ;; STORE [rdi+STRIDE*4], ymm4
;; ;; STORE [rdi+STRIDE*5], ymm5
;; ;; STORE [rdi+STRIDE*6], ymm6
;; ;; STORE [rdi+STRIDE*7], ymm7
add esi, STRIDE*UNROLL ;128
add edi, STRIDE*UNROLL ;128
cmp edi, dstbuf + 1024*BUFKB - STRIDE*UNROLL ; continue while one more full unrolled chunk fits
jbe .bufloop
dec rbx ; outer repeat counter
jg .repeatloop
;; exit_group(0): syscall 231, status in edi
xor edi,edi
mov eax,231
syscall
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; read() and process in-place ;;;;;;;;;;;;;;;;;;;;;;;;
;; Simulates read()+bswap-in-place: rep movsb plays the role of the kernel's
;; copy_to_user from pagecache, then the vector loop byte-"swaps" the block in
;; place (using the same all-zero vpshufb mask as above -- same cost as a real
;; bswap mask).  BLOCK controls cache-blocking granularity.
%if 0
section .text
global _start
_start:
;; ymm15 = all-zero vpshufb control (stand-in for a bswap shuffle mask).
vpxor ymm15, ymm15
;; SKL i7-6700k @ 3.9GHz(max) with DDR4-2666:
;24k just in-place bswap: up or down: ~3850M/s L1d loads, i.e. vectors loaded+shuffled+stored / sec
;128k just in-place bswap: down: ~2330M up: ~2330M
;128+64k down: 2320M/s up: 2300M/s
;256k: down: 2325 M/s up: 2300 M/s
;1M : down: 1590 M/s up: 1607 M/s
;4M : down: 1550 up: 1576
;8M : same 1550
;8M+32k : 1500
;8M+64k : up 1450
;8M+128k : up 1370
;8M+256k : up 1250
;10M : down: 735M up: 870M
;; Actually doing the rep movs:
;buf=8MB block=128kB ; down: ~340Mload/s
;; with no infloop
;buf=16M block=4M : down: 251M/s up: 250M vectors copied+swapped / sec (from L1d loads / sec, which doesn't count rep movsb), @2.67GHz
;buf=16M block=128k : down: 264M/s up: 261M/s (both @ ~2.67GHz)
; without cache-blocking
;buf=10M block=10M : down: 242M (@2.67GHz) up: 240M (@2.67GHz). no infloop
;buf=10M block=10M : down: 290M (@3.8GHz) up: 290M (@3.8GHz). infloop running on another core to keep speed up
;buf=32M block=32M : down: 244M (@3.8GHz) up: 240M (@3.8GHz)
;; with an infloop running to keep clock speed up.
;; src+dst hot in L3:
;buf=3M block=256k ; down: 610M/s up: 610M/s @ 3.89GHz. Without infloop: 605M/s to 615M/s
;buf=3M block=128k ; down: 620M/s up: 598M/s @ 3.82GHz
;buf=3M block=64k ; down: 637M/s up: 610M/s @ 3.88GHz. movntdq: 280M/s up/down
;buf=3M block=48k ; down: 655M/s up: 610M/s @ 3.88GHz
;buf=3M block=32k ; down: 660 to 688M/s up: 624M/s @ 3.88GHz
;buf=3M block=16k ; down: 777M/s up: 756 to 767M/s @ 3.89GHz. movntdq: 225M/s up/down
;; Larger than L3:
;buf=32M block=1M : down: 304M/s up: 305M @3.75GHz
;buf=32M block=512k : down: 311M/s up: 305M @3.75GHz
;buf=32M block=256k : down: 312M/s up: 311M @3.75GHz
;buf=32M block=128k : down: 318M/s up: 315M @3.75GHz not sensitive to src misalignment
;buf=32M block=64k : down: 325M up: 315M/s vectors/s @ 3.75GHz. up NT: 235M. down NT: 227M
;buf=32M block=32k : down: 330M up: 320M/s @3.75GHz
;buf=32M block=24k : down: 337M up: 331M/s @3.75GHz
;buf=32M block=16k : down: 341M up: 351M/s @3.75GHz ; mostly this is from overlapping rep movsb with the loop, not L1 hits
;;buf=32M block=8k : down: 325M up: 346M/s @3.75GHz
;;buf=32M block=4k : down: 300M up: 335M/s @3.75GHz
;; STRIDE's sign selects processing direction at preprocessor time:
;; +32 = walk each block ascending, -32 = walk each block descending.
%define STRIDE -32
%define STORE vmovdqu
;%define STORE vmovntdq
SRC_MISALIGN equ 0
BLOCK equ 512 ; cache-block size in KiB: bswap each block right after copying it
BUFKB equ 1024*32 ; total buffer size in KiB
;; rbx = outer repeat count, scaled to keep total work roughly constant.
mov rbx, 200000000 / (1024*BUFKB/128) / 4
.repeatloop:
mov esi, srcbuf ; rep movsb source: the simulated pagecache
mov edi, dstbuf ; rep movsb dest: advances block by block across dstbuf
.bufloop:
; read() a block
;; ecx = min(1024*BLOCK, bytes remaining in srcbuf)
mov ecx, 1024*BLOCK
mov eax, srcbuf+1024*BUFKB
sub eax, esi
cmp ecx, eax
cmovg ecx, eax ; bytes = min(BLOCK, bytes left)
%if STRIDE > 0 ; always-ascending memcpy from "pagecache"
mov eax, edi ; start (block base, before rep movsb advances rdi)
rep movsb
mov edx, edi ; end (one past the block; blockloop stops when rax == rdx)
;; add edi, ecx ;; skip the copy
;; mov eax, dstbuf
;; mov edx, dstbuf+1024*BLOCK
%else
;; Descending walk: with negative STRIDE, bias both bounds by STRIDE so that
;; [rax+STRIDE*0] is the block's LAST 32B vector and the loop walks downward;
;; the loop terminates when rax reaches edx = blockstart+STRIDE.
lea edx, [rdi+STRIDE] ; end sentinel = block start + STRIDE
rep movsb
lea eax, [rdi+STRIDE] ; start = address of the block's last vector
;; add edi, ecx
;; mov eax, dstbuf+1024*BLOCK + STRIDE
;; mov edx, dstbuf + STRIDE
%endif
; and bswap it
ALIGN 16
.blockloop:
;; vmovdqa needs 32B alignment: dstbuf is 8MiB-aligned and blocks advance in
;; multiples of 128B, so every address here is 32B-aligned.
vmovdqa ymm0, [rax+STRIDE*0]
vmovdqa ymm1, [rax+STRIDE*1]
vmovdqa ymm2, [rax+STRIDE*2]
vmovdqa ymm3, [rax+STRIDE*3]
vpshufb ymm0, ymm15
vpshufb ymm1, ymm15
vpshufb ymm2, ymm15
vpshufb ymm3, ymm15
STORE [rax+STRIDE*0], ymm0
STORE [rax+STRIDE*1], ymm1
STORE [rax+STRIDE*2], ymm2
STORE [rax+STRIDE*3], ymm3
add eax, STRIDE*4 ;128
cmp eax, edx
jne .blockloop ; exact-match end test: relies on block sizes being multiples of 128B
cmp edi, dstbuf + 1024*BUFKB ; more blocks left to copy?
jb .bufloop
dec rbx ; outer repeat counter
jg .repeatloop
;; exit_group(0): syscall 231, status in edi
xor edi,edi
mov eax,231
syscall
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;; Read-only loop ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%if 0
;;; Reading memory in ascending order is faster on Skylake, for read-only loops
;;; But alternating up/down loops are a big win when the buffer size is >= cache size. (For L1, L2, and L3).
;;; Hugepages probably make TLB effects insignificant.
section .bss
align 1024*1024*8
;; NOTE(review): KB is EQU'd below; like the TIMES lines above, this forward
;; reference in a RESB operand presumably relies on YASM -- confirm.
buf: resb 1024*KB ; initially all mapped to the same zero page, which fits in L1. Unless it's a hugepage
section .text
default rel
global _start
_start:
;; Write the whole buffer once so each page gets its own private frame;
;; otherwise the untouched .bss would all read from the shared zero page.
mov edi, buf
mov eax, 0
mov ecx, 1024 * KB / 8
rep stosq ; dirty the buffer
%define DOWNONLY 0 ; 1 = descending-only (skip the ascending workloop)
%define STRIDE 32 ; one YMM vector per load
;; for large buffers (memory-bound), SKL downclocks if there aren't threads on other cores. Without that, ironically vsqrtps runs faster than vmovaps because it keeps the core clock high
;;; For very large buffers, down is slower than up. Even though this does trivial work, so demand-loads outpace HW prefetching.
;KB equ 1024*10 ; down: 895+-1% up: 1010+-2% bidir: 1210+-1% (vxorps with infloop active)
;KB equ 1024*8 - 1 ; up: ~1300M/s+-2% down: ~1350+-10%. bidir: 1525+-3%
;;; L3 hits
;KB equ 1024 * 7 ; downonly: ~1920Mloads/s +-50. uponly: 1980M/s +-50. bidir: 2000M/s +-40?. With vxorps or vaddps, with a busy-loop on another core to keep the clock at 3.9GHz, which it should do anyway with an L3 working set
; at 2MiB: hard to measure any difference
;KB equ 1024 * 1 ; downonly and uponly: ~2000Mload/s or maybe 2100, bidir: ~2300M/s (vxorps)
;KB equ 1024 / 2 ; down or up: 2140M/s bidir: 2650 (vxorps)
;KB equ 256+128 ; down ~2300 up: 2340 bidir: 2940
;;; L2 hits
;;;;KB equ 1024 / 4 = L2; up: 2670 to 2930. down: 2620 to 2860, but usually only ~2700. bidir: 3340 to 3450. L1 miss rate ~ 44% (down from 50%)
;;KB equ 128 ; up: 3250 to 3600. down: ~3250. bidir: 3960 to 4215 (better than one per clock with vxorps). (vaddps=~3620M/s, nearly hitting its latency bottleneck)
;;KB equ 48 ; up: 3650, still 50% L1 miss rate. down: 3450-3650. bidir: ~5600Mloads/sec. L1 miss rate ~= 17%
KB equ 256+128 ; buffer size in KiB for this run
%define INSN vxorps ; cheap op that just consumes the load (2-operand AVX shorthand)
%define DO_DOWNLOOP 0 ; 1 = bidirectional: ascending pass then descending pass
%if DO_DOWNLOOP
%define DOWNONLY 0 ; bidir implies the up-loop runs, so force DOWNONLY off
%endif
; mov rcx, 5000000000 / (1024*KB/128) / 8
;; rcx = repeat count, normalized so the total number of loads stays roughly
;; constant across buffer sizes and across up-only/down-only/bidir configs.
mov rcx, 1000000000 / (1024*KB/(STRIDE*4)) / (4*(1+DO_DOWNLOOP-DOWNONLY))
ALIGN 32
.repeatloop:
mov esi, buf ; esi = read cursor (32-bit absolute address: non-PIE only)
%if DOWNONLY != 1
.workloop:
INSN ymm0, [rsi+STRIDE*0]
INSN ymm1, [rsi+STRIDE*1]
INSN ymm2, [rsi+STRIDE*2]
INSN ymm3, [rsi+STRIDE*3]
add esi, STRIDE*4 ;128
cmp esi, buf-STRIDE*3 + 1024*KB ; -STRIDE*3 slack so a final partial chunk of odd KB sizes can't overrun
jb .workloop
%else
mov esi, buf + 1024*KB ; down-only: start the descending pass at the top
%endif
%if DO_DOWNLOOP
.downloop:
sub esi, STRIDE*4 ;128
INSN ymm3, [rsi+STRIDE*3]
INSN ymm2, [rsi+STRIDE*2]
INSN ymm1, [rsi+STRIDE*1]
INSN ymm0, [rsi+STRIDE*0]
cmp esi, buf
ja .downloop
%endif
dec rcx ; outer repeat counter
jnz .repeatloop
;; exit_group(0): syscall 231, status in edi
xor edi,edi
mov eax,231
syscall
%endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment