-
-
Save AndrewScheidecker/c47c88fe6e37108978b08d1eefd31aa9 to your computer and use it in GitHub Desktop.
WAVM benchmark of i64 copy loop on 32-bit and 64-bit WASM memory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// wavm test script --trace-assembly $WAVM_DIR/Test/benchmark/memory64.wast | |
/* | |
(func (export "i64 copy loop") | |
(param $dest i32) (param $source i32) (param $numBytes i32) | |
block $exitCopyLoop | |
loop $copyLoop | |
(br_if $exitCopyLoop (i32.eq (local.get $numBytes) (i32.const 0))) | |
(i64.store | |
(local.get $dest) | |
(i64.load (local.get $source))) | |
(local.set $numBytes (i32.sub (local.get $numBytes) (i32.const 8))) | |
(local.set $dest (i32.add (local.get $dest) (i32.const 8))) | |
(local.set $source (i32.add (local.get $source) (i32.const 8))) | |
(br $copyLoop) | |
end | |
end | |
) | |
*/ | |
// 32-bit i64 copy loop: | |
// Disassembly (x86-64, AT&T operand order) of the "i64 copy loop" function for 32-bit WASM memory.
// Register roles as used below:
//   %rdi = context pointer on entry; copied to %rax and %r8, then reused as the scratch index
//   %esi = dest offset, %edx = source offset, %ecx = remaining byte count
//   %r8  = memory base, %r9 = the 8 bytes currently being copied
// Branch operands (26, -25) are printed as raw relative displacements rather than labels.
// NOTE(review): they look like byte offsets from the following instruction - confirm against the encoding.
// No explicit bounds check appears in this version.
// NOTE(review): presumably out-of-range accesses are trapped some other way (e.g. guard pages) - confirm.
movq %rdi, %rax // WAVM ABI inefficiency: %rax is not read again in this listing | |
movq %rdi, %r8 // \ | |
andq $-2147483648, %r8 // | derive MemoryRuntimeData* from ContextRuntimeData* (mask clears the low 31 bits) | |
movabsq $0, %rdi // / offset operand for the load below; it is 0 in this trace | |
movq (%r8,%rdi), %r8 // r8 = memory base | |
testl %ecx, %ecx // \ | |
je 26 // / branch to end if count == 0 | |
nop // branch target alignment | |
// loop: | |
movl %edx, %edi // \ 32-bit move zero-extends the source offset into %rdi | |
movq (%r8,%rdi), %r9 // / load 8 bytes from source | |
movl %esi, %edi // \ same zero-extension for the dest offset | |
movq %r9, (%r8,%rdi) // / store 8 bytes to dest | |
addl $-8, %ecx // \ count -= 8 | |
addl $8, %esi // | dest += 8 | |
addl $8, %edx // / source += 8 | |
testl %ecx, %ecx // \ | |
jne -25 // / branch to loop if count != 0 (exits only when count reaches exactly 0) | |
// end: | |
retq | |
// 64-bit i64 copy loop: | |
// Disassembly (x86-64, AT&T operand order) of the same copy loop for 64-bit WASM memory.
// Register roles as used below:
//   %rdi = context pointer on entry; copied to %rax, masked in place, then reused as the clamped index
//   %rsi = dest offset, %rdx = source offset, %rcx = remaining byte count (all 64-bit here)
//   %r8  = memory base, %r9 = clamp limit loaded from offset 16, %r10 = the 8 bytes currently being copied
// Each access is preceded by a cmp/mov/cmov sequence that clamps the offset to %r9; the 32-bit listing
// uses a single movl in the same position.
// NOTE(review): presumably an access at the clamped offset (%r9) faults in the guard region - confirm.
// Branch operands (59, -45) are printed as raw relative displacements rather than labels.
// NOTE(review): they look like byte offsets from the following instruction - confirm against the encoding.
movq %rdi, %rax // WAVM ABI inefficiency: %rax is only used as a nop operand below | |
andq $-2147483648, %rdi // \ (mask clears the low 31 bits) | |
movabsq $0, %r9 // / derive MemoryRuntimeData* from ContextRuntimeData* | |
movq (%rdi,%r9), %r8 // r8 = memory base | |
movq 16(%rdi,%r9), %r9 // r9 = memory reserved bytes (excluding guard region) | |
testq %rcx, %rcx // \ | |
je 59 // / branch to end if count == 0 | |
nopw %cs:(%rax,%rax) // \ | |
nopl (%rax) // / branch target alignment | |
// loop: | |
cmpq %r9, %rdx // \ | |
movq %r9, %rdi // | rdi = min(memory reserved bytes, source) | |
cmovbq %rdx, %rdi // / keep source only if it is below r9 (unsigned compare) | |
movq (%r8,%rdi), %r10 // load 8 bytes from source | |
cmpq %r9, %rsi // \ | |
movq %r9, %rdi // | rdi = min(memory reserved bytes, dest) | |
cmovbq %rsi, %rdi // / keep dest only if it is below r9 (unsigned compare) | |
movq %r10, (%r8,%rdi) // store 8 bytes to dest | |
addq $-8, %rcx // \ count -= 8 | |
addq $8, %rsi // | dest += 8 | |
addq $8, %rdx // / source += 8 | |
testq %rcx, %rcx // \ | |
jne -45 // / branch to loop if count != 0 (exits only when count reaches exactly 0) | |
// end: | |
retq | |
// Benchmarks on Skylake Xeon: | |
// memory.copy (2MB) : 87,562ns +/- 1868ns | |
// 32-bit i64 copy loop(2MB) : 185,502ns +/- 3918ns | |
// 64-bit i64 copy loop(2MB) : 279,501ns +/- 4924ns | |
// 279,501ns / 185,502ns = 151% of the time or 66% of the throughput of 32-bit copy | |
// 279,501ns / 87,562ns = 319% of the time or 31% of the throughput of memory.copy |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment