Skip to content

Instantly share code, notes, and snippets.

@AndrewScheidecker
Last active December 1, 2020 01:12
Show Gist options
  • Save AndrewScheidecker/c47c88fe6e37108978b08d1eefd31aa9 to your computer and use it in GitHub Desktop.
Save AndrewScheidecker/c47c88fe6e37108978b08d1eefd31aa9 to your computer and use it in GitHub Desktop.
WAVM benchmark of i64 copy loop on 32-bit and 64-bit WASM memory
// wavm test script --trace-assembly $WAVM_DIR/Test/benchmark/memory64.wast
/*
(func (export "i64 copy loop")
(param $dest i32) (param $source i32) (param $numBytes i32)
block $exitCopyLoop
loop $copyLoop
(br_if $exitCopyLoop (i32.eq (local.get $numBytes) (i32.const 0)))
(i64.store
(local.get $dest)
(i64.load (local.get $source)))
(local.set $numBytes (i32.sub (local.get $numBytes) (i32.const 8)))
(local.set $dest (i32.add (local.get $dest) (i32.const 8)))
(local.set $source (i32.add (local.get $source) (i32.const 8)))
(br $copyLoop)
end
end
)
*/
// 32-bit i64 copy loop:
// Traced x86-64 code for the "i64 copy loop" function with i32 addresses.
// Operands are in AT&T order (source first, destination last).
// Register roles, as used by the instructions below:
//   rdi = ContextRuntimeData* on entry; afterwards scratch for the zero-extended address
//   esi = dest address, edx = source address, ecx = remaining byte count
//   r8  = memory base, r9 = the 8 bytes currently being copied
// The jump operands (26, -25) are relative displacements as printed by the trace, not labels.
// NOTE(review): no explicit bounds check appears in this loop; presumably out-of-range
// accesses are caught by a guard region following the memory — confirm.
movq %rdi, %rax // rax = incoming rdi; flagged by the author as a WAVM ABI inefficiency
movq %rdi, %r8 // \ derive MemoryRuntimeData* from ContextRuntimeData*:
andq $-2147483648, %r8 // |   r8 = context pointer with its low 31 bits cleared
movabsq $0, %rdi // /   rdi = offset added to r8 below (printed as 0; presumably patched at load time — TODO confirm)
movq (%r8,%rdi), %r8 // r8 = memory base, loaded from [r8 + rdi]
testl %ecx, %ecx // \
je 26 // / skip the loop entirely (branch to end) if count == 0
nop // padding so the loop entry below is aligned
// loop:
movl %edx, %edi // \ rdi = source address; the 32-bit move zero-extends into the full register
movq (%r8,%rdi), %r9 // / r9 = 8 bytes loaded from memory base + source
movl %esi, %edi // \ rdi = dest address, zero-extended the same way
movq %r9, (%r8,%rdi) // / store the 8 bytes to memory base + dest
addl $-8, %ecx // count -= 8
addl $8, %esi // dest += 8
addl $8, %edx // source += 8
testl %ecx, %ecx // \ re-test count: the flags from its update were overwritten by the two pointer adds
jne -25 // / branch back to loop if count != 0
// end:
retq
// 64-bit i64 copy loop:
// Traced x86-64 code for the copy loop on a 64-bit memory.
// Operands are in AT&T order (source first, destination last).
// NOTE(review): the WAT quoted earlier in this file declares i32 parameters; this listing
// uses full 64-bit registers, so it presumably comes from an i64-parameter variant of the
// same function — confirm against memory64.wast.
// Register roles, as used by the instructions below:
//   rdi = ContextRuntimeData* on entry; afterwards scratch for the clamped address
//   rsi = dest address, rdx = source address, rcx = remaining byte count
//   r8  = memory base, r9 = memory reserved bytes, r10 = the 8 bytes currently being copied
// Each access first clamps its address to r9 with an unsigned compare + cmov, so an address
// at or beyond the reserved size is replaced by the reserved size itself; presumably that
// lands in the guard region and faults — confirm.
// The jump operands (59, -45) are relative displacements as printed by the trace, not labels.
movq %rdi, %rax // rax = incoming rdi; flagged by the author as a WAVM ABI inefficiency
andq $-2147483648, %rdi // \ derive MemoryRuntimeData* from ContextRuntimeData*: clear the low 31 bits of the pointer,
movabsq $0, %r9 // / r9 = offset added below (printed as 0; presumably patched at load time — TODO confirm)
movq (%rdi,%r9), %r8 // r8 = memory base
movq 16(%rdi,%r9), %r9 // r9 = memory reserved bytes (excluding guard region); overwrites the offset, which is no longer needed
testq %rcx, %rcx // \
je 59 // / skip the loop entirely (branch to end) if count == 0
nopw %cs:(%rax,%rax) // \
nopl (%rax) // / multi-byte nop padding so the loop entry below is aligned
// loop:
cmpq %r9, %rdx // \ compare source against reserved bytes (flags from rdx - r9)
movq %r9, %rdi // | rdi = reserved bytes by default (mov leaves the flags intact)
cmovbq %rdx, %rdi // / if source < reserved bytes (unsigned): rdi = source, i.e. rdi = min(reserved bytes, source)
movq (%r8,%rdi), %r10 // r10 = 8 bytes loaded from memory base + clamped source
cmpq %r9, %rsi // \ compare dest against reserved bytes (flags from rsi - r9)
movq %r9, %rdi // | rdi = reserved bytes by default
cmovbq %rsi, %rdi // / if dest < reserved bytes (unsigned): rdi = dest, i.e. rdi = min(reserved bytes, dest)
movq %r10, (%r8,%rdi) // store the 8 bytes to memory base + clamped dest
addq $-8, %rcx // count -= 8
addq $8, %rsi // dest += 8
addq $8, %rdx // source += 8
testq %rcx, %rcx // \ re-test count: the flags from its update were overwritten by the two pointer adds
jne -45 // / branch back to loop if count != 0
// end:
retq
// Benchmarks on Skylake Xeon:
// memory.copy (2MB) : 87,562ns +/- 1868ns
// 32-bit i64 copy loop(2MB) : 185,502ns +/- 3918ns
// 64-bit i64 copy loop(2MB) : 279,501ns +/- 4924ns
// 279,501ns / 185,502ns = 151% of the time, or 66% of the throughput, of the 32-bit i64 copy loop
// 279,501ns / 87,562ns = 319% of the time or 31% of the throughput of memory.copy
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment