AndrewScheidecker/memory64-i64-copy-loop.s Secret

## memory64-i64-copy-loop.s
// wavm test script --trace-assembly $WAVM_DIR/Test/benchmark/memory64.wast

/*
(func (export "i64 copy loop")
    (param $dest i32) (param $source i32) (param $numBytes i32)
    block $exitCopyLoop
      loop $copyLoop
        (br_if $exitCopyLoop (i32.eq (local.get $numBytes) (i32.const 0)))
        (i64.store
          (local.get $dest)
          (i64.load (local.get $source)))
        (local.set $numBytes (i32.sub (local.get $numBytes) (i32.const 8)))
        (local.set $dest (i32.add (local.get $dest) (i32.const 8)))
        (local.set $source (i32.add (local.get $source) (i32.const 8)))
        (br $copyLoop)
      end
    end
  )
*/

// 32-bit i64 copy loop:
//
// Register roles, as used by the instructions below:
//   rdi = ContextRuntimeData* on entry, then reused as the zero-extended address index
//   rsi = dest, rdx = source, rcx = count in bytes (only esi/edx/ecx are read or updated)
//   r8  = memory base, r9 = the 8 bytes being copied
// Branch operands are numeric displacements as printed by the trace, not labels.
// The loop exits only when the count is exactly 0, so the count is expected to be a
// multiple of 8.
// No explicit bounds check appears in this listing; presumably out-of-range accesses
// are caught by a guard region - TODO confirm.

        movq    %rdi, %rax          // WAVM ABI inefficiency: rax = copy of the incoming rdi
        movq    %rdi, %r8           // \
        andq    $-2147483648, %r8   // | derive MemoryRuntimeData* from ContextRuntimeData*
        movabsq $0, %rdi            // /   (mask clears the low 31 bits; the 0 is presumably patched later - TODO confirm)
        movq    (%r8,%rdi), %r8     // r8 = memory base
        testl   %ecx, %ecx          // \
        je      26                  // / branch to end if count == 0

        nop                         // branch target alignment
                                    // loop:
        movl    %edx, %edi          // \ 32-bit move zero-extends the source address into rdi
        movq    (%r8,%rdi), %r9     // / load 8 bytes from memory base + source
        movl    %esi, %edi          // \ 32-bit move zero-extends the dest address into rdi
        movq    %r9, (%r8,%rdi)     // / store 8 bytes to memory base + dest
        addl    $-8, %ecx           // \
        addl    $8, %esi            // | update source, dest, and count (count -= 8, addresses += 8)
        addl    $8, %edx            // /
        testl   %ecx, %ecx          // \ re-test count: flags were overwritten by the adds above
        jne     -25                 // / branch to loop if count != 0

                                    // end:
        retq

// 64-bit i64 copy loop:
//
// Register roles, as used by the instructions below:
//   rdi = ContextRuntimeData* on entry, then reused as the clamped address index
//   rsi = dest, rdx = source, rcx = count in bytes (full 64-bit registers are used)
//   r8  = memory base, r9 = memory reserved bytes, r10 = the 8 bytes being copied
// Branch operands are numeric displacements as printed by the trace, not labels.
// The loop exits only when the count is exactly 0, so the count is expected to be a
// multiple of 8.
// Each address is clamped with an unsigned min against r9 before use: an address at or
// above r9 is replaced by r9 itself. Presumably that offset falls in the guard region so
// the access traps - TODO confirm.
// NOTE(review): the WAT quoted at the top of this file declares i32 parameters, while this
// listing operates on 64-bit rsi/rdx/rcx; presumably it was traced from an i64-parameter
// variant - confirm.

        movq    %rdi, %rax          // WAVM ABI inefficiency: rax = copy of the incoming rdi
        andq    $-2147483648, %rdi  // \
        movabsq $0, %r9             // / derive MemoryRuntimeData* from ContextRuntimeData* (mask clears the low 31 bits)
        movq    (%rdi,%r9), %r8     // r8 = memory base
        movq    16(%rdi,%r9), %r9   // r9 = memory reserved bytes (excluding guard region)
        testq   %rcx, %rcx          // \
        je      59                  // / branch to end if count == 0

        nopw    %cs:(%rax,%rax)     // \
        nopl    (%rax)              // / branch target alignment
                                    // loop:
        cmpq    %r9, %rdx           // \
        movq    %r9, %rdi           // | rdi = min(memory reserved bytes, source)
        cmovbq  %rdx, %rdi          // /   (unsigned: take source only if source < r9)
        movq    (%r8,%rdi), %r10    // load 8 bytes from memory base + clamped source
        cmpq    %r9, %rsi           // \
        movq    %r9, %rdi           // | rdi = min(memory reserved bytes, dest)
        cmovbq  %rsi, %rdi          // /   (unsigned: take dest only if dest < r9)
        movq    %r10, (%r8,%rdi)    // store 8 bytes to memory base + clamped dest
        addq    $-8, %rcx           // \
        addq    $8, %rsi            // | update source, dest, and count (count -= 8, addresses += 8)
        addq    $8, %rdx            // /
        testq   %rcx, %rcx          // \ re-test count: flags were overwritten by the adds above
        jne     -45                 // / branch to loop if count != 0

                                    // end:
        retq

// Benchmarks on Skylake Xeon:

// memory.copy         (2MB) :  87,562ns +/- 1868ns
// 32-bit i64 copy loop(2MB) : 185,502ns +/- 3918ns
// 64-bit i64 copy loop(2MB) : 279,501ns +/- 4924ns

// 279,501ns / 185,502ns = 151% of the time or 66% of the throughput of 32-bit copy
// 279,501ns /  87,562ns = 319% of the time or 31% of the throughput of memory.copy
	// wavm test script --trace-assembly $WAVM_DIR/Test/benchmark/memory64.wast

	/*
	(func (export "i64 copy loop")
	(param $dest i32) (param $source i32) (param $numBytes i32)
	block $exitCopyLoop
	loop $copyLoop
	(br_if $exitCopyLoop (i32.eq (local.get $numBytes) (i32.const 0)))
	(i64.store
	(local.get $dest)
	(i64.load (local.get $source)))
	(local.set $numBytes (i32.sub (local.get $numBytes) (i32.const 8)))
	(local.set $dest (i32.add (local.get $dest) (i32.const 8)))
	(local.set $source (i32.add (local.get $source) (i32.const 8)))
	(br $copyLoop)
	end
	end
	)
	*/

	// 32-bit i64 copy loop:
	//
	// Register roles, as used by the instructions below:
	//   rdi = ContextRuntimeData* on entry, then reused as the zero-extended address index
	//   rsi = dest, rdx = source, rcx = count in bytes (only esi/edx/ecx are read or updated)
	//   r8  = memory base, r9 = the 8 bytes being copied
	// Branch operands are numeric displacements as printed by the trace, not labels.
	// The loop exits only when the count is exactly 0, so the count is expected to be a
	// multiple of 8.
	// No explicit bounds check appears in this listing; presumably out-of-range accesses
	// are caught by a guard region - TODO confirm.

	movq %rdi, %rax // WAVM ABI inefficiency: rax = copy of the incoming rdi
	movq %rdi, %r8 // \
	andq $-2147483648, %r8 // | derive MemoryRuntimeData* from ContextRuntimeData*
	movabsq $0, %rdi // /   (mask clears the low 31 bits; the 0 is presumably patched later - TODO confirm)
	movq (%r8,%rdi), %r8 // r8 = memory base
	testl %ecx, %ecx // \
	je 26 // / branch to end if count == 0

	nop // branch target alignment
	// loop:
	movl %edx, %edi // \ 32-bit move zero-extends the source address into rdi
	movq (%r8,%rdi), %r9 // / load 8 bytes from memory base + source
	movl %esi, %edi // \ 32-bit move zero-extends the dest address into rdi
	movq %r9, (%r8,%rdi) // / store 8 bytes to memory base + dest
	addl $-8, %ecx // \
	addl $8, %esi // | update source, dest, and count (count -= 8, addresses += 8)
	addl $8, %edx // /
	testl %ecx, %ecx // \ re-test count: flags were overwritten by the adds above
	jne -25 // / branch to loop if count != 0

	// end:
	retq

	// 64-bit i64 copy loop:
	//
	// Register roles, as used by the instructions below:
	//   rdi = ContextRuntimeData* on entry, then reused as the clamped address index
	//   rsi = dest, rdx = source, rcx = count in bytes (full 64-bit registers are used)
	//   r8  = memory base, r9 = memory reserved bytes, r10 = the 8 bytes being copied
	// Branch operands are numeric displacements as printed by the trace, not labels.
	// The loop exits only when the count is exactly 0, so the count is expected to be a
	// multiple of 8.
	// Each address is clamped with an unsigned min against r9 before use: an address at or
	// above r9 is replaced by r9 itself. Presumably that offset falls in the guard region so
	// the access traps - TODO confirm.
	// NOTE(review): the WAT quoted earlier in this file declares i32 parameters, while this
	// listing operates on 64-bit rsi/rdx/rcx; presumably it was traced from an i64-parameter
	// variant - confirm.

	movq %rdi, %rax // WAVM ABI inefficiency: rax = copy of the incoming rdi
	andq $-2147483648, %rdi // \
	movabsq $0, %r9 // / derive MemoryRuntimeData* from ContextRuntimeData* (mask clears the low 31 bits)
	movq (%rdi,%r9), %r8 // r8 = memory base
	movq 16(%rdi,%r9), %r9 // r9 = memory reserved bytes (excluding guard region)
	testq %rcx, %rcx // \
	je 59 // / branch to end if count == 0

	nopw %cs:(%rax,%rax) // \
	nopl (%rax) // / branch target alignment
	// loop:
	cmpq %r9, %rdx // \
	movq %r9, %rdi // | rdi = min(memory reserved bytes, source)
	cmovbq %rdx, %rdi // /   (unsigned: take source only if source < r9)
	movq (%r8,%rdi), %r10 // load 8 bytes from memory base + clamped source
	cmpq %r9, %rsi // \
	movq %r9, %rdi // | rdi = min(memory reserved bytes, dest)
	cmovbq %rsi, %rdi // /   (unsigned: take dest only if dest < r9)
	movq %r10, (%r8,%rdi) // store 8 bytes to memory base + clamped dest
	addq $-8, %rcx // \
	addq $8, %rsi // | update source, dest, and count (count -= 8, addresses += 8)
	addq $8, %rdx // /
	testq %rcx, %rcx // \ re-test count: flags were overwritten by the adds above
	jne -45 // / branch to loop if count != 0

	// end:
	retq

	// Benchmarks on Skylake Xeon:

	// memory.copy (2MB) : 87,562ns +/- 1868ns
	// 32-bit i64 copy loop(2MB) : 185,502ns +/- 3918ns
	// 64-bit i64 copy loop(2MB) : 279,501ns +/- 4924ns

	// 279,501ns / 185,502ns = 151% of the time or 66% of the throughput of 32-bit copy
	// 279,501ns / 87,562ns = 319% of the time or 31% of the throughput of memory.copy