bdw/fib.asm Secret

## fib.asm
	.section	.rodata
.fmt:
	.string	"%lld\n"

	.text
        .globl main
/* The driving loop. Nothing to see here */
main:
        pushq %rbp
        mov %rsp, %rbp
        subq $0x20, %rsp
        movq $1, 0x8(%rsp)
.loop:
        movq 0x8(%rsp), %rdi
        call fib1 /* or call fib0 */
	/* print the string, disabled as it dominates the time spent */
        /*
        movq $.fmt, %rdi
	movq %rax, %rsi
        movq $0, %rax
        call printf
        */
	incq 0x8(%rsp)
        /* loop 60000 times. the answer of fib doesn't make sense then, but that's not the point of this exercise */
        movq $60000, %rdx
        cmpq 0x8(%rsp), %rdx
        jg .loop
        movq %rbp, %rsp
        popq %rbp
        ret


/* fibonacci algorithm as it is currently compiled from the 'hot loop'
 * within the nqp frame above. The actual nqp frame is considerably
 * larger due to lexotic handling, why it does that I don't really know. */
fib0:
        pushq %rbp
        movq %rsp, %rbp
        subq $0x80, %rsp
/* the actual code uses %rbx for moarvm registers, but these aren't available here, so
 * we make do with the regular stack pointer */
        movq %rdi, (%rsp)
        movq $0, 0x8(%rsp)
        movq $1, 0x10(%rsp)
        movq $0, 0x20(%rsp)
_loop:
        /* cmp_i */
        movq (%rsp), %rax
        cmpq 0x20(%rsp), %rax
        setg %al
        movzx %al, %rax
        movq %rax, 0x20(%rsp)
        /* if_i */
        movq 0x20(%rsp), %rax
	testq %rax, %rax
        je _epilogue
        /* add_i */
        movq 0x10(%rsp), %rax
        addq 0x8(%rsp), %rax
        movq %rax, 0x10(%rsp)
        /* sub_i */
        movq 0x10(%rsp), %rax
        subq 0x8(%rsp), %rax
        movq %rax, 0x8(%rsp)
        /* const_i64_16 */
        movq $1, 0x28(%rsp)
        /* sub_i */
        movq (%rsp), %rax
        subq 0x28(%rsp), %rax
        movq %rax, 0x28(%rsp)
        /* set */
        movq 0x28(%rsp), %rcx
        movq %rcx, (%rsp)
        /* const_i64_16. this is useless but not removed */
        movq $0, 0x30(%rsp)
        /* goto. note that the actual code emits a gc checkpoint at this and the earlier jump,
         * but we're not running inside MoarVM so I've eliminated them in both frames */
        jmp _loop
_epilogue:
        mov 0x10(%rsp), %rax
/* function epilogue */
        movq %rbp, %rsp          /* restore stack top */
        popq %rbp                /* restore stack bottom */
        ret


/* this is an example of what the tree compiler should reasonably be able to do,
 * without truly fancy optimizations. The algorithm is the same, I've just added
 * register selection and very minor instruction reductions based on constants. */
fib1:
        pushq %rbp
        movq %rsp, %rbp
        movq $1, %rax
        movq $0, %rcx
        movq %rdi, %rdx
_loop2:
        /* if_i(cmp_i(r0, const_i(0)), _epilogue2) == unless_i(r0, epilogue2) */
        test %rdx, %rdx
        jz _epilogue2
        /* add_i */
        addq %rcx, %rax
        /* sub_i. I'm sure I can find a way to optimze this further, but that would not be realistic in general */
        movq %rax, %r8
        subq %rcx, %r8
        movq %r8, %rcx
        /* sub_i(r0, const_i(1)) == dec_i(r0). note that decq isn't particularly fast */
        decq %rdx
        /* goto */
        jmp _loop2
_epilogue2:
        movq %rcx, %rax
        movq %rbp, %rsp
        popq %rbp
        ret

## fib.nqp
#!/usr/bin/env nqp-m
# what can i say, i like the fibonacci series
sub fib(int $n) {
    my int $a := 0;
    my int $b := 1;

    while $n > 0 {
        $b := $b + $a;
        $a := $b - $a;
        $n := $n - 1;
    }
    return $a;
}

my int $i := 0;
while $i < 60 {
    nqp::say(fib($i));
    $i := $i + 1;
}

## results.txt
# With fib0 as the algorithm:
$ make bar
$ time ./bar

real	0m7.508s
user	0m7.513s
sys	0m0.001s

# now edit the file and use fib1
$ make bar
$ time ./bar

real	0m1.414s
user	0m1.414s
sys	0m0.001s

# 0.7508 / 0.1414 = 5,3x speedup. Your Mileage Will Vary.

# This is admittedly a very tiny and somewhat artificial example!
# Not nearly all code will benefit in the same way.
# On the other hand, this example only demonstrates one of all feasible optimizations.
	.section .rodata
	.fmt:
	.string "%lld\n"

	.text
	.globl main
	/* The driving loop. Nothing to see here */
	main:
	pushq %rbp
	mov %rsp, %rbp
	subq $0x20, %rsp
	movq $1, 0x8(%rsp)
	.loop:
	movq 0x8(%rsp), %rdi
	call fib1 /* or call fib0 */
	/* print the string, disabled as it dominates the time spent */
	/*
	movq $.fmt, %rdi
	movq %rax, %rsi
	movq $0, %rax
	call printf
	*/
	incq 0x8(%rsp)
	/* loop 60000 times. the answer of fib doesn't make sense then, but that's not the point of this exercise */
	movq $60000, %rdx
	cmpq 0x8(%rsp), %rdx
	jg .loop
	movq %rbp, %rsp
	popq %rbp
	ret


	/* fibonacci algorithm as it is currently compiled from the 'hot loop'
	* within the nqp frame above. The actual nqp frame is considerably
	* larger due to lexotic handling, why it does that I don't really know. */
	fib0:
	pushq %rbp
	movq %rsp, %rbp
	subq $0x80, %rsp
	/* the actual code uses %rbx for moarvm registers, but these aren't available here, so
	* we make do with the regular stack pointer */
	movq %rdi, (%rsp)
	movq $0, 0x8(%rsp)
	movq $1, 0x10(%rsp)
	movq $0, 0x20(%rsp)
	_loop:
	/* cmp_i */
	movq (%rsp), %rax
	cmpq 0x20(%rsp), %rax
	setg %al
	movzx %al, %rax
	movq %rax, 0x20(%rsp)
	/* if_i */
	movq 0x20(%rsp), %rax
	testq %rax, %rax
	je _epilogue
	/* add_i */
	movq 0x10(%rsp), %rax
	addq 0x8(%rsp), %rax
	movq %rax, 0x10(%rsp)
	/* sub_i */
	movq 0x10(%rsp), %rax
	subq 0x8(%rsp), %rax
	movq %rax, 0x8(%rsp)
	/* const_i64_16 */
	movq $1, 0x28(%rsp)
	/* sub_i */
	movq (%rsp), %rax
	subq 0x28(%rsp), %rax
	movq %rax, 0x28(%rsp)
	/* set */
	movq 0x28(%rsp), %rcx
	movq %rcx, (%rsp)
	/* const_i64_16. this is useless but not removed */
	movq $0, 0x30(%rsp)
	/* goto. note that the actual code emits a gc checkpoint at this and the earlier jump,
	* but we're not running inside MoarVM so I've eliminated them in both frames */
	jmp _loop
	_epilogue:
	mov 0x10(%rsp), %rax
	/* function epilogue */
	movq %rbp, %rsp /* restore stack top */
	popq %rbp /* restore stack bottom */
	ret


	/* this is an example of what the tree compiler should reasonably be able to do,
	* without truly fancy optimizations. The algorithm is the same, I've just added
	* register selection and very minor instruction reductions based on constants. */
	fib1:
	pushq %rbp
	movq %rsp, %rbp
	movq $1, %rax
	movq $0, %rcx
	movq %rdi, %rdx
	_loop2:
	/* if_i(cmp_i(r0, const_i(0)), _epilogue2) == unless_i(r0, epilogue2) */
	test %rdx, %rdx
	jz _epilogue2
	/* add_i */
	addq %rcx, %rax
	/* sub_i. I'm sure I can find a way to optimze this further, but that would not be realistic in general */
	movq %rax, %r8
	subq %rcx, %r8
	movq %r8, %rcx
	/* sub_i(r0, const_i(1)) == dec_i(r0). note that decq isn't particularly fast */
	decq %rdx
	/* goto */
	jmp _loop2
	_epilogue2:
	movq %rcx, %rax
	movq %rbp, %rsp
	popq %rbp
	ret
	#!/usr/bin/env nqp-m
	# what can i say, i like the fibonacci series
	sub fib(int $n) {
	my int $a := 0;
	my int $b := 1;

	while $n > 0 {
	$b := $b + $a;
	$a := $b - $a;
	$n := $n - 1;
	}
	return $a;
	}

	my int $i := 0;
	while $i < 60 {
	nqp::say(fib($i));
	$i := $i + 1;
	}
	# With fib0 as the algorithm:
	$ make bar
	$ time ./bar

	real 0m7.508s
	user 0m7.513s
	sys 0m0.001s

	# now edit the file and use fib1
	$ make bar
	$ time ./bar

	real 0m1.414s
	user 0m1.414s
	sys 0m0.001s

	# 0.7508 / 0.1414 = 5,3x speedup. Your Mileage Will Vary.

	# This is admittedly a very tiny and somewhat artificial example!
	# Not nearly all code will benefit in the same way.
	# On the other hand, this example only demonstrates one of all feasible optimizations.