# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
# ===================================== README README README README ==================================
# ====================================================================================================
# == IT IS HIGHLY RECOMMENDED THAT YOU READ THIS CODE WHILE READING THE ACCOMPANYING README FILE AT ==
# == THE SAME TIME. IT WILL EXPLAIN SOME CONCEPTS USED IN THIS CODE, AND WHILE THE COMMENTS ABOVE ==
# == FUNCTIONS CONTAIN A LOT OF CONTEXT, THEY WILL NOT EXPLAIN EVERYTHING. ALSO: I SPENT A LOT OF ==
# == TIME ON THE README, SO I'D LOVE IT IF SOMEONE ACTUALLY READ IT AND APPRECIATED THE EFFORT. ==
# == FIND THE README AT: https://gist.github.com/molenzwiebel/2839f812b1d795ac22ee26a49d8e82ba ==
# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
.bss
# The 30,000 signed 8-bit cells we use.
CELLS: .skip 30000
# Does nothing. Used for optimized out instructions so we don't have to reorder arrays.
.equ OP_NOP, 1
# A simple + or - instruction. Is actually an add, not a write.
.equ OP_WRITE, 2
# Moves the current cell pointer (> and <). Optimized out by using offsets.
.equ OP_MOVE, 3
# Reads a char from stdin (,)
.equ OP_READ, 4
# Writes a char to stdout (.)
.equ OP_PRINT, 5
# A simple loop open ([)
.equ OP_LOOP_OPEN, 6
# A simple loop close (])
.equ OP_LOOP_CLOSE, 7
# A direct write to a cell, instead of an addition. p[offset] = coeff
.equ OP_WRITE_CELL, 8
# A multiplicative addition to a cell, instead of a normal addition.
.equ OP_MULT_WRITE, 9
# Beginning of a simple loop that can be proven to run N times.
.equ OP_UNROLL_LOOP_BEGIN, 10
# End of a simple loop that can be proven to run N times.
.equ OP_UNROLL_LOOP_END, 11
# Within an unrolled loop, an operation that simply adds the loop count to a cell. This is OP_MULT_WRITE with a coefficient of 1.
.equ OP_CELL_ADD, 12
# Signifies the end of a "loop" that can be proven to only run once.
.equ OP_IF_END, 13
# A conditional set within an unrolled loop. Different from normal sets since it needs to be executed conditionally.
.equ OP_UNROLL_SET, 14
# A loop that searches for a 0 cell. Equivalent to [>>>>], with coeff being the number of moves per iteration.
.equ OP_POINTER_SEEK, 15
.text
# Lookup table for JIT code sizes for every instruction.
CODE_SIZES: .byte 0 # index 0
.byte 0 # nop
.byte 7 # write
.byte 0 # move
.byte 9 # read
.byte 10 # print
.byte 17 # loop open
.byte 17 # loop close
.byte 7 # write cell
.byte 14 # mult write
.byte 10 # unroll loop start
.byte 3 # unroll loop end
.byte 6 # cell add
.byte 0 # if end (only backpatches, emits no new bytes)
.byte 21 # unroll set
.byte 20 # pointer seek
.global brainfuck
.global build_representation
.global pre_offset_optimizations
.global compute_offsets
.global possibly_unroll_loop
.global possibly_convert_to_if
.global compile_bf
.global compute_jit_size
# Converts the textual brainfuck source into an IR (intermediate representation).
# Each IR node consists of 4 bytes, laid out as:
# 1 byte: opcode
# 1 byte: coefficient
# 2 bytes: offset
#
# The opcode is one of the opcodes defined at the top of this file. This function already
# does some initial optimizations, most notably collapsing runs of +- and <> instructions
# and writing the net amount to the coefficient field. '++-' results in OP_WRITE with a
# coefficient of one.
#
# Zeroing loops ([-]) are also detected and optimized here. All other constructs
# are translated directly.
#
# The offset field is written as 0, since offsets will be computed later.
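#
# As a rough illustration (hypothetical C, not part of this program), a node
# could be described as:
#
#   struct ir_node {
#       uint8_t opcode;   /* one of the OP_* constants above            */
#       int8_t  coeff;    /* collapsed count, write value or multiplier */
#       int16_t offset;   /* cell offset, filled in by compute_offsets  */
#   };
#
# so '+++' becomes { OP_WRITE, 3, 0 } and '<<' becomes { OP_MOVE, -2, 0 }.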
# -----------------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to brainfuck source to optimize
# Returns:
# rax: a pointer to the optimized version of the source
build_representation:
pushq %rbp
movq %rsp, %rbp
# Save rdi since we allocate a new buffer, which
# destroys rdi even though we need it later.
subq $8, %rsp
pushq %rdi
# Allocate a new result buffer. First, check how long the source is.
call strlen
# strlen returns the length in rax (System V ABI); move it into rdi to compute the size.
movq %rax, %rdi
# Add one for the terminator node (in case we don't do a single optimization).
# Then, multiply by four since we need 4 bytes for every original instruction in the worst case.
addq $1, %rdi
shlq $2, %rdi
# Allocate (len + 1) * 4 bytes and zero them.
movq $1, %rsi
call calloc
# The original source pointer is popped into rcx; the new buffer is in rax.
popq %rcx
# Save buffer, so we can return the start later.
pushq %rax
build_loop_check:
# Stop once we've reached the end.
cmpb $0, (%rcx)
je build_loop_end
# *ptr == '[', check for the [-] construct
cmpb $91, (%rcx)
je maybe_zero_cell
# *ptr == ']', no optimization possible (copy the char over)
cmpb $93, (%rcx)
je build_rbracket
# *ptr == ',', no optimization possible (copy the char over)
cmpb $44, (%rcx)
je build_read
# *ptr == '.', no optimization possible (copy the char over)
cmpb $46, (%rcx)
je build_print
# *ptr == '+', collapse subsequent instructions
cmpb $43, (%rcx)
je collapse_write
# *ptr == '-', collapse subsequent instructions
cmpb $45, (%rcx)
je collapse_write
# *ptr == '>', collapse subsequent instructions
cmpb $62, (%rcx)
je collapse_ptr
# *ptr == '<', collapse subsequent instructions
cmpb $60, (%rcx)
je collapse_ptr
# This wasn't a brainfuck instruction, so ignore it altogether.
# Simply go to the next character and loop again.
incq %rcx
jmp build_loop_check
collapse_write:
# count == 0, then do a while loop
movq $0, %rdx
collapse_write_check:
# To prevent overflow (we only have 1 byte for the coefficient), stop once the count leaves [-100, 100].
# A run of 130 '+'s will simply optimize into two OP_WRITE nodes instead of one, which isn't a big deal.
cmpb $-100, %dl
jl collapse_write_after
cmpb $100, %dl
jg collapse_write_after
# if this is + or -, adjust our count and loop
cmpb $43, (%rcx)
je collapse_write_incr
cmpb $45, (%rcx)
je collapse_write_decr
# else, we've reached the end of the collapsible strips, so stop
jmp collapse_write_after
collapse_write_incr:
# Add one to the count, goto next character.
addb $1, %dl
addq $1, %rcx
jmp collapse_write_check
collapse_write_decr:
# Subtract one from the count, goto next character.
subb $1, %dl
addq $1, %rcx
jmp collapse_write_check
collapse_write_after:
# We've gone too far, go back one character.
subq $1, %rcx
# Write OP_WRITE and the count to the IR node.
movb $OP_WRITE, (%rax)
movb %dl, 1(%rax)
jmp build_loop_step
collapse_ptr:
# Again, count == 0 and while.
movq $0, %rdx
collapse_ptr_check:
# Again, prevent overflow by optimizing a bit less. See collapse_write_check for more details.
cmpb $-100, %dl
jl collapse_ptr_after
cmpb $100, %dl
jg collapse_ptr_after
# If > or <, adjust our count.
cmpb $62, (%rcx)
je collapse_ptr_incr
cmpb $60, (%rcx)
je collapse_ptr_decr
# We've reached the end of the strip.
jmp collapse_ptr_after
collapse_ptr_incr:
# count++, goto next character.
addb $1, %dl
addq $1, %rcx
jmp collapse_ptr_check
collapse_ptr_decr:
# count--, go to next character
subb $1, %dl
addq $1, %rcx
jmp collapse_ptr_check
collapse_ptr_after:
# Gone too far, subtract one.
subq $1, %rcx
# Write OP_MOVE(count) to our result.
movb $OP_MOVE, (%rax)
movb %dl, 1(%rax)
jmp build_loop_step
maybe_zero_cell:
# If the next two bytes aren't -], this is not a [-] construct
# we are willing to optimize. Simply copy the current [ over normally
# and the next iterations will copy the -]s over.
cmpb $45, 1(%rcx)
jne build_lbracket
cmpb $93, 2(%rcx)
jne build_lbracket
# Write OP_WRITE_CELL with the value 0.
movb $OP_WRITE_CELL, (%rax)
addq $2, %rcx
jmp build_loop_step
build_lbracket:
# Simply copy over.
movb $OP_LOOP_OPEN, (%rax)
jmp build_loop_step
build_rbracket:
# Simply copy over.
movb $OP_LOOP_CLOSE, (%rax)
jmp build_loop_step
build_print:
# Simply copy over.
movb $OP_PRINT, (%rax)
jmp build_loop_step
build_read:
# Simply copy over.
movb $OP_READ, (%rax)
jmp build_loop_step
build_loop_step:
# The loop step, go to the next character and IR node and loop back.
addq $4, %rax
incq %rcx
jmp build_loop_check
build_loop_end:
# We stored the beginning of the newly allocated buffer on the stack.
# We pop it off now so we can return it into rax.
popq %rax
movq %rbp, %rsp
popq %rbp
ret
# Does a set of optimizations on the converted IR that need to be done before
# computing the pointer offsets and NOP-ing out the moves. Most of these rely
# on the moves still existing in the IR; they are optimized out when we compute offsets.
#
# The following constructs are optimized here:
# - '++--' to OP_NOP
# - '><><' to OP_NOP
# - '[-]+++' to OP_WRITE_CELL(3)
# - '][-]' into OP_LOOP_CLOSE, OP_NOP
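#
# For example, rendering IR nodes as opcode(coeff) pairs (illustration only):
#   '[-]+++' : WRITE_CELL(0), WRITE(3)   ->  WRITE_CELL(3), NOP
#   '][-]'   : LOOP_CLOSE, WRITE_CELL(0) ->  LOOP_CLOSE, NOP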
# --------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to the IR array
pre_offset_optimizations:
pushq %rbp
movq %rsp, %rbp
movq %rdi, %rax
pre_offset_check:
cmpb $0, (%rax)
je pre_offset_end
# Check if they might be useless collapsed operations.
cmpb $OP_WRITE, (%rax)
je maybe_remove_useless_collapsed
cmpb $OP_MOVE, (%rax)
je maybe_remove_useless_collapsed
# Maybe fuse a zero set and a write into a single op.
cmpb $OP_WRITE_CELL, (%rax)
je maybe_fuse_zero_add
# Check if this is a pointer seek loop.
cmpb $OP_LOOP_OPEN, (%rax)
je maybe_pointer_seek
# Maybe remove a useless zeroing after the closing of a loop.
# (we already know the cell will be zero after a loop).
cmpb $OP_LOOP_CLOSE, (%rax)
je maybe_remove_useless_zero
jmp pre_offset_step
maybe_pointer_seek:
# If the next ins isn't move or the one after isn't a loop close, abort.
cmpb $OP_MOVE, 4(%rax)
jne pre_offset_step
cmpb $OP_LOOP_CLOSE, 8(%rax)
jne pre_offset_step
movb $OP_POINTER_SEEK, (%rax)
movb $OP_NOP, 4(%rax)
movb $OP_NOP, 8(%rax)
# Copy over the coefficient from the move.
movzbl 5(%rax), %edx
movb %dl, 1(%rax)
jmp pre_offset_step
maybe_remove_useless_collapsed:
# If the coefficient isn't 0, this op still does something, so leave it alone.
cmpb $0, 1(%rax)
jne pre_offset_step
# replace the operation with nop
movb $OP_NOP, (%rax)
jmp pre_offset_step
maybe_remove_useless_zero:
# if the next op is a write_cell(0), continue. else, go to step
cmpb $OP_WRITE_CELL, 4(%rax)
jne pre_offset_step
cmpb $0, 5(%rax)
jne pre_offset_step
# Skip our skipped instruction, write NOP.
addq $4, %rax
movb $OP_NOP, (%rax)
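# <fallthrough to maybe_fuse_zero_add, so that '][-]+++' still fuses into a single write>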
maybe_fuse_zero_add:
# Only fuse if this sets the cell to zero and the next operation is a write.
cmpb $0, 1(%rax)
jne pre_offset_step
cmpb $OP_WRITE, 4(%rax)
jne pre_offset_step
# Cannot mov two memory references, so load into edx.
movsbl 5(%rax), %edx
movb %dl, 1(%rax)
# Write NOP for the next instruction.
movb $OP_NOP, 4(%rax)
movb $OP_WRITE_CELL, (%rax)
jmp pre_offset_step
pre_offset_step:
# Simply go to the next IR node and try to optimize again.
addq $4, %rax
jmp pre_offset_check
pre_offset_end:
movq %rbp, %rsp
popq %rbp
ret
# One of the big performance gains we use is computing "offsets". Basically, with the
# following code: +>->++<, it is much more performant if we can emit
# p[0]++, p[1]--, p[2] += 2 and ignore the pointer movements completely. This is also
# great for pipelining, since this way of emitting code does not create data dependencies,
# which means the processor itself can give us "free performance". Do note that we still
# need to move the pointer at the end, specifically before [ and ] (since we cannot
# predict how many times a loop will run). The code emitter will insert the pointer
# movements based on the offset field for that op. Do note that the offset field is
# written for every node, even the ones that might not need it. This makes computation
# easier.
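#
# A hedged C sketch of this pass (using the hypothetical ir_node layout
# described above build_representation):
#
#   int16_t off = 0;
#   for (struct ir_node *n = ir; n->opcode; n++) {
#       n->offset = off;
#       if      (n->opcode == OP_MOVE)         { off += n->coeff; n->opcode = OP_NOP; }
#       else if (n->opcode == OP_LOOP_OPEN ||
#                n->opcode == OP_LOOP_CLOSE)   off = 0;
#       else if (n->opcode == OP_POINTER_SEEK) off = -n->coeff;
#   }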
# --------------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to the IR array
compute_offsets:
pushq %rbp
movq %rsp, %rbp
# We start at offset zero.
movq $0, %rcx
movq %rdi, %rax
offset_check:
# Stop if we've reached the end.
cmpb $0, (%rax)
je offset_end
# Write the offset to this instruction.
movw %cx, 2(%rax)
# If this was a move, nop the move and adjust the offset.
cmpb $OP_MOVE, (%rax)
je offset_move
# If this was an open or close, reset the offset (since the instruction will adjust the "base" of the pointer).
cmpb $OP_LOOP_OPEN, (%rax)
je offset_clear
cmpb $OP_LOOP_CLOSE, (%rax)
je offset_clear
# If this was a pointer seek, set the offset to negative the seek count.
# This is because we transform into a do-while loop, which means we always go too far.
cmpb $OP_POINTER_SEEK, (%rax)
je offset_seek
# Nothing interesting, just step and loop.
jmp offset_step
offset_clear:
# Simply zero offset, step and loop.
movq $0, %rcx
jmp offset_step
offset_move:
# Load the coefficient and add to the current offset.
movsbl 1(%rax), %edx
addl %edx, %ecx
# NOP out this pointer move, then loop.
movb $OP_NOP, (%rax)
jmp offset_step
offset_seek:
# Load the coefficient and negate.
movsbl 1(%rax), %edx
negl %edx
# Set the new offset.
movq %rdx, %rcx
jmp offset_step
offset_step:
# Goto the next IR node, then loop.
addq $4, %rax
jmp offset_check
offset_end:
# We do not have a result.
movq %rbp, %rsp
popq %rbp
ret
# This is a helper function that takes a function pointer in rsi and runs it on the
# start of every loop contained in rdi. This is extracted into a helper function since
# we need to iterate over the code twice: once for loop unrolling and once for if
# conversion.
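#
# In hedged C terms (hypothetical signature), this function amounts to:
#
#   void for_every_loop_run(struct ir_node *ir, void (*fn)(struct ir_node *)) {
#       for (; ir->opcode; ir++)
#           if (ir->opcode == OP_LOOP_OPEN) fn(ir);
#   }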
# ----------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to the IR array
# rsi: a function pointer to be run on every OP_LOOP_OPEN node
for_every_loop_run:
pushq %rbp
movq %rsp, %rbp
# Save rdi; it is caller-saved and we modify it here.
subq $8, %rsp
pushq %rdi
for_loops_body:
# stop if we've reached the end
cmpb $0, (%rdi)
je for_loops_end
# next if this isn't loop open, else call the pointer
# the argument is already in rdi since we work in rdi
cmpb $OP_LOOP_OPEN, (%rdi)
jne for_loops_step
call *%rsi
for_loops_step:
# Go to the next node and loop.
addq $4, %rdi
jmp for_loops_body
for_loops_end:
# Cleanup our rdi and leave.
popq %rdi
movq %rbp, %rsp
popq %rbp
ret
# Takes a pointer to the node at the beginning of a balanced loop and checks if it
# can potentially be removed completely (technically unrolling is not the valid
# term for the operation but ¯\_(ツ)_/¯). This construct optimizes the following
# kind of loops: [->>+>+++<<<].
#
# Basically any loop where the total pointer moves cancel out and the total adjustment
# to the cell at the beginning of the loop is -1 can be proven to run N times, where
# N is the current value of the cell at "index" 0. As such, the loop given in the
# example can be optimized to:
# p[2] += p[0]
# p[3] += p[0] * 3
# p[0] = 0
#
# This removes the need to run the loop completely. This function first goes through the
# loop and determines whether it has no nested loops, I/O or other constructs that we cannot
# translate appropriately. If this is the case, the [ and ] are replaced with their unroll
# variants and every +/- is replaced with either a CELL_ADD (if the adjustment is 1) or a
# MULT_WRITE (if the coefficient is anything else). Any adjustment to the cell at index 0
# is replaced with a NOP, since the unroll-end opcode zeroes that cell anyway; leaving the
# adjustment in would make the program incorrect.
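#
# In hedged C terms, the emitted code for the example then amounts to:
#
#   uint8_t n = p[0];            /* OP_UNROLL_LOOP_BEGIN caches the count */
#   p[2] += n;                   /* OP_CELL_ADD                           */
#   p[3] += (uint8_t)(n * 3);    /* OP_MULT_WRITE                         */
#   p[0] = 0;                    /* OP_UNROLL_LOOP_END                    */
#
# which is also correct when n == 0, since every adjustment is scaled by n.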
# ----------------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to the IR array, pointing at the opening [ of a loop
possibly_unroll_loop:
pushq %rbp
movq %rsp, %rbp
# rax: The current instruction.
# rcx: Start of loop (needed later).
# rdx: Tracks modifications made against p[0] to see if they cancel out.
movq %rdi, %rax
movq %rdi, %rcx
movq $0, %rdx
# Move past the opening [
addq $4, %rax
unroll_loop_check_body:
# If we haven't returned yet and the loop closed, this is seemingly valid.
# We say 'seemingly', since we still need to check if the base doesn't move and
# if the total adjustments to the base totals to -1.
cmpb $OP_LOOP_CLOSE, (%rax)
je unroll_loop_seemingly_valid
# If this is a '.', '[' or ',', we cannot unroll; give up and leave.
# A cell write only prevents unrolling when it targets the base, which
# maybe_write_cell_leave checks below.
cmpb $OP_READ, (%rax)
je unroll_leave
cmpb $OP_PRINT, (%rax)
je unroll_leave
cmpb $OP_LOOP_OPEN, (%rax)
je unroll_leave
cmpb $OP_WRITE_CELL, (%rax)
je maybe_write_cell_leave
# If this is a write, we might need to adjust the base.
cmpb $OP_WRITE, (%rax)
je maybe_adjust_base
# Nothing interesting, still valid so far. Let's continue.
jmp unroll_loop_check_step
maybe_write_cell_leave:
# This was a write cell. If we write to the base, it destroys
# the assumption that this runs N times. As such, we need to abort.
cmpw $0, 2(%rax)
je unroll_leave
# It's fine, this operation doesn't prevent us from unrolling.
jmp unroll_loop_check_step
maybe_adjust_base:
# If this doesn't adjust the base, no worries, the loop is still unrollable.
cmpw $0, 2(%rax)
jne unroll_loop_check_step
# Add the coefficient to dl.
addb 1(%rax), %dl
# <fallthrough to step>
unroll_loop_check_step:
# Go to the next node and loop.
addq $4, %rax
jmp unroll_loop_check_body
unroll_loop_seemingly_valid:
# At this point, the loop does not contain invalid instructions.
# We still need to check if the offset at the end returns to 0 and
# if the base pointer has a total difference of -1.
cmpb $-1, %dl
jne unroll_leave
cmpw $0, 2(%rax)
jne unroll_leave
# Start at begin of loop again, replace [ with the unroll start node.
movq %rcx, %rax
movb $OP_UNROLL_LOOP_BEGIN, (%rax)
movb $0, 1(%rax)
# Again, skip the [.
addq $4, %rax
unroll_valid_write_body:
# If we've reached the end, complete and return.
cmpb $OP_LOOP_CLOSE, (%rax)
je unroll_valid_write_end
cmpb $OP_WRITE_CELL, (%rax)
je unroll_valid_write_cell_write
# If this is not a write, we don't have to adjust it. Simply go to the step.
cmpb $OP_WRITE, (%rax)
jne unroll_valid_write_step
# If the value is 1, we do a cell add instead of a multiplication.
cmpb $1, 1(%rax)
je unroll_valid_write_cell_add
# If we write to the "base" (index 0), we need to replace the op with a nop.
cmpw $0, 2(%rax)
je unroll_valid_write_nop
# This is a write and its value isn't 1, so we replace it with the multiplication opcode instead.
movb $OP_MULT_WRITE, (%rax)
# Step.
jmp unroll_valid_write_step
unroll_valid_write_cell_write:
movb $OP_UNROLL_SET, (%rax)
jmp unroll_valid_write_step
unroll_valid_write_cell_add:
# Change the opcode to cell add and step.
movb $OP_CELL_ADD, (%rax)
jmp unroll_valid_write_step
unroll_valid_write_nop:
# Change the opcode to nop and step.
movb $OP_NOP, (%rax)
# <fallthrough to step>
unroll_valid_write_step:
# Go to the next node and loop.
addq $4, %rax
jmp unroll_valid_write_body
unroll_valid_write_end:
# Replace the closing ']' with an unroll end opcode.
movb $OP_UNROLL_LOOP_END, (%rax)
# <fallthrough to leave>
unroll_leave:
# Cleanup and leave.
movq %rbp, %rsp
popq %rbp
ret
# Starts at the beginning of a loop and tries to determine if we can statically prove
# that the loop only runs once. A loop only runs once if its base is set to 0 within
# the loop body, as such:
# while (*p) { /* do something */; *p = 0; }
#
# It should be obvious that this can be converted to
# if (*p) { /* do something */; *p = 0; }
# which eliminates one check and jump that will never be taken.
#
# This simply goes through all the instructions in the loop and aborts if there's either
# a nested loop or any sort of I/O. It also keeps track of the "validity" of the loop,
# making it valid if the value is zero'd and invalid if something is written to the base.
# ---------------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to the IR array, pointing at the opening [ of a loop
possibly_convert_to_if:
pushq %rbp
movq %rsp, %rbp
# rax: Pointer to the current instruction.
# rcx: Copy of the start instruction, for later.
# rdx: Whether the loop is "valid" (1 if true, 0 if not)
movq %rdi, %rax
movq %rdi, %rcx
movq $0, %rdx
# Move past the opening [
addq $4, %rax
convert_body:
# Stop checking if we've reached the end of the loop.
cmpb $OP_LOOP_CLOSE, (%rax)
je convert_loop_end
# If this is a read, print or nested loop, abort.
cmpb $OP_READ, (%rax)
je convert_leave
cmpb $OP_PRINT, (%rax)
je convert_leave
cmpb $OP_LOOP_OPEN, (%rax)
je convert_leave
# If this is a write, mark as invalid if the offset is 0.
cmpb $OP_WRITE, (%rax)
je convert_set_maybe_invalid
# If this is an unrolled loop or a [-], mark as valid if they act on offset 0.
cmpb $OP_UNROLL_LOOP_BEGIN, (%rax)
je convert_set_maybe_valid
cmpb $OP_WRITE_CELL, (%rax)
je convert_set_maybe_valid
# Nothing interesting, move to step.
jmp convert_step
convert_set_maybe_invalid:
# If the offset isn't 0, this doesn't invalidate the loop.
cmpw $0, 2(%rax)
jne convert_step
# Set as invalid and step.
movq $0, %rdx
jmp convert_step
convert_set_maybe_valid:
# If the offset isn't 0, this write doesn't touch the base at all.
cmpw $0, 2(%rax)
jne convert_step
# A nonzero write to the base makes the loop invalid again; reuse the
# invalidation path (its offset check passes, since we know the offset is 0).
cmpb $0, 1(%rax)
jne convert_set_maybe_invalid
# The base is zeroed here: mark as valid and step.
movq $1, %rdx
jmp convert_step
convert_step:
# Go to the next node and loop.
addq $4, %rax
jmp convert_body
convert_loop_end:
# If it's currently invalid or the moves don't cancel out, abort.
cmpq $0, %rdx
je convert_leave
cmpw $0, 2(%rax)
jne convert_leave
# Replace the ] with an if end, which compiles differently.
movb $OP_IF_END, (%rax)
# <fallthrough to leave>
convert_leave:
# Cleanup and leave.
movq %rbp, %rsp
popq %rbp
ret
# Since we need to allocate an executable block of memory for our JITted code, we
# use this function to compute how big the allocated block needs to be. This function
# operates on the fully optimized code and will return wrong results for other IRs.
# It uses a simple lookup table to get a fixed size for every instruction. This
# greatly decreases complexity at the cost of slightly over-estimating (it reserves
# room for a lea even when the offset is 0 and compile_bf skips emitting it).
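#
# In hedged C terms, this function amounts to:
#
#   size_t size = 1;                          /* the final ret */
#   for (struct ir_node *n = ir; n->opcode; n++)
#       size += CODE_SIZES[n->opcode];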
# -----------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to "ast"
compute_jit_size:
pushq %rbp
movq %rsp, %rbp
# We need at least one byte for the return opcode.
movq $1, %rax
movq %rdi, %rcx
size_check:
# If we've reached the end, return.
cmpb $0, (%rcx)
je size_end
# edx = zero_extend(node->opcode)
movzbl (%rcx), %edx
# edx = CODE_SIZES[edx]
movzbl CODE_SIZES(%rdx), %edx
# Add the size to the current total.
addq %rdx, %rax
size_step:
# Simply goto the next value.
addq $4, %rcx
jmp size_check
size_end:
movq %rbp, %rsp
popq %rbp
ret
# Actually does the JIT compilation on the optimized IR. This allocates an executable
# and writable block of memory, then writes the raw bytes/opcodes for the CPU instructions
# that correspond to the specified brainfuck instruction. This is technically probably
# an assembler, not a compiler.
# ----------------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to the optimized IR array
compile_bf:
pushq %rbp
movq %rsp, %rbp
# We store the current instruction to compile in rbx, the executable block in r10
# and the beginning of the executable block in r11. rbx is callee-saved; r10 and r11
# are caller-saved, but nothing we call after setting them can clobber them.
subq $8, %rsp
pushq %rbx
pushq %r10
pushq %r11
# Find out how much space we need to allocate.
movq %rdi, %rbx
call compute_jit_size
# Call mmap to allocate our memory.
# Equivalent to calling:
# mmap(NULL, len, PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE, -1, 0);
movq %rax, %rsi
movl $0, %edi
movl $6, %edx
movl $34, %ecx
movl $-1, %r8d
movl $0, %r9d
call mmap
# Put the allocated memory in r10 and r11.
movq %rax, %r10
movq %rax, %r11
compile_check:
# If we've reached the end, return.
cmpb $0, (%rbx)
je compile_end
# For every potential operation, run the compilation label.
cmpb $OP_WRITE, (%rbx)
je compile_write
cmpb $OP_READ, (%rbx)
je compile_read
cmpb $OP_PRINT, (%rbx)
je compile_print
cmpb $OP_LOOP_OPEN, (%rbx)
je compile_lbracket
cmpb $OP_LOOP_CLOSE, (%rbx)
je compile_rbracket
cmpb $OP_WRITE_CELL, (%rbx)
je compile_write_cell
cmpb $OP_MULT_WRITE, (%rbx)
je compile_mult_write
cmpb $OP_UNROLL_LOOP_BEGIN, (%rbx)
je compile_unroll_begin
cmpb $OP_UNROLL_LOOP_END, (%rbx)
je compile_unroll_end
cmpb $OP_CELL_ADD, (%rbx)
je compile_cell_add
cmpb $OP_IF_END, (%rbx)
je compile_if_end
cmpb $OP_UNROLL_SET, (%rbx)
je compile_unroll_set
cmpb $OP_POINTER_SEEK, (%rbx)
je compile_pointer_seek
# This was an operation that we didn't recognize/handle (probably a NOP). Simply ignore it and loop.
jmp compile_step
compile_write:
# 0x80 0x83 0xAA 0xAA 0xAA 0xAA 0xBB = addb 0xBB, 0xAA(%rbx)
movw $0x8380, (%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 2(%r10)
# Write the coefficient.
movsbl 1(%rbx), %eax
movb %al, 6(%r10)
# Advance 7 bytes and loop again.
addq $7, %r10
jmp compile_step
compile_read:
# 41 ff d5 == callq *%r13
# 88 83 AA AA AA AA = mov %al, AA(%rbx)
movl $0x00d5ff41, (%r10)
movw $0x8388, 3(%r10)
# Write the offset for the target.
movswl 2(%rbx), %eax
movl %eax, 5(%r10)
# Advance 9 bytes, then loop.
addq $9, %r10
jmp compile_step
compile_print:
# 0f b6 bb AA AA AA AA movzbl AA(%rbx),%edi
# 41 ff d4 callq *%r12
movl $0x00bbb60f, (%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 3(%r10)
# Write the call operation.
movl $0x00d4ff41, 7(%r10)
# Advance 10 bytes, loop.
addq $10, %r10
jmp compile_step
compile_lbracket:
# Skip emitting the lea if we don't need it.
cmpw $0, 2(%rbx)
je compile_lbracket_test
# 48 8d 9b AA AA AA AA lea AA(%rbx),%rbx
# We use lea since it does computation in the decode stage, which helps pipelining.
movl $0x009b8d48, (%r10)
# Write the offset for the lea.
movswl 2(%rbx), %eax
movl %eax, 3(%r10)
addq $7, %r10
compile_lbracket_test:
# Write the test-and-branch:
# 8a 03                 mov (%rbx),%al
# 84 c0                 test %al,%al
# 0f 84 OF FF SS ET     je <offset>
movw $0x038a, (%r10)
movw $0xc084, 2(%r10)
movw $0x840f, 4(%r10)
movl $0x00, 6(%r10)
# Advance the 10 bytes.
addq $10, %r10
# Push our current position on the stack. We use the stack to keep track of bracket pairs.
# Once we reach the closing bracket, we pop, compute offsets and write the j(n)e targets.
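# For example, if the je's rel32 ends at address A and the jne's rel32 ends at
# address B, the je gets (B - A) and the jne gets (A - B), so each jump lands
# just past the other bracket's test.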
pushq %r10
jmp compile_step
compile_rbracket:
# Skip emitting the lea if we don't need it.
cmpw $0, 2(%rbx)
je compile_rbracket_test
# 48 8d 9b AA AA AA AA lea AA(%rbx),%rbx
# We use lea since it does computation in the decode stage, which helps pipelining.
movl $0x009b8d48, (%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 3(%r10)
addq $7, %r10
compile_rbracket_test:
# Write the test-and-branch:
# 8a 03                 mov (%rbx),%al
# 84 c0                 test %al,%al
# 0f 85 OF FF SS ET     jne <offset>
movw $0x038a, (%r10)
movw $0xc084, 2(%r10)
movw $0x850f, 4(%r10)
movl $0x00, 6(%r10)
addq $10, %r10
# Find out what the code address was of our starting position.
popq %rcx
# Find out the difference between our current address and the starting one.
movq %r10, %rax
subq %rcx, %rax
# Write the positive offset at the starting location.
movl %eax, -4(%rcx)
# Now negate it and write the negative offset at our current location.
negl %eax
movl %eax, -4(%r10)
jmp compile_step
compile_write_cell:
# c6 83 AA AA AA AA BB = movb BB,AA(%rbx)
movw $0x83c6, (%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 2(%r10)
# Write the value.
movsbl 1(%rbx), %eax
movb %al, 6(%r10)
# Advance 7 bytes and loop.
addq $7, %r10
jmp compile_step
compile_mult_write:
# b8 XX XX XX XX mov XX,%eax
movw $0xb8, (%r10)
# Write the value.
movsbl 1(%rbx), %eax
movl %eax, 1(%r10)
# 0f af c2 imul %edx,%eax
movl $0x00c2af0f, 5(%r10)
# 00 83 XX XX XX XX add %al,XX(%rbx)
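# (The add's 0x00 opcode byte is the high byte of the imul movl above, so
# only the 0x83 ModRM byte needs to be written here.)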
movb $0x83, 9(%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 10(%r10)
# Advance 14 bytes and loop.
addq $14, %r10
jmp compile_step
compile_unroll_begin:
# Do not emit the lea if we do not need it.
cmpw $0, 2(%rbx)
je compile_unroll_begin_mov
# 48 8d 9b AA AA AA AA  lea AA(%rbx),%rbx
movl $0x009b8d48, (%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 3(%r10)
addq $7, %r10
compile_unroll_begin_mov:
# Cache the loop count p[0] in edx for the unrolled body.
# 0f b6 13              movzbl (%rbx),%edx
movl $0x0013b60f, (%r10)
# Advance 3 bytes and loop.
addq $3, %r10
jmp compile_step
compile_unroll_end:
# Zero the cell at index 0.
# c6 03 00 movb $0x0,(%rbx)
movl $0x000003c6, (%r10)
addq $3, %r10
jmp compile_step
compile_cell_add:
# Simply add the value in %dl to the offset value.
# 00 93 XX XX XX XX     add %dl,XX(%rbx)
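# (The leading 0x00 opcode byte is never written: MAP_ANON memory is
# zero-filled and nothing has touched this byte yet, so it is already 0.)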
movb $0x93, 1(%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 2(%r10)
addq $6, %r10
jmp compile_step
compile_if_end:
# Find out what the code address was of our starting position.
popq %rcx
# Find out the difference between our current address and the starting one.
movq %r10, %rax
subq %rcx, %rax
# Write the offset at the starting location.
movl %eax, -4(%rcx)
# We do not need to write anything else, so we don't adjust r10 and simply loop.
jmp compile_step
compile_unroll_set:
# b0 XX                 mov $XX,%al
movb $0xb0, (%r10)
# Write the value (the movl stores 4 bytes, but the next movl overwrites all but the imm8).
movsbl 1(%rbx), %eax
movl %eax, 1(%r10)
# 0f b6 8b XX XX XX XX  movzbl XX(%rbx),%ecx
movl $0x008bb60f, 2(%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 5(%r10)
# 84 d2                 test %dl,%dl
# 66 0f 45 c8           cmovne %ax,%cx
# 88 8b XX XX XX XX     mov %cl,XX(%rbx)
# The cmovne only takes the new value when the loop counter is nonzero,
# which is what makes this set conditional.
movw $0xd284, 9(%r10)
movl $0xc8450f66, 11(%r10)
movw $0x8b88, 15(%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 17(%r10)
# Advance 21 bytes and loop.
addq $21, %r10
jmp compile_step
compile_pointer_seek:
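# In hedged C terms, the emitted seek loop is:
#
#   do { nonzero = (*p != 0); p += coeff; } while (nonzero);
#
# i.e. the pointer always steps once past the zero cell; compute_offsets
# compensated for this by starting subsequent offsets at -coeff.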
# Do not emit the lea if we do not need it.
cmpw $0, 2(%rbx)
je compile_pointer_seek_body
# 48 8d 9b AA AA AA AA  lea AA(%rbx),%rbx
movl $0x009b8d48, (%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 3(%r10)
addq $7, %r10
compile_pointer_seek_body:
# 80 3b 00              cmpb $0x0,(%rbx)
movl $0x00003b80, (%r10)
# 48 8d 5b XX           lea XX(%rbx),%rbx
movl $0x005b8d48, 3(%r10)
# Write the value.
movsbl 1(%rbx), %eax
movb %al, 6(%r10)
# jne <back>. The code size is 13 bytes, so the offset back is -13.
movw $0x850f, 7(%r10)
movl $-13, 9(%r10)
# Advance 13 bytes and step.
addq $13, %r10
jmp compile_step
compile_step:
# Goto the next IR node and actually loop.
addq $4, %rbx
jmp compile_check
compile_end:
# Write ret to last byte.
movb $0xC3, (%r10)
# Put resulting buffer in rax
movq %r11, %rax
# Cleanup, return.
popq %r11
popq %r10
popq %rbx
movq %rbp, %rsp
popq %rbp
ret
# Since our JITted block of memory makes certain assumptions about the environment
# (such as pointers to certain functions being present in registers), this function
# acts as a "trampoline" that takes a block of JITted instructions and runs the code.
# -----------------------------------------------------------------------------------
# Arguments:
# rdi: pointer to executable block of memory with instructions
run_compiled_bf:
pushq %rbp
movq %rsp, %rbp
# Prepare the stack and registers. We purposefully misalign by 8,
# so that using call will align the stack by pushing the return address.
pushq %rbx
pushq %r12
pushq %r13
# Our compiled code expects the cell pointer in rbx, putchar in r12 and getchar in r13.
leaq CELLS, %rbx
movq $putchar, %r12
movq $getchar, %r13
# Call our compiled code. The stack is not currently aligned, but pushing the return address will align it properly.
call *%rdi
# Cleanup and return.
popq %r13
popq %r12
popq %rbx
movq %rbp, %rsp
popq %rbp
ret
# Takes a simple brainfuck char* input and parses it into IR, optimizes it and then
# compiles and runs it.
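#
# In hedged C terms, the pipeline below is:
#
#   ir = build_representation(src);
#   pre_offset_optimizations(ir);
#   compute_offsets(ir);
#   for_every_loop_run(ir, possibly_unroll_loop);
#   for_every_loop_run(ir, possibly_convert_to_if);
#   run_compiled_bf(compile_bf(ir));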
# -----------------------------------------------------------------------------------
# Arguments:
# rdi: char* to execute
brainfuck:
pushq %rbp
movq %rsp, %rbp
# Build our representation.
call build_representation
movq %rax, %rdi
# Optimize our representation. These functions happen to preserve rdi, and we
# don't care about the return values, so we can just call every function in order.
call pre_offset_optimizations
call compute_offsets
movq $possibly_unroll_loop, %rsi
call for_every_loop_run
movq $possibly_convert_to_if, %rsi
call for_every_loop_run
# Compile the program
call compile_bf
# Run the program
movq %rax, %rdi
call run_compiled_bf
movq %rbp, %rsp
popq %rbp
ret