# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
# ===================================== README README README README ==================================
# ====================================================================================================
# == IT IS HIGHLY RECOMMENDED THAT YOU READ THIS CODE WHILE READING THE ACCOMPANYING README FILE AT ==
# == THE SAME TIME. IT WILL EXPLAIN SOME CONCEPTS USED IN THIS CODE, AND WHILE THE COMMENTS ABOVE ==
# == FUNCTIONS CONTAIN A LOT OF CONTEXT, THEY WILL NOT EXPLAIN EVERYTHING. ALSO: I SPENT A LOT OF ==
# == TIME ON THE README, SO I'D LOVE IT IF SOMEONE ACTUALLY READ IT AND APPRECIATED THE EFFORT. ==
# == FIND THE README AT: https://gist.github.com/molenzwiebel/2839f812b1d795ac22ee26a49d8e82ba ==
# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
# ====================================================================================================
.bss
# The 30,000 signed 8-bit cells we use.
CELLS: .skip 30000
# Does nothing. Used for optimized out instructions so we don't have to reorder arrays.
.equ OP_NOP, 1
# A simple + or - instruction. Is actually an add, not a write.
.equ OP_WRITE, 2
# Moves the current cell pointer (> and <). Optimized out by using offsets.
.equ OP_MOVE, 3
# Reads a char from stdin (,)
.equ OP_READ, 4
# Writes a char to stdout (.)
.equ OP_PRINT, 5
# A simple loop open ([)
.equ OP_LOOP_OPEN, 6
# A simple loop close (])
.equ OP_LOOP_CLOSE, 7
# A direct write to a cell, instead of an addition. p[offset] = coeff
.equ OP_WRITE_CELL, 8
# A multiplicative addition to a cell, instead of a normal addition.
.equ OP_MULT_WRITE, 9
# Beginning of a simple loop that can be proven to run N times.
.equ OP_UNROLL_LOOP_BEGIN, 10
# End of a simple loop that can be proven to run N times.
.equ OP_UNROLL_LOOP_END, 11
# Within an unrolled loop, an operation that simply adds the loop count to a cell. This is OP_MULT_WRITE with a coefficient of 1.
.equ OP_CELL_ADD, 12
# Signifies the end of a "loop" that can be proven to only run once.
.equ OP_IF_END, 13
# A conditional set within an unrolled loop. Different from normal sets since it needs to be executed conditionally.
.equ OP_UNROLL_SET, 14
# A loop that searches for a 0 cell. Equivalent to [>>>>], with coeff being the number of moves per iteration.
.equ OP_POINTER_SEEK, 15
.text
# Lookup table for JIT code sizes for every instruction.
CODE_SIZES: .byte 0 # index 0
.byte 0 # nop
.byte 7 # write
.byte 0 # move
.byte 9 # read
.byte 10 # print
.byte 17 # loop open
.byte 17 # loop close
.byte 7 # write cell
.byte 14 # mult write
.byte 10 # unroll loop start
.byte 3 # unroll loop end
.byte 6 # cell add
.byte 0 # if end (only backpatches, emits no new bytes)
.byte 21 # unroll set
.byte 20 # pointer seek
.global brainfuck
.global build_representation
.global pre_offset_optimizations
.global compute_offsets
.global possibly_unroll_loop
.global possibly_convert_to_if
.global compile_bf
.global compute_jit_size
# Converts the textual brainfuck source into an IR (intermediate representation).
# Each IR node consists of 4 bytes, laid out as:
# 1 byte: opcode
# 1 byte: coefficient
# 2 bytes: offset
#
# The opcode is one of the opcodes defined at the top of this file. This function already
# does some initial optimizations, most notably collapsing runs of +- and <> instructions
# and writing the net amount to the coefficient field. '++-' results in OP_WRITE with a
# coefficient of one.
#
# Zeroing loops ([-]) are also detected and optimized here. All other constructs
# are translated directly.
#
# The offset field is written as 0, since offsets will be computed later.
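#
# As a rough illustration (hypothetical C, not part of this program), a node
# could be described as:
#
#   struct ir_node {
#       uint8_t opcode;   /* one of the OP_* constants above            */
#       int8_t  coeff;    /* collapsed count, write value or multiplier */
#       int16_t offset;   /* cell offset, filled in by compute_offsets  */
#   };
#
# so '+++' becomes { OP_WRITE, 3, 0 } and '<<' becomes { OP_MOVE, -2, 0 }.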
# -----------------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to brainfuck source to optimize
# Returns:
# rax: a pointer to the optimized version of the source
build_representation:
pushq %rbp
movq %rsp, %rbp
# Save rdi since we allocate a new buffer, which
# destroys rdi even though we need it later.
subq $8, %rsp
pushq %rdi
# Allocate a new result buffer. First, check how long the source is.
call strlen
# strlen returns the length in rax (System V ABI); move it into rdi to compute the size.
movq %rax, %rdi
# Add one for the terminator node (in case we don't do a single optimization).
# Then, multiply by four since we need 4 bytes for every original instruction in the worst case.
addq $1, %rdi
shlq $2, %rdi
# Allocate (len + 1) * 4 bytes and zero them.
movq $1, %rsi
call calloc
# The original source pointer is popped into rcx; the new buffer is in rax.
popq %rcx
# Save buffer, so we can return the start later.
pushq %rax
build_loop_check:
# Stop once we've reached the end.
cmpb $0, (%rcx)
je build_loop_end
# *ptr == '[', check for the [-] construct
cmpb $91, (%rcx)
je maybe_zero_cell
# *ptr == ']', no optimization possible (copy the char over)
cmpb $93, (%rcx)
je build_rbracket
# *ptr == ',', no optimization possible (copy the char over)
cmpb $44, (%rcx)
je build_read
# *ptr == '.', no optimization possible (copy the char over)
cmpb $46, (%rcx)
je build_print
# *ptr == '+', collapse subsequent instructions
cmpb $43, (%rcx)
je collapse_write
# *ptr == '-', collapse subsequent instructions
cmpb $45, (%rcx)
je collapse_write
# *ptr == '>', collapse subsequent instructions
cmpb $62, (%rcx)
je collapse_ptr
# *ptr == '<', collapse subsequent instructions
cmpb $60, (%rcx)
je collapse_ptr
# This wasn't a brainfuck instruction, so ignore it altogether.
# Simply go to the next character and loop again.
incq %rcx
jmp build_loop_check
collapse_write:
# count == 0, then do a while loop
movq $0, %rdx
collapse_write_check:
# To prevent overflow (we only have 1 byte for the coefficient), stop once the count leaves [-100, 100].
# A run of 130 '+'s will simply optimize into two OP_WRITE nodes instead of one, which isn't a big deal.
cmpb $-100, %dl
jl collapse_write_after
cmpb $100, %dl
jg collapse_write_after
# if this is + or -, adjust our count and loop
cmpb $43, (%rcx)
je collapse_write_incr
cmpb $45, (%rcx)
je collapse_write_decr
# else, we've reached the end of the collapsible strips, so stop
jmp collapse_write_after
collapse_write_incr:
# Add one to the count, goto next character.
addb $1, %dl
addq $1, %rcx
jmp collapse_write_check
collapse_write_decr:
# Subtract one from the count, goto next character.
subb $1, %dl
addq $1, %rcx
jmp collapse_write_check
collapse_write_after:
# We've gone too far, go back one character.
subq $1, %rcx
# Write OP_WRITE and the count to the IR node.
movb $OP_WRITE, (%rax)
movb %dl, 1(%rax)
jmp build_loop_step
collapse_ptr:
# Again, count == 0 and while.
movq $0, %rdx
collapse_ptr_check:
# Again, prevent overflow by optimizing a bit less. See collapse_write_check for more details.
cmpb $-100, %dl
jl collapse_ptr_after
cmpb $100, %dl
jg collapse_ptr_after
# If > or <, adjust our count.
cmpb $62, (%rcx)
je collapse_ptr_incr
cmpb $60, (%rcx)
je collapse_ptr_decr
# We've reached the end of the strip.
jmp collapse_ptr_after
collapse_ptr_incr:
# count++, goto next character.
addb $1, %dl
addq $1, %rcx
jmp collapse_ptr_check
collapse_ptr_decr:
# count--, go to next character
subb $1, %dl
addq $1, %rcx
jmp collapse_ptr_check
collapse_ptr_after:
# Gone too far, subtract one.
subq $1, %rcx
# Write OP_MOVE(count) to our result.
movb $OP_MOVE, (%rax)
movb %dl, 1(%rax)
jmp build_loop_step
maybe_zero_cell:
# If the next two bytes aren't -], this is not a [-] construct
# we are willing to optimize. Simply copy the current [ over normally
# and the next iterations will copy the -]s over.
cmpb $45, 1(%rcx)
jne build_lbracket
cmpb $93, 2(%rcx)
jne build_lbracket
# Write OP_WRITE_CELL with the value 0.
movb $OP_WRITE_CELL, (%rax)
addq $2, %rcx
jmp build_loop_step
build_lbracket:
# Simply copy over.
movb $OP_LOOP_OPEN, (%rax)
jmp build_loop_step
build_rbracket:
# Simply copy over.
movb $OP_LOOP_CLOSE, (%rax)
jmp build_loop_step
build_print:
# Simply copy over.
movb $OP_PRINT, (%rax)
jmp build_loop_step
build_read:
# Simply copy over.
movb $OP_READ, (%rax)
jmp build_loop_step
build_loop_step:
# The loop step, go to the next character and IR node and loop back.
addq $4, %rax
incq %rcx
jmp build_loop_check
build_loop_end:
# We stored the beginning of the newly allocated buffer on the stack.
# We pop it off now so we can return it into rax.
popq %rax
movq %rbp, %rsp
popq %rbp
ret
# Does a set of optimizations on the converted IR that need to be done before
# computing the pointer offsets and NOP-ing out the moves. Most of these rely
# on the moves still existing in the IR; they are optimized out when we compute offsets.
#
# The following constructs are optimized here:
# - '++--' to OP_NOP
# - '><><' to OP_NOP
# - '[-]+++' to OP_WRITE_CELL(3)
# - '][-]' into OP_LOOP_CLOSE, OP_NOP
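#
# For example, rendering IR nodes as opcode(coeff) pairs (illustration only):
#   '[-]+++' : WRITE_CELL(0), WRITE(3)   ->  WRITE_CELL(3), NOP
#   '][-]'   : LOOP_CLOSE, WRITE_CELL(0) ->  LOOP_CLOSE, NOP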
# --------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to the IR array
pre_offset_optimizations:
pushq %rbp
movq %rsp, %rbp
movq %rdi, %rax
pre_offset_check:
cmpb $0, (%rax)
je pre_offset_end
# Check if they might be useless collapsed operations.
cmpb $OP_WRITE, (%rax)
je maybe_remove_useless_collapsed
cmpb $OP_MOVE, (%rax)
je maybe_remove_useless_collapsed
# Maybe fuse a zero set and a write into a single op.
cmpb $OP_WRITE_CELL, (%rax)
je maybe_fuse_zero_add
# Check if this is a pointer seek loop.
cmpb $OP_LOOP_OPEN, (%rax)
je maybe_pointer_seek
# Maybe remove a useless zeroing after the closing of a loop.
# (we already know the cell will be zero after a loop).
cmpb $OP_LOOP_CLOSE, (%rax)
je maybe_remove_useless_zero
jmp pre_offset_step
maybe_pointer_seek:
# If the next ins isn't move or the one after isn't a loop close, abort.
cmpb $OP_MOVE, 4(%rax)
jne pre_offset_step
cmpb $OP_LOOP_CLOSE, 8(%rax)
jne pre_offset_step
movb $OP_POINTER_SEEK, (%rax)
movb $OP_NOP, 4(%rax)
movb $OP_NOP, 8(%rax)
# Copy over the coefficient from the move.
movzbl 5(%rax), %edx
movb %dl, 1(%rax)
jmp pre_offset_step
maybe_remove_useless_collapsed:
# If the coefficient isn't 0, this op still does something, so leave it alone.
cmpb $0, 1(%rax)
jne pre_offset_step
# replace the operation with nop
movb $OP_NOP, (%rax)
jmp pre_offset_step
maybe_remove_useless_zero:
# if the next op is a write_cell(0), continue. else, go to step
cmpb $OP_WRITE_CELL, 4(%rax)
jne pre_offset_step
cmpb $0, 5(%rax)
jne pre_offset_step
# Skip our skipped instruction, write NOP.
addq $4, %rax
movb $OP_NOP, (%rax)
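# <fallthrough to maybe_fuse_zero_add, so that '][-]+++' still fuses into a single write>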
maybe_fuse_zero_add:
# Only fuse if this sets the cell to zero and the next operation is a write.
cmpb $0, 1(%rax)
jne pre_offset_step
cmpb $OP_WRITE, 4(%rax)
jne pre_offset_step
# Cannot mov two memory references, so load into edx.
movsbl 5(%rax), %edx
movb %dl, 1(%rax)
# Write NOP for the next instruction.
movb $OP_NOP, 4(%rax)
movb $OP_WRITE_CELL, (%rax)
jmp pre_offset_step
pre_offset_step:
# Simply go to the next IR node and try to optimize again.
addq $4, %rax
jmp pre_offset_check
pre_offset_end:
movq %rbp, %rsp
popq %rbp
ret
# One of the big performance gains we use is computing "offsets". Basically, with the
# following code: +>->++<, it is much more performant if we can emit
# p[0]++, p[1]--, p[2] += 2 and ignore the pointer movements completely. This is also
# great for pipelining, since this way of emitting code does not create data dependencies,
# which means the processor itself can give us "free performance". Do note that we still
# need to move the pointer at the end, specifically before [ and ] (since we cannot
# predict how many times a loop will run). The code emitter will insert the pointer
# movements based on the offset field for that op. Do note that the offset field is
# written for every node, even the ones that might not need it. This makes computation
# easier.
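#
# A hedged C sketch of this pass (using the hypothetical ir_node layout
# described above build_representation):
#
#   int16_t off = 0;
#   for (struct ir_node *n = ir; n->opcode; n++) {
#       n->offset = off;
#       if      (n->opcode == OP_MOVE)         { off += n->coeff; n->opcode = OP_NOP; }
#       else if (n->opcode == OP_LOOP_OPEN ||
#                n->opcode == OP_LOOP_CLOSE)   off = 0;
#       else if (n->opcode == OP_POINTER_SEEK) off = -n->coeff;
#   }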
# --------------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to the IR array
compute_offsets:
pushq %rbp
movq %rsp, %rbp
# We start at offset zero.
movq $0, %rcx
movq %rdi, %rax
offset_check:
# Stop if we've reached the end.
cmpb $0, (%rax)
je offset_end
# Write the offset to this instruction.
movw %cx, 2(%rax)
# If this was a move, nop the move and adjust the offset.
cmpb $OP_MOVE, (%rax)
je offset_move
# If this was an open or close, reset the offset (since the instruction will adjust the "base" of the pointer).
cmpb $OP_LOOP_OPEN, (%rax)
je offset_clear
cmpb $OP_LOOP_CLOSE, (%rax)
je offset_clear
# If this was a pointer seek, set the offset to negative the seek count.
# This is because we transform into a do-while loop, which means we always go too far.
cmpb $OP_POINTER_SEEK, (%rax)
je offset_seek
# Nothing interesting, just step and loop.
jmp offset_step
offset_clear:
# Simply zero offset, step and loop.
movq $0, %rcx
jmp offset_step
offset_move:
# Load the coefficient and add to the current offset.
movsbl 1(%rax), %edx
addl %edx, %ecx
# NOP out this pointer move, then loop.
movb $OP_NOP, (%rax)
jmp offset_step
offset_seek:
# Load the coefficient and negate.
movsbl 1(%rax), %edx
negl %edx
# Set the new offset.
movq %rdx, %rcx
jmp offset_step
offset_step:
# Goto the next IR node, then loop.
addq $4, %rax
jmp offset_check
offset_end:
# We do not have a result.
movq %rbp, %rsp
popq %rbp
ret
# This is a helper function that takes a function pointer in rsi and runs it on the
# start of every loop contained in rdi. This is extracted into a helper function since
# we need to iterate over the code twice: once for loop unrolling and once for if
# conversion.
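#
# In hedged C terms (hypothetical signature), this function amounts to:
#
#   void for_every_loop_run(struct ir_node *ir, void (*fn)(struct ir_node *)) {
#       for (; ir->opcode; ir++)
#           if (ir->opcode == OP_LOOP_OPEN) fn(ir);
#   }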
# ----------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to the IR array
# rsi: a function pointer to be run on every OP_LOOP_OPEN node
for_every_loop_run:
pushq %rbp
movq %rsp, %rbp
# Save rdi; it is caller-saved and we modify it here.
subq $8, %rsp
pushq %rdi
for_loops_body:
# stop if we've reached the end
cmpb $0, (%rdi)
je for_loops_end
# next if this isn't loop open, else call the pointer
# the argument is already in rdi since we work in rdi
cmpb $OP_LOOP_OPEN, (%rdi)
jne for_loops_step
call *%rsi
for_loops_step:
# Go to the next node and loop.
addq $4, %rdi
jmp for_loops_body
for_loops_end:
# Cleanup our rdi and leave.
popq %rdi
movq %rbp, %rsp
popq %rbp
ret
# Takes a pointer to the node at the beginning of a balanced loop and checks if it
# can potentially be removed completely (technically unrolling is not the valid
# term for the operation but ¯\_(ツ)_/¯). This construct optimizes the following
# kind of loops: [->>+>+++<<<].
#
# Basically any loop where the total pointer moves cancel out and the total adjustment
# to the cell at the beginning of the loop is -1 can be proven to run N times, where
# N is the current value of the cell at "index" 0. As such, the loop given in the
# example can be optimized to:
# p[2] += p[0]
# p[3] += p[0] * 3
# p[0] = 0
#
# This removes the need to run the loop completely. This function first goes through the
# loop and determines whether it has no nested loops, I/O or other constructs that we cannot
# translate appropriately. If this is the case, the [ and ] are replaced with their unroll
# variants and every +/- is replaced with either a CELL_ADD (if the adjustment is 1) or a
# MULT_WRITE (if the coefficient is anything else). Any adjustment to the cell at index 0
# is replaced with a NOP, since the unroll-end opcode zeroes that cell anyway; leaving the
# adjustment in would make the program incorrect.
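#
# In hedged C terms, the emitted code for the example then amounts to:
#
#   uint8_t n = p[0];            /* OP_UNROLL_LOOP_BEGIN caches the count */
#   p[2] += n;                   /* OP_CELL_ADD                           */
#   p[3] += (uint8_t)(n * 3);    /* OP_MULT_WRITE                         */
#   p[0] = 0;                    /* OP_UNROLL_LOOP_END                    */
#
# which is also correct when n == 0, since every adjustment is scaled by n.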
# ----------------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to the IR array, pointing at the opening [ of a loop
possibly_unroll_loop:
pushq %rbp
movq %rsp, %rbp
# rax: The current instruction.
# rcx: Start of loop (needed later).
# rdx: Tracks modifications made against p[0] to see if they cancel out.
movq %rdi, %rax
movq %rdi, %rcx
movq $0, %rdx
# Move past the opening [
addq $4, %rax
unroll_loop_check_body:
# If we haven't returned yet and the loop closed, this is seemingly valid.
# We say 'seemingly', since we still need to check if the base doesn't move and
# if the total adjustments to the base totals to -1.
cmpb $OP_LOOP_CLOSE, (%rax)
je unroll_loop_seemingly_valid
# If this is a '.', '[' or ',', we cannot unroll; give up and leave.
# A cell write only prevents unrolling when it targets the base, which
# maybe_write_cell_leave checks below.
cmpb $OP_READ, (%rax)
je unroll_leave
cmpb $OP_PRINT, (%rax)
je unroll_leave
cmpb $OP_LOOP_OPEN, (%rax)
je unroll_leave
cmpb $OP_WRITE_CELL, (%rax)
je maybe_write_cell_leave
# If this is a write, we might need to adjust the base.
cmpb $OP_WRITE, (%rax)
je maybe_adjust_base
# Nothing interesting, still valid so far. Let's continue.
jmp unroll_loop_check_step
maybe_write_cell_leave:
# This was a write cell. If we write to the base, it destroys
# the assumption that this runs N times. As such, we need to abort.
cmpw $0, 2(%rax)
je unroll_leave
# It's fine, this operation doesn't prevent us from unrolling.
jmp unroll_loop_check_step
maybe_adjust_base:
# If this doesn't adjust the base, no worries, the loop is still unrollable.
cmpw $0, 2(%rax)
jne unroll_loop_check_step
# Add the coefficient to dl.
addb 1(%rax), %dl
# <fallthrough to step>
unroll_loop_check_step:
# Go to the next node and loop.
addq $4, %rax
jmp unroll_loop_check_body
unroll_loop_seemingly_valid:
# At this point, the loop does not contain invalid instructions.
# We still need to check if the offset at the end returns to 0 and
# if the base pointer has a total difference of -1.
cmpb $-1, %dl
jne unroll_leave
cmpw $0, 2(%rax)
jne unroll_leave
# Start at begin of loop again, replace [ with the unroll start node.
movq %rcx, %rax
movb $OP_UNROLL_LOOP_BEGIN, (%rax)
movb $0, 1(%rax)
# Again, skip the [.
addq $4, %rax
unroll_valid_write_body:
# If we've reached the end, complete and return.
cmpb $OP_LOOP_CLOSE, (%rax)
je unroll_valid_write_end
cmpb $OP_WRITE_CELL, (%rax)
je unroll_valid_write_cell_write
# If this is not a write, we don't have to adjust it. Simply go to the step.
cmpb $OP_WRITE, (%rax)
jne unroll_valid_write_step
# If the value is 1, we do a cell add instead of a multiplication.
cmpb $1, 1(%rax)
je unroll_valid_write_cell_add
# If we write to the "base" (index 0), we need to replace the op with a nop.
cmpw $0, 2(%rax)
je unroll_valid_write_nop
# This is a write and its value isn't 1, so we replace it with the multiplication opcode instead.
movb $OP_MULT_WRITE, (%rax)
# Step.
jmp unroll_valid_write_step
unroll_valid_write_cell_write:
movb $OP_UNROLL_SET, (%rax)
jmp unroll_valid_write_step
unroll_valid_write_cell_add:
# Change the opcode to cell add and step.
movb $OP_CELL_ADD, (%rax)
jmp unroll_valid_write_step
unroll_valid_write_nop:
# Change the opcode to nop and step.
movb $OP_NOP, (%rax)
# <fallthrough to step>
unroll_valid_write_step:
# Go to the next node and loop.
addq $4, %rax
jmp unroll_valid_write_body
unroll_valid_write_end:
# Replace the closing ']' with an unroll end opcode.
movb $OP_UNROLL_LOOP_END, (%rax)
# <fallthrough to leave>
unroll_leave:
# Cleanup and leave.
movq %rbp, %rsp
popq %rbp
ret
# Starts at the beginning of a loop and tries to determine if we can statically prove
# that the loop only runs once. A loop only runs once if its base is set to 0 within
# the loop body, as such:
# while (*p) { /* do something */; *p = 0; }
#
# It should be obvious that this can be converted to
# if (*p) { /* do something */; *p = 0; }
# which eliminates one check and jump that will never be taken.
#
# This simply goes through all the instructions in the loop and aborts if there's either
# a nested loop or any sort of I/O. It also keeps track of the "validity" of the loop,
# making it valid if the value is zero'd and invalid if something is written to the base.
# ---------------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to the IR array, pointing at the opening [ of a loop
possibly_convert_to_if:
pushq %rbp
movq %rsp, %rbp
# rax: Pointer to the current instruction.
# rcx: Copy of the start instruction, for later.
# rdx: Whether the loop is "valid" (1 if true, 0 if not)
movq %rdi, %rax
movq %rdi, %rcx
movq $0, %rdx
# Move past the opening [
addq $4, %rax
convert_body:
# Stop checking if we've reached the end of the loop.
cmpb $OP_LOOP_CLOSE, (%rax)
je convert_loop_end
# If this is a read, print or nested loop, abort.
cmpb $OP_READ, (%rax)
je convert_leave
cmpb $OP_PRINT, (%rax)
je convert_leave
cmpb $OP_LOOP_OPEN, (%rax)
je convert_leave
# If this is a write, mark as invalid if the offset is 0.
cmpb $OP_WRITE, (%rax)
je convert_set_maybe_invalid
# If this is an unrolled loop or a [-], mark as valid if they act on offset 0.
cmpb $OP_UNROLL_LOOP_BEGIN, (%rax)
je convert_set_maybe_valid
cmpb $OP_WRITE_CELL, (%rax)
je convert_set_maybe_valid
# Nothing interesting, move to step.
jmp convert_step
convert_set_maybe_invalid:
# If the offset isn't 0, this doesn't invalidate the loop.
cmpw $0, 2(%rax)
jne convert_step
# Set as invalid and step.
movq $0, %rdx
jmp convert_step
convert_set_maybe_valid:
# If the offset isn't 0, this write doesn't touch the base at all.
cmpw $0, 2(%rax)
jne convert_step
# A nonzero write to the base makes the loop invalid again; reuse the
# invalidation path (its offset check passes, since we know the offset is 0).
cmpb $0, 1(%rax)
jne convert_set_maybe_invalid
# The base is zeroed here: mark as valid and step.
movq $1, %rdx
jmp convert_step
convert_step:
# Go to the next node and loop.
addq $4, %rax
jmp convert_body
convert_loop_end:
# If it's currently invalid or the moves don't cancel out, abort.
cmpq $0, %rdx
je convert_leave
cmpw $0, 2(%rax)
jne convert_leave
# Replace the ] with an if end, which compiles differently.
movb $OP_IF_END, (%rax)
# <fallthrough to leave>
convert_leave:
# Cleanup and leave.
movq %rbp, %rsp
popq %rbp
ret
# Since we need to allocate an executable block of memory for our JITted code, we
# use this function to compute how big the allocated block needs to be. This function
# operates on the fully optimized code and will return wrong results for other IRs.
# It uses a simple lookup table to get a fixed size for every instruction. This
# greatly decreases complexity at the cost of slightly over-estimating (it reserves
# room for a lea even when the offset is 0 and compile_bf skips emitting it).
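#
# In hedged C terms, this function amounts to:
#
#   size_t size = 1;                          /* the final ret */
#   for (struct ir_node *n = ir; n->opcode; n++)
#       size += CODE_SIZES[n->opcode];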
# -----------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to "ast"
compute_jit_size:
pushq %rbp
movq %rsp, %rbp
# We need at least one byte for the return opcode.
movq $1, %rax
movq %rdi, %rcx
size_check:
# If we've reached the end, return.
cmpb $0, (%rcx)
je size_end
# edx = zero_extend(node->opcode)
movzbl (%rcx), %edx
# edx = CODE_SIZES[edx]
movzbl CODE_SIZES(%rdx), %edx
# Add the size to the current total.
addq %rdx, %rax
size_step:
# Simply goto the next value.
addq $4, %rcx
jmp size_check
size_end:
movq %rbp, %rsp
popq %rbp
ret
# Actually does the JIT compilation on the optimized IR. This allocates an executable
# and writable block of memory, then writes the raw bytes/opcodes for the CPU instructions
# that correspond to the specified brainfuck instruction. This is technically probably
# an assembler, not a compiler.
# ----------------------------------------------------------------------------------------
# Arguments:
# rdi: a pointer to the optimized IR array
compile_bf:
pushq %rbp
movq %rsp, %rbp
# We store the current instruction to compile in rbx, the executable block in r10
# and the beginning of the executable block in r11. rbx is callee-saved; r10 and r11
# are caller-saved, but nothing we call after setting them can clobber them.
subq $8, %rsp
pushq %rbx
pushq %r10
pushq %r11
# Find out how much space we need to allocate.
movq %rdi, %rbx
call compute_jit_size
# Call mmap to allocate our memory.
# Equivalent to calling:
# mmap(NULL, len, PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE, -1, 0);
movq %rax, %rsi
movl $0, %edi
movl $6, %edx
movl $34, %ecx
movl $-1, %r8d
movl $0, %r9d
call mmap
# Put the allocated memory in r10 and r11.
movq %rax, %r10
movq %rax, %r11
compile_check:
# If we've reached the end, return.
cmpb $0, (%rbx)
je compile_end
# For every potential operation, run the compilation label.
cmpb $OP_WRITE, (%rbx)
je compile_write
cmpb $OP_READ, (%rbx)
je compile_read
cmpb $OP_PRINT, (%rbx)
je compile_print
cmpb $OP_LOOP_OPEN, (%rbx)
je compile_lbracket
cmpb $OP_LOOP_CLOSE, (%rbx)
je compile_rbracket
cmpb $OP_WRITE_CELL, (%rbx)
je compile_write_cell
cmpb $OP_MULT_WRITE, (%rbx)
je compile_mult_write
cmpb $OP_UNROLL_LOOP_BEGIN, (%rbx)
je compile_unroll_begin
cmpb $OP_UNROLL_LOOP_END, (%rbx)
je compile_unroll_end
cmpb $OP_CELL_ADD, (%rbx)
je compile_cell_add
cmpb $OP_IF_END, (%rbx)
je compile_if_end
cmpb $OP_UNROLL_SET, (%rbx)
je compile_unroll_set
cmpb $OP_POINTER_SEEK, (%rbx)
je compile_pointer_seek
# This was an operation that we didn't recognize/handle (probably a NOP). Simply ignore it and loop.
jmp compile_step
compile_write:
# 0x80 0x83 0xAA 0xAA 0xAA 0xAA 0xBB = addb 0xBB, 0xAA(%rbx)
movw $0x8380, (%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 2(%r10)
# Write the coefficient.
movsbl 1(%rbx), %eax
movb %al, 6(%r10)
# Advance 7 bytes and loop again.
addq $7, %r10
jmp compile_step
compile_read:
# 41 ff d5 == callq *%r13
# 88 83 AA AA AA AA = mov %al, AA(%rbx)
movl $0x00d5ff41, (%r10)
movw $0x8388, 3(%r10)
# Write the offset for the target.
movswl 2(%rbx), %eax
movl %eax, 5(%r10)
# Advance 9 bytes, then loop.
addq $9, %r10
jmp compile_step
compile_print:
# 0f b6 bb AA AA AA AA movzbl AA(%rbx),%edi
# 41 ff d4 callq *%r12
movl $0x00bbb60f, (%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 3(%r10)
# Write the call operation.
movl $0x00d4ff41, 7(%r10)
# Advance 10 bytes, loop.
addq $10, %r10
jmp compile_step
compile_lbracket:
# Skip emitting the lea if we don't need it.
cmpw $0, 2(%rbx)
je compile_lbracket_test
# 48 8d 9b AA AA AA AA lea AA(%rbx),%rbx
# We use lea since it does computation in the decode stage, which helps pipelining.
movl $0x009b8d48, (%r10)
# Write the offset for the lea.
movswl 2(%rbx), %eax
movl %eax, 3(%r10)
addq $7, %r10
compile_lbracket_test:
# Write the test-and-branch:
# 8a 03                 mov (%rbx),%al
# 84 c0                 test %al,%al
# 0f 84 OF FF SS ET     je <offset>
movw $0x038a, (%r10)
movw $0xc084, 2(%r10)
movw $0x840f, 4(%r10)
movl $0x00, 6(%r10)
# Advance the 10 bytes.
addq $10, %r10
# Push our current position on the stack. We use the stack to keep track of bracket pairs.
# Once we reach the closing bracket, we pop, compute offsets and write the j(n)e targets.
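# For example, if the je's rel32 ends at address A and the jne's rel32 ends at
# address B, the je gets (B - A) and the jne gets (A - B), so each jump lands
# just past the other bracket's test.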
pushq %r10
jmp compile_step
compile_rbracket:
# Skip emitting the lea if we don't need it.
cmpw $0, 2(%rbx)
je compile_rbracket_test
# 48 8d 9b AA AA AA AA lea AA(%rbx),%rbx
# We use lea since it does computation in the decode stage, which helps pipelining.
movl $0x009b8d48, (%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 3(%r10)
addq $7, %r10
compile_rbracket_test:
# Write the test-and-branch:
# 8a 03                 mov (%rbx),%al
# 84 c0                 test %al,%al
# 0f 85 OF FF SS ET     jne <offset>
movw $0x038a, (%r10)
movw $0xc084, 2(%r10)
movw $0x850f, 4(%r10)
movl $0x00, 6(%r10)
addq $10, %r10
# Find out what the code address was of our starting position.
popq %rcx
# Find out the difference between our current address and the starting one.
movq %r10, %rax
subq %rcx, %rax
# Write the positive offset at the starting location.
movl %eax, -4(%rcx)
# Now negate it and write the negative offset at our current location.
negl %eax
movl %eax, -4(%r10)
jmp compile_step
compile_write_cell:
# c6 83 AA AA AA AA BB = movb BB,AA(%rbx)
movw $0x83c6, (%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 2(%r10)
# Write the value.
movsbl 1(%rbx), %eax
movb %al, 6(%r10)
# Advance 7 bytes and loop.
addq $7, %r10
jmp compile_step
compile_mult_write:
# b8 XX XX XX XX mov XX,%eax
movw $0xb8, (%r10)
# Write the value.
movsbl 1(%rbx), %eax
movl %eax, 1(%r10)
# 0f af c2 imul %edx,%eax
movl $0x00c2af0f, 5(%r10)
# 00 83 XX XX XX XX add %al,XX(%rbx)
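# (The add's 0x00 opcode byte is the high byte of the imul movl above, so
# only the 0x83 ModRM byte needs to be written here.)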
movb $0x83, 9(%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 10(%r10)
# Advance 14 bytes and loop.
addq $14, %r10
jmp compile_step
compile_unroll_begin:
# Do not emit the lea if we do not need it.
cmpw $0, 2(%rbx)
je compile_unroll_begin_mov
# 48 8d 9b AA AA AA AA  lea AA(%rbx),%rbx
movl $0x009b8d48, (%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 3(%r10)
addq $7, %r10
compile_unroll_begin_mov:
# Cache the loop count p[0] in edx for the unrolled body.
# 0f b6 13              movzbl (%rbx),%edx
movl $0x0013b60f, (%r10)
# Advance 3 bytes and loop.
addq $3, %r10
jmp compile_step
compile_unroll_end:
# Zero the cell at index 0.
# c6 03 00 movb $0x0,(%rbx)
movl $0x000003c6, (%r10)
addq $3, %r10
jmp compile_step
compile_cell_add:
# Simply add the value in %dl to the offset value.
# 00 93 XX XX XX XX     add %dl,XX(%rbx)
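# (The leading 0x00 opcode byte is never written: MAP_ANON memory is
# zero-filled and nothing has touched this byte yet, so it is already 0.)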
movb $0x93, 1(%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 2(%r10)
addq $6, %r10
jmp compile_step
compile_if_end:
# Find out what the code address was of our starting position.
popq %rcx
# Find out the difference between our current address and the starting one.
movq %r10, %rax
subq %rcx, %rax
# Write the offset at the starting location.
movl %eax, -4(%rcx)
# We do not need to write anything else, so we don't adjust r10 and simply loop.
jmp compile_step
compile_unroll_set:
# b0 XX                 mov $XX,%al
movb $0xb0, (%r10)
# Write the value (the movl stores 4 bytes, but the next movl overwrites all but the imm8).
movsbl 1(%rbx), %eax
movl %eax, 1(%r10)
# 0f b6 8b XX XX XX XX  movzbl XX(%rbx),%ecx
movl $0x008bb60f, 2(%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 5(%r10)
# 84 d2                 test %dl,%dl
# 66 0f 45 c8           cmovne %ax,%cx
# 88 8b XX XX XX XX     mov %cl,XX(%rbx)
# The cmovne only takes the new value when the loop counter is nonzero,
# which is what makes this set conditional.
movw $0xd284, 9(%r10)
movl $0xc8450f66, 11(%r10)
movw $0x8b88, 15(%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 17(%r10)
# Advance 21 bytes and loop.
addq $21, %r10
jmp compile_step
compile_pointer_seek:
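# In hedged C terms, the emitted seek loop is:
#
#   do { nonzero = (*p != 0); p += coeff; } while (nonzero);
#
# i.e. the pointer always steps once past the zero cell; compute_offsets
# compensated for this by starting subsequent offsets at -coeff.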
# Do not emit the lea if we do not need it.
cmpw $0, 2(%rbx)
je compile_pointer_seek_body
# 48 8d 9b AA AA AA AA  lea AA(%rbx),%rbx
movl $0x009b8d48, (%r10)
# Write the offset.
movswl 2(%rbx), %eax
movl %eax, 3(%r10)
addq $7, %r10
compile_pointer_seek_body:
# 80 3b 00              cmpb $0x0,(%rbx)
movl $0x00003b80, (%r10)
# 48 8d 5b XX           lea XX(%rbx),%rbx
movl $0x005b8d48, 3(%r10)
# Write the value.
movsbl 1(%rbx), %eax
movb %al, 6(%r10)
# jne <back>. The code size is 13 bytes, so the offset back is -13.
movw $0x850f, 7(%r10)
movl $-13, 9(%r10)
# Advance 13 bytes and step.
addq $13, %r10
jmp compile_step
compile_step:
# Goto the next IR node and actually loop.
addq $4, %rbx
jmp compile_check
compile_end:
# Write ret to last byte.
movb $0xC3, (%r10)
# Put resulting buffer in rax
movq %r11, %rax
# Cleanup, return.
popq %r11
popq %r10
popq %rbx
movq %rbp, %rsp
popq %rbp
ret
# Since our JITted block of memory makes certain assumptions about the environment
# (such as pointers to certain functions being present in registers), this function
# acts as a "trampoline" that takes a block of JITted instructions and runs the code.
# -----------------------------------------------------------------------------------
# Arguments:
# rdi: pointer to executable block of memory with instructions
run_compiled_bf:
pushq %rbp
movq %rsp, %rbp
# Prepare the stack and registers. We purposefully misalign by 8,
# so that using call will align the stack by pushing the return address.
pushq %rbx
pushq %r12
pushq %r13
# Our compiled code expects the cell pointer in rbx, putchar in r12 and getchar in r13.
leaq CELLS, %rbx
movq $putchar, %r12
movq $getchar, %r13
# Call our compiled code. The stack is not currently aligned, but pushing the return address will align it properly.
call *%rdi
# Cleanup and return.
popq %r13
popq %r12
popq %rbx
movq %rbp, %rsp
popq %rbp
ret
# Takes a simple brainfuck char* input and parses it into IR, optimizes it and then
# compiles and runs it.
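#
# In hedged C terms, the pipeline below is:
#
#   ir = build_representation(src);
#   pre_offset_optimizations(ir);
#   compute_offsets(ir);
#   for_every_loop_run(ir, possibly_unroll_loop);
#   for_every_loop_run(ir, possibly_convert_to_if);
#   run_compiled_bf(compile_bf(ir));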
# -----------------------------------------------------------------------------------
# Arguments:
# rdi: char* to execute
brainfuck:
pushq %rbp
movq %rsp, %rbp
# Build our representation.
call build_representation
movq %rax, %rdi
# Optimize our representation. These functions happen to preserve rdi, and we
# don't care about the return values, so we can just call every function in order.
call pre_offset_optimizations
call compute_offsets
movq $possibly_unroll_loop, %rsi
call for_every_loop_run
movq $possibly_convert_to_if, %rsi
call for_every_loop_run
# Compile the program
call compile_bf
# Run the program
movq %rax, %rdi
call run_compiled_bf
movq %rbp, %rsp
popq %rbp
ret