Last active
April 9, 2018 22:29
-
-
Save molenzwiebel/748e336a1bb465690b477ef1c42bed4c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ==================================================================================================== | |
# ==================================================================================================== | |
# ==================================================================================================== | |
# ==================================================================================================== | |
# ==================================================================================================== | |
# ===================================== README README README README ================================== | |
# ==================================================================================================== | |
# == IT IS HIGHLY RECOMMENDED THAT YOU READ THIS CODE WHILE READING THE ACCOMPANYING README FILE AT == | |
# == THE SAME TIME. IT WILL EXPLAIN SOME CONCEPTS USED IN THIS CODE, AND WHILE THE COMMENTS ABOVE == | |
# == FUNCTIONS CONTAIN A LOT OF CONTEXT, THEY WILL NOT EXPLAIN EVERYTHING. ALSO: I SPENT A LOT OF == | |
# == TIME ON THE README, SO I'D LOVE IT IF SOMEONE ACTUALLY READ IT AND APPRECIATED THE EFFORT. == | |
# == FIND THE README AT: https://gist.github.com/molenzwiebel/2839f812b1d795ac22ee26a49d8e82ba == | |
# ==================================================================================================== | |
# ==================================================================================================== | |
# ==================================================================================================== | |
# ==================================================================================================== | |
# ==================================================================================================== | |
# ==================================================================================================== | |
.bss | |
# The 30.000 signed 8-bit cells we use. | |
CELLS: .skip 30000 | |
# Does nothing. Used for optimized out instructions so we don't have to reorder arrays. | |
.equ OP_NOP, 1 | |
# A simple + or - instruction. Is actually an add, not a write. | |
.equ OP_WRITE, 2 | |
# Moves the current cell pointer (> and <). Optimized out by using offsets. | |
.equ OP_MOVE, 3 | |
# Reads a char from stdin (,) | |
.equ OP_READ, 4 | |
# Writes a char to stdout (.) | |
.equ OP_PRINT, 5 | |
# A simple loop open ([) | |
.equ OP_LOOP_OPEN, 6 | |
# A simple loop close (]) | |
.equ OP_LOOP_CLOSE, 7 | |
# A direct write to a cell, instead of an addition. p[offset] = coeff | |
.equ OP_WRITE_CELL, 8 | |
# A multiplicative addition to a cell, instead of a normal addition. | |
.equ OP_MULT_WRITE, 9 | |
# Beginning of a simple loop that can be proven to run N times. | |
.equ OP_UNROLL_LOOP_BEGIN, 10 | |
# End of a simple loop that can be proven to run N times. | |
.equ OP_UNROLL_LOOP_END, 11 | |
# Within an unrolled loop, an operation that simply adds a cell. This is OP_MULT_WRITE but with metadata of 1. | |
.equ OP_CELL_ADD, 12 | |
# Signifies the end of a "loop" that can be proven to only run once. | |
.equ OP_IF_END, 13 | |
# A conditional set within an unrolled loop. Different from normal sets since it needs to be executed conditionally. | |
.equ OP_UNROLL_SET, 14 | |
# A loop that searches for a 0 cell. Equivalent to [>>>>], with coeff being the amount of moves inside the loop. | |
.equ OP_POINTER_SEEK, 15 | |
.text | |
# Lookup table for JIT code sizes for every instruction. | |
CODE_SIZES: .byte 0 # index 0 | |
.byte 0 # nop | |
.byte 7 # write | |
.byte 0 # move | |
.byte 9 # read | |
.byte 10 # print | |
.byte 17 # loop open | |
.byte 17 # loop close | |
.byte 7 # write cell | |
.byte 14 # mult write | |
.byte 10 # unroll loop start | |
.byte 3 # unroll loop end | |
.byte 6 # cell add | |
.byte 21 # unroll set | |
.byte 20 # pointer seek | |
.global brainfuck | |
.global build_representation | |
.global pre_offset_optimizations | |
.global compute_offsets | |
.global possibly_unroll_loop | |
.global possibly_convert_to_if | |
.global compile_bf | |
.global compute_jit_size | |
# Converts the textual brainfuck representation into an IR (intermediate representation) | |
# corresponding to the brainfuck. The IR consists of 4 bytes, which looks like: | |
# 1 byte: opcode | |
# 1 byte: coefficient | |
# 2 bytes: offset | |
# | |
# The opcode is one of the opcodes defined at the top of this file. This function already | |
# does some initial optimizations, most notably collapsing the +- and <> instructions | |
# and writing the amount to the coefficient field. ++- results into OP_WRITE with an | |
# coefficient of one. | |
# | |
# Zeroing loops ([-]) are also detected and optimized in this program. All other constructs | |
# are translated directly. | |
# | |
# The offset field is written as 0, since offsets will be computed later. | |
# ----------------------------------------------------------------------------------------- | |
# Arguments: | |
# rdi: a pointer to brainfuck source to optimize | |
# Returns: | |
# rax: a pointer to the optimized version of the source | |
build_representation: | |
pushq %rbp | |
movq %rsp, %rbp | |
# Save rdi since we allocate a new buffer, which | |
# destroys rdi even though we need it later. | |
subq $8, %rsp | |
pushq %rdi | |
# Allocate a new result buffer. First, check how long the source is. | |
call strlen | |
# Add one for the null terminator (in case we don't do a single optimization). | |
# Then, multiply by four since we need 4 bytes for every original instruction in the worst case. | |
addq $1, %rdi | |
shlq $2, %rdi | |
# Allocate len * 4 + 1 bytes and zero them. | |
movq $1, %rsi | |
call calloc | |
# The old initial source is in rcx, new buffer in rax/ | |
popq %rcx | |
# Save buffer, so we can return the start later. | |
pushq %rax | |
build_loop_check: | |
# Stop once we've reached the end. | |
cmpb $0, (%rcx) | |
je build_loop_end | |
# *ptr == '[', check for the [-] construct | |
cmpb $91, (%rcx) | |
je maybe_zero_cell | |
# *ptr == ']', no optimization possible (copy the char over) | |
cmpb $93, (%rcx) | |
je build_rbracket | |
# *ptr == ',', no optimization possible (copy the char over) | |
cmpb $44, (%rcx) | |
je build_read | |
# *ptr == '.', no optimization possible (copy the char over) | |
cmpb $46, (%rcx) | |
je build_print | |
# *ptr == '+', collapse subsequent instructions | |
cmpb $43, (%rcx) | |
je collapse_write | |
# *ptr == '-', collapse subsequent instructions | |
cmpb $45, (%rcx) | |
je collapse_write | |
# *ptr == '>', collapse subsequent instructions | |
cmpb $62, (%rcx) | |
je collapse_ptr | |
# *ptr == '<', collapse subsequent instructions | |
cmpb $60, (%rcx) | |
je collapse_ptr | |
# This wasn't a brainfuck instruction so ignore it alltogether. | |
# Simply go to the next character and loop again. | |
incq %rcx | |
jmp build_loop_check | |
collapse_write: | |
# count == 0, then do a while loop | |
movq $0, %rdx | |
collapse_write_check: | |
# to prevent overflow (we only have 1 byte for length), stop once we reach 100 | |
# '+' * 130 will just optimize into [OP_WRITE(100), OP_WRITE(30)], which isn't a big deal | |
cmpb $-100, %dl | |
jl collapse_write_after | |
cmpb $100, %dl | |
jg collapse_write_after | |
# if this is + or -, adjust our count and loop | |
cmpb $43, (%rcx) | |
je collapse_write_incr | |
cmpb $45, (%rcx) | |
je collapse_write_decr | |
# else, we've reached the end of the collapsible strips, so stop | |
jmp collapse_write_after | |
collapse_write_incr: | |
# Add one to the count, goto next character. | |
addb $1, %dl | |
addq $1, %rcx | |
jmp collapse_write_check | |
collapse_write_decr: | |
# Subtract one from the count, goto next character. | |
subb $1, %dl | |
addq $1, %rcx | |
jmp collapse_write_check | |
collapse_write_after: | |
# We've gone too far, go back one character. | |
subq $1, %rcx | |
# Write OP_WRITE and the count to the IR node. | |
movb $OP_WRITE, (%rax) | |
movb %dl, 1(%rax) | |
jmp build_loop_step | |
collapse_ptr: | |
# Again, count == 0 and while. | |
movq $0, %rdx | |
collapse_ptr_check: | |
# Again, prevent overflow by optimizing a bit less. See optimize_write_check for more details. | |
cmpb $-100, %dl | |
jl collapse_ptr_after | |
cmpb $100, %dl | |
jg collapse_ptr_after | |
# If > or <, adjust our count. | |
cmpb $62, (%rcx) | |
je collapse_ptr_incr | |
cmpb $60, (%rcx) | |
je collapse_ptr_decr | |
# We've reached the end of the strip. | |
jmp collapse_ptr_after | |
collapse_ptr_incr: | |
# count++, goto next character. | |
addb $1, %dl | |
addq $1, %rcx | |
jmp collapse_ptr_check | |
collapse_ptr_decr: | |
# count--, go to next character | |
subb $1, %dl | |
addq $1, %rcx | |
jmp collapse_ptr_check | |
collapse_ptr_after: | |
# Gone too far, subtract one. | |
subq $1, %rcx | |
# Write OP_MOVE(count) to our result. | |
movb $OP_MOVE, (%rax) | |
movb %dl, 1(%rax) | |
jmp build_loop_step | |
maybe_zero_cell: | |
# If the next two bytes aren't -], this is not a [-] construct | |
# we are willing to optimize. Simply copy the current [ over normally | |
# and the next iterations will copy the -]s over. | |
cmpb $45, 1(%rcx) | |
jne build_lbracket | |
cmpb $93, 2(%rcx) | |
jne build_lbracket | |
# Write OP_WRITE_CELL with the value 0. | |
movb $OP_WRITE_CELL, (%rax) | |
addq $2, %rcx | |
jmp build_loop_step | |
build_lbracket: | |
# Simply copy over. | |
movb $OP_LOOP_OPEN, (%rax) | |
jmp build_loop_step | |
build_rbracket: | |
# Simply copy over. | |
movb $OP_LOOP_CLOSE, (%rax) | |
jmp build_loop_step | |
build_print: | |
# Simply copy over. | |
movb $OP_PRINT, (%rax) | |
jmp build_loop_step | |
build_read: | |
# Simply copy over. | |
movb $OP_READ, (%rax) | |
jmp build_loop_step | |
build_loop_step: | |
# The loop step, go to the next character and IR node and loop back. | |
addq $4, %rax | |
incq %rcx | |
jmp build_loop_check | |
build_loop_end: | |
# We stored the beginning of the newly allocated buffer on the stack. | |
# We pop it off now so we can return it into rax. | |
popq %rax | |
movq %rbp, %rsp | |
popq %rbp | |
ret | |
# Does a set of optimizations on the converted IR that need to be done before | |
# computing the pointer offsets and zeroing the moves. Most of these rely on | |
# moves to still exist in the IR, which are optimized out when we compute offsets. | |
# | |
# The following constructs are optimized here: | |
# - '++--' to OP_NOP | |
# - '><><' to OP_NOP | |
# - '[-]+++' to OP_WRITE_CELL(3) | |
# - '][-]' into OP_LOOP_CLOSE, OP_NOP | |
# -------------------------------------------------------------------------------- | |
# Arguments: | |
# rdi: a pointer to the IR array | |
pre_offset_optimizations: | |
pushq %rbp | |
movq %rsp, %rbp | |
movq %rdi, %rax | |
pre_offset_check: | |
cmpb $0, (%rax) | |
je pre_offset_end | |
# Check if they might be useless collapsed operations. | |
cmpb $OP_WRITE, (%rax) | |
je maybe_remove_useless_collapsed | |
cmpb $OP_MOVE, (%rax) | |
je maybe_remove_useless_collapsed | |
# Maybe fuse a zero set and a write into a single op. | |
cmpb $OP_WRITE_CELL, (%rax) | |
je maybe_fuse_zero_add | |
# Check if this is a pointer seek loop. | |
cmpb $OP_LOOP_OPEN, (%rax) | |
je maybe_pointer_seek | |
# Maybe remove a useless zeroing after the closing of a loop. | |
# (we already know the cell will be zero after a loop). | |
cmpb $OP_LOOP_CLOSE, (%rax) | |
je maybe_remove_useless_zero | |
jmp pre_offset_step | |
maybe_pointer_seek: | |
# If the next ins isn't move or the one after isn't a loop close, abort. | |
cmpb $OP_MOVE, 4(%rax) | |
jne pre_offset_step | |
cmpb $OP_LOOP_CLOSE, 8(%rax) | |
jne pre_offset_step | |
movb $OP_POINTER_SEEK, (%rax) | |
movb $OP_NOP, 4(%rax) | |
movb $OP_NOP, 8(%rax) | |
# Copy over the coefficient from the move. | |
movzbl 5(%rax), %edx | |
movb %dl, 1(%rax) | |
jmp pre_offset_step | |
maybe_remove_useless_collapsed: | |
# if the coeff isnt 0, ignore this | |
cmpb $0, 1(%rax) | |
jne pre_offset_step | |
# replace the operation with nop | |
movb $OP_NOP, (%rax) | |
jmp pre_offset_step | |
maybe_remove_useless_zero: | |
# if the next op is a write_cell(0), continue. else, go to step | |
cmpb $OP_WRITE_CELL, 4(%rax) | |
jne pre_offset_step | |
cmpb $0, 5(%rax) | |
jne pre_offset_step | |
# Skip our skipped instruction, write NOP. | |
addq $4, %rax | |
movb $OP_NOP, (%rax) | |
maybe_fuse_zero_add: | |
# If the next operation isn't a write, skip. | |
cmpb $0, 1(%rax) | |
jne pre_offset_step | |
cmpb $OP_WRITE, 4(%rax) | |
jne pre_offset_step | |
# Cannot mov two memory references, so load into edx. | |
movsbl 5(%rax), %edx | |
movb %dl, 1(%rax) | |
# Write NOP for the next instruction. | |
movb $OP_NOP, 4(%rax) | |
movb $OP_WRITE_CELL, (%rax) | |
jmp pre_offset_step | |
pre_offset_step: | |
# Simply go to the next IR node and try to optimize again. | |
addq $4, %rax | |
jmp pre_offset_check | |
pre_offset_end: | |
movq %rbp, %rsp | |
popq %rbp | |
ret | |
# One of the big performance gains we use is computing "offsets". Basically, with the | |
# following code: +>->++<, it is much more performant if we can emit | |
# p[0]++, p[1]--, p[2] += 2 and ignore the pointer movements completely. This is also | |
# great for pipelining since this way of emitting code does not create data dependencies | |
# which means the processor itself can give us "free performance". Do note that we still | |
# need to move the pointer at the end, specifically before [s and ]s (since we cannot) | |
# predict how much times a loop will run. The code emitting will insert the pointer | |
# movements based on the offset field for that op. Do note that the offset field is | |
# written for every field, even the ones that might not need it. This makes computation | |
# easier. | |
# -------------------------------------------------------------------------------------- | |
# Arguments: | |
# rdi: a pointer to the IR array | |
compute_offsets: | |
pushq %rbp | |
movq %rsp, %rbp | |
# We start at offset zero. | |
movq $0, %rcx | |
movq %rdi, %rax | |
offset_check: | |
# Stop if we've reached the end. | |
cmpb $0, (%rax) | |
je offset_end | |
# Write the offset to this instruction. | |
movw %cx, 2(%rax) | |
# If this was a move, nop the move and adjust the offset. | |
cmpb $OP_MOVE, (%rax) | |
je offset_move | |
# If this was an open or close, reset the offset (since the instruction will adjust the "base" of the pointer). | |
cmpb $OP_LOOP_OPEN, (%rax) | |
je offset_clear | |
cmpb $OP_LOOP_CLOSE, (%rax) | |
je offset_clear | |
# If this was a pointer seek, set the offset to negative the seek count. | |
# This is because we transform into a do-while loop, which means we always go too far. | |
cmpb $OP_POINTER_SEEK, (%rax) | |
je offset_seek | |
# Nothing interesting, just step and loop. | |
jmp offset_step | |
offset_clear: | |
# Simply zero offset, step and loop. | |
movq $0, %rcx | |
jmp offset_step | |
offset_move: | |
# Load the coefficient and add to the current offset. | |
movsbl 1(%rax), %edx | |
addl %edx, %ecx | |
# NOP out this pointer move, then loop. | |
movb $OP_NOP, (%rax) | |
jmp offset_step | |
offset_seek: | |
# Load the coefficient and negate. | |
movsbl 1(%rax), %edx | |
negl %edx | |
# Set the new offset. | |
movq %rdx, %rcx | |
jmp offset_step | |
offset_step: | |
# Goto the next IR node, then loop. | |
addq $4, %rax | |
jmp offset_check | |
offset_end: | |
# We do not have a result. | |
movq %rbp, %rsp | |
popq %rbp | |
ret | |
# This is a helper function that takes a function pointer in rsi and runs it on the | |
# start of every loop contained in rdi. This is extracted in a helper function since | |
# we need to iterate over the code twice: once for loop unrolling and once for if | |
# loop detection. | |
# ---------------------------------------------------------------------------------- | |
# Arguments: | |
# rdi: a pointer to the IR array | |
# rsi: a function pointer to be ran on every OP_LOOP_OPEN node | |
for_every_loop_run: | |
pushq %rbp | |
movq %rsp, %rbp | |
# save rdi, it is caller saved and we modify it here | |
subq $8, %rsp | |
pushq %rdi | |
for_loops_body: | |
# stop if we've reached the end | |
cmpb $0, (%rdi) | |
je for_loops_end | |
# next if this isn't loop open, else call the pointer | |
# the argument is already in rdi since we work in rdi | |
cmpb $OP_LOOP_OPEN, (%rdi) | |
jne for_loops_step | |
call *%rsi | |
for_loops_step: | |
# Go to the next node and loop. | |
addq $4, %rdi | |
jmp for_loops_body | |
for_loops_end: | |
# Cleanup our rdi and leave. | |
popq %rdi | |
movq %rbp, %rsp | |
popq %rbp | |
ret | |
# Takes a pointer to the node at the beginning of a balanced loop and checks if it | |
# can potentially be removed completely (technically unrolling is not the valid | |
# term for the operation but ¯\_(ツ)_/¯). This construct optimizes the following | |
# kind of loops: [->>+>+++<<<]. | |
# | |
# Basically any loop where the total pointer moves cancel out and the total adjustment | |
# to the cell at the beginning of the loop is -1 can be proven to run N times, where | |
# N is the current value of the cell at "index" 0. As such, the loop given in the | |
# example can be optimized to: | |
# p[2] += p[0] | |
# p[3] += p[0] * 3 | |
# p[0] = 0 | |
# | |
# This removes the need to run the loop completely. This function first goes through the | |
# loop and determines if it has no nested loops, I/O or other constructs that we cannot | |
# translate appropriately. If this is the case, the [ and ] are replaced with their unroll | |
# variants and every +/- is replaced with either a CELL_ADD (if the adjustment is 1) or a | |
# MULT_WRITE (if the coefficient is anything else). Any adjustment to the cell at index 0 | |
# is also replaced by UNROLL_BASE_ADJUST, since otherwise the program is not 100% correct. | |
# ---------------------------------------------------------------------------------------- | |
# Arguments: | |
# rdi: a pointer to the IR array, pointing at the opening [ of a loop | |
possibly_unroll_loop: | |
pushq %rbp | |
movq %rsp, %rbp | |
# rax: The current instruction. | |
# rcx: Start of loop (needed later). | |
# rdx: Tracks modifications made against p[0] to see if they cancel out. | |
movq %rdi, %rax | |
movq %rdi, %rcx | |
movq $0, %rdx | |
# Move past the opening [ | |
addq $4, %rax | |
unroll_loop_check_body: | |
# If we haven't returned yet and the loop closed, this is seemingly valid. | |
# We say 'seemingly', since we still need to check if the base doesn't move and | |
# if the total adjustments to the base totals to -1. | |
cmpb $OP_LOOP_CLOSE, (%rax) | |
je unroll_loop_seemingly_valid | |
# If this is a '.', '[', ',' or cell write, we cannot unroll it. | |
# Simply leave the current function (give up). | |
cmpb $OP_READ, (%rax) | |
je unroll_leave | |
cmpb $OP_PRINT, (%rax) | |
je unroll_leave | |
cmpb $OP_LOOP_OPEN, (%rax) | |
je unroll_leave | |
cmpb $OP_WRITE_CELL, (%rax) | |
je maybe_write_cell_leave | |
# If this is a write, we might need to adjust the base. | |
cmpb $OP_WRITE, (%rax) | |
je maybe_adjust_base | |
# Nothing interesting, still valid so far. Let's continue. | |
jmp unroll_loop_check_step | |
maybe_write_cell_leave: | |
# This was a write cell. If we write to the base, it destroys | |
# the assumption that this runs N times. As such, we need to abort. | |
cmpw $0, 2(%rax) | |
je unroll_leave | |
# It's fine, this operation doesn't prevent us from unrolling. | |
jmp unroll_loop_check_step | |
maybe_adjust_base: | |
# If this doesn't adjust the base, no worries, the loop is still unrollable. | |
cmpw $0, 2(%rax) | |
jne unroll_loop_check_step | |
# Add the coefficient to dl. | |
addb 1(%rax), %dl | |
# <fallthrough to step> | |
unroll_loop_check_step: | |
# Go to the next node and loop. | |
addq $4, %rax | |
jmp unroll_loop_check_body | |
unroll_loop_seemingly_valid: | |
# At this point, the loop does not contain invalid instructions. | |
# We still need to check if the offset at the end returns to 0 and | |
# if the base pointer has a total difference of -1. | |
cmpb $-1, %dl | |
jne unroll_leave | |
cmpw $0, 2(%rax) | |
jne unroll_leave | |
# Start at begin of loop again, replace [ with the unroll start node. | |
movq %rcx, %rax | |
movb $OP_UNROLL_LOOP_BEGIN, (%rax) | |
movb $0, 1(%rax) | |
# Again, skip the [. | |
addq $4, %rax | |
unroll_valid_write_body: | |
# If we've reached the end, complete and return. | |
cmpb $OP_LOOP_CLOSE, (%rax) | |
je unroll_valid_write_end | |
cmpb $OP_WRITE_CELL, (%rax) | |
je unroll_valid_write_cell_write | |
# If this is not a write, we don't have to adjust it. Simply go to the step. | |
cmpb $OP_WRITE, (%rax) | |
jne unroll_valid_write_step | |
# If the value is 1, we do a cell add instead of a multiplication. | |
cmpb $1, 1(%rax) | |
je unroll_valid_write_cell_add | |
# If we write to the "base" (index 0), we need to replace the op with a nop. | |
cmpw $0, 2(%rax) | |
je unroll_valid_write_nop | |
# This is a write and its value isn't 1, so we replace it with the multiplication opcode instead. | |
movb $OP_MULT_WRITE, (%rax) | |
# Step. | |
jmp unroll_valid_write_step | |
unroll_valid_write_cell_write: | |
movb $OP_UNROLL_SET, (%rax) | |
jmp unroll_valid_write_step | |
unroll_valid_write_cell_add: | |
# Change the opcode to cell add and step. | |
movb $OP_CELL_ADD, (%rax) | |
jmp unroll_valid_write_step | |
unroll_valid_write_nop: | |
# Change the opcode to nop and step. | |
movb $OP_NOP, (%rax) | |
# <fallthrough to step> | |
unroll_valid_write_step: | |
# Go to the next node and loop. | |
addq $4, %rax | |
jmp unroll_valid_write_body | |
unroll_valid_write_end: | |
# Replace the closing ']' with an unroll end opcode. | |
movb $OP_UNROLL_LOOP_END, (%rax) | |
# <fallthrough to leave> | |
unroll_leave: | |
# Cleanup and leave. | |
movq %rbp, %rsp | |
popq %rbp | |
ret | |
# Starts at the beginning of a loop and tries to determine if we can statically prove | |
# that the loop only runs once. A loop only runs once if it's base is set to 0 within | |
# the loop body, as such: | |
# while (*p) { /* do something */; *p = 0; } | |
# | |
# It should be obvious that that can be converted to | |
# if (*p) { /* do something */; *p = 0; } | |
# which eliminates one check and jump that will never be taken. | |
# | |
# This simply goes through all the instructions in the loop and aborts if there's either | |
# a nested loop or any sort of I/O. It also keeps track of the "validity" of the loop, | |
# making it valid if the value is zero'd and invalid if something is written to the base. | |
# --------------------------------------------------------------------------------------- | |
# Arguments: | |
# rdi: a pointer to the IR array, pointing at the opening [ of a loop | |
possibly_convert_to_if: | |
pushq %rbp | |
movq %rsp, %rbp | |
# rax: Pointer to the current instruction. | |
# rcx: Copy of the start instruction, for later. | |
# rdx: Fs the loop is "valid" (1 if true, 0 if not) | |
movq %rdi, %rax | |
movq %rdi, %rcx | |
movq $0, %rdx | |
# Move past the opening [ | |
addq $4, %rax | |
convert_body: | |
# Stop checking if we've reached the end of the loop. | |
cmpb $OP_LOOP_CLOSE, (%rax) | |
je convert_loop_end | |
# If this is a read, print or nested loop, abort. | |
cmpb $OP_READ, (%rax) | |
je convert_leave | |
cmpb $OP_PRINT, (%rax) | |
je convert_leave | |
cmpb $OP_LOOP_OPEN, (%rax) | |
je convert_leave | |
# If this is a write, mark as invalid if the offset is 0. | |
cmpb $OP_WRITE, (%rax) | |
je convert_set_maybe_invalid | |
# If this is an unrolled loop or a [-], mark as valid if they act on offset 0. | |
cmpb $OP_UNROLL_LOOP_BEGIN, (%rax) | |
je convert_set_maybe_valid | |
cmpb $OP_WRITE_CELL, (%rax) | |
je convert_set_maybe_valid | |
# Nothing interesting, move to step. | |
jmp convert_step | |
convert_set_maybe_invalid: | |
# If the offset isn't 0, this doesnt invalidate the loop. | |
cmpw $0, 2(%rax) | |
jne convert_step | |
# Set as invalid and step. | |
movq $0, %rdx | |
jmp convert_step | |
convert_set_maybe_valid: | |
# If the write isn't zero, this might make it invalid. | |
cmpb $0, 1(%rax) | |
jne convert_step | |
# If the offset isn't 0, this doesnt validate the loop. | |
cmpw $0, 2(%rax) | |
jne convert_step | |
# Set as valid and step. | |
movq $1, %rdx | |
jmp convert_step | |
convert_step: | |
# Go to the next node and loop. | |
addq $4, %rax | |
jmp convert_body | |
convert_loop_end: | |
# If it's currently invalid or the moves don't cancel out, abort. | |
cmpq $0, %rdx | |
je convert_leave | |
cmpw $0, 2(%rax) | |
jne convert_leave | |
# Replace the ] with an if end, which compiles differently. | |
movb $OP_IF_END, (%rax) | |
# <fallthrough to leave> | |
convert_leave: | |
# Cleanup and leave. | |
movq %rbp, %rsp | |
popq %rbp | |
ret | |
# Since we need to allocate an executable block of memory for our JITted code, we | |
# use this function to compute how big the allocated block needs to be. This function | |
# operates on the fully optimized code and will return wrong results for other IRs. | |
# It uses a simple lookup table to have a fixed size for every instruction. This | |
# greatly decreases complexity but misses some optimizations (it will emit a useless | |
# lea 0(rbx), rbx if the offset is 0 for loops for example). | |
# ----------------------------------------------------------------------------------- | |
# Arguments: | |
# rdi: a pointer to "ast" | |
compute_jit_size: | |
pushq %rbp | |
movq %rsp, %rbp | |
# We need at least one byte for the return opcode. | |
movq $1, %rax | |
movq %rdi, %rcx | |
size_check: | |
# If we've reached the end, return. | |
cmpb $0, (%rcx) | |
je size_end | |
# edx = sign_extend(node->opcode) | |
movzbl (%rcx), %edx | |
# edx = CODE_SIZES[edx] | |
movzbl CODE_SIZES(%rdx), %edx | |
# Add the size to the current total. | |
addq %rdx, %rax | |
size_step: | |
# Simply goto the next value. | |
addq $4, %rcx | |
jmp size_check | |
size_end: | |
movq %rbp, %rsp | |
popq %rbp | |
ret | |
# Actually does the JIT compilation on the optimized IR. This allocates an executable | |
# and writable block of memory, then writes the raw bytes/opcodes for the CPU instructions | |
# that correspond to the specified brainfuck instruction. This is technically probably | |
# an assembler, not a compiler. | |
# ---------------------------------------------------------------------------------------- | |
# Arguments: | |
# rdi: a pointer to the optimized JIT array | |
compile_bf: | |
pushq %rbp | |
movq %rsp, %rbp | |
# We store the current instruction to compile in rbx, the executable block in r10 | |
# and the beginning of the executable block in r11. These are callee saved. | |
subq $8, %rsp | |
pushq %rbx | |
pushq %r10 | |
pushq %r11 | |
# Find out how much space we need to allocate. | |
movq %rdi, %rbx | |
call compute_jit_size | |
# Call mmap to allocate our memory. | |
# Equivalent to calling: | |
# mmap(NULL, len, PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE, -1, 0); | |
movq %rax, %rsi | |
movl $0, %edi | |
movl $6, %edx | |
movl $34, %ecx | |
movl $-1, %r8d | |
movl $0, %r9d | |
call mmap | |
# Put the allocated memory in r10 and r11. | |
movq %rax, %r10 | |
movq %rax, %r11 | |
compile_check: | |
# If we've reached the end, return. | |
cmpb $0, (%rbx) | |
je compile_end | |
# For every potential operation, run the compilation label. | |
cmpb $OP_WRITE, (%rbx) | |
je compile_write | |
cmpb $OP_READ, (%rbx) | |
je compile_read | |
cmpb $OP_PRINT, (%rbx) | |
je compile_print | |
cmpb $OP_LOOP_OPEN, (%rbx) | |
je compile_lbracket | |
cmpb $OP_LOOP_CLOSE, (%rbx) | |
je compile_rbracket | |
cmpb $OP_WRITE_CELL, (%rbx) | |
je compile_write_cell | |
cmpb $OP_MULT_WRITE, (%rbx) | |
je compile_mult_write | |
cmpb $OP_UNROLL_LOOP_BEGIN, (%rbx) | |
je compile_unroll_begin | |
cmpb $OP_UNROLL_LOOP_END, (%rbx) | |
je compile_unroll_end | |
cmpb $OP_CELL_ADD, (%rbx) | |
je compile_cell_add | |
cmpb $OP_IF_END, (%rbx) | |
je compile_if_end | |
cmpb $OP_UNROLL_SET, (%rbx) | |
je compile_unroll_set | |
cmpb $OP_POINTER_SEEK, (%rbx) | |
je compile_pointer_seek | |
# This was an operation that we didn't recognize/handle (probably a NOP). Simply ignore it and loop. | |
jmp compile_step | |
compile_write: | |
# 0x80 0x83 0xAA 0xAA 0xAA 0xAA 0xBB = addb 0xBB, 0xAA(%rbx) | |
movw $0x8380, (%r10) | |
# Write the offset. | |
movswl 2(%rbx), %eax | |
movl %eax, 2(%r10) | |
# Write the coefficient. | |
movsbl 1(%rbx), %eax | |
movb %al, 6(%r10) | |
# Advance 7 bytes and loop again. | |
addq $7, %r10 | |
jmp compile_step | |
compile_read: | |
# 41 ff d5 == callq *%r13 | |
# 88 83 AA AA AA AA = mov %al, AA(%rbx) | |
movl $0x00d5ff41, (%r10) | |
movw $0x8388, 3(%r10) | |
# Write the offset for the target. | |
movswl 2(%rbx), %eax | |
movl %eax, 5(%r10) | |
# Advance 9 bytes, then loop. | |
addq $9, %r10 | |
jmp compile_step | |
compile_print: | |
# 0f b6 bb AA AA AA AA movzbl AA(%rbx),%edi | |
# 41 ff d4 callq *%r12 | |
movl $0x00bbb60f, (%r10) | |
# Write the offset. | |
movswl 2(%rbx), %eax | |
movl %eax, 3(%r10) | |
# Write the call operation. | |
movl $0x00d4ff41, 7(%r10) | |
# Advance 10 bytes, loop. | |
addq $10, %r10 | |
jmp compile_step | |
compile_lbracket: | |
# Skip emitting the lea if we don't need it. | |
cmpw $0, 2(%rbx) | |
je compile_lbracket_test | |
# 48 8d 9b AA AA AA AA lea AA(%rbx),%rbx | |
# We use lea since it does computation in the decode stage, which helps pipelining. | |
movl $0x009b8d48, (%r10) | |
# Write the offset for the lea. | |
movswl 2(%rbx), %eax | |
movl %eax, 3(%r10) | |
addq $7, %r10 | |
compile_lbracket_test: | |
# Write the other instructions. | |
# 80 3b 00 cmpb $0x0,(%rbx) | |
# 0f 84 OF FF SS ET je <offset> | |
movw $0x038a, (%r10) | |
movw $0xc084, 2(%r10) | |
movw $0x840f, 4(%r10) | |
movl $0x00, 6(%r10) | |
# Advance the 10 bytes. | |
addq $10, %r10 | |
# Push our current position on the stack. We use the stack to keep track of bracket pairs. | |
# Once we reach the closing bracket, we pop, compute offsets and write the j(n)e targets. | |
pushq %r10 | |
jmp compile_step | |
compile_rbracket: | |
# Skip emitting the lea if we don't need it. | |
cmpw $0, 2(%rbx) | |
je compile_rbracket_test | |
# 48 8d 9b AA AA AA AA lea AA(%rbx),%rbx | |
# We use lea since it does computation in the decode stage, which helps pipelining. | |
movl $0x009b8d48, (%r10) | |
# Write the offset. | |
movswl 2(%rbx), %eax | |
movl %eax, 3(%r10) | |
addq $7, %r10 | |
compile_rbracket_test: | |
# Write the other bytes. | |
# 80 3b 00 cmpb $0x0,(%rbx) | |
# 0f 85 OF FF SS ET jne <offset> | |
movw $0x038a, (%r10) | |
movw $0xc084, 2(%r10) | |
movw $0x850f, 4(%r10) | |
movl $0x00, 6(%r10) | |
addq $10, %r10 | |
# Find out what the code address was of our starting position. | |
popq %rcx | |
# Find out the difference between our current address and the starting one. | |
movq %r10, %rax | |
subq %rcx, %rax | |
# Write the positive offset at the starting location. | |
movl %eax, -4(%rcx) | |
# Now negate it and write the negative offset at our current location. | |
negl %eax | |
movl %eax, -4(%r10) | |
jmp compile_step | |
compile_write_cell: | |
# c6 83 AA AA AA AA BB = movb BB,AA(%rbx) | |
movw $0x83c6, (%r10) | |
# Write the offset. | |
movswl 2(%rbx), %eax | |
movl %eax, 2(%r10) | |
# Write the value. | |
movsbl 1(%rbx), %eax | |
movb %al, 6(%r10) | |
# Advance 7 bytes and loop. | |
addq $7, %r10 | |
jmp compile_step | |
compile_mult_write: | |
# b8 XX XX XX XX mov XX,%eax | |
movw $0xb8, (%r10) | |
# Write the value. | |
movsbl 1(%rbx), %eax | |
movl %eax, 1(%r10) | |
# 0f af c2 imul %edx,%eax | |
movl $0x00c2af0f, 5(%r10) | |
# 00 83 XX XX XX XX add %al,XX(%rbx) | |
movb $0x83, 9(%r10) | |
# Write the offset. | |
movswl 2(%rbx), %eax | |
movl %eax, 10(%r10) | |
# Advance 11 bytes and loop. | |
addq $14, %r10 | |
jmp compile_step | |
compile_unroll_begin: | |
# Do not emit the lea if we do not need it. | |
cmpw $0, 2(%rbx) | |
je compile_unroll_begin_mov | |
# 48 8d 9b 30 75 00 00 lea 0x7530(%rbx),%rbx | |
movl $0x009b8d48, (%r10) | |
# Write the offset. | |
movswl 2(%rbx), %eax | |
movl %eax, 3(%r10) | |
addq $7, %r10 | |
compile_unroll_begin_mov: | |
# 0f b6 13 movzbl (%rbx),%edx | |
movl $0x0013b60f, (%r10) | |
# Advance 10 bytes and loop. | |
addq $3, %r10 | |
jmp compile_step | |
compile_unroll_end: | |
# Zero the cell at index 0. | |
# c6 03 00 movb $0x0,(%rbx) | |
movl $0x000003c6, (%r10) | |
addq $3, %r10 | |
jmp compile_step | |
compile_cell_add: | |
# Simply add the value in %dl to the offset value. | |
# 00 93 31 75 00 00 add %dl,0x7531(%rbx) | |
movb $0x93, 1(%r10) | |
# Write the offset. | |
movswl 2(%rbx), %eax | |
movl %eax, 2(%r10) | |
addq $6, %r10 | |
jmp compile_step | |
compile_if_end: | |
# Find out what the code address was of our starting position. | |
popq %rcx | |
# Find out the difference between our current address and the starting one. | |
movq %r10, %rax | |
subq %rcx, %rax | |
# Write the offset at the starting location. | |
movl %eax, -4(%rcx) | |
# We do not need to write anything else, so we don't adjust r10 and simply loop. | |
jmp compile_step | |
compile_unroll_set: | |
# 4004d7: b0 0a mov $0xa,%al | |
movb $0xb0, (%r10) | |
# Write the value. | |
movsbl 1(%rbx), %eax | |
movl %eax, 1(%r10) | |
# 4004d9: 0f b6 8b 31 75 00 00 movzbl 0x7531(%rbx),%ecx | |
movl $0x008bb60f, 2(%r10) | |
# Write the offset. | |
movswl 2(%rbx), %eax | |
movl %eax, 5(%r10) | |
# 4004e0: 84 d2 test %dl,%dl | |
# 4004e2: 66 0f 45 c8 cmovne %ax,%cx | |
# 4004e6: 88 8b 31 75 00 00 mov %cl,0x7531(%rbx) | |
movw $0xd284, 9(%r10) | |
movl $0xc8450f66, 11(%r10) | |
movw $0x8b88, 15(%r10) | |
# Write the offset. | |
movswl 2(%rbx), %eax | |
movl %eax, 17(%r10) | |
# Advance 21 bytes and loop. | |
addq $21, %r10 | |
jmp compile_step | |
compile_pointer_seek: | |
# Do not emit the lea if we do not need it. | |
cmpw $0, 2(%rbx) | |
je compile_pointer_seek_body | |
# 48 8d 9b 30 75 00 00 lea 0x7530(%rbx),%rbx | |
movl $0x009b8d48, (%r10) | |
# Write the offset. | |
movswl 2(%rbx), %eax | |
movl %eax, 3(%r10) | |
addq $7, %r10 | |
compile_pointer_seek_body: | |
# cmpb $0, (%rdi) | |
movl $0x00003b80, (%r10) | |
# leaq 9(%rdi), %rdi | |
movl $0x005b8d48, 3(%r10) | |
# Write the value. | |
movsbl 1(%rbx), %eax | |
movb %al, 6(%r10) | |
# jne <back>. The code size is 13 bytes, so the offset back is -13. | |
movw $0x850f, 7(%r10) | |
movl $-13, 9(%r10) | |
# Advance 13 bytes and step. | |
addq $13, %r10 | |
jmp compile_step | |
compile_step: | |
# Goto the next IR node and actually loop. | |
addq $4, %rbx | |
jmp compile_check | |
compile_end: | |
# Write ret to last byte. | |
movb $0xC3, (%r10) | |
# Put resulting buffer in rax | |
movq %r11, %rax | |
# Cleanup, return. | |
popq %r11 | |
popq %r10 | |
popq %rbx | |
movq %rbp, %rsp | |
popq %rbp | |
ret | |
# Since our JITted block of memory makes certain assumptions about the environment | |
# (such as certain functions existing in registers), this function functions as a | |
# "trampoline" that takes a block of JITted instructions and actually runs the code. | |
# ----------------------------------------------------------------------------------- | |
# Arguments: | |
# rdi: pointer to executable block of memory with instructions | |
run_compiled_bf: | |
pushq %rbp | |
movq %rsp, %rbp | |
# Prepare the stack and registers. We purposefully misalign by 8, | |
# so that using call will align the stack by pushing the return address. | |
pushq %rbx | |
pushq %r12 | |
pushq %r13 | |
# Our compiled code expects the cell pointer in rdx, putchar in r12 and getchar in r13. | |
leaq CELLS, %rbx | |
movq $putchar, %r12 | |
movq $getchar, %r13 | |
# Call our compiled code. The stack is not currently aligned, but with the return value it will align properly. | |
call *%rdi | |
# Cleanup and return. | |
popq %r13 | |
popq %r12 | |
popq %rbx | |
movq %rbp, %rsp | |
popq %rbp | |
ret | |
# Takes a simple brainfuck char* input and parses it into IR, optimizes it and then | |
# compiles and runs it. | |
# ----------------------------------------------------------------------------------- | |
# Arguments: | |
# rdi: char* to execute | |
brainfuck: | |
pushq %rbp | |
movq %rsp, %rbp | |
# Build our representation. | |
call build_representation | |
movq %rax, %rdi | |
# Optimize our representation. rdi is never modified (calling convention) and | |
# we don't care about the return value so we can just call every function in order. | |
call pre_offset_optimizations | |
call compute_offsets | |
movq $possibly_unroll_loop, %rsi | |
call for_every_loop_run | |
movq $possibly_convert_to_if, %rsi | |
call for_every_loop_run | |
# Compile the program | |
call compile_bf | |
# Run the program | |
movq %rax, %rdi | |
call run_compiled_bf | |
movq %rbp, %rsp | |
popq %rbp | |
ret |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment