Skip to content

Instantly share code, notes, and snippets.

@adamnew123456
Last active August 29, 2015 14:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adamnew123456/8beb23c712336e959d89 to your computer and use it in GitHub Desktop.
Save adamnew123456/8beb23c712336e959d89 to your computer and use it in GitHub Desktop.
A BrainFuck Compiler For NASM/DOSBox
"""
Compiles BrainFuck programs into x86 assembly.
Supported operators:
> < + - [ ] . ,
Note that this doesn't support Linux for simplicity, and is instead targeted
at DOSBox (due to the fact that Linux buffers stdio while DOSBox does not).
"""
import sys
# Valid characters used in BrainFuck code. Characters not in this set are ignored.
BF_CODE = '<>,.+-[]'
def _label_names():
"""
Generates label names, as an infinite generator.
"""
label_count = 1
while True:
label_name = 'L' + str(label_counter)
yield label_name
label_count += 1
label_names = _label_names()
class BFCode:
"""
This is how BrainFuck code is stored.
This class parses a code string, and is also capable of generating assembly
code for it.
"""
def __init__(self, body):
self.elements = self._parse(body)
def _parse(self, code):
"""
Generates a list of characters by stripping off invalid BrainFuck
characters. This also counts loop brackets, and raises a
:class:`SyntaxError` if they are mismatched.
:return: A list of code characters.
"""
elements = []
loop_depth = 0
for char in code:
if char == '[':
loop_depth += 1
elif char == ']':
loop_depth -= 1
if char in BF_CODE:
elements.append(char)
if loop_depth < 0:
raise SyntaxError('Expected {} more ['.format(-loop_depth))
elif loop_depth > 0:
raise SyntaxError('Expected {} more ]'.format(loop_depth))
return elements
def generate_code(self):
"""
Generates assembly corresponding to this code.
:return: A string containing assembly code.
"""
code = ''
# A stack of label pairs, (start_label, end_label), which define a loop
label_stack = []
current_start_label = None
current_end_label = None
# This is essentially a peephole optimization, to compress:
#
# +++
#
# Down from the naive:
#
# INC al
# INC al
# INC al
#
# To the simpler
#
# ADD al, 0x03
#
# < > + and - can all be chained in this way.
#
# For example, to chain +++, these varaibles would contain:
#
# chained_insr = '+'
# chained_count = 3
chained_instr = ''
chain_count = 0
# Generating the assembly for BF code is generally pretty
# simple. The things to remember are:
#
# 1. The current value is always inside AL.
# 2. The pointer to the current value is always in BL.
# This is because there are only 256 available cells,
# which fits into 1 byte. Although this is less than
# what is generally recommended, it seems to work, and
# prevents the code from having to wrap the cell index
# around (since the processor will wrap BL for us when
# it hits 256). We have to use BX when accessing the
# cells since NASM doesn't like [BFmemory + bl].
# 3. The address to the cells is BFmemory.
def generate_move_right():
'Generate the code for a sequence of >'
nonlocal code
code += 'mov [BFmemory + bx], al\n'
code += 'add bx, {}\n'.format(hex(chain_count))
code += 'mov al, [BFmemory + bx]\n'
def generate_move_left():
'Generate the code for a sequence of <'
nonlocal code
code += 'mov [BFmemory + bx], al\n'
code += 'sub bx, {}\n'.format(hex(chain_count))
code += 'mov al, [BFmemory + bx]\n'
def generate_add():
'Generate the code for a sequence of +'
nonlocal code
code += 'add ax, {}\n'.format(hex(chain_count))
def generate_sub():
'Generate the code for a sequence of -'
nonlocal code
code += 'sub ax, {}\n'.format(hex(chain_count))
def generate_enter_loop():
'Generate the code for a [, and sets up the state for the next ]'
nonlocal code
nonlocal current_start_label
nonlocal current_end_label
nonlocal label_stack
label_stack.append((current_start_label,
current_end_label))
current_start_label = next(label_names)
current_end_label = 'exit' + current_start_label
code += current_start_label + ':\n'
code += 'test ax, ax\n'
code += 'jz {}\n'.format(current_end_label)
def generate_exit_loop():
'Generate the code for a ], and restores the context of the outer loop'
nonlocal code
nonlocal current_start_label
nonlocal current_end_label
nonlocal label_stack
code += 'jmp ' + current_start_label + '\n'
code += current_end_label + ':\n'
current_start_label, current_end_label = label_stack.pop()
GENERATOR_TABLE = {
'<': generate_move_left,
'>': generate_move_right,
'+': generate_add,
'-': generate_sub,
'[': generate_enter_loop,
']': generate_exit_loop
}
# Only these operators can be chained
CHAINABLE = '><+-'
def gen_chained_if_possible():
'Output a chained operator if any operator is currently chained'
nonlocal code
nonlocal chain_count
nonlocal chained_instr
if chained_instr:
# Since each cell can hold only 1 byte, and there are 256
# cells, we can avoid confusing NASM by keeping all operands
# within 0x00 - 0xff.
#
# For example, starting at the first cell and then running
# 256 > operations would cause BL to go from 0x00 to 0xff and
# then overflow back to 0x00.
chain_count %= 256
generator = GENERATOR_TABLE[chained_instr]
generator()
chain_count = 0
chained_instr = ''
for element in self.elements:
if chained_instr == element:
chain_count += 1
continue
else:
gen_chained_if_possible()
if element in CHAINABLE:
chained_instr = element
chain_count = 1
elif element in GENERATOR_TABLE:
generator = GENERATOR_TABLE[element]
generator()
elif element == ',':
code += 'call read_byte\n'
elif element == '.':
code += 'call print_byte\n'
gen_chained_if_possible()
return code
def __str__(self):
return ''.join(str(element) for element in self.elements)
if __name__ == '__main__':
if len(sys.argv) != 1:
print(sys.argv[0], 'expects input from stdin, and no arguments')
code = sys.stdin.read()
compiler = BFCode(code)
# We have to prep the output for the assembler
sys.stdout.write(
'''
org 100h
section .bss
BFmemory:
resw 255
section .text
start:
; Initialize both the current value (AX) and the current offset (BL) to
; zero
xor ax, ax
xor bx, bx
; Since our memory is uninitialized, we have to zero it out
call zero_mem
;; BEGIN GENERATED CODE ;;
''')
sys.stdout.write(compiler.generate_code())
sys.stdout.write(''';; END GENERATED CODE ;;
call exit
; Zeroes out the memory used for the cells
zero_mem:
xor bx, bx
mov bx, 0xff
zero_mem_loop:
mov [BFmemory + bx], byte 0x00
dec bx
jnz zero_mem_loop
; Since CX is 0, which is avalid address into the BFmemory array, go ahead
; and do one more zero
mov [BFmemory], byte 0x00
ret
; Prints out the contents of AX, keeping AX the same
print_byte:
; Store AX, since doing the interrupt clobbers AX
push ax
; Interrupt 02h requires DL contain our byte to print
mov dl,al
; Clear AX to hold the interrupt number
xor ax, ax
mov ah, 0x02
int 0x21
; Restore AX
pop ax
ret
; Reads in an ASCII value into the contents of AX
read_byte:
mov ah, 0x01
int 0x21
; AL contains our byte, so just zero AH and we'll have it
xor ah, ah
ret
; Terminates the program
exit:
mov ah, 0x4C
int 0x21
''')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment