Last active
August 29, 2015 14:06
-
-
Save adamnew123456/8beb23c712336e959d89 to your computer and use it in GitHub Desktop.
A BrainFuck Compiler For NASM/DOSBox
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Compiles BrainFuck programs into x86 assembly. | |
Supported operators: | |
> < + - [ ] . , | |
Note that this doesn't support Linux for simplicity, and is instead targeted | |
at DOSBox (due to the fact that Linux buffers stdio while DOSBox does not). | |
""" | |
import sys | |
# Valid characters used in BrainFuck code. Characters not in this set are ignored. | |
BF_CODE = '<>,.+-[]' | |
def _label_names(): | |
""" | |
Generates label names, as an infinite generator. | |
""" | |
label_count = 1 | |
while True: | |
label_name = 'L' + str(label_counter) | |
yield label_name | |
label_count += 1 | |
label_names = _label_names() | |
class BFCode: | |
""" | |
This is how BrainFuck code is stored. | |
This class parses a code string, and is also capable of generating assembly | |
code for it. | |
""" | |
def __init__(self, body): | |
self.elements = self._parse(body) | |
def _parse(self, code): | |
""" | |
Generates a list of characters by stripping off invalid BrainFuck | |
characters. This also counts loop brackets, and raises a | |
:class:`SyntaxError` if they are mismatched. | |
:return: A list of code characters. | |
""" | |
elements = [] | |
loop_depth = 0 | |
for char in code: | |
if char == '[': | |
loop_depth += 1 | |
elif char == ']': | |
loop_depth -= 1 | |
if char in BF_CODE: | |
elements.append(char) | |
if loop_depth < 0: | |
raise SyntaxError('Expected {} more ['.format(-loop_depth)) | |
elif loop_depth > 0: | |
raise SyntaxError('Expected {} more ]'.format(loop_depth)) | |
return elements | |
def generate_code(self): | |
""" | |
Generates assembly corresponding to this code. | |
:return: A string containing assembly code. | |
""" | |
code = '' | |
# A stack of label pairs, (start_label, end_label), which define a loop | |
label_stack = [] | |
current_start_label = None | |
current_end_label = None | |
# This is essentially a peephole optimization, to compress: | |
# | |
# +++ | |
# | |
# Down from the naive: | |
# | |
# INC al | |
# INC al | |
# INC al | |
# | |
# To the simpler | |
# | |
# ADD al, 0x03 | |
# | |
# < > + and - can all be chained in this way. | |
# | |
# For example, to chain +++, these varaibles would contain: | |
# | |
# chained_insr = '+' | |
# chained_count = 3 | |
chained_instr = '' | |
chain_count = 0 | |
# Generating the assembly for BF code is generally pretty | |
# simple. The things to remember are: | |
# | |
# 1. The current value is always inside AL. | |
# 2. The pointer to the current value is always in BL. | |
# This is because there are only 256 available cells, | |
# which fits into 1 byte. Although this is less than | |
# what is generally recommended, it seems to work, and | |
# prevents the code from having to wrap the cell index | |
# around (since the processor will wrap BL for us when | |
# it hits 256). We have to use BX when accessing the | |
# cells since NASM doesn't like [BFmemory + bl]. | |
# 3. The address to the cells is BFmemory. | |
def generate_move_right(): | |
'Generate the code for a sequence of >' | |
nonlocal code | |
code += 'mov [BFmemory + bx], al\n' | |
code += 'add bx, {}\n'.format(hex(chain_count)) | |
code += 'mov al, [BFmemory + bx]\n' | |
def generate_move_left(): | |
'Generate the code for a sequence of <' | |
nonlocal code | |
code += 'mov [BFmemory + bx], al\n' | |
code += 'sub bx, {}\n'.format(hex(chain_count)) | |
code += 'mov al, [BFmemory + bx]\n' | |
def generate_add(): | |
'Generate the code for a sequence of +' | |
nonlocal code | |
code += 'add ax, {}\n'.format(hex(chain_count)) | |
def generate_sub(): | |
'Generate the code for a sequence of -' | |
nonlocal code | |
code += 'sub ax, {}\n'.format(hex(chain_count)) | |
def generate_enter_loop(): | |
'Generate the code for a [, and sets up the state for the next ]' | |
nonlocal code | |
nonlocal current_start_label | |
nonlocal current_end_label | |
nonlocal label_stack | |
label_stack.append((current_start_label, | |
current_end_label)) | |
current_start_label = next(label_names) | |
current_end_label = 'exit' + current_start_label | |
code += current_start_label + ':\n' | |
code += 'test ax, ax\n' | |
code += 'jz {}\n'.format(current_end_label) | |
def generate_exit_loop(): | |
'Generate the code for a ], and restores the context of the outer loop' | |
nonlocal code | |
nonlocal current_start_label | |
nonlocal current_end_label | |
nonlocal label_stack | |
code += 'jmp ' + current_start_label + '\n' | |
code += current_end_label + ':\n' | |
current_start_label, current_end_label = label_stack.pop() | |
GENERATOR_TABLE = { | |
'<': generate_move_left, | |
'>': generate_move_right, | |
'+': generate_add, | |
'-': generate_sub, | |
'[': generate_enter_loop, | |
']': generate_exit_loop | |
} | |
# Only these operators can be chained | |
CHAINABLE = '><+-' | |
def gen_chained_if_possible(): | |
'Output a chained operator if any operator is currently chained' | |
nonlocal code | |
nonlocal chain_count | |
nonlocal chained_instr | |
if chained_instr: | |
# Since each cell can hold only 1 byte, and there are 256 | |
# cells, we can avoid confusing NASM by keeping all operands | |
# within 0x00 - 0xff. | |
# | |
# For example, starting at the first cell and then running | |
# 256 > operations would cause BL to go from 0x00 to 0xff and | |
# then overflow back to 0x00. | |
chain_count %= 256 | |
generator = GENERATOR_TABLE[chained_instr] | |
generator() | |
chain_count = 0 | |
chained_instr = '' | |
for element in self.elements: | |
if chained_instr == element: | |
chain_count += 1 | |
continue | |
else: | |
gen_chained_if_possible() | |
if element in CHAINABLE: | |
chained_instr = element | |
chain_count = 1 | |
elif element in GENERATOR_TABLE: | |
generator = GENERATOR_TABLE[element] | |
generator() | |
elif element == ',': | |
code += 'call read_byte\n' | |
elif element == '.': | |
code += 'call print_byte\n' | |
gen_chained_if_possible() | |
return code | |
def __str__(self): | |
return ''.join(str(element) for element in self.elements) | |
if __name__ == '__main__': | |
if len(sys.argv) != 1: | |
print(sys.argv[0], 'expects input from stdin, and no arguments') | |
code = sys.stdin.read() | |
compiler = BFCode(code) | |
# We have to prep the output for the assembler | |
sys.stdout.write( | |
''' | |
org 100h | |
section .bss | |
BFmemory: | |
resw 255 | |
section .text | |
start: | |
; Initialize both the current value (AX) and the current offset (BL) to | |
; zero | |
xor ax, ax | |
xor bx, bx | |
; Since our memory is uninitialized, we have to zero it out | |
call zero_mem | |
;; BEGIN GENERATED CODE ;; | |
''') | |
sys.stdout.write(compiler.generate_code()) | |
sys.stdout.write(''';; END GENERATED CODE ;; | |
call exit | |
; Zeroes out the memory used for the cells | |
zero_mem: | |
xor bx, bx | |
mov bx, 0xff | |
zero_mem_loop: | |
mov [BFmemory + bx], byte 0x00 | |
dec bx | |
jnz zero_mem_loop | |
; Since CX is 0, which is avalid address into the BFmemory array, go ahead | |
; and do one more zero | |
mov [BFmemory], byte 0x00 | |
ret | |
; Prints out the contents of AX, keeping AX the same | |
print_byte: | |
; Store AX, since doing the interrupt clobbers AX | |
push ax | |
; Interrupt 02h requires DL contain our byte to print | |
mov dl,al | |
; Clear AX to hold the interrupt number | |
xor ax, ax | |
mov ah, 0x02 | |
int 0x21 | |
; Restore AX | |
pop ax | |
ret | |
; Reads in an ASCII value into the contents of AX | |
read_byte: | |
mov ah, 0x01 | |
int 0x21 | |
; AL contains our byte, so just zero AH and we'll have it | |
xor ah, ah | |
ret | |
; Terminates the program | |
exit: | |
mov ah, 0x4C | |
int 0x21 | |
''') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment