Skip to content

Instantly share code, notes, and snippets.

@TerrorBite
Created April 22, 2012 15:43
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save TerrorBite/2464756 to your computer and use it in GitHub Desktop.
Save TerrorBite/2464756 to your computer and use it in GitHub Desktop.
DCPU-16 Assembler written in Python
import struct, re, argparse
# This regex matches lines that look vaguely like assembly code:
# three letters of a single case followed by whitespace, anywhere in the line.
re_prelim = re.compile(r'(?:[A-Z]{3}|[a-z]{3})\s')

# This regex is used to parse the data from a line of assembly code.
# An optional leading token (e.g. a label) is skipped, then the groups are:
#   instr - the three-letter mnemonic
#   arga  - the first argument
#   argb  - the second argument (None when absent)
re_line = re.compile(
    r'^(?:\S+\s+)?(?P<instr>[A-Z]{3}|[a-z]{3})\s+(?P<arga>[A-Za-z0-9_\[\]+]+)\s*(?:,\s*(?P<argb>[A-Za-z0-9_\[\]+]+))?'
)

# This regex matches labels (":name"), optionally followed by an instruction on the same line
re_label = re.compile(r'^(?P<label>:\w+)\s*(?:(?P<instr>[A-Za-z]{3})|$)')

# This regex matches either decimal or hex numbers (used for checking if parameters are a label).
# The whole string must be a number: the hex alternative is tried first and accepts any number of
# digits, and the pattern is anchored at both ends so that text which merely starts with a digit
# (e.g. "12ab" or a bare "0x") is not mistaken for a literal.
re_number = re.compile(r'^(?:0x[0-9A-Fa-f]+|[0-9]+)$')

# Dictionary of opcodes. Key is the name, value is a tuple of (type, binary) where type is 0=basic, 1=extended
opcodefs = {
    'SET': (0, 0x1), 'ADD': (0, 0x2), 'SUB': (0, 0x3),
    'MUL': (0, 0x4), 'DIV': (0, 0x5), 'MOD': (0, 0x6),
    'SHL': (0, 0x7), 'SHR': (0, 0x8), 'AND': (0, 0x9),
    'BOR': (0, 0xa), 'XOR': (0, 0xb), 'IFE': (0, 0xc),
    'IFN': (0, 0xd), 'IFG': (0, 0xe), 'IFB': (0, 0xf),
    'JSR': (1, 0x01),
}

# Dict of some possible argument values and their binary equivalents, used for lookups.
argdefs = {
    'A': 0x00, 'B': 0x01, 'C': 0x02, 'X': 0x03, 'Y': 0x04, 'Z': 0x05, 'I': 0x06, 'J': 0x07,
    '[A]': 0x08, '[B]': 0x09, '[C]': 0x0a, '[X]': 0x0b, '[Y]': 0x0c, '[Z]': 0x0d, '[I]': 0x0e, '[J]': 0x0f,
    'POP': 0x18, 'PEEK': 0x19, 'PUSH': 0x1a, 'SP': 0x1b, 'PC': 0x1c, 'O': 0x1d,
}

# Default output options. main() overwrites these from the command line; defining them here
# means the classes below can also be used when this file is imported as a library.
big_endian = False
shortform = False
class SourceLine:
    """
    Holds one line of assembly source, split into its mnemonic and argument strings.

    Attributes:
        lineno      -- line number in the input file (used in error messages)
        inum        -- index of this instruction among all instructions
        source      -- the source text exactly as passed in
        instruction -- the three-letter mnemonic as written
        args        -- tuple (first argument, second argument); the second is None when absent
    """

    def __init__(self, line_number, inum, source):
        self.lineno = line_number
        self.inum = inum
        self.source = source
        # Both fields are filled in by parse()
        self.instruction = None
        self.args = None
        self.parse(source)

    def parse(self, source):
        """
        Splits a line of source code into strings.

        Only the structure imposed by the re_line pattern is checked here. Whether the mnemonic
        names a real instruction, and whether the arguments make sense, is not verified at this
        stage (e.g. a bogus mnemonic such as ABC is accepted).

        Raises Exception when the line does not fit the pattern at all.
        """
        match = re_line.search(source)
        if match is None:
            raise Exception("Syntax error\nLine %d: %s" % (self.lineno, source))
        self.instruction = match.group('instr')
        self.args = (match.group('arga'), match.group('argb'))
class Instruction:
    """
    This class is used to compile a line of assembly source into its binary machine code representation.

    Each argument is held as a tuple ``(code, next_word)``: ``code`` is the value packed into the
    instruction word and ``next_word`` is the extra data word emitted after it, or None when the
    argument needs no extra word. A label reference is held as ``(0xff, inum)`` until
    build_bytecode() replaces it with a real address.
    """

    # Placeholder argument code meaning "label, not resolved yet"
    _LABEL = 0xff
    # Argument code for "literal value stored in the next word"
    _NEXT_WORD_LITERAL = 0x1f
    # Argument code for "pointer to the address stored in the next word"
    _NEXT_WORD_POINTER = 0x1e
    # Literals 0x00..0x1f are embedded in the instruction word as codes 0x20..0x3f
    _SHORT_LITERAL_BASE = 0x20
    _SHORT_LITERAL_MAX = 0x1f

    def __init__(self, parent, source=None):
        """
        Initializes variables and, when a source line is given, processes it immediately.

        parent -- the owning assembler; its ``labels``, ``addresses`` and ``address()`` are used
        source -- a SourceLine, or None to create an empty instruction
        """
        self.parent = parent
        self.sl = source
        # Only read the line details when there is a line to read them from
        self.lineno = source.lineno if source is not None else None
        self.inum = source.inum if source is not None else None
        self.address = None
        self.opcode = None
        self.arg1 = None
        self.arg2 = None
        self.bytecode = None
        self.words = 1
        if source is not None:
            self.process(source)

    def process(self, source):
        """
        Processes a line of assembly.

        In this stage, some checking is done and the text form of the instruction (e.g. "ADD") and the
        arguments are converted into their numerical representations. An error is raised if the
        instruction does not exist.

        Argument values must parse as either:
         * a special value (i.e. must directly match a key in the argdefs dict)
         * a decimal or hexadecimal value (either literal, or enclosed in [] to denote a pointer)
         * an offset plus register value as a pointer (e.g. "[0x1000+I]")
        Any argument value that does NOT match one of the above is assumed to be a label, and will cause
        an error if the label does not actually exist.

        Labels are not replaced with addresses at this stage, because the word length of each
        instruction is not known yet. A placeholder is stored instead and replaced in build_bytecode().

        NOTE(review): a label is always counted as needing a next word here. With short-form labels
        enabled, build_bytecode() may emit one word fewer than counted, which would shift the
        addresses of later instructions - confirm before relying on that option.
        """
        # Count the number of words this instruction will take up. This is important for label resolution.
        self.words = 1
        # Look for instruction in our dict of opcodes
        if source.instruction not in opcodefs:
            raise Exception("Unrecognised instruction %s\nLine %s: %s" % (source.instruction, self.lineno, source.source))
        self.opcode = opcodefs[source.instruction]
        # A basic instruction needs two arguments; an extended one only uses the first
        if self.opcode[0] == 0:
            if source.args[1] is None:
                raise Exception("Basic instructions require two arguments\nLine %d: %s" % (self.lineno, source.source))
            self.arg2 = self.parse_arg(source.args[1])
            # Compare against None rather than testing truthiness: a next word whose value
            # is 0 (e.g. "[0]", or a label on instruction 0) still occupies a word.
            if self.arg2[1] is not None:
                self.words += 1
        # Both types of instruction have a first argument
        self.arg1 = self.parse_arg(source.args[0])
        if self.arg1[1] is not None:
            self.words += 1

    def _resolve_label(self, arg):
        """
        Returns ``arg`` with a label placeholder replaced by the label's address.

        Arguments that are not label placeholders (including None) are returned unchanged.
        """
        if arg is None or arg[0] != self._LABEL:
            return arg
        # Look up the address by instruction number
        val = self.parent.address(arg[1])
        # The option is read with a default so this also works when main() never set it
        if globals().get('shortform', False) and val <= self._SHORT_LITERAL_MAX:
            # We can use short form for this one!
            return (self._SHORT_LITERAL_BASE + val, None)
        # Too big to fit in short form
        return (self._NEXT_WORD_LITERAL, val)

    def build_bytecode(self):
        """
        Performs the final stage of actually building the string containing the raw bytecode.
        Label resolution is performed at this stage.

        Returns the packed bytes for this instruction (one 16-bit word, plus one more for each
        argument that carries a next word) and also stores them in ``self.bytecode``.
        """
        # In practice, basic instructions will rarely have a label as a first argument,
        # but the JSR extended instruction will almost always have one!
        self.arg1 = self._resolve_label(self.arg1)
        if self.arg2 is not None and self.arg2[0] == self._LABEL:
            # Pay no attention to the debug statement behind the curtain.
            print("Debug: Label dest line %d resolved to address 0x%04x" % (self.arg2[1], self.parent.address(self.arg2[1])))
            self.arg2 = self._resolve_label(self.arg2)
        # Little-endian words by default; big-endian when the user asked for it.
        fmt = '>H' if globals().get('big_endian', False) else '<H'
        # Generate bytecode for the first word.
        if self.opcode[0] == 0:
            # Basic instruction: opcode in the low 4 bits, then the two argument codes
            bytecode = struct.pack(fmt, self.opcode[1] | (self.arg1[0] << 4) | (self.arg2[0] << 10))
        else:
            # Extended instruction: low 4 bits are zero, opcode and argument code are shifted up
            print("DEBUG: self.arg1 is %s and self.arg2 is %s" % (repr(self.arg1), repr(self.arg2)))
            bytecode = struct.pack(fmt, (self.opcode[1] << 4) | (self.arg1[0] << 10))
        # If either of the arguments read a "next word", then add that next word
        if self.arg1[1] is not None:
            bytecode += struct.pack(fmt, self.arg1[1])
        if self.arg2 is not None and self.arg2[1] is not None:
            bytecode += struct.pack(fmt, self.arg2[1])
        # Save our generated bytecode as well as returning it. More for debugging than anything else.
        self.bytecode = bytecode
        return bytecode

    @staticmethod
    def _parse_number(text):
        """Converts a literal to an int: base 16 when it starts with '0x', otherwise base 10."""
        return int(text, 16) if text[0:2] == '0x' else int(text, 10)

    def parse_arg(self, source):
        """
        Converts arguments from assembly source form into their binary form.

        Returns a tuple ``(code, next_word)``; ``next_word`` is None when the argument has no
        extra data word. A label is returned as ``(0xff, inum)`` where ``inum`` is the
        instruction number recorded for that label.

        Raises Exception for a malformed "[a+b]" pointer or an unknown label; a literal that
        cannot be converted raises ValueError from int().
        """
        # Does the argument exactly match one of our predetermined values?
        if source in argdefs:
            return (argdefs[source], None)
        # Square brackets denote a pointer
        if source[0] == '[' and source[-1] == ']':
            inner = source[1:-1]
            if '+' not in inner:
                # This is a pointer located at a literal address
                return (self._NEXT_WORD_POINTER, self._parse_number(inner))
            # Literal plus register. The register may be on either side of the + sign,
            # and only the eight general registers (codes 0x00-0x07) are valid here.
            bits = inner.split('+')
            if len(bits) == 2:
                for register, literal in ((bits[1], bits[0]), (bits[0], bits[1])):
                    if argdefs.get(register, self._LABEL) < 0x08:
                        return (argdefs[register] + 0x10, self._parse_number(literal))
            # We don't know what this is
            raise Exception("Invalid argument format on line %d: %s" % (self.lineno, source))
        if re_number.match(source):
            # This matches the regex for a literal value
            val = self._parse_number(source)
            if val <= self._SHORT_LITERAL_MAX:
                # We can use short form for this one!
                return (self._SHORT_LITERAL_BASE + val, None)
            # Too big to fit in short form
            return (self._NEXT_WORD_LITERAL, val)
        # None of the above, so assume it is a label. Check that it is known BEFORE
        # touching the label table, so an unknown label gets the proper error message.
        if source not in self.parent.labels:
            raise Exception('Unknown label "%s" at line %d' % (source, self.lineno))
        print("Debug: Label %s at line %d" % (source, self.parent.labels[source]))
        print(repr(self.parent.addresses))
        return (self._LABEL, self.parent.labels[source])
class Assembler:
    """
    This is the actual assembler.
    All of the work is done by other classes, this one mostly just pulls everything together.

    Attributes:
        labels    -- maps label name (without the leading ':') to the number of the
                     instruction the label precedes
        addresses -- maps instruction number to the address of that instruction
        bias      -- offset applied by address(); it is only ever set to 0 in this class
        output    -- initialised to an empty list and not used by this class
    """

    def __init__(self):
        self.output = []
        self.labels = {}
        self.addresses = []
        # Set here as well as in assemble() so address() is usable on a fresh instance
        self.bias = 0

    def address(self, index):
        "Applies bias to compensate for short-form labels"
        return self.addresses[index - self.bias]

    def dump(self, i):
        """
        Prints one instruction next to its generated machine code, for debugging.

        i -- an Instruction whose build_bytecode() has already been called
        """
        label = ''
        source = ' '.join(i.sl.source.split())  # Compact any whitespace
        label_match = re_label.match(source)
        if label_match:
            # Show the label separately and drop it from the displayed source
            label = label_match.group('label')
            source = source.split(None, 1)[1]
        # Format the bytes as 16-bit words in hexadecimal, most significant byte first,
        # i.e. "7c01 0030". bytearray yields integers on both Python 2 and Python 3.
        raw = bytearray(i.bytecode)
        swap = not globals().get('big_endian', False)
        words = []
        for first, second in zip(raw[0::2], raw[1::2]):
            high, low = (second, first) if swap else (first, second)
            words.append("%02x%02x" % (high, low))
        bytecode = ' '.join(words)
        print("%03d %04x: %s%s ; %s (line %d) %s" % (
            i.inum, self.address(i.inum),
            source, ' ' * (20 - len(source)),
            bytecode, i.lineno, label))

    def assemble(self, infile, outfile):
        """
        Assembles the source file ``infile`` and writes the binary to ``outfile``.

        The work is done in three passes: split lines and record labels, compute the address
        of every instruction, then resolve labels and emit the machine code.
        Errors in the source are reported as exceptions raised by SourceLine and Instruction.
        """
        with open(infile, 'r') as fd:
            data = fd.readlines()
        # This is used to compensate for shortform labels
        self.bias = 0
        # Count number of lines (used for "error on line x" messages)
        linecount = 0
        # Count number of instructions
        icount = 0
        sourcelines = []
        instructions = []
        print('=== FIRST PASS ===')
        # In this pass, each input line is subjected to preliminary parsing, blank lines
        # and comments are discarded, and a list of label positions (by instruction) is compiled.
        for linecount, line in enumerate(data, 1):
            # Drop the comment first and the whitespace second, so that nothing is left
            # over from a line such as ":loop ; comment".
            line = line.split(';')[0].strip()
            # Check if this line looks like a label
            label_match = re_label.match(line)
            if label_match is not None:
                # Save the label along with the inum of the instruction it appears above
                self.labels[label_match.group('label')[1:]] = icount
                print("DEBUG: Found label %s at line %d" % (label_match.group('label'), linecount))
            # Check if this line (also) looks like an instruction
            if re_prelim.search(line):
                sourcelines.append(SourceLine(linecount, icount, line))
                icount += 1
        print("Debug: %d instructions parsed from %d lines" % (icount, linecount))
        print('=== SECOND PASS ===')
        # In this pass we process opcodes and their arguments, find instruction word length
        # and determine an address for each instruction. This will allow us to calculate
        # label addresses in the final pass.
        # The list maps instruction numbers to addresses. It has one extra slot at the end
        # so that a label placed after the last instruction resolves to the end address.
        self.addresses = [None] * (icount + 1)
        # This counts which address we are up to. Basically it's a program counter
        addr = 0
        for sl in sourcelines:
            i = Instruction(self, sl)  # Process instruction line (determines word length)
            self.addresses[i.inum] = addr
            addr += i.words
            instructions.append(i)
        self.addresses[icount] = addr
        # Debugging
        print(repr(self.addresses))
        print('=== THIRD PASS ===')
        # In this pass we resolve label addresses, generate machine code and build the actual binary.
        binary = []
        for i in instructions:
            binary.append(i.build_bytecode())
            # Debugging code: Prints the instruction alongside its binary version for comparison
            self.dump(i)
        # Join up our list of binary fragments and dump it into the output file
        with open(outfile, 'wb') as fd:
            fd.write(b''.join(binary))
def main(argv=None):
    """
    Command-line entry point: parses the options and assembles one file.

    argv -- list of argument strings to parse. None (the default) makes argparse
            read the arguments from sys.argv[1:].

    Sets the module-level ``big_endian`` and ``shortform`` options from the
    command line before running the assembler.
    """
    global big_endian, shortform
    # Command-line arguments are handled by the argparse library
    parser = argparse.ArgumentParser(description='Compiles DCPU-16 assembly source into bytecode.')
    # Big-endian option
    parser.add_argument('-be', dest='big_endian', action='store_true',
        help='Output each 16-bit word in big-endian format. The default format is little-endian.')
    # Short form labels
    parser.add_argument('-s', dest='shortform', action='store_true',
        help='Enable short-form labels. Warning: highly experimental!')
    # Input and output files
    parser.add_argument('infile', help='The assembly file to read from.')
    parser.add_argument('outfile', help='The filename to write output to.')
    # Now parse the arguments we were given
    args = parser.parse_args(argv)
    big_endian = args.big_endian
    shortform = args.shortform
    # Create an Assembler and set it to work on our file
    a = Assembler()
    a.assemble(infile=args.infile, outfile=args.outfile)
# Run the assembler only when this file is executed as a script,
# so that it can still be imported as a library without side effects.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment