# TerrorBite/assembler.py

## assembler.py
import struct, re, argparse

# Cheap pre-filter: the line contains something shaped like a three-letter mnemonic
# (all upper-case or all lower-case) followed by whitespace.
re_prelim = re.compile(r'(?:[A-Z]{3}|[a-z]{3})\s')

# This regex is used to parse the data from a line of assembly code. It skips one optional
# leading token (e.g. a label), then captures the mnemonic as "instr", the first argument as
# "arga" and the optional comma-separated second argument as "argb".
re_line = re.compile(
        r'^(?:\S+\s+)?(?P<instr>[A-Z]{3}|[a-z]{3})\s+(?P<arga>[A-Za-z0-9_\[\]+]+)\s*(?:,\s*(?P<argb>[A-Za-z0-9_\[\]+]+))?'
        )

# This regex matches a label definition (":name") at the start of a line, optionally followed
# by the first three letters of an instruction on the same line.
re_label = re.compile(r'^(?P<label>:\w+)\s*(?:(?P<instr>[A-Za-z]{3})|$)')

# This regex matches a complete decimal or hexadecimal number (used for checking if parameters
# are a label). The hex alternative comes first and the pattern is anchored at both ends, so
# "0x1f" matches as a whole instead of stopping after the leading "0", and names that merely
# start with a digit are not mistaken for numbers.
re_number = re.compile(r'^(?:0x[0-9A-Fa-f]+|[0-9]+)$')

# Opcode table. Maps a mnemonic to a (type, binary) tuple where type is 0 for a basic
# instruction and 1 for an extended instruction. The basic opcodes are numbered 0x1..0xf in
# the order listed below.
opcodefs = dict(
        (mnemonic, (0, number))
        for number, mnemonic in enumerate(
            'SET ADD SUB MUL DIV MOD SHL SHR AND BOR XOR IFE IFN IFG IFB'.split(), 1)
        )
opcodefs['JSR'] = (1, 0x01)

# Argument lookup table: argument text -> binary value. Registers take 0x00..0x07 in the order
# A B C X Y Z I J, the same registers in square brackets take 0x08..0x0f, and the remaining
# special names are listed explicitly.
argdefs = dict((register, code) for code, register in enumerate('ABCXYZIJ'))
argdefs.update(('[%s]' % register, 0x08 + code) for code, register in enumerate('ABCXYZIJ'))
argdefs.update({
        'POP': 0x18,
        'PEEK': 0x19,
        'PUSH': 0x1a,
        'SP': 0x1b,
        'PC': 0x1c,
        'O': 0x1d,
        })

class SourceLine:
    """
    Holds one line of assembly source together with the pieces extracted from it.

    Attributes set by the constructor: ``lineno`` (line number in the input file), ``inum``
    (which instruction this is), ``source`` (the text that was parsed), ``instruction`` (the
    mnemonic text) and ``args`` (a 2-tuple of argument strings; the second is None when the
    line has only one argument).
    """
    def __init__(self, line_number, inum, source):
        self.instruction = None
        self.args = None
        self.source = source
        self.inum = inum
        self.lineno = line_number
        self.parse(source)

    def parse(self, source):
        """
        Splits a line of source code into its mnemonic and argument strings.

        Only the shape of the line is checked here, using the module's line regex: unknown
        mnemonics such as ABC or malformed values such as 0x56g1 are still accepted at this
        point. A line that does not have the shape of an instruction raises an Exception
        carrying the line number and the offending text.
        """
        match = re_line.search(source)
        if match is None:
            raise Exception("Syntax error\nLine %d: %s" % (self.lineno, source))
        self.instruction = match.group('instr')
        self.args = (match.group('arga'), match.group('argb'))


class Instruction:
    """
    This class is used to compile a line of assembly source into its binary machine code representation.

    Arguments are held as (code, next_word) tuples: ``code`` is the value packed into the
    instruction word and ``next_word`` is the extra data word emitted after it, or None when the
    argument needs no extra word. A label that has not been resolved yet is held as
    (0xff, inum), where inum is the number of the instruction the label points at.
    """

    def __init__(self, parent, source=None):
        """
        Initializes variables and, when a source line is given, processes it straight away.

        parent -- object providing the label table (``labels``), the address list
                  (``addresses``) and the ``address(inum)`` lookup.
        source -- parsed source line exposing ``lineno``, ``inum``, ``source``,
                  ``instruction`` and ``args``; None creates an empty instruction.
        """
        self.parent = parent
        self.sl = source
        # The signature allows source=None, so it must not be dereferenced unconditionally.
        self.lineno = source.lineno if source is not None else None
        self.inum = source.inum if source is not None else None
        self.address = None
        self.opcode = None
        self.arg1 = None
        self.arg2 = None
        self.bytecode = None
        self.words = 1

        if source is not None:
            self.process(source)

    def process(self, source):
        """
        Processes a line of assembly.

        In this stage, some checking is done and the text form of the instruction (e.g. "ADD") and the arguments
        are converted into their numerical representations. An error is raised if the instruction does not exist.
        The mnemonic is looked up case-insensitively, because the line regex accepts lower-case mnemonics.

        Argument values must parse as either:
            * a special value (i.e. must directly match a key in the argdefs dict)
            * a decimal or hexadecimal value (either literal, or enclosed in [] to denote a pointer)
            * an offset plus register value as a pointer (e.g. "[0x1000+I]")
        Any argument values that do NOT match one of the above is assumed to be a label, and will cause an error if
        the label does not actually exist.

        Labels are not actually replaced with addresses at this stage, because we don't yet know how many words each
        instruction takes up and therefore we don't know the actual address of any label. Instead we use a placeholder
        value for labels, which will be replaced with the correct address in the final stage.

        Raises Exception for an unknown mnemonic or a basic instruction with only one argument.
        """
        # Count the number of words this instruction will take up. This is important for label resolution.
        self.words = 1
        self.arg2 = None

        mnemonic = source.instruction.upper()
        if mnemonic not in opcodefs:
            raise Exception("Unrecognised instruction %s\nLine %s: %s" % (source.instruction, self.lineno, source.source))
        self.opcode = opcodefs[mnemonic]

        # Is this a basic instruction? If it is, it needs to have two arguments (i.e. the second one can't be None)
        if self.opcode[0] == 0:
            if source.args[1] is None:
                raise Exception("Basic instructions require two arguments\nLine %d: %s" % (self.lineno, source.source))

            # Work out the numerical value of the second argument (which an extended instruction won't have)
            self.arg2 = self.parse_arg(source.args[1])
            # Compare against None rather than testing truthiness: a next word of 0 (e.g. "[0]",
            # or a label on instruction 0) still occupies a word in the output.
            if self.arg2[1] is not None:
                self.words += 1

        # Both types of instruction will have a first argument, process that next
        self.arg1 = self.parse_arg(source.args[0])
        if self.arg1[1] is not None:
            self.words += 1

    def build_bytecode(self):
        """
        Performs the final stage of actually building the string containing the raw bytecode.

        Label resolution is performed at this stage. The result is also stored in
        ``self.bytecode``.

        NOTE(review): when short-form labels are enabled a resolved label may shrink to zero
        extra words, while ``self.words`` was computed assuming one; addresses handed out
        earlier are not corrected for that here.
        """
        # In practice, basic instructions will rarely have a label as a first argument,
        # but the JSR extended instruction will almost always have one!
        self.arg1 = self._resolve_label(self.arg1)

        if self.arg2 and self.arg2[0] == 0xff:
            # Second argument exists and was a label. Pay no attention to the debug statement behind the curtain.
            print("Debug: Label dest line %d resolved to address 0x%04x" % (self.arg2[1], self.parent.address(self.arg2[1])))
        self.arg2 = self._resolve_label(self.arg2)

        # Check which endianness the user requested, and set the struct format appropriately.
        # We default to the DCPU-16's native little-endian byte format, but users can also opt for
        # big-endian (aka network byte format) if their emulator reads input files differently.
        # The option is a module global that only exists once main() has run, so fall back to
        # the default when this class is used as a library.
        fmt = '>H' if globals().get('big_endian', False) else '<H'

        # Generate bytecode for the first word.
        if self.opcode[0] == 0:
            # Basic instruction
            bytecode = struct.pack(fmt, self.opcode[1] | (self.arg1[0] << 4) | (self.arg2[0] << 10))
        else:
            # Extended instruction
            print("DEBUG: self.arg1 is %s and self.arg2 is %s" % (repr(self.arg1), repr(self.arg2)))
            bytecode = struct.pack(fmt, (self.opcode[1] << 4) | (self.arg1[0] << 10))

        # If either of the arguments read a "next word", then add that next word
        if self.arg1[1] is not None:
            bytecode += struct.pack(fmt, self.arg1[1])
        if self.arg2 and self.arg2[1] is not None:
            bytecode += struct.pack(fmt, self.arg2[1])

        # Save our generated bytecode as well as returning it. More for debugging than anything else.
        self.bytecode = bytecode
        return bytecode

    def _resolve_label(self, arg):
        """Returns ``arg`` with a label placeholder replaced by its address; other values pass through."""
        if arg is None or arg[0] != 0xff:
            return arg
        # Look up the address by instruction number
        val = self.parent.address(arg[1])
        if globals().get('shortform', False) and val < 0x1f:
            # We can use short form for this one!
            return (0x20 + val, None)
        # Too big to fit in short form
        return (0x1f, val)

    def _parse_number(self, text, context):
        """Converts a decimal or 0x-prefixed hexadecimal literal to an int, naming the line on failure."""
        try:
            # We assume base 16 if the literal starts with '0x', otherwise base 10
            return int(text, 16) if text[0:2] == '0x' else int(text, 10)
        except ValueError:
            raise ValueError("Invalid numeric literal %r on line %s: %s" % (text, self.lineno, context))

    def parse_arg(self, source):
        """
        Converts arguments from assembly source form into their binary form.

        Returns a (code, next_word) tuple; labels come back as (0xff, inum).
        Raises Exception for a malformed pointer or an unknown label, and ValueError for a
        number that cannot be converted.
        """
        # Does the argument exactly match one of our predetermined values?
        if source in argdefs:
            return (argdefs[source], None)

        # Next, check for square brackets, i.e. a pointer
        if source.startswith('[') and source.endswith(']'):
            inner = source[1:-1]
            if '+' not in inner:
                # This is a pointer located at a literal address
                return (0x1e, self._parse_number(inner, source))

            # Pointer plus register. Support the register name being on either side of the + sign.
            bits = inner.split('+')
            if len(bits) == 2:
                for register, literal in ((bits[1], bits[0]), (bits[0], bits[1])):
                    # Only the plain registers (codes 0x00-0x07) have a "[next word + register]"
                    # form; any other name would produce a code outside that range.
                    if argdefs.get(register, 0xff) <= 0x07:
                        return (argdefs[register] + 0x10, self._parse_number(literal, source))
            # We don't know what this is
            raise Exception("Invalid argument format on line %d: %s" % (self.lineno, source))

        if re_number.match(source):
            # This matches the regex for a literal value
            val = self._parse_number(source, source)
            if val < 0x1f:
                # We can use short form for this one!
                return (0x20 + val, None)
            # Too big to fit in short form
            return (0x1f, val)

        # None of the above, so assume it is a label. Check that it is known before touching the
        # label table, so an unknown label gives the intended error rather than a KeyError.
        labels = self.parent.labels
        if source not in labels:
            # We've never seen this label before
            raise Exception('Unknown label "%s" at line %d' % (source, self.lineno))
        print("Debug: Label %s at line %d" % (source, labels[source]))
        print(repr(self.parent.addresses))
        return (0xff, labels[source])


class Assembler:
    """
    This is the actual assembler.

    All of the work is done by other classes, this one mostly just pulls everything together.

    ``labels`` maps a label name (without the leading colon) to the number of the instruction it
    precedes, and ``addresses`` maps instruction numbers to word addresses.
    """

    def __init__(self):
        # Assembled words go here
        self.output = []

        self.labels = {}
        self.addresses = []

        # This is used to compensate for shortform labels. It is set here as well as in
        # assemble() so that address() is usable on a freshly created instance.
        self.bias = 0

    def address(self, index):
        "Applies bias to compensate for short-form labels"
        return self.addresses[index - self.bias]

    def dump(self, i):
        """
        Prints one listing line for the assembled instruction ``i``: instruction number, address,
        whitespace-compacted source text, machine code, source line number and label (if any).
        ``i.bytecode`` must already have been built.
        """
        label = ''
        source = ' '.join(i.sl.source.split()) # Compact any whitespace
        m = re_label.match(source)
        if m:
            label = m.group('label')
            source = source.split(None, 1)[1]

        # Format the binary string as big-endian words in hexadecimal, i.e. "7c01 0030".
        # bytearray yields integers on both Python 2 and 3, unlike iterating the raw string.
        raw = bytearray(i.bytecode)
        pairs = zip(raw[0::2], raw[1::2])
        # The endianness option is a module global that only exists once main() has run.
        if globals().get('big_endian', False):
            words = ["%02x%02x" % (first, second) for first, second in pairs]
        else:
            # Little-endian output stores the low byte first, so swap for display
            words = ["%02x%02x" % (second, first) for first, second in pairs]
        bytecode = ' '.join(words)

        print("%03d %04x:    %s%s ; %s (line %d) %s" % (
                i.inum, self.address(i.inum),
                source, ' '*(20-len(source)),
                bytecode, i.lineno, label))

    def assemble(self, infile, outfile):
        """
        Assembles the source file ``infile`` and writes the resulting binary to ``outfile``.

        Progress and debugging information is printed along the way. Errors in the source are
        reported as exceptions raised by the line and instruction parsers.
        """
        with open(infile, 'r') as fd:
            data = fd.readlines()

        # This is used to compensate for shortform labels
        self.bias = 0

        # Count number of instructions (used for numbering them and for label positions)
        icount = 0
        sourcelines = []
        instructions = []

        print('=== FIRST PASS ===')
        # In this pass, each input line is subjected to preliminary parsing, blank lines
        # and comments are discarded, and a list of label positions (by instruction number)
        # is compiled.
        # Line numbers start at 1 (used for "error on line x" messages)
        for linecount, line in enumerate(data, 1):
            # Discard leading/trailing whitespace, and comments
            line = line.strip().split(';')[0]

            # Check if this line looks like a label
            m = re_label.match(line)
            if m is not None:
                # Save the label along with the inum of the instruction it appears above
                self.labels[m.group('label')[1:]] = icount
                print("DEBUG: Found label %s at line %d" % (m.group('label'), linecount))

            # Check if this line (also) looks like an instruction
            if re_prelim.search(line):
                sourcelines.append(SourceLine(linecount, icount, line))
                # Increment instruction count
                icount += 1
        print("Debug: %d instructions parsed from %d lines" % (icount, len(data)))

        print('=== SECOND PASS ===')
        # In this pass we process opcodes and their arguments, find instruction word length
        # and determine an address for each instruction. This will allow us to calculate
        # label addresses in the final pass.

        # This list will map instruction numbers to addresses.
        # We could use a dict here, but a list is cleaner and faster.
        # One extra slot holds the address just past the last instruction, which is what a
        # label placed after the final instruction refers to.
        self.addresses = [None] * (icount + 1)

        # This counts which address we are up to. Basically it's a program counter
        addr = 0
        for sl in sourcelines:
            i = Instruction(self, sl) # Process instruction line (determines addresses)

            # Map instruction numbers to addresses
            self.addresses[i.inum] = addr

            # Add the length of the instruction to our program counter.
            addr += i.words
            instructions.append(i)
        self.addresses[icount] = addr

        # Debugging
        print(repr(self.addresses))

        print('=== THIRD PASS ===')
        # In this pass we resolve label addresses, generate machine code and build the actual binary.

        binary = []

        # Build the binary out of individual instructions
        for i in instructions:
            binary.append(i.build_bytecode())
            # Debugging code: Prints the instruction alongside its binary version for comparison
            self.dump(i)

        # Join up our list of binary fragments into a single byte string
        bytecode = b''.join(binary)

        # and dump it into the output file
        with open(outfile, 'wb') as fd:
            fd.write(bytecode)

        # We're done!

def main(argv=None):
    """
    Command-line entry point: parses the options and assembles the given file.

    argv -- list of argument strings to parse instead of the process's command line;
            None (the default) makes argparse read the command line as before.

    The chosen options are published as the module globals ``big_endian`` and ``shortform``,
    which is where the code generator looks for them.
    """
    global big_endian, shortform

    # Command-line arguments are handled by the argparse library
    parser = argparse.ArgumentParser(description='Compiles DCPU-16 assembly source into bytecode.')

    # Big-endian option
    parser.add_argument('-be', dest='big_endian', action='store_true',
            help='Output each 16-bit word in big-endian format. The default format is little-endian.')

    # Short form labels
    parser.add_argument('-s', dest='shortform', action='store_true',
            help='Enable short-form labels. Warning: highly experiental!')

    # Input file
    parser.add_argument('infile', nargs=1, help='The assembly file to read from.')

    # Output file
    parser.add_argument('outfile', nargs=1, help='The filename to write output to.')

    # Now parse the arguments we were given
    args = parser.parse_args(argv)
    big_endian = args.big_endian
    shortform = args.shortform

    # Create an Assembler and set it to work on our file.
    # nargs=1 makes each positional a one-element list, hence the [0].
    a = Assembler()
    a.assemble(infile=args.infile[0], outfile=args.outfile[0])

# Run the assembler only when this file is executed as a script, so that importing it as a
# library does not start an assembly run.
if __name__ == '__main__':
    main()
	import struct, re, argparse

	# Cheap pre-filter: the line contains something shaped like a three-letter mnemonic
	# (all upper-case or all lower-case) followed by whitespace.
	re_prelim = re.compile(r'(?:[A-Z]{3}|[a-z]{3})\s')

	# This regex is used to parse the data from a line of assembly code. It skips one optional
	# leading token (e.g. a label), then captures the mnemonic as "instr", the first argument as
	# "arga" and the optional comma-separated second argument as "argb".
	re_line = re.compile(
	        r'^(?:\S+\s+)?(?P<instr>[A-Z]{3}|[a-z]{3})\s+(?P<arga>[A-Za-z0-9_\[\]+]+)\s*(?:,\s*(?P<argb>[A-Za-z0-9_\[\]+]+))?'
	        )

	# This regex matches a label definition (":name") at the start of a line, optionally followed
	# by the first three letters of an instruction on the same line.
	re_label = re.compile(r'^(?P<label>:\w+)\s*(?:(?P<instr>[A-Za-z]{3})|$)')

	# This regex matches a complete decimal or hexadecimal number (used for checking if parameters
	# are a label). The hex alternative comes first and the pattern is anchored at both ends, so
	# "0x1f" matches as a whole instead of stopping after the leading "0", and names that merely
	# start with a digit are not mistaken for numbers.
	re_number = re.compile(r'^(?:0x[0-9A-Fa-f]+|[0-9]+)$')

	# Opcode table. Maps a mnemonic to a (type, binary) tuple where type is 0 for a basic
	# instruction and 1 for an extended instruction. The basic opcodes are numbered 0x1..0xf in
	# the order listed below.
	opcodefs = dict(
	        (mnemonic, (0, number))
	        for number, mnemonic in enumerate(
	            'SET ADD SUB MUL DIV MOD SHL SHR AND BOR XOR IFE IFN IFG IFB'.split(), 1)
	        )
	opcodefs['JSR'] = (1, 0x01)

	# Argument lookup table: argument text -> binary value. Registers take 0x00..0x07 in the order
	# A B C X Y Z I J, the same registers in square brackets take 0x08..0x0f, and the remaining
	# special names are listed explicitly.
	argdefs = dict((register, code) for code, register in enumerate('ABCXYZIJ'))
	argdefs.update(('[%s]' % register, 0x08 + code) for code, register in enumerate('ABCXYZIJ'))
	argdefs.update({
	        'POP': 0x18,
	        'PEEK': 0x19,
	        'PUSH': 0x1a,
	        'SP': 0x1b,
	        'PC': 0x1c,
	        'O': 0x1d,
	        })

	class SourceLine:
	    """
	    Holds one line of assembly source together with the pieces extracted from it.

	    Attributes set by the constructor: ``lineno`` (line number in the input file), ``inum``
	    (which instruction this is), ``source`` (the text that was parsed), ``instruction`` (the
	    mnemonic text) and ``args`` (a 2-tuple of argument strings; the second is None when the
	    line has only one argument).
	    """
	    def __init__(self, line_number, inum, source):
	        self.lineno = line_number
	        self.inum = inum # Which instruction is this
	        self.source = source
	        self.instruction = None
	        self.args = None
	        self.parse(source)

	    def parse(self, source):
	        """
	        Splits a line of source code into its mnemonic and argument strings.

	        Only the shape of the line is checked here, using the module's line regex: unknown
	        mnemonics such as ABC or malformed values such as 0x56g1 are still accepted at this
	        point. A line that does not have the shape of an instruction raises an Exception
	        carrying the line number and the offending text.
	        """
	        match = re_line.search(source)
	        if match is None:
	            raise Exception("Syntax error\nLine %d: %s" % (self.lineno, source))
	        self.instruction = match.group('instr')
	        self.args = (match.group('arga'), match.group('argb'))


	class Instruction:
	    """
	    This class is used to compile a line of assembly source into its binary machine code representation.

	    Arguments are held as (code, next_word) tuples: ``code`` is the value packed into the
	    instruction word and ``next_word`` is the extra data word emitted after it, or None when the
	    argument needs no extra word. A label that has not been resolved yet is held as
	    (0xff, inum), where inum is the number of the instruction the label points at.
	    """

	    def __init__(self, parent, source=None):
	        """
	        Initializes variables and, when a source line is given, processes it straight away.

	        parent -- object providing the label table (``labels``), the address list
	                  (``addresses``) and the ``address(inum)`` lookup.
	        source -- parsed source line exposing ``lineno``, ``inum``, ``source``,
	                  ``instruction`` and ``args``; None creates an empty instruction.
	        """
	        self.parent = parent
	        self.sl = source
	        # The signature allows source=None, so it must not be dereferenced unconditionally.
	        self.lineno = source.lineno if source is not None else None
	        self.inum = source.inum if source is not None else None
	        self.address = None
	        self.opcode = None
	        self.arg1 = None
	        self.arg2 = None
	        self.bytecode = None
	        self.words = 1

	        if source is not None:
	            self.process(source)

	    def process(self, source):
	        """
	        Processes a line of assembly.

	        In this stage, some checking is done and the text form of the instruction (e.g. "ADD") and the arguments
	        are converted into their numerical representations. An error is raised if the instruction does not exist.
	        The mnemonic is looked up case-insensitively, because the line regex accepts lower-case mnemonics.

	        Argument values must parse as either:
	            * a special value (i.e. must directly match a key in the argdefs dict)
	            * a decimal or hexadecimal value (either literal, or enclosed in [] to denote a pointer)
	            * an offset plus register value as a pointer (e.g. "[0x1000+I]")
	        Any argument values that do NOT match one of the above is assumed to be a label, and will cause an error if
	        the label does not actually exist.

	        Labels are not actually replaced with addresses at this stage, because we don't yet know how many words each
	        instruction takes up and therefore we don't know the actual address of any label. Instead we use a placeholder
	        value for labels, which will be replaced with the correct address in the final stage.

	        Raises Exception for an unknown mnemonic or a basic instruction with only one argument.
	        """
	        # Count the number of words this instruction will take up. This is important for label resolution.
	        self.words = 1
	        self.arg2 = None

	        mnemonic = source.instruction.upper()
	        if mnemonic not in opcodefs:
	            raise Exception("Unrecognised instruction %s\nLine %s: %s" % (source.instruction, self.lineno, source.source))
	        self.opcode = opcodefs[mnemonic]

	        # Is this a basic instruction? If it is, it needs to have two arguments (i.e. the second one can't be None)
	        if self.opcode[0] == 0:
	            if source.args[1] is None:
	                raise Exception("Basic instructions require two arguments\nLine %d: %s" % (self.lineno, source.source))

	            # Work out the numerical value of the second argument (which an extended instruction won't have)
	            self.arg2 = self.parse_arg(source.args[1])
	            # Compare against None rather than testing truthiness: a next word of 0 (e.g. "[0]",
	            # or a label on instruction 0) still occupies a word in the output.
	            if self.arg2[1] is not None:
	                self.words += 1

	        # Both types of instruction will have a first argument, process that next
	        self.arg1 = self.parse_arg(source.args[0])
	        if self.arg1[1] is not None:
	            self.words += 1

	    def build_bytecode(self):
	        """
	        Performs the final stage of actually building the string containing the raw bytecode.

	        Label resolution is performed at this stage. The result is also stored in
	        ``self.bytecode``.

	        NOTE(review): when short-form labels are enabled a resolved label may shrink to zero
	        extra words, while ``self.words`` was computed assuming one; addresses handed out
	        earlier are not corrected for that here.
	        """
	        # In practice, basic instructions will rarely have a label as a first argument,
	        # but the JSR extended instruction will almost always have one!
	        self.arg1 = self._resolve_label(self.arg1)

	        if self.arg2 and self.arg2[0] == 0xff:
	            # Second argument exists and was a label. Pay no attention to the debug statement behind the curtain.
	            print("Debug: Label dest line %d resolved to address 0x%04x" % (self.arg2[1], self.parent.address(self.arg2[1])))
	        self.arg2 = self._resolve_label(self.arg2)

	        # Check which endianness the user requested, and set the struct format appropriately.
	        # We default to the DCPU-16's native little-endian byte format, but users can also opt for
	        # big-endian (aka network byte format) if their emulator reads input files differently.
	        # The option is a module global that only exists once main() has run, so fall back to
	        # the default when this class is used as a library.
	        fmt = '>H' if globals().get('big_endian', False) else '<H'

	        # Generate bytecode for the first word.
	        if self.opcode[0] == 0:
	            # Basic instruction
	            bytecode = struct.pack(fmt, self.opcode[1] | (self.arg1[0] << 4) | (self.arg2[0] << 10))
	        else:
	            # Extended instruction
	            print("DEBUG: self.arg1 is %s and self.arg2 is %s" % (repr(self.arg1), repr(self.arg2)))
	            bytecode = struct.pack(fmt, (self.opcode[1] << 4) | (self.arg1[0] << 10))

	        # If either of the arguments read a "next word", then add that next word
	        if self.arg1[1] is not None:
	            bytecode += struct.pack(fmt, self.arg1[1])
	        if self.arg2 and self.arg2[1] is not None:
	            bytecode += struct.pack(fmt, self.arg2[1])

	        # Save our generated bytecode as well as returning it. More for debugging than anything else.
	        self.bytecode = bytecode
	        return bytecode

	    def _resolve_label(self, arg):
	        """Returns ``arg`` with a label placeholder replaced by its address; other values pass through."""
	        if arg is None or arg[0] != 0xff:
	            return arg
	        # Look up the address by instruction number
	        val = self.parent.address(arg[1])
	        if globals().get('shortform', False) and val < 0x1f:
	            # We can use short form for this one!
	            return (0x20 + val, None)
	        # Too big to fit in short form
	        return (0x1f, val)

	    def _parse_number(self, text, context):
	        """Converts a decimal or 0x-prefixed hexadecimal literal to an int, naming the line on failure."""
	        try:
	            # We assume base 16 if the literal starts with '0x', otherwise base 10
	            return int(text, 16) if text[0:2] == '0x' else int(text, 10)
	        except ValueError:
	            raise ValueError("Invalid numeric literal %r on line %s: %s" % (text, self.lineno, context))

	    def parse_arg(self, source):
	        """
	        Converts arguments from assembly source form into their binary form.

	        Returns a (code, next_word) tuple; labels come back as (0xff, inum).
	        Raises Exception for a malformed pointer or an unknown label, and ValueError for a
	        number that cannot be converted.
	        """
	        # Does the argument exactly match one of our predetermined values?
	        if source in argdefs:
	            return (argdefs[source], None)

	        # Next, check for square brackets, i.e. a pointer
	        if source.startswith('[') and source.endswith(']'):
	            inner = source[1:-1]
	            if '+' not in inner:
	                # This is a pointer located at a literal address
	                return (0x1e, self._parse_number(inner, source))

	            # Pointer plus register. Support the register name being on either side of the + sign.
	            bits = inner.split('+')
	            if len(bits) == 2:
	                for register, literal in ((bits[1], bits[0]), (bits[0], bits[1])):
	                    # Only the plain registers (codes 0x00-0x07) have a "[next word + register]"
	                    # form; any other name would produce a code outside that range.
	                    if argdefs.get(register, 0xff) <= 0x07:
	                        return (argdefs[register] + 0x10, self._parse_number(literal, source))
	            # We don't know what this is
	            raise Exception("Invalid argument format on line %d: %s" % (self.lineno, source))

	        if re_number.match(source):
	            # This matches the regex for a literal value
	            val = self._parse_number(source, source)
	            if val < 0x1f:
	                # We can use short form for this one!
	                return (0x20 + val, None)
	            # Too big to fit in short form
	            return (0x1f, val)

	        # None of the above, so assume it is a label. Check that it is known before touching the
	        # label table, so an unknown label gives the intended error rather than a KeyError.
	        labels = self.parent.labels
	        if source not in labels:
	            # We've never seen this label before
	            raise Exception('Unknown label "%s" at line %d' % (source, self.lineno))
	        print("Debug: Label %s at line %d" % (source, labels[source]))
	        print(repr(self.parent.addresses))
	        return (0xff, labels[source])


	class Assembler:
	    """
	    This is the actual assembler.

	    All of the work is done by other classes, this one mostly just pulls everything together.

	    ``labels`` maps a label name (without the leading colon) to the number of the instruction it
	    precedes, and ``addresses`` maps instruction numbers to word addresses.
	    """

	    def __init__(self):
	        # Assembled words go here
	        self.output = []

	        self.labels = {}
	        self.addresses = []

	        # This is used to compensate for shortform labels. It is set here as well as in
	        # assemble() so that address() is usable on a freshly created instance.
	        self.bias = 0

	    def address(self, index):
	        "Applies bias to compensate for short-form labels"
	        return self.addresses[index - self.bias]

	    def dump(self, i):
	        """
	        Prints one listing line for the assembled instruction ``i``: instruction number, address,
	        whitespace-compacted source text, machine code, source line number and label (if any).
	        ``i.bytecode`` must already have been built.
	        """
	        label = ''
	        source = ' '.join(i.sl.source.split()) # Compact any whitespace
	        m = re_label.match(source)
	        if m:
	            label = m.group('label')
	            source = source.split(None, 1)[1]

	        # Format the binary string as big-endian words in hexadecimal, i.e. "7c01 0030".
	        # bytearray yields integers on both Python 2 and 3, unlike iterating the raw string.
	        raw = bytearray(i.bytecode)
	        pairs = zip(raw[0::2], raw[1::2])
	        # The endianness option is a module global that only exists once main() has run.
	        if globals().get('big_endian', False):
	            words = ["%02x%02x" % (first, second) for first, second in pairs]
	        else:
	            # Little-endian output stores the low byte first, so swap for display
	            words = ["%02x%02x" % (second, first) for first, second in pairs]
	        bytecode = ' '.join(words)

	        print("%03d %04x:    %s%s ; %s (line %d) %s" % (
	                i.inum, self.address(i.inum),
	                source, ' '*(20-len(source)),
	                bytecode, i.lineno, label))

	    def assemble(self, infile, outfile):
	        """
	        Assembles the source file ``infile`` and writes the resulting binary to ``outfile``.

	        Progress and debugging information is printed along the way. Errors in the source are
	        reported as exceptions raised by the line and instruction parsers.
	        """
	        with open(infile, 'r') as fd:
	            data = fd.readlines()

	        # This is used to compensate for shortform labels
	        self.bias = 0

	        # Count number of instructions (used for numbering them and for label positions)
	        icount = 0
	        sourcelines = []
	        instructions = []

	        print('=== FIRST PASS ===')
	        # In this pass, each input line is subjected to preliminary parsing, blank lines
	        # and comments are discarded, and a list of label positions (by instruction number)
	        # is compiled.
	        # Line numbers start at 1 (used for "error on line x" messages)
	        for linecount, line in enumerate(data, 1):
	            # Discard leading/trailing whitespace, and comments
	            line = line.strip().split(';')[0]

	            # Check if this line looks like a label
	            m = re_label.match(line)
	            if m is not None:
	                # Save the label along with the inum of the instruction it appears above
	                self.labels[m.group('label')[1:]] = icount
	                print("DEBUG: Found label %s at line %d" % (m.group('label'), linecount))

	            # Check if this line (also) looks like an instruction
	            if re_prelim.search(line):
	                sourcelines.append(SourceLine(linecount, icount, line))
	                # Increment instruction count
	                icount += 1
	        print("Debug: %d instructions parsed from %d lines" % (icount, len(data)))

	        print('=== SECOND PASS ===')
	        # In this pass we process opcodes and their arguments, find instruction word length
	        # and determine an address for each instruction. This will allow us to calculate
	        # label addresses in the final pass.

	        # This list will map instruction numbers to addresses.
	        # We could use a dict here, but a list is cleaner and faster.
	        # One extra slot holds the address just past the last instruction, which is what a
	        # label placed after the final instruction refers to.
	        self.addresses = [None] * (icount + 1)

	        # This counts which address we are up to. Basically it's a program counter
	        addr = 0
	        for sl in sourcelines:
	            i = Instruction(self, sl) # Process instruction line (determines addresses)

	            # Map instruction numbers to addresses
	            self.addresses[i.inum] = addr

	            # Add the length of the instruction to our program counter.
	            addr += i.words
	            instructions.append(i)
	        self.addresses[icount] = addr

	        # Debugging
	        print(repr(self.addresses))

	        print('=== THIRD PASS ===')
	        # In this pass we resolve label addresses, generate machine code and build the actual binary.

	        binary = []

	        # Build the binary out of individual instructions
	        for i in instructions:
	            binary.append(i.build_bytecode())
	            # Debugging code: Prints the instruction alongside its binary version for comparison
	            self.dump(i)

	        # Join up our list of binary fragments into a single byte string
	        bytecode = b''.join(binary)

	        # and dump it into the output file
	        with open(outfile, 'wb') as fd:
	            fd.write(bytecode)

	        # We're done!

	def main(argv=None):
	    """
	    Command-line entry point: parses the options and assembles the given file.

	    argv -- list of argument strings to parse instead of the process's command line;
	            None (the default) makes argparse read the command line as before.

	    The chosen options are published as the module globals ``big_endian`` and ``shortform``,
	    which is where the code generator looks for them.
	    """
	    global big_endian, shortform

	    # Command-line arguments are handled by the argparse library
	    parser = argparse.ArgumentParser(description='Compiles DCPU-16 assembly source into bytecode.')

	    # Big-endian option
	    parser.add_argument('-be', dest='big_endian', action='store_true',
	            help='Output each 16-bit word in big-endian format. The default format is little-endian.')

	    # Short form labels
	    parser.add_argument('-s', dest='shortform', action='store_true',
	            help='Enable short-form labels. Warning: highly experiental!')

	    # Input file
	    parser.add_argument('infile', nargs=1, help='The assembly file to read from.')

	    # Output file
	    parser.add_argument('outfile', nargs=1, help='The filename to write output to.')

	    # Now parse the arguments we were given
	    args = parser.parse_args(argv)
	    big_endian = args.big_endian
	    shortform = args.shortform

	    # Create an Assembler and set it to work on our file.
	    # nargs=1 makes each positional a one-element list, hence the [0].
	    a = Assembler()
	    a.assemble(infile=args.infile[0], outfile=args.outfile[0])

	# Run the assembler only when this file is executed as a script, so that importing it as a
	# library does not start an assembly run.
	if __name__ == '__main__':
	    main()