Created
August 6, 2020 13:43
-
-
Save randrews/e019a389e3fcda5a13767f27d24c4201 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
lpeg = require('lpeg') | |
-- # Forge Compiler
-- A compiler for Forge, a high-level language for the Vulcan computer.
-- ## Parser
-- `read` iterates over the tokens in a source file, pairing each token with
-- the number of the line it was found on.
-- Pass in an iterator over lines of source (like the one from io.lines);
-- successive calls to the returned function yield successive tokens.
--- Build a token iterator over Forge source text.
-- Tokens are maximal runs of non-whitespace characters. Tokens that look like
-- numbers are converted to Lua numbers; everything else is returned as a string.
-- @param lines iterator function returning the next source line on each call,
--   and nil once the input is exhausted (e.g. the result of io.lines)
-- @return a function; each call returns `token, line_number` for the next
--   token, or nothing once there are no more tokens. Calling it again after
--   the end keeps returning nothing and does not call `lines` again.
function read(lines)
  -- Current line number, current column within that line, and the line itself.
  -- `line` is nil when the input is empty or exhausted.
  local line_num = 1
  local start = 1
  local line = lines()

  -- Turn tokens that look like numbers into actual numbers:
  -- - Decimals with an optional leading minus sign
  -- - Hex with a leading `0x`
  -- - Binary with a leading `0b`
  -- Anything else is returned unchanged.
  local function parse_number(token)
    if token:match('^[-]?%d+$') then
      return tonumber(token)
    elseif token:match('^0x([%da-fA-F]+)$') then
      return tonumber(token:sub(3), 16)
    elseif token:match('^0b([01]+)$') then
      return tonumber(token:sub(3), 2)
    end
    return token
  end

  return function()
    -- Looping on `line` (rather than `true`) makes empty input and repeated
    -- calls after the end safe: there is never a nil line to index.
    while line do
      -- Skip leading whitespace, grab the next word and the position just
      -- after it, starting from the current column. `%S` is used instead of
      -- `%g` so that tokens containing non-ASCII bytes are still read as
      -- tokens instead of causing the rest of the line to be dropped.
      local token, after = line:match('^%s*(%S+)()', start)
      if token then
        start = after
        return parse_number(token), line_num
      end
      -- No more tokens on this line: advance to the next one and retry.
      line_num = line_num + 1
      start = 1
      line = lines()
    end
  end
end
--- Compile Forge source into assembly lines.
-- Tokens are obtained by calling `read(lines)`. Output is buffered per segment
-- and handed to `final_emit` one line at a time once all input is consumed.
-- @param lines iterator over source lines, passed straight to `read`
-- @param final_emit function called once per output line (a string), in order
-- Raises an error (a plain string message) when a word is undefined or when
-- the name following `:` is a number or a reserved token.
function compile(lines, final_emit)
  -- We have three segments in the program:
  --
  -- - Global, which gets emitted first and is all the expressions in the
  --   global context, followed by an implicit hlt
  -- - Words, emitted second and are all the functions
  -- - Variables, emitted last (nothing in this function writes to it yet)
  local segments = { global = {}, words = {}, variables = {} }
  local current_segment = 'global'

  -- Append a line to the given segment, or to the current one by default.
  local function emit(line, segment)
    local target = segments[segment or current_segment]
    target[#target + 1] = line
  end

  -- The dictionary tells us how to handle each word: an entry holds either an
  -- opcode (`asm`) or a label to call (`label`). It starts out containing only
  -- the single-opcode primitives; `:` definitions add labelled entries.
  local dictionary = {
    ['+'] = { asm = 'add' }, ['-'] = { asm = 'sub' }, ['*'] = { asm = 'mul' },
    ['/'] = { asm = 'div' }, mod = { asm = 'mod' },
    drop = { asm = 'pop' }, dup = { asm = 'dup' }, ['2dup'] = { asm = '2dup' },
    swap = { asm = 'swap' },
    ['and'] = { asm = 'and' }, ['or'] = { asm = 'or' }, xor = { asm = 'xor' },
    ['>'] = { asm = 'agt' }, ['<'] = { asm = 'alt' },
    ['@'] = { asm = 'load24' }, ['!'] = { asm = 'store24' }
  }

  -- Some things can't be used as word names:
  local reserved = {
    [':'] = true, [';'] = true, ['('] = true, [')'] = true
  }

  -- Generate a fresh, unique label for each newly defined word.
  local sym_id = 0
  local function gensym()
    sym_id = sym_id + 1
    return '_gen_' .. sym_id
  end

  -- Parser state: 'normal', 'comment', 'word-name' (just saw `:`) or
  -- 'word-definition' (inside a `: name ... ;` body). `old_mode` remembers
  -- which mode to return to when a comment closes.
  local mode = 'normal'
  local old_mode = nil

  for token, line_num in read(lines) do
    if mode == 'comment' then
      -- Comments do not nest: everything up to the first `)` is skipped.
      -- Checking this before `(` matters; otherwise a `(` inside a comment
      -- would overwrite old_mode with 'comment' and the comment would never end.
      if token == ')' then mode = old_mode end
    elseif token == '(' then
      old_mode, mode = mode, 'comment'
    elseif mode == 'word-name' then
      if type(token) == 'number' or reserved[token] then
        error('Invalid name "' .. token .. '" for new word on line ' .. line_num, 0)
      end
      local label = gensym()
      dictionary[token] = { label = label }
      current_segment = 'words'
      mode = 'word-definition'
      emit(label .. ':')
    elseif mode == 'word-definition' and token == ';' then
      emit('\tret')
      mode, current_segment = 'normal', 'global'
    elseif type(token) == 'number' then
      -- Numeric literals are emitted as the argument of a nop.
      emit('\tnop\t' .. token)
    elseif token == ':' then
      mode = 'word-name'
    else
      local def = dictionary[token]
      if not def then
        error('Undefined word "' .. token .. '" on line ' .. line_num, 0)
      end
      if def.asm then
        emit('\t' .. def.asm)
      elseif def.label then
        emit('\t' .. 'call ' .. def.label)
      end
    end
  end
  -- NOTE(review): a definition still open at end of input (missing `;`) is
  -- emitted without a trailing ret, as before; consider reporting it.

  -- Helper for emitting an entire segment to the final output at once
  local function emit_segment(segment)
    for i = 1, #segment do final_emit(segment[i]) end
  end

  -- Emit all of the global segment followed by a hlt, then any words and
  -- variables. Those don't get a hlt of their own: every completed word
  -- definition already ends in ret.
  emit_segment(segments.global)
  final_emit('\thlt')
  emit_segment(segments.words)
  emit_segment(segments.variables)
end
-- Module interface: expose the tokenizer and the compiler entry point.
local exports = {}
exports.read = read
exports.compile = compile
return exports
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment