Skip to content

Instantly share code, notes, and snippets.

@p4bl0-
Last active October 12, 2023 09:09
Show Gist options
  • Save p4bl0-/9f4e950e6c06fbba7e168097d89b0e46 to your computer and use it in GitHub Desktop.
Save p4bl0-/9f4e950e6c06fbba7e168097d89b0e46 to your computer and use it in GitHub Desktop.
A complete compiler for a simple language (in less than 150 LoC)

This project is a tiny compiler for a very simple language consisting of boolean expression.

The language has two constants: 1 for true and 0 for false, and 4 logic gates: ! (not), & (and), | (or), and ^ (xor).

It can also use parentheses to manage priorities.

Here is its grammar in BNF format:

expr ::= "0" | "1"
       | "!" expr
       | expr "&" expr
       | expr "|" expr
       | expr "^" expr
       | "(" expr ")"

The goal is to compile it to a virtual machine bytecode.

The virual machine has two registers (acc and tmp) and memory stack.

The bytecode consists in 6 instructions:

  • t: sets the acc register to 1
  • f: sets the acc register to 0
  • s: pushes the value of the acc register onto the stack
  • l: pops the top value of the stack and write it to the acc register
  • h: copies the value of the acc register into the tmp register
  • n: sets the acc register to the value of acc nand tmp

This is sufficient because the nand gate is functionally complete: https://en.wikipedia.org/wiki/Functional_completeness

After lexing and parsing, we do not need any semantic analysis or type checking because there are no names (e.g., no variables) and a single type.

This is why we do not keep source location information in the parser's output. In a more complex language (e.g., adding support for assignment to variables), we would need an additional front-end pass after the parser. This is not a lot to add, I could make it if asked to.

The compilation strategy is to first use known formulas to translate the parsed boolean expression into an equivalent expression that only uses nand gates. (See https://en.wikipedia.org/wiki/NAND_logic)

In a second pass, this (functional) expression is compiled down to (imperative) bytecode instructions.

This is were we can see actual compilation work: the paradigm shift from functional "high level" to imperative "low level" forces us to implement our language's abstractions (here, function call) using more primitive constructs (stack and register manipulation).

We can also see that the approach we use will generate non-optimal code, because our compiler will always treat nand gates in the same way without regard to the context (e.g. we would manually translate "!1" into "thn" but our compiler will generate "tsthln"). An additional optimization pass that look for such patterns and replace them with manually optimized ones would be quite easy to make. I could also provide it if asked to.

The VM is implemented in C.

The compiler is implemented in OCaml.

The lexer uses ocamllex, the parser uses menhir.

Files need to be renamed without the numbers and underscore at the beginging of their names, which are here so that they appear in the desired order in this gist.

To compile the compiler: ocamlbuild -use-menhir main.byte.

To compile the vm: gcc nandvm.c -o nandvm.

Then you can for example run:

  • ./main.byte <<<"1" | ./nandvm
  • ./main.byte <<<"!(0 ^ 1)" | ./nandvm
  • ./main.byte <<<"0 ^ (0 | 1) & !(1 ^ 1)" | ./nandvm
{
open Parser
exception Error of char
}
rule token = parse
| eof { Leof }
| [' ' '\t' '\n'] { token lexbuf }
| "0" { Lbool (false) }
| "1" { Lbool (true) }
| "!" { Lnot }
| "&" { Land }
| "|" { Lor }
| "^" { Lxor }
| "(" { Lopar }
| ")" { Lcpar }
| _ as c { raise (Error c) }
%{
open Ast.Bool
%}
%token <bool> Lbool
%token Lnot Land Lor Lxor Lopar Lcpar Leof
%left Land Lor Lxor
%right Lnot
%start prog
%type <Ast.Bool.expr> prog
%%
prog:
| e = expr; Leof { e }
;
expr:
| b = Lbool { Bool (b) }
| Lnot; e = expr { Not (e) }
| Lopar; e = expr; Lcpar { e }
| a = expr; Land; b = expr { And (a, b) }
| a = expr; Lor; b = expr { Or (a, b) }
| a = expr; Lxor; b = expr { Xor (a, b) }
;
module Bool = struct
type expr =
| Bool of bool
| Not of expr
| And of expr * expr
| Or of expr * expr
| Xor of expr * expr
end
module Nand = struct
type expr =
| B of bool
| N of expr * expr
end
module Byte = struct
type instr =
| False
| True
| Push
| Pop
| Hold
| Nand
and prog = instr list
end
open Ast.Bool
open Ast.Nand
(* formulas taken from https://en.wikipedia.org/wiki/NAND_logic *)
let rec compile = function
| Bool b -> B b
| Not e -> let ce = compile e in N (ce, ce)
| And (a, b) -> let ca, cb = compile a, compile b in
N (N (ca, cb), N (ca, cb))
| Or (a, b) -> let ca, cb = compile a, compile b in
N (N (ca, ca), N (cb, cb))
| Xor (a, b) -> let ca, cb = compile a, compile b in
N (N (ca, N (ca, cb)), N (cb, N (ca, cb)))
(* bonus: eval, useless in our compiler except to make some tests *)
let rec eval = function
| Bool b -> b
| Not e -> not (eval e)
| And (a, b) -> (eval a) && (eval b)
| Or (a, b) -> (eval a) || (eval b)
| Xor (a, b) -> (eval a) <> (eval b)
open Ast.Nand
open Ast.Byte
let rec compile = function
| B false -> [ False ]
| B true -> [ True ]
| N (a, b) -> (compile a) @ [ Push ] @ (compile b) @ [ Hold ; Pop ; Nand ]
(* bonus: eval, useless in our compiler except to make some tests *)
let rec eval = function
| B e -> e
| N (a, b) -> not ((eval a) && (eval b))
open Ast.Byte
let emit prog =
let bc = List.map (function
| False -> 'f'
| True -> 't'
| Push -> 's'
| Pop -> 'l'
| Hold -> 'h'
| Nand -> 'n')
prog in
List.iter print_char bc
(* bonus: run bytecode (functionnal style vm runtime implementation) *)
let run code =
let rec exec prog acc tmp stack =
match prog with
| [] -> acc
| False :: p -> exec p false tmp stack
| True :: p -> exec p true tmp stack
| Push :: p -> exec p acc tmp (acc :: stack)
| Pop :: p -> exec p (List.hd stack) tmp (List.tl stack)
| Hold :: p -> exec p acc acc stack
| Nand :: p -> exec p (not (acc && tmp)) tmp stack
in exec code false false []
let err msg pos =
Lexing.(Printf.eprintf "Error on line %d col %d: %s.\n"
pos.pos_lnum (pos.pos_cnum - pos.pos_bol) msg) ;
exit 1
let () =
let src = Lexing.from_channel Stdlib.stdin in
try
let boolexpr = Parser.prog Lexer.token src in
let nandexpr = Bool.compile boolexpr in
let bytecode = Nand.compile nandexpr in
Byte.emit bytecode
with
| Lexer.Error c ->
err (Printf.sprintf "unrecognized char '%c'" c) (Lexing.lexeme_start_p src)
| Parser.Error ->
err "syntax error" (Lexing.lexeme_start_p src)
#include <stdio.h>
int
main (int argc, char *argv[])
{
char acc = 0, tmp = 0; // registers
char mem[16384]; // memory
int sp = 0; // stack pointer
char instr;
while ((instr = getchar()) != EOF) {
switch (instr) {
case 'f' /* False */ : acc = 0; break;
case 't' /* True */ : acc = 1; break;
case 's' /* Push */ : mem[sp++] = acc; break; // to avoid "p" conflict, "s" as in "store"
case 'l' /* Pop */ : acc = mem[--sp]; break; // to avoid "p" conflict, "l" as in "load"
case 'h' /* Hold */ : tmp = acc; break;
case 'n' /* Nand */ : acc = !(acc && tmp); break;
}
}
printf("result = %d\n", acc);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment