p4bl0-/00_readme.md

## 00_readme.md

      
    Raw
  

              00_readme.md
            
          
    This project is a tiny compiler for a very simple language consisting of boolean expression.
The language has two constants: 1 for true and 0 for false, and 4 logic gates:
! (not), & (and), | (or), and ^ (xor).
It can also use parentheses to manage priorities.
Here is its grammar in BNF format:
expr ::= "0" | "1"
       | "!" expr
       | expr "&" expr
       | expr "|" expr
       | expr "^" expr
       | "(" expr ")"

The goal is to compile it to a virtual machine bytecode.
The virual machine has two registers (acc and tmp) and memory stack.
The bytecode consists in 6 instructions:

t: sets the acc register to 1
f: sets the acc register to 0
s: pushes the value of the acc register onto the stack
l: pops the top value of the stack and write it to the acc register
h: copies the value of the acc register into the tmp register
n: sets the acc register to the value of acc nand tmp

This is sufficient because the nand gate is functionally complete:
https://en.wikipedia.org/wiki/Functional_completeness
After lexing and parsing, we do not need any semantic analysis or type checking
because there are no names (e.g., no variables) and a single type.
This is why we do not keep source location information in the parser's output.
In a more complex language (e.g., adding support for assignment to variables),
we would need an additional front-end pass after the parser.
This is not a lot to add, I could make it if asked to.
The compilation strategy is to first use known formulas to translate the parsed
boolean expression into an equivalent expression that only uses nand gates.
(See https://en.wikipedia.org/wiki/NAND_logic)
In a second pass, this (functional) expression is compiled down to (imperative)
bytecode instructions.
This is were we can see actual compilation work: the paradigm shift from
functional "high level" to imperative "low level" forces us to implement
our language's abstractions (here, function call) using more primitive
constructs (stack and register manipulation).
We can also see that the approach we use will generate non-optimal code,
because our compiler will always treat nand gates in the same way without
regard to the context (e.g. we would manually translate "!1" into "thn" but
our compiler will generate "tsthln"). An additional optimization pass that
look for such patterns and replace them with manually optimized ones would
be quite easy to make. I could also provide it if asked to.
The VM is implemented in C.
The compiler is implemented in OCaml.
The lexer uses ocamllex, the parser uses menhir.
Files need to be renamed without the numbers and underscore at the beginging of
their names, which are here so that they appear in the desired order in this gist.
To compile the compiler: ocamlbuild -use-menhir main.byte.
To compile the vm: gcc nandvm.c -o nandvm.
Then you can for example run:

./main.byte <<<"1" | ./nandvm
./main.byte <<<"!(0 ^ 1)" | ./nandvm
./main.byte <<<"0 ^ (0 | 1) & !(1 ^ 1)" | ./nandvm


## 01_lexer.mll
{
  open Parser
  exception Error of char
}

rule token = parse
| eof             { Leof }
| [' ' '\t' '\n'] { token lexbuf }
| "0"             { Lbool (false) }
| "1"             { Lbool (true) }
| "!"             { Lnot }
| "&"             { Land }
| "|"             { Lor }
| "^"             { Lxor }
| "("             { Lopar }
| ")"             { Lcpar }
| _ as c          { raise (Error c) }

## 02_parser.mly
%{
  open Ast.Bool
%}

%token <bool> Lbool
%token Lnot Land Lor Lxor Lopar Lcpar Leof

%left Land Lor Lxor
%right Lnot

%start prog

%type <Ast.Bool.expr> prog

%%

prog:
| e = expr; Leof { e }
;

expr:
| b = Lbool                { Bool (b) }
| Lnot; e = expr           { Not (e) }
| Lopar; e = expr; Lcpar   { e }
| a = expr; Land; b = expr { And (a, b) }
| a = expr; Lor;  b = expr { Or (a, b) }
| a = expr; Lxor; b = expr { Xor (a, b) }
;

## 03_ast.ml
module Bool = struct
  type expr =
  | Bool of bool
  | Not  of expr
  | And  of expr * expr
  | Or   of expr * expr
  | Xor  of expr * expr
end

module Nand = struct
  type expr =
  | B of bool
  | N of expr * expr
end

module Byte = struct
  type instr =
    | False
    | True
    | Push
    | Pop
    | Hold
    | Nand
  and prog = instr list
end

## 04_bool.ml
open Ast.Bool
open Ast.Nand

(* formulas taken from https://en.wikipedia.org/wiki/NAND_logic *)
let rec compile = function
  | Bool b     -> B b
  | Not e      -> let ce = compile e in N (ce, ce)
  | And (a, b) -> let ca, cb = compile a, compile b in
                  N (N (ca, cb), N (ca, cb))
  | Or (a, b)  -> let ca, cb = compile a, compile b in
                  N (N (ca, ca), N (cb, cb))
  | Xor (a, b) -> let ca, cb = compile a, compile b in
                  N (N (ca, N (ca, cb)), N (cb, N (ca, cb)))

(* bonus: eval, useless in our compiler except to make some tests *)
let rec eval = function
  | Bool b     -> b
  | Not e      -> not (eval e)
  | And (a, b) -> (eval a) && (eval b)
  | Or (a, b)  -> (eval a) || (eval b)
  | Xor (a, b) -> (eval a) <> (eval b)

## 05_nand.ml
open Ast.Nand
open Ast.Byte

let rec compile = function
  | B false  -> [ False ]
  | B true   -> [ True ]
  | N (a, b) -> (compile a) @ [ Push ] @ (compile b) @ [ Hold ; Pop ; Nand ]

(* bonus: eval, useless in our compiler except to make some tests *)
let rec eval = function
  | B e      -> e
  | N (a, b) -> not ((eval a) && (eval b))

## 06_byte.ml
open Ast.Byte

let emit prog =
  let bc = List.map (function
               | False -> 'f'
               | True  -> 't'
               | Push  -> 's'
               | Pop   -> 'l'
               | Hold  -> 'h'
               | Nand  -> 'n')
             prog in
  List.iter print_char bc

(* bonus: run bytecode (functionnal style vm runtime implementation) *)
let run code =
  let rec exec prog acc tmp stack =
    match prog with
    | []         -> acc
    | False :: p -> exec p false tmp stack
    | True  :: p -> exec p true tmp stack
    | Push  :: p -> exec p acc tmp (acc :: stack)
    | Pop   :: p -> exec p (List.hd stack) tmp (List.tl stack)
    | Hold  :: p -> exec p acc acc stack
    | Nand  :: p -> exec p (not (acc && tmp)) tmp stack
  in exec code false false []

## 07_main.ml
let err msg pos =
  Lexing.(Printf.eprintf "Error on line %d col %d: %s.\n"
            pos.pos_lnum (pos.pos_cnum - pos.pos_bol) msg) ;
  exit 1

let () =
  let src = Lexing.from_channel Stdlib.stdin in
  try
    let boolexpr = Parser.prog Lexer.token src in
    let nandexpr = Bool.compile boolexpr in
    let bytecode = Nand.compile nandexpr in
    Byte.emit bytecode
  with
  | Lexer.Error c ->
     err (Printf.sprintf "unrecognized char '%c'" c) (Lexing.lexeme_start_p src)
  | Parser.Error ->
     err "syntax error" (Lexing.lexeme_start_p src)

## 08_nandvm.c
#include <stdio.h>

int
main (int argc, char *argv[])
{
  char acc = 0, tmp = 0; // registers
  char mem[16384];       // memory
  int sp = 0;            // stack pointer
  char instr;

  while ((instr = getchar()) != EOF) {
    switch (instr) {
    case 'f' /* False */ : acc = 0;             break;
    case 't' /* True  */ : acc = 1;             break;
    case 's' /* Push  */ : mem[sp++] = acc;     break; // to avoid "p" conflict, "s" as in "store"
    case 'l' /* Pop   */ : acc = mem[--sp];     break; // to avoid "p" conflict, "l" as in "load"
    case 'h' /* Hold  */ : tmp = acc;           break;
    case 'n' /* Nand  */ : acc = !(acc && tmp); break;
    }
  }

  printf("result = %d\n", acc);
  return 0;
}
	{
	open Parser
	exception Error of char
	}

	rule token = parse
	\| eof { Leof }
	\| [' ' '\t' '\n'] { token lexbuf }
	\| "0" { Lbool (false) }
	\| "1" { Lbool (true) }
	\| "!" { Lnot }
	\| "&" { Land }
	\| "\|" { Lor }
	\| "^" { Lxor }
	\| "(" { Lopar }
	\| ")" { Lcpar }
	\| _ as c { raise (Error c) }
	%{
	open Ast.Bool
	%}

	%token <bool> Lbool
	%token Lnot Land Lor Lxor Lopar Lcpar Leof

	%left Land Lor Lxor
	%right Lnot

	%start prog

	%type <Ast.Bool.expr> prog

	%%

	prog:
	\| e = expr; Leof { e }
	;

	expr:
	\| b = Lbool { Bool (b) }
	\| Lnot; e = expr { Not (e) }
	\| Lopar; e = expr; Lcpar { e }
	\| a = expr; Land; b = expr { And (a, b) }
	\| a = expr; Lor; b = expr { Or (a, b) }
	\| a = expr; Lxor; b = expr { Xor (a, b) }
	;
	module Bool = struct
	type expr =
	\| Bool of bool
	\| Not of expr
	\| And of expr * expr
	\| Or of expr * expr
	\| Xor of expr * expr
	end

	module Nand = struct
	type expr =
	\| B of bool
	\| N of expr * expr
	end

	module Byte = struct
	type instr =
	\| False
	\| True
	\| Push
	\| Pop
	\| Hold
	\| Nand
	and prog = instr list
	end
	open Ast.Bool
	open Ast.Nand

	(* formulas taken from https://en.wikipedia.org/wiki/NAND_logic *)
	let rec compile = function
	\| Bool b -> B b
	\| Not e -> let ce = compile e in N (ce, ce)
	\| And (a, b) -> let ca, cb = compile a, compile b in
	N (N (ca, cb), N (ca, cb))
	\| Or (a, b) -> let ca, cb = compile a, compile b in
	N (N (ca, ca), N (cb, cb))
	\| Xor (a, b) -> let ca, cb = compile a, compile b in
	N (N (ca, N (ca, cb)), N (cb, N (ca, cb)))

	(* bonus: eval, useless in our compiler except to make some tests *)
	let rec eval = function
	\| Bool b -> b
	\| Not e -> not (eval e)
	\| And (a, b) -> (eval a) && (eval b)
	\| Or (a, b) -> (eval a) \|\| (eval b)
	\| Xor (a, b) -> (eval a) <> (eval b)
	open Ast.Nand
	open Ast.Byte

	let rec compile = function
	\| B false -> [ False ]
	\| B true -> [ True ]
	\| N (a, b) -> (compile a) @ [ Push ] @ (compile b) @ [ Hold ; Pop ; Nand ]

	(* bonus: eval, useless in our compiler except to make some tests *)
	let rec eval = function
	\| B e -> e
	\| N (a, b) -> not ((eval a) && (eval b))
	open Ast.Byte

	let emit prog =
	let bc = List.map (function
	\| False -> 'f'
	\| True -> 't'
	\| Push -> 's'
	\| Pop -> 'l'
	\| Hold -> 'h'
	\| Nand -> 'n')
	prog in
	List.iter print_char bc

	(* bonus: run bytecode (functionnal style vm runtime implementation) *)
	let run code =
	let rec exec prog acc tmp stack =
	match prog with
	\| [] -> acc
	\| False :: p -> exec p false tmp stack
	\| True :: p -> exec p true tmp stack
	\| Push :: p -> exec p acc tmp (acc :: stack)
	\| Pop :: p -> exec p (List.hd stack) tmp (List.tl stack)
	\| Hold :: p -> exec p acc acc stack
	\| Nand :: p -> exec p (not (acc && tmp)) tmp stack
	in exec code false false []
	let err msg pos =
	Lexing.(Printf.eprintf "Error on line %d col %d: %s.\n"
	pos.pos_lnum (pos.pos_cnum - pos.pos_bol) msg) ;
	exit 1

	let () =
	let src = Lexing.from_channel Stdlib.stdin in
	try
	let boolexpr = Parser.prog Lexer.token src in
	let nandexpr = Bool.compile boolexpr in
	let bytecode = Nand.compile nandexpr in
	Byte.emit bytecode
	with
	\| Lexer.Error c ->
	err (Printf.sprintf "unrecognized char '%c'" c) (Lexing.lexeme_start_p src)
	\| Parser.Error ->
	err "syntax error" (Lexing.lexeme_start_p src)
	#include <stdio.h>

	int
	main (int argc, char *argv[])
	{
	char acc = 0, tmp = 0; // registers
	char mem[16384]; // memory
	int sp = 0; // stack pointer
	char instr;

	while ((instr = getchar()) != EOF) {
	switch (instr) {
	case 'f' /* False */ : acc = 0; break;
	case 't' /* True */ : acc = 1; break;
	case 's' /* Push */ : mem[sp++] = acc; break; // to avoid "p" conflict, "s" as in "store"
	case 'l' /* Pop */ : acc = mem[--sp]; break; // to avoid "p" conflict, "l" as in "load"
	case 'h' /* Hold */ : tmp = acc; break;
	case 'n' /* Nand */ : acc = !(acc && tmp); break;
	}
	}

	printf("result = %d\n", acc);
	return 0;
	}