Last active
July 7, 2023 02:19
-
-
Save Guest0x0/844688233e1ea27b0a2307734271644d to your computer and use it in GitHub Desktop.
a minimal pattern matching compiler demo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* a minimal pattern-matching compiler *) | |
(* use integer as variable for easy generation *) | |
type var = int | |
type typename = string | |
type constructor = string | |
type typ = | |
| Cons of typename * (constructor * typ list) list | |
type pattern = | |
(* variable binding is not handled for simplicity *) | |
| PAny | |
| PCon of constructor * pattern list | |
(* for simplicity, only to distinguish different match body *) | |
type match_body = int | |
let seed = ref 0 | |
let fresh () = | |
incr seed; !seed | |
(* the output of compilation. assume we are initially matching variable $0 *) | |
type decision_tree = | |
| Fail | |
| Leaf of match_body | |
(* the variable to match + all cases. | |
every case contains: | |
- the constructor | |
- a list of variables that bind the data of the constructor | |
- a sub tree for remaining job *) | |
| Branch of typename * var * (constructor * var list * decision_tree) list | |
type warning = | |
| Unreachable of match_body | |
(* the pattern describes what values are unmatched *) | |
| Unmatched of pattern | |
let warnings = ref [] | |
(* assume we found an unmatched constructor somewhere, | |
when reporting error, we want to reconstruct a complete value (w.r.t. the initial match expr). | |
[context] is used for this: it tells where we are in the whole match now. | |
A context is a pattern with a hole. | |
One can fill a pattern into the hole to obtain a complete pattern *) | |
type match_context = | |
| Outermost | |
| In_cons of { | |
parent : match_context; | |
constr : constructor; | |
before : pattern list; (* in reverse order *) | |
(* the "hole" is here *) | |
after : pattern list (* in normal order *) | |
} | |
(* fill a context with a pattern *) | |
let rec fill_context (ctx : match_context) (pat : pattern) : pattern = | |
match ctx with | |
| Outermost -> pat | |
| In_cons { parent; constr; before; after } -> | |
fill_context parent (PCon(constr, List.rev_append before (pat :: after))) | |
(* shift the hole in the context to the "next" position, | |
with [pat] filled into the original hole *) | |
let rec next_hole (ctx : match_context) (pat : pattern) : match_context = | |
match ctx with | |
| Outermost -> failwith "next_hole" | |
| In_cons { parent; constr; before; after } -> | |
match after with | |
| [] -> | |
next_hole parent (PCon(constr, List.rev_append before [pat])) | |
| _ :: rest -> | |
In_cons { parent; constr; before = pat :: before; after = rest } | |
(* When we are compiling something like: | |
match p with | |
| _ , true -> ... | |
| false, _ -> ... | |
| ... | |
there are multiple values to match, | |
and the result of matching the first value | |
will affect the result of matching the second. | |
So, compilation requires matching *multiple* values simutaneously. | |
We know that in each match arm, | |
the number of patterns should match the number of values matched. | |
Hence this kind of multiple-match problem would have the shape of a matrix: | |
match x1, x2, ..., xn with | |
| p_1_1, p_1_2, ..., p_1_n -> body_1 | |
| p_2_1, p_2_2, ..., p_2_n -> body_2 | |
| ... | |
| p_m_1, p_m_2, ..., p_m_n -> body_m *) | |
(* Here for simplicity, we use a list of match arms to represent the problem, | |
and leave the invariant that the number of patterns in eacr arm | |
is equal to the number of values implicit. *) | |
type match_arm = | |
{ pats : pattern list | |
; body : match_body } | |
let delete_first_pattern arm = | |
match arm.pats with | |
| PAny :: pats' -> { arm with pats = pats' } | |
| _ -> failwith "impossible" | |
let rec compile_aux | |
(* the values to match *) | |
(heads : (var * typ) list) | |
(* the arms of the match. | |
invariant: for any [arm] in [arms], [List.length heads = List.length arm.pats] *) | |
(arms : match_arm list) | |
(* a hash set used to track which branches are reachable *) | |
~(reachable : (match_body, unit) Hashtbl.t) | |
~(context : match_context) | |
: decision_tree = | |
match heads with | |
| [] -> | |
(* there is nothing left to match, | |
and we should receive the correct match body to execute *) | |
begin match arms with | |
| { pats = []; body } :: _ -> | |
Hashtbl.add reachable body (); | |
Leaf body | |
| _ -> | |
(* match failures should have been resolved before this stage *) | |
failwith "impossible" | |
end | |
| (var, typ) :: heads' -> | |
(* whether it is necessary to match this value *) | |
let is_necessary = | |
List.exists | |
(function { pats = (PCon _) :: _; _ } -> true | _ -> false) | |
arms | |
in | |
if not is_necessary | |
then | |
(* there's no need to match this value, simply move to the next position *) | |
compile_aux ~reachable ~context:(next_hole context PAny) | |
heads' (List.map delete_first_pattern arms) | |
else | |
let remaining_arms_for_constr constr item_typs = | |
arms |> List.filter_map @@ fun arm -> | |
match arm.pats with | |
| [] -> failwith "impossible" | |
| PAny :: pats' -> | |
(* maintain the correct number of patterns by filling [PAny] *) | |
Some { arm with pats = List.map (fun _ -> PAny) item_typs @ pats' } | |
| PCon(constr', item_pats) :: pats' -> | |
if constr' = constr | |
(* if the constructor matches, continue to match sub-patterns *) | |
then Some { arm with pats = item_pats @ pats' } | |
else None | |
in | |
let (Cons(typename, constrs)) = typ in | |
let decision_tree_branches = constrs |> List.map @@ fun (constr, item_typs) -> | |
(* assign a variable to each item of the constructor *) | |
let new_heads = List.map (fun typ -> fresh (), typ) item_typs in | |
let remaining_arms = remaining_arms_for_constr constr item_typs in | |
let subtree = | |
match remaining_arms with | |
| [] -> | |
(* match failure *) | |
let unmatched = fill_context context (PCon(constr, List.map (fun _ -> PAny) item_typs)) in | |
warnings := (Unmatched unmatched) :: !warnings; | |
Fail | |
| _ -> | |
let context' = | |
match new_heads with | |
| [] -> | |
(* the next hole is not inside this constructor. | |
find the next hole in the current context *) | |
if heads' = [] | |
then context | |
else next_hole context (PCon(constr, [])) | |
| _ :: rest -> | |
(* the next hole is inside this constructor. | |
initialize the items with [PAny] *) | |
In_cons { parent = context; constr; before = []; after = List.map (fun _ -> PAny) rest } | |
in | |
(* when the constructor matches, | |
the next thing to do is to match sub-patterns under the constructor. | |
So we assign a variable name to each item of the constructor, | |
and add these variables as new heads (aka. matched values) *) | |
compile_aux ~reachable ~context:context' (new_heads @ heads') remaining_arms | |
in | |
(constr, List.map fst new_heads, subtree) | |
in | |
Branch(typename, var, decision_tree_branches) | |
let compile ((typ, arms) : typ * (pattern * match_body) list) : decision_tree * warning list = | |
let reachable = Hashtbl.create 10 in | |
warnings := []; | |
let tree = compile_aux ~reachable ~context:Outermost | |
[0, typ] | |
(List.map (fun (pat, body) -> { pats = [pat]; body }) arms) | |
in | |
let unreachable = arms |> List.filter_map @@ fun (_, body) -> | |
if not (Hashtbl.mem reachable body) | |
then Some (Unreachable body) | |
else None | |
in | |
(tree, unreachable @ !warnings) | |
let bool : typ = Cons("bool", ["True", []; "False", []]) | |
let pair x y : typ = Cons("pair", ["Pair", [x; y]]) | |
let option x : typ = Cons("option", ["None", []; "Some", [x]]) | |
let either x y : typ = Cons("either", ["Left", [x]; "Right", [y]]) | |
let pAny = PAny | |
let pTrue = PCon("True", []) | |
let pFalse = PCon("False", []) | |
let pPair p1 p2 = PCon("Pair", [p1; p2]) | |
let pNone = PCon("None", []) | |
let pSome p = PCon("Some", [p]) | |
let pLeft p = PCon("Left", [p]) | |
let pRight p = PCon("Right", [p]) | |
let (@=>) pat body = (pat, body) | |
let ex1 = | |
( bool | |
, [ pTrue @=> 1 | |
; pFalse @=> 2 ]) | |
let ex2 = | |
( bool | |
, [ pTrue @=> 1 | |
; pTrue @=> 2 ]) | |
let ex3 = | |
( pair bool bool | |
, [ pPair pTrue pTrue @=> 1 | |
; pPair pTrue pFalse @=> 2 | |
; pPair pFalse pTrue @=> 3 | |
; pPair pFalse pFalse @=> 4 ]) | |
let ex4 = | |
( pair bool bool | |
, [ pPair pAny pTrue @=> 1 | |
; pPair pTrue pAny @=> 2 | |
; pPair pFalse pFalse @=> 3 ]) | |
let ex5 = | |
( pair bool bool | |
, [ pPair pAny pTrue @=> 1 | |
; pPair pTrue pAny @=> 2 ]) | |
let ex6 = | |
( pair bool (pair bool bool) | |
, [ pPair pTrue (pPair pTrue pFalse) @=> 1 | |
; pPair pAny (pPair pAny pFalse) @=> 2 | |
; pPair pTrue (pPair pAny pTrue ) @=> 3 | |
; pPair pFalse (pPair pTrue pAny ) @=> 4 ]) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment