Created
April 25, 2016 06:25
-
-
Save jneen/93eb107ef5ca3293a90f2d2b7077754d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open Batteries.Printf | |
open Batteries | |
type 't tokstate = Start | Tok of 't | Err of string | Eof | |
type 't state = { | |
mutable index : int ; | |
mutable head : 't tokstate ; | |
mutable behind : 't tokstate list ; | |
mutable ahead : 't tokstate list ; | |
mutable bt : int ; | |
read : unit -> 't tokstate ; | |
formatter : 't -> string ; | |
} | |
let new_state read formatter = { | |
index = 0 ; | |
head = Start ; | |
behind = [] ; | |
ahead = [] ; | |
bt = 0 ; | |
read = read ; | |
formatter = formatter ; | |
} | |
let format_tok st = function | |
| Start -> "\\a" | |
| Tok t -> st.formatter t | |
| Err s -> "!" ^ s | |
| Eof -> "\\z" | |
let format_toks st toks = toks |> List.map (format_tok st) |> String.join "" | |
let string_of_state st = | |
let head = format_tok st st.head | |
in let ahead = format_toks st st.ahead | |
in let behind = format_toks st (List.rev st.behind) | |
in sprintf "%i [%s](%s)[%s]" st.index behind head ahead | |
type parse_error = int * string list | |
exception ParseError of parse_error | |
let empty_error = (-1, []) | |
let merge_errors = function | |
| (e, (_, [])) -> e | |
| ((_, []), e) -> e | |
| ((p, m), (p', m')) when p < p' -> (p', m') | |
| ((p, m), (p', m')) when p > p' -> (p, m) | |
| ((p, m), (p', m')) when p = p' -> (p, List.sort_uniq compare (m @ m')) | |
| _ -> empty_error (* unreachable *) | |
type consumed = Consumed | Empty | |
let merge_consumed = function | |
| (Empty, Empty) -> Empty | |
| _ -> Consumed | |
type 'a result = | |
| Ok of 'a * parse_error | |
| Err of parse_error | |
let empty_result = Err empty_error | |
let error_of = function Ok (_, e) -> e | Err e -> e | |
let merge_results = function | |
| (Ok (a, e), r) -> Ok (a, merge_errors (e, error_of r)) | |
| (Err e, r) -> Err (merge_errors (e, error_of r)) | |
type 'a reply = consumed * 'a result | |
let ok (c:consumed) (v:'a) = ((c, Ok (v, empty_error)) : 'a reply) | |
let okc v = ok Consumed v | |
let oke v = ok Empty v | |
let nokc st msg = (Consumed, Err (st.index, msg)) | |
let noke st msg = (Empty, Err (st.index, msg)) | |
let empty_reply = (Empty, Err empty_error) | |
let map_reply f = function | |
| (c, Ok (v, e)) -> (c, Ok (f v, e)) | |
| (c, Err e) -> (c, Err e) | |
let merge_replies ((c, r), (c', r')) = (merge_consumed (c, c'), merge_results (r, r')) | |
let read st = st.read () | |
let advance1 st = | |
(* string_of_state st |> printf "> %s\n"; *) | |
st.index <- st.index + 1; | |
if st.bt > 0 then st.behind <- st.head :: st.behind; | |
(match st.ahead with | |
| (x :: xs) -> st.ahead <- xs; st.head <- x | |
| [] -> st.head <- read st) | |
let rewind1 st = | |
(* st |> string_of_state |> printf "< %s\n"; *) | |
st.index <- st.index - 1; | |
st.ahead <- st.head :: st.ahead; | |
match st.behind with | |
| (x :: xs) -> st.behind <- xs; st.head <- x | |
| [] -> raise (Failure "can't rewind any more!") | |
let rec advance n st = match n with 0 -> () | _ -> advance1 st; advance (n-1) st | |
let rec rewind n st = match n with 0 -> () | _ -> rewind1 st; rewind (n-1) st | |
let lookahead n st = | |
advance n st; rewind n st; (* populate the lookahead buffer *) | |
List.take n st.ahead | |
let rewind_to idx st = rewind (st.index - idx) st | |
let incr_bt st = st.bt <- st.bt + 1 | |
let decr_bt st = st.bt <- st.bt - 1 ; if st.bt = 0 then st.behind <- [] | |
type ('a, 't) parser = { mutable exec : 't state -> 'a reply } | |
let pure x = let rep = oke x in { exec = fun _ -> rep } | |
let fail m = { exec = fun st -> noke st m } | |
let backtracking p = { exec = fun st -> | |
let backtrack_index = st.index | |
in incr_bt st; | |
let reply = p.exec st | |
in match reply with | |
| (_, Err e) -> rewind_to backtrack_index st; decr_bt st; (Empty, Err e) | |
| _ -> decr_bt st; reply; | |
} | |
let satisfy p = | |
{ exec = fun st -> match st.head with | |
| Tok t when p t -> advance1 st; okc t | |
| _ -> noke st [] } | |
let exact toks = { exec = fun st -> | |
let bt_idx = st.index | |
in let consumed = match toks with [] -> Empty | _ -> Consumed | |
in let err_msg = toks |> List.map (fun t -> Tok t) |> format_toks st | |
in let rec check_for toks = match toks with | |
| [] -> ok consumed toks | |
| (t::ts) -> match st.head with | |
| Tok h when t = h -> advance1 st; check_for ts | |
| _ -> rewind_to bt_idx st; noke st [err_msg] | |
in check_for toks | |
} | |
let eq c = | |
{ exec = fun st -> match st.head with | |
| Tok t when c = t -> advance1 st; okc t | |
| _ -> noke st [] } | |
let oneOf l = { exec = fun st -> match st.head with | |
| Tok t when List.mem t l -> advance1 st; okc t | |
| _ -> noke st [] } | |
let eof () = { exec = fun st -> match st.head with | |
| Eof -> oke () | |
| _ -> noke st ["eof"] } | |
let start () = { exec = fun st -> match st.head with | |
| Start -> advance1 st; okc () | |
| _ -> noke st ["start of stream"] } | |
let alt parsers = | |
let exec st = | |
let rec loop = function | |
| (last, (p :: ps)) -> | |
(match p.exec st with | |
| (Empty, Err e) -> loop (merge_results (Err e, last), ps) | |
| (consumed, other) -> (consumed, merge_results (other, last))) | |
| (last, []) -> (Empty, last) | |
in loop (empty_result, parsers) | |
in { exec = exec } | |
(* this can't contain a reply because polymorphic exceptions are illegal *) | |
exception SeqBreak of consumed * parse_error | |
let seq p' p = | |
let exec st = | |
let r = p.exec st | |
in match r with | |
| (c, Ok (v, e)) -> | |
merge_replies (p'.exec st, r) |> map_reply (fun v' -> (v, v')) | |
| (c, Err e) -> (c, Err e) | |
in { exec = exec } | |
let map f p = { exec = fun st -> map_reply f (p.exec st) } | |
let result r p = map (fun _ -> r) p | |
let default x p = alt [ p; pure x ] | |
let opt p = p |> map (fun x -> Some x) |> default None | |
let ignore p = p |> result () | |
let follow p' p = p |> seq p' |> map (fun (_, x) -> x) | |
let skip p' p = p |> seq p' |> map (fun (x, _) -> x) | |
let many p = | |
let exec st = | |
let rec loop last accum = | |
let reply = merge_replies (p.exec st, last) | |
in match reply with | |
| (_, Ok (v, _)) -> loop reply (v :: accum) | |
| _ -> merge_replies (oke (List.rev accum), reply) | |
in loop empty_reply [] | |
in { exec = exec } | |
let many1 p = p |> seq (many p) |> map (fun (x, xs) -> (x :: xs)) | |
let sep_by sep p = | |
let p_sep = sep |> follow p | |
in p |> seq (many p_sep) |> map (fun (x, xs) -> (x :: xs)) | |
let defer f = { exec = fun st -> (Lazy.force f).exec st } | |
let desc s p = { exec = fun st -> match p.exec st with | |
(* replace all messages generated by p *) | |
| (c, Err (i, _)) -> (c, Err (i, [s])) | |
| reply -> reply } | |
exception NotImplemented | |
let declare () = { exec = fun _ -> raise NotImplemented } | |
let implement p p' = p.exec <- p'.exec | |
let parse p ?(formatter = (fun _ -> "")) stream = | |
let with_eof = start () |> follow p |> skip (eof ()) | |
in let st = new_state stream formatter | |
in match with_eof.exec st with | |
| (_, Ok (v, _)) -> v | |
| (_, Err e) -> raise (ParseError e) | |
module Unicode = struct | |
type uchar_token = UChar.t tokstate | |
type 'a uchar_parser = ('a, UChar.t) parser | |
let format c = c |> UTF8.of_char |> UTF8.escaped | |
let string_of_uchars uch = uch |> List.map UChar.char_of | |
|> List.map String.of_char | |
|> String.join "" | |
let str s = exact (String.to_list s |> List.map UChar.of_char) |> result s | |
|> desc (sprintf "\"%s\"" s) | |
let ch c = | |
let uc = UChar.of_char c | |
in let test x = UChar.eq uc x | |
in satisfy test |> result uc |> desc (sprintf "'%s'" (format uc)) | |
let range low high = | |
let ulow = UChar.of_char low | |
in let uhigh = UChar.of_char high | |
in let description = sprintf "'%s'..'%s'" (format ulow) (format uhigh) | |
in let test c = let open UChar in compare c ulow >= 0 && compare c uhigh <= 0 | |
in satisfy test |> desc description | |
let nch c = | |
let uc = UChar.of_char c | |
in satisfy (fun c' -> uc != c') |> desc (sprintf "[^%s]" (format uc)) | |
let parse p stream = parse p ~formatter:format stream | |
let noneOf str = | |
let test c = not (UTF8.contains str c) | |
in satisfy test |> desc (sprintf "none of [%s]" (UTF8.escaped str)) | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment