Created
June 13, 2025 14:38
-
-
Save qexat/083956e5a167faf200da248c6d0c75cf to your computer and use it in GitHub Desktop.
some parser combinator draft
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| module Option = struct | |
| include Option | |
| let or_else : type item. (unit -> item t) -> item t -> item t = | |
| fun rightf left -> | |
| match left with | |
| | None -> rightf () | |
| | _ -> left | |
| ;; | |
| let[@inline] ( or ) : type item. item t -> item t -> item t = | |
| fun left right -> or_else (fun () -> right) left | |
| ;; | |
| end | |
| module String = struct | |
| include String | |
| let flat_map (s : string) (f : char -> string) = | |
| s |> to_seq |> Seq.flat_map (fun c -> to_seq (f c)) |> of_seq | |
| ;; | |
| let serialize s = "\"" ^ escaped s ^ "\"" | |
| end | |
| let ( let+ ) = Option.bind | |
| let ( let- ) : type item. item option -> (unit -> item option) -> item option = | |
| fun left rightf -> Option.or_else rightf left | |
| ;; | |
| module type STRING_LIKE = sig | |
| type t | |
| val equal : t -> t -> bool | |
| val length : t -> int | |
| val sub : t -> int -> int -> t | |
| end | |
| (* TODO: rename rule → pattern *) | |
| (* TODO: add range support (e.g. A-Z) *) | |
| module rec Rule : sig | |
| (** This rules. *) | |
| (** ['token t] represents a rule of ['token]s. *) | |
| type 'token t = | |
| | Choice of 'token t * 'token t (** A | B *) | |
| | Maybe of 'token t (** A? *) | |
| | Negation of 'token t (** ¬A *) | |
| | Raw of 'token | |
| | Sequence of 'token t list (** A B C D... *) | |
| | Several of 'token t (** A+ *) | |
| val map : 'token1 'token2. ('token1 -> 'token2) -> 'token1 t -> 'token2 t | |
| val const : 'token1 'token2. 'token1 -> 'token2 t -> 'token1 t | |
| val lift : 'token. 'token -> 'token t | |
| val apply : 'token1 'token2. ('token1 -> 'token2) t -> 'token1 t -> 'token2 t | |
| val map2 | |
| : 'token1 'token2 'token3. | |
| ('token1 -> 'token2 -> 'token3) -> 'token1 t -> 'token2 t -> 'token3 t | |
| val join : 'token. 'token t t -> 'token t | |
| val bind : 'token1 'token2. 'token1 t -> ('token1 -> 'token2 t) -> 'token2 t | |
| (* stuff *) | |
| val raw : 'token. 'token -> 'token t | |
| val ( #| ) : 'token. 'token t -> 'token t -> 'token t | |
| val ( ~^ ) : 'token. 'token t -> 'token t | |
| val ( ~? ) : 'token. 'token t -> 'token t | |
| val ( ~+ ) : 'token. 'token t -> 'token t | |
| val ( ~* ) : 'token. 'token t -> 'token t | |
| val ( let>>= ) : 'a 'b. 'a t -> ('a -> 'b t) -> 'b t | |
| val serialize : 'token. ('token -> string) -> 'token t -> string | |
| val show : 'token. ('token -> string) -> 'token t -> string | |
| (* parsing *) | |
| val parse | |
| : 'token. | |
| using:(module STRING_LIKE with type t = 'token) | |
| -> 'token | |
| -> 'token t | |
| -> 'token Parsed.t option | |
| end = struct | |
| type 'token t = | |
| | Choice of 'token t * 'token t | |
| | Maybe of 'token t | |
| | Negation of 'token t | |
| | Raw of 'token | |
| | Sequence of 'token t list | |
| | Several of 'token t | |
| let rec map = | |
| fun f -> function | |
| | Choice (left, right) -> Choice (map f left, map f right) | |
| | Maybe rule -> Maybe (map f rule) | |
| | Negation rule -> Negation (map f rule) | |
| | Raw raw -> Raw (f raw) | |
| | Sequence rules -> Sequence (List.map (map f) rules) | |
| | Several rule -> Several (map f rule) | |
| ;; | |
| let const = fun token rule -> map (Fun.const token) rule | |
| let rec join = function | |
| | Choice (left, right) -> Choice (join left, join right) | |
| | Maybe rule -> Maybe (join rule) | |
| | Negation rule -> Negation (join rule) | |
| | Raw raw -> raw | |
| | Sequence rules -> Sequence (List.map join rules) | |
| | Several rule -> Several (join rule) | |
| ;; | |
| let bind = fun rule f -> join (map f rule) | |
| let map2 = fun f left right -> bind (map f left) ((Fun.flip map) right) | |
| let apply = fun left right -> map2 ( @@ ) left right | |
| let lift = fun raw -> Raw raw | |
| (* stuff *) | |
| let raw = lift | |
| let ( #| ) = fun left right -> Choice (left, right) | |
| let ( ~^ ) = fun rule -> Negation rule | |
| let ( ~? ) = fun rule -> Maybe rule | |
| let ( ~+ ) = fun rule -> Several rule | |
| let ( ~* ) = fun rule -> ~?(+rule) | |
| let ( let>>= ) = bind | |
| let rec serialize f = function | |
| | Choice (left, right) -> | |
| Printf.sprintf "Choice(%s, %s)" (serialize f left) (serialize f right) | |
| | Maybe rule -> Printf.sprintf "Maybe(%s)" (serialize f rule) | |
| | Negation rule -> Printf.sprintf "Negation(%s)" (serialize f rule) | |
| | Raw token -> Printf.sprintf "Raw(%s)" (f token) | |
| | Sequence items -> | |
| Printf.sprintf "Sequence(%s)" (items |> List.map (serialize f) |> String.concat ", ") | |
| | Several rule -> Printf.sprintf "Several(%s)" (serialize f rule) | |
| ;; | |
| let rec show f = function | |
| | Choice (left, right) -> | |
| Printf.sprintf "%s \x1b[35m%s\x1b[39m %s" (show f left) "|" (show f right) | |
| | Maybe rule -> Printf.sprintf "(%s)\x1b[35m%s\x1b[39m" (show f rule) "?" | |
| | Negation rule -> Printf.sprintf "\x1b[35m%s\x1b[39m(%s)" "~" (show f rule) | |
| | Raw token -> f token | |
| | Sequence items -> items |> List.map (show f) |> String.concat " " | |
| | Several rule -> Printf.sprintf "(%s)\x1b[35m%s\x1b[39m" (show f rule) "+" | |
| ;; | |
| (* parse *) | |
| module Extend_token (T : STRING_LIKE) = struct | |
| include T | |
| (* convenient functions that we can derive from [T.sub] *) | |
| let take ~(n : int) (token : t) : t = | |
| if n >= length token then token else sub token 0 n | |
| ;; | |
| let start_from ~index:(k : int) (token : t) : t = | |
| if k <= 0 then token else sub token k (length token - k) | |
| ;; | |
| end | |
| let rec parse | |
| : type token. | |
| using:(module STRING_LIKE with type t = token) | |
| -> token | |
| -> token t | |
| -> token Parsed.t option | |
| = | |
| fun ~using raw -> | |
| let module Token = Extend_token ((val using)) in | |
| function | |
| | Choice (left, right) -> parse_choice ~using raw ~left ~right | |
| | Maybe rule -> parse_maybe ~using raw rule | |
| | Negation rule -> parse_negation ~using raw rule | |
| | Raw raw' when Token.equal raw raw' -> Some (Raw raw) | |
| | Raw raw' -> None | |
| | Sequence rules -> parse_sequence ~using raw rules | |
| | Several rule -> parse_several ~using raw rule | |
| and parse_minimal | |
| : type token. | |
| using:(module STRING_LIKE with type t = token) | |
| -> ?n:int | |
| -> token | |
| -> token t | |
| -> (token Parsed.t * int) option | |
| = | |
| let inject_n n = Option.map (fun parsed -> parsed, n) in | |
| fun ~using ?(n = 1) raw rule -> | |
| let module Token = Extend_token ((val using)) in | |
| if n >= Token.length raw | |
| then None | |
| else | |
| let- () = inject_n n @@ parse ~using (Token.take ~n raw) rule in | |
| parse_minimal ~using ~n:(n + 1) raw rule | |
| and parse_maximal | |
| : type token. | |
| using:(module STRING_LIKE with type t = token) | |
| -> k:int | |
| -> token | |
| -> token t | |
| -> (token Parsed.t * int) option | |
| = | |
| let inject_k k = Option.map (fun parsed -> parsed, k) in | |
| fun ~using ~k raw rule -> | |
| let module Token = Extend_token ((val using)) in | |
| if k <= 1 | |
| then None | |
| else | |
| let- () = inject_k k @@ parse ~using (Token.take ~n:k raw) rule in | |
| parse_maximal ~using ~k:(k - 1) raw rule | |
| and parse_as_many | |
| : type token. | |
| using:(module STRING_LIKE with type t = token) | |
| -> k:int | |
| -> token Parsed.t list | |
| -> token | |
| -> token t | |
| -> token Parsed.t list * int | |
| = | |
| fun ~using ~k acc raw rule -> | |
| let module Token = Extend_token ((val using)) in | |
| if k = Token.length raw | |
| then List.rev acc, k | |
| else ( | |
| (* if something is wrong here, then replace parse_minimal with parse_maximal *) | |
| match parse_minimal ~using ~n:k (Token.start_from ~index:k raw) rule with | |
| | None -> List.rev acc, k | |
| | Some (parsed, k) -> parse_as_many ~using ~k (parsed :: acc) raw rule) | |
| and parse_sequence_list | |
| : type token. | |
| using:(module STRING_LIKE with type t = token) | |
| -> ?acc:token Parsed.t list | |
| -> token | |
| -> token t list | |
| -> token Parsed.t list option | |
| = | |
| fun ~using ?(acc = []) raw -> | |
| let module Token = Extend_token ((val using)) in | |
| function | |
| | [] -> Some (List.rev acc) | |
| | first :: rest -> | |
| let+ first', k = parse_maximal ~using ~k:(Token.length raw) raw first in | |
| parse_sequence_list ~using ~acc:(first' :: acc) (Token.start_from ~index:k raw) rest | |
| and parse_choice | |
| : type token. | |
| using:(module STRING_LIKE with type t = token) | |
| -> token | |
| -> left:token t | |
| -> right:token t | |
| -> token Parsed.t option | |
| = | |
| fun ~using raw ~left ~right -> | |
| let- () = Option.map Parsed.left (parse ~using raw left) in | |
| Option.map Parsed.right (parse ~using raw right) | |
| and parse_maybe | |
| : type token. | |
| using:(module STRING_LIKE with type t = token) | |
| -> token | |
| -> token t | |
| -> token Parsed.t option | |
| = | |
| fun ~using raw rule -> Some (Maybe (parse ~using raw rule)) | |
| and parse_negation | |
| : type token. | |
| using:(module STRING_LIKE with type t = token) | |
| -> token | |
| -> token t | |
| -> token Parsed.t option | |
| = | |
| fun ~using raw rule -> | |
| match parse ~using raw rule with | |
| | None -> Some (Negation raw) | |
| | Some _ -> None | |
| and parse_sequence | |
| : type token. | |
| using:(module STRING_LIKE with type t = token) | |
| -> token | |
| -> token t list | |
| -> token Parsed.t option | |
| = | |
| fun ~using raw rules -> | |
| parse_sequence_list ~using raw rules |> Option.map Parsed.sequence | |
| and parse_several | |
| : type token. | |
| using:(module STRING_LIKE with type t = token) | |
| -> token | |
| -> token t | |
| -> token Parsed.t option | |
| = | |
| fun ~using raw rule -> | |
| let module Token = Extend_token ((val using)) in | |
| let+ first, k = parse_maximal ~using ~k:(Token.length raw) raw rule in | |
| let rest, _ = parse_as_many ~using ~k [ first ] raw rule in | |
| Some (Parsed.Several rest) | |
| ;; | |
| end | |
| (* the converse of patterns *) | |
| and Parsed : sig | |
| type 'token t = | |
| | Choice of ('token t, 'token t) Either.t (** ↑ We matched either left or right. *) | |
| | Maybe of 'token t option (** We may have matched. *) | |
| | Negation of 'token (** None if we matched and what we got if we didn't. *) | |
| | Raw of 'token | |
| | Sequence of 'token t list | |
| | Several of 'token t list (** The list of what we matched. *) | |
| val left : 'token. 'token t -> 'token t | |
| val right : 'token. 'token t -> 'token t | |
| val sequence : 'token. 'token t list -> 'token t | |
| val serialize : 'token. ('token -> string) -> 'token t -> string | |
| val to_lexeme : 'token. ('token -> string) -> 'token t -> string | |
| end = struct | |
| type 'token t = | |
| | Choice of ('token t, 'token t) Either.t | |
| | Maybe of 'token t option | |
| | Negation of 'token | |
| | Raw of 'token | |
| | Sequence of 'token t list | |
| | Several of 'token t list | |
| let left parsed = Choice (Left parsed) | |
| let right parsed = Choice (Right parsed) | |
| let sequence : type token. token t list -> token t = fun items -> Sequence items | |
| let rec serialize f = function | |
| | Choice (Left parsed) -> Printf.sprintf "Choice(Left(%s))" (serialize f parsed) | |
| | Choice (Right parsed) -> Printf.sprintf "Choice(Right(%s))" (serialize f parsed) | |
| | Maybe None -> Printf.sprintf "Maybe(None)" | |
| | Maybe (Some parsed) -> Printf.sprintf "Maybe(Some(%s))" (serialize f parsed) | |
| | Negation token -> Printf.sprintf "Negation(%s)" (f token) | |
| | Raw token -> Printf.sprintf "Raw(%s)" (f token) | |
| | Sequence items -> | |
| Printf.sprintf "Sequence(%s)" (items |> List.map (serialize f) |> String.concat ", ") | |
| | Several [] -> failwith "unreachable" | |
| | Several (first :: rest) -> | |
| Printf.sprintf "Several(%s, %d)" (serialize f first) (1 + List.length rest) | |
| ;; | |
| let rec to_lexeme f parsed = | |
| let rec inner f = function | |
| | Choice (Left parsed) -> inner f parsed | |
| | Choice (Right parsed) -> inner f parsed | |
| | Maybe None -> "" | |
| | Maybe (Some parsed) -> inner f parsed | |
| | Negation token -> f token | |
| | Raw token -> f token | |
| | Sequence items -> items |> List.map (inner f) |> String.concat "" | |
| | Several [] -> failwith "unreachable" | |
| | Several items -> items |> List.map (inner f) |> String.concat "" | |
| in | |
| inner f parsed | |
| |> Fun.flip String.flat_map (function | |
| | '"' -> "\\\"" | |
| | c -> String.make 1 c) | |
| |> Printf.sprintf "\"%s\"" | |
| ;; | |
| end | |
| open Rule | |
| let ps s r = | |
| Printf.printf "\x1b[1mrule:\x1b[22m %s\n" (Rule.show String.serialize r); | |
| match parse ~using:(module String) s r with | |
| | None -> Printf.printf "no match\n" | |
| | Some parsed -> | |
| Printf.printf | |
| "\x1b[1mmatch:\x1b[22m %s\n\x1b[1mlexeme:\x1b[22m \x1b[32m%s\x1b[39m\n" | |
| (Parsed.serialize String.serialize parsed) | |
| (Parsed.to_lexeme (fun x -> x) parsed) | |
| ;; | |
| let comment = Sequence [ raw ";;"; ~*(~^(raw "\n")) ] | |
| let string = ";; hello world!" | |
| let () = ps string comment |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment