Skip to content

Instantly share code, notes, and snippets.

@qexat
Created June 13, 2025 14:38
Show Gist options
  • Select an option

  • Save qexat/083956e5a167faf200da248c6d0c75cf to your computer and use it in GitHub Desktop.

Select an option

Save qexat/083956e5a167faf200da248c6d0c75cf to your computer and use it in GitHub Desktop.
some parser combinator draft
module Option = struct
include Option
let or_else : type item. (unit -> item t) -> item t -> item t =
fun rightf left ->
match left with
| None -> rightf ()
| _ -> left
;;
let[@inline] ( or ) : type item. item t -> item t -> item t =
fun left right -> or_else (fun () -> right) left
;;
end
module String = struct
include String
let flat_map (s : string) (f : char -> string) =
s |> to_seq |> Seq.flat_map (fun c -> to_seq (f c)) |> of_seq
;;
let serialize s = "\"" ^ escaped s ^ "\""
end
let ( let+ ) = Option.bind
let ( let- ) : type item. item option -> (unit -> item option) -> item option =
fun left rightf -> Option.or_else rightf left
;;
module type STRING_LIKE = sig
type t
val equal : t -> t -> bool
val length : t -> int
val sub : t -> int -> int -> t
end
(* TODO: rename rule → pattern *)
(* TODO: add range support (e.g. A-Z) *)
module rec Rule : sig
(** This rules. *)
(** ['token t] represents a rule of ['token]s. *)
type 'token t =
| Choice of 'token t * 'token t (** A | B *)
| Maybe of 'token t (** A? *)
| Negation of 'token t (** ¬A *)
| Raw of 'token
| Sequence of 'token t list (** A B C D... *)
| Several of 'token t (** A+ *)
val map : 'token1 'token2. ('token1 -> 'token2) -> 'token1 t -> 'token2 t
val const : 'token1 'token2. 'token1 -> 'token2 t -> 'token1 t
val lift : 'token. 'token -> 'token t
val apply : 'token1 'token2. ('token1 -> 'token2) t -> 'token1 t -> 'token2 t
val map2
: 'token1 'token2 'token3.
('token1 -> 'token2 -> 'token3) -> 'token1 t -> 'token2 t -> 'token3 t
val join : 'token. 'token t t -> 'token t
val bind : 'token1 'token2. 'token1 t -> ('token1 -> 'token2 t) -> 'token2 t
(* stuff *)
val raw : 'token. 'token -> 'token t
val ( #| ) : 'token. 'token t -> 'token t -> 'token t
val ( ~^ ) : 'token. 'token t -> 'token t
val ( ~? ) : 'token. 'token t -> 'token t
val ( ~+ ) : 'token. 'token t -> 'token t
val ( ~* ) : 'token. 'token t -> 'token t
val ( let>>= ) : 'a 'b. 'a t -> ('a -> 'b t) -> 'b t
val serialize : 'token. ('token -> string) -> 'token t -> string
val show : 'token. ('token -> string) -> 'token t -> string
(* parsing *)
val parse
: 'token.
using:(module STRING_LIKE with type t = 'token)
-> 'token
-> 'token t
-> 'token Parsed.t option
end = struct
type 'token t =
| Choice of 'token t * 'token t
| Maybe of 'token t
| Negation of 'token t
| Raw of 'token
| Sequence of 'token t list
| Several of 'token t
let rec map =
fun f -> function
| Choice (left, right) -> Choice (map f left, map f right)
| Maybe rule -> Maybe (map f rule)
| Negation rule -> Negation (map f rule)
| Raw raw -> Raw (f raw)
| Sequence rules -> Sequence (List.map (map f) rules)
| Several rule -> Several (map f rule)
;;
let const = fun token rule -> map (Fun.const token) rule
let rec join = function
| Choice (left, right) -> Choice (join left, join right)
| Maybe rule -> Maybe (join rule)
| Negation rule -> Negation (join rule)
| Raw raw -> raw
| Sequence rules -> Sequence (List.map join rules)
| Several rule -> Several (join rule)
;;
let bind = fun rule f -> join (map f rule)
let map2 = fun f left right -> bind (map f left) ((Fun.flip map) right)
let apply = fun left right -> map2 ( @@ ) left right
let lift = fun raw -> Raw raw
(* stuff *)
let raw = lift
let ( #| ) = fun left right -> Choice (left, right)
let ( ~^ ) = fun rule -> Negation rule
let ( ~? ) = fun rule -> Maybe rule
let ( ~+ ) = fun rule -> Several rule
let ( ~* ) = fun rule -> ~?(+rule)
let ( let>>= ) = bind
let rec serialize f = function
| Choice (left, right) ->
Printf.sprintf "Choice(%s, %s)" (serialize f left) (serialize f right)
| Maybe rule -> Printf.sprintf "Maybe(%s)" (serialize f rule)
| Negation rule -> Printf.sprintf "Negation(%s)" (serialize f rule)
| Raw token -> Printf.sprintf "Raw(%s)" (f token)
| Sequence items ->
Printf.sprintf "Sequence(%s)" (items |> List.map (serialize f) |> String.concat ", ")
| Several rule -> Printf.sprintf "Several(%s)" (serialize f rule)
;;
let rec show f = function
| Choice (left, right) ->
Printf.sprintf "%s \x1b[35m%s\x1b[39m %s" (show f left) "|" (show f right)
| Maybe rule -> Printf.sprintf "(%s)\x1b[35m%s\x1b[39m" (show f rule) "?"
| Negation rule -> Printf.sprintf "\x1b[35m%s\x1b[39m(%s)" "~" (show f rule)
| Raw token -> f token
| Sequence items -> items |> List.map (show f) |> String.concat " "
| Several rule -> Printf.sprintf "(%s)\x1b[35m%s\x1b[39m" (show f rule) "+"
;;
(* parse *)
module Extend_token (T : STRING_LIKE) = struct
include T
(* convenient functions that we can derive from [T.sub] *)
let take ~(n : int) (token : t) : t =
if n >= length token then token else sub token 0 n
;;
let start_from ~index:(k : int) (token : t) : t =
if k <= 0 then token else sub token k (length token - k)
;;
end
let rec parse
: type token.
using:(module STRING_LIKE with type t = token)
-> token
-> token t
-> token Parsed.t option
=
fun ~using raw ->
let module Token = Extend_token ((val using)) in
function
| Choice (left, right) -> parse_choice ~using raw ~left ~right
| Maybe rule -> parse_maybe ~using raw rule
| Negation rule -> parse_negation ~using raw rule
| Raw raw' when Token.equal raw raw' -> Some (Raw raw)
| Raw raw' -> None
| Sequence rules -> parse_sequence ~using raw rules
| Several rule -> parse_several ~using raw rule
and parse_minimal
: type token.
using:(module STRING_LIKE with type t = token)
-> ?n:int
-> token
-> token t
-> (token Parsed.t * int) option
=
let inject_n n = Option.map (fun parsed -> parsed, n) in
fun ~using ?(n = 1) raw rule ->
let module Token = Extend_token ((val using)) in
if n >= Token.length raw
then None
else
let- () = inject_n n @@ parse ~using (Token.take ~n raw) rule in
parse_minimal ~using ~n:(n + 1) raw rule
and parse_maximal
: type token.
using:(module STRING_LIKE with type t = token)
-> k:int
-> token
-> token t
-> (token Parsed.t * int) option
=
let inject_k k = Option.map (fun parsed -> parsed, k) in
fun ~using ~k raw rule ->
let module Token = Extend_token ((val using)) in
if k <= 1
then None
else
let- () = inject_k k @@ parse ~using (Token.take ~n:k raw) rule in
parse_maximal ~using ~k:(k - 1) raw rule
and parse_as_many
: type token.
using:(module STRING_LIKE with type t = token)
-> k:int
-> token Parsed.t list
-> token
-> token t
-> token Parsed.t list * int
=
fun ~using ~k acc raw rule ->
let module Token = Extend_token ((val using)) in
if k = Token.length raw
then List.rev acc, k
else (
(* if something is wrong here, then replace parse_minimal with parse_maximal *)
match parse_minimal ~using ~n:k (Token.start_from ~index:k raw) rule with
| None -> List.rev acc, k
| Some (parsed, k) -> parse_as_many ~using ~k (parsed :: acc) raw rule)
and parse_sequence_list
: type token.
using:(module STRING_LIKE with type t = token)
-> ?acc:token Parsed.t list
-> token
-> token t list
-> token Parsed.t list option
=
fun ~using ?(acc = []) raw ->
let module Token = Extend_token ((val using)) in
function
| [] -> Some (List.rev acc)
| first :: rest ->
let+ first', k = parse_maximal ~using ~k:(Token.length raw) raw first in
parse_sequence_list ~using ~acc:(first' :: acc) (Token.start_from ~index:k raw) rest
and parse_choice
: type token.
using:(module STRING_LIKE with type t = token)
-> token
-> left:token t
-> right:token t
-> token Parsed.t option
=
fun ~using raw ~left ~right ->
let- () = Option.map Parsed.left (parse ~using raw left) in
Option.map Parsed.right (parse ~using raw right)
and parse_maybe
: type token.
using:(module STRING_LIKE with type t = token)
-> token
-> token t
-> token Parsed.t option
=
fun ~using raw rule -> Some (Maybe (parse ~using raw rule))
and parse_negation
: type token.
using:(module STRING_LIKE with type t = token)
-> token
-> token t
-> token Parsed.t option
=
fun ~using raw rule ->
match parse ~using raw rule with
| None -> Some (Negation raw)
| Some _ -> None
and parse_sequence
: type token.
using:(module STRING_LIKE with type t = token)
-> token
-> token t list
-> token Parsed.t option
=
fun ~using raw rules ->
parse_sequence_list ~using raw rules |> Option.map Parsed.sequence
and parse_several
: type token.
using:(module STRING_LIKE with type t = token)
-> token
-> token t
-> token Parsed.t option
=
fun ~using raw rule ->
let module Token = Extend_token ((val using)) in
let+ first, k = parse_maximal ~using ~k:(Token.length raw) raw rule in
let rest, _ = parse_as_many ~using ~k [ first ] raw rule in
Some (Parsed.Several rest)
;;
end
(* the converse of patterns *)
and Parsed : sig
type 'token t =
| Choice of ('token t, 'token t) Either.t (** ↑ We matched either left or right. *)
| Maybe of 'token t option (** We may have matched. *)
| Negation of 'token (** None if we matched and what we got if we didn't. *)
| Raw of 'token
| Sequence of 'token t list
| Several of 'token t list (** The list of what we matched. *)
val left : 'token. 'token t -> 'token t
val right : 'token. 'token t -> 'token t
val sequence : 'token. 'token t list -> 'token t
val serialize : 'token. ('token -> string) -> 'token t -> string
val to_lexeme : 'token. ('token -> string) -> 'token t -> string
end = struct
type 'token t =
| Choice of ('token t, 'token t) Either.t
| Maybe of 'token t option
| Negation of 'token
| Raw of 'token
| Sequence of 'token t list
| Several of 'token t list
let left parsed = Choice (Left parsed)
let right parsed = Choice (Right parsed)
let sequence : type token. token t list -> token t = fun items -> Sequence items
let rec serialize f = function
| Choice (Left parsed) -> Printf.sprintf "Choice(Left(%s))" (serialize f parsed)
| Choice (Right parsed) -> Printf.sprintf "Choice(Right(%s))" (serialize f parsed)
| Maybe None -> Printf.sprintf "Maybe(None)"
| Maybe (Some parsed) -> Printf.sprintf "Maybe(Some(%s))" (serialize f parsed)
| Negation token -> Printf.sprintf "Negation(%s)" (f token)
| Raw token -> Printf.sprintf "Raw(%s)" (f token)
| Sequence items ->
Printf.sprintf "Sequence(%s)" (items |> List.map (serialize f) |> String.concat ", ")
| Several [] -> failwith "unreachable"
| Several (first :: rest) ->
Printf.sprintf "Several(%s, %d)" (serialize f first) (1 + List.length rest)
;;
let rec to_lexeme f parsed =
let rec inner f = function
| Choice (Left parsed) -> inner f parsed
| Choice (Right parsed) -> inner f parsed
| Maybe None -> ""
| Maybe (Some parsed) -> inner f parsed
| Negation token -> f token
| Raw token -> f token
| Sequence items -> items |> List.map (inner f) |> String.concat ""
| Several [] -> failwith "unreachable"
| Several items -> items |> List.map (inner f) |> String.concat ""
in
inner f parsed
|> Fun.flip String.flat_map (function
| '"' -> "\\\""
| c -> String.make 1 c)
|> Printf.sprintf "\"%s\""
;;
end
open Rule
let ps s r =
Printf.printf "\x1b[1mrule:\x1b[22m %s\n" (Rule.show String.serialize r);
match parse ~using:(module String) s r with
| None -> Printf.printf "no match\n"
| Some parsed ->
Printf.printf
"\x1b[1mmatch:\x1b[22m %s\n\x1b[1mlexeme:\x1b[22m \x1b[32m%s\x1b[39m\n"
(Parsed.serialize String.serialize parsed)
(Parsed.to_lexeme (fun x -> x) parsed)
;;
let comment = Sequence [ raw ";;"; ~*(~^(raw "\n")) ]
let string = ";; hello world!"
let () = ps string comment
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment