@ryyppy · Created August 18, 2021 09:17
Ingredient Parser example
/* Mini parser inspired by:
https://troydm.github.io/blog/2014/03/29/writing-micro-compiler-in-ocaml/
*/
exception Syntax_Error(string)
/* mostly for shopping list related features */
type modifier = WithoutQuantity /* Don't show quantities in the shopping list; [OE] = [Ohne Einheit], German for "without unit" */
/* data type parsed from RichText */
type ingredient = {
  quantity: option<float>,
  unit_: option<string>,
  description: string,
  modifiers: array<modifier>,
  comment: option<string>,
  /* Original content */
  raw: string,
}

type error = {
  raw: string,
  msg: string,
}
type result = Belt.Result.t<ingredient, error>
/* Single line string Stream emulation.
   Only tracks the position inside the input string
   and returns the character at the given position accordingly */
module Stream = {
  type t = {
    pos: ref<int>,
    str: string,
  }

  @bs.val external fromCharCode: char => string = "String.fromCharCode"

  exception End_of_String

  let fromString = str => {pos: ref(0), str: str}

  let toCharCode = (c: string) => Js.String.charCodeAt(0, c)

  /* Returns the character at the current position and advances the stream;
     raises End_of_String when the input is exhausted */
  let readChar = (stm: t): string => {
    let str = stm.str
    let pos = stm.pos.contents
    if pos >= Js.String.length(str) {
      raise(End_of_String)
    } else {
      stm.pos := pos + 1
      Js.String.get(str, pos)
    }
  }

  let atEnd = (stm: t): bool => stm.pos.contents >= Js.String.length(stm.str)

  /* Steps the position back by one, so the last character can be read again */
  let unreadChar = (stm: t) => {
    let pos = stm.pos.contents
    if pos > 0 {
      stm.pos := pos - 1
    } else {
      ()
    }
  }

  /* Skips spaces, tabs and non-breaking spaces (char code 160) */
  let rec skipBlankChars = (stm: t) =>
    if atEnd(stm) {
      ()
    } else {
      let c = readChar(stm)
      if c === " " || (c === "\t" || toCharCode(c) === 160.) {
        skipBlankChars(stm)
      } else {
        unreadChar(stm)
      }
    }
}
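
/* A minimal usage sketch of Stream (illustrative only, not part of the original gist):
   read characters one by one and push one back. Wrapped in a function so it
   doesn't run as a side effect when the module is loaded. */
let _streamUsageSketch = () => {
  let stm = Stream.fromString("ab")
  let a = Stream.readChar(stm) /* "a", position advances to 1 */
  Stream.unreadChar(stm) /* position goes back to 0 */
  let aAgain = Stream.readChar(stm) /* "a" again */
  let b = Stream.readChar(stm) /* "b"; one more readChar would raise End_of_String */
  (a, aAgain, b)
}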
module Lexer = {
  /*
   Examples on what to parse:
   [OE]{1 großer Löffel} gesalzene Erdnüsse | kleingehackt
   {500 g} Spiralnudeln
   [OE]{1 Hand} Mungbohnensprossen
   {4} Eier | nur Dotter benötigt
   */
  type token =
    | LeftCurly
    | RightCurly
    | LeftBracket
    | RightBracket
    | Comma
    | NumberLiteral(float)
    | Identifier(string) /* modifier, unit, ingredient, comment text */
    | Pipe
    | End /* Extra token for signalling the end */

  let string_of_token = token =>
    switch token {
    | LeftCurly => "{"
    | RightCurly => "}"
    | LeftBracket => "["
    | RightBracket => "]"
    | Comma => ","
    | NumberLiteral(value) => "NumberLiteral: " ++ Js.Float.toString(value)
    | Identifier(value) => "Identifier: " ++ value
    | Pipe => "|"
    | End => "End"
    }

  let toCharCode = Stream.toCharCode
  let isDigit = (c: string) => {
    let code = toCharCode(c)
    code >= toCharCode("0") && code <= toCharCode("9")
  }

  /* Latin letters, German/French accented characters and a handful of
     punctuation characters that may appear inside ingredient text */
  let isIdentChar = (c: string) => {
    let code = toCharCode(c)
    let isLatinLetter =
      (code >= toCharCode("A") && code <= toCharCode("Z")) ||
      (code >= toCharCode("a") && code <= toCharCode("z"))
    isLatinLetter || Js.String.includes(c, j`üÜÖöäÄßîèà/&;-.(:?!)`)
  }
  type t = {
    lastToken: ref<option<token>>,
    stm: Stream.t,
  }

  let syntaxError = (~lexer: t, ~pos: option<int>=?, msg: string) => {
    let pos = Belt.Option.getWithDefault(pos, lexer.stm.pos.contents)
    raise(Syntax_Error(msg ++ (" on pos " ++ pos->string_of_int)))
  }
  let scanExn = (s: t) => {
    let stm = s.stm
    let c = Stream.readChar(stm)

    /* Collects identifier characters until a non-identifier character is hit */
    let rec scanIdent = acc =>
      if Stream.atEnd(stm) {
        Identifier(acc)
      } else {
        let nc = Stream.readChar(stm)
        if isIdentChar(nc) {
          scanIdent(acc ++ nc)
        } else {
          stm->Stream.unreadChar
          Identifier(acc)
        }
      }

    /* inFraction = are we currently parsing the fraction after the dot */
    let rec scanNumberLiteral = (~inFraction=false, acc) => {
      let finish = () =>
        switch Belt.Float.fromString(acc) {
        | Some(num) => NumberLiteral(num)
        | None => syntaxError(~lexer=s, "Could not parse number literal")
        }
      if Stream.atEnd(stm) {
        finish()
      } else {
        let nc = Stream.readChar(stm)
        if (nc === "." && !inFraction) || isDigit(nc) {
          /* Only switch into fraction mode once the dot itself was consumed */
          scanNumberLiteral(~inFraction=inFraction || nc === ".", acc ++ nc)
        } else {
          stm->Stream.unreadChar
          finish()
        }
      }
    }

    switch c {
    | "[" => LeftBracket
    | "]" => RightBracket
    | "{" => LeftCurly
    | "}" => RightCurly
    | "|" => Pipe
    | "," => Comma
    | _ =>
      if isIdentChar(c) {
        scanIdent(c)
      } else if isDigit(c) {
        scanNumberLiteral(c)
      } else {
        syntaxError(
          ~lexer=s,
          "couldn't identify token: '" ++
          (c ++
          ("' (Code: " ++ (toCharCode(c)->Js.Float.toString ++ ")"))),
        )
      }
    }
  }
  let scan = (s: t) =>
    try Some(scanExn(s)) catch {
    | Stream.End_of_String => None
    }

  let make = stm => {lastToken: ref(None), stm: stm}

  /* Returns the buffered lastToken^ (clearing it), otherwise scans a fresh token */
  let matchNext = (lexer: t) =>
    switch lexer.lastToken.contents {
    | None =>
      Stream.skipBlankChars(lexer.stm)
      scan(lexer)->Belt.Option.getWithDefault(End)
    | Some(token) =>
      lexer.lastToken := None
      token
    }

  let matchToken = (lexer: t, token: token) => matchNext(lexer) == token

  /* Will clear lastToken^; useful when you can't use matchToken and can
     only use nextToken */
  let flushToken = (lexer: t) => {
    if lexer.lastToken.contents !== None {
      lexer.lastToken := None
    }
    ()
  }

  /* This will only scan a new token when lastToken^ was flushed.
     Don't call nextToken in a recursive function without calling matchToken /
     flushToken in between! */
  let nextToken = (lexer: t) =>
    switch lexer.lastToken.contents {
    | Some(token) => token
    | None =>
      let stm = lexer.stm
      if Stream.atEnd(stm) {
        End
      } else {
        Stream.skipBlankChars(stm)
        switch scan(lexer) {
        | Some(token) =>
          lexer.lastToken := Some(token)
          token
        | None => End
        }
      }
    }

  /* Tokenizes a whole input string into a list of tokens */
  let lex = (str: string): list<token> => {
    let stm = Stream.fromString(str)
    let lexer = make(stm)
    let rec lex = acc =>
      switch matchNext(lexer) {
      | End => acc
      | token => lex(list{token, ...acc})
      }
    lex(list{}) |> Belt.List.reverse
  }
}
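
/* A small usage sketch of the lexer (illustrative only, not part of the original gist):
   tokenizing one of the example lines from the comment above. */
let _lexerUsageSketch = () =>
  /* Expected result:
     list{LeftCurly, NumberLiteral(500.), Identifier("g"), RightCurly,
          Identifier("Spiralnudeln")} */
  Lexer.lex(j`{500 g} Spiralnudeln`)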
module Parser = {
  type t = {
    lexer: Lexer.t,
    /* Values tracked over the parsing process */
    quantity: option<float>,
    unit_: option<string>,
    description: option<string>,
    modifiers: option<array<modifier>>,
    comment: option<string>,
  }

  let make = lexer => {
    lexer: lexer,
    quantity: None,
    unit_: None,
    description: None,
    modifiers: None,
    comment: None,
  }

  /* Joins the collected words with spaces, keeping commas attached to the
     previous word */
  let joinCommasAndIdents = arr =>
    Belt.Array.reduce(arr, "", (acc, str) =>
      switch str {
      | "," => acc ++ ","
      | str => acc ++ (" " ++ str)
      }
    )

  let unexpectedTokenError = (lexer: Lexer.t, token: Lexer.token, msg: string) =>
    Lexer.syntaxError(
      ~lexer,
      "Unexpected token '" ++ (Lexer.string_of_token(token) ++ ("' " ++ msg)),
    )

  let modifier = (lexer: Lexer.t, ident: string) => {
    let pos = lexer.stm.pos.contents - String.length(ident) + 1
    switch ident->Js.String.toUpperCase {
    | "OE" => WithoutQuantity
    | _ => Lexer.syntaxError(~lexer, ~pos, "Unknown Modifier: " ++ ident)
    }
  }

  /* Parses the `[modifier, ...]` section */
  let rec modifiers = (p: t, acc: array<modifier>): t => {
    open Lexer
    let {lexer} = p
    let token = lexer->matchNext
    switch token {
    | Identifier(str) => modifiers(p, Js.Array.concat(acc, [modifier(lexer, str)]))
    | Comma => modifiers(p, acc)
    | LeftBracket => modifiers(p, acc)
    | RightBracket => {...p, modifiers: Some(acc)}
    | token => unexpectedTokenError(lexer, token, "in Modifier section")
    }
  }

  /* Parses the `{quantity unit}` section */
  let rec quantifier = (~value: option<float>=?, ~unit_: array<string>=[], p: t): t => {
    open Lexer
    let {lexer} = p
    switch lexer->matchNext {
    | NumberLiteral(value) => quantifier(~unit_, ~value, p)
    | Identifier(u) => quantifier(~unit_=Js.Array.concat([u], unit_), ~value?, p)
    | LeftCurly => quantifier(~unit_, ~value?, p)
    | RightCurly => {
        ...p,
        unit_: Some(Js.Array.joinWith(" ", unit_)),
        quantity: value,
      }
    | token => unexpectedTokenError(lexer, token, "in Quantifier section")
    }
  }

  /* Parses the free text description up to a `|` or the end of the input */
  let rec description = (~acc: array<string>=[], p: t): t => {
    open Lexer
    let {lexer} = p
    switch lexer->nextToken {
    | Pipe
    | End => {...p, description: Some(joinCommasAndIdents(acc)->Js.String.trim)}
    | NumberLiteral(value) =>
      lexer->flushToken
      description(p, ~acc=Js.Array.concat([Js.Float.toString(value)], acc))
    | Identifier(str) =>
      lexer->flushToken
      description(p, ~acc=Js.Array.concat([str], acc))
    | Comma =>
      lexer->flushToken
      description(~acc=Js.Array.concat([","], acc), p)
    | token => unexpectedTokenError(lexer, token, "in Description")
    }
  }

  /* Parses the trailing comment after the `|` */
  let rec comment = (~acc: array<string>=[], p: t): t => {
    open Lexer
    let {lexer} = p
    let addString = (str: string) => comment(p, ~acc=Js.Array.concat([str], acc))
    switch lexer->matchNext {
    | Pipe => comment(p, ~acc)
    | Identifier(str) => comment(p, ~acc=Js.Array.concat([str], acc))
    | Comma => comment(p, ~acc=Js.Array.concat([","], acc))
    | LeftCurly => addString("{")
    | RightCurly => addString("}")
    | LeftBracket => addString("[")
    | RightBracket => addString("]")
    | NumberLiteral(v) => addString(v->Belt.Float.toString)
    | End => {...p, comment: Some(joinCommasAndIdents(acc)->Js.String.trim)}
    }
  }
}
let parse = (str: string): result => {
  open Lexer
  let stm = Stream.fromString(str)
  let lexer = Lexer.make(stm)

  let rec traverse = p =>
    switch lexer->nextToken {
    | End => p
    | LeftBracket => Parser.modifiers(p, [])->traverse
    | LeftCurly => Parser.quantifier(p)->traverse
    | Pipe => Parser.comment(p)->traverse
    | Identifier(_) => Parser.description(p)->traverse
    | token => Parser.unexpectedTokenError(lexer, token, "in Root")
    }

  let run = () => {
    let p = Parser.make(lexer)->traverse
    Belt.Result.Ok({
      quantity: p.quantity,
      unit_: p.unit_,
      description: Belt.Option.getWithDefault(p.description, "?"),
      modifiers: Belt.Option.getWithDefault(p.modifiers, []),
      comment: p.comment,
      raw: str,
    })
  }

  try run() catch {
  | Syntax_Error(msg) => Error({raw: str, msg: msg})
  }
}
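
/* A sketch of what parse yields for a simple line (illustrative only, not part of the
   original gist). For j`{500 g} Spiralnudeln` the expected result is:
   Ok({
     quantity: Some(500.),
     unit_: Some("g"),
     description: "Spiralnudeln",
     modifiers: [],
     comment: None,
     raw: "{500 g} Spiralnudeln",
   }) */
let _parseUsageSketch = () => parse(j`{500 g} Spiralnudeln`)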
// Didn't include all the test setup infra; just an example of how the parser is used.
let allCasesParse = _loc => {
  let input = [
    j` [OE]{5 Haende} grüner Paprika | vorzugsweise grob `,
    j`{1} Zwiebel `,
    j`{1 Dose} (à 212 ml) Mais`,
    j`{1} Knoblauchzehe`,
    j`{1 Stk} Schnittknoblauch | , 3 mm breit`,
    j`{750 g} Mehl (z.B. 2/3 Weizen und 1/3 Dinkelmehl)`,
  ]
  Belt.Array.forEach(input, str => {
    open IngredientParser
    let p = parse(str)
    switch p {
    | Belt.Result.Ok(_ing) => ()
    | Error({msg}) => fail(msg)
    }
  })
}