Gist ryyppy/c86cce0c7699ef1d22f55c9de96e8f3b — Ingredient Parser example (ReScript).
Save it to your computer or open it in GitHub Desktop.
Note: GitHub warns that this file may contain bidirectional Unicode text that could be interpreted or compiled differently than it appears below; to review it, open the file in an editor that reveals hidden Unicode characters.
/* Mini parser inspired by:
   https://troydm.github.io/blog/2014/03/29/writing-micro-compiler-in-ocaml/
*/
/* Raised by the lexer and parser on malformed input; the message carries the
   character position. Caught in `parse` and converted into an `error` record. */
exception Syntax_Error(string)
/* mostly for shopping list related features */
type modifier = WithoutQuantity /* Don't show quantities in shopping list.
 [OE] = [Ohne Einheit] (German: "without unit") */
/* data type parsed from RichText */
type ingredient = {
  quantity: option<float>, /* e.g. 500. from "{500 g}"; None when absent */
  unit_: option<string>, /* e.g. "g"; underscore avoids the `unit` keyword */
  description: string, /* free text, e.g. "Spiralnudeln" */
  modifiers: array<modifier>, /* parsed from the "[...]" section */
  comment: option<string>, /* text after the "|" separator */
  /* Original content */
  raw: string,
}
/* Failed parse: the offending input plus the Syntax_Error message. */
type error = {
  raw: string,
  msg: string,
}
/* Overall outcome of `parse`. */
type result = Belt.Result.t<ingredient, error>
/* Minimal character-stream emulation over a single-line string.
   Only the read position is tracked; characters are handed out one at a
   time and can be pushed back by stepping the position backwards. */
module Stream = {
  type t = {
    pos: ref<int>,
    str: string,
  }

  @bs.val external fromCharCode: char => string = "String.fromCharCode"

  /* Raised when reading past the last character. */
  exception End_of_String

  let fromString = str => {pos: ref(0), str: str}

  /* Char code of the first character of a (single-character) string. */
  let toCharCode = (c: string) => Js.String.charCodeAt(0, c)

  /* Consume and return the character at the current position. */
  let readChar = (stm: t): string => {
    let current = stm.pos.contents
    if current < Js.String.length(stm.str) {
      stm.pos := current + 1
      Js.String.get(stm.str, current)
    } else {
      raise(End_of_String)
    }
  }

  let atEnd = (stm: t): bool => stm.pos.contents >= Js.String.length(stm.str)

  /* Step one character back; no-op at the start of the input. */
  let unreadChar = (stm: t) =>
    if stm.pos.contents > 0 {
      stm.pos := stm.pos.contents - 1
    }

  /* Advance past spaces, tabs and non-breaking spaces (char code 160). */
  let rec skipBlankChars = (stm: t) =>
    if !atEnd(stm) {
      let c = readChar(stm)
      if c === " " || (c === j`\\t` || toCharCode(c) === 160.) {
        skipBlankChars(stm)
      } else {
        unreadChar(stm)
      }
    }
}
module Lexer = {
  /*
   Examples on what to parse:
   [OE]{1 großer Löffel} gesalzene Erdnüsse | kleingehackt
   {500 g} Spiralnudeln
   [OE]{1 Hand} Mungbohnensprossen
   {4} Eier | nur Dotter benötigt
 */
  type token =
    | LeftCurly
    | RightCurly
    | LeftBracket
    | RightBracket
    | Comma
    | NumberLiteral(float)
    | Identifier(string) /* modifier, unit, ingredient, comment text */
    | Pipe
    | End /* Extra token for signalling the end */

  /* Printable token representation, used in error messages. */
  let string_of_token = token =>
    switch token {
    | LeftCurly => "{"
    | RightCurly => "}"
    | LeftBracket => "["
    | RightBracket => "]"
    | Comma => ","
    | NumberLiteral(value) => "NumberLiteral: " ++ Js.Float.toString(value)
    | Identifier(value) => "Identifier: " ++ value
    | Pipe => "|"
    | End => "End"
    }

  let toCharCode = Stream.toCharCode

  let isDigit = (c: string) => {
    let code = toCharCode(c)
    code >= toCharCode("0") && code <= toCharCode("9")
  }

  /* Non-ASCII letters and punctuation that may appear inside free-text
     identifiers (ingredient names, comments, ...). Hoisted to module level
     so the array is allocated once. */
  let identSpecialChars = [
    j`ü`,
    j`Ü`,
    j`Ö`,
    j`ö`,
    j`ä`,
    j`Ä`,
    j`ß`,
    j`î`,
    j`è`,
    j`à`,
    "/",
    "&",
    ";",
    "-",
    ".",
    "(",
    ":",
    "?",
    "!",
    ")",
  ]

  /* True when c may be part of an identifier: ASCII letters plus the
     special characters above. (Replaces a 20-level nested `||` chain.) */
  let isIdentChar = (c: string) => {
    let code = toCharCode(c)
    (code >= toCharCode("A") && code <= toCharCode("Z")) ||
    (code >= toCharCode("a") && code <= toCharCode("z")) ||
    Js.Array.includes(c, identSpecialChars)
  }

  type t = {
    /* One-token lookahead buffer, filled by `nextToken`. */
    lastToken: ref<option<token>>,
    stm: Stream.t,
  }

  /* Raise a Syntax_Error; the position defaults to the current stream pos. */
  let syntaxError = (~lexer: t, ~pos: option<int>=?, msg: string) => {
    let pos = Belt.Option.getWithDefault(pos, lexer.stm.pos.contents)
    raise(Syntax_Error(msg ++ (" on pos " ++ pos->string_of_int)))
  }

  /* Scan the next token.
     Raises Stream.End_of_String when the input is already exhausted and
     Syntax_Error on characters that can't start any token. */
  let scanExn = (s: t) => {
    let stm = s.stm
    let c = Stream.readChar(stm)
    let rec scanIdent = acc =>
      if Stream.atEnd(stm) {
        Identifier(acc)
      } else {
        let nc = Stream.readChar(stm)
        if isIdentChar(nc) {
          scanIdent(acc ++ nc)
        } else {
          stm->Stream.unreadChar
          Identifier(acc)
        }
      }
    /* Turn the accumulated digits (and optional dot) into a token. */
    let finishNumber = acc =>
      switch Belt.Float.fromString(acc) {
      | Some(num) => NumberLiteral(num)
      | None => syntaxError(~lexer=s, "Could not parse number literal")
      }
    /* inFraction = are we currently parsing the fraction after the dot */
    let rec scanNumberLiteral = (~inFraction=false, acc) =>
      /* BUGFIX: a number ending the input used to read past the end and
         raise End_of_String, which `scan` swallowed — the token was
         silently dropped. Finish the literal instead. */
      if Stream.atEnd(stm) {
        finishNumber(acc)
      } else {
        let nc = Stream.readChar(stm)
        if (nc === "." && !inFraction) || isDigit(nc) {
          /* BUGFIX: only switch to fraction mode on the dot itself.
             The old code passed ~inFraction=true on every recursion, so a
             second integer digit (as in "12.5") pre-emptively blocked the
             dot and the literal was cut short at "12". */
          scanNumberLiteral(~inFraction=inFraction || nc === ".", acc ++ nc)
        } else {
          stm->Stream.unreadChar
          finishNumber(acc)
        }
      }
    switch c {
    | "[" => LeftBracket
    | "]" => RightBracket
    | "{" => LeftCurly
    | "}" => RightCurly
    | "|" => Pipe
    | "," => Comma
    | _ =>
      if isIdentChar(c) {
        scanIdent(c)
      } else if isDigit(c) {
        scanNumberLiteral(c)
      } else {
        syntaxError(
          ~lexer=s,
          "couldn't identify token: '" ++
          (c ++
          ("' (Code: " ++ (toCharCode(c)->Js.Float.toString ++ ")"))),
        )
      }
    }
  }

  /* Like scanExn, but maps end-of-input to None. */
  let scan = (s: t) =>
    try Some(scanExn(s)) catch {
    | Stream.End_of_String => None
    }

  let make = stm => {lastToken: ref(None), stm: stm}

  /* Consume and return the next token (the buffered one if present). */
  let matchNext = (lexer: t) =>
    switch lexer.lastToken.contents {
    | None =>
      Stream.skipBlankChars(lexer.stm)
      scan(lexer)->Belt.Option.getWithDefault(End)
    | Some(token) =>
      lexer.lastToken := None
      token
    }

  /* Consume the next token and test it against the expected one. */
  let matchToken = (lexer: t, token: token) => matchNext(lexer) == token

  /* Will clear the lastToken^, useful when you can't use matchToken and can
   only use nextToken */
  let flushToken = (lexer: t) => {
    if lexer.lastToken.contents !== None {
      lexer.lastToken := None
    }
    ()
  }

  /* This will only return the next token when the lastToken^ was flushed
   Don't call nextToken in a recursive function without calling matchToken / flushToken
   inbetween!
 */
  let nextToken = (lexer: t) =>
    switch lexer.lastToken.contents {
    | Some(token) => token
    | None =>
      let stm = lexer.stm
      if Stream.atEnd(stm) {
        End
      } else {
        Stream.skipBlankChars(stm)
        switch scan(lexer) {
        | Some(token) =>
          lexer.lastToken := Some(token)
          token
        | None => End
        }
      }
    }

  /* Tokenize a whole string; mainly useful for tests and debugging. */
  let lex = (str: string): list<token> => {
    let stm = Stream.fromString(str)
    let lexer = make(stm)
    let rec lex = acc =>
      switch matchNext(lexer) {
      | End => acc
      | token => lex(list{token, ...acc})
      }
    lex(list{}) |> Belt.List.reverse
  }
}
module Parser = {
  /* Parser state threaded through the section parsers below. */
  type t = {
    lexer: Lexer.t,
    /* Values tracked over the parsing process */
    quantity: option<float>,
    unit_: option<string>,
    description: option<string>,
    modifiers: option<array<modifier>>,
    comment: option<string>,
  }

  let make = lexer => {
    lexer: lexer,
    quantity: None,
    unit_: None,
    description: None,
    modifiers: None,
    comment: None,
  }

  /* Join word tokens with spaces but glue commas directly onto the
     preceding word; a final Js.String.trim at the call sites removes the
     leading space this produces. */
  let joinCommasAndIdents = arr =>
    Belt.Array.reduce(arr, "", (acc, str) =>
      switch str {
      | "," => acc ++ ","
      | str => acc ++ (" " ++ str)
      }
    )

  let unexpectedTokenError = (lexer: Lexer.t, token: Lexer.token, msg: string) =>
    Lexer.syntaxError(
      ~lexer,
      "Unexpected token '" ++ (Lexer.string_of_token(token) ++ ("' " ++ msg)),
    )

  /* Map a modifier identifier (e.g. "OE") onto its variant; the error
     position points at the start of the identifier. */
  let modifier = (lexer: Lexer.t, ident: string) => {
    /* Consistency: Js.String.length like the rest of the file, instead of
       the OCaml-stdlib String.length used nowhere else. */
    let pos = lexer.stm.pos.contents - Js.String.length(ident) + 1
    switch ident->Js.String.toUpperCase {
    | "OE" => WithoutQuantity
    | _ => Lexer.syntaxError(~lexer, ~pos, "Unknown Modifier: " ++ ident)
    }
  }

  /* "[...]" section: collect modifiers until the closing bracket. */
  let rec modifiers = (p: t, acc: array<modifier>): t => {
    open Lexer
    let {lexer} = p
    let token = lexer->matchNext
    switch token {
    /* BUGFIX: Js.Array.concat(acc, [m]) is [m].concat(acc) (pipe-last
       `self` is the 2nd argument), which PREPENDED each new modifier and
       reversed the order. Append instead, like every other section. */
    | Identifier(str) => modifiers(p, Js.Array.concat([modifier(lexer, str)], acc))
    | Comma => modifiers(p, acc)
    | LeftBracket => modifiers(p, acc)
    | RightBracket => {...p, modifiers: Some(acc)}
    | token => unexpectedTokenError(lexer, token, "in Modifier section")
    }
  }

  /* "{...}" section: an optional number followed by optional unit words. */
  let rec quantifier = (~value: option<float>=?, ~unit_: array<string>=[], p: t): t => {
    open Lexer
    let {lexer} = p
    switch lexer->matchNext {
    | NumberLiteral(value) => quantifier(~unit_, ~value, p)
    | Identifier(u) => quantifier(~unit_=Js.Array.concat([u], unit_), ~value?, p)
    | LeftCurly => quantifier(~unit_, ~value?, p)
    | RightCurly => {
        ...p,
        /* BUGFIX: "{4}" has no unit words; report None instead of Some("") */
        unit_: Js.Array.length(unit_) == 0 ? None : Some(Js.Array.joinWith(" ", unit_)),
        quantity: value,
      }
    | token => unexpectedTokenError(lexer, token, "in Quantifier section")
    }
  }

  /* Free-text description. Tokens are only peeked via nextToken so the
     terminating Pipe/End stays buffered for the caller; everything that is
     consumed must be cleared with flushToken. */
  let rec description = (~acc: array<string>=[], p: t): t => {
    open Lexer
    let {lexer} = p
    switch lexer->nextToken {
    | Pipe
    | End => {...p, description: Some(joinCommasAndIdents(acc)->Js.String.trim)}
    | NumberLiteral(value) =>
      lexer->flushToken
      description(p, ~acc=Js.Array.concat([Js.Float.toString(value)], acc))
    | Identifier(str) =>
      lexer->flushToken
      description(p, ~acc=Js.Array.concat([str], acc))
    | Comma =>
      lexer->flushToken
      description(~acc=Js.Array.concat([","], acc), p)
    | token => unexpectedTokenError(lexer, token, "in Description")
    }
  }

  /* "| ..." section: collect everything up to End as the comment text;
     structural tokens are taken over verbatim. */
  let rec comment = (~acc: array<string>=[], p: t): t => {
    open Lexer
    let {lexer} = p
    let addString = (str: string) => comment(p, ~acc=Js.Array.concat([str], acc))
    switch lexer->matchNext {
    | Pipe => comment(p, ~acc)
    | Identifier(str) => comment(p, ~acc=Js.Array.concat([str], acc))
    | Comma => comment(p, ~acc=Js.Array.concat([","], acc))
    | LeftCurly => addString("{")
    | RightCurly => addString("}")
    | LeftBracket => addString("[")
    | RightBracket => addString("]")
    | NumberLiteral(v) => addString(v->Belt.Float.toString)
    | End => {...p, comment: Some(joinCommasAndIdents(acc)->Js.String.trim)}
    }
  }
}
/* Parse one ingredient line into either an `ingredient` or an `error`.
   Dispatches on the leading token of each section ("[", "{", "|", or a
   bare identifier) and threads the Parser state through until End. */
let parse = (str: string): result => {
  open Lexer
  let lexer = Lexer.make(Stream.fromString(str))
  let rec loop = state =>
    switch lexer->nextToken {
    | End => state
    | LeftBracket => loop(Parser.modifiers(state, []))
    | LeftCurly => loop(Parser.quantifier(state))
    | Pipe => loop(Parser.comment(state))
    | Identifier(_) => loop(Parser.description(state))
    | token => Parser.unexpectedTokenError(lexer, token, "in Root")
    }
  /* Any Syntax_Error raised during lexing/parsing becomes an Error record. */
  try {
    let p = loop(Parser.make(lexer))
    Belt.Result.Ok({
      quantity: p.quantity,
      unit_: p.unit_,
      description: Belt.Option.getWithDefault(p.description, "?"),
      modifiers: Belt.Option.getWithDefault(p.modifiers, []),
      comment: p.comment,
      raw: str,
    })
  } catch {
  | Syntax_Error(msg) => Error({raw: str, msg: msg})
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// didn't include all the Test setup infra.. just an example on how the parser is being used.
// Each sample line must parse without a Syntax_Error; failures are reported via `fail`.
let allCasesParse = _loc => {
  let samples = [
    j` [OE]{5 Haende} grüner Paprika | vorzugsweise grob `,
    j`{1} Zwiebel `,
    j`{1 Dose} (à 212 ml) Mais`,
    j`{1} Knoblauchzehe`,
    j`{1 Stk} Schnittknoblauch | , 3 mm breit`,
    j`{750 g} Mehl (z.B. 2/3 Weizen und 1/3 Dinkelmehl)`,
  ]
  samples->Belt.Array.forEach(line => {
    open IngredientParser
    switch parse(line) {
    | Belt.Result.Ok(_ing) => ()
    | Error({msg}) => fail(msg)
    }
  })
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment