Last active
January 1, 2023 08:33
-
-
Save naosense/e398b49dfa2e4bb964de97b8a63a670b to your computer and use it in GitHub Desktop.
parser combinator in rust
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// inspired by https://bodil.lol/parser-combinators/ | |
#![allow(dead_code)] | |
#[derive(Clone, Debug, Eq, PartialEq)] | |
struct Element { | |
name: String, | |
attributes: Vec<(String, String)>, | |
children: Vec<Element>, | |
} | |
fn the_letter_a(input: &str) -> Result<(&str, ()), &str> { | |
match input.chars().next() { | |
Some('a') => Ok((&input['a'.len_utf8()..], ())), | |
_ => Err(input), | |
} | |
} | |
fn match_literal<'a>(expected: &'static str) -> impl Parser<'a, ()> { | |
move |input: &'a str| match input.get(0..expected.len()) { | |
Some(next) if next == expected => Ok((&input[expected.len()..], ())), | |
_ => Err(input), | |
} | |
} | |
fn identifier(input: &str) -> ParseResult<String> { | |
let mut matched = String::new(); | |
let mut chars = input.chars(); | |
match chars.next() { | |
Some(next) if next.is_alphabetic() => matched.push(next), | |
_ => return Err(input), | |
} | |
while let Some(next) = chars.next() { | |
if next.is_alphanumeric() || next == '-' { | |
matched.push(next); | |
} else { | |
break; | |
} | |
} | |
let next_index = matched.len(); | |
Ok((&input[next_index..], matched)) | |
} | |
fn pair<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) -> impl Parser<'a, (R1, R2)> | |
where | |
P1: Parser<'a, R1>, | |
P2: Parser<'a, R2>, | |
{ | |
move |input| match parser1.parse(input) { | |
Ok((next_input, result1)) => match parser2.parse(next_input) { | |
Ok((final_input, result2)) => Ok((final_input, (result1, result2))), | |
Err(err) => Err(err), | |
}, | |
Err(err) => Err(err), | |
} | |
} | |
fn map<'a, P, F, A, B>(parser: P, map_fn: F) -> impl Parser<'a, B> | |
where | |
P: Parser<'a, A>, | |
F: Fn(A) -> B, | |
{ | |
move |input| match parser.parse(input) { | |
Ok((next_input, result)) => Ok((next_input, map_fn(result))), | |
Err(err) => Err(err), | |
} | |
} | |
type ParseResult<'a, Output> = Result<(&'a str, Output), &'a str>; | |
trait Parser<'a, Output> { | |
fn parse(&self, input: &'a str) -> ParseResult<'a, Output>; | |
fn map<F, NewOutput>(self, map_fn: F) -> BoxedParser<'a, NewOutput> | |
where | |
Self: Sized + 'a, | |
Output: 'a, | |
NewOutput: 'a, | |
F: Fn(Output) -> NewOutput + 'a, | |
{ | |
BoxedParser::new(map(self, map_fn)) | |
} | |
fn pred<F>(self, predicate: F) -> BoxedParser<'a, Output> | |
where | |
Self: Sized + 'a, | |
Output: 'a, | |
F: Fn(&Output) -> bool + 'a, | |
{ | |
BoxedParser::new(pred(self, predicate)) | |
} | |
fn and_then<F, NextParser, NewOutput>(self, f: F) -> BoxedParser<'a, NewOutput> | |
where | |
Self: Sized + 'a, | |
Output: 'a, | |
NewOutput: 'a, | |
NextParser: Parser<'a, NewOutput> + 'a, | |
F: Fn(Output) -> NextParser + 'a, | |
{ | |
BoxedParser::new(and_then(self, f)) | |
} | |
} | |
struct BoxedParser<'a, Output> { | |
parser: Box<dyn Parser<'a, Output> + 'a>, | |
} | |
impl<'a, Output> BoxedParser<'a, Output> { | |
fn new<P>(parser: P) -> Self | |
where | |
P: Parser<'a, Output> + 'a, | |
{ | |
BoxedParser { | |
parser: Box::new(parser), | |
} | |
} | |
} | |
impl<'a, Output> Parser<'a, Output> for BoxedParser<'a, Output> { | |
fn parse(&self, input: &'a str) -> ParseResult<'a, Output> { | |
self.parser.parse(input) | |
} | |
} | |
impl<'a, F, Output> Parser<'a, Output> for F | |
where | |
F: Fn(&'a str) -> ParseResult<Output>, | |
{ | |
fn parse(&self, input: &'a str) -> ParseResult<'a, Output> { | |
self(input) | |
} | |
} | |
fn left<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) -> impl Parser<'a, R1> | |
where | |
P1: Parser<'a, R1>, | |
P2: Parser<'a, R2>, | |
{ | |
map(pair(parser1, parser2), |(left, _)| left) | |
} | |
fn right<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) -> impl Parser<'a, R2> | |
where | |
P1: Parser<'a, R1>, | |
P2: Parser<'a, R2>, | |
{ | |
map(pair(parser1, parser2), |(_, right)| right) | |
} | |
fn one_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<A>> | |
where | |
P: Parser<'a, A>, | |
{ | |
move |mut input| { | |
let mut result = Vec::new(); | |
if let Ok((next_input, first_item)) = parser.parse(input) { | |
input = next_input; | |
result.push(first_item); | |
} else { | |
return Err(input); | |
} | |
while let Ok((next_input, next_item)) = parser.parse(input) { | |
input = next_input; | |
result.push(next_item); | |
} | |
Ok((input, result)) | |
} | |
} | |
fn zero_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<A>> | |
where | |
P: Parser<'a, A>, | |
{ | |
move |mut input| { | |
let mut result = Vec::new(); | |
while let Ok((next_input, next_item)) = parser.parse(input) { | |
input = next_input; | |
result.push(next_item); | |
} | |
Ok((input, result)) | |
} | |
} | |
fn any_char(input: &str) -> ParseResult<char> { | |
match input.chars().next() { | |
Some(next) => Ok((&input[next.len_utf8()..], next)), | |
_ => Err(input), | |
} | |
} | |
fn pred<'a, P, A, F>(parser: P, predicate: F) -> impl Parser<'a, A> | |
where | |
P: Parser<'a, A>, | |
F: Fn(&A) -> bool, | |
{ | |
move |input| { | |
if let Ok((next_input, value)) = parser.parse(input) { | |
if predicate(&value) { | |
return Ok((next_input, value)); | |
} | |
} | |
Err(input) | |
} | |
} | |
fn whitespace_char<'a>() -> impl Parser<'a, char> { | |
pred(any_char, |c| c.is_whitespace()) | |
} | |
fn space1<'a>() -> impl Parser<'a, Vec<char>> { | |
one_or_more(whitespace_char()) | |
} | |
fn space0<'a>() -> impl Parser<'a, Vec<char>> { | |
zero_or_more(whitespace_char()) | |
} | |
fn quoted_string<'a>() -> impl Parser<'a, String> { | |
map( | |
right( | |
match_literal("\""), | |
left( | |
zero_or_more(pred(any_char, |c| *c != '"')), | |
match_literal("\""), | |
), | |
), | |
|chars| chars.into_iter().collect(), | |
) | |
} | |
fn attributes_pair<'a>() -> impl Parser<'a, (String, String)> { | |
pair(identifier, right(match_literal("="), quoted_string())) | |
} | |
fn attributes<'a>() -> impl Parser<'a, Vec<(String, String)>> { | |
zero_or_more(right(space1(), attributes_pair())) | |
} | |
fn element_start<'a>() -> impl Parser<'a, (String, Vec<(String, String)>)> { | |
right(match_literal("<"), pair(identifier, attributes())) | |
} | |
fn single_element<'a>() -> impl Parser<'a, Element> { | |
map( | |
left(element_start(), match_literal("/>")), | |
|(name, attributes)| Element { | |
name, | |
attributes, | |
children: vec![], | |
}, | |
) | |
} | |
fn open_element<'a>() -> impl Parser<'a, Element> { | |
left(element_start(), match_literal(">")).map(|(name, attributes)| Element { | |
name, | |
attributes, | |
children: vec![], | |
}) | |
} | |
fn either<'a, P1, P2, A>(parser1: P1, parser2: P2) -> impl Parser<'a, A> | |
where | |
P1: Parser<'a, A>, | |
P2: Parser<'a, A>, | |
{ | |
move |input| match parser1.parse(input) { | |
ok @ Ok(_) => ok, | |
Err(_) => parser2.parse(input), | |
} | |
} | |
fn element<'a>() -> impl Parser<'a, Element> { | |
whitespace_wrap(either(single_element(), parent_element())) | |
} | |
fn close_element<'a>(expect_name: String) -> impl Parser<'a, String> { | |
right(match_literal("</"), left(identifier, match_literal(">"))) | |
.pred(move |name| name == &expect_name) | |
} | |
fn parent_element<'a>() -> impl Parser<'a, Element> { | |
open_element().and_then(|el| { | |
left(zero_or_more(element()), close_element(el.name.clone())).map(move |children| { | |
let mut el = el.clone(); | |
el.children = children; | |
el | |
}) | |
}) | |
} | |
fn and_then<'a, P, F, A, B, NextP>(parser: P, f: F) -> impl Parser<'a, B> | |
where | |
P: Parser<'a, A>, | |
NextP: Parser<'a, B>, | |
F: Fn(A) -> NextP, | |
{ | |
move |input| match parser.parse(input) { | |
Ok((next_input, result)) => f(result).parse(next_input), | |
Err(err) => Err(err), | |
} | |
} | |
fn whitespace_wrap<'a, P, A>(parser: P) -> impl Parser<'a, A> | |
where | |
P: Parser<'a, A>, | |
{ | |
right(space0(), left(parser, space0())) | |
} | |
#[test] | |
fn literal_parser() { | |
let parse_joe = match_literal("Hello Joe!"); | |
assert_eq!(Ok(("", ())), parse_joe.parse("Hello Joe!")); | |
assert_eq!( | |
Ok((" Hello Robert!", ())), | |
parse_joe.parse("Hello Joe! Hello Robert!") | |
); | |
assert_eq!(Err("Hello Mike!"), parse_joe.parse("Hello Mike!")); | |
} | |
#[test] | |
fn identifier_parser() { | |
assert_eq!( | |
Ok(("", "i-am-an-identifier".to_string())), | |
identifier("i-am-an-identifier") | |
); | |
assert_eq!( | |
Ok((" entirely an identifier", "not".to_string())), | |
identifier("not entirely an identifier") | |
); | |
assert_eq!( | |
Err("!not at all an identifier"), | |
identifier("!not at all an identifier") | |
); | |
} | |
#[test] | |
fn pair_combinator() { | |
let tag_opener = pair(match_literal("<"), identifier); | |
assert_eq!( | |
Ok(("/>", ((), "my-first-element".to_string()))), | |
tag_opener.parse("<my-first-element/>") | |
); | |
assert_eq!(Err("oops"), tag_opener.parse("oops")); | |
assert_eq!(Err("!oops"), tag_opener.parse("<!oops")); | |
} | |
#[test] | |
fn one_or_more_combinator() { | |
let parser = one_or_more(match_literal("ha")); | |
assert_eq!(Ok(("", vec![(), (), ()])), parser.parse("hahaha")); | |
assert_eq!(Err("ahah"), parser.parse("ahah")); | |
assert_eq!(Err(""), parser.parse("")); | |
} | |
#[test] | |
fn zero_or_more_combinator() { | |
let parser = zero_or_more(match_literal("ha")); | |
assert_eq!(Ok(("", vec![(), (), ()])), parser.parse("hahaha")); | |
assert_eq!(Ok(("ahah", vec![])), parser.parse("ahah")); | |
assert_eq!(Ok(("", vec![])), parser.parse("")); | |
} | |
#[test] | |
fn predicate_combinator() { | |
let parser = pred(any_char, |c| *c == 'o'); | |
assert_eq!(Ok(("mg", 'o')), parser.parse("omg")); | |
assert_eq!(Err("lol"), parser.parse("lol")); | |
} | |
#[test] | |
fn quoted_string_parser() { | |
assert_eq!( | |
Ok(("", "Hello Joe!".to_string())), | |
quoted_string().parse("\"Hello Joe!\"") | |
); | |
} | |
#[test] | |
fn attribute_parser() { | |
assert_eq!( | |
Ok(( | |
"", | |
vec![ | |
("one".to_string(), "1".to_string()), | |
("two".to_string(), "2".to_string()) | |
] | |
)), | |
attributes().parse(" one=\"1\" two=\"2\"") | |
); | |
} | |
#[test] | |
fn single_element_parser() { | |
assert_eq!( | |
Ok(( | |
"", | |
Element { | |
name: "div".to_string(), | |
attributes: vec![("class".to_string(), "float".to_string())], | |
children: vec![] | |
} | |
)), | |
single_element().parse("<div class=\"float\"/>") | |
); | |
} | |
#[test] | |
fn xml_parser() { | |
let doc = r#" | |
<top label="Top"> | |
<semi-bottom label="Bottom"/> | |
<middle> | |
<bottom label="Another bottom"/> | |
</middle> | |
</top>"#; | |
let parsed_doc = Element { | |
name: "top".to_string(), | |
attributes: vec![("label".to_string(), "Top".to_string())], | |
children: vec![ | |
Element { | |
name: "semi-bottom".to_string(), | |
attributes: vec![("label".to_string(), "Bottom".to_string())], | |
children: vec![], | |
}, | |
Element { | |
name: "middle".to_string(), | |
attributes: vec![], | |
children: vec![Element { | |
name: "bottom".to_string(), | |
attributes: vec![("label".to_string(), "Another bottom".to_string())], | |
children: vec![], | |
}], | |
}, | |
], | |
}; | |
assert_eq!(Ok(("", parsed_doc)), element().parse(doc)); | |
} | |
#[test] | |
fn mismatched_closing_tag() { | |
let doc = r#" | |
<top> | |
<bottom/> | |
</middle>"#; | |
assert_eq!(Err("</middle>"), element().parse(doc)); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package playground | |
import scala.util.{Failure, Success, Try} | |
object ParserCombinator { | |
def theLetterA(input: String): Try[(String, Unit)] = { | |
input.toList match { | |
case first :: rest if first == 'a' => Success((rest.mkString, ())) | |
case _ => Failure(ParseError(input)) | |
} | |
} | |
def literal(expected: String): Parser[Unit] = { | |
(input: String) => { | |
input.take(expected.length) match { | |
case s if s == expected => Success((input.drop(expected.length), ())) | |
case _ => Failure(ParseError(input)) | |
} | |
} | |
} | |
def identifier(input: String): ParseResult[String] = { | |
pair( | |
(anychar _).pred(c => c.isLetter), | |
zeroOrMore((anychar _).pred(c => c.isLetterOrDigit || c == '-')) | |
).parse(input) match { | |
case Success((rest, (first, second))) => | |
Success((rest, first + second.mkString)) | |
case Failure(exception) => Failure(ParseError(input)) | |
} | |
} | |
def pair[R1, R2](parser1: Parser[R1], parser2: Parser[R2]): Parser[(R1, R2)] = { | |
for { | |
result1 <- parser1 | |
result2 <- parser2 | |
} yield (result1, result2) | |
} | |
def map[A, B](parser: Parser[A], fn: (A) => B): Parser[B] = { | |
(input: String) => { | |
parser.parse(input) match { | |
case Success((nextInput, result)) => Success((nextInput, fn(result))) | |
case err@Failure(_) => err.asInstanceOf[ParseResult[B]] | |
} | |
} | |
} | |
// Tuples cannot be directly destructured in method or function parameters. | |
def left[R1, R2](parser1: Parser[R1], parser2: Parser[R2]): Parser[R1] = { | |
pair(parser1, parser2).map({ case (l, _) => l }) | |
} | |
// ditto | |
def right[R1, R2](parser1: Parser[R1], parser2: Parser[R2]): Parser[R2] = { | |
pair(parser1, parser2).map({ case (_, r) => r }) | |
} | |
def oneOrMore[A](parser: Parser[A]): Parser[Vector[A]] = { | |
pair(parser, zeroOrMore(parser)).map( | |
{ case (head: A, tail: Vector[A]) => head +: tail } | |
) | |
} | |
def zeroOrMore[A](parser: Parser[A]): Parser[Vector[A]] = { | |
(input: String) => { | |
var result = Vector[A]() | |
var remain = input | |
var break = false | |
while (!break) { | |
parser.parse(remain) match { | |
case Success((nextInput, nextItem)) => | |
remain = nextInput | |
result :+= nextItem | |
case Failure(_) => break = true | |
} | |
} | |
Success((remain, result)) | |
} | |
} | |
def anychar(input: String): ParseResult[Char] = { | |
input.toList match { | |
case first :: rest => Success((rest.mkString, first)) | |
case _ => Failure(ParseError(input)) | |
} | |
} | |
def pred[A](parser: Parser[A], predicate: A => Boolean): Parser[A] = { | |
(input: String) => { | |
parser.parse(input) match { | |
case Success((nextInput, value)) if predicate(value) => Success((nextInput, value)) | |
case _ => Failure(ParseError(input)) | |
} | |
} | |
} | |
def whitespace(): Parser[Char] = { | |
(anychar _).pred(c => c.isWhitespace) | |
} | |
def space1(): Parser[Vector[Char]] = { | |
oneOrMore(whitespace()) | |
} | |
def space0(): Parser[Vector[Char]] = { | |
zeroOrMore(whitespace()) | |
} | |
def quotedString(): Parser[String] = { | |
right( | |
literal("\""), | |
left( | |
zeroOrMore((anychar: Parser[Char]).pred((c: Char) => c != '"')), | |
literal("\"") | |
) | |
).map((chars: Vector[Char]) => chars.mkString) | |
} | |
def attributePair(): Parser[(String, String)] = { | |
pair(identifier, right(literal("="), quotedString())) | |
} | |
def attributes(): Parser[Vector[(String, String)]] = { | |
zeroOrMore(right(space1(), attributePair())) | |
} | |
def elementStart(): Parser[(String, Vector[(String, String)])] = { | |
right(literal("<"), pair(identifier, attributes())) | |
} | |
def singleElement(): Parser[Element] = { | |
left(elementStart(), literal("/>")) | |
.map({ case (name, attributes) => Element(name, attributes, Vector()) }) | |
} | |
def openElement(): Parser[Element] = { | |
left(elementStart(), literal(">")) | |
.map({ case (name, attributes) => Element(name, attributes, Vector()) }) | |
} | |
def either[A](parser1: Parser[A], parser2: Parser[A]): Parser[A] = { | |
(input: String) => { | |
parser1.parse(input) match { | |
case ok@Success(_) => ok | |
case _ => parser2.parse(input) | |
} | |
} | |
} | |
def element(): Parser[Element] = { | |
wrap(either(singleElement(), parentElement())) | |
} | |
def parentElement(): Parser[Element] = { | |
for { | |
el <- openElement() | |
children <- left(zeroOrMore(element()), closeElement(el.name)) | |
} yield el.copy(children = children) | |
} | |
def wrap[A](parser: Parser[A]): Parser[A] = { | |
right(space0(), left(parser, space0())) | |
} | |
def closeElement(expected: String): Parser[String] = { | |
right( | |
literal("</"), | |
left(identifier, literal(">")) | |
).pred(name => name == expected) | |
} | |
type ParseResult[Output] = Try[(String, Output)] | |
@FunctionalInterface | |
trait Parser[Output] { | |
def parse(input: String): ParseResult[Output] | |
def map[NewOutput](fn: Output => NewOutput): Parser[NewOutput] = { | |
ParserCombinator.map(this, fn) | |
} | |
def pred(predicate: Output => Boolean): Parser[Output] = { | |
ParserCombinator.pred(this, predicate) | |
} | |
def flatMap[NewOutput](fn: Output => Parser[NewOutput]): Parser[NewOutput] = { | |
(input: String) => { | |
this.parse(input) match { | |
case Success((nextInput, result)) => fn(result).parse(nextInput) | |
case err@Failure(_) => err.asInstanceOf[ParseResult[NewOutput]] | |
} | |
} | |
} | |
} | |
implicit def function2parser[Output](f1: String => ParseResult[Output]): Parser[Output] = { | |
(input: String) => f1.apply(input) | |
} | |
case class ParseError(input: String) extends Throwable | |
case class Element(name: String, attributes: Vector[(String, String)], children: Vector[Element]) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import playground.ParserCombinator._ | |
import scala.util.{Failure, Success} | |
class ParserCombinatorTests extends munit.FunSuite { | |
test("literal parser") { | |
val parseJoe = literal("Hello Joe!") | |
assertEquals(parseJoe.parse("Hello Joe!"), Success("", ())) | |
} | |
test("identifier parser") { | |
assertEquals(identifier("i-am-an-identifier"), Success(("", "i-am-an-identifier"))) | |
assertEquals(identifier("not entirely an identifier"), Success((" entirely an identifier", "not"))) | |
assertEquals(identifier("!not at all an identifier"), Failure(ParseError("!not at all an identifier"))) | |
} | |
test("pair combinator") { | |
val tagOpener = pair(literal("<"), identifier _) | |
assertEquals(tagOpener.parse("<my-first-element/>"), Success(("/>", ((), "my-first-element")))) | |
assertEquals(tagOpener.parse("oops"), Failure(ParseError("oops"))) | |
assertEquals(tagOpener.parse("<!oops"), Failure(ParseError("!oops"))) | |
} | |
test("right combinator") { | |
val tagOpener = right(literal("<"), identifier _) | |
assertEquals(tagOpener.parse("<my-first-element/>"), Success(("/>", "my-first-element"))) | |
assertEquals(tagOpener.parse("oops"), Failure(ParseError("oops"))) | |
assertEquals(tagOpener.parse("<!oops"), Failure(ParseError("!oops"))) | |
} | |
test("one or more combinator") { | |
val parser = oneOrMore(literal("ha")) | |
assertEquals(parser.parse("hahaha"), Success(("", Vector((), (), ())))) | |
assertEquals(parser.parse("ahah"), Failure(ParseError("ahah"))) | |
assertEquals(parser.parse(""), Failure(ParseError(""))) | |
} | |
test("zero or more combinator") { | |
val parser = zeroOrMore(literal("ha")) | |
assertEquals(parser.parse("hahaha"), Success(("", Vector((), (), ())))) | |
assertEquals(parser.parse("ahah"), Success(("ahah", Vector.empty))) | |
assertEquals(parser.parse(""), Success("", Vector.empty)) | |
} | |
test("predicate combinator") { | |
val parser = pred(anychar, (c: Char) => c == 'o') | |
assertEquals(parser.parse("omg"), Success(("mg", 'o'))) | |
} | |
test("quoted string parser") { | |
assertEquals(quotedString().parse("\"Hello Joe!\""), Success("", "Hello Joe!")) | |
} | |
test("attribute parser") { | |
assertEquals(attributes().parse(" one=\"1\" two=\"2\""), Success("", Vector(("one", "1"), ("two", "2")))) | |
} | |
test("single element parser") { | |
assertEquals( | |
singleElement().parse("<div class=\"float\"/>"), | |
Success(( | |
"", | |
Element("div", Vector(("class", "float")), Vector())) | |
) | |
) | |
} | |
test("xml parser") { | |
val doc = | |
"""<top label="Top"> | |
| <semi-bottom label="Bottom"/> | |
| <middle> | |
| <bottom label="Another bottom"/> | |
| </middle> | |
|</top> | |
|""".stripMargin | |
val parseDoc = Element( | |
"top", | |
Vector(("label", "Top")), | |
Vector( | |
Element("semi-bottom", Vector(("label", "Bottom")), Vector()), | |
Element("middle", Vector(), Vector(Element("bottom", Vector(("label", "Another bottom")), Vector()))) | |
) | |
) | |
assertEquals(element().parse(doc), Success(("", parseDoc))) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment