Skip to content

Instantly share code, notes, and snippets.

@naosense
Last active January 1, 2023 08:33
Show Gist options
  • Save naosense/e398b49dfa2e4bb964de97b8a63a670b to your computer and use it in GitHub Desktop.
Save naosense/e398b49dfa2e4bb964de97b8a63a670b to your computer and use it in GitHub Desktop.
parser combinator in rust
// inspired by https://bodil.lol/parser-combinators/
#![allow(dead_code)]
#[derive(Clone, Debug, Eq, PartialEq)]
struct Element {
name: String,
attributes: Vec<(String, String)>,
children: Vec<Element>,
}
fn the_letter_a(input: &str) -> Result<(&str, ()), &str> {
match input.chars().next() {
Some('a') => Ok((&input['a'.len_utf8()..], ())),
_ => Err(input),
}
}
fn match_literal<'a>(expected: &'static str) -> impl Parser<'a, ()> {
move |input: &'a str| match input.get(0..expected.len()) {
Some(next) if next == expected => Ok((&input[expected.len()..], ())),
_ => Err(input),
}
}
fn identifier(input: &str) -> ParseResult<String> {
let mut matched = String::new();
let mut chars = input.chars();
match chars.next() {
Some(next) if next.is_alphabetic() => matched.push(next),
_ => return Err(input),
}
while let Some(next) = chars.next() {
if next.is_alphanumeric() || next == '-' {
matched.push(next);
} else {
break;
}
}
let next_index = matched.len();
Ok((&input[next_index..], matched))
}
fn pair<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) -> impl Parser<'a, (R1, R2)>
where
P1: Parser<'a, R1>,
P2: Parser<'a, R2>,
{
move |input| match parser1.parse(input) {
Ok((next_input, result1)) => match parser2.parse(next_input) {
Ok((final_input, result2)) => Ok((final_input, (result1, result2))),
Err(err) => Err(err),
},
Err(err) => Err(err),
}
}
fn map<'a, P, F, A, B>(parser: P, map_fn: F) -> impl Parser<'a, B>
where
P: Parser<'a, A>,
F: Fn(A) -> B,
{
move |input| match parser.parse(input) {
Ok((next_input, result)) => Ok((next_input, map_fn(result))),
Err(err) => Err(err),
}
}
type ParseResult<'a, Output> = Result<(&'a str, Output), &'a str>;
trait Parser<'a, Output> {
fn parse(&self, input: &'a str) -> ParseResult<'a, Output>;
fn map<F, NewOutput>(self, map_fn: F) -> BoxedParser<'a, NewOutput>
where
Self: Sized + 'a,
Output: 'a,
NewOutput: 'a,
F: Fn(Output) -> NewOutput + 'a,
{
BoxedParser::new(map(self, map_fn))
}
fn pred<F>(self, predicate: F) -> BoxedParser<'a, Output>
where
Self: Sized + 'a,
Output: 'a,
F: Fn(&Output) -> bool + 'a,
{
BoxedParser::new(pred(self, predicate))
}
fn and_then<F, NextParser, NewOutput>(self, f: F) -> BoxedParser<'a, NewOutput>
where
Self: Sized + 'a,
Output: 'a,
NewOutput: 'a,
NextParser: Parser<'a, NewOutput> + 'a,
F: Fn(Output) -> NextParser + 'a,
{
BoxedParser::new(and_then(self, f))
}
}
struct BoxedParser<'a, Output> {
parser: Box<dyn Parser<'a, Output> + 'a>,
}
impl<'a, Output> BoxedParser<'a, Output> {
fn new<P>(parser: P) -> Self
where
P: Parser<'a, Output> + 'a,
{
BoxedParser {
parser: Box::new(parser),
}
}
}
impl<'a, Output> Parser<'a, Output> for BoxedParser<'a, Output> {
fn parse(&self, input: &'a str) -> ParseResult<'a, Output> {
self.parser.parse(input)
}
}
impl<'a, F, Output> Parser<'a, Output> for F
where
F: Fn(&'a str) -> ParseResult<Output>,
{
fn parse(&self, input: &'a str) -> ParseResult<'a, Output> {
self(input)
}
}
fn left<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) -> impl Parser<'a, R1>
where
P1: Parser<'a, R1>,
P2: Parser<'a, R2>,
{
map(pair(parser1, parser2), |(left, _)| left)
}
fn right<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) -> impl Parser<'a, R2>
where
P1: Parser<'a, R1>,
P2: Parser<'a, R2>,
{
map(pair(parser1, parser2), |(_, right)| right)
}
fn one_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<A>>
where
P: Parser<'a, A>,
{
move |mut input| {
let mut result = Vec::new();
if let Ok((next_input, first_item)) = parser.parse(input) {
input = next_input;
result.push(first_item);
} else {
return Err(input);
}
while let Ok((next_input, next_item)) = parser.parse(input) {
input = next_input;
result.push(next_item);
}
Ok((input, result))
}
}
fn zero_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<A>>
where
P: Parser<'a, A>,
{
move |mut input| {
let mut result = Vec::new();
while let Ok((next_input, next_item)) = parser.parse(input) {
input = next_input;
result.push(next_item);
}
Ok((input, result))
}
}
fn any_char(input: &str) -> ParseResult<char> {
match input.chars().next() {
Some(next) => Ok((&input[next.len_utf8()..], next)),
_ => Err(input),
}
}
fn pred<'a, P, A, F>(parser: P, predicate: F) -> impl Parser<'a, A>
where
P: Parser<'a, A>,
F: Fn(&A) -> bool,
{
move |input| {
if let Ok((next_input, value)) = parser.parse(input) {
if predicate(&value) {
return Ok((next_input, value));
}
}
Err(input)
}
}
fn whitespace_char<'a>() -> impl Parser<'a, char> {
pred(any_char, |c| c.is_whitespace())
}
fn space1<'a>() -> impl Parser<'a, Vec<char>> {
one_or_more(whitespace_char())
}
fn space0<'a>() -> impl Parser<'a, Vec<char>> {
zero_or_more(whitespace_char())
}
fn quoted_string<'a>() -> impl Parser<'a, String> {
map(
right(
match_literal("\""),
left(
zero_or_more(pred(any_char, |c| *c != '"')),
match_literal("\""),
),
),
|chars| chars.into_iter().collect(),
)
}
fn attributes_pair<'a>() -> impl Parser<'a, (String, String)> {
pair(identifier, right(match_literal("="), quoted_string()))
}
fn attributes<'a>() -> impl Parser<'a, Vec<(String, String)>> {
zero_or_more(right(space1(), attributes_pair()))
}
fn element_start<'a>() -> impl Parser<'a, (String, Vec<(String, String)>)> {
right(match_literal("<"), pair(identifier, attributes()))
}
fn single_element<'a>() -> impl Parser<'a, Element> {
map(
left(element_start(), match_literal("/>")),
|(name, attributes)| Element {
name,
attributes,
children: vec![],
},
)
}
fn open_element<'a>() -> impl Parser<'a, Element> {
left(element_start(), match_literal(">")).map(|(name, attributes)| Element {
name,
attributes,
children: vec![],
})
}
fn either<'a, P1, P2, A>(parser1: P1, parser2: P2) -> impl Parser<'a, A>
where
P1: Parser<'a, A>,
P2: Parser<'a, A>,
{
move |input| match parser1.parse(input) {
ok @ Ok(_) => ok,
Err(_) => parser2.parse(input),
}
}
fn element<'a>() -> impl Parser<'a, Element> {
whitespace_wrap(either(single_element(), parent_element()))
}
fn close_element<'a>(expect_name: String) -> impl Parser<'a, String> {
right(match_literal("</"), left(identifier, match_literal(">")))
.pred(move |name| name == &expect_name)
}
fn parent_element<'a>() -> impl Parser<'a, Element> {
open_element().and_then(|el| {
left(zero_or_more(element()), close_element(el.name.clone())).map(move |children| {
let mut el = el.clone();
el.children = children;
el
})
})
}
fn and_then<'a, P, F, A, B, NextP>(parser: P, f: F) -> impl Parser<'a, B>
where
P: Parser<'a, A>,
NextP: Parser<'a, B>,
F: Fn(A) -> NextP,
{
move |input| match parser.parse(input) {
Ok((next_input, result)) => f(result).parse(next_input),
Err(err) => Err(err),
}
}
fn whitespace_wrap<'a, P, A>(parser: P) -> impl Parser<'a, A>
where
P: Parser<'a, A>,
{
right(space0(), left(parser, space0()))
}
#[test]
fn literal_parser() {
let parse_joe = match_literal("Hello Joe!");
assert_eq!(Ok(("", ())), parse_joe.parse("Hello Joe!"));
assert_eq!(
Ok((" Hello Robert!", ())),
parse_joe.parse("Hello Joe! Hello Robert!")
);
assert_eq!(Err("Hello Mike!"), parse_joe.parse("Hello Mike!"));
}
#[test]
fn identifier_parser() {
assert_eq!(
Ok(("", "i-am-an-identifier".to_string())),
identifier("i-am-an-identifier")
);
assert_eq!(
Ok((" entirely an identifier", "not".to_string())),
identifier("not entirely an identifier")
);
assert_eq!(
Err("!not at all an identifier"),
identifier("!not at all an identifier")
);
}
#[test]
fn pair_combinator() {
let tag_opener = pair(match_literal("<"), identifier);
assert_eq!(
Ok(("/>", ((), "my-first-element".to_string()))),
tag_opener.parse("<my-first-element/>")
);
assert_eq!(Err("oops"), tag_opener.parse("oops"));
assert_eq!(Err("!oops"), tag_opener.parse("<!oops"));
}
#[test]
fn one_or_more_combinator() {
let parser = one_or_more(match_literal("ha"));
assert_eq!(Ok(("", vec![(), (), ()])), parser.parse("hahaha"));
assert_eq!(Err("ahah"), parser.parse("ahah"));
assert_eq!(Err(""), parser.parse(""));
}
#[test]
fn zero_or_more_combinator() {
let parser = zero_or_more(match_literal("ha"));
assert_eq!(Ok(("", vec![(), (), ()])), parser.parse("hahaha"));
assert_eq!(Ok(("ahah", vec![])), parser.parse("ahah"));
assert_eq!(Ok(("", vec![])), parser.parse(""));
}
#[test]
fn predicate_combinator() {
let parser = pred(any_char, |c| *c == 'o');
assert_eq!(Ok(("mg", 'o')), parser.parse("omg"));
assert_eq!(Err("lol"), parser.parse("lol"));
}
#[test]
fn quoted_string_parser() {
assert_eq!(
Ok(("", "Hello Joe!".to_string())),
quoted_string().parse("\"Hello Joe!\"")
);
}
#[test]
fn attribute_parser() {
assert_eq!(
Ok((
"",
vec![
("one".to_string(), "1".to_string()),
("two".to_string(), "2".to_string())
]
)),
attributes().parse(" one=\"1\" two=\"2\"")
);
}
#[test]
fn single_element_parser() {
assert_eq!(
Ok((
"",
Element {
name: "div".to_string(),
attributes: vec![("class".to_string(), "float".to_string())],
children: vec![]
}
)),
single_element().parse("<div class=\"float\"/>")
);
}
#[test]
fn xml_parser() {
let doc = r#"
<top label="Top">
<semi-bottom label="Bottom"/>
<middle>
<bottom label="Another bottom"/>
</middle>
</top>"#;
let parsed_doc = Element {
name: "top".to_string(),
attributes: vec![("label".to_string(), "Top".to_string())],
children: vec![
Element {
name: "semi-bottom".to_string(),
attributes: vec![("label".to_string(), "Bottom".to_string())],
children: vec![],
},
Element {
name: "middle".to_string(),
attributes: vec![],
children: vec![Element {
name: "bottom".to_string(),
attributes: vec![("label".to_string(), "Another bottom".to_string())],
children: vec![],
}],
},
],
};
assert_eq!(Ok(("", parsed_doc)), element().parse(doc));
}
#[test]
fn mismatched_closing_tag() {
let doc = r#"
<top>
<bottom/>
</middle>"#;
assert_eq!(Err("</middle>"), element().parse(doc));
}
package playground
import scala.util.{Failure, Success, Try}
object ParserCombinator {
def theLetterA(input: String): Try[(String, Unit)] = {
input.toList match {
case first :: rest if first == 'a' => Success((rest.mkString, ()))
case _ => Failure(ParseError(input))
}
}
def literal(expected: String): Parser[Unit] = {
(input: String) => {
input.take(expected.length) match {
case s if s == expected => Success((input.drop(expected.length), ()))
case _ => Failure(ParseError(input))
}
}
}
def identifier(input: String): ParseResult[String] = {
pair(
(anychar _).pred(c => c.isLetter),
zeroOrMore((anychar _).pred(c => c.isLetterOrDigit || c == '-'))
).parse(input) match {
case Success((rest, (first, second))) =>
Success((rest, first + second.mkString))
case Failure(exception) => Failure(ParseError(input))
}
}
def pair[R1, R2](parser1: Parser[R1], parser2: Parser[R2]): Parser[(R1, R2)] = {
for {
result1 <- parser1
result2 <- parser2
} yield (result1, result2)
}
def map[A, B](parser: Parser[A], fn: (A) => B): Parser[B] = {
(input: String) => {
parser.parse(input) match {
case Success((nextInput, result)) => Success((nextInput, fn(result)))
case err@Failure(_) => err.asInstanceOf[ParseResult[B]]
}
}
}
// Tuples cannot be directly destructured in method or function parameters.
def left[R1, R2](parser1: Parser[R1], parser2: Parser[R2]): Parser[R1] = {
pair(parser1, parser2).map({ case (l, _) => l })
}
// ditto
def right[R1, R2](parser1: Parser[R1], parser2: Parser[R2]): Parser[R2] = {
pair(parser1, parser2).map({ case (_, r) => r })
}
def oneOrMore[A](parser: Parser[A]): Parser[Vector[A]] = {
pair(parser, zeroOrMore(parser)).map(
{ case (head: A, tail: Vector[A]) => head +: tail }
)
}
def zeroOrMore[A](parser: Parser[A]): Parser[Vector[A]] = {
(input: String) => {
var result = Vector[A]()
var remain = input
var break = false
while (!break) {
parser.parse(remain) match {
case Success((nextInput, nextItem)) =>
remain = nextInput
result :+= nextItem
case Failure(_) => break = true
}
}
Success((remain, result))
}
}
def anychar(input: String): ParseResult[Char] = {
input.toList match {
case first :: rest => Success((rest.mkString, first))
case _ => Failure(ParseError(input))
}
}
def pred[A](parser: Parser[A], predicate: A => Boolean): Parser[A] = {
(input: String) => {
parser.parse(input) match {
case Success((nextInput, value)) if predicate(value) => Success((nextInput, value))
case _ => Failure(ParseError(input))
}
}
}
def whitespace(): Parser[Char] = {
(anychar _).pred(c => c.isWhitespace)
}
def space1(): Parser[Vector[Char]] = {
oneOrMore(whitespace())
}
def space0(): Parser[Vector[Char]] = {
zeroOrMore(whitespace())
}
def quotedString(): Parser[String] = {
right(
literal("\""),
left(
zeroOrMore((anychar: Parser[Char]).pred((c: Char) => c != '"')),
literal("\"")
)
).map((chars: Vector[Char]) => chars.mkString)
}
def attributePair(): Parser[(String, String)] = {
pair(identifier, right(literal("="), quotedString()))
}
def attributes(): Parser[Vector[(String, String)]] = {
zeroOrMore(right(space1(), attributePair()))
}
def elementStart(): Parser[(String, Vector[(String, String)])] = {
right(literal("<"), pair(identifier, attributes()))
}
def singleElement(): Parser[Element] = {
left(elementStart(), literal("/>"))
.map({ case (name, attributes) => Element(name, attributes, Vector()) })
}
def openElement(): Parser[Element] = {
left(elementStart(), literal(">"))
.map({ case (name, attributes) => Element(name, attributes, Vector()) })
}
def either[A](parser1: Parser[A], parser2: Parser[A]): Parser[A] = {
(input: String) => {
parser1.parse(input) match {
case ok@Success(_) => ok
case _ => parser2.parse(input)
}
}
}
def element(): Parser[Element] = {
wrap(either(singleElement(), parentElement()))
}
def parentElement(): Parser[Element] = {
for {
el <- openElement()
children <- left(zeroOrMore(element()), closeElement(el.name))
} yield el.copy(children = children)
}
def wrap[A](parser: Parser[A]): Parser[A] = {
right(space0(), left(parser, space0()))
}
def closeElement(expected: String): Parser[String] = {
right(
literal("</"),
left(identifier, literal(">"))
).pred(name => name == expected)
}
type ParseResult[Output] = Try[(String, Output)]
@FunctionalInterface
trait Parser[Output] {
def parse(input: String): ParseResult[Output]
def map[NewOutput](fn: Output => NewOutput): Parser[NewOutput] = {
ParserCombinator.map(this, fn)
}
def pred(predicate: Output => Boolean): Parser[Output] = {
ParserCombinator.pred(this, predicate)
}
def flatMap[NewOutput](fn: Output => Parser[NewOutput]): Parser[NewOutput] = {
(input: String) => {
this.parse(input) match {
case Success((nextInput, result)) => fn(result).parse(nextInput)
case err@Failure(_) => err.asInstanceOf[ParseResult[NewOutput]]
}
}
}
}
implicit def function2parser[Output](f1: String => ParseResult[Output]): Parser[Output] = {
(input: String) => f1.apply(input)
}
case class ParseError(input: String) extends Throwable
case class Element(name: String, attributes: Vector[(String, String)], children: Vector[Element])
}
import playground.ParserCombinator._
import scala.util.{Failure, Success}
class ParserCombinatorTests extends munit.FunSuite {
test("literal parser") {
val parseJoe = literal("Hello Joe!")
assertEquals(parseJoe.parse("Hello Joe!"), Success("", ()))
}
test("identifier parser") {
assertEquals(identifier("i-am-an-identifier"), Success(("", "i-am-an-identifier")))
assertEquals(identifier("not entirely an identifier"), Success((" entirely an identifier", "not")))
assertEquals(identifier("!not at all an identifier"), Failure(ParseError("!not at all an identifier")))
}
test("pair combinator") {
val tagOpener = pair(literal("<"), identifier _)
assertEquals(tagOpener.parse("<my-first-element/>"), Success(("/>", ((), "my-first-element"))))
assertEquals(tagOpener.parse("oops"), Failure(ParseError("oops")))
assertEquals(tagOpener.parse("<!oops"), Failure(ParseError("!oops")))
}
test("right combinator") {
val tagOpener = right(literal("<"), identifier _)
assertEquals(tagOpener.parse("<my-first-element/>"), Success(("/>", "my-first-element")))
assertEquals(tagOpener.parse("oops"), Failure(ParseError("oops")))
assertEquals(tagOpener.parse("<!oops"), Failure(ParseError("!oops")))
}
test("one or more combinator") {
val parser = oneOrMore(literal("ha"))
assertEquals(parser.parse("hahaha"), Success(("", Vector((), (), ()))))
assertEquals(parser.parse("ahah"), Failure(ParseError("ahah")))
assertEquals(parser.parse(""), Failure(ParseError("")))
}
test("zero or more combinator") {
val parser = zeroOrMore(literal("ha"))
assertEquals(parser.parse("hahaha"), Success(("", Vector((), (), ()))))
assertEquals(parser.parse("ahah"), Success(("ahah", Vector.empty)))
assertEquals(parser.parse(""), Success("", Vector.empty))
}
test("predicate combinator") {
val parser = pred(anychar, (c: Char) => c == 'o')
assertEquals(parser.parse("omg"), Success(("mg", 'o')))
}
test("quoted string parser") {
assertEquals(quotedString().parse("\"Hello Joe!\""), Success("", "Hello Joe!"))
}
test("attribute parser") {
assertEquals(attributes().parse(" one=\"1\" two=\"2\""), Success("", Vector(("one", "1"), ("two", "2"))))
}
test("single element parser") {
assertEquals(
singleElement().parse("<div class=\"float\"/>"),
Success((
"",
Element("div", Vector(("class", "float")), Vector()))
)
)
}
test("xml parser") {
val doc =
"""<top label="Top">
| <semi-bottom label="Bottom"/>
| <middle>
| <bottom label="Another bottom"/>
| </middle>
|</top>
|""".stripMargin
val parseDoc = Element(
"top",
Vector(("label", "Top")),
Vector(
Element("semi-bottom", Vector(("label", "Bottom")), Vector()),
Element("middle", Vector(), Vector(Element("bottom", Vector(("label", "Another bottom")), Vector())))
)
)
assertEquals(element().parse(doc), Success(("", parseDoc)))
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment