Skip to content

Instantly share code, notes, and snippets.

@RomanHargrave
Created July 5, 2022 02:15
Show Gist options
  • Save RomanHargrave/5328db7bdcd7ff0f78fe9402f44c0831 to your computer and use it in GitHub Desktop.
Save RomanHargrave/5328db7bdcd7ff0f78fe9402f44c0831 to your computer and use it in GitHub Desktop.
use crate::lang::Atom;
use nom::character::complete::{
char,
multispace1,
};
use nom::sequence::{delimited, preceded, terminated};
use nom::combinator::{
map,
map_res,
map_opt,
verify,
value
};
use nom::branch::alt;
use nom::bytes::complete::{
is_not,
take_while_m_n
};
use nom::error::{
FromExternalError,
ParseError
};
use nom::IResult;
use nom::multi::fold_many0;
/// Builds a parser that consumes one specific character and yields a fixed value in its place.
///
/// This is the [nom::combinator::value] combinator wrapped around
/// [nom::character::complete::char].
///
/// The application
/// ```ignore
/// char_val!('a' => '\u{07}');
/// ```
///
/// Is directly equivalent to
/// ```ignore
/// ::nom::combinator::value('\u{07}', ::nom::character::complete::char('a'));
/// ```
///
/// The snippets above are marked `ignore` because, taken on their own, they neither import the
/// macro nor give the compiler enough context to infer the parser's input and error types, so
/// they cannot be built as documentation tests.
#[macro_export]
macro_rules! char_val {
    ($from:literal => $to:expr) => {
        // Leading `::` pins the path to the `nom` crate itself, so the expansion does not depend
        // on which names happen to be in scope at the call site of this exported macro.
        ::nom::combinator::value(
            $to,
            ::nom::character::complete::char($from)
        )
    }
}
// Much of this is derived from the escaped string example packaged with Nom, in part because my
// brain was not working at the time. The basic idea here is to break the body of a string into
// fragments based on whether the parser that consumes a given span of the string body produces
// many, one, or no characters (String, Char, and Void respectively) and then recombine them
// appropriately. Effectively, this takes an input string containing escapes and produces an
// owned string representing the logical value of the input string body.
/// R7RS §6.7. hexadecimal escape sequence parser, invoked following the escape character.
///
/// Consumes text matching the expression `x[0-9A-Fa-f]{1,8}` (the digits are accepted in either
/// case) and yields the character whose code point is the parsed value. The `;` that terminates
/// the sequence inside strings and identifiers is deliberately NOT consumed here; callers that
/// need it must consume it themselves.
///
/// The parser fails when `x` or the first digit is missing, and when the parsed value is not a
/// valid `char` (i.e. [char::from_u32] returns `None`).
pub fn hex_scalar_seq<'s, E>(i: &'s str) -> IResult<&'s str, char, E>
where
    E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError>
{
    // Step 1: skip the leading 'x' and capture between one and eight hexadecimal digits.
    let hex_digits = preceded(
        char('x'),
        take_while_m_n(1, 8, |c: char| c.is_ascii_hexdigit()),
    );

    // Step 2: interpret the captured digits as a base-16, 32-bit unsigned integer.
    let code_point = map_res(hex_digits, |digits| u32::from_str_radix(digits, 16));

    // Step 3: turn the integer into a character, rejecting values that are not valid chars.
    let mut scalar = map_opt(code_point, char::from_u32);

    scalar(i)
}
/// The character that introduces an escape sequence: a single backslash.
const ESCAPE_LEADER: char = '\\';
/// Parse an escape sequence that stands for exactly one character, i.e. any escape other than
/// escaped whitespace.
///
/// The input must begin with the escape leader. What follows is either an `x…;` hexadecimal
/// scalar or one of the single-character mnemonics listed below; anything else is an error.
fn escaped_char<'s, E>(i: &'s str) -> IResult<&'s str, char, E>
where
    E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError>
{
    // x…; scalar. hex_scalar_seq stops before the terminating ';', so it is consumed here.
    let hex_scalar = terminated(hex_scalar_seq, char(';'));

    // Everything that may legally follow the backslash. Escape sequences defined by spec: do not
    // remove or alter. Branches are tried in order, so it may pay to reorder them later such
    // that the most common escapes are tested first and the least common last.
    let escape_code = alt((
        hex_scalar,
        char_val!('a' => '\u{07}'), // Alarm
        char_val!('b' => '\u{08}'), // Backspace
        char_val!('t' => '\u{09}'), // Tab
        char_val!('n' => '\u{0A}'), // Linefeed
        char_val!('r' => '\u{0D}'), // Return
        char_val!('"' => '"'),
        char_val!('\\' => '\\'),
        char_val!('|' => '|'),
    ));

    // The sequence as a whole: the leader is required and discarded, the code yields the char.
    let mut escape = preceded(char(ESCAPE_LEADER), escape_code);

    escape(i)
}
/// Parse escaped whitespace: a backslash followed by one or more whitespace characters.
///
/// The backslash is discarded and the run of whitespace that followed it is returned. A
/// backslash that is not followed by at least one whitespace character is an error.
fn escaped_whitespace<'s, E>(i: &'s str) -> IResult<&'s str, &'s str, E>
where
    E: ParseError<&'s str>
{
    // `multispace1` requires at least one whitespace character after the leader.
    let mut skipped_run = preceded(char(ESCAPE_LEADER), multispace1);

    skipped_run(i)
}
/// Parse a run of ordinary string content, given the string delimiter.
///
/// The returned parser yields the longest non-empty prefix of its input that contains neither
/// the escape leader nor `delim`. It fails when the input is empty or starts with one of those
/// two characters.
fn normal_text<'s, E>(
    delim: char
) -> impl FnMut(&'s str) -> IResult<&'s str, &'s str, E>
where
    E: ParseError<&'s str>
{
    move |i| {
        // The two characters that end a run of normal text: the backslash and the delimiter.
        let stops = [ESCAPE_LEADER, delim];

        let mut text_run = verify(
            // Take as many characters as possible until one of the stop characters is reached.
            is_not(stops.as_slice()),
            // Treat a zero-length match as an error.
            |run: &str| !run.is_empty()
        );

        text_run(i)
    }
}
/// Part of a string collected during parsing.
///
/// Every variant is either empty or holds a borrowed slice or a single `char`, so the type is
/// `Copy` and can be compared directly (e.g. with `assert_eq!`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Fragment<'s> {
    /// A length of text not containing any special sequences.
    String(&'s str),
    /// A single character, corresponding to escape sequences that produce a single character.
    Char(char),
    /// Nothing, having no length.
    Void
}
/// Build a parser that consumes the next piece of a string body delimited by `delim` and
/// classifies it as a [Fragment]: `String` for a run of normal text, `Char` for a character
/// escape, `Void` for escaped whitespace. The parser returns the fragment together with the
/// remaining input, and fails when none of the three forms matches.
fn string_fragment<'s, E>(
    delim: char
) -> impl FnMut(&'s str) -> IResult<&'s str, Fragment<'s>, E>
where
    E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError>
{
    move |i| {
        // Parsers producing a string of characters have their result placed into String.
        let text = map(normal_text(delim), Fragment::String);
        // Likewise, single-character parser results are placed into Char.
        let escape = map(escaped_char, Fragment::Char);
        // Any parser that "skips" a portion of the input produces Void.
        let skipped = value(Fragment::Void, escaped_whitespace);

        // alt() tests in order. Normal text should occur more frequently than the other forms,
        // so it is tried first, followed by character escapes and finally whitespace escapes.
        let mut fragment = alt((text, escape, skipped));

        fragment(i)
    }
}
/// Consume a block of text containing escape sequences, stopping at the first position where no
/// further fragment can be parsed. Escape sequences are resolved at parse time, so the result is
/// an owned string holding the transformed input.
fn string_body<'s, E>(
    delim: char
) -> impl FnMut(&'s str) -> IResult<&'s str, String, E>
where
    E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError>
{
    move |i| {
        let mut body = fold_many0(
            // Apply the string_fragment parser until it fails.
            string_fragment(delim),
            // The accumulator starts out as an empty string.
            String::new,
            // Called once per parsed fragment with the accumulator so far. The accumulator is
            // taken as mutable and handed back, which is not /terribly/ common in other
            // applications of this pattern, but lets the string be built in place rather than
            // allocating a new string on every step (though it will still have to grow).
            |mut acc, piece| {
                match piece {
                    // Normal text: append the whole slice.
                    Fragment::String(text) => acc.push_str(text),
                    // Character escape: append the single character it stands for.
                    Fragment::Char(ch) => acc.push(ch),
                    // Skipped input contributes nothing to the accumulator.
                    Fragment::Void => {}
                }
                acc
            }
        );

        body(i)
    }
}
/// Parse a complete string enclosed by `delimiter` on both sides and yield its body as an owned
/// `String` with all escapes resolved. The delimiters themselves are not part of the result.
pub fn delimited_string<'s, E>(
    delimiter: char
) -> impl FnMut(&'s str) -> IResult<&'s str, String, E>
where
    E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError>
{
    move |i| {
        // Opening and closing marks are the same character.
        let opening = char(delimiter);
        let closing = char(delimiter);

        // Opening mark, then the body (including any escaped chars, etc...), then closing mark.
        let mut enclosed = delimited(opening, string_body(delimiter), closing);

        enclosed(i)
    }
}
/// Consume a span of text delimited by double quotes and emit an [Atom::String] that owns its
/// computed value, or an error if the input is not a valid string.
pub fn string<'s, E>(i: &'s str) -> IResult<&'s str, Atom<'s>, E>
where
    E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError>
{
    // Parse the quoted text, then move the computed body into the Atom::String variant.
    let mut quoted = map(delimited_string('"'), Atom::String);

    quoted(i)
}
/// Consume a span of text delimited by vertical lines (`|…|`), which is used for complex
/// identifiers, and emit its computed value.
///
/// NOTE(review): the value is wrapped in [Atom::String], exactly as for ordinary strings.
/// Confirm against the `Atom` definition whether an identifier-specific variant was intended.
pub fn long_identifier<'s, E>(i: &'s str) -> IResult<&'s str, Atom<'s>, E>
where
    E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError>
{
    let mut barred = map(delimited_string('|'), Atom::String);

    barred(i)
}
#[cfg(test)]
mod test {
    use super::*;
    use nom::error::ErrorKind;

    /// Error type requested from the parsers under test: the input at the point of failure
    /// paired with the kind of error that was raised.
    type WantError<'s> = (&'s str, ErrorKind);

    /// Test that the hexadecimal scalar parser accepts 1 to 8 digits in either case, leaves the
    /// `;` terminator alone, and rejects values that are not valid characters.
    #[test]
    fn parses_scalar_seq() {
        let parse =
            hex_scalar_seq::<WantError>;

        assert_eq!(parse("xAE"), Ok(("", '\u{AE}')),
                   "Parses one-byte scalar");

        assert_eq!(parse("xae"), Ok(("", '\u{AE}')),
                   "Parser is not case-sensitive");

        assert_eq!(parse("xAe"), Ok(("", '\u{AE}')),
                   "Parser is not case-sensitive");

        // we do not want the parser to consume ; as it is not applicable to its use in the
        // character literal parser
        assert_eq!(parse("xAE;"), Ok((";", '\u{AE}')),
                   "Parser should not parse string/ident hex scalar terminator");

        parse("x")
            .expect_err("Parser should not parse hex scalar seq with 0 bits");

        assert_eq!(parse("x000000AE"), Ok(("", '\u{AE}')),
                   "Parser should parse hex scalar seq up to 32 bits");

        assert_eq!(parse("x00000000AE"), Ok(("AE", '\0')),
                   "Parser should not consume more than 32-bits worth of hex chars");

        // values that fit in 32 bits but are not valid characters must be rejected
        parse("xD800")
            .expect_err("Parser should not accept a surrogate code point");

        parse("x110000")
            .expect_err("Parser should not accept a value beyond the last code point");
    }

    /// Test that the whitespace escape parser consumes any quantity of whitespace and halts at
    /// the first non-whitespace character.
    #[test]
    fn parses_escaped_whitespace() {
        let parse =
            escaped_whitespace::<WantError>;

        assert_eq!(parse(r"\ "), Ok(("", " ")),
                   "Parser should consume whitespace");

        assert_eq!(parse(r"\   "), Ok(("", "   ")),
                   "Parser should consume any amount of contiguous whitespace");

        assert_eq!(parse("\\ \n "), Ok(("", " \n ")),
                   "Parser should consume any whitespace, including linefeed");

        assert_eq!(parse(r"\ a "), Ok(("a ", " ")),
                   "Parser should halt where contiguous whitespace breaks");

        assert_eq!(parse(r"\   a"), Ok(("a", "   ")),
                   "Parser should halt where contiguous whitespace breaks");

        parse(r"\a")
            .expect_err("Parser should require at least one whitespace character");
    }

    /// Test that the normal text parser consumes any quantity of normal text and stops at the
    /// first backslash or quote. Also test that the parser produces an error when asked to parse
    /// text beginning with an unexpected special character.
    #[test]
    fn parses_normal_text() {
        let mut parse =
            normal_text::<WantError>('"');

        // the parser should entirely consume an input containing no escape leader nor delimiter
        assert_eq!(parse("one two 3."), Ok(("", "one two 3.")),
                   "Parser consumes normal text");

        // the parser should stop at its configured delimiter
        assert_eq!(parse("literally.\""), Ok(("\"", "literally.")),
                   "Parser does not consume past delimiter");

        assert_eq!(normal_text::<WantError>('|')("pop|tarts"), Ok(("|tarts", "pop")),
                   "Parser does not consume past delimiter");

        // the parser should always stop at \
        assert_eq!(parse(r"literally.\"), Ok((r"\", "literally.")),
                   "Parser does not consume escape leader");

        // the parser should always stop at \
        assert_eq!(parse(r"pop\tarts"), Ok((r"\tarts", "pop")),
                   "Parser does not consume escape leader");

        // e.g. unless some other character that might otherwise be a delimiter (|) is this
        // parser's configured delimiter, it should consume it.
        assert_eq!(parse("pop|tarts"), Ok(("", "pop|tarts")),
                   "Parser consumes other delimiters unless told otherwise");

        // the following two cases will not return Ok() because the input begins with a character
        // the parser must stop at, leaving it with an empty (and therefore invalid) match
        parse("\\Strawberry Pop Tarts may be a cheap and inexpensive source of incendiary devices.")
            .expect_err("Consumed text beginning with backslash");

        parse("\"Strawberry Pop Tarts may be a cheap and inexpensive source of incendiary devices.")
            .expect_err("Consumed text beginning with quote");
    }

    /// Assert, for each `input => output, message` triple, that the character escape parser
    /// consumes the whole input and yields the given character.
    macro_rules! expect_char_escape {
        ($($input:literal => $output:literal, $msg:literal,)+) => {
            $( assert_eq!(
                escaped_char::<WantError>($input),
                Ok(("", $output)),
                "{}: unexpected result for input {}", $msg, $input
            ); )+
        }
    }

    /// Test that the character escape sequence parser calls complex escape sequence parsers as
    /// needed and returns the expected character for a given escape sequence as defined in
    /// R7RS §6.7.
    #[test]
    fn parses_char_escape() {
        expect_char_escape!(
            r"\xAE;" => '\u{AE}', "Should parse hexadecimal scalar escape seq",
            r"\a"    => '\u{07}', "Should parse alarm escape seq",
            r"\b"    => '\u{08}', "Should parse backspace escape seq",
            r"\t"    => '\u{09}', "Should parse tab escape seq",
            r"\n"    => '\u{0A}', "Should parse linefeed escape seq",
            r"\r"    => '\u{0D}', "Should parse carriage return escape seq",
            "\\\""   => '\u{22}', "Should parse double quote escape seq",
            r"\\"    => '\u{5C}', "Should parse backslash escape seq",
            r"\|"    => '\u{7C}', "Should parse vertical line escape seq",
        );

        // should not parse undefined character escape. if adding new escapes, ensure that this
        // remains undefined (e.g. update it *here* to something not in the escape parser). The
        // input carries the escape leader so that the failure comes from the unknown escape
        // character rather than from the missing backslash.
        escaped_char::<WantError>(r"\z")
            .expect_err("Should not parse undefined escape sequence");

        // input that lacks the escape leader altogether is not an escape sequence
        escaped_char::<WantError>("z")
            .expect_err("Should not parse input without escape leader");

        // within an escape, the hexadecimal scalar must be terminated by ;
        escaped_char::<WantError>(r"\xAE")
            .expect_err("Should not parse unterminated hexadecimal scalar escape seq");
    }

    // TODO assert_matches! stabilization would be very, very welcome
    /// Assert that an expression matches a pattern, panicking with the given message, the
    /// pattern, and the actual value otherwise.
    macro_rules! expect_match {
        ($left:expr, $right:pat_param, $fail_msg:expr) => {
            match $left {
                $right => (),
                thing => panic!(
                    "{}: expected {} but got {:?} instead",
                    $fail_msg, stringify!($right), thing
                )
            }
        }
    }

    /// Test that the fragment parser emits the correct fragment variants for a given input.
    #[test]
    fn emits_fragment_variants() {
        let mut parse =
            string_fragment::<WantError>('"');

        expect_match!(parse("The Pop Tarts ... flames 10-18 inches"), Ok((_, Fragment::String(_))),
                      "Did not produce String fragment for normal text");

        expect_match!(parse(r"\xAE;"), Ok((_, Fragment::Char(_))),
                      "Did not produce Char fragment for character escape");

        expect_match!(parse(r"\ "), Ok((_, Fragment::Void)),
                      "Did not produce Void fragment for escaped whitespace");

        expect_match!(parse("\\ \n "), Ok((_, Fragment::Void)),
                      "Did not produce Void fragment for escaped whitespace containing linefeed");
    }

    /// Test that the string body parser consumes and recombines the input string as expected.
    #[test]
    fn parses_string_body() {
        let mut parse =
            string_body::<WantError>('"');

        assert_eq!(parse(r"\x1FAD0; Pop Tarts"), Ok(("", String::from("\u{1FAD0} Pop Tarts"))),
                   "Failed to transform char fragment followed by string fragment");

        assert_eq!(parse(r"one\ two"), Ok(("", String::from("onetwo"))),
                   "Failed to transform string fragments separated by void fragment");

        assert_eq!(parse(""), Ok(("", String::from(""))),
                   "Failed to transform empty input to empty string");

        assert_eq!(parse(r"\ "), Ok(("", String::from(""))),
                   "Failed to transform single void fragment to empty string");

        assert_eq!(parse(r"\ \n18-10 inches"), Ok(("", String::from("\n18-10 inches"))),
                   "Failed to transform [Void,Char,String] sequence");

        // non-printable chars that may appear in strings unescaped
        // yes, the newlines are supposed to be there
        assert_eq!(parse("\nStrawberry\n"), Ok(("", String::from("\nStrawberry\n"))),
                   "Failed to transform String fragment containing newlines");

        assert_eq!(parse("\t"), Ok(("", String::from("\t"))),
                   "Failed to transform String fragment containing tab");
    }

    /// Test that delimited strings are correctly parsed, including where broken by newlines,
    /// etc… Also test that escaped string delimiters are treated nicely. Note that most tests
    /// concerned with the body parser are in [parses_string_body].
    #[test]
    fn parses_delimited_string() {
        let parse =
            string::<WantError>;

        assert_eq!(
            parse("\"Strawberry Pop Tarts\""),
            Ok(("", Atom::String(String::from("Strawberry Pop Tarts")))),
            "Failed to parse valid delimited string"
        );

        assert_eq!(
            parse("\"\\\"flames 18-10 inches\""),
            Ok(("", Atom::String(String::from("\"flames 18-10 inches")))),
            "Failed to parse valid delimited string with single escaped delimiter"
        );

        assert_eq!(
            parse("\"\\\"flames 18-10 inches\\\" in height\""),
            Ok(("", Atom::String(String::from("\"flames 18-10 inches\" in height")))),
            "Failed to parse valid delimited string with balanced escaped delimiters"
        );

        assert_eq!(
            parse("\"incendiary devices.\n Toasters\""),
            Ok(("", Atom::String(String::from("incendiary devices.\n Toasters")))),
            "Failed to parse valid delimited string containing unescaped newline"
        );

        parse("Pop Tarts may be … incendiary devices")
            .expect_err("Did not yield error when parsing un-delimited string");

        parse("\"Pop Tarts")
            .expect_err("Did not yield error when parsing string without closing delimiter");
    }

    /// Test that identifiers delimited by vertical lines are parsed, including an escaped
    /// delimiter within the identifier.
    #[test]
    fn parses_delimited_identifier() {
        let parse =
            long_identifier::<WantError>;

        assert_eq!(
            parse("|pop tarts|"),
            Ok(("", Atom::String(String::from("pop tarts")))),
            "Failed to parse delimited identifier"
        );

        assert_eq!(
            parse(r"|pop\|tarts|"),
            Ok(("", Atom::String(String::from("pop|tarts")))),
            "Failed to parse delimited identifier containing escaped delimiter"
        );
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment