Skip to content

Instantly share code, notes, and snippets.

@elfsternberg
Created June 2, 2024 05:13
Show Gist options
  • Save elfsternberg/85652dd3a9b0d87ff1d3a24d0f411f9c to your computer and use it in GitHub Desktop.
Save elfsternberg/85652dd3a9b0d87ff1d3a24d0f411f9c to your computer and use it in GitHub Desktop.
Rust Nom Parser that recognizes "at the beginning of a line."
use nom::{
bytes::complete::{tag, take_while},
combinator::recognize,
sequence::preceded,
IResult,
};
/// Skips any run of `\n` at the front of `input`, leaving the parser at column 0 of the next
/// line that has content.
///
/// A nom parser only sees the text still in front of it, never what was consumed earlier, so
/// "beginning of a line" here can only mean one of two things: the very start of `input`, or
/// the position reached after consuming one or more leading `\n` characters. One of the two
/// holds for every non-empty input, so the only way this parser fails is on empty input.
///
/// On success the remaining input starts at the first character that is not `\n` (it may be
/// empty if `input` was nothing but line feeds) and the output is `()`.
///
/// # Errors
///
/// Returns a recoverable `nom::Err::Error` carrying `ErrorKind::Eof` when `input` is empty.
fn is_beginning_of_line(input: &str) -> IResult<&str, ()> {
    if input.is_empty() {
        // This is how a *standard* nom error is built by hand: the offending input plus an
        // `ErrorKind`. Daniel Imfeld's write-up (https://imfeld.dev/writing/parsing_with_nom)
        // has a worked example of this construction.
        return Err(nom::Err::Error(nom::error::Error::new(
            input,
            nom::error::ErrorKind::Eof,
        )));
    }

    // Roll forward over every leading line feed. Matching zero of them is fine: that just means
    // we are still sitting at the start of the input.
    let (remaining, _line_feeds) = take_while(|c: char| c == '\n')(input)?;

    // No further check is needed. Either nothing was consumed (`remaining` is `input` itself)
    // or only `\n` characters were consumed; in both cases `remaining` is at column 0. A test
    // such as `ptr::eq(input, remaining) || input.starts_with('\n')` is true for every
    // non-empty input, so an error branch behind it could never run.
    Ok((remaining, ()))
}
/// Example use of `is_beginning_of_line`: match the literal `BEGIN` as the first *content* of a
/// line.
///
/// The start-of-line parser runs first and its `()` output is thrown away by `preceded`, so a
/// successful result holds only the matched `"BEGIN"` and whatever input follows it.
fn pattern_at_beginning_of_line(input: &str) -> IResult<&str, &str> {
    let mut begin_at_line_start = preceded(is_beginning_of_line, tag("BEGIN"));
    begin_at_line_start(input)
}
fn main() use nom::{
bytes::complete::{tag, take_while},
combinator::recognize,
sequence::preceded,
IResult,
};
/// Skips any run of `\n` at the front of `input`, leaving the parser at column 0 of the next
/// line that has content.
///
/// A nom parser only sees the text still in front of it, never what was consumed earlier, so
/// "beginning of a line" here can only mean one of two things: the very start of `input`, or
/// the position reached after consuming one or more leading `\n` characters. One of the two
/// holds for every non-empty input, so the only way this parser fails is on empty input.
///
/// On success the remaining input starts at the first character that is not `\n` (it may be
/// empty if `input` was nothing but line feeds) and the output is `()`.
///
/// # Errors
///
/// Returns a recoverable `nom::Err::Error` carrying `ErrorKind::Eof` when `input` is empty.
fn is_beginning_of_line(input: &str) -> IResult<&str, ()> {
    if input.is_empty() {
        // This is how a *standard* nom error is built by hand: the offending input plus an
        // `ErrorKind`. Daniel Imfeld's write-up (https://imfeld.dev/writing/parsing_with_nom)
        // has a worked example of this construction.
        return Err(nom::Err::Error(nom::error::Error::new(
            input,
            nom::error::ErrorKind::Eof,
        )));
    }

    // Roll forward over every leading line feed. Matching zero of them is fine: that just means
    // we are still sitting at the start of the input.
    let (remaining, _line_feeds) = take_while(|c: char| c == '\n')(input)?;

    // No further check is needed. Either nothing was consumed (`remaining` is `input` itself)
    // or only `\n` characters were consumed; in both cases `remaining` is at column 0. A test
    // such as `ptr::eq(input, remaining) || input.starts_with('\n')` is true for every
    // non-empty input, so an error branch behind it could never run.
    Ok((remaining, ()))
}
/// Example use of `is_beginning_of_line`: match the literal `BEGIN` as the first *content* of a
/// line.
///
/// The start-of-line parser runs first and its `()` output is thrown away by `preceded`, so a
/// successful result holds only the matched `"BEGIN"` and whatever input follows it.
fn pattern_at_beginning_of_line(input: &str) -> IResult<&str, &str> {
    let mut begin_at_line_start = preceded(is_beginning_of_line, tag("BEGIN"));
    begin_at_line_start(input)
}
/// Runs the example parser over a sample string and prints either the match or the error.
fn main() {
    let sample = "\nBEGINThis is where your input will be next.";

    // `pattern_at_beginning_of_line` reports only what was matched *after* the start-of-line
    // check; the leading line feed is consumed but never shows up in the output.
    let report = match pattern_at_beginning_of_line(sample) {
        Ok((remaining, matched)) => {
            format!("Matched: '{}', Remaining: '{}'", matched, remaining)
        }
        Err(err) => format!("Error: {:?}", err),
    };
    println!("{}", report);
}
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input must be rejected by the start-of-line parser.
    #[test]
    fn predicate_empty_in_not_sol() {
        assert!(is_beginning_of_line("").is_err());
    }

    // A lone line feed is accepted.
    #[test]
    fn predicate_sol() {
        assert!(is_beginning_of_line("\n").is_ok());
    }

    // Input without a leading line feed is accepted and handed back untouched.
    #[test]
    fn predicate_not_sol() {
        let remaining = is_beginning_of_line("TEST").map(|(rest, _)| rest);
        assert_eq!(remaining, Ok("TEST"));
    }

    // A leading line feed is consumed; the content after it is what remains.
    #[test]
    fn predicate_more_than_sol() {
        let remaining = is_beginning_of_line("\nTEST").map(|(rest, _)| rest);
        assert_eq!(remaining, Ok("TEST"));
    }

    // The pattern matches right at the start of the input.
    #[test]
    fn sample_test() {
        let parsed = pattern_at_beginning_of_line("BEGIN: the rest");
        assert!(parsed.is_ok());
        let (remaining, matched) = parsed.unwrap();
        println!("{:?}", matched);
        assert_eq!(remaining, ": the rest");
    }

    // The pattern matches after a single leading line feed.
    #[test]
    fn with_leading_return() {
        let remaining = pattern_at_beginning_of_line("\nBEGIN: the rest").map(|(rest, _)| rest);
        assert_eq!(remaining, Ok(": the rest"));
    }

    // The pattern matches after several leading line feeds.
    #[test]
    fn with_multiple_leading_return() {
        let remaining =
            pattern_at_beginning_of_line("\n\n\nBEGIN: the rest").map(|(rest, _)| rest);
        assert_eq!(remaining, Ok(": the rest"));
    }

    // A space ahead of the line feed makes the combined parser fail.
    #[test]
    fn with_space_leading_return() {
        assert!(pattern_at_beginning_of_line(" \nBEGIN: the rest").is_err());
    }
}
let input = "\nBEGINThis is where your input will be next";
match pattern_at_beginning_of_line(input) {
Ok((remaining, matched)) => println!("Matched: '{}', Remaining: '{}'", matched, remaining),
Err(err) => println!("Error: {:?}", err),
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input must be rejected by the start-of-line parser.
    #[test]
    fn predicate_empty_in_not_sol() {
        assert!(is_beginning_of_line("").is_err());
    }

    // A lone line feed is accepted.
    #[test]
    fn predicate_sol() {
        assert!(is_beginning_of_line("\n").is_ok());
    }

    // Input without a leading line feed is accepted and handed back untouched.
    #[test]
    fn predicate_not_sol() {
        let remaining = is_beginning_of_line("TEST").map(|(rest, _)| rest);
        assert_eq!(remaining, Ok("TEST"));
    }

    // A leading line feed is consumed; the content after it is what remains.
    #[test]
    fn predicate_more_than_sol() {
        let remaining = is_beginning_of_line("\nTEST").map(|(rest, _)| rest);
        assert_eq!(remaining, Ok("TEST"));
    }

    // The pattern matches right at the start of the input.
    #[test]
    fn sample_test() {
        let parsed = pattern_at_beginning_of_line("BEGIN: the rest");
        assert!(parsed.is_ok());
        let (remaining, matched) = parsed.unwrap();
        println!("{:?}", matched);
        assert_eq!(remaining, ": the rest");
    }

    // The pattern matches after a single leading line feed.
    #[test]
    fn with_leading_return() {
        let remaining = pattern_at_beginning_of_line("\nBEGIN: the rest").map(|(rest, _)| rest);
        assert_eq!(remaining, Ok(": the rest"));
    }

    // The pattern matches after several leading line feeds.
    #[test]
    fn with_multiple_leading_return() {
        let remaining =
            pattern_at_beginning_of_line("\n\n\nBEGIN: the rest").map(|(rest, _)| rest);
        assert_eq!(remaining, Ok(": the rest"));
    }

    // A space ahead of the line feed makes the combined parser fail.
    #[test]
    fn with_space_leading_return() {
        assert!(pattern_at_beginning_of_line(" \nBEGIN: the rest").is_err());
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment