Created
June 2, 2024 05:13
-
-
Save elfsternberg/85652dd3a9b0d87ff1d3a24d0f411f9c to your computer and use it in GitHub Desktop.
Rust Nom Parser that recognizes "at the beginning of a line."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use nom::{ | |
bytes::complete::{tag, take_while}, | |
combinator::recognize, | |
sequence::preceded, | |
IResult, | |
}; | |
/** | |
* Using Rust Nom, show how to detect "content that begins at the start of a line." This particular | |
* variant rolls forward until it finds any content *other than* the start of a line, which is | |
* defined as "the input token after any \n". | |
* | |
*/ | |
fn is_beginning_of_line(input: &str) -> IResult<&str, ()> { | |
if input.is_empty() { | |
// It took me an absolutely ridiculous amount of time to find a simple "how do you construct | |
// a standard error in Rust Nom" example. Shout-out to Daniel Imfeld (@dimfeld) | |
// (https://imfeld.dev/writing/parsing_with_nom) for being the *23rd* entry Google offered | |
// to answer that question, and being the *first* one with an example that wasn't "How to | |
// write a custom Nom error" or "How to handle errors in Nom." | |
return Err(nom::Err::Error(nom::error::Error::new( | |
input, | |
nom::error::ErrorKind::Eof, | |
))); | |
} | |
let (remaining, _) = recognize(preceded( | |
take_while(|c| c == '\n'), | |
// The empty string tag always succeeds without consuming any of the input. So we're | |
// skipping the start-of-line markers before "trivially" succeeding, always successfully | |
// checking the empty token, so we don't actually consume the next token. | |
tag(""), | |
))(input)?; | |
// The nice thing about using "recognize" is that it will keep all the white space you consumed, | |
// in case you want to count the number of empty lines this parser ate. Since both `input` and | |
// `remaining` are slices pointing into the original input array, comparing them by pointer is | |
// much faster than doing a string comparison. Remaining is now pointing to the first token | |
// *after* the return, which could be literally where the input started, or remaining is where | |
// the input needs to be next *after* the input started with "\n". (I.e, we rolled forward one | |
// or more "\n"'s, but didn't meet anything else so we can't be anywhere but at column 0). | |
if std::ptr::eq(input, remaining) || input.starts_with('\n') { | |
Ok((remaining, ())) | |
} else { | |
// I wasn't really sure what error to return here. This is one of those parsers you'll | |
// probably use in an alt or something and we're consuming a hazy concept of end-of-line | |
// until we're sure we're at the beginning of a line. "Tag" was the closest thing that | |
// seemed right. | |
Err(nom::Err::Error(nom::error::Error::new( | |
input, | |
nom::error::ErrorKind::Tag, | |
))) | |
} | |
} | |
// And this is how you use it; you're looking for *content* at the *start* of the line, not the | |
// start itself. | |
fn pattern_at_beginning_of_line(input: &str) -> IResult<&str, &str> { | |
preceded( | |
is_beginning_of_line, | |
tag("BEGIN"), | |
)(input) | |
} | |
fn main() use nom::{ | |
bytes::complete::{tag, take_while}, | |
combinator::recognize, | |
sequence::preceded, | |
IResult, | |
}; | |
/** | |
* Using Rust Nom, show how to detect "content that begins at the start of a line." This particular | |
* variant rolls forward until it finds any content *other than* the start of a line, which is | |
* defined as "the input token after any \n". | |
* | |
*/ | |
fn is_beginning_of_line(input: &str) -> IResult<&str, ()> { | |
if input.is_empty() { | |
// It took me an absolutely ridiculous amount of time to find a simple "how do you construct | |
// a standard error in Rust Nom" example. Shout-out to Daniel Imfeld (@dimfeld) | |
// (https://imfeld.dev/writing/parsing_with_nom) for being the *23rd* entry Google offered | |
// to answer that question, and being the *first* one with an example that wasn't "How to | |
// write a custom Nom error" or "How to handle errors in Nom." | |
return Err(nom::Err::Error(nom::error::Error::new( | |
input, | |
nom::error::ErrorKind::Eof, | |
))); | |
} | |
let (remaining, _) = recognize(preceded( | |
take_while(|c| c == '\n'), | |
// The empty string tag always succeeds without consuming any of the input. So we're | |
// skipping the start-of-line markers before "trivially" succeeding, always successfully | |
// checking the empty token, so we don't actually consume the next token. | |
tag(""), | |
))(input)?; | |
// The nice thing about using "recognize" is that it will keep all the white space you consumed, | |
// in case you want to count the number of empty lines this parser ate. Since both `input` and | |
// `remaining` are slices pointing into the original input array, comparing them by pointer is | |
// much faster than doing a string comparison. Remaining is now pointing to the first token | |
// *after* the line feed, which could be literally where the input started, or remaining is where | |
// the input needs to be next *after* the input started with "\n". (I.e, we rolled forward one | |
// or more "\n"'s, but didn't meet anything else so we can't be anywhere but at column 0). | |
if std::ptr::eq(input, remaining) || input.starts_with('\n') { | |
Ok((remaining, ())) | |
} else { | |
// I wasn't really sure what error to return here. This is one of those parsers you'll | |
// probably use in an alt or something and we're consuming a hazy concept of end-of-line | |
// until we're sure we're at the beginning of a line. "Tag" was the closest thing that | |
// seemed right. | |
Err(nom::Err::Error(nom::error::Error::new( | |
input, | |
nom::error::ErrorKind::Tag, | |
))) | |
} | |
} | |
// And this is how you use it; you're looking for *content* at the *start* of the line, not the | |
// start itself. | |
fn pattern_at_beginning_of_line(input: &str) -> IResult<&str, &str> { | |
preceded( | |
is_beginning_of_line, | |
tag("BEGIN"), | |
)(input) | |
} | |
fn main() { | |
let input = "\nBEGINThis is where your input will be next."; | |
// Note that `pattern_at_beginning_of_line` matches the parser *after* `is_beginning_of_line`. | |
// Using `preceded` (above) throws out all the line feeds. | |
match pattern_at_beginning_of_line(input) { | |
Ok((remaining, matched)) => println!("Matched: '{}', Remaining: '{}'", matched, remaining), | |
Err(err) => println!("Error: {:?}", err), | |
} | |
} | |
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input is rejected by the predicate.
    #[test]
    fn predicate_empty_in_not_sol() {
        assert!(is_beginning_of_line("").is_err());
    }

    // A lone line feed is accepted.
    #[test]
    fn predicate_sol() {
        assert!(is_beginning_of_line("\n").is_ok());
    }

    // Despite the name, this asserts success: input with no leading line feed is accepted and
    // handed back untouched.
    #[test]
    fn predicate_not_sol() {
        let (remaining, _) =
            is_beginning_of_line("TEST").expect("input without a line feed should be accepted");
        assert_eq!(remaining, "TEST");
    }

    // A leading line feed is skipped and the content after it is returned as the remainder.
    #[test]
    fn predicate_more_than_sol() {
        let (remaining, _) =
            is_beginning_of_line("\nTEST").expect("a leading line feed should be accepted");
        assert_eq!(remaining, "TEST");
    }

    // The keyword at the very start of the input matches.
    #[test]
    fn sample_test() {
        let (remaining, matched) = pattern_at_beginning_of_line("BEGIN: the rest")
            .expect("keyword at the start of the input should match");
        println!("{:?}", matched);
        assert_eq!(remaining, ": the rest");
    }

    // The keyword directly after one line feed matches.
    #[test]
    fn with_leading_return() {
        let (remaining, _) = pattern_at_beginning_of_line("\nBEGIN: the rest")
            .expect("keyword after one line feed should match");
        assert_eq!(remaining, ": the rest");
    }

    // The keyword after several line feeds matches.
    #[test]
    fn with_multiple_leading_return() {
        let (remaining, _) = pattern_at_beginning_of_line("\n\n\nBEGIN: the rest")
            .expect("keyword after several line feeds should match");
        assert_eq!(remaining, ": the rest");
    }

    // A space before the line feed makes the overall pattern fail.
    #[test]
    fn with_space_leading_return() {
        assert!(pattern_at_beginning_of_line(" \nBEGIN: the rest").is_err());
    }
}
let input = "\nBEGINThis is where your input will be next"; | |
match pattern_at_beginning_of_line(input) { | |
Ok((remaining, matched)) => println!("Matched: '{}', Remaining: '{}'", matched, remaining), | |
Err(err) => println!("Error: {:?}", err), | |
} | |
} | |
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input is rejected by the predicate.
    #[test]
    fn predicate_empty_in_not_sol() {
        assert!(is_beginning_of_line("").is_err());
    }

    // A lone line feed is accepted.
    #[test]
    fn predicate_sol() {
        assert!(is_beginning_of_line("\n").is_ok());
    }

    // Despite the name, this asserts success: input with no leading line feed is accepted and
    // handed back untouched.
    #[test]
    fn predicate_not_sol() {
        let (remaining, _) =
            is_beginning_of_line("TEST").expect("input without a line feed should be accepted");
        assert_eq!(remaining, "TEST");
    }

    // A leading line feed is skipped and the content after it is returned as the remainder.
    #[test]
    fn predicate_more_than_sol() {
        let (remaining, _) =
            is_beginning_of_line("\nTEST").expect("a leading line feed should be accepted");
        assert_eq!(remaining, "TEST");
    }

    // The keyword at the very start of the input matches.
    #[test]
    fn sample_test() {
        let (remaining, matched) = pattern_at_beginning_of_line("BEGIN: the rest")
            .expect("keyword at the start of the input should match");
        println!("{:?}", matched);
        assert_eq!(remaining, ": the rest");
    }

    // The keyword directly after one line feed matches.
    #[test]
    fn with_leading_return() {
        let (remaining, _) = pattern_at_beginning_of_line("\nBEGIN: the rest")
            .expect("keyword after one line feed should match");
        assert_eq!(remaining, ": the rest");
    }

    // The keyword after several line feeds matches.
    #[test]
    fn with_multiple_leading_return() {
        let (remaining, _) = pattern_at_beginning_of_line("\n\n\nBEGIN: the rest")
            .expect("keyword after several line feeds should match");
        assert_eq!(remaining, ": the rest");
    }

    // A space before the line feed makes the overall pattern fail.
    #[test]
    fn with_space_leading_return() {
        assert!(pattern_at_beginning_of_line(" \nBEGIN: the rest").is_err());
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment