kelvinmo/lltxt.rs

## lltxt.rs
//! The loose-tight text format (lttxt)
//!
//! Contains helpers to parse and write to files using this format.
//!
//! # Overview
//!
//! The *loose-tight text format* is a loose, semi-structured text file format
//! for simple uses such as configuration files.  The format is defined in the
//! [specification](#specification) below.
//!
//! # Reading and writing
//!
//! To read, use a reader that can return the input as individual lines (such as
//! [`std::io::BufReader`]).  Call the [`get_tokens`] function to parse each line
//! into a set of tokens.
//!
//! To write, use the [`ltwriteln`] macro to convert a line of tokens into a string.
//!
//! # Specification
//!
//! * A lttxt file is divided into *lines*, which are further divided into string *tokens*.
//! * Lines are delimited in the same way as [`str::lines`], i.e. either a newline
//!   (`\n`) or a carriage return with a line feed (`\r\n`).
//! * Tokens are delimited by a SPACE character (unless the space character is in
//!   a quoted token).
//! * Comments are denoted by the hash (`#`) character (unless it appears in a quoted
//!   token). The hash character and all subsequent characters are ignored until the
//!   end of the line.
//! * Tokens can be *quoted* or *unquoted*.
//! * Quoted tokens are surrounded by quotation marks `"`.  They MAY contain special
//!   characters (defined below).  Apart from the SPACE and hash characters, special
//!   characters MUST be escaped when appearing within a quoted token.  In addition,
//!   backslash character `\` MUST be escaped as `\\`.
//! * Unquoted tokens are not surrounded by quotation marks.  They MUST NOT contain
//!   special characters.
//! * *Special characters* are as follows, with the escape sequence in parentheses:
//!   - SPACE
//!   - hash
//!   - tab (`\t`)
//!   - newline (`\n`)
//!   - carriage return (`\r`)
//!   - quotation marks (`\"`)
//! * When writing a lttxt file, quoted tokens SHOULD only be used if the token contains
//!   special characters.  Otherwise, unquoted tokens SHOULD be used.
//!
//! [`get_tokens`]: ./fn.get_tokens.html
//! [`std::io::BufReader`]: https://doc.rust-lang.org/nightly/std/io/struct.BufReader.html
//! [`str::lines`]: https://doc.rust-lang.org/nightly/std/primitive.str.html#method.lines
//! [`ltwriteln`]: ./macro.ltwriteln!.html
use std::fmt;

/// The error returned when a line does not conform to the lttxt format.
///
/// Variants carrying a `usize` report the position in the line at which the problem
/// was detected.  Use the `Display` implementation for a human-readable message, or
/// the `Debug` implementation to generate detailed information.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseError {
    /// The closing quotation mark of a quoted token is followed by something other
    /// than a space, a comment or the end of the line
    UnescapedQuote(usize),
    /// A quotation mark is encountered when parsing an unquoted token
    UnexpectedQuote(usize),
    /// End of line is encountered when parsing a quoted token
    UnmatchedQuote,
    /// An unknown escape sequence is encountered; carries the character that
    /// followed the backslash
    UnexpectedEscapeSequence(char, usize),
}

impl fmt::Display for ParseError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            ParseError::UnescapedQuote(i) => write!(f, "Unescaped quote in quoted token at position {}", i),
            ParseError::UnexpectedQuote(i) => write!(f, "Unexpected quote in unquoted token at position {}", i),
            ParseError::UnmatchedQuote => write!(f, "End of line without closing quote"),
            ParseError::UnexpectedEscapeSequence(c, i) => write!(f, "Unexpected escape sequence '\\{}' at position {}", c, i)
        }
    }
}

impl std::error::Error for ParseError {
    fn description(&self) -> &str {
        match *self {
            ParseError::UnescapedQuote(_) => "Unescaped quote in quoted token",
            ParseError::UnexpectedQuote(_) => "Unexpected quote in unquoted token",
            ParseError::UnmatchedQuote => "End of line without closing quote",
            ParseError::UnexpectedEscapeSequence(_, _) => "Unexpected escape sequence"
        }
    }

    fn cause(&self) -> Option<&dyn std::error::Error> {
        None
    }
}

/// An iterator over tokens in a line.
///
/// This is created by calling [`get_tokens`].  See the documentation for [`get_tokens`]
/// for further details.
///
/// [`get_tokens`]: ./fn.get_tokens.html
#[derive(Debug, Clone)]
pub struct Tokens<'a> {
    /// The line being parsed.
    s: &'a str,
    /// Offset into `s` at which parsing of the next token starts.
    pos: usize,
}

impl Iterator for Tokens<'_> {
    type Item = Result<String, ParseError>;

    /// Parses and returns the next token on the line.
    ///
    /// Yields `Some(Ok(token))` for each token and `Some(Err(_))` once if the line is
    /// malformed.  Returns `None` when the line, or the part of it before a comment,
    /// is exhausted.  After an error the iterator is finished: later calls return
    /// `None`.
    ///
    /// Positions carried by the returned errors are byte offsets from the start of
    /// the line.
    fn next(&mut self) -> Option<Result<String, ParseError>> {
        if self.pos >= self.s.len() {
            return None;
        }

        let mut token = String::new();
        // `self.pos` is used to slice `self.s`, so offsets must be counted in bytes.
        // `char_indices` yields byte offsets; counting characters instead would
        // mis-slice (or panic on) lines containing multi-byte characters.
        let mut chars = self.s[self.pos..].char_indices();

        let mut in_token = false;
        let mut in_quote = false;

        while let Some((i, c)) = chars.next() {
            match c {
                '#' if !in_quote => {
                    // A comment runs to the end of the line: nothing is left to parse.
                    self.pos = self.s.len();
                    return if in_token { Some(Ok(token)) } else { None };
                }
                '"' if in_quote => {
                    // Closing quote: it must be followed by a delimiting space, a
                    // comment or the end of the line.
                    return match chars.next() {
                        Some((j, ' ')) => {
                            // Resume right after the delimiting space.
                            self.pos += j + 1;
                            Some(Ok(token))
                        }
                        None | Some((_, '#')) => {
                            // End of line, or a comment swallowing the rest of it.
                            // Skipping to the end keeps the comment text from being
                            // parsed as tokens on the next call.
                            self.pos = self.s.len();
                            Some(Ok(token))
                        }
                        Some((j, _)) => {
                            // Characters directly after the closing quote.
                            let err = ParseError::UnescapedQuote(self.pos + j);
                            self.pos = self.s.len();
                            Some(Err(err))
                        }
                    };
                }
                '"' if in_token => {
                    // Quote in the middle of an unquoted token.
                    let err = ParseError::UnexpectedQuote(self.pos + i);
                    self.pos = self.s.len();
                    return Some(Err(err));
                }
                '"' => {
                    // Opening quote of a quoted token.
                    in_token = true;
                    in_quote = true;
                }
                '\\' if in_quote => {
                    // A backslash at the very end of the line is read as an escaped
                    // backslash; the missing closing quote is reported after the loop.
                    let escaped = chars.next().map_or(c, |(_, e)| e);
                    match escaped {
                        '\\' | '"' => token.push(escaped),
                        't' => token.push('\t'),
                        'n' => token.push('\n'),
                        'r' => token.push('\r'),
                        unexpected => {
                            let err =
                                ParseError::UnexpectedEscapeSequence(unexpected, self.pos + i);
                            self.pos = self.s.len();
                            return Some(Err(err));
                        }
                    }
                }
                ' ' if !in_quote => {
                    // A space ends the current token; leading spaces are skipped.
                    if in_token {
                        self.pos += i + 1;
                        return Some(Ok(token));
                    }
                }
                _ => {
                    // Ordinary character, a space or hash inside quotes, or a literal
                    // backslash outside quotes.  The backslash must start a token too,
                    // otherwise a lone `\` is dropped or merged into the next token.
                    in_token = true;
                    token.push(c);
                }
            }
        }

        // The whole line has been consumed; make every later call return `None`.
        self.pos = self.s.len();

        if in_quote {
            return Some(Err(ParseError::UnmatchedQuote));
        }

        if in_token {
            return Some(Ok(token));
        }

        None
    }
}

/// Parses a line in a lttxt file and returns an iterator over tokens in that
/// line.
///
/// The returned iterator borrows `s` and parses lazily, one token per call to
/// `next`.  It yields instances of
/// [`Result`]`<`[`String`]`, `[`ParseError`]`>`.  A [`ParseError`] is returned if an
/// error was encountered while trying to parse the next token.
///
/// [`Result`]: https://doc.rust-lang.org/nightly/std/result/enum.Result.html
/// [`String`]: https://doc.rust-lang.org/nightly/std/string/struct.String.html
/// [`ParseError`]: ./enum.ParseError.html
pub fn get_tokens(s: &str) -> Tokens<'_> {
    // `Tokens<'_>` is the same type as before; it only makes the borrow of `s` visible.
    Tokens { s, pos: 0 }
}

/// Macro for printing an lttxt formatted line to the standard output.
///
/// Each argument is a `&str` token.  Tokens are quoted where necessary and joined
/// with single spaces.  A trailing comma after the last token is optional.
///
/// See [`ltwriteln!`] for more information on the syntax.
///
/// [`ltwriteln!`]: ./macro.ltwriteln!.html
#[macro_export]
macro_rules! ltprintln {
    ($($arg:expr),* $(,)?) => {
        println!(
            "{}",
            vec![$($arg),*]
                .into_iter()
                // `$crate` keeps the path valid when the macro is used from another crate.
                .map(|token| $crate::lttxt::quote_token(token))
                .collect::<Vec<String>>()
                .join(" ")
        )
    };
}

/// Macro for formatting a set of arguments into an lttxt formatted line.
///
/// The first argument is the output buffer.  The remaining arguments are `&str`
/// tokens; they are quoted where necessary, joined with single spaces and written
/// with a trailing newline.  A trailing comma after the last token is optional.
/// Expands to a `writeln!` call and evaluates to its result.
#[macro_export]
macro_rules! ltwriteln {
    ($dst:expr, $($arg:expr),* $(,)?) => {
        writeln!(
            $dst,
            "{}",
            vec![$($arg),*]
                .into_iter()
                // `$crate` keeps the path valid when the macro is used from another crate.
                .map(|token| $crate::lttxt::quote_token(token))
                .collect::<Vec<String>>()
                .join(" ")
        )
    };
}

/// Returns a quoted token if the string contains a special character, or an
/// unquoted token otherwise.
///
/// Special characters are space, hash, tab, newline, carriage return and the
/// quotation mark.  Inside a quoted token, tab, newline, carriage return, quotation
/// marks and backslashes are written as escape sequences so the token can be read
/// back.  An empty string is written as `""`, because an empty unquoted token
/// would not be seen when the line is read back.
pub fn quote_token(s: &str) -> String {
    const SPECIAL: [char; 6] = [' ', '#', '\t', '\n', '\r', '"'];

    let quote = s.is_empty() || s.chars().any(|c| SPECIAL.contains(&c));

    if !quote {
        // No special characters, hence nothing to escape: emit the token verbatim.
        return s.to_string();
    }

    let mut result = String::with_capacity(s.len() + 2);
    result.push('"');

    for c in s.chars() {
        match c {
            '\t' => result.push_str("\\t"),
            '\n' => result.push_str("\\n"),
            '\r' => result.push_str("\\r"),
            '"' => result.push_str("\\\""),
            // The format requires backslashes in quoted tokens to be escaped;
            // left as-is they would be read as the start of an escape sequence.
            '\\' => result.push_str("\\\\"),
            other => result.push(other),
        }
    }

    result.push('"');
    result
}
	//! The loose-tight text format (lttxt)
	//!
	//! Contains helpers to parse and write to files using this format.
	//
	//! # Overview
	//!
	//! The loose-tight text format is a loose, semi-structured text file format
	//! for simple uses such as configuration files. The format is defined in the
	//! [specification](#specification) below.
	//!
	//! # Reading and writing
	//!
	//! To read, use a reader that can return the input as individual lines (such as
	//! [`std::io::BufReader`]). Call the [`get_tokens`] function to parse each line
	//! into a set of tokens.
	//!
	//! To write, use the [`ltwriteln`] macro to convert a line of tokens into a string.
	//!
	//! # Specification
	//!
	//! * A lttxt file is divided into lines, which are further divided into string tokens.
	//! * Lines are delimited in the same way as [`str::lines`], i.e. either a newline
	//! (`\n`) or a carriage return with a line feed (`\r\n`).
	//! * Tokens are delimited by a SPACE character (unless the space character is in
	//! a quoted token).
	//! * Comments are denoted by the hash (`#`) character (unless it appears in a quoted
	//! token). The hash character and all subsequent characters are ignored until the
	//! end of the line.
	//! * Tokens can be quoted or unquoted.
	//! * Quoted tokens are surrounded by quotation marks `"`. They MAY contain special
	//! characters (defined below). Apart from the SPACE and hash characters, special
	//! characters MUST be escaped when appearing within a quoted token. In addition,
	//! backslash character `\` MUST be escaped as `\\`.
	//! * Unquoted tokens are not surrounded by quotation marks. They MUST NOT contain
	//! special characters.
	//! * Special characters are as follows, with the escape sequence in parentheses:
	//! - SPACE
	//! - hash
	//! - tab (`\t`)
	//! - newline (`\n`)
	//! - carriage return (`\r`)
	//! - quotation marks (`\"`)
	//! * When writing a lttxt file, quoted tokens SHOULD only be used if the token contains
	//! special characters. Otherwise, unquoted tokens SHOULD be used.
	//!
	//! [`get_tokens`]: ./fn.get_tokens.html
	//! [`std::io::BufReader`]: https://doc.rust-lang.org/nightly/std/io/struct.BufReader.html
	//! [`str::lines`]: https://doc.rust-lang.org/nightly/std/primitive.str.html#method.lines
	//! [`ltwriteln`]: ./macro.ltwriteln!.html
	use std::fmt;

	/// The error returned when a line does not conform to the lttxt format.
	///
	/// Variants carrying a `usize` report the position in the line at which the problem
	/// was detected. Use the `Display` implementation for a human-readable message, or
	/// the `Debug` implementation to generate detailed information.
	#[derive(Debug, Clone, PartialEq, Eq)]
	pub enum ParseError {
	    /// The closing quotation mark of a quoted token is followed by something other
	    /// than a space, a comment or the end of the line
	    UnescapedQuote(usize),
	    /// A quotation mark is encountered when parsing an unquoted token
	    UnexpectedQuote(usize),
	    /// End of line is encountered when parsing a quoted token
	    UnmatchedQuote,
	    /// An unknown escape sequence is encountered; carries the character that
	    /// followed the backslash
	    UnexpectedEscapeSequence(char, usize),
	}

	impl fmt::Display for ParseError {
	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
	match *self {
	ParseError::UnescapedQuote(i) => write!(f, "Unescaped quote in quoted token at position {}", i),
	ParseError::UnexpectedQuote(i) => write!(f, "Unexpected quote in unquoted token at position {}", i),
	ParseError::UnmatchedQuote => write!(f, "End of line without closing quote"),
	ParseError::UnexpectedEscapeSequence(c, i) => write!(f, "Unexpected escape sequence '\\{}' at position {}", c, i)
	}
	}
	}

	impl std::error::Error for ParseError {
	fn description(&self) -> &str {
	match *self {
	ParseError::UnescapedQuote(_) => "Unescaped quote in quoted token",
	ParseError::UnexpectedQuote(_) => "Unexpected quote in unquoted token",
	ParseError::UnmatchedQuote => "End of line without closing quote",
	ParseError::UnexpectedEscapeSequence(_, _) => "Unexpected escape sequence"
	}
	}

	fn cause(&self) -> Option<&dyn std::error::Error> {
	None
	}
	}

	/// An iterator over tokens in a line.
	///
	/// This is created by calling [`get_tokens`]. See the documentation for [`get_tokens`]
	/// for further details.
	///
	/// [`get_tokens`]: ./fn.get_tokens.html
	#[derive(Debug, Clone)]
	pub struct Tokens<'a> {
	    /// The line being parsed.
	    s: &'a str,
	    /// Offset into `s` at which parsing of the next token starts.
	    pos: usize,
	}

	impl Iterator for Tokens<'_> {
	    type Item = Result<String, ParseError>;

	    /// Parses and returns the next token on the line.
	    ///
	    /// Yields `Some(Ok(token))` for each token and `Some(Err(_))` once if the line is
	    /// malformed. Returns `None` when the line, or the part of it before a comment,
	    /// is exhausted. After an error the iterator is finished: later calls return
	    /// `None`.
	    ///
	    /// Positions carried by the returned errors are byte offsets from the start of
	    /// the line.
	    fn next(&mut self) -> Option<Result<String, ParseError>> {
	        if self.pos >= self.s.len() {
	            return None;
	        }

	        let mut token = String::new();
	        // `self.pos` is used to slice `self.s`, so offsets must be counted in bytes.
	        // `char_indices` yields byte offsets; counting characters instead would
	        // mis-slice (or panic on) lines containing multi-byte characters.
	        let mut chars = self.s[self.pos..].char_indices();

	        let mut in_token = false;
	        let mut in_quote = false;

	        while let Some((i, c)) = chars.next() {
	            match c {
	                '#' if !in_quote => {
	                    // A comment runs to the end of the line: nothing is left to parse.
	                    self.pos = self.s.len();
	                    return if in_token { Some(Ok(token)) } else { None };
	                }
	                '"' if in_quote => {
	                    // Closing quote: it must be followed by a delimiting space, a
	                    // comment or the end of the line.
	                    return match chars.next() {
	                        Some((j, ' ')) => {
	                            // Resume right after the delimiting space.
	                            self.pos += j + 1;
	                            Some(Ok(token))
	                        }
	                        None | Some((_, '#')) => {
	                            // End of line, or a comment swallowing the rest of it.
	                            // Skipping to the end keeps the comment text from being
	                            // parsed as tokens on the next call.
	                            self.pos = self.s.len();
	                            Some(Ok(token))
	                        }
	                        Some((j, _)) => {
	                            // Characters directly after the closing quote.
	                            let err = ParseError::UnescapedQuote(self.pos + j);
	                            self.pos = self.s.len();
	                            Some(Err(err))
	                        }
	                    };
	                }
	                '"' if in_token => {
	                    // Quote in the middle of an unquoted token.
	                    let err = ParseError::UnexpectedQuote(self.pos + i);
	                    self.pos = self.s.len();
	                    return Some(Err(err));
	                }
	                '"' => {
	                    // Opening quote of a quoted token.
	                    in_token = true;
	                    in_quote = true;
	                }
	                '\\' if in_quote => {
	                    // A backslash at the very end of the line is read as an escaped
	                    // backslash; the missing closing quote is reported after the loop.
	                    let escaped = chars.next().map_or(c, |(_, e)| e);
	                    match escaped {
	                        '\\' | '"' => token.push(escaped),
	                        't' => token.push('\t'),
	                        'n' => token.push('\n'),
	                        'r' => token.push('\r'),
	                        unexpected => {
	                            let err =
	                                ParseError::UnexpectedEscapeSequence(unexpected, self.pos + i);
	                            self.pos = self.s.len();
	                            return Some(Err(err));
	                        }
	                    }
	                }
	                ' ' if !in_quote => {
	                    // A space ends the current token; leading spaces are skipped.
	                    if in_token {
	                        self.pos += i + 1;
	                        return Some(Ok(token));
	                    }
	                }
	                _ => {
	                    // Ordinary character, a space or hash inside quotes, or a literal
	                    // backslash outside quotes. The backslash must start a token too,
	                    // otherwise a lone `\` is dropped or merged into the next token.
	                    in_token = true;
	                    token.push(c);
	                }
	            }
	        }

	        // The whole line has been consumed; make every later call return `None`.
	        self.pos = self.s.len();

	        if in_quote {
	            return Some(Err(ParseError::UnmatchedQuote));
	        }

	        if in_token {
	            return Some(Ok(token));
	        }

	        None
	    }
	}

	/// Parses a line in a lttxt file and returns an iterator over tokens in that
	/// line.
	///
	/// The returned iterator borrows `s` and parses lazily, one token per call to
	/// `next`. It yields instances of
	/// [`Result`]`<`[`String`]`, `[`ParseError`]`>`. A [`ParseError`] is returned if an
	/// error was encountered while trying to parse the next token.
	///
	/// [`Result`]: https://doc.rust-lang.org/nightly/std/result/enum.Result.html
	/// [`String`]: https://doc.rust-lang.org/nightly/std/string/struct.String.html
	/// [`ParseError`]: ./enum.ParseError.html
	pub fn get_tokens(s: &str) -> Tokens<'_> {
	    // `Tokens<'_>` is the same type as before; it only makes the borrow of `s` visible.
	    Tokens { s, pos: 0 }
	}

	/// Macro for printing an lttxt formatted line to the standard output.
	///
	/// Each argument is a `&str` token. Tokens are quoted where necessary and joined
	/// with single spaces. A trailing comma after the last token is optional.
	///
	/// See [`ltwriteln!`] for more information on the syntax.
	///
	/// [`ltwriteln!`]: ./macro.ltwriteln!.html
	#[macro_export]
	macro_rules! ltprintln {
	    ($($arg:expr),* $(,)?) => {
	        println!(
	            "{}",
	            vec![$($arg),*]
	                .into_iter()
	                // `$crate` keeps the path valid when the macro is used from another crate.
	                .map(|token| $crate::lttxt::quote_token(token))
	                .collect::<Vec<String>>()
	                .join(" ")
	        )
	    };
	}

	/// Macro for formatting a set of arguments into an lttxt formatted line.
	///
	/// The first argument is the output buffer. The remaining arguments are `&str`
	/// tokens; they are quoted where necessary, joined with single spaces and written
	/// with a trailing newline. A trailing comma after the last token is optional.
	/// Expands to a `writeln!` call and evaluates to its result.
	#[macro_export]
	macro_rules! ltwriteln {
	    ($dst:expr, $($arg:expr),* $(,)?) => {
	        writeln!(
	            $dst,
	            "{}",
	            vec![$($arg),*]
	                .into_iter()
	                // `$crate` keeps the path valid when the macro is used from another crate.
	                .map(|token| $crate::lttxt::quote_token(token))
	                .collect::<Vec<String>>()
	                .join(" ")
	        )
	    };
	}

	/// Returns a quoted token if the string contains a special character, or an
	/// unquoted token otherwise.
	///
	/// Special characters are space, hash, tab, newline, carriage return and the
	/// quotation mark. Inside a quoted token, tab, newline, carriage return, quotation
	/// marks and backslashes are written as escape sequences so the token can be read
	/// back. An empty string is written as `""`, because an empty unquoted token
	/// would not be seen when the line is read back.
	pub fn quote_token(s: &str) -> String {
	    const SPECIAL: [char; 6] = [' ', '#', '\t', '\n', '\r', '"'];

	    let quote = s.is_empty() || s.chars().any(|c| SPECIAL.contains(&c));

	    if !quote {
	        // No special characters, hence nothing to escape: emit the token verbatim.
	        return s.to_string();
	    }

	    let mut result = String::with_capacity(s.len() + 2);
	    result.push('"');

	    for c in s.chars() {
	        match c {
	            '\t' => result.push_str("\\t"),
	            '\n' => result.push_str("\\n"),
	            '\r' => result.push_str("\\r"),
	            '"' => result.push_str("\\\""),
	            // The format requires backslashes in quoted tokens to be escaped;
	            // left as-is they would be read as the start of an escape sequence.
	            '\\' => result.push_str("\\\\"),
	            other => result.push(other),
	        }
	    }

	    result.push('"');
	    result
	}