Skip to content

Instantly share code, notes, and snippets.

@sekhat
Created January 24, 2022 18:58
Show Gist options
  • Save sekhat/3406a54d5171902c01bac4efec3f8623 to your computer and use it in GitHub Desktop.
Save sekhat/3406a54d5171902c01bac4efec3f8623 to your computer and use it in GitHub Desktop.
use std::iter::Peekable;
use std::num::NonZeroUsize;
/// A text token returned from the tokenizer.
///
/// Every variant borrows a slice of the original input, so tokens are
/// cheap to copy; `Debug` matches the sibling `LayoutToken` enum.
#[derive(Debug, Clone, Copy)]
enum TextToken<'a> {
    /// A run of non-whitespace text characters
    Text(&'a str),
    /// A run of non-whitespace text characters that isn't allowed to
    /// be broken on to the next line
    DontBreak(&'a str),
    /// A run of characters that are all whitespace
    Whitespace(&'a str),
}
/// Returns `true` when `c` is one of the whitespace characters the
/// tokenizer recognises: space, tab, carriage return or line feed.
fn is_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\r' | '\n')
}
/// The Tokenizer: splits a stream of string slices into alternating
/// runs of whitespace and non-whitespace characters (`TextToken`s).
struct Tokenizer<'a, I> {
    /// The underlying iterator of input string slices
    input: I,
    /// The remainder of the slice currently being tokenized; `None`
    /// means the next call must pull a fresh slice from `input`
    current: Option<&'a str>,
}
impl<'a, I> Tokenizer<'a, I>
where
    I: Iterator<Item = &'a str>,
{
    /// Creates a new `Tokenizer` from anything that converts into the
    /// underlying iterator type, starting with no partial slice.
    fn new<V>(input: V) -> Self
    where
        V: IntoIterator<IntoIter = I>,
    {
        let input = input.into_iter();
        Self { input, current: None }
    }
}
// Hand-written Clone so only `I: Clone` is required; `current` is an
// `Option` of a borrowed slice and therefore simply copied.
impl<'a, I> Clone for Tokenizer<'a, I>
where
    I: Iterator + Clone,
{
    fn clone(&self) -> Self {
        Self {
            input: self.input.clone(),
            current: self.current,
        }
    }
}
impl<'a, I> Iterator for Tokenizer<'a, I>
where
    I: Iterator<Item = &'a str>,
{
    type Item = TextToken<'a>;

    /// Yields the next run of all-whitespace or all-text characters.
    ///
    /// Empty input slices are skipped. A run never spans two input
    /// slices, so adjacent `Text` tokens may belong to the same
    /// logical word; the layout layer accounts for that.
    fn next(&mut self) -> Option<Self::Item> {
        // Loop instead of recursing (the original recursed on empty
        // slices) so a long run of empty input slices cannot blow the
        // stack.
        loop {
            // Use the leftover from the previous call if there is one,
            // otherwise read the next slice from our input iterator.
            let string = match self.current.take() {
                Some(string) => string,
                None => self.input.next()?,
            };
            // Walk the characters together with their byte indices.
            let mut chars = string.char_indices().peekable();
            // The first character decides whether this run is
            // whitespace or text; an empty slice is skipped entirely.
            let is_matching_whitespace = match chars.peek() {
                Some((_, c)) => is_whitespace(*c),
                None => continue,
            };
            // Find the byte index where the run ends: the first
            // character whose whitespace-ness differs from the first
            // character's.
            let end_index = chars
                .find(|(_, c)| is_whitespace(*c) != is_matching_whitespace)
                .map(|(i, _)| i);
            // Split the slice into the run itself and the remainder;
            // the remainder (if any) is stashed for the next call.
            let (result, rest) = match end_index {
                Some(index) => (&string[..index], Some(&string[index..])),
                None => (string, None),
            };
            self.current = rest;
            // And return the correctly-typed token.
            return Some(if is_matching_whitespace {
                TextToken::Whitespace(result)
            } else {
                TextToken::Text(result)
            });
        }
    }
}
/// A layout instruction emitted by the text layout iterator.
///
/// `Copy`/`PartialEq` are free here (the only payload is a `&str`)
/// and make the tokens easy to compare and pass around.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LayoutToken<'a> {
    /// A run of text to emit at the current position
    Text(&'a str),
    /// A single space between words
    Space,
    /// A line break
    Newline,
}
/// Represents the items that can be pushed back, to be emitted before
/// the tokenizer is consulted again.
enum PushBack<'a> {
    /// A text token to be used instead of the next item from the tokenizer
    Token(TextToken<'a>),
    /// The TextLayout should output a new line, then the next text token
    /// to process should be the contained `TextToken`
    NewlineThen(TextToken<'a>),
    /// The TextLayout should output the contained `LayoutToken` next
    LayoutToken(LayoutToken<'a>),
    /// There is nothing in the push back
    None,
}
impl<'a> PushBack<'a> {
    /// Moves the current push-back value out to the caller, leaving
    /// `PushBack::None` in its place.
    fn take(&mut self) -> PushBack<'a> {
        core::mem::replace(self, PushBack::None)
    }
}
/// Splits `input` at the given character (not byte) index.
///
/// If `index` is at or past the end of the string, the whole input is
/// returned as the left half and the right half is empty.
fn split_at_char_index(input: &str, index: usize) -> (&str, &str) {
    // `nth` replaces the non-idiomatic `skip(index).next()` chain; it
    // maps the character index to the corresponding byte index.
    match input.char_indices().nth(index) {
        Some((byte_index, _)) => input.split_at(byte_index),
        None => (input, ""),
    }
}
/// Word-wraps the text produced by a `Tokenizer` to a fixed width,
/// emitting `LayoutToken`s.
struct TextLayout<'a, I>
where
    I: Iterator<Item = &'a str>,
{
    /// The character count that words will be wrapped at
    width: usize,
    /// The current character position within the current line
    current: usize,
    /// Holds the push back state: a pending item to emit before
    /// reading more tokens
    push_back: PushBack<'a>,
    /// The tokenizer to read `TextToken`s from; peekable so runs of
    /// whitespace can be collapsed by looking ahead
    tokenizer: Peekable<Tokenizer<'a, I>>,
}
impl<'a, I> TextLayout<'a, I>
where
    I: Iterator<Item = &'a str>,
{
    /// Creates a new `TextLayout` that wraps the given input at
    /// `width` characters per line (width is non-zero by construction).
    fn new<V>(input: V, width: NonZeroUsize) -> Self
    where
        V: IntoIterator<IntoIter = I>,
    {
        // `Tokenizer::new` accepts any `IntoIterator`, so the input
        // can be handed over as-is.
        let tokenizer = Tokenizer::new(input).peekable();
        Self {
            width: width.get(),
            current: 0,
            push_back: PushBack::None,
            tokenizer,
        }
    }
}
impl<'a, I> Iterator for TextLayout<'a, I>
where
    I: Iterator<Item = &'a str> + Clone,
{
    type Item = LayoutToken<'a>;

    /// Produces the next layout instruction: a run of text, a single
    /// collapsed space, or a newline. Leading whitespace on a line is
    /// trimmed; words wider than the line are hard-split.
    fn next(&mut self) -> Option<Self::Item> {
        let token = match self.push_back.take() {
            // If there's no item on the push back, read a TextToken
            // from the tokenizer
            PushBack::None => self.tokenizer.next()?,
            // If there's a text token on the push back, use that
            PushBack::Token(token) => token,
            // If we are expecting to output a newline, do that and
            // prepare the next text token
            PushBack::NewlineThen(token) => {
                // we first update the push back to be a token for next
                // time `next()` is called
                self.push_back = PushBack::Token(token);
                // reset the line index
                self.current = 0;
                // and return a Newline
                return Some(LayoutToken::Newline);
            }
            // If we are expecting a layout token just return that
            PushBack::LayoutToken(token) => return Some(token),
        };
        // are we allowed to push the current token on to the next line?
        let allow_next_line = match token {
            TextToken::DontBreak(_) => false,
            _ => true,
        };
        match token {
            // Both DontBreak and Text mean a run of text; DontBreak
            // just means the word may not be pushed on to the next
            // line — hence the `allow_next_line` variable above
            TextToken::DontBreak(text) | TextToken::Text(text) => {
                // Multiple text tokens in a row are likely all one
                // word (a word split across input slices), so we add
                // up all the sizes to make the correct wrap decision.
                //
                // We clone the tokenizer here so we can advance that
                // iterator separately from our own. As long as the
                // underlying iterator doesn't allocate to clone, no
                // allocation will occur — but that is in the user's
                // hands.
                let mut find_end_iter = self.tokenizer.clone();
                // The length of just this part of the text
                let part_length = text.chars().count();
                // calculate the full length of the current word by
                // summing the following contiguous text tokens
                let mut length = part_length;
                for item in find_end_iter {
                    match item {
                        TextToken::DontBreak(text) |
                        TextToken::Text(text) => {
                            length += text.chars().count()
                        },
                        // a whitespace token ends the word
                        _ => break
                    }
                }
                // We have a part which may be smaller than the whole
                // word; the newline decision is based on the whole
                // word, though `allow_next_line` changes that.
                let at_start_of_line = self.current == 0;
                let end_line_index = self.current + length;
                let word_will_overflow = end_line_index > self.width;
                let part_will_overflow =
                    self.current + part_length > self.width;
                match (
                    at_start_of_line,
                    word_will_overflow,
                    allow_next_line,
                    part_will_overflow,
                ) {
                    (false, true, false, false) | (true, true, _, false) => {
                        // The whole word overflows but this part still
                        // fits (and either may not move to the next
                        // line or is already at line start): emit the
                        // part now, and force the next token not to be
                        // pushed onto the next line, since it is only
                        // part of the same word.
                        self.push_back = match self.tokenizer.next() {
                            Some(TextToken::Text(text))
                            | Some(TextToken::DontBreak(text)) => {
                                PushBack::Token(TextToken::DontBreak(text))
                            }
                            Some(TextToken::Whitespace(text)) => {
                                PushBack::Token(TextToken::Whitespace(text))
                            }
                            None => PushBack::None,
                        };
                        // NOTE(review): `len()` is a byte count while
                        // the overflow math above counts chars — could
                        // desync on multi-byte input; confirm intent.
                        self.current += text.len();
                        Some(LayoutToken::Text(text))
                    }
                    (false, true, false, true) | (true, true, _, true) => {
                        // This part itself overflows the line; emit as
                        // many characters as still fit and push the
                        // remainder back behind a newline.
                        //
                        // We know we will overflow, so the remaining
                        // size of the line is exactly what fits.
                        let valid_char_count = self.width - self.current;
                        let (left, right) =
                            split_at_char_index(text, valid_char_count);
                        if right.len() > 0 {
                            self.push_back =
                                PushBack::NewlineThen(TextToken::Text(right));
                        }
                        self.current += valid_char_count;
                        Some(LayoutToken::Text(left))
                    }
                    (_, false, _, _) => {
                        // The whole word fits on the current line, so
                        // just return the text.
                        // NOTE(review): byte count vs char count — see
                        // note above; confirm for multi-byte input.
                        self.current += text.len();
                        Some(LayoutToken::Text(text))
                    }
                    (false, true, true, _) => {
                        // We aren't at the start of the line, but the
                        // word is allowed to move to the next line and
                        // would overflow here: push the token back and
                        // emit a newline.
                        //
                        // Doing this should trigger the
                        // at-start-of-line handlers next time round.
                        self.push_back = PushBack::Token(TextToken::Text(text));
                        self.current = 0;
                        Some(LayoutToken::Newline)
                    }
                }
            }
            TextToken::Whitespace(_) => {
                // Getting this far means push_back is None; collapse
                // the whole run by consuming all following whitespace
                // tokens.
                while let Some(TextToken::Whitespace(_)) =
                    self.tokenizer.peek()
                {
                    // fine returning early if we reach None while in
                    // whitespace — trailing whitespace is trimmed
                    self.tokenizer.next()?;
                }
                let at_start_of_line = self.current == 0;
                // only increase current if not at start of line, so that
                // spaces are trimmed at start of line, but simulated otherwise
                self.current += if at_start_of_line { 0 } else { 1 };
                // if we get None next then we can short-circuit return
                // None, as it has the same result as trimming
                // whitespace at the end
                let next_token = self.next()?;
                match (at_start_of_line, next_token) {
                    // If we have a newline as our next token we can
                    // just return that (the pending space is dropped).
                    (_, LayoutToken::Newline) => Some(LayoutToken::Newline),
                    // if we are at start of line then we can do the
                    // same with text (leading space trimmed)
                    (true, LayoutToken::Text(text)) => {
                        Some(LayoutToken::Text(text))
                    }
                    // if we aren't at start of line we should output
                    // our space so it's no longer simulated, but we
                    // need to make sure next time round we emit the
                    // text we got, so put that on the push back
                    (false, LayoutToken::Text(text)) => {
                        self.push_back =
                            PushBack::LayoutToken(LayoutToken::Text(text));
                        Some(LayoutToken::Space)
                    }
                    // whitespace runs were fully consumed above and
                    // the recursive call can therefore never yield a
                    // Space first
                    (_, LayoutToken::Space) => unreachable!()
                }
            }
        }
    }
}
/// Lays out the strings yielded by `iterator` at the given line
/// `width` and prints the result to stdout.
///
/// # Panics
///
/// Panics if `width` is zero.
fn layout_and_print<'a, I>(iterator: I, width: usize)
where
    I: IntoIterator<Item = &'a str>,
    I::IntoIter: Clone,
{
    // Surface a clear message instead of a bare unwrap panic.
    let width = NonZeroUsize::new(width).expect("width must be non-zero");
    let text_layout = TextLayout::new(iterator, width);
    for item in text_layout {
        match item {
            LayoutToken::Text(text) => print!("{}", text),
            LayoutToken::Space => print!(" "),
            LayoutToken::Newline => println!(),
        }
    }
}
fn main() {
println!("first");
layout_and_print(["x---xy---yz--z"], 5);
println!("\n\nsecond");
layout_and_print(["x---xy-", "--yz", "--z"], 5);
println!("\n\nthird");
layout_and_print(["12345 ", " 67890", "12345", "1234567890"], 5);
println!("\n\ndone");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment