-
-
Save shepmaster/d432b7bcffc96055ba523ee0abdac404 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![feature(min_const_generics)] | |
#![allow(dead_code)] | |
#![deny(rust_2018_idioms)] | |
use jetscii::bytes; | |
use snafu::{ResultExt, Snafu}; | |
use std::{ | |
io::{self, Read}, | |
ops::Deref, | |
str, | |
}; | |
#[derive(Debug)] | |
struct Exhausted(bool); | |
#[derive(Debug, Copy, Clone, Default, PartialEq, Eq, Hash)] | |
struct StringRingStats { | |
/// Number of bytes of leading invalid data | |
n_invalid: usize, | |
/// Number of bytes of valid UTF-8 data | |
n_utf8: usize, | |
/// Number of bytes of read but not yet valid UTF-8 data | |
n_raw: usize, | |
} | |
#[derive(Debug)] | |
struct StringRing<const N: usize> { | |
buffer: Box<[u8; N]>, | |
stats: StringRingStats, | |
} | |
impl<const N: usize> StringRing<N> { | |
fn new() -> Self { | |
Self { | |
buffer: Box::new([0; N]), | |
stats: Default::default(), | |
} | |
} | |
fn stats(&self) -> StringRingStats { | |
self.stats | |
} | |
fn extend(&mut self, mut rdr: impl Read) -> io::Result<Exhausted> { | |
let Self { buffer, stats } = self; | |
let StringRingStats { | |
n_invalid, | |
n_utf8, | |
n_raw, | |
} = stats; | |
let free = &mut buffer[*n_invalid..][*n_utf8..][*n_raw..]; | |
assert_ne!(free.len(), 0, "todo: handle full buffer"); | |
let n_new_raw_bytes = rdr.read(free)?; | |
if n_new_raw_bytes == 0 { | |
return Ok(Exhausted(true)); | |
} | |
*n_raw += n_new_raw_bytes; | |
let raw = &buffer[*n_invalid..][*n_utf8..][..*n_raw]; | |
assert_ne!(raw.len(), 0, "todo: handle empty raw"); | |
let n_new_utf8_bytes = match str::from_utf8(raw) { | |
Ok(s) => s.len(), | |
Err(e) => match e.error_len() { | |
None => e.valid_up_to(), | |
Some(_) => todo!("Report invalid UTF-8"), | |
}, | |
}; | |
*n_raw -= n_new_utf8_bytes; | |
*n_utf8 += n_new_utf8_bytes; | |
Ok(Exhausted(false)) | |
} | |
fn while_tag_name(&self) -> Streaming<MatchTicket> { | |
let Self { buffer, stats } = self; | |
let StringRingStats { n_invalid, n_utf8, .. } = *stats; | |
let utf8 = &buffer[n_invalid..][..n_utf8]; | |
// TODO: check valid tag name chars | |
let loc = utf8.iter().position(|&b| !(b as char).is_ascii_alphabetic()); | |
let len = match loc { | |
Some(i) => Streaming::Complete(i), | |
None => Streaming::Partial(utf8.len()), | |
}; | |
len.map(MatchTicket) | |
} | |
fn while_text(&self) -> Streaming<MatchTicket> { | |
let Self { buffer, stats } = self; | |
let StringRingStats { n_invalid, n_utf8, .. } = *stats; | |
let utf8 = &buffer[n_invalid..][..n_utf8]; | |
let ws = bytes!('<', '&'); | |
let loc = ws.find(utf8); | |
let len = match loc { | |
Some(i) => Streaming::Complete(i), | |
None => Streaming::Partial(utf8.len()), | |
}; | |
len.map(MatchTicket) | |
} | |
fn start_matches(&self, needle: &str) -> MatchStatus { | |
let Self { buffer, stats } = self; | |
let StringRingStats { n_invalid, n_utf8, .. } = *stats; | |
if n_utf8 < needle.len() { | |
return MatchStatus::IncompleteInput; | |
} | |
let utf8 = &buffer[n_invalid..][..n_utf8]; | |
if utf8.starts_with(needle.as_bytes()) { | |
MatchStatus::Success(MatchTicket(needle.len())) | |
} else { | |
MatchStatus::Failure | |
} | |
} | |
fn exchange_ticket(&mut self, ticket: MatchTicket) -> &'_ str { | |
let Self { buffer, stats } = self; | |
let StringRingStats { n_invalid, n_utf8, .. } = stats; | |
let utf8 = &buffer[*n_invalid..][..*n_utf8]; | |
let MatchTicket(len) = ticket; | |
// SAFETY: Haha. I don't care yet. | |
let s = unsafe { | |
let b = utf8.get_unchecked(..len); | |
str::from_utf8_unchecked(b) | |
}; | |
*n_invalid += len; | |
*n_utf8 -= len; | |
s | |
} | |
} | |
impl<const N: usize> Deref for StringRing<N> { | |
type Target = str; | |
fn deref(&self) -> &str { | |
let Self { buffer, stats } = self; | |
let StringRingStats { n_invalid, n_utf8, .. } = *stats; | |
let utf8 = &buffer[n_invalid..][..n_utf8]; | |
// SAFETY: Haha. I don't care yet. | |
unsafe { str::from_utf8_unchecked(utf8) } | |
} | |
} | |
#[derive(Debug)] | |
enum MatchStatus { | |
Success(MatchTicket), | |
Failure, | |
IncompleteInput, | |
} | |
#[derive(Debug)] | |
struct MatchTicket(usize); | |
#[derive(Debug)] | |
struct Parser<R> { | |
mediator: Mediator<R>, | |
state: State, | |
} | |
impl<R> Parser<R> | |
where | |
R: Read, | |
{ | |
fn new(input: R) -> Self { | |
Self { | |
mediator: Mediator::new(input), | |
state: State::Beginning, | |
} | |
} | |
fn next(&mut self) -> Option<Result<Token<'_>>> { | |
use State::*; | |
let Self { mediator, state } = self; | |
// TODO: review `continue` and see if there's a better way to avoid cycling | |
loop { | |
match dbg!(*state) { | |
Beginning => match mediator.starts_with("<?xml?>") { | |
Nre::Matched(s) => { | |
*state = FoundPreamble; | |
return Some(Ok(Token::Preamble(s))); | |
} | |
Nre::Error(e) => return Some(Err(e)), | |
Nre::NotMatched => { | |
todo!("Handle failure to match"); | |
} | |
Nre::Exhausted => return None, | |
}, | |
FoundPreamble => { | |
// TODO: chew whitespace | |
match mediator.starts_with("<") { | |
Nre::Matched(_) => { | |
*state = TagStart; | |
continue; | |
} | |
Nre::Error(e) => return Some(Err(e)), | |
Nre::NotMatched => { | |
todo!("Handle failure to match"); | |
} | |
Nre::Exhausted => return None, | |
} | |
} | |
TagStart => { | |
match mediator.starts_with("/") { | |
Nre::Matched(_) => { | |
*state = InsideCloseTag; | |
return Some(Ok(Token::CloseTagStart)); | |
} | |
Nre::Error(e) => return Some(Err(e)), | |
Nre::NotMatched => { /* Fall through */ } | |
Nre::Exhausted => return None, | |
} | |
*state = InsideOpenTag; | |
return Some(Ok(Token::OpenTagStart)); | |
} | |
InsideOpenTag => { | |
let s = mediator.stream_while_tag_name(); | |
dbg!(s); | |
if s.is_complete() { | |
*state = ReadOpenTagName; | |
} | |
return Some(Ok(Token::TagName(s))); | |
} | |
ReadOpenTagName => { | |
// TODO: chew whitespace | |
match mediator.starts_with(">") { | |
Nre::Matched(_) => { | |
*state = InsideElement; | |
return Some(Ok(Token::OpenTagEnd)); | |
} | |
Nre::Error(e) => return Some(Err(e)), | |
Nre::NotMatched => { | |
todo!("Handle failure to match"); | |
} | |
Nre::Exhausted => return None, | |
} | |
} | |
InsideCloseTag => { | |
let s = mediator.stream_while_tag_name(); | |
if s.is_complete() { | |
*state = ReadCloseTagName; | |
} | |
return Some(Ok(Token::TagName(s))); | |
} | |
ReadCloseTagName => { | |
// TODO: chew whitespace | |
match mediator.starts_with(">") { | |
Nre::Matched(_) => { | |
*state = InsideElement; | |
return Some(Ok(Token::CloseTagEnd)); | |
} | |
Nre::Error(e) => return Some(Err(e)), | |
Nre::NotMatched => { | |
todo!("Handle failure to match"); | |
} | |
Nre::Exhausted => return None, | |
} | |
} | |
InsideElement => { | |
match mediator.starts_with("<") { | |
Nre::Matched(_) => { | |
*state = TagStart; | |
continue; | |
} | |
Nre::Error(e) => return Some(Err(e)), | |
Nre::NotMatched => { /* Fall through */ } | |
Nre::Exhausted => return None, | |
} | |
*state = InsideText; | |
continue; | |
} | |
InsideText => { | |
let s = mediator.stream_while_text(); | |
if s.is_complete() { | |
*state = InsideElement; | |
} | |
return Some(Ok(Token::Text(s))); | |
} | |
} | |
} | |
} | |
} | |
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] | |
enum State { | |
Beginning, | |
FoundPreamble, | |
TagStart, | |
InsideOpenTag, | |
ReadOpenTagName, | |
InsideCloseTag, | |
ReadCloseTagName, | |
InsideElement, | |
InsideText, | |
} | |
#[derive(Debug)] | |
struct Mediator<R> { | |
input: R, | |
buffer: StringRing<10>, | |
} | |
impl<R> Mediator<R> | |
where | |
R: io::Read, | |
{ | |
fn new(input: R) -> Self { | |
Self { | |
input, | |
buffer: StringRing::new(), | |
} | |
} | |
fn as_str(&self) -> &str { | |
&self.buffer | |
} | |
fn starts_with(&mut self, ss: &str) -> Nre<&'_ str, Error> { | |
loop { | |
let s = self.buffer.stats(); | |
match self.buffer.start_matches(ss) { | |
MatchStatus::Success(ticket) => { | |
let s = self.buffer.exchange_ticket(ticket); | |
return Nre::Matched(s); | |
} | |
MatchStatus::Failure => { | |
return Nre::NotMatched; | |
} | |
MatchStatus::IncompleteInput => match self.buffer.extend(&mut self.input).context(UnableToReadData) { | |
Ok(Exhausted(true)) => return Nre::Exhausted, | |
Ok(Exhausted(false)) => {} | |
Err(e) => return Nre::Error(e), | |
}, | |
} | |
assert_ne!(s, self.buffer.stats(), "Stats did not change ({:?})", s); | |
} | |
} | |
fn stream_while_tag_name(&mut self) -> Streaming<&'_ str> { | |
self.buffer | |
.while_tag_name() | |
.map(move |ticket| self.buffer.exchange_ticket(ticket)) | |
} | |
fn stream_while_text(&mut self) -> Streaming<&'_ str> { | |
self.buffer | |
.while_text() | |
.map(move |ticket| self.buffer.exchange_ticket(ticket)) | |
} | |
} | |
// TODO: unify with MatchStatus? | |
#[derive(Debug)] | |
enum Nre<T, E> { | |
Exhausted, | |
Matched(T), | |
NotMatched, | |
Error(E), | |
} | |
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] | |
enum Streaming<T> { | |
Partial(T), | |
Complete(T), | |
} | |
impl<T> Streaming<T> { | |
fn is_partial(&self) -> bool { | |
matches!(self, Streaming::Partial(_)) | |
} | |
fn is_complete(&self) -> bool { | |
matches!(self, Streaming::Complete(_)) | |
} | |
fn as_ref(&self) -> Streaming<&T> { | |
use Streaming::*; | |
match self { | |
Partial(v) => Partial(v), | |
Complete(v) => Complete(v), | |
} | |
} | |
fn map<U>(self, f: impl FnOnce(T) -> U) -> Streaming<U> { | |
use Streaming::*; | |
match self { | |
Partial(v) => Partial(f(v)), | |
Complete(v) => Complete(f(v)), | |
} | |
} | |
fn unify(self) -> T { | |
use Streaming::*; | |
match self { | |
Partial(v) => v, | |
Complete(v) => v, | |
} | |
} | |
} | |
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] | |
enum Token<'a> { | |
Preamble(&'a str), | |
OpenTagStart, | |
OpenTagEnd, | |
CloseTagStart, | |
CloseTagEnd, | |
TagName(Streaming<&'a str>), | |
Text(Streaming<&'a str>), | |
} | |
impl<'a> Token<'a> { | |
fn to_owned(self) -> OwnedToken { | |
use OwnedToken as OT; | |
use Token as T; | |
match self { | |
T::Preamble(s) => OT::Preamble(s.to_owned()), | |
T::OpenTagStart => OT::OpenTagStart, | |
T::OpenTagEnd => OT::OpenTagEnd, | |
T::CloseTagStart => OT::CloseTagStart, | |
T::CloseTagEnd => OT::CloseTagEnd, | |
T::TagName(s) => OT::TagName(s.map(|s| s.to_owned())), | |
T::Text(s) => OT::Text(s.map(|s| s.to_owned())), | |
} | |
} | |
} | |
#[derive(Debug, Clone, PartialEq, Eq, Hash)] | |
enum OwnedToken { | |
Preamble(String), | |
OpenTagStart, | |
OpenTagEnd, | |
CloseTagStart, | |
CloseTagEnd, | |
TagName(Streaming<String>), | |
Text(Streaming<String>), | |
} | |
#[derive(Debug, Snafu)] | |
enum Error { | |
UnableToReadData { source: io::Error }, | |
InvalidUtf8, | |
} | |
type Result<T, E = Error> = std::result::Result<T, E>; | |
#[cfg(test)] | |
mod tests { | |
use super::*; | |
#[test] | |
fn tokenize_basic_document() { | |
use OwnedToken::*; | |
use Streaming::*; | |
let input = b"<?xml?><alphabetagamma>hello</alphabetagamma>"; | |
let mut p = Parser::new(&input[..]); | |
let mut x = vec![]; | |
while let Some(v) = p.next() { | |
let a = v.unwrap().to_owned(); | |
x.push(a); | |
} | |
assert_eq!( | |
x, | |
[ | |
Preamble("<?xml?>".into()), | |
OpenTagStart, | |
TagName(Complete("alphabetagamma".into())), | |
OpenTagEnd, | |
Text(Complete("hello".into())), | |
CloseTagStart, | |
TagName(Complete("alphabetagamma".into())), | |
CloseTagEnd, | |
] | |
); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment