Last active
August 1, 2018 04:17
-
-
Save snoyberg/95f95e17b6015352e68432701eaf1b90 to your computer and use it in GitHub Desktop.
Streaming UTF-8 decoding using valid_up_to
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
valid_up_to | |
valid_up_to.exe |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
שלוםשלוםשלוםשלוםשלום |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Build the streaming UTF-8 decoder and feed it the Hebrew sample text on stdin.
# -e: stop at the first failing command, -u: unset variables are errors,
# -x: echo every command before running it.
set -eux
rustc valid_up_to.rs
./valid_up_to < hebrew.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::io::{self, Read}; | |
use std::str; | |
/// Errors produced while decoding a byte stream as UTF-8.
#[derive(Debug)]
pub enum Utf8ReadError {
    /// The underlying reader reported an I/O error.
    IO(io::Error),
    /// The stream contained bytes that are not valid UTF-8.
    Utf8(std::str::Utf8Error),
    /// The stream ended while undecoded bytes were still pending
    /// (displayed as "Incomplete final codepoint").
    IncompleteFinal,
}
impl std::fmt::Display for Utf8ReadError { | |
fn fmt(&self, fmt: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { | |
match self { | |
Utf8ReadError::IO(err) => err.fmt(fmt), | |
Utf8ReadError::Utf8(err) => err.fmt(fmt), | |
Utf8ReadError::IncompleteFinal => write!(fmt, "Incomplete final codepoint"), | |
} | |
} | |
} | |
impl std::error::Error for Utf8ReadError {} | |
/// Streaming UTF-8 decoder state: wraps a reader and a small fixed buffer and
/// is driven through its `Iterator` implementation, which yields one `char`
/// at a time.
pub struct Utf8Read<R> {
    /// Scratch buffer the reader is read into.
    buf: [u8; 9], // intentionally really small for testing
    /// Number of bytes stored directly after the validated prefix that could
    /// not be decoded yet; they are carried over to the next refill.
    used: usize,
    /// Length of the prefix of `buf` that has been validated as UTF-8.
    valid: usize,
    /// The byte source being decoded.
    read: R,
    /// Set once the reader has reported end of input; the iterator then stops.
    done: bool,
    /// Byte offset inside the validated prefix of the next character to
    /// yield, or `None` before the first character has been produced.
    next_char: Option<usize>,
}
pub fn utf8_read<R: Read>(read: R) -> Utf8Read<R> { | |
Utf8Read { | |
buf: [0; 9], | |
used: 0, | |
valid: 0, | |
read, | |
done: false, | |
next_char: None, | |
} | |
} | |
impl<R> Utf8Read<R> { | |
fn pop_char(&mut self, next: usize) -> Option<Result<char, Utf8ReadError>> { | |
let str = unsafe { str::from_utf8_unchecked(&self.buf[next..]) }; | |
let c = | |
match str.chars().next() { | |
None => panic!("pop_char should never be out of characters"), | |
Some(c) => c, | |
}; | |
self.next_char = Some(next + c.len_utf8()); | |
Some(Ok(c)) | |
} | |
} | |
impl<R: Read> Iterator for Utf8Read<R> { | |
type Item = Result<char, Utf8ReadError>; | |
fn next(&mut self) -> Option<Self::Item> { | |
if self.done { | |
return None; | |
} | |
if let Some(next) = self.next_char { | |
if next < self.valid { | |
return self.pop_char(next); | |
} | |
} | |
if self.used > 0 { | |
for i in 0..self.used { | |
let b = self.buf[i + self.valid]; | |
self.buf[i] = b; | |
} | |
} | |
let bytes = match self.read.read(&mut (self.buf[self.used..])) { | |
Err(e) => { | |
return Some(Err(Utf8ReadError::IO(e))); | |
} | |
Ok(bytes) => bytes, | |
}; | |
if bytes == 0 { | |
self.done = true; | |
if self.used == 0 { | |
return None; | |
} else { | |
return Some(Err(Utf8ReadError::IncompleteFinal)); | |
} | |
} | |
let total_size = self.used + bytes; | |
match str::from_utf8(&self.buf[..total_size]) { | |
Ok(_str) => { | |
self.valid = total_size; | |
} | |
Err(err) => { | |
self.valid = err.valid_up_to(); | |
if self.valid == 0 { | |
return Some(Err(Utf8ReadError::Utf8(err))); | |
} | |
} | |
}; | |
self.used = total_size - self.valid; | |
self.pop_char(0) | |
} | |
} | |
fn main() -> Result<(), Utf8ReadError> { | |
let stdin = std::io::stdin(); | |
let stdin_lock = stdin.lock(); | |
for rstr in utf8_read(stdin_lock) { | |
let str = rstr?; | |
println!("Got a chunk: {}", str); | |
} | |
Ok(()) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment