Skip to content

Instantly share code, notes, and snippets.

@snoyberg
Last active August 1, 2018 04:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save snoyberg/95f95e17b6015352e68432701eaf1b90 to your computer and use it in GitHub Desktop.
Save snoyberg/95f95e17b6015352e68432701eaf1b90 to your computer and use it in GitHub Desktop.
Streaming UTF-8 decoding using valid_up_to
valid_up_to
valid_up_to.exe
שלוםשלוםשלוםשלוםשלום
#!/usr/bin/env bash
# Compile valid_up_to.rs with rustc, then run the binary with hebrew.txt on stdin.
# Stop on the first failing command or unset variable, and echo each command as it runs.
set -o errexit -o nounset -o xtrace
rustc valid_up_to.rs
./valid_up_to < hebrew.txt
use std::io::{self, Read};
use std::str;
/// Failures that can be produced while decoding a byte stream as UTF-8.
#[derive(Debug)]
pub enum Utf8ReadError {
    /// The underlying reader returned an I/O error.
    IO(io::Error),
    /// The bytes read from the stream were rejected by the UTF-8 validator.
    Utf8(str::Utf8Error),
    /// The stream ended while undecoded bytes were still pending.
    IncompleteFinal,
}
/// User-facing text: wrapped errors print exactly what the inner error prints.
impl std::fmt::Display for Utf8ReadError {
    fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
        match *self {
            Utf8ReadError::IO(ref cause) => std::fmt::Display::fmt(cause, fmt),
            Utf8ReadError::Utf8(ref cause) => std::fmt::Display::fmt(cause, fmt),
            Utf8ReadError::IncompleteFinal => fmt.write_str("Incomplete final codepoint"),
        }
    }
}
impl std::error::Error for Utf8ReadError {}
/// Iterator state for decoding the bytes of a reader into `char`s through a small fixed buffer.
pub struct Utf8Read<R> {
    /// Staging area for bytes pulled from `read`.
    buf: [u8; 9], // intentionally really small for testing
    /// Count of buffered bytes that follow the validated prefix and are not decoded yet.
    used: usize,
    /// Length of the leading part of `buf` that has passed UTF-8 validation.
    valid: usize,
    /// The byte source being decoded.
    read: R,
    /// Once set, `next` returns `None`.
    done: bool,
    /// Byte offset in `buf` of the next character to hand out, when one has been recorded.
    next_char: Option<usize>,
}
/// Wraps `read` in a [`Utf8Read`] that starts with a zeroed buffer and no decoding progress.
pub fn utf8_read<R: Read>(read: R) -> Utf8Read<R> {
    let buf = [0u8; 9];
    Utf8Read {
        read,
        buf,
        used: 0,
        valid: 0,
        next_char: None,
        done: false,
    }
}
impl<R> Utf8Read<R> {
    /// Decodes the character that starts at byte offset `next` of the validated prefix and
    /// records where the following character begins.
    ///
    /// `next` must be a character boundary strictly below `self.valid`. The result is always
    /// `Some(Ok(_))`; the wrapped type matches the iterator's item type so callers can
    /// return it directly.
    ///
    /// # Panics
    ///
    /// Panics if there is no validated character at `next` (that is, `next >= self.valid`).
    fn pop_char(&mut self, next: usize) -> Option<Result<char, Utf8ReadError>> {
        // SAFETY: only `buf[..valid]` has passed `str::from_utf8`, so the unchecked view must
        // stop at `valid`; bytes beyond it may be an incomplete or invalid sequence. `next` is
        // either 0 or the end of a previously decoded character, hence a character boundary.
        let text = unsafe { str::from_utf8_unchecked(&self.buf[next..self.valid]) };
        let c = text
            .chars()
            .next()
            .expect("pop_char should never be out of characters");
        self.next_char = Some(next + c.len_utf8());
        Some(Ok(c))
    }
}
impl<R: Read> Iterator for Utf8Read<R> {
type Item = Result<char, Utf8ReadError>;
fn next(&mut self) -> Option<Self::Item> {
if self.done {
return None;
}
if let Some(next) = self.next_char {
if next < self.valid {
return self.pop_char(next);
}
}
if self.used > 0 {
for i in 0..self.used {
let b = self.buf[i + self.valid];
self.buf[i] = b;
}
}
let bytes = match self.read.read(&mut (self.buf[self.used..])) {
Err(e) => {
return Some(Err(Utf8ReadError::IO(e)));
}
Ok(bytes) => bytes,
};
if bytes == 0 {
self.done = true;
if self.used == 0 {
return None;
} else {
return Some(Err(Utf8ReadError::IncompleteFinal));
}
}
let total_size = self.used + bytes;
match str::from_utf8(&self.buf[..total_size]) {
Ok(_str) => {
self.valid = total_size;
}
Err(err) => {
self.valid = err.valid_up_to();
if self.valid == 0 {
return Some(Err(Utf8ReadError::Utf8(err)));
}
}
};
self.used = total_size - self.valid;
self.pop_char(0)
}
}
/// Decodes standard input as UTF-8 and prints one line per character, stopping at the
/// first error and returning it.
fn main() -> Result<(), Utf8ReadError> {
    let input = std::io::stdin();
    let mut decoded = utf8_read(input.lock());
    decoded.try_for_each(|item| item.map(|ch| println!("Got a chunk: {}", ch)))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment