Skip to content

Instantly share code, notes, and snippets.

Created March 7, 2017 10:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/0587b4484ec9a15f5c5ce6908b3807c1 to your computer and use it in GitHub Desktop.
Save anonymous/0587b4484ec9a15f5c5ce6908b3807c1 to your computer and use it in GitHub Desktop.
use std::cmp;
use std::str;
/// State of an incremental UTF-8 decoder that is carried from one input
/// chunk to the next.
///
/// The only state kept between chunks is the beginning of a byte sequence
/// that was cut off by the end of a chunk. `Decoder::default()` is equivalent
/// to `Decoder::new()`: no pending bytes.
#[derive(Debug, Default)]
pub struct Decoder {
    /// Storage for the bytes of a sequence that was cut off at the end of a
    /// chunk. Only the first `incomplete_len` bytes are meaningful.
    incomplete: [u8; 4],
    /// Number of bytes of `incomplete` currently in use; 0 means that no
    /// partial sequence is pending.
    incomplete_len: u8,
}
impl Decoder {
    /// Creates a decoder with no pending (incomplete) bytes.
    pub fn new() -> Self {
        Self {
            incomplete: [0; 4],
            incomplete_len: 0,
        }
    }

    /// Returns an iterator that decodes `input_chunk`, treating it as a
    /// chunk that may be followed by more input (`last` is `false`).
    pub fn next_chunk<'a>(&'a mut self, input_chunk: &'a [u8]) -> DecoderIter<'a> {
        self.chunk_iter(input_chunk, false)
    }

    /// Returns an iterator that decodes `input_chunk`, treating it as the
    /// final chunk of the input (`last` is `true`).
    pub fn last_chunk<'a>(&'a mut self, input_chunk: &'a [u8]) -> DecoderIter<'a> {
        self.chunk_iter(input_chunk, true)
    }

    /// Builds the iterator shared by `next_chunk` and `last_chunk`; the two
    /// differ only in the value of the `last` flag.
    fn chunk_iter<'a>(&'a mut self, input: &'a [u8], last: bool) -> DecoderIter<'a> {
        DecoderIter {
            decoder: self,
            input,
            error_len: 0,
            last,
        }
    }
}
/// Iterator over the decoded pieces of one input chunk.
///
/// Created by `Decoder::next_chunk` and `Decoder::last_chunk`. Each item is
/// either `Ok` with a string slice or `Err` with a slice of bytes that could
/// not be decoded.
pub struct DecoderIter<'a> {
// Decoder whose `incomplete` buffer is read and updated while iterating.
decoder: &'a mut Decoder,
// Bytes of the current chunk that have not been consumed yet.
input: &'a [u8],
// When non-zero, the number of bytes at the start of `input` that form an
// invalid sequence to be reported by the next call to `next`.
error_len: u8,
// `true` when iterating over the chunk passed to `Decoder::last_chunk`.
last: bool,
}
impl<'a> Iterator for DecoderIter<'a> {
    type Item = Result<&'a str, &'a [u8]>;

    /// Produces the next decoded piece of the chunk.
    ///
    /// The work is dispatched in priority order: an error recorded by an
    /// earlier step is reported first, then a sequence left incomplete by a
    /// previous chunk is completed, and only then is fresh input decoded.
    fn next(&mut self) -> Option<Self::Item> {
        if self.error_len != 0 {
            return self.error();
        }
        if self.decoder.incomplete_len != 0 {
            return self.try_complete();
        }
        self.decode()
    }
}
// Private decoding steps used by the `Iterator` implementation.
impl<'a> DecoderIter<'a> {
/// Reports the invalid sequence whose length was recorded in `error_len`.
///
/// Splits the first `error_len` bytes off `input`, clears `error_len` and
/// returns those bytes as `Err`.
fn error(&mut self) -> Option<Result<&'a str, &'a [u8]>> {
let (invalid, rest) = self.input.split_at(self.error_len as usize);
self.input = rest;
self.error_len = 0;
Some(Err(invalid))
}
/// Decodes from the start of `input`, with no pending incomplete sequence.
///
/// Returns `None` when `input` is empty. Otherwise returns either the
/// longest valid prefix as `Ok`, or, if `input` starts with bytes that do
/// not decode, those bytes as `Err` (or stashes them in the decoder when
/// they are only the start of a sequence and more input may follow).
fn decode(&mut self) -> Option<Result<&'a str, &'a [u8]>> {
if self.input.is_empty() {
return None
}
let error = match str::from_utf8(self.input) {
Ok(valid) => {
// The whole remaining input is valid: hand it out in one piece.
self.input = b"";
return Some(Ok(valid))
}
Err(error) => error,
};
let valid_up_to = error.valid_up_to();
// `None` means the input ended in the middle of a sequence.
let resume_from = utf8error_resume_from(&error, self.input);
if valid_up_to > 0 {
// There is a valid prefix, so we’ll return that.
let (valid, after_valid) = self.input.split_at(valid_up_to);
// SAFETY: `valid` is the first `valid_up_to` bytes of the slice that
// `str::from_utf8` just checked, and `Utf8Error::valid_up_to` is the
// index up to which that slice was verified to be valid UTF-8.
let valid = unsafe {
str::from_utf8_unchecked(valid)
};
// Save info about the error for the next iteration.
match resume_from {
Some(resume_from) => {
let error_len = resume_from.checked_sub(valid_up_to).unwrap();
assert!(error_len <= 4);
self.error_len = error_len as u8;
self.input = after_valid;
}
// The tail is stashed even when `last` is set; the next call then goes
// through `try_complete`, which reports it as an error for the last chunk.
None => self.save_incomplete(after_valid)
}
Some(Ok(valid))
} else {
// No valid prefix: the input starts with the problematic bytes.
match resume_from {
Some(resume_from) => {
let (invalid, rest) = self.input.split_at(resume_from);
self.input = rest;
Some(Err(invalid))
}
None if self.last => {
// Nothing more will arrive, so the truncated sequence is an error.
let incomplete = self.input;
self.input = b"";
Some(Err(incomplete))
}
None => {
self.save_incomplete(self.input);
// NOTE(review): this yields an empty string, whereas `try_complete`
// returns `None` in the analogous "still incomplete" case.
Some(Ok(""))
}
}
}
}
/// Stashes `incomplete_input` in the decoder for a later `try_complete` and
/// marks the current input as fully consumed.
///
/// The slice indexing below panics if `incomplete_input` is longer than the
/// 4-byte `incomplete` buffer.
fn save_incomplete(&mut self, incomplete_input: &'a [u8]) {
let len = incomplete_input.len();
self.decoder.incomplete[..len].copy_from_slice(incomplete_input);
self.decoder.incomplete_len = len as u8;
self.input = b"";
}
/// Tries to finish the sequence stashed in `decoder.incomplete` using bytes
/// from the start of `input`.
///
/// NOTE(review): the `&str` / `&[u8]` values returned from this function are
/// slices of `self.decoder.incomplete` obtained through `&mut self`, while
/// the return type promises the longer lifetime `'a`. The borrow checker is
/// expected to reject this, and the buffer can be overwritten by later calls
/// while an earlier item is still held — confirm and revisit the API.
fn try_complete(&mut self) -> Option<Result<&'a str, &'a [u8]>> {
let incomplete = &mut self.decoder.incomplete;
let incomplete_len = self.decoder.incomplete_len as usize;
if self.input.is_empty() {
if self.last {
// No more input will come: report the stashed bytes as invalid.
self.decoder.incomplete_len = 0;
return Some(Err(&incomplete[..incomplete_len]))
} else {
// Keep the stashed bytes and wait for the next chunk.
return None
}
}
let bytes_from_input;
let input_not_copied;
{
// Fill the unused tail of the 4-byte buffer with as much input as fits.
let unwritten = &mut incomplete[incomplete_len..];
bytes_from_input = cmp::min(unwritten.len(), self.input.len());
let (to_copy, not_copied) = self.input.split_at(bytes_from_input);
unwritten[..bytes_from_input].copy_from_slice(to_copy);
input_not_copied = not_copied;
}
// Stashed bytes followed by the bytes just copied from `input`.
let spliced = &incomplete[..incomplete_len + bytes_from_input];
match str::from_utf8(spliced) {
Ok(valid) => {
// Everything copied into the buffer was used.
self.input = input_not_copied;
self.decoder.incomplete_len = 0;
Some(Ok(valid))
}
Err(error) => {
let valid_up_to = error.valid_up_to();
if valid_up_to > 0 {
// SAFETY: `valid_up_to` comes from the `Utf8Error` for `spliced`, so
// `spliced[..valid_up_to]` was verified to be valid UTF-8.
let valid = unsafe {
str::from_utf8_unchecked(&spliced[..valid_up_to])
};
// Only the bytes that ended up in the valid prefix are consumed from
// `input`; whatever else was copied is decoded again later from `input`.
let consumed_input = valid_up_to.checked_sub(incomplete_len).unwrap();
self.input = &self.input[consumed_input..];
self.decoder.incomplete_len = 0;
Some(Ok(valid))
} else {
match utf8error_resume_from(&error, spliced) {
Some(resume_from) => {
// The buffer starts with an invalid sequence of `resume_from` bytes.
let consumed_input = resume_from.checked_sub(incomplete_len).unwrap();
self.input = &self.input[consumed_input..];
self.decoder.incomplete_len = 0;
Some(Err(&spliced[..resume_from]))
}
None if self.last => {
// Still incomplete and no more input will come: report as invalid.
self.input = input_not_copied;
self.decoder.incomplete_len = 0;
Some(Err(spliced))
}
None => {
// Still incomplete: keep the longer prefix for the next chunk.
self.input = input_not_copied;
self.decoder.incomplete_len = spliced.len() as u8;
None
}
}
}
}
}
}
}
use std::str::Utf8Error;
/// Returns the index in `input` at which decoding can resume after `error`.
///
/// `error` must be the error returned by `str::from_utf8(input)`.
///
/// * `Some(index)`: the bytes `input[error.valid_up_to()..index]` form an
///   invalid sequence; decoding may restart at `index`.
/// * `None`: `input` ended in the middle of a sequence that could still
///   become valid if more bytes followed.
///
/// This used to re-implement the sequence-length analysis by hand while
/// waiting for https://github.com/rust-lang/rust/pull/40212; it now delegates
/// to `Utf8Error::error_len`, which the standard library provides.
fn utf8error_resume_from(error: &Utf8Error, input: &[u8]) -> Option<usize> {
    // `error_len` is `None` exactly when the end of the input was reached
    // unexpectedly, which is the "incomplete" case reported as `None` here.
    let resume_from = error.valid_up_to() + error.error_len()?;
    debug_assert!(
        resume_from <= input.len(),
        "`error` does not describe `input`"
    );
    Some(resume_from)
}
// Lookup table indexed by the value of the first byte of a sequence.
//
// Entries by byte range:
//   0x00-0x7F => 1 (ASCII)
//   0x80-0xC1 => 0 (cannot start a sequence)
//   0xC2-0xDF => 2
//   0xE0-0xEF => 3
//   0xF0-0xF4 => 4
//   0xF5-0xFF => 0 (cannot start a sequence)
//
// The trailing comment on a row gives the byte value of the last entry of
// that row.
// https://tools.ietf.org/html/rfc3629
static UTF8_CHAR_WIDTH: [u8; 256] = [
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];
/// Returns `true` if `b` has the bit pattern `10xxxxxx`, i.e. lies in
/// `0x80..=0xBF`.
#[inline]
fn is_continuation_byte(b: u8) -> bool {
    // Only the two most significant bits matter: they must be `10`.
    b >> 6 == 0b10
}
/// Returns `true` if `first` followed by `second` can be the first two bytes
/// of a well-formed three-byte UTF-8 sequence.
///
/// The accepted range of `second` depends on `first`: `0xE0` requires
/// `0xA0..=0xBF`, and `0xED` requires `0x80..=0x9F` so that surrogates are
/// rejected.
#[inline]
fn valid_three_bytes_sequence_prefix(first: u8, second: u8) -> bool {
    // `..=` replaces the deprecated `...` range-pattern syntax, which is a
    // hard error from the 2021 edition onwards.
    matches!((first, second),
        (0xE0         , 0xA0 ..= 0xBF) |
        (0xE1 ..= 0xEC, 0x80 ..= 0xBF) |
        (0xED         , 0x80 ..= 0x9F) |
        // Exclude surrogates: (0xED, 0xA0 ..= 0xBF)
        (0xEE ..= 0xEF, 0x80 ..= 0xBF)
    )
}
/// Returns `true` if `first` followed by `second` can be the first two bytes
/// of a well-formed four-byte UTF-8 sequence.
///
/// The accepted range of `second` depends on `first`: `0xF0` requires
/// `0x90..=0xBF` and `0xF4` requires `0x80..=0x8F`.
#[inline]
fn valid_four_bytes_sequence_prefix(first: u8, second: u8) -> bool {
    // `..=` replaces the deprecated `...` range-pattern syntax, which is a
    // hard error from the 2021 edition onwards.
    matches!((first, second),
        (0xF0         , 0x90 ..= 0xBF) |
        (0xF1 ..= 0xF3, 0x80 ..= 0xBF) |
        (0xF4         , 0x80 ..= 0x8F)
    )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment