Created
March 7, 2017 10:26
-
-
Save anonymous/0587b4484ec9a15f5c5ce6908b3807c1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::cmp; | |
use std::str; | |
/// State of an incremental UTF-8 decoder that is carried from one input
/// chunk to the next.
///
/// When a chunk ends in the middle of a multi-byte sequence, the bytes seen
/// so far are kept here so they can be completed with the start of the
/// following chunk.
#[derive(Debug, Default)]
pub struct Decoder {
    /// Storage for the bytes of a sequence that was cut off at a chunk boundary.
    incomplete: [u8; 4],
    /// Number of leading bytes of `incomplete` that are currently in use.
    incomplete_len: u8,
}
impl Decoder { | |
pub fn new() -> Self { | |
Decoder { | |
incomplete: [0, 0, 0, 0], | |
incomplete_len: 0, | |
} | |
} | |
pub fn next_chunk<'a>(&'a mut self, input_chunk: &'a [u8]) -> DecoderIter<'a> { | |
DecoderIter { | |
decoder: self, | |
input: input_chunk, | |
error_len: 0, | |
last: false, | |
} | |
} | |
pub fn last_chunk<'a>(&'a mut self, input_chunk: &'a [u8]) -> DecoderIter<'a> { | |
DecoderIter { | |
decoder: self, | |
input: input_chunk, | |
error_len: 0, | |
last: true, | |
} | |
} | |
} | |
pub struct DecoderIter<'a> { | |
decoder: &'a mut Decoder, | |
input: &'a [u8], | |
error_len: u8, | |
last: bool, | |
} | |
impl<'a> Iterator for DecoderIter<'a> { | |
type Item = Result<&'a str, &'a [u8]>; | |
fn next(&mut self) -> Option<Result<&'a str, &'a [u8]>> { | |
if self.error_len > 0 { | |
self.error() | |
} else if self.decoder.incomplete_len > 0 { | |
self.try_complete() | |
} else { | |
self.decode() | |
} | |
} | |
} | |
// Decoding steps used by `DecoderIter`'s `Iterator::next`.
//
// NOTE(review): `try_complete` returns slices of `self.decoder.incomplete`, which is
// reached through `&mut self`. Such borrows look shorter than `'a`, so the borrow
// checker will probably reject them; confirm by compiling.
impl<'a> DecoderIter<'a> { | |
/// Reports the invalid sequence recorded by a previous `decode` call.
///
/// Splits the first `error_len` bytes off the input, returns them as `Err`,
/// keeps the remainder as the new input and resets `error_len` to zero.
fn error(&mut self) -> Option<Result<&'a str, &'a [u8]>> { | |
let (invalid, rest) = self.input.split_at(self.error_len as usize); | |
self.input = rest; | |
self.error_len = 0; | |
Some(Err(invalid)) | |
} | |
/// Decodes the remaining input when the decoder holds no buffered bytes.
///
/// Returns `None` when the input is empty, the whole input as `Ok` when it is
/// valid UTF-8, and otherwise either the valid prefix (`Ok`) or the invalid
/// sequence at the very start (`Err`).
fn decode(&mut self) -> Option<Result<&'a str, &'a [u8]>> { | |
if self.input.is_empty() { | |
return None | |
} | |
let error = match str::from_utf8(self.input) { | |
Ok(valid) => { | |
// Everything is valid: hand it out in one piece and finish the chunk.
self.input = b""; | |
return Some(Ok(valid)) | |
} | |
Err(error) => error, | |
}; | |
let valid_up_to = error.valid_up_to(); | |
// `None` here means the input ended before the sequence could be judged.
let resume_from = utf8error_resume_from(&error, self.input); | |
if valid_up_to > 0 { | |
// There is a valid prefix, so we’ll return that. | |
let (valid, after_valid) = self.input.split_at(valid_up_to); | |
// SAFETY: `valid` is the first `valid_up_to` bytes of the input, which
// `str::from_utf8` has just reported as valid UTF-8.
let valid = unsafe { | |
str::from_utf8_unchecked(valid) | |
}; | |
// Save info about the error for the next iteration. | |
match resume_from { | |
Some(resume_from) => { | |
// Length of the invalid sequence that follows the valid prefix.
let error_len = resume_from.checked_sub(valid_up_to).unwrap(); | |
assert!(error_len <= 4); | |
self.error_len = error_len as u8; | |
self.input = after_valid; | |
} | |
// The tail is an incomplete sequence: buffer it in the decoder. This
// happens even when `last` is set; `try_complete` then reports it.
None => self.save_incomplete(after_valid) | |
} | |
Some(Ok(valid)) | |
} else { | |
match resume_from { | |
Some(resume_from) => { | |
// The input starts with an invalid sequence: report it immediately.
let (invalid, rest) = self.input.split_at(resume_from); | |
self.input = rest; | |
Some(Err(invalid)) | |
} | |
None if self.last => { | |
// Incomplete sequence with no further chunk to complete it.
let incomplete = self.input; | |
self.input = b""; | |
Some(Err(incomplete)) | |
} | |
None => { | |
// Incomplete sequence: keep it for the next chunk and yield an
// empty string for this call.
self.save_incomplete(self.input); | |
Some(Ok("")) | |
} | |
} | |
} | |
} | |
/// Copies `incomplete_input` to the start of the decoder's buffer, records its
/// length and marks the input as fully consumed.
///
/// Panics if `incomplete_input` is longer than the 4-byte buffer.
fn save_incomplete(&mut self, incomplete_input: &'a [u8]) { | |
let len = incomplete_input.len(); | |
self.decoder.incomplete[..len].copy_from_slice(incomplete_input); | |
self.decoder.incomplete_len = len as u8; | |
self.input = b""; | |
} | |
/// Tries to finish the sequence buffered in the decoder using bytes from the
/// start of the input.
///
/// NOTE(review): the returned slices borrow from `self.decoder.incomplete`;
/// see the note on this `impl` about their lifetime.
fn try_complete(&mut self) -> Option<Result<&'a str, &'a [u8]>> { | |
let incomplete = &mut self.decoder.incomplete; | |
let incomplete_len = self.decoder.incomplete_len as usize; | |
if self.input.is_empty() { | |
if self.last { | |
// No more bytes will arrive: the buffered bytes are an error.
self.decoder.incomplete_len = 0; | |
return Some(Err(&incomplete[..incomplete_len])) | |
} else { | |
// Keep the buffered bytes and wait for the next chunk.
return None | |
} | |
} | |
let bytes_from_input; | |
let input_not_copied; | |
{ | |
// Fill the unused tail of the buffer with as many input bytes as fit.
let unwritten = &mut incomplete[incomplete_len..]; | |
bytes_from_input = cmp::min(unwritten.len(), self.input.len()); | |
let (to_copy, not_copied) = self.input.split_at(bytes_from_input); | |
unwritten[..bytes_from_input].copy_from_slice(to_copy); | |
input_not_copied = not_copied; | |
} | |
// Buffered bytes followed by the bytes just copied from the input.
let spliced = &incomplete[..incomplete_len + bytes_from_input]; | |
match str::from_utf8(spliced) { | |
Ok(valid) => { | |
// All copied bytes were used, so they are consumed from the input.
self.input = input_not_copied; | |
self.decoder.incomplete_len = 0; | |
Some(Ok(valid)) | |
} | |
Err(error) => { | |
let valid_up_to = error.valid_up_to(); | |
if valid_up_to > 0 { | |
// SAFETY: `str::from_utf8` reported the first `valid_up_to` bytes of
// `spliced` as valid UTF-8.
let valid = unsafe { | |
str::from_utf8_unchecked(&spliced[..valid_up_to]) | |
}; | |
// Only the part of the valid prefix that came from the input is consumed;
// the rest of the input is left for `decode`. Panics if the valid prefix
// is shorter than the previously buffered bytes.
let consumed_input = valid_up_to.checked_sub(incomplete_len).unwrap(); | |
self.input = &self.input[consumed_input..]; | |
self.decoder.incomplete_len = 0; | |
Some(Ok(valid)) | |
} else { | |
match utf8error_resume_from(&error, spliced) { | |
Some(resume_from) => { | |
// The buffered bytes start an invalid sequence of `resume_from` bytes;
// consume whatever part of it came from the input.
let consumed_input = resume_from.checked_sub(incomplete_len).unwrap(); | |
self.input = &self.input[consumed_input..]; | |
self.decoder.incomplete_len = 0; | |
Some(Err(&spliced[..resume_from])) | |
} | |
None if self.last => { | |
// Still incomplete and no further chunk will come: report an error.
self.input = input_not_copied; | |
self.decoder.incomplete_len = 0; | |
Some(Err(spliced)) | |
} | |
None => { | |
// Still incomplete: the copied bytes stay in the buffer for the next chunk.
self.input = input_not_copied; | |
self.decoder.incomplete_len = spliced.len() as u8; | |
None | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
use std::str::Utf8Error; | |
/// Returns the index in `input` at which decoding can resume after `error`,
/// or `None` if `input` ended in the middle of a sequence that might still
/// be completed by more bytes.
///
/// `error` must be the error that `str::from_utf8(input)` returned for this
/// same `input`. The result is `error.valid_up_to()` plus the length of the
/// invalid sequence that follows the valid prefix.
///
/// This is a thin wrapper around `Utf8Error::error_len`, the standard-library
/// API added by https://github.com/rust-lang/rust/pull/40212.
fn utf8error_resume_from(error: &Utf8Error, input: &[u8]) -> Option<usize> {
    let valid_up_to = error.valid_up_to();
    // `str::from_utf8` only fails when something follows the valid prefix.
    debug_assert!(
        valid_up_to < input.len(),
        "`error` does not belong to `input`"
    );
    error
        .error_len()
        .map(|invalid_sequence_length| valid_up_to + invalid_sequence_length)
}
// https://tools.ietf.org/html/rfc3629
/// Length in bytes of the UTF-8 sequence introduced by each possible first
/// byte, indexed by that byte's value.
///
/// `0` marks bytes that cannot start a sequence: continuation bytes
/// (0x80-0xBF), the lead bytes 0xC0 and 0xC1, and 0xF5-0xFF.
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];
/// Returns `true` if `b` has the bit pattern `10xxxxxx`, i.e. lies in
/// 0x80-0xBF, the form of every non-first byte of a UTF-8 sequence.
#[inline]
fn is_continuation_byte(b: u8) -> bool {
    // The two high bits select the byte class; `10` marks a continuation.
    const CONTINUATION_MASK: u8 = 0b1100_0000;
    const CONTINUATION_TAG: u8 = 0b1000_0000;
    b & CONTINUATION_MASK == CONTINUATION_TAG
}
/// Returns `true` if `first` and `second` can be the first two bytes of a
/// valid three-byte UTF-8 sequence.
///
/// The second byte's range depends on the first: 0xE0 requires 0xA0-0xBF,
/// and 0xED only allows 0x80-0x9F so that surrogates are excluded.
#[inline]
fn valid_three_bytes_sequence_prefix(first: u8, second: u8) -> bool {
    matches!((first, second),
        (0xE0         , 0xA0 ..= 0xBF) |
        (0xE1 ..= 0xEC, 0x80 ..= 0xBF) |
        (0xED         , 0x80 ..= 0x9F) |
        // Exclude surrogates: (0xED, 0xA0 ..= 0xBF)
        (0xEE ..= 0xEF, 0x80 ..= 0xBF)
    )
}
/// Returns `true` if `first` and `second` can be the first two bytes of a
/// valid four-byte UTF-8 sequence.
///
/// The second byte's range depends on the first: 0xF0 requires 0x90-0xBF and
/// 0xF4 only allows 0x80-0x8F.
#[inline]
fn valid_four_bytes_sequence_prefix(first: u8, second: u8) -> bool {
    matches!((first, second),
        (0xF0         , 0x90 ..= 0xBF) |
        (0xF1 ..= 0xF3, 0x80 ..= 0xBF) |
        (0xF4         , 0x80 ..= 0x8F)
    )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment