/utf8.rs

## utf8.rs
use std::cmp;
use std::str;

/// Incremental UTF-8 decoder state carried from one input chunk to the next.
///
/// A multi-byte sequence may be cut in two by a chunk boundary; the bytes
/// seen so far are buffered here until the following chunk is decoded.
#[derive(Debug, Default)]
pub struct Decoder {
    /// Leading bytes of a sequence that the previous chunk ended in the middle of.
    incomplete: [u8; 4],
    /// Number of bytes of `incomplete` in use; 0 when nothing is pending.
    incomplete_len: u8,
}

impl Decoder {
    /// Creates a decoder with no pending bytes.
    pub fn new() -> Self {
        Self {
            incomplete: [0; 4],
            incomplete_len: 0,
        }
    }

    /// Returns an iterator over the decoded pieces of `input_chunk`, for a
    /// chunk that is not flagged as the last one.
    pub fn next_chunk<'a>(&'a mut self, input_chunk: &'a [u8]) -> DecoderIter<'a> {
        self.iter_over(input_chunk, false)
    }

    /// Returns an iterator over the decoded pieces of `input_chunk`, flagged
    /// as the last chunk of the input.
    pub fn last_chunk<'a>(&'a mut self, input_chunk: &'a [u8]) -> DecoderIter<'a> {
        self.iter_over(input_chunk, true)
    }

    /// Builds the iterator shared by `next_chunk` and `last_chunk`; it starts
    /// with no recorded error.
    fn iter_over<'a>(&'a mut self, input: &'a [u8], last: bool) -> DecoderIter<'a> {
        DecoderIter {
            error_len: 0,
            last,
            input,
            decoder: self,
        }
    }
}

/// Iterator over the decoded pieces of one input chunk.
///
/// Items are `Ok` with a run of valid UTF-8 text or `Err` with the bytes of
/// an invalid or truncated sequence. Nothing is decoded until the iterator
/// is advanced, hence `#[must_use]`.
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct DecoderIter<'a> {
    /// Decoder whose pending-bytes buffer is read and updated while iterating.
    decoder: &'a mut Decoder,
    /// Part of the chunk that has not been yielded yet.
    input: &'a [u8],
    /// Length of an invalid sequence at the start of `input` that the next
    /// call must yield; 0 when there is none.
    error_len: u8,
    /// `true` when the chunk was supplied through `Decoder::last_chunk`.
    last: bool,
}

impl<'a> Iterator for DecoderIter<'a> {
    type Item = Result<&'a str, &'a [u8]>;

    /// Yields the next decoded piece of the chunk.
    fn next(&mut self) -> Option<Self::Item> {
        // An invalid sequence recorded by the previous call is reported first.
        if self.error_len != 0 {
            return self.error();
        }
        // Then bytes left pending by an earlier chunk are completed.
        if self.decoder.incomplete_len != 0 {
            return self.try_complete();
        }
        self.decode()
    }
}

// The three steps `Iterator::next` chooses between, plus the helper that
// buffers a sequence cut off by the end of the chunk.
impl<'a> DecoderIter<'a> {
    /// Yields, as `Err`, the first `error_len` bytes of `input` (the invalid
    /// sequence recorded by `decode` after it returned the valid text in
    /// front of it) and resets `error_len` to 0.
    fn error(&mut self) -> Option<Result<&'a str, &'a [u8]>> {
        let (invalid, rest) = self.input.split_at(self.error_len as usize);
        self.input = rest;
        self.error_len = 0;
        Some(Err(invalid))
    }

    /// Decodes from the start of `input`.
    ///
    /// Returns `None` when `input` is empty. Otherwise yields the first of:
    /// all of `input` if it is valid UTF-8; its valid prefix (the error or
    /// incomplete tail behind it is recorded for later calls); the invalid
    /// sequence at its start; or, when it starts with an incomplete
    /// sequence, `Err` with those bytes on the last chunk and `Ok("")`
    /// after buffering them on any other chunk.
    fn decode(&mut self) -> Option<Result<&'a str, &'a [u8]>> {
        if self.input.is_empty() {
            return None
        }
        let error = match str::from_utf8(self.input) {
            Ok(valid) => {
                // Everything left in the chunk is valid: yield it in one piece.
                self.input = b"";
                return Some(Ok(valid))
            }
            Err(error) => error,
        };
        let valid_up_to = error.valid_up_to();
        // `None` means the input ends in the middle of a sequence.
        let resume_from = utf8error_resume_from(&error, self.input);
        if valid_up_to > 0 {
            // There is a valid prefix, so we’ll return that.
            let (valid, after_valid) = self.input.split_at(valid_up_to);
            // SAFETY: `Utf8Error::valid_up_to` is the length of a prefix of
            // `self.input` that `str::from_utf8` verified to be valid UTF-8.
            let valid = unsafe {
                str::from_utf8_unchecked(valid)
            };
            // Save info about the error for the next iteration.
            match resume_from {
                Some(resume_from) => {
                    // `resume_from` is an index into the whole input; turn it
                    // into the length of the invalid sequence that now sits
                    // at the start of the remaining input.
                    let error_len = resume_from.checked_sub(valid_up_to).unwrap();
                    assert!(error_len <= 4);
                    self.error_len = error_len as u8;
                    self.input = after_valid;
                }
                // Truncated tail: buffer it. `next` will then call
                // `try_complete`, which reports it as `Err` on the last chunk.
                None => self.save_incomplete(after_valid)
            }
            Some(Ok(valid))
        } else {
            match resume_from {
                Some(resume_from) => {
                    // The input starts with an invalid sequence: yield it.
                    let (invalid, rest) = self.input.split_at(resume_from);
                    self.input = rest;
                    Some(Err(invalid))
                }
                None if self.last => {
                    // No further chunk can complete the sequence.
                    let incomplete = self.input;
                    self.input = b"";
                    Some(Err(incomplete))
                }
                None => {
                    // Keep the bytes for the next chunk; the empty string is
                    // yielded so that this call still produces an item.
                    self.save_incomplete(self.input);
                    Some(Ok(""))
                }
            }
        }
    }

    /// Copies `incomplete_input` into the decoder's buffer and marks the rest
    /// of the chunk as consumed.
    ///
    /// Panics if `incomplete_input` is longer than the 4-byte buffer.
    fn save_incomplete(&mut self, incomplete_input: &'a [u8]) {
        let len = incomplete_input.len();
        self.decoder.incomplete[..len].copy_from_slice(incomplete_input);
        self.decoder.incomplete_len = len as u8;
        self.input = b"";
    }

    /// Tries to finish the sequence whose first bytes are buffered in the
    /// decoder, using bytes from the start of `input`.
    ///
    /// With an empty `input` this yields the buffered bytes as `Err` on the
    /// last chunk and returns `None` (leaving them buffered) otherwise.
    // NOTE(review): every slice returned here points into
    // `self.decoder.incomplete`, which is reached through `&mut self`, yet
    // the return type promises lifetime `'a`. Confirm that this is accepted
    // by the borrow checker and that callers cannot keep such a slice while
    // a later call overwrites the buffer.
    fn try_complete(&mut self) -> Option<Result<&'a str, &'a [u8]>> {
        let incomplete = &mut self.decoder.incomplete;
        let incomplete_len = self.decoder.incomplete_len as usize;

        if self.input.is_empty() {
            if self.last {
                self.decoder.incomplete_len = 0;
                return Some(Err(&incomplete[..incomplete_len]))
            } else {
                return None
            }
        }

        let bytes_from_input;
        let input_not_copied;
        {
            // Fill the unused part of the buffer with bytes from the front of
            // `input`, as many as fit or as are available.
            let unwritten = &mut incomplete[incomplete_len..];
            bytes_from_input = cmp::min(unwritten.len(), self.input.len());
            let (to_copy, not_copied) = self.input.split_at(bytes_from_input);
            unwritten[..bytes_from_input].copy_from_slice(to_copy);
            input_not_copied = not_copied;
        }
        // The buffered bytes followed by the bytes just copied.
        let spliced = &incomplete[..incomplete_len + bytes_from_input];
        match str::from_utf8(spliced) {
            Ok(valid) => {
                // The whole buffer decoded, so all copied bytes are consumed.
                self.input = input_not_copied;
                self.decoder.incomplete_len = 0;
                Some(Ok(valid))
            }
            Err(error) => {
                let valid_up_to = error.valid_up_to();
                if valid_up_to > 0 {
                    // SAFETY: `valid_up_to` is the length of a prefix of
                    // `spliced` that `str::from_utf8` verified to be valid.
                    let valid = unsafe {
                        str::from_utf8_unchecked(&spliced[..valid_up_to])
                    };
                    // Consume only the input bytes that are part of the valid
                    // prefix; bytes copied beyond it stay in `input`.
                    // Presumably this cannot underflow because the buffered
                    // bytes alone were incomplete (TODO confirm).
                    let consumed_input = valid_up_to.checked_sub(incomplete_len).unwrap();
                    self.input = &self.input[consumed_input..];
                    self.decoder.incomplete_len = 0;
                    Some(Ok(valid))
                } else {
                    match utf8error_resume_from(&error, spliced) {
                        Some(resume_from) => {
                            // The buffer starts with an invalid sequence:
                            // yield it and consume the input bytes in it.
                            let consumed_input = resume_from.checked_sub(incomplete_len).unwrap();
                            self.input = &self.input[consumed_input..];
                            self.decoder.incomplete_len = 0;
                            Some(Err(&spliced[..resume_from]))
                        }
                        None if self.last => {
                            // Still incomplete and no chunk follows.
                            self.input = input_not_copied;
                            self.decoder.incomplete_len = 0;
                            Some(Err(spliced))
                        }
                        None => {
                            // Still incomplete: keep the longer prefix
                            // buffered for the next chunk.
                            self.input = input_not_copied;
                            self.decoder.incomplete_len = spliced.len() as u8;
                            None
                        }
                    }
                }
            }
        }
    }
}


use std::str::Utf8Error;

/// Returns the index in `input` at which decoding can resume after `error`.
///
/// `error` must be the error returned by `str::from_utf8(input)`. The result
/// is `error.valid_up_to()` plus the length of the invalid sequence found
/// there, or `None` when `input` ends in the middle of a sequence that
/// further bytes could still complete.
///
/// The sequence length comes from `Utf8Error::error_len`, the API added by
/// https://github.com/rust-lang/rust/pull/40212 and stable since Rust 1.20;
/// it replaces the hand-written classification this function used before.
fn utf8error_resume_from(error: &Utf8Error, input: &[u8]) -> Option<usize> {
    let valid_up_to = error.valid_up_to();
    // `str::from_utf8` would have returned `Ok(_)` if nothing followed the
    // valid prefix.
    debug_assert!(valid_up_to < input.len());
    error.error_len().map(|invalid_len| valid_up_to + invalid_len)
}

// https://tools.ietf.org/html/rfc3629
/// Length in bytes of the sequence announced by a byte in first position,
/// indexed by the byte value: 1 for 0x00-0x7F, 2 for 0xC2-0xDF, 3 for
/// 0xE0-0xEF, 4 for 0xF0-0xF4, and 0 for bytes that cannot start a sequence
/// (0x80-0xC1 and 0xF5-0xFF).
///
/// Each row holds 16 entries; the trailing comments give the index of the
/// last entry of their row.
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];

/// Tells whether `b` has the bit pattern `10xxxxxx`, i.e. lies in 0x80-0xBF.
#[inline]
fn is_continuation_byte(b: u8) -> bool {
    // Only the two most significant bits decide.
    b >> 6 == 0b10
}

/// Tells whether `first` followed by `second` can begin a well-formed
/// three-byte sequence.
///
/// The accepted range of `second` depends on `first`: it is narrowed for
/// 0xE0 and for 0xED (the latter to exclude surrogates).
#[inline]
fn valid_three_bytes_sequence_prefix(first: u8, second: u8) -> bool {
    // `..=` replaces the deprecated `...` range-pattern syntax, which is
    // rejected from edition 2021 on.
    matches!(
        (first, second),
        (0xE0, 0xA0..=0xBF)
            | (0xE1..=0xEC, 0x80..=0xBF)
            // Exclude surrogates: (0xED, 0xA0..=0xBF)
            | (0xED, 0x80..=0x9F)
            | (0xEE..=0xEF, 0x80..=0xBF)
    )
}

/// Tells whether `first` followed by `second` can begin a well-formed
/// four-byte sequence.
///
/// The accepted range of `second` depends on `first`: it is narrowed for
/// 0xF0 and for 0xF4.
#[inline]
fn valid_four_bytes_sequence_prefix(first: u8, second: u8) -> bool {
    // `..=` replaces the deprecated `...` range-pattern syntax, which is
    // rejected from edition 2021 on.
    matches!(
        (first, second),
        (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F)
    )
}
	use std::cmp;
	use std::str;

	/// Incremental UTF-8 decoder state carried from one input chunk to the next.
	///
	/// A multi-byte sequence may be cut in two by a chunk boundary; the bytes
	/// seen so far are buffered here until the following chunk is decoded.
	#[derive(Debug, Default)]
	pub struct Decoder {
	    /// Leading bytes of a sequence that the previous chunk ended in the middle of.
	    incomplete: [u8; 4],
	    /// Number of bytes of `incomplete` in use; 0 when nothing is pending.
	    incomplete_len: u8,
	}

	impl Decoder {
	    /// Creates a decoder with no pending bytes.
	    pub fn new() -> Self {
	        Self {
	            incomplete: [0; 4],
	            incomplete_len: 0,
	        }
	    }

	    /// Returns an iterator over the decoded pieces of `input_chunk`, for a
	    /// chunk that is not flagged as the last one.
	    pub fn next_chunk<'a>(&'a mut self, input_chunk: &'a [u8]) -> DecoderIter<'a> {
	        self.iter_over(input_chunk, false)
	    }

	    /// Returns an iterator over the decoded pieces of `input_chunk`, flagged
	    /// as the last chunk of the input.
	    pub fn last_chunk<'a>(&'a mut self, input_chunk: &'a [u8]) -> DecoderIter<'a> {
	        self.iter_over(input_chunk, true)
	    }

	    /// Builds the iterator shared by `next_chunk` and `last_chunk`; it starts
	    /// with no recorded error.
	    fn iter_over<'a>(&'a mut self, input: &'a [u8], last: bool) -> DecoderIter<'a> {
	        DecoderIter {
	            error_len: 0,
	            last,
	            input,
	            decoder: self,
	        }
	    }
	}

	/// Iterator over the decoded pieces of one input chunk.
	///
	/// Items are `Ok` with a run of valid UTF-8 text or `Err` with the bytes of
	/// an invalid or truncated sequence. Nothing is decoded until the iterator
	/// is advanced, hence `#[must_use]`.
	#[must_use = "iterators are lazy and do nothing unless consumed"]
	pub struct DecoderIter<'a> {
	    /// Decoder whose pending-bytes buffer is read and updated while iterating.
	    decoder: &'a mut Decoder,
	    /// Part of the chunk that has not been yielded yet.
	    input: &'a [u8],
	    /// Length of an invalid sequence at the start of `input` that the next
	    /// call must yield; 0 when there is none.
	    error_len: u8,
	    /// `true` when the chunk was supplied through `Decoder::last_chunk`.
	    last: bool,
	}

	impl<'a> Iterator for DecoderIter<'a> {
	    type Item = Result<&'a str, &'a [u8]>;

	    /// Yields the next decoded piece of the chunk.
	    fn next(&mut self) -> Option<Self::Item> {
	        // An invalid sequence recorded by the previous call is reported first.
	        if self.error_len != 0 {
	            return self.error();
	        }
	        // Then bytes left pending by an earlier chunk are completed.
	        if self.decoder.incomplete_len != 0 {
	            return self.try_complete();
	        }
	        self.decode()
	    }
	}

	// The three steps `Iterator::next` chooses between, plus the helper that
	// buffers a sequence cut off by the end of the chunk.
	impl<'a> DecoderIter<'a> {
	/// Yields, as `Err`, the first `error_len` bytes of `input` (the invalid
	/// sequence recorded by `decode`) and resets `error_len` to 0.
	fn error(&mut self) -> Option<Result<&'a str, &'a [u8]>> {
	let (invalid, rest) = self.input.split_at(self.error_len as usize);
	self.input = rest;
	self.error_len = 0;
	Some(Err(invalid))
	}

	/// Decodes from the start of `input`.
	///
	/// Returns `None` when `input` is empty. Otherwise yields the first of:
	/// all of `input` if it is valid UTF-8; its valid prefix; the invalid
	/// sequence at its start; or, when it starts with an incomplete sequence,
	/// `Err` with those bytes on the last chunk and `Ok("")` after buffering
	/// them on any other chunk.
	fn decode(&mut self) -> Option<Result<&'a str, &'a [u8]>> {
	if self.input.is_empty() {
	return None
	}
	let error = match str::from_utf8(self.input) {
	Ok(valid) => {
	// Everything left in the chunk is valid: yield it in one piece.
	self.input = b"";
	return Some(Ok(valid))
	}
	Err(error) => error,
	};
	let valid_up_to = error.valid_up_to();
	// `None` means the input ends in the middle of a sequence.
	let resume_from = utf8error_resume_from(&error, self.input);
	if valid_up_to > 0 {
	// There is a valid prefix, so we’ll return that.
	let (valid, after_valid) = self.input.split_at(valid_up_to);
	// SAFETY: `Utf8Error::valid_up_to` is the length of a prefix of
	// `self.input` that `str::from_utf8` verified to be valid UTF-8.
	let valid = unsafe {
	str::from_utf8_unchecked(valid)
	};
	// Save info about the error for the next iteration.
	match resume_from {
	Some(resume_from) => {
	// Turn the index into the length of the invalid sequence that
	// now sits at the start of the remaining input.
	let error_len = resume_from.checked_sub(valid_up_to).unwrap();
	assert!(error_len <= 4);
	self.error_len = error_len as u8;
	self.input = after_valid;
	}
	// Truncated tail: buffer it; `try_complete` handles it next.
	None => self.save_incomplete(after_valid)
	}
	Some(Ok(valid))
	} else {
	match resume_from {
	Some(resume_from) => {
	// The input starts with an invalid sequence: yield it.
	let (invalid, rest) = self.input.split_at(resume_from);
	self.input = rest;
	Some(Err(invalid))
	}
	None if self.last => {
	// No further chunk can complete the sequence.
	let incomplete = self.input;
	self.input = b"";
	Some(Err(incomplete))
	}
	None => {
	// Keep the bytes for the next chunk; the empty string is
	// yielded so that this call still produces an item.
	self.save_incomplete(self.input);
	Some(Ok(""))
	}
	}
	}
	}

	/// Copies `incomplete_input` into the decoder's buffer and marks the rest
	/// of the chunk as consumed.
	///
	/// Panics if `incomplete_input` is longer than the 4-byte buffer.
	fn save_incomplete(&mut self, incomplete_input: &'a [u8]) {
	let len = incomplete_input.len();
	self.decoder.incomplete[..len].copy_from_slice(incomplete_input);
	self.decoder.incomplete_len = len as u8;
	self.input = b"";
	}

	/// Tries to finish the sequence whose first bytes are buffered in the
	/// decoder, using bytes from the start of `input`.
	///
	/// With an empty `input` this yields the buffered bytes as `Err` on the
	/// last chunk and returns `None` (leaving them buffered) otherwise.
	// NOTE(review): every slice returned here points into
	// `self.decoder.incomplete`, which is reached through `&mut self`, yet
	// the return type promises lifetime `'a`. Confirm that this is accepted
	// by the borrow checker.
	fn try_complete(&mut self) -> Option<Result<&'a str, &'a [u8]>> {
	let incomplete = &mut self.decoder.incomplete;
	let incomplete_len = self.decoder.incomplete_len as usize;

	if self.input.is_empty() {
	if self.last {
	self.decoder.incomplete_len = 0;
	return Some(Err(&incomplete[..incomplete_len]))
	} else {
	return None
	}
	}

	let bytes_from_input;
	let input_not_copied;
	{
	// Fill the unused part of the buffer with bytes from the front of
	// `input`, as many as fit or as are available.
	let unwritten = &mut incomplete[incomplete_len..];
	bytes_from_input = cmp::min(unwritten.len(), self.input.len());
	let (to_copy, not_copied) = self.input.split_at(bytes_from_input);
	unwritten[..bytes_from_input].copy_from_slice(to_copy);
	input_not_copied = not_copied;
	}
	// The buffered bytes followed by the bytes just copied.
	let spliced = &incomplete[..incomplete_len + bytes_from_input];
	match str::from_utf8(spliced) {
	Ok(valid) => {
	// The whole buffer decoded, so all copied bytes are consumed.
	self.input = input_not_copied;
	self.decoder.incomplete_len = 0;
	Some(Ok(valid))
	}
	Err(error) => {
	let valid_up_to = error.valid_up_to();
	if valid_up_to > 0 {
	// SAFETY: `valid_up_to` is the length of a prefix of `spliced`
	// that `str::from_utf8` verified to be valid.
	let valid = unsafe {
	str::from_utf8_unchecked(&spliced[..valid_up_to])
	};
	// Consume only the input bytes that are part of the valid prefix.
	// Presumably this cannot underflow because the buffered bytes
	// alone were incomplete (TODO confirm).
	let consumed_input = valid_up_to.checked_sub(incomplete_len).unwrap();
	self.input = &self.input[consumed_input..];
	self.decoder.incomplete_len = 0;
	Some(Ok(valid))
	} else {
	match utf8error_resume_from(&error, spliced) {
	Some(resume_from) => {
	// The buffer starts with an invalid sequence: yield it and
	// consume the input bytes in it.
	let consumed_input = resume_from.checked_sub(incomplete_len).unwrap();
	self.input = &self.input[consumed_input..];
	self.decoder.incomplete_len = 0;
	Some(Err(&spliced[..resume_from]))
	}
	None if self.last => {
	// Still incomplete and no chunk follows.
	self.input = input_not_copied;
	self.decoder.incomplete_len = 0;
	Some(Err(spliced))
	}
	None => {
	// Still incomplete: keep the longer prefix buffered.
	self.input = input_not_copied;
	self.decoder.incomplete_len = spliced.len() as u8;
	None
	}
	}
	}
	}
	}
	}
	}


	use std::str::Utf8Error;

	/// Returns the index in `input` at which decoding can resume after `error`.
	///
	/// `error` must be the error returned by `str::from_utf8(input)`. The result
	/// is `error.valid_up_to()` plus the length of the invalid sequence found
	/// there, or `None` when `input` ends in the middle of a sequence that
	/// further bytes could still complete.
	///
	/// The sequence length comes from `Utf8Error::error_len`, the API added by
	/// https://github.com/rust-lang/rust/pull/40212 and stable since Rust 1.20;
	/// it replaces the hand-written classification this function used before.
	fn utf8error_resume_from(error: &Utf8Error, input: &[u8]) -> Option<usize> {
	    let valid_up_to = error.valid_up_to();
	    // `str::from_utf8` would have returned `Ok(_)` if nothing followed the
	    // valid prefix.
	    debug_assert!(valid_up_to < input.len());
	    error.error_len().map(|invalid_len| valid_up_to + invalid_len)
	}

	// https://tools.ietf.org/html/rfc3629
	/// Length in bytes of the sequence announced by a byte in first position,
	/// indexed by the byte value: 1 for 0x00-0x7F, 2 for 0xC2-0xDF, 3 for
	/// 0xE0-0xEF, 4 for 0xF0-0xF4, and 0 for bytes that cannot start a sequence
	/// (0x80-0xC1 and 0xF5-0xFF).
	///
	/// Each row holds 16 entries; the trailing comments give the index of the
	/// last entry of their row.
	static UTF8_CHAR_WIDTH: [u8; 256] = [
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
	0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
	4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
	];

	/// Tells whether `b` has the bit pattern `10xxxxxx`, i.e. lies in 0x80-0xBF.
	#[inline]
	fn is_continuation_byte(b: u8) -> bool {
	    // Only the two most significant bits decide.
	    b >> 6 == 0b10
	}

	/// Tells whether `first` followed by `second` can begin a well-formed
	/// three-byte sequence.
	///
	/// The accepted range of `second` depends on `first`: it is narrowed for
	/// 0xE0 and for 0xED (the latter to exclude surrogates).
	#[inline]
	fn valid_three_bytes_sequence_prefix(first: u8, second: u8) -> bool {
	    // Alternatives are joined with a plain `|` (the escaped `\|` was not
	    // valid Rust), and `..=` replaces the deprecated `...` range syntax.
	    matches!(
	        (first, second),
	        (0xE0, 0xA0..=0xBF)
	            | (0xE1..=0xEC, 0x80..=0xBF)
	            // Exclude surrogates: (0xED, 0xA0..=0xBF)
	            | (0xED, 0x80..=0x9F)
	            | (0xEE..=0xEF, 0x80..=0xBF)
	    )
	}

	/// Tells whether `first` followed by `second` can begin a well-formed
	/// four-byte sequence.
	///
	/// The accepted range of `second` depends on `first`: it is narrowed for
	/// 0xF0 and for 0xF4.
	#[inline]
	fn valid_four_bytes_sequence_prefix(first: u8, second: u8) -> bool {
	    // Alternatives are joined with a plain `|` (the escaped `\|` was not
	    // valid Rust), and `..=` replaces the deprecated `...` range syntax.
	    matches!(
	        (first, second),
	        (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F)
	    )
	}