danielhenrymantilla/Cargo.toml Secret

## benches_main.rs
#![feature(test)] extern crate test; use ::test::Bencher;

use ::lib::*;

mod ascii {
    use super::*;

    macro_rules! make_tests {(
        $(
            $SIZE:literal
        ),* $(,)?
    ) => (::paste::item! {
        $(
            mod [< length_ $SIZE >] {
                use super::*;

                const SIZE: usize = $SIZE;
                const INPUT: [u8; SIZE] = [0_u8; SIZE];

                #[bench]
                fn two_passes (b: &'_ mut Bencher)
                {
                    b.iter(|| {
                        let _ = ::std::str::from_utf8(&INPUT).unwrap().to_string();
                    });
                }

                #[bench]
                fn one_pass (b: &'_ mut Bencher)
                {
                    b.iter(|| {
                        let _ = string_from_utf8_slice(&INPUT).unwrap();
                    });
                }
            }
        )*
    })}

    make_tests! {
        268435456,
        32,
    }
}

## Cargo.toml
[package]
name = "lib"
version = "0.1.0"
authors = ["Daniel Henry-Mantilla <daniel.henry.mantilla@gmail.com>"]
edition = "2018"

[dependencies]

[dev-dependencies]
paste = "0.1.6"

## src_lib.rs
#![allow(unused)]
#![deny(bare_trait_objects, elided_lifetimes_in_paths, unused_must_use)]

use ::std::{*,
    mem::MaybeUninit,
};

#[derive(Debug)]
pub
struct Utf8Error {
    valid_up_to: usize,
    error_len: Option<u8>,
}

pub
fn string_from_utf8_slice (v: &[u8])
    -> Result<String, Utf8Error>
{
    /// Mask of the value bits of a continuation byte.
    const CONT_MASK: u8 = 0b0011_1111;
    /// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
    const TAG_CONT_U8: u8 = 0b1000_0000;

    // https://tools.ietf.org/html/rfc3629
    static UTF8_CHAR_WIDTH: [u8; 256] = [
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
        0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
        4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
    ];

    // use truncation to fit u64 into usize
    const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;

    /// Returns `true` if any byte in the word `x` is nonascii (>= 128).
    #[inline]
    fn contains_nonascii(x: usize) -> bool {
        (x & NONASCII_MASK) != 0
    }

    // -------------------------------------------------------------

    let mut ret: Vec<MaybeUninit<u8>> = Vec::with_capacity(v.len());
    unsafe {
        // # Safety:
        //
        //   - mem::uninitialized::<MaybeUninit<_>>() is always sound.
        ret.set_len(v.len());
    }
    let mut index = 0;
    let len = v.len();

    let usize_bytes = mem::size_of::<usize>();
    let ascii_block_size = 2 * usize_bytes;
    let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };

    while index < len {
        let old_offset = index;
        macro_rules! err {
            ($error_len: expr) => {
                return Err(Utf8Error {
                    valid_up_to: old_offset,
                    error_len: $error_len,
                })
            }
        }

        macro_rules! next { () => {{
            index += 1;
            // we needed data, but there was none: error!
            if index >= len {
                err!(None)
            }
            let next_byte = v[index];
            ret[index] = MaybeUninit::new(next_byte);
            next_byte
        }}}

        let first = v[index];
        ret[index] = MaybeUninit::new(first);
        if first >= 128 {
            let w = UTF8_CHAR_WIDTH[first as usize];
            // 2-byte encoding is for codepoints  \u{0080} to  \u{07ff}
            //        first  C2 80        last DF BF
            // 3-byte encoding is for codepoints  \u{0800} to  \u{ffff}
            //        first  E0 A0 80     last EF BF BF
            //   excluding surrogates codepoints  \u{d800} to  \u{dfff}
            //               ED A0 80 to       ED BF BF
            // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
            //        first  F0 90 80 80  last F4 8F BF BF
            //
            // Use the UTF-8 syntax from the RFC
            //
            // https://tools.ietf.org/html/rfc3629
            // UTF8-1      = %x00-7F
            // UTF8-2      = %xC2-DF UTF8-tail
            // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
            //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
            // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
            //               %xF4 %x80-8F 2( UTF8-tail )
            match w {
                2 => if next!() & !CONT_MASK != TAG_CONT_U8 {
                    err!(Some(1))
                },
                3 => {
                    match (first, next!()) {
                        (0xE0         , 0xA0 ..= 0xBF) |
                        (0xE1 ..= 0xEC, 0x80 ..= 0xBF) |
                        (0xED         , 0x80 ..= 0x9F) |
                        (0xEE ..= 0xEF, 0x80 ..= 0xBF) => {}
                        _ => err!(Some(1))
                    }
                    if next!() & !CONT_MASK != TAG_CONT_U8 {
                        err!(Some(2))
                    }
                }
                4 => {
                    match (first, next!()) {
                        (0xF0         , 0x90 ..= 0xBF) |
                        (0xF1 ..= 0xF3, 0x80 ..= 0xBF) |
                        (0xF4         , 0x80 ..= 0x8F) => {}
                        _ => err!(Some(1))
                    }
                    if next!() & !CONT_MASK != TAG_CONT_U8 {
                        err!(Some(2))
                    }
                    if next!() & !CONT_MASK != TAG_CONT_U8 {
                        err!(Some(3))
                    }
                }
                _ => err!(Some(1))
            }
            index += 1;
        } else {
            // Ascii case, try to skip forward quickly.
            // When the pointer is aligned, read 2 words of data per iteration
            // until we find a word containing a non-ascii byte.
            let ptr = v.as_ptr();
            let out_ptr = ret.as_mut_ptr() as *mut u8;
            let align = unsafe {
                // the offset is safe, because `index` is guaranteed inbounds
                ptr.add(index).align_offset(usize_bytes)
            };
            if align == 0 {
                while index < blocks_end {
                    unsafe {
                        let block = ptr.add(index) as *const usize;
                        // break if there is a nonascii byte
                        let zu = contains_nonascii(*block);
                        let zv = contains_nonascii(*block.offset(1));
                        if zu | zv {
                            break;
                        }
                        ptr::copy_nonoverlapping(
                            block,
                            out_ptr.add(index) as *mut usize,
                            2,
                        );
                    }

                    index += ascii_block_size;
                }
                // step from the point where the wordwise loop stopped
                while index < len && v[index] < 128 {
                    ret[index] = MaybeUninit::new(v[index]);
                    index += 1;
                }
            } else {
                index += 1;
            }
        }
    }

    Ok(unsafe {
        // # Safety
        //
        //  1. ret only contains initialized bytes.
        //
        //  2. these bytes are valid utf8
        let ret =
            mem::transmute::<
                Vec<MaybeUninit<u8>>,
                Vec<u8>,
            >(ret)
        ;
        String::from_utf8_unchecked(ret)
    })
}
	#![feature(test)] extern crate test; use ::test::Bencher;

	use ::lib::*;

	mod ascii {
	use super::*;

	macro_rules! make_tests {(
	$(
	$SIZE:literal
	),* $(,)?
	) => (::paste::item! {
	$(
	mod [< length_ $SIZE >] {
	use super::*;

	const SIZE: usize = $SIZE;
	const INPUT: [u8; SIZE] = [0_u8; SIZE];

	#[bench]
	fn two_passes (b: &'_ mut Bencher)
	{
	b.iter(\|\| {
	let _ = ::std::str::from_utf8(&INPUT).unwrap().to_string();
	});
	}

	#[bench]
	fn one_pass (b: &'_ mut Bencher)
	{
	b.iter(\|\| {
	let _ = string_from_utf8_slice(&INPUT).unwrap();
	});
	}
	}
	)*
	})}

	make_tests! {
	268435456,
	32,
	}
	}
	[package]
	name = "lib"
	version = "0.1.0"
	authors = ["Daniel Henry-Mantilla <daniel.henry.mantilla@gmail.com>"]
	edition = "2018"

	[dependencies]

	[dev-dependencies]
	paste = "0.1.6"
	#![allow(unused)]
	#![deny(bare_trait_objects, elided_lifetimes_in_paths, unused_must_use)]

	use ::std::{*,
	mem::MaybeUninit,
	};

	#[derive(Debug)]
	pub
	struct Utf8Error {
	valid_up_to: usize,
	error_len: Option<u8>,
	}

	pub
	fn string_from_utf8_slice (v: &[u8])
	-> Result<String, Utf8Error>
	{
	/// Mask of the value bits of a continuation byte.
	const CONT_MASK: u8 = 0b0011_1111;
	/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
	const TAG_CONT_U8: u8 = 0b1000_0000;

	// https://tools.ietf.org/html/rfc3629
	static UTF8_CHAR_WIDTH: [u8; 256] = [
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
	0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
	4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
	];

	// use truncation to fit u64 into usize
	const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;

	/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
	#[inline]
	fn contains_nonascii(x: usize) -> bool {
	(x & NONASCII_MASK) != 0
	}

	// -------------------------------------------------------------

	let mut ret: Vec<MaybeUninit<u8>> = Vec::with_capacity(v.len());
	unsafe {
	// # Safety:
	//
	// - mem::uninitialized::<MaybeUninit<_>>() is always sound.
	ret.set_len(v.len());
	}
	let mut index = 0;
	let len = v.len();

	let usize_bytes = mem::size_of::<usize>();
	let ascii_block_size = 2 * usize_bytes;
	let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };

	while index < len {
	let old_offset = index;
	macro_rules! err {
	($error_len: expr) => {
	return Err(Utf8Error {
	valid_up_to: old_offset,
	error_len: $error_len,
	})
	}
	}

	macro_rules! next { () => {{
	index += 1;
	// we needed data, but there was none: error!
	if index >= len {
	err!(None)
	}
	let next_byte = v[index];
	ret[index] = MaybeUninit::new(next_byte);
	next_byte
	}}}

	let first = v[index];
	ret[index] = MaybeUninit::new(first);
	if first >= 128 {
	let w = UTF8_CHAR_WIDTH[first as usize];
	// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
	// first C2 80 last DF BF
	// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
	// first E0 A0 80 last EF BF BF
	// excluding surrogates codepoints \u{d800} to \u{dfff}
	// ED A0 80 to ED BF BF
	// 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
	// first F0 90 80 80 last F4 8F BF BF
	//
	// Use the UTF-8 syntax from the RFC
	//
	// https://tools.ietf.org/html/rfc3629
	// UTF8-1 = %x00-7F
	// UTF8-2 = %xC2-DF UTF8-tail
	// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
	// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
	// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
	// %xF4 %x80-8F 2( UTF8-tail )
	match w {
	2 => if next!() & !CONT_MASK != TAG_CONT_U8 {
	err!(Some(1))
	},
	3 => {
	match (first, next!()) {
	(0xE0 , 0xA0 ..= 0xBF) \|
	(0xE1 ..= 0xEC, 0x80 ..= 0xBF) \|
	(0xED , 0x80 ..= 0x9F) \|
	(0xEE ..= 0xEF, 0x80 ..= 0xBF) => {}
	_ => err!(Some(1))
	}
	if next!() & !CONT_MASK != TAG_CONT_U8 {
	err!(Some(2))
	}
	}
	4 => {
	match (first, next!()) {
	(0xF0 , 0x90 ..= 0xBF) \|
	(0xF1 ..= 0xF3, 0x80 ..= 0xBF) \|
	(0xF4 , 0x80 ..= 0x8F) => {}
	_ => err!(Some(1))
	}
	if next!() & !CONT_MASK != TAG_CONT_U8 {
	err!(Some(2))
	}
	if next!() & !CONT_MASK != TAG_CONT_U8 {
	err!(Some(3))
	}
	}
	_ => err!(Some(1))
	}
	index += 1;
	} else {
	// Ascii case, try to skip forward quickly.
	// When the pointer is aligned, read 2 words of data per iteration
	// until we find a word containing a non-ascii byte.
	let ptr = v.as_ptr();
	let out_ptr = ret.as_mut_ptr() as *mut u8;
	let align = unsafe {
	// the offset is safe, because `index` is guaranteed inbounds
	ptr.add(index).align_offset(usize_bytes)
	};
	if align == 0 {
	while index < blocks_end {
	unsafe {
	let block = ptr.add(index) as *const usize;
	// break if there is a nonascii byte
	let zu = contains_nonascii(*block);
	let zv = contains_nonascii(*block.offset(1));
	if zu \| zv {
	break;
	}
	ptr::copy_nonoverlapping(
	block,
	out_ptr.add(index) as *mut usize,
	2,
	);
	}

	index += ascii_block_size;
	}
	// step from the point where the wordwise loop stopped
	while index < len && v[index] < 128 {
	ret[index] = MaybeUninit::new(v[index]);
	index += 1;
	}
	} else {
	index += 1;
	}
	}
	}

	Ok(unsafe {
	// # Safety
	//
	// 1. ret only contains initialized bytes.
	//
	// 2. these bytes are valid utf8
	let ret =
	mem::transmute::<
	Vec<MaybeUninit<u8>>,
	Vec<u8>,
	>(ret)
	;
	String::from_utf8_unchecked(ret)
	})
	}