-
-
Save danielhenrymantilla/cc22b5deef85e49274f1ad4d1291833e to your computer and use it in GitHub Desktop.
string_from_utf8_slice
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![feature(test)] extern crate test; use ::test::Bencher; | |
use ::lib::*; | |
mod ascii { | |
use super::*; | |
macro_rules! make_tests {( | |
$( | |
$SIZE:literal | |
),* $(,)? | |
) => (::paste::item! { | |
$( | |
mod [< length_ $SIZE >] { | |
use super::*; | |
const SIZE: usize = $SIZE; | |
const INPUT: [u8; SIZE] = [0_u8; SIZE]; | |
#[bench] | |
fn two_passes (b: &'_ mut Bencher) | |
{ | |
b.iter(|| { | |
let _ = ::std::str::from_utf8(&INPUT).unwrap().to_string(); | |
}); | |
} | |
#[bench] | |
fn one_pass (b: &'_ mut Bencher) | |
{ | |
b.iter(|| { | |
let _ = string_from_utf8_slice(&INPUT).unwrap(); | |
}); | |
} | |
} | |
)* | |
})} | |
make_tests! { | |
268435456, | |
32, | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[package] | |
name = "lib" | |
version = "0.1.0" | |
authors = ["Daniel Henry-Mantilla <daniel.henry.mantilla@gmail.com>"] | |
edition = "2018" | |
[dependencies] | |
[dev-dependencies] | |
paste = "0.1.6" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![allow(unused)] | |
#![deny(bare_trait_objects, elided_lifetimes_in_paths, unused_must_use)] | |
use ::std::{*, | |
mem::MaybeUninit, | |
}; | |
#[derive(Debug)] | |
pub | |
struct Utf8Error { | |
valid_up_to: usize, | |
error_len: Option<u8>, | |
} | |
pub | |
fn string_from_utf8_slice (v: &[u8]) | |
-> Result<String, Utf8Error> | |
{ | |
/// Mask of the value bits of a continuation byte. | |
const CONT_MASK: u8 = 0b0011_1111; | |
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte. | |
const TAG_CONT_U8: u8 = 0b1000_0000; | |
// https://tools.ietf.org/html/rfc3629 | |
static UTF8_CHAR_WIDTH: [u8; 256] = [ | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF | |
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF | |
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF | |
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF | |
]; | |
// use truncation to fit u64 into usize | |
const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize; | |
/// Returns `true` if any byte in the word `x` is nonascii (>= 128). | |
#[inline] | |
fn contains_nonascii(x: usize) -> bool { | |
(x & NONASCII_MASK) != 0 | |
} | |
// ------------------------------------------------------------- | |
let mut ret: Vec<MaybeUninit<u8>> = Vec::with_capacity(v.len()); | |
unsafe { | |
// # Safety: | |
// | |
// - mem::uninitialized::<MaybeUninit<_>>() is always sound. | |
ret.set_len(v.len()); | |
} | |
let mut index = 0; | |
let len = v.len(); | |
let usize_bytes = mem::size_of::<usize>(); | |
let ascii_block_size = 2 * usize_bytes; | |
let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 }; | |
while index < len { | |
let old_offset = index; | |
macro_rules! err { | |
($error_len: expr) => { | |
return Err(Utf8Error { | |
valid_up_to: old_offset, | |
error_len: $error_len, | |
}) | |
} | |
} | |
macro_rules! next { () => {{ | |
index += 1; | |
// we needed data, but there was none: error! | |
if index >= len { | |
err!(None) | |
} | |
let next_byte = v[index]; | |
ret[index] = MaybeUninit::new(next_byte); | |
next_byte | |
}}} | |
let first = v[index]; | |
ret[index] = MaybeUninit::new(first); | |
if first >= 128 { | |
let w = UTF8_CHAR_WIDTH[first as usize]; | |
// 2-byte encoding is for codepoints \u{0080} to \u{07ff} | |
// first C2 80 last DF BF | |
// 3-byte encoding is for codepoints \u{0800} to \u{ffff} | |
// first E0 A0 80 last EF BF BF | |
// excluding surrogates codepoints \u{d800} to \u{dfff} | |
// ED A0 80 to ED BF BF | |
// 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff | |
// first F0 90 80 80 last F4 8F BF BF | |
// | |
// Use the UTF-8 syntax from the RFC | |
// | |
// https://tools.ietf.org/html/rfc3629 | |
// UTF8-1 = %x00-7F | |
// UTF8-2 = %xC2-DF UTF8-tail | |
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / | |
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) | |
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / | |
// %xF4 %x80-8F 2( UTF8-tail ) | |
match w { | |
2 => if next!() & !CONT_MASK != TAG_CONT_U8 { | |
err!(Some(1)) | |
}, | |
3 => { | |
match (first, next!()) { | |
(0xE0 , 0xA0 ..= 0xBF) | | |
(0xE1 ..= 0xEC, 0x80 ..= 0xBF) | | |
(0xED , 0x80 ..= 0x9F) | | |
(0xEE ..= 0xEF, 0x80 ..= 0xBF) => {} | |
_ => err!(Some(1)) | |
} | |
if next!() & !CONT_MASK != TAG_CONT_U8 { | |
err!(Some(2)) | |
} | |
} | |
4 => { | |
match (first, next!()) { | |
(0xF0 , 0x90 ..= 0xBF) | | |
(0xF1 ..= 0xF3, 0x80 ..= 0xBF) | | |
(0xF4 , 0x80 ..= 0x8F) => {} | |
_ => err!(Some(1)) | |
} | |
if next!() & !CONT_MASK != TAG_CONT_U8 { | |
err!(Some(2)) | |
} | |
if next!() & !CONT_MASK != TAG_CONT_U8 { | |
err!(Some(3)) | |
} | |
} | |
_ => err!(Some(1)) | |
} | |
index += 1; | |
} else { | |
// Ascii case, try to skip forward quickly. | |
// When the pointer is aligned, read 2 words of data per iteration | |
// until we find a word containing a non-ascii byte. | |
let ptr = v.as_ptr(); | |
let out_ptr = ret.as_mut_ptr() as *mut u8; | |
let align = unsafe { | |
// the offset is safe, because `index` is guaranteed inbounds | |
ptr.add(index).align_offset(usize_bytes) | |
}; | |
if align == 0 { | |
while index < blocks_end { | |
unsafe { | |
let block = ptr.add(index) as *const usize; | |
// break if there is a nonascii byte | |
let zu = contains_nonascii(*block); | |
let zv = contains_nonascii(*block.offset(1)); | |
if zu | zv { | |
break; | |
} | |
ptr::copy_nonoverlapping( | |
block, | |
out_ptr.add(index) as *mut usize, | |
2, | |
); | |
} | |
index += ascii_block_size; | |
} | |
// step from the point where the wordwise loop stopped | |
while index < len && v[index] < 128 { | |
ret[index] = MaybeUninit::new(v[index]); | |
index += 1; | |
} | |
} else { | |
index += 1; | |
} | |
} | |
} | |
Ok(unsafe { | |
// # Safety | |
// | |
// 1. ret only contains initialized bytes. | |
// | |
// 2. these bytes are valid utf8 | |
let ret = | |
mem::transmute::< | |
Vec<MaybeUninit<u8>>, | |
Vec<u8>, | |
>(ret) | |
; | |
String::from_utf8_unchecked(ret) | |
}) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment