Skip to content

Instantly share code, notes, and snippets.

@danielhenrymantilla
Created August 28, 2019 10:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danielhenrymantilla/cc22b5deef85e49274f1ad4d1291833e to your computer and use it in GitHub Desktop.
Save danielhenrymantilla/cc22b5deef85e49274f1ad4d1291833e to your computer and use it in GitHub Desktop.
string_from_utf8_slice
#![feature(test)] extern crate test; use ::test::Bencher;
use ::lib::*;
mod ascii {
use super::*;
macro_rules! make_tests {(
$(
$SIZE:literal
),* $(,)?
) => (::paste::item! {
$(
mod [< length_ $SIZE >] {
use super::*;
const SIZE: usize = $SIZE;
const INPUT: [u8; SIZE] = [0_u8; SIZE];
#[bench]
fn two_passes (b: &'_ mut Bencher)
{
b.iter(|| {
let _ = ::std::str::from_utf8(&INPUT).unwrap().to_string();
});
}
#[bench]
fn one_pass (b: &'_ mut Bencher)
{
b.iter(|| {
let _ = string_from_utf8_slice(&INPUT).unwrap();
});
}
}
)*
})}
make_tests! {
268435456,
32,
}
}
[package]
name = "lib"
version = "0.1.0"
authors = ["Daniel Henry-Mantilla <daniel.henry.mantilla@gmail.com>"]
edition = "2018"
[dependencies]
[dev-dependencies]
paste = "0.1.6"
#![allow(unused)]
#![deny(bare_trait_objects, elided_lifetimes_in_paths, unused_must_use)]
use ::std::{*,
mem::MaybeUninit,
};
#[derive(Debug)]
pub
struct Utf8Error {
valid_up_to: usize,
error_len: Option<u8>,
}
pub
fn string_from_utf8_slice (v: &[u8])
-> Result<String, Utf8Error>
{
/// Mask of the value bits of a continuation byte.
const CONT_MASK: u8 = 0b0011_1111;
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
const TAG_CONT_U8: u8 = 0b1000_0000;
// https://tools.ietf.org/html/rfc3629
static UTF8_CHAR_WIDTH: [u8; 256] = [
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];
// use truncation to fit u64 into usize
const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
#[inline]
fn contains_nonascii(x: usize) -> bool {
(x & NONASCII_MASK) != 0
}
// -------------------------------------------------------------
let mut ret: Vec<MaybeUninit<u8>> = Vec::with_capacity(v.len());
unsafe {
// # Safety:
//
// - mem::uninitialized::<MaybeUninit<_>>() is always sound.
ret.set_len(v.len());
}
let mut index = 0;
let len = v.len();
let usize_bytes = mem::size_of::<usize>();
let ascii_block_size = 2 * usize_bytes;
let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
while index < len {
let old_offset = index;
macro_rules! err {
($error_len: expr) => {
return Err(Utf8Error {
valid_up_to: old_offset,
error_len: $error_len,
})
}
}
macro_rules! next { () => {{
index += 1;
// we needed data, but there was none: error!
if index >= len {
err!(None)
}
let next_byte = v[index];
ret[index] = MaybeUninit::new(next_byte);
next_byte
}}}
let first = v[index];
ret[index] = MaybeUninit::new(first);
if first >= 128 {
let w = UTF8_CHAR_WIDTH[first as usize];
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
// first C2 80 last DF BF
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
// first E0 A0 80 last EF BF BF
// excluding surrogates codepoints \u{d800} to \u{dfff}
// ED A0 80 to ED BF BF
// 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
// first F0 90 80 80 last F4 8F BF BF
//
// Use the UTF-8 syntax from the RFC
//
// https://tools.ietf.org/html/rfc3629
// UTF8-1 = %x00-7F
// UTF8-2 = %xC2-DF UTF8-tail
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
// %xF4 %x80-8F 2( UTF8-tail )
match w {
2 => if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(1))
},
3 => {
match (first, next!()) {
(0xE0 , 0xA0 ..= 0xBF) |
(0xE1 ..= 0xEC, 0x80 ..= 0xBF) |
(0xED , 0x80 ..= 0x9F) |
(0xEE ..= 0xEF, 0x80 ..= 0xBF) => {}
_ => err!(Some(1))
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(2))
}
}
4 => {
match (first, next!()) {
(0xF0 , 0x90 ..= 0xBF) |
(0xF1 ..= 0xF3, 0x80 ..= 0xBF) |
(0xF4 , 0x80 ..= 0x8F) => {}
_ => err!(Some(1))
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(2))
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(3))
}
}
_ => err!(Some(1))
}
index += 1;
} else {
// Ascii case, try to skip forward quickly.
// When the pointer is aligned, read 2 words of data per iteration
// until we find a word containing a non-ascii byte.
let ptr = v.as_ptr();
let out_ptr = ret.as_mut_ptr() as *mut u8;
let align = unsafe {
// the offset is safe, because `index` is guaranteed inbounds
ptr.add(index).align_offset(usize_bytes)
};
if align == 0 {
while index < blocks_end {
unsafe {
let block = ptr.add(index) as *const usize;
// break if there is a nonascii byte
let zu = contains_nonascii(*block);
let zv = contains_nonascii(*block.offset(1));
if zu | zv {
break;
}
ptr::copy_nonoverlapping(
block,
out_ptr.add(index) as *mut usize,
2,
);
}
index += ascii_block_size;
}
// step from the point where the wordwise loop stopped
while index < len && v[index] < 128 {
ret[index] = MaybeUninit::new(v[index]);
index += 1;
}
} else {
index += 1;
}
}
}
Ok(unsafe {
// # Safety
//
// 1. ret only contains initialized bytes.
//
// 2. these bytes are valid utf8
let ret =
mem::transmute::<
Vec<MaybeUninit<u8>>,
Vec<u8>,
>(ret)
;
String::from_utf8_unchecked(ret)
})
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment