japaric/dstr.rs

## dstr.rs
#![feature(macro_rules)]
#![no_implicit_prelude]

use std::iter::Iterator;
use std::option::{None, Option, Some};
use std::slice::ImmutableSlice;
use std::{iter, slice};

/// String slice type, declared as a tuple struct around an unsized `[u8]`.
///
/// References to it are produced by `Str::from_utf8` (validated) and
/// `raw::from_utf8` (unvalidated).
pub struct Str([u8]);

/// External iterator for a string's bytes.
///
/// Alias for a `slice::Items<'a, u8>` wrapped in `iter::Map`; the map's type
/// parameters are `&'a u8` and `u8`, so it presumably turns each borrowed
/// byte into a byte value.
/// Use with the `std::iter` module.
pub type Bytes<'a> = iter::Map<'a, &'a u8, u8, slice::Items<'a, u8>>;

// Free functions
impl Str {
    /// Reinterprets a byte slice as a string slice without allocating.
    ///
    /// The bytes are first checked with `is_utf8`; on success the very same
    /// memory is handed back as a `&Str` rather than a `&[u8]`.
    ///
    /// Returns `None` if the slice is not valid UTF-8.
    pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a Str> {
        if !is_utf8(v) {
            return None;
        }
        // The bytes were validated just above; `raw::from_utf8` itself
        // performs no check.
        Some(unsafe { raw::from_utf8(v) })
    }
}

mod raw {
    use super::Str;
    use std::mem;

    /// Reinterprets `v` as a `&Str` with no UTF-8 validation.
    ///
    /// # Safety
    ///
    /// Nothing here inspects the contents of `v`; a caller that needs the
    /// result to hold valid UTF-8 must have verified that beforehand (as
    /// `Str::from_utf8` does).
    pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a Str {
        let string: &'a Str = mem::transmute(v);
        string
    }
}

/// Reproduction driver: turns the bytes of a string literal into an
/// `Option<&Str>` and calls `is_some()` on it, which is the statement the
/// author marks as triggering the ICE (presumably an internal compiler
/// error).
fn main() {
    let msg = "Hello World!";
    let bytes = {
        // The crate is compiled with `#![no_implicit_prelude]`, so the trait
        // is imported by hand here, presumably to make `as_bytes` callable.
        use std::str::StrSlice;

        msg.as_bytes()
    };
    let str = Str::from_utf8(bytes);
    // ICE trigger
    let _ = str.is_some();
}

// Copied verbatim from `core::str`

/// Selects the six payload bits (`xx11_1111`) of a UTF-8 continuation byte.
const CONT_MASK: u8 = 0x3F;
/// Tag bits (`10xx_xxxx`) carried by every continuation byte; compare them
/// against `byte & !CONT_MASK`.
const TAG_CONT_U8: u8 = 0x80;

// https://tools.ietf.org/html/rfc3629
//
// Lookup table indexed by a byte value. Each entry is the total length, in
// bytes, of the UTF-8 sequence that the byte would start; 0 marks a byte
// that cannot start a sequence (0x80-0xBF, 0xC0-0xC1 and 0xF5-0xFF here).
// The trailing comments give the index of the last entry on that row.
static UTF8_CHAR_WIDTH: [u8, ..256] = [
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];

/// Looks up, in `UTF8_CHAR_WIDTH`, the total number of bytes in the UTF-8
/// character whose first byte is `b`.
#[inline]
pub fn utf8_char_width(b: u8) -> uint {
    let width = UTF8_CHAR_WIDTH[b as uint];
    width as uint
}

/// Reports whether the byte slice `v` is valid UTF-8 from start to end.
pub fn is_utf8(v: &[u8]) -> bool {
    let mut bytes = v.iter();
    run_utf8_validation_iterator(&mut bytes)
}

/// Walks `iter` one code point at a time, checking that the bytes form a
/// valid UTF-8 sequence.
///
/// Returns `true` when the iterator runs out exactly on a code point
/// boundary. Returns `false` at the first malformed or truncated sequence,
/// with `iter` reset so that it points at the first byte of that invalid
/// sequence.
#[inline(always)]
fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
    loop {
        // Snapshot the iterator at the start of this code point so that
        // `err!` can rewind to it.
        let old = *iter;

        // Restore the iterator we had at the start of this code point and
        // report failure.
        macro_rules! err ( () => { {*iter = old; return false} });
        // Fetch the next byte, failing if the input ends mid-sequence.
        macro_rules! next ( () => {
                match iter.next() {
                    Some(a) => *a,
                    // we needed data, but there was none: error!
                    None => err!()
                }
            });

        let first = match iter.next() {
            Some(&b) => b,
            // we're at the end of the iterator and a codepoint
            // boundary at the same time, so this string is valid.
            None => return true
        };

        // ASCII characters are always valid, so only large
        // bytes need more examination.
        if first >= 128 {
            let w = utf8_char_width(first);
            // The second byte is fetched before `w` is inspected. For a lead
            // byte with no valid width this still ends in `err!()`: either
            // here (no more input) or in the catch-all arm of `match w`.
            let second = next!();
            // 2-byte encoding is for codepoints  \u0080 to  \u07ff
            //        first  C2 80        last DF BF
            // 3-byte encoding is for codepoints  \u0800 to  \uffff
            //        first  E0 A0 80     last EF BF BF
            //   excluding surrogates codepoints  \ud800 to  \udfff
            //               ED A0 80 to       ED BF BF
            // 4-byte encoding is for codepoints \u10000 to \u10ffff
            //        first  F0 90 80 80  last F4 8F BF BF
            //
            // Use the UTF-8 syntax from the RFC
            //
            // https://tools.ietf.org/html/rfc3629
            // UTF8-1      = %x00-7F
            // UTF8-2      = %xC2-DF UTF8-tail
            // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
            //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
            // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
            //               %xF4 %x80-8F 2( UTF8-tail )
            match w {
                // The single trailing byte must carry the continuation tag.
                2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()},
                3 => {
                    // The allowed range of the second byte depends on the
                    // first (E0 starts at A0, ED stops at 9F to exclude the
                    // surrogates); the third byte only needs the
                    // continuation tag.
                    match (first, second, next!() & !CONT_MASK) {
                        (0xE0         , 0xA0 ... 0xBF, TAG_CONT_U8) |
                        (0xE1 ... 0xEC, 0x80 ... 0xBF, TAG_CONT_U8) |
                        (0xED         , 0x80 ... 0x9F, TAG_CONT_U8) |
                        (0xEE ... 0xEF, 0x80 ... 0xBF, TAG_CONT_U8) => {}
                        _ => err!()
                    }
                }
                4 => {
                    // Same idea for four bytes: the second byte's range
                    // depends on the first (F0 starts at 90, F4 stops at
                    // 8F); the third and fourth bytes only need the
                    // continuation tag.
                    match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) {
                        (0xF0         , 0x90 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
                        (0xF1 ... 0xF3, 0x80 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
                        (0xF4         , 0x80 ... 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
                        _ => err!()
                    }
                }
                // Any other width: `first` cannot start a sequence.
                _ => err!()
            }
        }
    }
}
	#![feature(macro_rules)]
	#![no_implicit_prelude]

	use std::iter::Iterator;
	use std::option::{None, Option, Some};
	use std::slice::ImmutableSlice;
	use std::{iter, slice};

	/// String slice type, declared as a tuple struct around an unsized `[u8]`.
	///
	/// References to it are produced by `Str::from_utf8` (validated) and
	/// `raw::from_utf8` (unvalidated).
	pub struct Str([u8]);

	/// External iterator for a string's bytes.
	///
	/// Alias for a `slice::Items<'a, u8>` wrapped in `iter::Map`; the map's type
	/// parameters are `&'a u8` and `u8`, so it presumably turns each borrowed
	/// byte into a byte value.
	/// Use with the `std::iter` module.
	pub type Bytes<'a> = iter::Map<'a, &'a u8, u8, slice::Items<'a, u8>>;

	// Free functions
	impl Str {
	    /// Reinterprets a byte slice as a string slice without allocating.
	    ///
	    /// The bytes are first checked with `is_utf8`; on success the very same
	    /// memory is handed back as a `&Str` rather than a `&[u8]`.
	    ///
	    /// Returns `None` if the slice is not valid UTF-8.
	    pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a Str> {
	        if !is_utf8(v) {
	            return None;
	        }
	        // The bytes were validated just above; `raw::from_utf8` itself
	        // performs no check.
	        Some(unsafe { raw::from_utf8(v) })
	    }
	}

	mod raw {
	    use super::Str;
	    use std::mem;

	    /// Reinterprets `v` as a `&Str` with no UTF-8 validation.
	    ///
	    /// # Safety
	    ///
	    /// Nothing here inspects the contents of `v`; a caller that needs the
	    /// result to hold valid UTF-8 must have verified that beforehand (as
	    /// `Str::from_utf8` does).
	    pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a Str {
	        let string: &'a Str = mem::transmute(v);
	        string
	    }
	}

	/// Reproduction driver: turns the bytes of a string literal into an
	/// `Option<&Str>` and calls `is_some()` on it, which is the statement the
	/// author marks as triggering the ICE (presumably an internal compiler
	/// error).
	fn main() {
	let msg = "Hello World!";
	let bytes = {
	// The crate is compiled with `#![no_implicit_prelude]`, so the trait
	// is imported by hand here, presumably to make `as_bytes` callable.
	use std::str::StrSlice;

	msg.as_bytes()
	};
	let str = Str::from_utf8(bytes);
	// ICE trigger
	let _ = str.is_some();
	}

	// Copied verbatim from `core::str`

	/// Selects the six payload bits (`xx11_1111`) of a UTF-8 continuation byte.
	const CONT_MASK: u8 = 0x3F;
	/// Tag bits (`10xx_xxxx`) carried by every continuation byte; compare them
	/// against `byte & !CONT_MASK`.
	const TAG_CONT_U8: u8 = 0x80;

	// https://tools.ietf.org/html/rfc3629
	//
	// Lookup table indexed by a byte value. Each entry is the total length, in
	// bytes, of the UTF-8 sequence that the byte would start; 0 marks a byte
	// that cannot start a sequence (0x80-0xBF, 0xC0-0xC1 and 0xF5-0xFF here).
	// The trailing comments give the index of the last entry on that row.
	static UTF8_CHAR_WIDTH: [u8, ..256] = [
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
	0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
	4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
	];

	/// Looks up, in `UTF8_CHAR_WIDTH`, the total number of bytes in the UTF-8
	/// character whose first byte is `b`.
	#[inline]
	pub fn utf8_char_width(b: u8) -> uint {
	    let width = UTF8_CHAR_WIDTH[b as uint];
	    width as uint
	}

	/// Reports whether the byte slice `v` is valid UTF-8 from start to end.
	pub fn is_utf8(v: &[u8]) -> bool {
	    let mut bytes = v.iter();
	    run_utf8_validation_iterator(&mut bytes)
	}

	/// Walks `iter` one code point at a time, checking that the bytes form a
	/// valid UTF-8 sequence.
	///
	/// Returns `true` when the iterator runs out exactly on a code point
	/// boundary. Returns `false` at the first malformed or truncated sequence,
	/// with `iter` reset so that it points at the first byte of that invalid
	/// sequence.
	#[inline(always)]
	fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
	    loop {
	        // Snapshot the iterator at the start of this code point so that
	        // `err!` can rewind to it.
	        let old = *iter;

	        // Restore the iterator we had at the start of this code point and
	        // report failure.
	        macro_rules! err ( () => { {*iter = old; return false} });
	        // Fetch the next byte, failing if the input ends mid-sequence.
	        macro_rules! next ( () => {
	                match iter.next() {
	                    Some(a) => *a,
	                    // we needed data, but there was none: error!
	                    None => err!()
	                }
	            });

	        let first = match iter.next() {
	            Some(&b) => b,
	            // we're at the end of the iterator and a codepoint
	            // boundary at the same time, so this string is valid.
	            None => return true
	        };

	        // ASCII characters are always valid, so only large
	        // bytes need more examination.
	        if first >= 128 {
	            let w = utf8_char_width(first);
	            // The second byte is fetched before `w` is inspected. For a lead
	            // byte with no valid width this still ends in `err!()`: either
	            // here (no more input) or in the catch-all arm of `match w`.
	            let second = next!();
	            // 2-byte encoding is for codepoints  \u0080 to  \u07ff
	            //        first  C2 80        last DF BF
	            // 3-byte encoding is for codepoints  \u0800 to  \uffff
	            //        first  E0 A0 80     last EF BF BF
	            //   excluding surrogates codepoints  \ud800 to  \udfff
	            //               ED A0 80 to       ED BF BF
	            // 4-byte encoding is for codepoints \u10000 to \u10ffff
	            //        first  F0 90 80 80  last F4 8F BF BF
	            //
	            // Use the UTF-8 syntax from the RFC
	            //
	            // https://tools.ietf.org/html/rfc3629
	            // UTF8-1      = %x00-7F
	            // UTF8-2      = %xC2-DF UTF8-tail
	            // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
	            //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
	            // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
	            //               %xF4 %x80-8F 2( UTF8-tail )
	            match w {
	                // The single trailing byte must carry the continuation tag.
	                2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()},
	                3 => {
	                    // The allowed range of the second byte depends on the
	                    // first (E0 starts at A0, ED stops at 9F to exclude the
	                    // surrogates); the third byte only needs the
	                    // continuation tag.
	                    match (first, second, next!() & !CONT_MASK) {
	                        (0xE0         , 0xA0 ... 0xBF, TAG_CONT_U8) |
	                        (0xE1 ... 0xEC, 0x80 ... 0xBF, TAG_CONT_U8) |
	                        (0xED         , 0x80 ... 0x9F, TAG_CONT_U8) |
	                        (0xEE ... 0xEF, 0x80 ... 0xBF, TAG_CONT_U8) => {}
	                        _ => err!()
	                    }
	                }
	                4 => {
	                    // Same idea for four bytes: the second byte's range
	                    // depends on the first (F0 starts at 90, F4 stops at
	                    // 8F); the third and fourth bytes only need the
	                    // continuation tag.
	                    match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) {
	                        (0xF0         , 0x90 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
	                        (0xF1 ... 0xF3, 0x80 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
	                        (0xF4         , 0x80 ... 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
	                        _ => err!()
	                    }
	                }
	                // Any other width: `first` cannot start a sequence.
	                _ => err!()
	            }
	        }
	    }
	}