ssokolow/escape_non_utf8_paths.rs

## escape_non_utf8_paths.rs
/* POSIX paths in JSON via escaping which
doesn't alter valid UTF-8 paths.

The trick is recognizing that JSON can store binary nulls in strings
but nulls are the only character that can't occur in POSIX paths,
so we can use it as an escape character that won't change how existing
serialized paths get interpreted.

Copyright 2018-2020, Stephan Sokolow

This code is released under your choice of the MIT or Apache-2.0 licenses.
https://opensource.org/licenses/MIT
https://opensource.org/licenses/Apache-2.0
*/

use std::borrow::Cow;
use std::ffi::{OsStr, OsString};
use std::str;

// Platform-specific imports
use std::os::unix::ffi::{OsStrExt, OsStringExt};

/// Convert an OS path into text that is always valid UTF-8.
///
/// Accepts a reference to anything viewable as an `OsStr`. This generic front-end only
/// performs the `AsRef` conversion; the escaping itself is delegated to the non-generic
/// `escape_path_inner`, whose result is returned unchanged.
fn escape_path<P>(path: &P) -> Cow<'_, str>
where
    P: AsRef<OsStr> + ?Sized,
{
    let os_str: &OsStr = path.as_ref();
    escape_path_inner(os_str)
}

/// Inner function for `escape_path` to avoid the risk of monomorphization bloat
///
/// Escaping scheme (`\0` is the escape character):
///
/// * Valid UTF-8 is passed through unchanged.
/// * Every byte that is not part of valid UTF-8 is emitted as `\0` followed by the `char`
///   whose code point equals that byte's value.
/// * A literal `\0` is emitted as `\0\0` so that it round-trips through `unescape_path`.
///
/// Returns `Cow::Borrowed` (no allocation) when `path` is valid UTF-8 and contains no `\0`,
/// because in that case there is nothing to escape. Otherwise returns a freshly built
/// `Cow::Owned` string.
///
/// # Panics
///
/// Only if the prefix reported by `Utf8Error::valid_up_to` fails to validate, or the reported
/// `Utf8Error::error_len` exceeds the remaining input. Both are treated as impossible, which
/// is why this function does not return a `Result`.
///
/// Adapted from the example code on the `std::str::Utf8Error` rustdoc page
/// TODO: Support Windows... ideally in a way that results in the same conversion logic
///       as ntfs-3g uses.
fn escape_path_inner(path: &OsStr) -> Cow<'_, str> {
    // Append `valid` to `out`, doubling each binary null so it round-trips properly
    // TODO: Try rewriting this into something faster
    fn push_escaping_nulls(out: &mut String, valid: &str) {
        for u_char in valid.chars() {
            if u_char == '\0' {
                out.push('\0');
            }
            out.push(u_char);
        }
    }

    if let Some(path_str) = path.to_str() {
        if !path_str.contains('\0') {
            // In the by-far most common case, a validity check and a scan for \0 are all
            // that is needed, so hand back a borrow of the input instead of copying it.
            //
            // An if/else here which directs "valid but with \0" to str::replace was shown
            // by Criterion to result in a ~33% slowdown for a test string with four \0 in it
            // and a ~45% slowdown for a test string with seven \0 in it.
            return Cow::Borrowed(path_str);
        }
    }

    // In the very uncommon case, make a copy of the string with invalid bytes escaped
    let mut input = path.as_bytes();

    // Preallocate for four escapes
    // (Just a guess, based on four mojibake'd latin1 bytes, two UTF-16 surrogates,
    // or one UTF-32 character)
    let mut result = String::with_capacity(path.len().saturating_add(4));

    loop {
        match str::from_utf8(input) {
            Ok(valid) => {
                // Everything that remains is valid UTF-8, so this is the final span
                push_escaping_nulls(&mut result, valid);
                break;
            }
            Err(error) => {
                // Pass through the valid span
                let (valid, after_valid) = input.split_at(error.valid_up_to());
                #[allow(clippy::result_expect_used)]
                let valid = str::from_utf8(valid)
                    .expect("from_utf8 on left-hand output of valid_up_to()");
                push_escaping_nulls(&mut result, valid);

                // `error_len()` is `None` when the input ends in the middle of a character,
                // in which case everything that is left counts as invalid.
                let invalid_sequence_length = error.error_len().unwrap_or(after_valid.len());
                let (invalid, rest) = after_valid.split_at(invalid_sequence_length);

                // Escape the invalid bytes
                for &byte in invalid {
                    result.push('\0');
                    result.push(char::from(byte));
                }

                // Step forward to the next span. When `rest` is empty the next pass takes
                // the `Ok` arm with an empty string and ends the loop.
                input = rest;
            }
        }
    }

    Cow::Owned(result)
}

/// Reverse `escape_path`: turn an escaped string back into an OS string
///
/// Input without any `\0` is returned as a borrow of `path`. Otherwise a new byte buffer is
/// built: each `\0` marks the following `char` as a raw byte (the `char` is narrowed to `u8`
/// with `as`, so only its low eight bits are kept), and every other `char` is copied as its
/// UTF-8 encoding. A `\0` that is the last `char` of the input has nothing to escape and
/// contributes no output.
///
/// (`allow(dead_code)` because this function is kept on standby and exercised by the unit
///  tests until escaped records actually need to be unescaped.)
#[allow(dead_code)]
fn unescape_path(path: &str) -> Cow<'_, OsStr> {
    if path.contains('\0') {
        // Uncommon case: decode the escape sequences into raw bytes
        // TODO: Come up with a nicer way to do this
        let mut bytes: Vec<u8> = Vec::with_capacity(path.len());
        let mut scratch = [0_u8; 4];

        let mut chars = path.chars();
        while let Some(current) = chars.next() {
            if current != '\0' {
                bytes.extend_from_slice(current.encode_utf8(&mut scratch).as_bytes());
            } else if let Some(escaped) = chars.next() {
                // The char right after the escape marker stands for exactly one byte
                bytes.push(escaped as u8);
            }
        }

        Cow::Owned(OsString::from_vec(bytes))
    } else {
        // By-far most common case: nothing is escaped, so only the \0 scan is paid for.
        // (According to Criterion, skipping the unescaping code here gives a 360%+ speed-up
        // in the common case with no statistically significant change when unescaping is
        // needed.)
        Cow::Borrowed(OsStr::new(path))
    }
}

#[cfg(test)]
mod tests {
    use super::{escape_path, unescape_path};
    use std::ffi::OsString;
    use std::os::unix::ffi::OsStringExt;

    /// Shared fixtures: (raw path bytes, expected output of `escape_path`)
    const TEST_STRINGS: &[(&[u8], &str)] = &[
        // all valid utf-8
        (b"string with no invalid utf-8", "string with no invalid utf-8"),
        // typical string with invalid utf-8
        (b"/un/fichier/fran\xe7ais", "/un/fichier/fran\0\u{00e7}ais"),
        // starting with invalid utf-8
        (b"\xe7a va", "\0\u{00e7}a va"),
        // invalid span length > 1
        (b"foo\xe7\xe7bar", "foo\0\u{00e7}\0\u{00e7}bar"),
        // only invalid characters
        (b"\xe7\xe7", "\0\u{00e7}\0\u{00e7}"),
        // empty string
        (b"", ""),
        // ending with invalid utf-8 less than 3 characters (see utf8error::error_len)
        (b"foo\xe7", "foo\0\u{00e7}"),
        (b"foo\xe7\xe7", "foo\0\u{00e7}\0\u{00e7}"),
        // ending with invalid utf-8 more than 3 characters (see utf8error::error_len)
        (b"foo\xe7\xe7\xe7\xe7", "foo\0\u{00e7}\0\u{00e7}\0\u{00e7}\0\u{00e7}"),
        // all valid utf-8, but with nulls
        (b"\0string with no\0\0invalid utf-8\0", "\0\0string with no\0\0\0\0invalid utf-8\0\0"),
    ];

    /// Build an owned OS string from a raw byte fixture
    fn os_string_from(raw: &[u8]) -> OsString {
        OsString::from_vec(raw.to_vec())
    }

    /// Every shared fixture must escape to exactly its expected string
    #[test]
    fn test_escape_path() {
        for &(raw, expected) in TEST_STRINGS.iter() {
            let original = os_string_from(raw);
            assert_eq!(escape_path(&original), expected);
        }
    }

    /// Unescaping the escaped form of every shared fixture must give back the original bytes
    #[test]
    fn test_unescape_path() {
        for &(raw, _) in TEST_STRINGS.iter() {
            let original = os_string_from(raw);
            let escaped = escape_path(&original);
            assert_eq!(unescape_path(&escaped), original.as_os_str());
        }
    }

    /// Nulls, alone or mixed with invalid bytes, must escape as expected and round-trip
    #[test]
    fn test_null_round_tripping() {
        let fixtures: &[(&[u8], &str)] = &[
            (b"\0foo", "\0\0foo"),
            (b"foo\0bar", "foo\0\0bar"),
            (b"foo\0\0bar", "foo\0\0\0\0bar"),
            (b"foo\0", "foo\0\0"),
            (b"\0foo\0bar\xe7baz\0\0quux\0", "\0\0foo\0\0bar\0\u{00e7}baz\0\0\0\0quux\0\0"),
        ];
        for &(raw, expected_escaped) in fixtures.iter() {
            let original = os_string_from(raw);
            let escaped = escape_path(&original);
            assert_eq!(escaped, expected_escaped);
            assert_eq!(original, unescape_path(&escaped));
        }
    }
}
	/* POSIX paths in JSON via escaping which
	doesn't alter valid UTF-8 paths.

	The trick is recognizing that JSON can store binary nulls in strings
	but nulls are the only character that can't occur in POSIX paths,
	so we can use it as an escape character that won't change how existing
	serialized paths get interpreted.

	Copyright 2018-2020, Stephan Sokolow

	This code is released under your choice of the MIT or Apache-2.0 licenses.
	https://opensource.org/licenses/MIT
	https://opensource.org/licenses/Apache-2.0
	*/

	use std::borrow::Cow;
	use std::ffi::{OsStr, OsString};
	use std::str;

	// Platform-specific imports
	use std::os::unix::ffi::{OsStrExt, OsStringExt};

	/// Convert an OS path into text that is always valid UTF-8.
	///
	/// Accepts a reference to anything viewable as an `OsStr`. This generic front-end only
	/// performs the `AsRef` conversion; the escaping itself is delegated to the non-generic
	/// `escape_path_inner`, whose result is returned unchanged.
	fn escape_path<P>(path: &P) -> Cow<'_, str>
	where
	    P: AsRef<OsStr> + ?Sized,
	{
	    let os_str: &OsStr = path.as_ref();
	    escape_path_inner(os_str)
	}

	/// Inner function for `escape_path` to avoid the risk of monomorphization bloat
	///
	/// Escaping scheme (`\0` is the escape character):
	///
	/// * Valid UTF-8 is passed through unchanged.
	/// * Every byte that is not part of valid UTF-8 is emitted as `\0` followed by the `char`
	///   whose code point equals that byte's value.
	/// * A literal `\0` is emitted as `\0\0` so that it round-trips through `unescape_path`.
	///
	/// Returns `Cow::Borrowed` (no allocation) when `path` is valid UTF-8 and contains no `\0`,
	/// because in that case there is nothing to escape. Otherwise returns a freshly built
	/// `Cow::Owned` string.
	///
	/// # Panics
	///
	/// Only if the prefix reported by `Utf8Error::valid_up_to` fails to validate, or the reported
	/// `Utf8Error::error_len` exceeds the remaining input. Both are treated as impossible, which
	/// is why this function does not return a `Result`.
	///
	/// Adapted from the example code on the `std::str::Utf8Error` rustdoc page
	/// TODO: Support Windows... ideally in a way that results in the same conversion logic
	///       as ntfs-3g uses.
	fn escape_path_inner(path: &OsStr) -> Cow<'_, str> {
	    // Append `valid` to `out`, doubling each binary null so it round-trips properly
	    // TODO: Try rewriting this into something faster
	    fn push_escaping_nulls(out: &mut String, valid: &str) {
	        for u_char in valid.chars() {
	            if u_char == '\0' {
	                out.push('\0');
	            }
	            out.push(u_char);
	        }
	    }

	    if let Some(path_str) = path.to_str() {
	        if !path_str.contains('\0') {
	            // In the by-far most common case, a validity check and a scan for \0 are all
	            // that is needed, so hand back a borrow of the input instead of copying it.
	            //
	            // An if/else here which directs "valid but with \0" to str::replace was shown
	            // by Criterion to result in a ~33% slowdown for a test string with four \0 in it
	            // and a ~45% slowdown for a test string with seven \0 in it.
	            return Cow::Borrowed(path_str);
	        }
	    }

	    // In the very uncommon case, make a copy of the string with invalid bytes escaped
	    let mut input = path.as_bytes();

	    // Preallocate for four escapes
	    // (Just a guess, based on four mojibake'd latin1 bytes, two UTF-16 surrogates,
	    // or one UTF-32 character)
	    let mut result = String::with_capacity(path.len().saturating_add(4));

	    loop {
	        match str::from_utf8(input) {
	            Ok(valid) => {
	                // Everything that remains is valid UTF-8, so this is the final span
	                push_escaping_nulls(&mut result, valid);
	                break;
	            }
	            Err(error) => {
	                // Pass through the valid span
	                let (valid, after_valid) = input.split_at(error.valid_up_to());
	                #[allow(clippy::result_expect_used)]
	                let valid = str::from_utf8(valid)
	                    .expect("from_utf8 on left-hand output of valid_up_to()");
	                push_escaping_nulls(&mut result, valid);

	                // `error_len()` is `None` when the input ends in the middle of a character,
	                // in which case everything that is left counts as invalid.
	                let invalid_sequence_length = error.error_len().unwrap_or(after_valid.len());
	                let (invalid, rest) = after_valid.split_at(invalid_sequence_length);

	                // Escape the invalid bytes
	                for &byte in invalid {
	                    result.push('\0');
	                    result.push(char::from(byte));
	                }

	                // Step forward to the next span. When `rest` is empty the next pass takes
	                // the `Ok` arm with an empty string and ends the loop.
	                input = rest;
	            }
	        }
	    }

	    Cow::Owned(result)
	}

	/// Reverse `escape_path`: turn an escaped string back into an OS string
	///
	/// Input without any `\0` is returned as a borrow of `path`. Otherwise a new byte buffer is
	/// built: each `\0` marks the following `char` as a raw byte (the `char` is narrowed to `u8`
	/// with `as`, so only its low eight bits are kept), and every other `char` is copied as its
	/// UTF-8 encoding. A `\0` that is the last `char` of the input has nothing to escape and
	/// contributes no output.
	///
	/// (`allow(dead_code)` because this function is kept on standby and exercised by the unit
	///  tests until escaped records actually need to be unescaped.)
	#[allow(dead_code)]
	fn unescape_path(path: &str) -> Cow<'_, OsStr> {
	    if path.contains('\0') {
	        // Uncommon case: decode the escape sequences into raw bytes
	        // TODO: Come up with a nicer way to do this
	        let mut bytes: Vec<u8> = Vec::with_capacity(path.len());
	        let mut scratch = [0_u8; 4];

	        let mut chars = path.chars();
	        while let Some(current) = chars.next() {
	            if current != '\0' {
	                bytes.extend_from_slice(current.encode_utf8(&mut scratch).as_bytes());
	            } else if let Some(escaped) = chars.next() {
	                // The char right after the escape marker stands for exactly one byte
	                bytes.push(escaped as u8);
	            }
	        }

	        Cow::Owned(OsString::from_vec(bytes))
	    } else {
	        // By-far most common case: nothing is escaped, so only the \0 scan is paid for.
	        // (According to Criterion, skipping the unescaping code here gives a 360%+ speed-up
	        // in the common case with no statistically significant change when unescaping is
	        // needed.)
	        Cow::Borrowed(OsStr::new(path))
	    }
	}

	#[cfg(test)]
	mod tests {
	    use super::{escape_path, unescape_path};
	    use std::ffi::OsString;
	    use std::os::unix::ffi::OsStringExt;

	    /// Shared fixtures: (raw path bytes, expected output of `escape_path`)
	    const TEST_STRINGS: &[(&[u8], &str)] = &[
	        // all valid utf-8
	        (b"string with no invalid utf-8", "string with no invalid utf-8"),
	        // typical string with invalid utf-8
	        (b"/un/fichier/fran\xe7ais", "/un/fichier/fran\0\u{00e7}ais"),
	        // starting with invalid utf-8
	        (b"\xe7a va", "\0\u{00e7}a va"),
	        // invalid span length > 1
	        (b"foo\xe7\xe7bar", "foo\0\u{00e7}\0\u{00e7}bar"),
	        // only invalid characters
	        (b"\xe7\xe7", "\0\u{00e7}\0\u{00e7}"),
	        // empty string
	        (b"", ""),
	        // ending with invalid utf-8 less than 3 characters (see utf8error::error_len)
	        (b"foo\xe7", "foo\0\u{00e7}"),
	        (b"foo\xe7\xe7", "foo\0\u{00e7}\0\u{00e7}"),
	        // ending with invalid utf-8 more than 3 characters (see utf8error::error_len)
	        (b"foo\xe7\xe7\xe7\xe7", "foo\0\u{00e7}\0\u{00e7}\0\u{00e7}\0\u{00e7}"),
	        // all valid utf-8, but with nulls
	        (b"\0string with no\0\0invalid utf-8\0", "\0\0string with no\0\0\0\0invalid utf-8\0\0"),
	    ];

	    /// Build an owned OS string from a raw byte fixture
	    fn os_string_from(raw: &[u8]) -> OsString {
	        OsString::from_vec(raw.to_vec())
	    }

	    /// Every shared fixture must escape to exactly its expected string
	    #[test]
	    fn test_escape_path() {
	        for &(raw, expected) in TEST_STRINGS.iter() {
	            let original = os_string_from(raw);
	            assert_eq!(escape_path(&original), expected);
	        }
	    }

	    /// Unescaping the escaped form of every shared fixture must give back the original bytes
	    #[test]
	    fn test_unescape_path() {
	        for &(raw, _) in TEST_STRINGS.iter() {
	            let original = os_string_from(raw);
	            let escaped = escape_path(&original);
	            assert_eq!(unescape_path(&escaped), original.as_os_str());
	        }
	    }

	    /// Nulls, alone or mixed with invalid bytes, must escape as expected and round-trip
	    #[test]
	    fn test_null_round_tripping() {
	        let fixtures: &[(&[u8], &str)] = &[
	            (b"\0foo", "\0\0foo"),
	            (b"foo\0bar", "foo\0\0bar"),
	            (b"foo\0\0bar", "foo\0\0\0\0bar"),
	            (b"foo\0", "foo\0\0"),
	            (b"\0foo\0bar\xe7baz\0\0quux\0", "\0\0foo\0\0bar\0\u{00e7}baz\0\0\0\0quux\0\0"),
	        ];
	        for &(raw, expected_escaped) in fixtures.iter() {
	            let original = os_string_from(raw);
	            let escaped = escape_path(&original);
	            assert_eq!(escaped, expected_escaped);
	            assert_eq!(original, unescape_path(&escaped));
	        }
	    }
	}