Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Code for storing Rust Path/PathBuf data as valid UTF-8 (e.g. JSON) strings
/*
Copyright 2018-2020, Stephan Sokolow
This code is released under your choice of the MIT or Apache-2.0 licenses.
https://opensource.org/licenses/MIT
https://opensource.org/licenses/Apache-2.0
*/
use std::borrow::Cow;
use std::ffi::{OsStr, OsString};
use std::str;
// Platform-specific imports
use std::os::unix::ffi::{OsStrExt, OsStringExt};
/// Convert an OS path into a string that is guaranteed to be valid UTF-8
///
/// Accepts anything that can be viewed as an `OsStr` and hands the borrowed view to
/// `escape_path_inner`, which does the actual work.
fn escape_path<P>(path: &P) -> Cow<'_, str>
where
    P: AsRef<OsStr> + ?Sized,
{
    let os_str: &OsStr = path.as_ref();
    escape_path_inner(os_str)
}
/// Inner function for `escape_path` to avoid the risk of monomorphization bloat
///
/// Escaping scheme (`\0` acts as the escape character):
///
/// * A `\0` that is part of valid UTF-8 is written as `\0\0`.
/// * A byte that is not part of valid UTF-8 is written as `\0` followed by the `char`
///   whose code point equals the byte value (e.g. `b"\xe7"` becomes `"\0\u{00e7}"`).
/// * Everything else is passed through unchanged.
///
/// Input that is already valid UTF-8 and contains no `\0` is returned as a borrow of
/// `path`, so the common case performs no allocation or copy.
///
/// Adapted from the example code on the `std::str::Utf8Error` rustdoc page
/// TODO: Support Windows... ideally in a way that results in the same conversion logic
/// as ntfs-3g uses.
///
/// # Panics
///
/// Only if `str::from_utf8` rejects the prefix that a previous `str::from_utf8` call
/// reported as valid, or reports an error length that runs past the end of the input.
/// Both would contradict the documented `Utf8Error` contract.
fn escape_path_inner(path: &OsStr) -> Cow<'_, str> {
    // Append `valid` to `out`, doubling every `\0` so it can be told apart from a `\0`
    // that introduces an escaped byte.
    fn push_escaping_nul(out: &mut String, valid: &str) {
        for u_char in valid.chars() {
            if u_char == '\0' {
                out.push('\0');
            }
            out.push(u_char);
        }
    }

    // In the by-far most common case nothing needs escaping, so borrow the input as-is.
    //
    // Valid UTF-8 which contains `\0` deliberately falls through to the general loop
    // below; the original author's Criterion benchmarks showed that routing that case
    // to `str::replace` instead was slower.
    if let Some(path_str) = path.to_str() {
        if !path_str.contains('\0') {
            return Cow::Borrowed(path_str);
        }
    }

    // In the very uncommon case, build a copy of the string with the escapes applied
    let mut input = path.as_bytes();

    // Preallocate for four escapes
    // (Just a guess, based on four mojibake'd latin1 bytes, two UTF-16 surrogates,
    // or one UTF-32 character)
    let mut result = String::with_capacity(path.len().saturating_add(4));

    loop {
        let error = match str::from_utf8(input) {
            Ok(valid) => {
                // Everything left is valid UTF-8, so only `\0` needs attention
                push_escaping_nul(&mut result, valid);
                break;
            }
            Err(error) => error,
        };

        // Pass through the valid span in front of the offending bytes
        let (valid, after_valid) = input.split_at(error.valid_up_to());
        #[allow(clippy::result_expect_used)]
        let valid_str =
            str::from_utf8(valid).expect("from_utf8 on left-hand output of valid_up_to()");
        push_escaping_nul(&mut result, valid_str);

        // `error_len()` is `None` when the input ended in the middle of a sequence, in
        // which case everything that is left is invalid.
        let invalid_len = error.error_len().unwrap_or(after_valid.len());
        let (invalid, rest) = after_valid.split_at(invalid_len);
        for &byte in invalid {
            result.push('\0');
            result.push(char::from(byte));
        }

        // Step forward to the next span. Once `rest` is empty the next `from_utf8` call
        // succeeds on the empty slice and ends the loop.
        input = rest;
    }

    Cow::Owned(result)
}
/// Reverse `escape_path`, turning an escaped string back into an OS string
///
/// A `\0` marks the following `char` as an escaped byte: that `char` is converted with
/// `as u8` and emitted as a single raw byte, so `\0\0` yields one `\0` byte and
/// `\0\u{00e7}` yields `0xe7`. All other `char`s are emitted as their UTF-8 encoding.
///
/// Input that was not produced by `escape_path` is decoded leniently rather than
/// rejected: a `char` above `U+00FF` following a `\0` is truncated to its low eight bits,
/// and a lone `\0` at the very end of the input is dropped.
///
/// (`allow(dead_code)` because its purpose is to exist on standby and pass unit tests,
/// awaiting the need to unescape my emergency records.)
#[allow(dead_code)]
fn unescape_path(path: &str) -> Cow<'_, OsStr> {
    // Without any \0 there is nothing to undo, so hand back a borrow of the input.
    // (The original author's Criterion runs showed this shortcut to be a large win for
    // the common case at no measurable cost to the escaped case.)
    if path.find('\0').is_none() {
        return Cow::Borrowed(OsStr::new(path));
    }

    // Slow path: rebuild the original byte sequence one `char` at a time
    let mut bytes: Vec<u8> = Vec::with_capacity(path.len());
    let mut scratch = [0_u8; 4];
    let mut remaining = path.chars();
    while let Some(current) = remaining.next() {
        if current != '\0' {
            // Ordinary character: copy its UTF-8 encoding through untouched
            bytes.extend_from_slice(current.encode_utf8(&mut scratch).as_bytes());
            continue;
        }
        // Escape marker: the next `char` stands for exactly one raw byte
        if let Some(escaped) = remaining.next() {
            bytes.push(escaped as u8);
        }
    }
    Cow::Owned(OsString::from_vec(bytes))
}
#[cfg(test)]
mod tests {
    use super::{escape_path, unescape_path};
    use std::ffi::OsString;
    use std::os::unix::ffi::OsStringExt;

    /// Fixture pairs of (raw OS bytes, expected output of `escape_path`)
    const TEST_STRINGS: &[(&[u8], &str)] = &[
        // all valid utf-8
        (b"string with no invalid utf-8", "string with no invalid utf-8"),
        // typical string with invalid utf-8
        (b"/un/fichier/fran\xe7ais", "/un/fichier/fran\0\u{00e7}ais"),
        // starting with invalid utf-8
        (b"\xe7a va", "\0\u{00e7}a va"),
        // invalid span length > 1
        (b"foo\xe7\xe7bar", "foo\0\u{00e7}\0\u{00e7}bar"),
        // only invalid characters
        (b"\xe7\xe7", "\0\u{00e7}\0\u{00e7}"),
        // empty string
        (b"", ""),
        // ending with invalid utf-8 less than 3 characters (see utf8error::error_len)
        (b"foo\xe7", "foo\0\u{00e7}"),
        (b"foo\xe7\xe7", "foo\0\u{00e7}\0\u{00e7}"),
        // ending with invalid utf-8 more than 3 characters (see utf8error::error_len)
        (b"foo\xe7\xe7\xe7\xe7", "foo\0\u{00e7}\0\u{00e7}\0\u{00e7}\0\u{00e7}"),
        // all valid utf-8, but with nulls
        (b"\0string with no\0\0invalid utf-8\0", "\0\0string with no\0\0\0\0invalid utf-8\0\0"),
    ];

    /// Test that escape_path works properly
    #[test]
    fn test_escape_path() {
        for (input, expected) in TEST_STRINGS {
            let os_string = OsString::from_vec(input.to_vec());
            let escaped = escape_path(&os_string);
            assert_eq!(escaped, *expected);
        }
    }

    /// Test that unescape_path is symmetrical to escape_path
    #[test]
    fn test_unescape_path() {
        for (input, expected) in TEST_STRINGS {
            let os_string = OsString::from_vec(input.to_vec());

            // Decode the known-good escaped form directly, so a bug in unescape_path
            // cannot be masked by a matching bug in escape_path.
            assert_eq!(os_string, unescape_path(expected));

            // Then check the full escape -> unescape round trip
            let escaped = escape_path(&os_string);
            assert_eq!(os_string, unescape_path(&escaped));
        }
    }

    /// Test that nulls in valid UTF-8 round-trip successfully
    #[test]
    fn test_null_round_tripping() {
        let test_strings: &[(&[u8], &str)] = &[
            (b"\0foo", "\0\0foo"),
            (b"foo\0bar", "foo\0\0bar"),
            (b"foo\0\0bar", "foo\0\0\0\0bar"),
            (b"foo\0", "foo\0\0"),
            (b"\0foo\0bar\xe7baz\0\0quux\0", "\0\0foo\0\0bar\0\u{00e7}baz\0\0\0\0quux\0\0"),
        ];
        for (in_vec, expected_escaped) in test_strings {
            let os_string = OsString::from_vec(in_vec.to_vec());
            let escaped = escape_path(&os_string);
            assert_eq!(&escaped, expected_escaped);

            let round_tripped = unescape_path(&escaped);
            assert_eq!(os_string, round_tripped);
        }
    }
}
@ssokolow

This comment has been minimized.

Copy link
Owner Author

@ssokolow ssokolow commented Jan 30, 2020

In the interest of interoperable, panic-free storage of POSIX paths in JSON or other UTF-8-requiring formats, I'm also willing to release this under other licenses if you need that.

Bear in mind that, as-is, it assumes it's receiving a path, so it does not escape \0 as \0\0. I'm willing to add that if anyone wants something suitable for arbitrary OsStr and OsString values, which are unlikely to contain a \0 but where one can't be ruled out.

@ssokolow

This comment has been minimized.

Copy link
Owner Author

@ssokolow ssokolow commented Feb 2, 2020

Fixed. It now escapes \0 as \0\0 so it will...

  • Successfully round-trip all valid OsStr/OsString contents.
  • Leave all POSIX filesystem paths which are valid UTF-8 unchanged.
  • Use \0 as an escape character to store b"\xe7" as \0\u{00e7}.
  • Encode \0 as \0\0 so it can be distinguished from use of \0 as an escape character.

This should be fully backwards compatible with serde_json's existing behaviour, since serde_json fails if a Path or PathBuf contains invalid UTF-8.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment