-
-
Save lucacasonato/772e3bbed29db7835d45ea1aa1a6f1c7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// A wrapper type for `String` whose `serde::Deserialize` implementation
/// decodes a JSON string literal lossily.
///
/// `\u` escapes that do not produce a valid character (for example unpaired
/// UTF-16 surrogates) are replaced with U+FFFD REPLACEMENT CHARACTER instead
/// of making deserialization fail.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LossyString(String);
/// Lookup table indexed by byte value. An entry is `true` when the character
/// cannot be copied verbatim while scanning a JSON string body: the control
/// characters `\x00..=\x1F`, the quote `\x22` and the backslash `\x5C`.
/// Every other entry is `false` (allowed unescaped).
static ESCAPE: [bool; 256] = {
    let mut table = [false; 256];

    // Control characters \x00..=\x1F.
    let mut byte: usize = 0x00;
    while byte <= 0x1F {
        table[byte] = true;
        byte += 1;
    }

    table[0x22] = true; // quote
    table[0x5C] = true; // backslash

    table
};
impl<'de> serde::Deserialize<'de> for LossyString { | |
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> | |
where | |
D: serde::Deserializer<'de>, | |
{ | |
macro_rules! tri { | |
($opt:expr) => { | |
match $opt { | |
Some(v) => v, | |
None => { | |
return Err(serde::de::Error::custom("Unexpected end of string")) | |
} | |
} | |
}; | |
} | |
let val = Box::<RawValue>::deserialize(deserializer)?; | |
let mut chars = val.get().chars().peekable(); | |
if tri!(chars.next()) != '"' { | |
return Err(serde::de::Error::custom("Expected string")); | |
} | |
let mut str = String::new(); | |
loop { | |
let ch = tri!(chars.next()); | |
if !ESCAPE[ch as usize] { | |
str.push(ch); | |
continue; | |
} | |
match ch { | |
'"' => { | |
break; | |
} | |
'\\' => match parse_escape(&mut chars, &mut str) { | |
Ok(_) => {} | |
Err(err) => return Err(serde::de::Error::custom(err)), | |
}, | |
_ => { | |
return Err(serde::de::Error::custom("Constrol character in string")); | |
} | |
} | |
} | |
if chars.next() != None { | |
return Err(serde::de::Error::custom("Trailing characters in string")); | |
} | |
Ok(LossyString(str)) | |
} | |
} | |
/// Lookup table indexed by byte value, giving the numeric value of an ASCII
/// hexadecimal digit (`0-9`, `A-F`, `a-f`), or 255 for every byte that is not
/// a hex digit.
static HEX: [u8; 256] = {
    const __: u8 = 255; // not a hex digit
    [
        //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 0
        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 1
        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
        00, 01, 02, 03, 04, 05, 06, 07, 08, 09, __, __, __, __, __, __, // 3
        __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, // 4
        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 5
        __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, // 6
        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
    ]
};

/// Returns the numeric value (0..=15) of the hexadecimal digit `val`, or
/// `None` when `val` is not an ASCII hex digit.
///
/// `val` may be any Unicode scalar value. Characters beyond the 256-entry
/// `HEX` table are reported as `None` rather than indexing out of bounds.
fn decode_hex_val(val: char) -> Option<u16> {
    match HEX.get(val as usize) {
        Some(&digit) if digit != 255 => Some(u16::from(digit)),
        _ => None,
    }
}
fn decode_hex_escape( | |
chars: &mut Peekable<Chars<'_>>, | |
) -> Result<u16, &'static str> { | |
let mut n = 0; | |
for _ in 0..4 { | |
let ch = match chars.next() { | |
Some(ch) => ch, | |
None => return Err("Unexpected end of string"), | |
}; | |
match decode_hex_val(ch) { | |
None => return Err("Invalid hex escape"), | |
Some(val) => { | |
n = (n << 4) + val; | |
} | |
} | |
} | |
Ok(n) | |
} | |
fn parse_escape( | |
chars: &mut Peekable<Chars<'_>>, | |
str: &mut String, | |
) -> Result<(), &'static str> { | |
macro_rules! tri { | |
($opt:expr) => { | |
match $opt { | |
Some(v) => v, | |
None => return Err("Unexpected end of string"), | |
} | |
}; | |
} | |
let ch = tri!(chars.next()); | |
match ch { | |
'"' => str.push('"'), | |
'\\' => str.push('\\'), | |
'/' => str.push('/'), | |
'b' => str.push('\x08'), | |
'f' => str.push('\x0c'), | |
'n' => str.push('\n'), | |
'r' => str.push('\r'), | |
't' => str.push('\t'), | |
'u' => { | |
let c = match decode_hex_escape(chars)? { | |
0xDC00..=0xDFFF => { | |
str.push('\u{FFFD}'); | |
return Ok(()); | |
} | |
// Non-BMP characters are encoded as a sequence of two hex | |
// escapes, representing UTF-16 surrogates. If deserializing a | |
// utf-8 string the surrogates are required to be paired, | |
// whereas deserializing a byte string accepts lone surrogates. | |
n1 @ 0xD800..=0xDBFF => { | |
if *tri!(chars.peek()) == '\\' { | |
chars.next(); | |
} else { | |
str.push('\u{FFFD}'); | |
return Ok(()); | |
} | |
if *tri!(chars.peek()) == 'u' { | |
chars.next(); | |
} else { | |
str.push('\u{FFFD}'); | |
return parse_escape(chars, str); | |
} | |
let n2 = decode_hex_escape(chars)?; | |
if n2 < 0xDC00 || n2 > 0xDFFF { | |
str.push('\u{FFFD}'); | |
} | |
let n = | |
(((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; | |
match char::from_u32(n) { | |
Some(c) => c, | |
None => { | |
str.push('\u{FFFD}'); | |
return Ok(()); | |
} | |
} | |
} | |
n => match char::from_u32(n as u32) { | |
Some(c) => c, | |
None => { | |
str.push('\u{FFFD}'); | |
return Ok(()); | |
} | |
}, | |
}; | |
str.push(c); | |
} | |
_ => { | |
return Err("Invalid escape sequence"); | |
} | |
} | |
Ok(()) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment