Skip to content

Instantly share code, notes, and snippets.

@lucacasonato
Created November 25, 2021 16:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lucacasonato/772e3bbed29db7835d45ea1aa1a6f1c7 to your computer and use it in GitHub Desktop.
Save lucacasonato/772e3bbed29db7835d45ea1aa1a6f1c7 to your computer and use it in GitHub Desktop.
/// A wrapper type for `String` whose `serde::Deserialize` implementation
/// decodes a JSON string lossily: `\u` escapes that encode unpaired UTF-16
/// surrogates are replaced with U+FFFD (the replacement character) rather
/// than being rejected.
pub struct LossyString(String);
/// Lookup table indexed by byte value. An entry is `true` when the byte may
/// not appear verbatim inside a JSON string: the control characters
/// `\x00..=\x1F`, the double quote `\x22` and the backslash `\x5C`. Every
/// other entry is `false` (the byte is allowed unescaped).
static ESCAPE: [bool; 256] = {
    let mut table = [false; 256];

    // Control characters \x00..=\x1F.
    let mut byte = 0usize;
    while byte < 0x20 {
        table[byte] = true;
        byte += 1;
    }

    table[0x22] = true; // quote
    table[0x5C] = true; // backslash
    table
};
impl<'de> serde::Deserialize<'de> for LossyString {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
macro_rules! tri {
($opt:expr) => {
match $opt {
Some(v) => v,
None => {
return Err(serde::de::Error::custom("Unexpected end of string"))
}
}
};
}
let val = Box::<RawValue>::deserialize(deserializer)?;
let mut chars = val.get().chars().peekable();
if tri!(chars.next()) != '"' {
return Err(serde::de::Error::custom("Expected string"));
}
let mut str = String::new();
loop {
let ch = tri!(chars.next());
if !ESCAPE[ch as usize] {
str.push(ch);
continue;
}
match ch {
'"' => {
break;
}
'\\' => match parse_escape(&mut chars, &mut str) {
Ok(_) => {}
Err(err) => return Err(serde::de::Error::custom(err)),
},
_ => {
return Err(serde::de::Error::custom("Constrol character in string"));
}
}
}
if chars.next() != None {
return Err(serde::de::Error::custom("Trailing characters in string"));
}
Ok(LossyString(str))
}
}
/// Lookup table indexed by byte value. An entry holds the numeric value
/// (0..=15) of the ASCII hexadecimal digit that byte represents, or
/// `u8::MAX` (255) when the byte is not a hex digit.
static HEX: [u8; 256] = {
    let mut table = [u8::MAX; 256];

    // '0'..='9' map to 0..=9.
    let mut digit = 0u8;
    while digit < 10 {
        table[(b'0' + digit) as usize] = digit;
        digit += 1;
    }

    // 'A'..='F' and 'a'..='f' both map to 10..=15.
    let mut offset = 0u8;
    while offset < 6 {
        table[(b'A' + offset) as usize] = 10 + offset;
        table[(b'a' + offset) as usize] = 10 + offset;
        offset += 1;
    }

    table
};

/// Returns the numeric value of `val` when it is an ASCII hexadecimal digit
/// (`0-9`, `a-f`, `A-F`), or `None` for any other character.
///
/// The lookup is bounds-checked: `HEX` only has 256 entries, so characters
/// above U+00FF are reported as "not a hex digit" instead of causing an
/// out-of-bounds panic.
fn decode_hex_val(val: char) -> Option<u16> {
    HEX.get(val as usize)
        .copied()
        .filter(|&n| n != u8::MAX)
        .map(u16::from)
}
/// Reads exactly four hexadecimal digits from `chars` and combines them,
/// most significant digit first, into a single 16-bit code unit.
///
/// # Errors
///
/// Returns `"Unexpected end of string"` if the input runs out before four
/// characters were read, and `"Invalid hex escape"` if one of them is not a
/// hex digit.
fn decode_hex_escape(
    chars: &mut Peekable<Chars<'_>>,
) -> Result<u16, &'static str> {
    (0..4).try_fold(0u16, |acc, _| -> Result<u16, &'static str> {
        let ch = chars.next().ok_or("Unexpected end of string")?;
        let digit = decode_hex_val(ch).ok_or("Invalid hex escape")?;
        // Shift the previous digits up by one nibble and append the new one.
        Ok((acc << 4) + digit)
    })
}
/// Decodes one JSON escape sequence and appends the result to `str`.
///
/// `chars` must be positioned just after the introducing backslash. Simple
/// escapes (`\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`) append the
/// character they denote. `\uXXXX` escapes append the encoded character; a
/// high surrogate followed by a `\uXXXX` low surrogate is combined into a
/// single supplementary-plane character.
///
/// Decoding is lossy: every unpaired surrogate is replaced with U+FFFD
/// instead of being reported as an error.
///
/// # Errors
///
/// Returns `"Unexpected end of string"` if the input ends inside the escape,
/// `"Invalid hex escape"` if a `\u` escape contains a non-hex character, and
/// `"Invalid escape sequence"` for an unknown escape character.
fn parse_escape(
    chars: &mut Peekable<Chars<'_>>,
    str: &mut String,
) -> Result<(), &'static str> {
    /// Substituted for every unpaired surrogate.
    const REPLACEMENT: char = '\u{FFFD}';

    // Unwraps an `Option`, bailing out with an "end of string" error on `None`.
    macro_rules! tri {
        ($opt:expr) => {
            match $opt {
                Some(v) => v,
                None => return Err("Unexpected end of string"),
            }
        };
    }

    match tri!(chars.next()) {
        '"' => str.push('"'),
        '\\' => str.push('\\'),
        '/' => str.push('/'),
        'b' => str.push('\x08'),
        'f' => str.push('\x0c'),
        'n' => str.push('\n'),
        'r' => str.push('\r'),
        't' => str.push('\t'),
        'u' => {
            let mut unit = decode_hex_escape(chars)?;
            // Loop so that a code unit which failed to complete a surrogate
            // pair can be reprocessed on its own (it may be an ordinary
            // character or the start of another pair).
            loop {
                match unit {
                    // A low surrogate with no preceding high surrogate.
                    0xDC00..=0xDFFF => {
                        str.push(REPLACEMENT);
                        return Ok(());
                    }
                    // Non-BMP characters are encoded as two consecutive `\u`
                    // escapes forming a UTF-16 surrogate pair. A high
                    // surrogate that is not followed by a low surrogate
                    // escape is lone and becomes U+FFFD.
                    0xD800..=0xDBFF => {
                        if *tri!(chars.peek()) != '\\' {
                            // Next character is not an escape; leave it for
                            // the caller.
                            str.push(REPLACEMENT);
                            return Ok(());
                        }
                        chars.next();
                        if *tri!(chars.peek()) != 'u' {
                            // Some other escape follows; its backslash is
                            // already consumed, so decode it right away.
                            str.push(REPLACEMENT);
                            return parse_escape(chars, str);
                        }
                        chars.next();
                        let low = decode_hex_escape(chars)?;
                        if !(0xDC00..=0xDFFF).contains(&low) {
                            // The high surrogate is lone. Do NOT combine the
                            // two units (`low - 0xDC00` would underflow);
                            // handle the second unit by itself instead.
                            str.push(REPLACEMENT);
                            unit = low;
                            continue;
                        }
                        let code = ((u32::from(unit - 0xD800) << 10)
                            | u32::from(low - 0xDC00))
                            + 0x1_0000;
                        str.push(char::from_u32(code).unwrap_or(REPLACEMENT));
                        return Ok(());
                    }
                    // Any other BMP code unit is a character on its own.
                    _ => {
                        str.push(char::from_u32(u32::from(unit)).unwrap_or(REPLACEMENT));
                        return Ok(());
                    }
                }
            }
        }
        _ => {
            return Err("Invalid escape sequence");
        }
    }
    Ok(())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment