Skip to content

Instantly share code, notes, and snippets.

@madig
Last active January 10, 2020 08:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save madig/c56d88abc71705235309cbdaee882035 to your computer and use it in GitHub Desktop.
Save madig/c56d88abc71705235309cbdaee882035 to your computer and use it in GitHub Desktop.
fn quoted_string_literal(&mut self, quote_char: u8) -> Result<Option<Event>, Error> {
let mut acc: Vec<u8> = Vec::new();
let mut cur_char = quote_char;
while {
match self.peeked_char {
// do not stop if the quote is escaped
Some(c) => c != quote_char || cur_char == b'\\',
None => false,
}
} {
// consuming the string itself
match self.advance()? {
Some(c) => {
cur_char = c;
// interpret escaped char
if cur_char == b'\\' {
match self.advance()? {
Some(c) => match c as char {
'0'..='7' => match self.advance()? {
// read 3 chars
Some(c2) => match c as char {
'0'..='7' => match self.advance()? {
Some(c3) => match c as char {
'0'..='7' => {
let p1 = (c as u8) - b'0';
let p2 = (c2 as u8) - b'0';
let p3 = (c3 as u8) - b'0';
let num: u8 = (((p1 << 3) + p2) << 3) + p3;
if num < 128 {
// ASCII character
acc.push(num)
} else {
// character in NextStep encoding
let uchar = NEXT_STEP_DECODING_TABLE
[(num - 128) as usize];
let mut uchar_utf8 = [0u8; 4];
for char_byte in uchar
.encode_utf8(&mut uchar_utf8)
.bytes()
{
acc.push(char_byte)
}
};
cur_char = 0; // clear slash so that e.g. "\377" does not scan past the closing quote.
}
_ => {
return Err(self
.error(ErrorKind::InvalidUtf8AsciiStream))
}
},
None => {
return Err(self.error(ErrorKind::UnclosedString))
}
},
_ => {
return Err(
self.error(ErrorKind::InvalidUtf8AsciiStream)
)
}
},
None => return Err(self.error(ErrorKind::UnclosedString)),
},
'U' => {
let mut uchar_num: u32 = 0;
for _ in 1..=4 {
match self.advance()? {
Some(c) => cur_char = c,
None => {
return Err(
self.error(ErrorKind::InvalidUtf8AsciiStream)
)
}
}
if !cur_char.is_ascii_hexdigit() {
return Err(
self.error(ErrorKind::InvalidUtf8AsciiStream)
);
}
uchar_num = (uchar_num << 4)
+ (cur_char as char).to_digit(16).unwrap();
}
let mut uchar_utf8 = [0u8; 4];
for char_byte in char::from_u32(uchar_num)
.unwrap()
.encode_utf8(&mut uchar_utf8)
.bytes()
{
acc.push(char_byte)
}
}
'a' => acc.push(0x07),
'b' => acc.push(0x08),
'f' => acc.push(0x0c),
'n' => acc.push('\n' as u8),
'r' => acc.push('\r' as u8),
't' => acc.push('\t' as u8),
'v' => acc.push(0x0B),
'"' => acc.push('"' as u8),
'\n' => acc.push('\n' as u8),
_ => acc.push(c as u8),
},
None => return Err(self.error(ErrorKind::UnclosedString)),
};
} else {
acc.push(cur_char)
}
}
None => return Err(self.error(ErrorKind::UnclosedString)),
};
}
// Match the closing quote.
match self.advance()? {
Some(c) => {
if c == quote_char {
let string_literal = String::from_utf8(acc)
.map_err(|_e| self.error(ErrorKind::InvalidUtf8AsciiStream))?;
Ok(Some(Event::String(string_literal)))
} else {
Err(self.error(ErrorKind::UnclosedString))
}
}
None => Err(self.error(ErrorKind::UnclosedString)),
}
}
//
b'"' | b'\'' => return self.quoted_string_literal(c),
//
/// Lookup table from the upper half of the NextStep encoding to Unicode.
///
/// Octal escapes inside quoted plist strings name NextStep character codes. Codes
/// 0x00 - 0x7F coincide with ASCII (and therefore Unicode), so only 0x80 - 0xFF are
/// stored here; index the table with `code - 0x80`. Each row below holds eight
/// consecutive codes, labelled with the code of its first entry.
/// Source: ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/NEXT/NEXTSTEP.TXT
static NEXT_STEP_DECODING_TABLE: [char; 128] = [
    // 0x80
    '\u{A0}', '\u{C0}', '\u{C1}', '\u{C2}', '\u{C3}', '\u{C4}', '\u{C5}', '\u{C7}',
    // 0x88
    '\u{C8}', '\u{C9}', '\u{CA}', '\u{CB}', '\u{CC}', '\u{CD}', '\u{CE}', '\u{CF}',
    // 0x90
    '\u{D0}', '\u{D1}', '\u{D2}', '\u{D3}', '\u{D4}', '\u{D5}', '\u{D6}', '\u{D9}',
    // 0x98
    '\u{DA}', '\u{DB}', '\u{DC}', '\u{DD}', '\u{DE}', '\u{B5}', '\u{D7}', '\u{F7}',
    // 0xA0
    '\u{A9}', '\u{A1}', '\u{A2}', '\u{A3}', '\u{2044}', '\u{A5}', '\u{192}', '\u{A7}',
    // 0xA8
    '\u{A4}', '\u{2019}', '\u{201C}', '\u{AB}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}',
    // 0xB0
    '\u{AE}', '\u{2013}', '\u{2020}', '\u{2021}', '\u{B7}', '\u{A6}', '\u{B6}', '\u{2022}',
    // 0xB8
    '\u{201A}', '\u{201E}', '\u{201D}', '\u{BB}', '\u{2026}', '\u{2030}', '\u{AC}', '\u{BF}',
    // 0xC0
    '\u{B9}', '\u{2CB}', '\u{B4}', '\u{2C6}', '\u{2DC}', '\u{AF}', '\u{2D8}', '\u{2D9}',
    // 0xC8
    '\u{A8}', '\u{B2}', '\u{2DA}', '\u{B8}', '\u{B3}', '\u{2DD}', '\u{2DB}', '\u{2C7}',
    // 0xD0
    '\u{2014}', '\u{B1}', '\u{BC}', '\u{BD}', '\u{BE}', '\u{E0}', '\u{E1}', '\u{E2}',
    // 0xD8
    '\u{E3}', '\u{E4}', '\u{E5}', '\u{E7}', '\u{E8}', '\u{E9}', '\u{EA}', '\u{EB}',
    // 0xE0
    '\u{EC}', '\u{C6}', '\u{ED}', '\u{AA}', '\u{EE}', '\u{EF}', '\u{F0}', '\u{F1}',
    // 0xE8
    '\u{141}', '\u{D8}', '\u{152}', '\u{BA}', '\u{F2}', '\u{F3}', '\u{F4}', '\u{F5}',
    // 0xF0
    '\u{F6}', '\u{E6}', '\u{F9}', '\u{FA}', '\u{FB}', '\u{131}', '\u{FC}', '\u{FD}',
    // 0xF8
    '\u{142}', '\u{F8}', '\u{153}', '\u{DF}', '\u{FE}', '\u{FF}', '\u{FFFD}', '\u{FFFD}',
];
//
/// Parses a dictionary whose values exercise every escape form handled inside quoted
/// strings (escaped quotes, control-character escapes, octal codes, and `\U` code units)
/// and checks the resulting event stream.
#[test]
fn escaped_sequences_in_strings() {
    // A raw string keeps the backslashes intact so the parser sees them verbatim.
    let source = r#"{
key0 = "";
key1 = "va\"lue";
key2 = 'va"lue';
key3 = "va\a\b\f\n\r\t\v\"\nlue";
key4 = "a\012b\200\377";
key5 = "\\UD83D\\UDCA9";
key6 = "\UD83D\UDCA9";
}"#;

    let reader = AsciiReader::new(Cursor::new(source.as_bytes()));
    let parsed: Vec<Event> = reader.map(|event| event.unwrap()).collect();

    // Shorthand for building a string event from a literal.
    let text = |value: &str| String(value.to_owned());
    let expected = vec![
        StartDictionary(None),
        // Empty string.
        text("key0"),
        text(""),
        // Escaped double quote inside a double-quoted string.
        text("key1"),
        text(r#"va"lue"#),
        // Bare double quote inside a single-quoted string.
        text("key2"),
        text(r#"va"lue"#),
        // Control-character escapes.
        text("key3"),
        text("va\u{07}\u{08}\u{0C}\n\r\t\u{0B}\"\nlue"),
        // Octal escapes: ASCII and NextStep-encoded codes.
        text("key4"),
        text("a\nb\u{A0}\u{FFFD}"),
        // Escaped backslashes keep the `\U` text literal.
        text("key5"),
        text("\\UD83D\\UDCA9"),
        // A surrogate pair written as two `\U` escapes.
        text("key6"),
        text("💩"),
        EndCollection,
    ];
    assert_eq!(parsed, expected);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment