Skip to content

Instantly share code, notes, and snippets.

@pstch
Created February 17, 2023 16:29
Show Gist options
  • Save pstch/81c28d81c9b4f4e04d6d55cc2c711067 to your computer and use it in GitHub Desktop.
Save pstch/81c28d81c9b4f4e04d6d55cc2c711067 to your computer and use it in GitHub Desktop.
compact serialization for mail-parser
use serde::{Serialize, Deserialize};
use std::borrow::Cow;
use mail_parser::{Message, MessagePart, MessagePartId, PartType, Header, Encoding as MimeEncoding, MimeHeaders};
use mail_parser::decoders::{base64::base64_decode, quoted_printable::quoted_printable_decode};
use encoding_rs::{Encoding as TextEncoding, UTF_8};
// TODO:
// - add criterion benchmark, especially to compare deserialization+conversion
// to Message::parse
// - add error handling, especially in charset and MIME type handling
// - make Header fully owned at the type level, so that we can make CompactMessage
// self-contained (without any dynamic references)
// + currently, instances of our structures are always self-contained, because
// header is cloned when building CompactMessagePart, however this could be skipped
// using custom Header/HeaderName/HeaderValue types that can be converted from/to the
// not-always-owning original types
// - provide a feature to allow to reduce heap allocations using smallvec,
// so that we can check in real usage if it brings any improvements
// - add a test harness that allows testing with a large number of emails in parallel
// + to quickly test large quantities of mail
// + to simulate real-world conditions
// 0. Decoders
// ===========================================================================
/// Decodes `data` from `charset` into UTF-8 text.
///
/// The first element of the tuple returned by `charset.decode` (the decoded
/// text) is kept; the detected-encoding and had-errors flags are discarded.
///
/// When `data` is borrowed, the decoder's own `Cow` is returned as-is, so no
/// copy is made if the decoder was able to borrow from the input. When `data`
/// is owned, the decoded text has to be copied into an owned `String`,
/// because the decoder output would otherwise borrow from a buffer that is
/// dropped when this function returns.
fn charset_decode<'a>(data: Cow<'a, [u8]>, charset: &'static TextEncoding) -> Cow<'a, str> {
    match data {
        // `bytes` lives for `'a`, so the decoder result can be returned directly.
        Cow::Borrowed(bytes) => charset.decode(bytes).0,
        // The buffer is local to this call: detach the result from it.
        Cow::Owned(bytes) => Cow::Owned(charset.decode(&bytes).0.into_owned()),
    }
}
/// Undoes the MIME content-transfer-encoding of `data`.
///
/// * `MimeEncoding::None` returns `data` untouched (no copy).
/// * `MimeEncoding::Base64` / `MimeEncoding::QuotedPrintable` return the
///   decoded bytes as an owned buffer.
///
/// If the decoder rejects the input, the raw bytes are returned unchanged
/// instead of panicking: message bodies are untrusted input and a malformed
/// part must not abort the whole conversion.
/// NOTE(review): this is believed to match what mail-parser itself stores for
/// parts it flags with `is_encoding_problem` — confirm against the parser.
fn mime_decode(data: Cow<[u8]>, encoding: MimeEncoding) -> Cow<[u8]> {
    match encoding {
        MimeEncoding::None => data,
        MimeEncoding::Base64 => base64_decode(&data).map_or(data, Cow::Owned),
        MimeEncoding::QuotedPrintable => {
            quoted_printable_decode(&data).map_or(data, Cow::Owned)
        }
    }
}
// 1. Wrapping structures
// ===========================================================================
/// Body-free counterpart of [`Message`].
///
/// It carries the same part-id lists as a parsed message, but stores its
/// parts as [`CompactMessagePart`] values and has no `raw_message` field, so
/// the message structure can be serialized independently of the message body
/// and re-attached to the raw bytes later.
///
/// Field order is significant for positional serialization formats; do not
/// reorder the fields.
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
pub struct CompactMessage<'a> {
    /// Copy of `Message::html_body`.
    pub html_body: Vec<MessagePartId>,
    /// Copy of `Message::text_body`.
    pub text_body: Vec<MessagePartId>,
    /// Copy of `Message::attachments`.
    pub attachments: Vec<MessagePartId>,
    /// One compact entry per element of `Message::parts`, in the same order.
    pub parts: Vec<CompactMessagePart<'a>>,
}
/// Body-free counterpart of [`MessagePart`].
///
/// Compared to the original type it:
/// * adds a `charset` field recording the text encoding of the part,
/// * keeps the `encoding` field so that it is serialized with the structure,
/// * stores only the *kind* of the part ([`CompactPartType`]) instead of a
///   typed reference to the part's body.
///
/// Field order is significant for positional serialization formats; do not
/// reorder the fields.
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
pub struct CompactMessagePart<'a> {
    /// Headers of the part.
    pub headers: Vec<Header<'a>>,
    /// Copy of `MessagePart::is_encoding_problem`.
    pub is_encoding_problem: bool,
    /// Kind of the part, without its body.
    pub part_type: CompactPartType<'a>,
    /// Text encoding handed to the charset decoder when a text or HTML body
    /// is rebuilt from the raw message.
    pub charset: &'static TextEncoding,
    /// MIME transfer encoding applied to the body inside the raw message.
    pub encoding: MimeEncoding,
    /// Copy of `MessagePart::offset_header`.
    pub offset_header: usize,
    /// Start of the body inside the raw message (inclusive slice bound).
    pub offset_body: usize,
    /// End of the body inside the raw message (exclusive slice bound).
    pub offset_end: usize,
}
/// Body-free counterpart of [`PartType`].
///
/// Every variant that would hold a slice of the source message in
/// [`PartType`] is reduced to a unit variant here; the body is recovered
/// later from the raw message using the offsets stored on the owning
/// [`CompactMessagePart`].
///
/// Variant order is significant for serialization formats that encode the
/// variant index; do not reorder the variants.
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
pub enum CompactPartType<'a> {
    /// Stands in for `PartType::Text`.
    Text,
    /// Stands in for `PartType::Html`.
    Html,
    /// Stands in for `PartType::Binary`.
    Binary,
    /// Stands in for `PartType::InlineBinary`.
    InlineBinary,
    /// Nested message, itself stored in compact form.
    Message(CompactMessage<'a>),
    /// Ids of the sub-parts of a multipart container.
    Multipart(Vec<MessagePartId>),
}
// 2. Wrapping impls
// ===========================================================================
impl<'a, 'b> CompactMessage<'a> {
    /// Builds the compact, body-free representation of `message`.
    ///
    /// The part-id lists are cloned and every part is converted with
    /// [`CompactMessagePart::from_message_part`].
    pub fn from_message(message: &'b Message<'b>) -> Self {
        Self {
            html_body: message.html_body.clone(),
            text_body: message.text_body.clone(),
            attachments: message.attachments.clone(),
            parts: message
                .parts
                .iter()
                .map(CompactMessagePart::from_message_part)
                .collect(),
        }
    }

    /// Rebuilds a full [`Message`] by re-attaching this structure to
    /// `raw_message`.
    ///
    /// `raw_message` is expected to be the same bytes the structure was
    /// built from: part bodies are recovered by slicing it with the stored
    /// offsets. The returned message borrows `raw_message` and does not
    /// borrow from `self`.
    pub fn to_message(&'a self, raw_message: &'b [u8]) -> Message<'b> {
        Message {
            html_body: self.html_body.clone(),
            text_body: self.text_body.clone(),
            attachments: self.attachments.clone(),
            // `&[u8]` is `Copy`; it is passed along directly, no clone needed.
            parts: self
                .parts
                .iter()
                .map(|part| part.to_message_part(raw_message))
                .collect(),
            raw_message: Cow::from(raw_message),
        }
    }
}
impl<'a, 'b> CompactMessagePart<'a> {
    /// Builds the compact, body-free representation of `part`.
    ///
    /// Headers are cloned and converted to their owned form, so the result
    /// holds no reference into the parsed message.
    ///
    /// The text encoding is resolved from the `charset` attribute of the
    /// part's Content-Type header. If the header or attribute is missing, or
    /// the label is not recognised by `encoding_rs`, UTF-8 is used.
    pub fn from_message_part(part: &'b MessagePart<'b>) -> Self {
        // The charset label lives on Content-Type, not on
        // Content-Transfer-Encoding: the latter holds values such as
        // "base64", which are never valid charset labels.
        let charset = part
            .content_type()
            .and_then(|content_type| content_type.attribute("charset"))
            .and_then(|label| TextEncoding::for_label(label.as_bytes()))
            .unwrap_or(UTF_8);

        Self {
            headers: part
                .headers
                .iter()
                .map(|header| header.clone().into_owned())
                .collect(),
            is_encoding_problem: part.is_encoding_problem,
            encoding: part.encoding,
            part_type: CompactPartType::from_part_type(&part.body),
            charset,
            offset_header: part.offset_header,
            offset_body: part.offset_body,
            offset_end: part.offset_end,
        }
    }

    /// Rebuilds a full [`MessagePart`] by re-attaching this structure to
    /// `raw_message`.
    ///
    /// The body is reconstructed by [`CompactPartType::to_part_type`] from
    /// the stored offsets, transfer encoding and charset.
    pub fn to_message_part(&self, raw_message: &'b [u8]) -> MessagePart<'b> {
        let offsets = (self.offset_header, self.offset_body, self.offset_end);
        MessagePart {
            headers: self
                .headers
                .iter()
                .map(|header| header.clone().into_owned())
                .collect(),
            is_encoding_problem: self.is_encoding_problem,
            encoding: self.encoding,
            body: self
                .part_type
                .to_part_type(raw_message, offsets, self.encoding, self.charset),
            offset_header: self.offset_header,
            offset_body: self.offset_body,
            offset_end: self.offset_end,
        }
    }
}
impl<'a, 'b> CompactPartType<'a> {
    /// Reduces `part_type` to its body-free form.
    ///
    /// Body-carrying variants become unit variants, a nested message is
    /// converted recursively and a multipart id list is cloned.
    pub fn from_part_type(part_type: &'b PartType) -> Self {
        match part_type {
            PartType::Text(_) => Self::Text,
            PartType::Html(_) => Self::Html,
            PartType::Binary(_) => Self::Binary,
            PartType::InlineBinary(_) => Self::InlineBinary,
            PartType::Message(message) => Self::Message(CompactMessage::from_message(message)),
            PartType::Multipart(parts) => Self::Multipart(parts.clone()),
        }
    }

    /// Rebuilds the full [`PartType`] from `raw_message`.
    ///
    /// `offsets` is `(offset_header, offset_body, offset_end)`; only the
    /// body bounds are used. For text, HTML and binary variants the body
    /// slice `raw_message[offset_body..offset_end]` is transfer-decoded with
    /// `encoding`; text and HTML are additionally decoded with `charset`.
    /// Inline binary parts are transfer-decoded exactly like regular binary
    /// parts.
    ///
    /// # Panics
    ///
    /// Panics for body-carrying variants if the body offsets do not describe
    /// a valid range of `raw_message`. `Message` and `Multipart` never slice
    /// the raw message themselves.
    pub fn to_part_type(
        &'a self,
        raw_message: &'b [u8],
        offsets: (usize, usize, usize),
        encoding: MimeEncoding,
        charset: &'static TextEncoding,
    ) -> PartType<'b> {
        let (_, offset_body, offset_end) = offsets;
        // Evaluated lazily so that variants without a body never slice.
        let decoded_body = move || -> Cow<'b, [u8]> {
            mime_decode(Cow::Borrowed(&raw_message[offset_body..offset_end]), encoding)
        };
        match self {
            Self::Text => PartType::Text(charset_decode(decoded_body(), charset)),
            Self::Html => PartType::Html(charset_decode(decoded_body(), charset)),
            Self::Binary => PartType::Binary(decoded_body()),
            Self::InlineBinary => PartType::InlineBinary(decoded_body()),
            Self::Message(message) => PartType::Message(message.to_message(raw_message)),
            Self::Multipart(parts) => PartType::Multipart(parts.clone()),
        }
    }
}
// 5. Entry points
// ===========================================================================
/// Serializes the structure of `message` (without its body) with bincode.
///
/// # Panics
///
/// Panics if bincode fails to serialize the compact structure.
pub fn serialize_message_structure(message: &Message) -> Vec<u8> {
    let compact = CompactMessage::from_message(message);
    bincode::serialize(&compact).unwrap()
}
pub fn deserialize_message_structure<'a, 'b>(data: &'a [u8], raw_message: &'b [u8]) -> Message<'b> {
CompactMessage::to_message(&bincode::deserialize(data).unwrap(), raw_message)
}
// 6. Tests and benchmarks
// ===========================================================================
#[cfg(test)]
mod tests {
    use super::{deserialize_message_structure, serialize_message_structure};
    use mail_parser::Message;

    /// Round-trip check: parsing a sample message, serializing its structure
    /// and rebuilding it from the serialized structure plus the raw bytes
    /// must yield a message equal to the directly parsed one.
    #[test]
    fn test_sample_message() {
        // NOTE(review): this fixture has no blank lines between headers and
        // bodies and no indentation on folded header lines; it looks like
        // whitespace was lost when the file was copied. Confirm against the
        // original sample before relying on it.
        let input = br#"From: Art Vandelay <art@vandelay.com> (Vandelay Industries)
To: "Colleagues": "James Smythe" <james@vandelay.com>; Friends:
jane@example.com, =?UTF-8?Q?John_Sm=C3=AEth?= <john@example.com>;
Date: Sat, 20 Nov 2021 14:22:01 -0800
Subject: Why not both importing AND exporting? =?utf-8?b?4pi6?=
Content-Type: multipart/mixed; boundary="festivus";
--festivus
Content-Type: text/html; charset="us-ascii"
Content-Transfer-Encoding: base64
PGh0bWw+PHA+SSB3YXMgdGhpbmtpbmcgYWJvdXQgcXVpdHRpbmcgdGhlICZsZHF1bztle
HBvcnRpbmcmcmRxdW87IHRvIGZvY3VzIGp1c3Qgb24gdGhlICZsZHF1bztpbXBvcnRpbm
cmcmRxdW87LDwvcD48cD5idXQgdGhlbiBJIHRob3VnaHQsIHdoeSBub3QgZG8gYm90aD8
gJiN4MjYzQTs8L3A+PC9odG1sPg==
--festivus
Content-Type: message/rfc822
From: "Cosmo Kramer" <kramer@kramerica.com>
Subject: Exporting my book about coffee tables
Content-Type: multipart/mixed; boundary="giddyup";
--giddyup
Content-Type: text/plain; charset="utf-16"
Content-Transfer-Encoding: quoted-printable
=FF=FE=0C!5=D8"=DD5=D8)=DD5=D8-=DD =005=D8*=DD5=D8"=DD =005=D8"=
=DD5=D85=DD5=D8-=DD5=D8,=DD5=D8/=DD5=D81=DD =005=D8*=DD5=D86=DD =
=005=D8=1F=DD5=D8,=DD5=D8,=DD5=D8(=DD =005=D8-=DD5=D8)=DD5=D8"=
=DD5=D8=1E=DD5=D80=DD5=D8"=DD!=00
--giddyup
Content-Type: image/gif; name*1="about "; name*0="Book ";
name*2*=utf-8''%e2%98%95 tables.gif
Content-Transfer-Encoding: Base64
Content-Disposition: attachment
R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
--giddyup--
--festivus--
"#;
        let original = Message::parse(input).unwrap();
        let serialized = serialize_message_structure(&original);
        let message = deserialize_message_structure(&serialized, input);
        assert_eq!(original, message);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment