pv2b/rfc1342.kql

## rfc1342.kql
// This is a terrible, imperfect implementation of RFC1342 (MIME encoded-words, since superseded by RFC 2047)
// in KQL, used to convert subject headers as logged by postfix into readable text.
// This should never have been written. Bask in its terribleness.
//
// Input : filteredSyslogs, read through its SyslogMessage column.
//         NOTE(review): QueueId is used in the final summarize but is not extracted in this statement,
//         so it presumably already exists as a column of filteredSyslogs - confirm against its definition.
// Output: one row per QueueId with a single Subject column holding the reassembled subject text.
let subjects = filteredSyslogs
// Cheap term filter first, so the regex parse below only runs on "header Subject" lines.
| where SyslogMessage has "info: header Subject: "
// RawSubject is everything between "header Subject: " and the trailing " from host[address]" part.
| parse kind=regex SyslogMessage with "postfix[/]cleanup[[]" * ": info: header Subject: " RawSubject " from [a-z0-9-.]+[[][0-9a-f:.]+[]]" *
// This regex extracts rfc1342 "words" from the subject line, as well as "non-words". This is needed because
// there's no way in KQL to grab the strings between matches. This also allows for the case where an RFC1342 header isn't
// properly terminated... and also the case where there's some extra junk after the last ?=.
// Each match yields four captures: [0] the whole piece, [1] charset, [2] encoding, [3] encoded payload.
// For plain-text pieces (second alternative) captures 1-3 are empty.
// The encoding marker is matched case-insensitively ([BbQq]): the encoding token is case-independent,
// so "=?utf-8?q?...?=" is just as valid as "=?UTF-8?Q?...?=".
| extend Match=extract_all(@"((?:=\?([^?]{1,40})\?([BbQq])\?([^?]*)\??=?[^\s]*\s*)|.[^=]*)", RawSubject)
| mv-apply Match on (
    project RawSubjectPart = tostring(Match[0]), Charset = tostring(Match[1]), Encoding = tostring(Match[2]), EncodedData = tostring(Match[3])
    // Normalise both encodings to base64 so that a single decode step below can handle either.
    | extend Base64 = iff(
        // Case-insensitive comparison, matching the case-insensitive marker accepted by the regex above.
        Encoding =~ "Q",
        base64_encode_tostring(
            // We abuse url_decode to decode quoted-printable strings...
            url_decode(
                // ... by performing this tomfoolery to convert quoted-printable strings into URL encoding:
                //   "+" -> "%2b"  keep a literal plus from being read as an encoded space
                //   "_" -> "+"    Q-encoding writes space as underscore
                //   "%" -> "%25"  keep a literal percent sign literal
                //   "=" -> "%"    "=XX" hex escapes become "%XX"
                // NOTE(review): this relies on replace_strings() applying all pairs in a single pass
                // (otherwise the "%" rule would re-escape the "%2b" produced by the first rule) and on
                // url_decode() turning "+" into a space - confirm both.
                replace_strings(
                    EncodedData,
                    dynamic(["+",   "_", "%"  , "="]),
                    dynamic(["%2b", "+", "%25", "%"])
                )
            )
        ),
        // This part truncates the base64 string so it's a multiple of 4 bytes long.
        // This deals with the scenario where the headers are truncated in the middle of a
        // base64 sequence, which would otherwise cause base64_decode_* to return nothing
        // at all.
        extract(@"([A-Za-z0-9+/=]{4})*", 0, EncodedData)
    )
    | extend Decoded =
        // This works because base64_decode_tostring() specifies UTF-8.
        iff(Charset=~"UTF-8", base64_decode_tostring(Base64),
        // This "kinda" works because unicode codepoints 128-255 map exactly to ISO-8859-1's corresponding bytes,
        // and because ISO-8859-15 and Windows-1252 are "close enough" to ISO-8859-1 that most things decode fine.
        iff(Charset=~"Windows-1252" or Charset=~"ISO-8859-1" or Charset=~"ISO-8859-15", unicode_codepoints_to_string(base64_decode_toarray(Base64)),
        // Plain-text pieces and unsupported charsets are passed through unchanged.
        RawSubjectPart))
// Glue the decoded pieces of each message back together into one Subject string.
) | summarize Subject=strcat_array(make_list(Decoded), "") by QueueId;
	// This is a terrible, imperfect implementation of RFC1342 (MIME encoded-words, since superseded by RFC 2047)
	// in KQL, used to convert subject headers as logged by postfix into readable text.
	// This should never have been written. Bask in its terribleness.
	//
	// NOTE(review): this statement repeats the `let subjects` definition that appears earlier in this file;
	// presumably only one of the two copies should be kept - confirm before running the query.
	//
	// Input : filteredSyslogs, read through its SyslogMessage column.
	//         NOTE(review): QueueId is used in the final summarize but is not extracted in this statement,
	//         so it presumably already exists as a column of filteredSyslogs - confirm against its definition.
	// Output: one row per QueueId with a single Subject column holding the reassembled subject text.
	let subjects = filteredSyslogs
	// Cheap term filter first, so the regex parse below only runs on "header Subject" lines.
	| where SyslogMessage has "info: header Subject: "
	// RawSubject is everything between "header Subject: " and the trailing " from host[address]" part.
	| parse kind=regex SyslogMessage with "postfix[/]cleanup[[]" * ": info: header Subject: " RawSubject " from [a-z0-9-.]+[[][0-9a-f:.]+[]]" *
	// This regex extracts rfc1342 "words" from the subject line, as well as "non-words". This is needed because
	// there's no way in KQL to grab the strings between matches. This also allows for the case where an RFC1342 header isn't
	// properly terminated... and also the case where there's some extra junk after the last ?=.
	// Each match yields four captures: [0] the whole piece, [1] charset, [2] encoding, [3] encoded payload.
	// For plain-text pieces (second alternative) captures 1-3 are empty.
	// The encoding marker is matched case-insensitively ([BbQq]): the encoding token is case-independent,
	// so "=?utf-8?q?...?=" is just as valid as "=?UTF-8?Q?...?=".
	| extend Match=extract_all(@"((?:=\?([^?]{1,40})\?([BbQq])\?([^?]*)\??=?[^\s]*\s*)|.[^=]*)", RawSubject)
	| mv-apply Match on (
	    project RawSubjectPart = tostring(Match[0]), Charset = tostring(Match[1]), Encoding = tostring(Match[2]), EncodedData = tostring(Match[3])
	    // Normalise both encodings to base64 so that a single decode step below can handle either.
	    | extend Base64 = iff(
	        // Case-insensitive comparison, matching the case-insensitive marker accepted by the regex above.
	        Encoding =~ "Q",
	        base64_encode_tostring(
	            // We abuse url_decode to decode quoted-printable strings...
	            url_decode(
	                // ... by performing this tomfoolery to convert quoted-printable strings into URL encoding:
	                //   "+" -> "%2b"  keep a literal plus from being read as an encoded space
	                //   "_" -> "+"    Q-encoding writes space as underscore
	                //   "%" -> "%25"  keep a literal percent sign literal
	                //   "=" -> "%"    "=XX" hex escapes become "%XX"
	                // NOTE(review): this relies on replace_strings() applying all pairs in a single pass
	                // (otherwise the "%" rule would re-escape the "%2b" produced by the first rule) and on
	                // url_decode() turning "+" into a space - confirm both.
	                replace_strings(
	                    EncodedData,
	                    dynamic(["+", "_", "%" , "="]),
	                    dynamic(["%2b", "+", "%25", "%"])
	                )
	            )
	        ),
	        // This part truncates the base64 string so it's a multiple of 4 bytes long.
	        // This deals with the scenario where the headers are truncated in the middle of a
	        // base64 sequence, which would otherwise cause base64_decode_* to return nothing
	        // at all.
	        extract(@"([A-Za-z0-9+/=]{4})*", 0, EncodedData)
	    )
	    | extend Decoded =
	        // This works because base64_decode_tostring() specifies UTF-8.
	        iff(Charset=~"UTF-8", base64_decode_tostring(Base64),
	        // This "kinda" works because unicode codepoints 128-255 map exactly to ISO-8859-1's corresponding bytes,
	        // and because ISO-8859-15 and Windows-1252 are "close enough" to ISO-8859-1 that most things decode fine.
	        iff(Charset=~"Windows-1252" or Charset=~"ISO-8859-1" or Charset=~"ISO-8859-15", unicode_codepoints_to_string(base64_decode_toarray(Base64)),
	        // Plain-text pieces and unsupported charsets are passed through unchanged.
	        RawSubjectPart))
	// Glue the decoded pieces of each message back together into one Subject string.
	) | summarize Subject=strcat_array(make_list(Decoded), "") by QueueId;