Last active
September 9, 2024 16:06
-
-
Save matwerber1/4f62f67999dcfde6548aaf87d748383c to your computer and use it in GitHub Desktop.
Example SQL for Gmail Logs data written to BigQuery daily_ table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- I was helping a company using Google Workspace for email, and they were having challenges | |
-- with email deliverability due to domain reputation. While it's true that gmail provides | |
-- provides detailed logs, I found their docs to be only marginaly helpful and lacking | |
-- more robust, real-world ways to interpret the data. I also struggled to find quality examples | |
-- when searching unofficial sources (blogs, etc.). | |
-- | |
-- The query below is my best effort attempt to translate gmail logs into something half-way usable. | |
-- I'd be shocked if there weren't mistakes or areas for improvement, but either way, sharing in | |
-- the hope that it at least helps give a jumping-off point for anyone else that finds themselves | |
-- saying "OK, I enabled Gmail Logs for BigQuery.... now what?". | |
-- | |
-- I added the comments in this doc many months later, so going off of memory and assumptions. | |
-- | |
CREATE OR REPLACE TABLE `gmail_logs_dataset`.gmail_table_for_reporting AS | |
WITH detail as ( | |
SELECT | |
event_info.success as event_success, | |
message_info.rfc2822_message_id as message_id, | |
-- When we send or receive a single email, Gmail creates a log entry for *each step* | |
-- in the workflow *for each recipient* in the To: and CC: list. Depending on the question | |
-- you're asking, you'll want to know whether a row represents a first, last, or in-between | |
-- step for a given message and recipient, which is why we partition by message ID and | |
-- destination address, order lines within each partition by the event timestamp (which I hope | |
-- is a reliable way to get the proper event order), and add a "line number" with dense rank. | |
dense_rank() OVER ( | |
PARTITION BY | |
message_info.rfc2822_message_id, | |
destination.address | |
ORDER BY | |
event_info.timestamp_usec ASC | |
) AS message_line_number, | |
-- for each row of the same email message, they all have different timestamps. If we want to | |
-- aggregate emails by a given date, hour, min, etc., we want to make sure that all table rows | |
-- that relate to the same overall email use the same timestamp when we later use a date parse func. | |
TIMESTAMP_MICROS( | |
FIRST_VALUE(event_info.timestamp_usec) OVER ( | |
PARTITION BY | |
message_info.rfc2822_message_id | |
ORDER BY | |
event_info.timestamp_usec ASC | |
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING | |
) | |
) AS message_first_event_time, | |
-- fields below are variations on the same concept above. | |
TIMESTAMP_MICROS( | |
LAST_VALUE(event_info.timestamp_usec) OVER ( | |
PARTITION BY message_info.rfc2822_message_id ORDER BY event_info.timestamp_usec ASC | |
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) | |
) | |
AS message_last_event_time, | |
TIMESTAMP_MICROS( | |
FIRST_VALUE(event_info.timestamp_usec) OVER ( | |
PARTITION BY message_info.rfc2822_message_id, destination.address | |
ORDER BY event_info.timestamp_usec ASC | |
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING | |
) | |
) AS recipient_first_timestamp, | |
TIMESTAMP_MICROS( | |
LAST_VALUE(event_info.timestamp_usec) OVER ( | |
PARTITION BY message_info.rfc2822_message_id, destination.address | |
ORDER BY event_info.timestamp_usec ASC | |
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING | |
) | |
) AS recipient_last_timestamp, | |
TIMESTAMP_MICROS(event_info.timestamp_usec) AS message_id_line_item_timestamp, | |
-- rather than trying to parse destination and source email addresses by domain name and | |
-- manually adding rules (tedious if you have multiple domains), this is a definitive way | |
-- to know whether the email was sent by someone in your Google Workspace (or an app acting | |
-- on their behalf) | |
IF(message_info.is_policy_check_for_sender = true, 'outbound','inbound') AS mail_direction, | |
-- do all of the to and from email addresses belong to domain(s) owned in your Google Workspace? | |
CASE | |
WHEN message_info.connection_info.is_internal OR message_info.connection_info.is_intra_domain | |
THEN true | |
ELSE false | |
END AS is_internal, | |
-- Some Google Services, like Calendar, might send emails (like calendar invites) on your behalf | |
CONCAT( | |
message_info.source.service, | |
if( | |
message_info.source.selector is null, | |
"", | |
CONCAT(" - ", message_info.source.selector) | |
) | |
) as service, | |
-- get everything after the "@" symbol in the sender email address | |
REGEXP_EXTRACT( | |
IF( | |
(length(message_info.source.address) = 0) OR (message_info.source.address != message_info.source.from_header_address), | |
message_info.source.from_header_address, | |
message_info.source.address | |
), | |
"@(.*)") | |
AS sender_domain, | |
-- To understand how mail is delivered, an email message has two TO addresses, a HEADER TO address | |
-- and an ENVELOPE TO (RCPT TO) address. The HEADER TO is read by users and mail clients but is not | |
-- used for mail delivery. The ENVELOPE TO is used by mail servers to determine where the mail is | |
-- being delivered. | |
-- When looking at the logs, all I remember is that when source.address was populated, header address | |
-- might have a value we didn't want to use (but always head a value) for reporting (didn't match up | |
-- with what non-technical business users of query results wanted to understand). | |
REGEXP_EXTRACT( | |
IF( | |
(length(message_info.source.address) = 0), | |
message_info.source.from_header_address, | |
message_info.source.address | |
), "@(.*)" | |
) AS true_sender_domain, | |
REGEXP_EXTRACT(destination.address , "@(.*)") AS recipient_domain, | |
LOWER( | |
IF( | |
(length(message_info.source.address) = 0) OR (message_info.source.address != message_info.source.from_header_address), | |
message_info.source.from_header_address, | |
message_info.source.address | |
) | |
) AS sender, | |
LOWER( | |
IF( | |
(length(message_info.source.address) = 0), | |
message_info.source.from_header_address, | |
message_info.source.address | |
) | |
) AS true_sender, | |
destination.address AS recipient, | |
message_info.subject, | |
message_info.connection_info.smtp_reply_code, | |
CASE | |
-- For further info: https://support.google.com/a/answer/3221692?hl=en | |
WHEN message_info.connection_info.smtp_reply_code = 0 THEN '0 - No decription given' | |
WHEN message_info.connection_info.smtp_reply_code = 250 THEN '250 - Requested action completed' | |
WHEN message_info.connection_info.smtp_reply_code BETWEEN 400 and 499 THEN '4xx - Temporary failure (auto-retry)' | |
WHEN message_info.connection_info.smtp_reply_code = 550 THEN '550 - Failed - mailbox unavailable or rejected as spam' | |
WHEN message_info.connection_info.smtp_reply_code = 551 THEN '551 - Failed - recipient mailbox not available' | |
WHEN message_info.connection_info.smtp_reply_code = 552 THEN '552 - Failed - recipient mailbox out of storage' | |
WHEN message_info.connection_info.smtp_reply_code = 553 THEN '553 - Failed - recipient email does not exist' | |
WHEN message_info.connection_info.smtp_reply_code = 554 THEN '554 - Failed - no additional detail given' | |
ELSE CAST(message_info.connection_info.smtp_reply_code as STRING) | |
END as smtp_reply_description, | |
-- we can identify a message as "in-flight" for a given recipient based on the most-recent received code | |
LAST_VALUE(message_info.connection_info.smtp_reply_code) OVER ( | |
PARTITION BY message_info.rfc2822_message_id, destination.address | |
ORDER BY event_info.timestamp_usec ASC | |
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING | |
) AS last_smtp_reply_code, | |
CONCAT( | |
message_info.action_type, " - ", | |
CASE | |
WHEN message_info.action_type = 1 THEN 'Received by inbound SMTP server' | |
WHEN message_info.action_type = 2 THEN 'Accepted by Gmail and prepared for delivery' | |
WHEN message_info.action_type = 3 THEN 'Handled by Gmail' | |
WHEN message_info.action_type = 10 THEN 'Sent by outbound SMTP server' | |
WHEN message_info.action_type = 14 THEN 'Temporary error' | |
WHEN message_info.action_type = 18 THEN 'Bounced' | |
WHEN message_info.action_type = 19 THEN 'Dropped by Gmail' | |
WHEN message_info.action_type = 45 THEN 'Accepted by Google Groups' | |
WHEN message_info.action_type = 46 THEN 'Recipient group expanded to each member' | |
WHEN message_info.action_type = 48 THEN 'Received by inbound SMTP for relay' | |
WHEN message_info.action_type = 49 THEN 'Sent via relay by outbound SMTP' | |
WHEN message_info.action_type = 51 THEN 'Written to Google Groups' | |
WHEN message_info.action_type = 54 THEN 'Rejected by Google Groups storage' | |
WHEN message_info.action_type = 55 THEN 'Re-inserted into Gmail by policies that modify route ' | |
WHEN message_info.action_type = 68 THEN 'Accepted by Gmail and prepared for delivery' | |
WHEN message_info.action_type = 69 THEN 'User marked (or unmarked) message as spam' | |
WHEN message_info.action_type = 70 THEN 'Reclassified as spam after delivery' | |
ELSE CONCAT(CAST(message_info.action_type AS STRING), '-', 'Unknown') | |
END | |
) AS gmail_action_type, | |
-- there are official message disposition codes for email, but various mailbox providers | |
-- may select different codes or add custom text that makes it hard to group disposition codes | |
-- as a way of understanding big picture reasons why something failed. The CASE below was a slow, | |
-- manual effort of looking at logs in an attempt to create consolidated groups | |
CASE | |
WHEN LEFT(message_info.description,10) = '421-4.7.28' | |
THEN 'Blocked by Google - Send rate too high' | |
WHEN | |
CONTAINS_SUBSTR(message_info.description, '554 5.7.1') | |
OR CONTAINS_SUBSTR(message_info.description, '550 5.7.0 Message not accepted as it appears to be spam') | |
OR LEFT(message_info.description, 57) = 'Google tried to deliver your message, but it was rejected' | |
AND ( | |
CONTAINS_SUBSTR(message_info.description, 'spamtitan.com') | |
OR CONTAINS_SUBSTR(message_info.description, '554 Email rejected due to security policies') | |
OR CONTAINS_SUBSTR(message_info.description, 'mailspamprotection.com') | |
OR CONTAINS_SUBSTR(message_info.description, 'judged to be spam') | |
OR CONTAINS_SUBSTR(message_info.description, 'this message looks like spam') | |
OR CONTAINS_SUBSTR(message_info.description, 'spamh.com') | |
OR CONTAINS_SUBSTR(message_info.description, 'Spam Policy Violation') | |
OR CONTAINS_SUBSTR(message_info.description, '550 Envelope blocked - User Entry') | |
OR CONTAINS_SUBSTR(message_info.description, ':blocked') | |
OR CONTAINS_SUBSTR(message_info.description, 'Spam Policy Violation') | |
OR CONTAINS_SUBSTR(message_info.description, '550 5.7.0 Local Policy Violation') | |
OR CONTAINS_SUBSTR(message_info.description, '550 #5.1.0 Address rejected') | |
OR CONTAINS_SUBSTR(message_info.description, '550 5.7.1 XGEMAIL_0009') | |
OR CONTAINS_SUBSTR(message_info.description, '550 5.4.1 All recipient addresses rejected') | |
OR CONTAINS_SUBSTR(message_info.description, '550 5.7.1 Sender unknown') | |
) | |
THEN '55X - Spam' | |
WHEN CONTAINS_SUBSTR(message_info.description, '550 Too many invalid recipients') | |
THEN '55X - Spam (due to high bounce rate)' | |
WHEN | |
LEFT(message_info.description, 9) IN('550-5.1.1','550-5.2.1','552-5.2.2', '550-5.7.1') | |
OR CONTAINS_SUBSTR(message_info.description, '550 5.1.1') | |
OR CONTAINS_SUBSTR(message_info.description, '550 Invalid Recipient') | |
OR CONTAINS_SUBSTR(message_info.description, 'User unknown') | |
OR CONTAINS_SUBSTR(message_info.description, 'AS(201806281)') | |
OR CONTAINS_SUBSTR(message_info.description, '550 No Such User Here') | |
OR CONTAINS_SUBSTR(message_info.description, 'dosn\'t exist') | |
OR CONTAINS_SUBSTR(message_info.description, 'Email address could not be found, or was misspelled') | |
OR CONTAINS_SUBSTR(message_info.description, 'Recipient not found.') | |
OR CONTAINS_SUBSTR(message_info.description, 'mailbox not found') | |
OR CONTAINS_SUBSTR(message_info.description, '550 Cannot process address') | |
OR CONTAINS_SUBSTR(message_info.description, 'mailbox is disabled') | |
OR CONTAINS_SUBSTR(message_info.description, 'mailbox unavailable') | |
OR CONTAINS_SUBSTR(message_info.description, 'does not exist') | |
OR CONTAINS_SUBSTR(message_info.description, '10minutemail.com') | |
OR CONTAINS_SUBSTR(message_info.description, 'email account that you tried to reach is disabled') | |
THEN '55X - Bounced' | |
WHEN CONTAINS_SUBSTR(message_info.description, '552 5.2.2 Mailbox size limit exceeded') | |
THEN '5XX - Mailbox full' | |
WHEN CONTAINS_SUBSTR(message_info.description, '550 5.7.1 Relaying denied') | |
THEN '55X - Closed relay system' | |
WHEN | |
LEFT(message_info.description, 59) = 'The recipient server did not accept our requests to connect' | |
AND CONTAINS_SUBSTR(message_info.description, 'timed out') | |
THEN 'Connection timeout' | |
WHEN | |
LEFT(message_info.description, 9) = 'Temporary' | |
OR CONTAINS_SUBSTR(message_info.description, '451 Hostname is not authorized') | |
OR CONTAINS_SUBSTR(message_info.description, '451 Account inbounds disabled') | |
THEN | |
'4XX - Temporary Error' | |
ELSE 'Other...' | |
END AS event_description, | |
message_info.description AS original_event_description, | |
message_info.spam_info is null AS has_spam_info, | |
CASE | |
WHEN message_info.spam_info.classification_reason = 1 AND message_info.spam_info.disposition > 1 THEN TRUE | |
WHEN message_info.spam_info.classification_reason > 1 THEN TRUE | |
ELSE message_info.is_spam | |
END AS is_spam, | |
CASE | |
WHEN message_info.spam_info.classification_reason = 1 | |
THEN ( | |
CONCAT( | |
message_info.spam_info.disposition, | |
'.', | |
CASE | |
WHEN message_info.spam_info.disposition = 1 THEN '01 - OK' | |
WHEN message_info.spam_info.disposition = 2 THEN '02 -Spam' | |
WHEN message_info.spam_info.disposition = 3 THEN '03 -Phishing' | |
WHEN message_info.spam_info.disposition = 4 THEN '04 -Suspicious' | |
WHEN message_info.spam_info.disposition = 5 THEN '05 -Malware' | |
END | |
) | |
) | |
WHEN message_info.spam_info.classification_reason > 1 | |
THEN CONCAT( | |
RIGHT(CONCAT('0',message_info.spam_info.classification_reason),2), | |
".", | |
CASE | |
WHEN message_info.spam_info.classification_reason = 1 THEN '01 - Default classification' | |
WHEN message_info.spam_info.classification_reason = 2 THEN '02 - Sender\'s past actions' | |
WHEN message_info.spam_info.classification_reason = 3 THEN '03 - Suspicious content' | |
WHEN message_info.spam_info.classification_reason = 4 THEN '04 - Suspicious link' | |
WHEN message_info.spam_info.classification_reason = 5 THEN '05 - Suspicious attachment' | |
WHEN message_info.spam_info.classification_reason = 6 THEN '06 - Custom Google Workspace Policy' | |
WHEN message_info.spam_info.classification_reason = 7 THEN '07 - DMARC' | |
WHEN message_info.spam_info.classification_reason = 8 THEN '08 - Domain in public RBLs' | |
WHEN message_info.spam_info.classification_reason = 9 THEN '09 - RFC standards violation' | |
WHEN message_info.spam_info.classification_reason = 10 THEN '10 - Gmail policy violation' | |
WHEN message_info.spam_info.classification_reason = 11 THEN '12 - Machine learning verdict' | |
WHEN message_info.spam_info.classification_reason = 12 THEN '12 - Sender reputation' | |
WHEN message_info.spam_info.classification_reason = 13 THEN '13 - Blatant spam' | |
WHEN message_info.spam_info.classification_reason = 14 THEN '14 - Advanced phishing & malware protection' | |
ELSE CONCAT(COALESCE(CAST(message_info.spam_info.classification_reason AS STRING)),'.',COALESCE(CAST(message_info.spam_info.disposition AS STRING))) | |
END | |
) | |
ELSE CAST(message_info.spam_info.classification_reason AS STRING) | |
END AS gmail_spam_classification, | |
CASE | |
WHEN message_info.connection_info.smtp_response_reason = 3 THEN '03 - Malware' | |
WHEN message_info.connection_info.smtp_response_reason = 4 THEN '04 - DMARC policy' | |
WHEN message_info.connection_info.smtp_response_reason = 5 THEN '05 - Unsupported attachment (by Gmail)' | |
WHEN message_info.connection_info.smtp_response_reason = 6 THEN '06 - Receive limit exceeded' | |
WHEN message_info.connection_info.smtp_response_reason = 7 THEN '07 - Account over quota' | |
WHEN message_info.connection_info.smtp_response_reason = 8 THEN '08 - Bad PTR record' | |
WHEN message_info.connection_info.smtp_response_reason = 9 THEN '09 - Recipient doesn’t exist' | |
WHEN message_info.connection_info.smtp_response_reason = 10 THEN '10 - Customer policy' | |
WHEN message_info.connection_info.smtp_response_reason = 12 THEN '12 - RFC violation' | |
WHEN message_info.connection_info.smtp_response_reason = 13 THEN '13 - Blatant spam' | |
WHEN message_info.connection_info.smtp_response_reason = 14 THEN '14 - Denial of service' | |
WHEN message_info.connection_info.smtp_response_reason = 15 THEN '15 - Malicious or spammy links' | |
WHEN message_info.connection_info.smtp_response_reason = 16 THEN '16 - Low IP reputation' | |
WHEN message_info.connection_info.smtp_response_reason = 17 THEN '17 - Low domain reputation' | |
WHEN message_info.connection_info.smtp_response_reason = 18 THEN '18 - IP listed in public Real-time Blackhole List (RBL)' | |
WHEN message_info.connection_info.smtp_response_reason = 19 THEN '19 - Temporarily rejected due to DoS limits' | |
WHEN message_info.connection_info.smtp_response_reason = 20 THEN '20 - Permanently rejected due to DoS limits' | |
END | |
AS inbound_smtp_response, | |
-- if I remember correctly, these values made it look as those dmarc was failing far more than what | |
-- mxtoolbox was reporting and I don't think I trusted the values below... | |
message_info.connection_info.dmarc_published_domain AS dmarc_domain, | |
CAST(message_info.connection_info.dmarc_pass AS BOOL) AS dmarc_pass, | |
CAST(message_info.connection_info.dkim_pass AS BOOL) AS dkim_pass, | |
CAST(message_info.connection_info.spf_pass AS BOOL) AS spf_pass, | |
d.* | |
FROM | |
`vaulted-keel-351205.gmail_logs_dataset.daily_*` AS d, | |
d.message_info.destination | |
), | |
-- When an internal gmail user sets up an automatic forwarding of received messages to another gmail address, we need | |
-- to make some changes to our parsing logic to properly capture original sender, forwarder, and final recipient: | |
adjust_for_forwarding AS ( | |
SELECT | |
* except(sender, sender_domain), | |
CASE | |
WHEN (CONTAINS_SUBSTR(true_sender,'+caf_@') AND service = 'gmail-ui - autoforward') | |
THEN REPLACE(true_sender, '+caf_@', '@') | |
ELSE sender | |
END AS sender, --CHANGE TO SENDER | |
CASE | |
WHEN (CONTAINS_SUBSTR(true_sender,'+caf_@') AND service = 'gmail-ui - autoforward') | |
THEN true_sender_domain ELSE sender_domain | |
END AS sender_domain, --CHANGE TO SENDER | |
CASE | |
WHEN (CONTAINS_SUBSTR(true_sender,'+caf_@') AND service = 'gmail-ui - autoforward') | |
THEN TRUE | |
ELSE FALSE | |
END AS was_forwarded, | |
CASE | |
WHEN (CONTAINS_SUBSTR(true_sender,'+caf_@') AND service = 'gmail-ui - autoforward') | |
THEN sender ELSE NULL | |
END as forwarded_for, | |
CASE | |
WHEN (CONTAINS_SUBSTR(true_sender,'+caf_@') AND service = 'gmail-ui - autoforward') | |
THEN sender_domain ELSE NULL | |
END as forwarded_for_domain | |
FROM detail | |
), | |
correct_direction_for_internal AS ( | |
SELECT | |
* except(is_internal), | |
IF( | |
sender_domain IN ('REDACTED') AND recipient_domain in ('REDACTED'), | |
true, | |
is_internal | |
) | |
AS is_internal | |
FROM adjust_for_forwarding | |
), | |
add_internal_to_mail_direction AS ( | |
SELECT | |
* except(mail_direction), | |
IF(COALESCE(is_internal,false), 'internal', mail_direction) AS mail_direction | |
FROM correct_direction_for_internal | |
), | |
detail_adjusted AS ( | |
SELECT | |
*, | |
CAST(message_first_event_time AS DATE) AS date, | |
CAST(DATE_TRUNC(message_first_event_time, WEEK(MONDAY)) AS DATE) AS week, | |
CASE EXTRACT(DAYOFWEEK FROM message_first_event_time) | |
WHEN 1 THEN '7. Sun' | |
WHEN 2 THEN '1. Mon' | |
WHEN 3 THEN '2. Tue' | |
WHEN 4 THEN '3. Wed' | |
WHEN 5 THEN '4. Thu' | |
WHEN 6 THEN '5. Fri' | |
WHEN 7 THEN '6. Sat' | |
END | |
AS day_name, | |
message_first_event_time as timestamp, | |
message_info.connection_info.client_ip AS connection_client_ip, | |
message_info.connection_info.smtp_out_connect_ip AS outbound_smtp_ip, | |
FROM add_internal_to_mail_direction | |
), | |
set_delivery_result AS ( | |
SELECT | |
*, | |
CASE | |
WHEN | |
is_spam = true | |
or event_description = '55X - Spam' | |
THEN 'Failed - Spam' | |
WHEN | |
message_info.action_type = 18 | |
OR message_info.connection_info.smtp_reply_code = 553 | |
OR event_description = '55X - Bounced' | |
THEN 'Failed - Bounced' | |
WHEN message_info.connection_info.smtp_reply_code <= 299 | |
THEN 'Succeeded' | |
WHEN message_info.connection_info.smtp_reply_code BETWEEN 400 and 499 | |
THEN 'Failed - Temporary Error' | |
WHEN | |
message_info.connection_info.smtp_reply_code >= 500 | |
OR message_info.action_type = 19 | |
THEN 'Failed - Unknown (maybe spam?)' | |
END AS delivery_result | |
FROM detail_adjusted | |
) | |
SELECT | |
* except( | |
event_info | |
,message_info | |
,message_first_event_time | |
,message_last_event_time | |
,recipient_first_timestamp | |
,recipient_last_timestamp | |
,week | |
,message_id_line_item_timestamp | |
,true_sender | |
,true_sender_domain | |
,last_smtp_reply_code | |
,smtp_reply_description | |
) | |
FROM set_delivery_result | |
WHERE | |
message_id_line_item_timestamp = recipient_last_timestamp | |
AND date > DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY) | |
and mail_direction != 'inbound' | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @matwerber1 ,
Is there any other script before this one to be executed?
How did you create the tables in
vaulted-keel-351205.gmail_logs_dataset.daily_*
?Thanks!