Skip to content

Instantly share code, notes, and snippets.

@adobke
Created November 2, 2023 00:05
Show Gist options
  • Save adobke/cdf7ac9fe224cad7f37941dc791ccbef to your computer and use it in GitHub Desktop.
Save adobke/cdf7ac9fe224cad7f37941dc791ccbef to your computer and use it in GitHub Desktop.
Parse a Gmail .mbox file and find the largest email groupings
#!/usr/bin/env python3
import argparse
import mailbox
import pathlib
import re
import sys
from collections import Counter
# Captures whatever sits between the first pair of angle brackets in a
# "Display Name <address>" style From header.
EMAIL_PATTERN = r"<(.*?)>"


def _split_sender(from_header):
    """Return ``(sender, domain)`` for a raw ``From`` header value.

    ``from_header`` may be ``None`` (header missing), a ``str``, or any
    object whose ``str()`` is the header text. The sender is the
    lower-cased text between angle brackets when present, otherwise the
    whole lower-cased header. The domain is everything after the first
    ``@``; ``"unknown"`` is used when there is no usable domain.
    """
    if from_header is None:
        return "unknown", "unknown"
    # str() keeps this working when the mailbox hands back a header object
    # instead of a plain string.
    raw = str(from_header)
    match = re.search(EMAIL_PATTERN, raw)
    sender = (match.group(1) if match else raw).lower()
    # partition() never raises, unlike split("@")[1] on an address such as
    # "<MAILER-DAEMON>" that contains no "@".
    domain = sender.partition("@")[2] or "unknown"
    return sender, domain


def get_mbox_stats(mbox_path):
    """Tally message counts and byte sizes for every message in an mbox file.

    Args:
        mbox_path: Path of the mbox file to read.

    Returns:
        A 5-tuple of ``Counter`` objects, in this order:
        ``sender_counts`` (messages per sender address),
        ``domain_counts`` (messages per sender domain),
        ``subject_counts`` (messages per subject line),
        ``sender_sizes`` (summed message bytes per sender) and
        ``subject_sizes`` (summed message bytes per subject).

    Messages without a ``From`` header are grouped under ``"unknown"``;
    messages without a ``Subject`` header under ``"no subject"``.
    A progress line is printed every 1000 messages.
    """
    sender_counts = Counter()
    domain_counts = Counter()
    subject_counts = Counter()
    sender_sizes = Counter()
    subject_sizes = Counter()

    mbox = mailbox.mbox(mbox_path)
    try:
        print("Parsing mbox... this could take some time")
        num_messages = len(mbox)
        for count, message in enumerate(mbox, start=1):
            if count % 1000 == 0:
                print(f"{count} / {num_messages}")

            sender, domain = _split_sender(message["From"])

            subject_header = message["Subject"]
            subject = (
                "no subject" if subject_header is None else str(subject_header)
            )

            message_size = len(message.as_bytes())

            sender_counts[sender] += 1
            domain_counts[domain] += 1
            subject_counts[subject] += 1
            sender_sizes[sender] += message_size
            subject_sizes[subject] += message_size
    finally:
        # Release the underlying file handle even if parsing fails midway.
        mbox.close()

    return sender_counts, domain_counts, subject_counts, sender_sizes, subject_sizes
def print_stat(title, counter, size_typed):
    """Print the 15 largest entries of ``counter`` as an aligned table.

    Args:
        title: Heading printed above the table, followed by a colon.
        counter: ``Counter`` mapping string keys to numeric totals.
        size_typed: When true, totals are divided by 1,000,000 and
            suffixed with ``MB``; otherwise they are printed unchanged.

    Keys are padded or truncated to a common column width: the longest
    key among the printed entries, capped at 35 characters. An empty
    counter prints only the heading.
    """
    print(title + ":")
    top_list = counter.most_common(15)
    if not top_list:
        # max() below would raise ValueError on an empty sequence.
        return

    longest_key = max(len(key) for key, _ in top_list)
    # Clamp to at least 1: a width of 0 would yield the format spec "0.0",
    # where the leading 0 is read as a zero-padding flag.
    width = max(1, min(longest_key, 35))
    unit = "MB" if size_typed else ""

    for key, stat in top_list:
        if size_typed:
            stat /= 1000000
        print(f"{key:{width}.{width}} : {stat} {unit}")
def main():
    """Command-line entry point: parse the mbox path and print every report.

    Reads one positional argument, ``mbox_path``. If it does not name an
    existing file, a message is printed and the process exits with
    status 1. Otherwise the mailbox statistics are collected and each
    ranking is printed, followed by the summed size of all messages.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("mbox_path", type=pathlib.Path)
    mbox_path = arg_parser.parse_args().mbox_path

    if not mbox_path.is_file():
        print(f"File '{mbox_path}' does not exist")
        sys.exit(1)

    by_sender, by_domain, by_subject, bytes_by_sender, bytes_by_subject = (
        get_mbox_stats(mbox_path)
    )

    # (heading, counter, whether the totals are byte sizes) in print order.
    reports = (
        ("Most common senders", by_sender, False),
        ("Most common sender domains", by_domain, False),
        ("Most common subjects", by_subject, False),
        ("Top summed-size subjects", bytes_by_subject, True),
        ("Top summed-size senders", bytes_by_sender, True),
    )
    for heading, tally, is_size in reports:
        print_stat(heading, tally, is_size)
        print()

    print(f"Summed size: {sum(bytes_by_sender.values())/1000000} MB")
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment