Created
November 2, 2023 00:05
-
-
Save adobke/cdf7ac9fe224cad7f37941dc791ccbef to your computer and use it in GitHub Desktop.
Parse a Gmail .mbox file and find the largest email groupings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import mailbox | |
import pathlib | |
import re | |
import sys | |
from collections import Counter | |
# Captures whatever sits between the first "<" and the next ">" of a From header.
EMAIL_PATTERN = r"<(.*?)>"


def get_mbox_stats(mbox_path):
    """Tally message counts and byte sizes per sender, domain and subject.

    Every message's From and Subject headers are echoed to stdout, and a
    progress line is printed after every 1000 messages.

    Args:
        mbox_path: Location of the mbox file (passed to ``mailbox.mbox``).

    Returns:
        A 5-tuple of ``Counter`` objects, in this order:
        ``sender_counts``, ``domain_counts``, ``subject_counts`` (number of
        messages per key) and ``sender_sizes``, ``subject_sizes`` (summed
        ``len(message.as_bytes())`` per key). Senders and domains are
        lower-cased. A message without a From header is filed under the
        sender "unknown"; a sender without "@" gets the domain "unknown";
        a message without a Subject header is filed under "no subject".
    """
    sender_counts = Counter()
    domain_counts = Counter()
    subject_counts = Counter()
    sender_sizes = Counter()
    subject_sizes = Counter()

    mbox = mailbox.mbox(mbox_path)
    try:
        print("Parsing mbox... this could take some time")
        num_messages = len(mbox)
        for count, message in enumerate(mbox, start=1):
            print(message["From"])
            print(message["Subject"])
            if count % 1000 == 0:
                print(f"{count} / {num_messages}")

            # The header lookup returns None when From is absent and is not
            # guaranteed to be a plain str, so normalise to text before
            # calling any string method on it.
            raw_from = message["From"]
            from_text = "" if raw_from is None else str(raw_from)

            match = re.search(EMAIL_PATTERN, from_text)
            sender = (match.group(1) if match else from_text).lower()
            if not sender:
                sender = "unknown"

            # Everything after the first "@" counts as the domain; a
            # bracketed value such as "<undisclosed>" has none.
            if "@" in sender:
                domain = sender.split("@", 1)[1]
            else:
                domain = "unknown"

            subject = "no subject"
            if "Subject" in message:
                subject = str(message["Subject"])

            message_size = len(message.as_bytes())
            sender_counts[sender] += 1
            domain_counts[domain] += 1
            subject_counts[subject] += 1
            sender_sizes[sender] += message_size
            subject_sizes[subject] += message_size
    finally:
        # Release the file handle held by the mailbox even if parsing fails.
        mbox.close()

    return sender_counts, domain_counts, subject_counts, sender_sizes, subject_sizes
def print_stat(title, counter, size_typed, *, top_n=15, max_key_width=35):
    """Print the highest-valued entries of ``counter`` as an aligned table.

    Args:
        title: Heading printed above the table (a ":" is appended).
        counter: ``collections.Counter`` with string keys.
        size_typed: When true, values are treated as byte totals, divided
            by 1,000,000 and suffixed with "MB".
        top_n: Maximum number of rows to print.
        max_key_width: Keys longer than this are truncated; the key column
            is as wide as the longest printed key, up to this limit.
    """
    print(title + ":")
    top_list = counter.most_common(top_n)
    if not top_list:
        # max() below would raise ValueError on an empty sequence.
        print("(none)")
        return

    longest_key = max(len(key) for key, _ in top_list)
    # Keep the width at 1 or more: a width of 0 would render the format spec
    # "0.0", whose leading zero is parsed as the zero-padding flag rather
    # than as a width.
    width = max(1, min(longest_key, max_key_width))

    unit = "MB" if size_typed else ""
    for key, stat in top_list:
        if size_typed:
            stat /= 1000000
        print(f"{key:{width}.{width}} : {stat} {unit}")
def main():
    """Command-line entry point: summarise the mbox file named on the CLI.

    Takes one positional argument, ``mbox_path``. If it is not an existing
    regular file, a message is printed and the process exits with status 1.
    Otherwise the five statistics tables are printed, each followed by a
    blank line, and finally the total size of all messages in MB.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("mbox_path", type=pathlib.Path)
    mbox_path = arg_parser.parse_args().mbox_path

    if not mbox_path.is_file():
        print(f"File '{mbox_path}' does not exist")
        sys.exit(1)

    stats = get_mbox_stats(mbox_path)
    sender_counts, domain_counts, subject_counts, sender_sizes, subject_sizes = stats

    # (heading, counter, values-are-byte-sizes) for each table, in print order.
    reports = (
        ("Most common senders", sender_counts, False),
        ("Most common sender domains", domain_counts, False),
        ("Most common subjects", subject_counts, False),
        ("Top summed-size subjects", subject_sizes, True),
        ("Top summed-size senders", sender_sizes, True),
    )
    for heading, counter, is_size in reports:
        print_stat(heading, counter, is_size)
        print()

    total_bytes = sum(sender_sizes.values())
    print(f"Summed size: {total_bytes/1000000} MB")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment