Created
June 10, 2024 19:55
-
-
Save danslinky/c295461635176fd5eba1861f5020292b to your computer and use it in GitHub Desktop.
Exploring a Gmail mbox using Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# read mbox file, generate some stats\n", | |
"import mailbox\n", | |
"import os\n", | |
"import re\n", | |
"from collections import Counter\n", | |
"\n", | |
"mboxFile = \"my.mbox\"\n", | |
"# An mbox stores raw messages, which may contain bytes that are invalid in the\n", | |
"# platform default encoding. Decode leniently so a single odd byte cannot\n", | |
"# abort the scan with UnicodeDecodeError.\n", | |
"mbox = open(mboxFile, \"r\", encoding=\"utf-8\", errors=\"replace\")\n", | |
"\n", | |
"# read mbox file\n", | |
"def readMbox(mbox):\n", | |
"    \"\"\"Return every line of the open file object *mbox*, then close it.\n", | |
"\n", | |
"    The handle is closed even when reading raises, so a failed read does not\n", | |
"    leave the file open.\n", | |
"    \"\"\"\n", | |
"    with mbox:\n", | |
"        return mbox.readlines()\n", | |
"\n", | |
"# get email addresses\n", | |
"def getEmails(lines):\n", | |
"    \"\"\"Return the first address found on each line that starts with ``From:``.\n", | |
"\n", | |
"    The header name is matched case-insensitively, so ``from:`` and ``FROM:``\n", | |
"    count as well. The mbox separator line (``From `` with no colon) is not\n", | |
"    matched. Any line beginning with ``From:`` is counted, including ones that\n", | |
"    appear inside a message body. Lines without a recognisable address are\n", | |
"    skipped.\n", | |
"    \"\"\"\n", | |
"    # Compile once rather than rebuilding the pattern for every line.\n", | |
"    address = re.compile(r\"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+\")\n", | |
"    emails = []\n", | |
"    for line in lines:\n", | |
"        if line[:5].lower() != \"from:\":\n", | |
"            continue\n", | |
"        match = address.search(line)\n", | |
"        if match is not None:\n", | |
"            emails.append(match.group())\n", | |
"    return emails\n", | |
"\n", | |
"# get email domains\n", | |
"def getEmailDomains(emails):\n", | |
"    \"\"\"Return the lower-cased domain of each address in *emails*.\n", | |
"\n", | |
"    Entries that contain no ``@`` are reported on stdout and left out of the\n", | |
"    result, so the returned list can be shorter than the input.\n", | |
"    \"\"\"\n", | |
"    domains = []\n", | |
"    for email in emails:\n", | |
"        try:\n", | |
"            domain = email.strip().split(\"@\")[1].lower()\n", | |
"        except IndexError:\n", | |
"            print(f\"Invalid email address: {email}\")\n", | |
"            # Skip the entry: falling through would append an unbound name\n", | |
"            # (NameError) or repeat the previous address's domain.\n", | |
"            continue\n", | |
"        domains.append(domain)\n", | |
"    return domains\n", | |
"\n", | |
"\n", | |
"emails = getEmails(readMbox(mbox))\n", | |
"domains = getEmailDomains(emails)\n", | |
"# Tally once. Ranking with key=domains.count rescans the whole list for every\n", | |
"# unique domain, which gets slow on a large mailbox.\n", | |
"domainCounts = Counter(domains)\n", | |
"\n", | |
"print(f\"Total emails: {len(emails)}\")\n", | |
"print(f\"Total domains: {len(domains)}\")\n", | |
"print(f\"Unique domains: {len(domainCounts)}\")\n", | |
"print(f\"Top 10 domains: {[domain for domain, _ in domainCounts.most_common(10)]}\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"# Export emails to text files\n", | |
"email_dir = \"./myemails\"\n", | |
"# exist_ok=True replaces the check-then-create pair: no race between the test\n", | |
"# and the mkdir, and nothing happens if the directory is already there.\n", | |
"os.makedirs(email_dir, exist_ok=True)\n", | |
"\n", | |
"def _decode_part(part):\n", | |
"    \"\"\"Undo the transfer encoding of a non-multipart part and return text.\"\"\"\n", | |
"    raw = part.get_payload(decode=True)\n", | |
"    if raw is None:\n", | |
"        # Nothing decodable; hand back whatever the parser stored.\n", | |
"        return part.get_payload()\n", | |
"    charset = part.get_content_charset() or \"utf-8\"\n", | |
"    try:\n", | |
"        return raw.decode(charset, errors=\"replace\")\n", | |
"    except LookupError:\n", | |
"        # The header names a charset Python does not know.\n", | |
"        return raw.decode(\"utf-8\", errors=\"replace\")\n", | |
"\n", | |
"\n", | |
"def get_body(message):\n", | |
"    \"\"\"Return the body of *message* as text.\n", | |
"\n", | |
"    A non-multipart message is decoded and returned directly. For a multipart\n", | |
"    message the parts are visited in document order, descending into nested\n", | |
"    multipart containers, and the first ``text/plain``, ``text/html`` or\n", | |
"    ``application/octet-stream`` part wins. Text parts have base64 or\n", | |
"    quoted-printable undone and are decoded with their declared charset.\n", | |
"    An octet-stream part is returned in its stored (still encoded) form.\n", | |
"    Returns an empty string when no part qualifies.\n", | |
"    \"\"\"\n", | |
"    if not message.is_multipart():\n", | |
"        return _decode_part(message)\n", | |
"    for part in message.walk():\n", | |
"        # walk() also yields the multipart containers; only leaves hold a body.\n", | |
"        if part.is_multipart():\n", | |
"            continue\n", | |
"        content_type = part.get_content_type()\n", | |
"        if content_type in ('text/plain', 'text/html'):\n", | |
"            return _decode_part(part)\n", | |
"        if content_type == 'application/octet-stream':\n", | |
"            # Binary data: decoding it to text would only produce noise.\n", | |
"            return part.get_payload()\n", | |
"    return \"\"\n", | |
"\n", | |
"def write_email(message, i):\n", | |
"    \"\"\"Write the main headers and the body of *message* to a text file.\n", | |
"\n", | |
"    The file is ``email_<i>.txt`` inside ``email_dir``. A header that is\n", | |
"    missing from the message is written as ``None``.\n", | |
"    \"\"\"\n", | |
"    # Pin the encoding: subjects and bodies can hold characters the platform\n", | |
"    # default codec cannot encode. errors=\"replace\" keeps the export going on\n", | |
"    # characters that cannot be encoded at all.\n", | |
"    with open(f\"{email_dir}/email_{i}.txt\", \"w\", encoding=\"utf-8\", errors=\"replace\") as f:\n", | |
"        f.write(f\"From: {message['From']}\\n\")\n", | |
"        f.write(f\"To: {message['To']}\\n\")\n", | |
"        f.write(f\"Subject: {message['Subject']}\\n\")\n", | |
"        f.write(f\"Date: {message['Date']}\\n\")\n", | |
"        f.write(f\"Body: {get_body(message)}\\n\")\n", | |
"\n", | |
"# create=False: a mistyped path should fail loudly instead of leaving a new,\n", | |
"# empty mbox file behind.\n", | |
"mbox = mailbox.mbox(mboxFile, create=False)\n", | |
"try:\n", | |
"    for i, message in enumerate(mbox):\n", | |
"        write_email(message, i)\n", | |
"finally:\n", | |
"    # Release the underlying file even if one message fails to export.\n", | |
"    mbox.close()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment