Skip to content

Instantly share code, notes, and snippets.

@danslinky
Created June 10, 2024 19:55
Show Gist options
  • Save danslinky/c295461635176fd5eba1861f5020292b to your computer and use it in GitHub Desktop.
Save danslinky/c295461635176fd5eba1861f5020292b to your computer and use it in GitHub Desktop.
Exploring a Gmail mbox using Python
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read mbox file, generate some stats\n",
"import mailbox\n",
"import os\n",
"import re\n",
"from collections import Counter\n",
"\n",
"mboxFile = \"my.mbox\"\n",
"# mbox files hold raw bytes in mixed encodings; decode leniently so a single\n",
"# stray byte cannot abort the scan with UnicodeDecodeError.\n",
"mbox = open(mboxFile, \"r\", encoding=\"utf-8\", errors=\"replace\")\n",
"\n",
"# read mbox file\n",
"def readMbox(mbox):\n",
"    \"\"\"Return every line of the open file object ``mbox`` and close it.\n",
"\n",
"    The file is closed even when reading fails, so the handle never leaks.\n",
"    \"\"\"\n",
"    try:\n",
"        return mbox.readlines()\n",
"    finally:\n",
"        mbox.close()\n",
"\n",
"# get email addresses\n",
"def getEmails(lines):\n",
"    \"\"\"Return the first email address found on each ``From:`` line.\n",
"\n",
"    Lines that do not start with ``From:`` or that contain no address are\n",
"    skipped, so the result may be shorter than ``lines``.\n",
"    \"\"\"\n",
"    # Compile once instead of rebuilding the pattern for every header line.\n",
"    pattern = re.compile(r\"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+\")\n",
"    emails = []\n",
"    for line in lines:\n",
"        if not line.startswith(\"From:\"):\n",
"            continue\n",
"        match = pattern.search(line)\n",
"        if match is not None:\n",
"            emails.append(match.group())\n",
"    return emails\n",
"\n",
"# get email domains\n",
"def getEmailDomains(emails):\n",
"    \"\"\"Return the lower-cased domain of each address in ``emails``.\n",
"\n",
"    Entries without an ``@`` are reported on stdout and skipped.\n",
"    \"\"\"\n",
"    domains = []\n",
"    for email in emails:\n",
"        try:\n",
"            domain = email.strip().split(\"@\")[1].lower()\n",
"        except IndexError:\n",
"            print(f\"Invalid email address: {email}\")\n",
"            # Skip it: appending here would reuse the previous address's\n",
"            # domain, or raise NameError if the first address is invalid.\n",
"            continue\n",
"        domains.append(domain)\n",
"    return domains\n",
"\n",
"\n",
"emails = getEmails(readMbox(mbox))\n",
"domains = getEmailDomains(emails)\n",
"\n",
"# Tally every domain in one pass; calling domains.count() once per unique\n",
"# domain rescans the whole list each time.\n",
"domain_counts = Counter(domains)\n",
"\n",
"print(f\"Total emails: {len(emails)}\")\n",
"print(f\"Total domains: {len(domains)}\")\n",
"print(f\"Unique domains: {len(domain_counts)}\")\n",
"print(f\"Top 10 domains: {[domain for domain, _ in domain_counts.most_common(10)]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Export emails to text files\n",
"email_dir = \"./myemails\"\n",
"# exist_ok avoids the check-then-create race of testing os.path.exists first.\n",
"os.makedirs(email_dir, exist_ok=True)\n",
"\n",
"def _decode_part(part):\n",
"    \"\"\"Return the payload of a single (non-container) MIME part as text.\"\"\"\n",
"    data = part.get_payload(decode=True)\n",
"    if data is None or part.get_content_maintype() != \"text\":\n",
"        # Non-text payloads stay in their transfer-encoded (e.g. base64) form\n",
"        # so they remain printable in the exported text file.\n",
"        return part.get_payload()\n",
"    charset = part.get_content_charset() or \"utf-8\"\n",
"    try:\n",
"        return data.decode(charset, errors=\"replace\")\n",
"    except LookupError:\n",
"        # The message names a charset Python does not know.\n",
"        return data.decode(\"utf-8\", errors=\"replace\")\n",
"\n",
"\n",
"def get_body(message):\n",
"    \"\"\"Return the body text of ``message``, or None if no usable part exists.\n",
"\n",
"    For a multipart message the first text/plain, text/html or\n",
"    application/octet-stream part is used; walk() also reaches parts nested\n",
"    inside inner multipart containers. Text parts have their transfer\n",
"    encoding (base64, quoted-printable) and charset decoded.\n",
"    \"\"\"\n",
"    if not message.is_multipart():\n",
"        return _decode_part(message)\n",
"    wanted = (\"text/plain\", \"text/html\", \"application/octet-stream\")\n",
"    for part in message.walk():\n",
"        if part.get_content_type() in wanted:\n",
"            return _decode_part(part)\n",
"    return None\n",
"\n",
"def write_email(message, i):\n",
"    \"\"\"Write the main headers and body of ``message`` to email_<i>.txt in email_dir.\"\"\"\n",
"    path = f\"{email_dir}/email_{i}.txt\"\n",
"    # Always write UTF-8: the platform default encoding may be unable to\n",
"    # represent the message text, and errors=\"replace\" keeps one unencodable\n",
"    # character from aborting the whole export.\n",
"    with open(path, \"w\", encoding=\"utf-8\", errors=\"replace\") as f:\n",
"        f.write(f\"From: {message['From']}\\n\")\n",
"        f.write(f\"To: {message['To']}\\n\")\n",
"        f.write(f\"Subject: {message['Subject']}\\n\")\n",
"        f.write(f\"Date: {message['Date']}\\n\")\n",
"        f.write(f\"Body: {get_body(message)}\\n\")\n",
"\n",
"# create=False: a missing mbox should fail loudly instead of being silently\n",
"# created as an empty file.\n",
"mbox = mailbox.mbox(mboxFile, create=False)\n",
"try:\n",
"    for i, message in enumerate(mbox):\n",
"        write_email(message, i)\n",
"finally:\n",
"    # Release the underlying file handle even if an export fails part-way.\n",
"    mbox.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment