Created
February 13, 2019 15:55
-
-
Save chaitan94/206bd77f4447a2fb5c26fdf4f0e0799f to your computer and use it in GitHub Desktop.
Gmail hall of spam
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Most of the code below is taken from / based on\n", | |
"# https://jellis18.github.io/post/2018-01-17-mail-analysis/\n", | |
"\n", | |
"# Directory of the mail export. The trailing slash matters: file names are\n", | |
"# appended to it by plain string concatenation.\n", | |
"path = 'takeout-20181231T170238Z-001/Takeout/Mail/'\n", | |
"mboxfile = path + 'All mail Including Spam and Trash.mbox'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import mailbox\n", | |
"mbox = mailbox.mbox(mboxfile)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# List the header names carried by the first message in the mailbox.\n", | |
"first_message = mbox[0]\n", | |
"for header_name in first_message.keys():\n", | |
"    print(header_name)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import csv\n", | |
"# Message headers copied into the CSV, in column order.\n", | |
"EXPORT_HEADERS = ['subject', 'from', 'date', 'to', 'X-Gmail-Labels', 'X-GM-THRID']\n", | |
"\n", | |
"with open(path + \"mbox.csv\", \"w\") as outfile:\n", | |
"    writer = csv.writer(outfile)\n", | |
"    # One CSV row per message in the mailbox.\n", | |
"    for message in mbox:\n", | |
"        writer.writerow([message[header] for header in EXPORT_HEADERS])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%matplotlib inline\n", | |
"%pylab inline\n", | |
"\n", | |
"import pandas as pd\n", | |
"import matplotlib.pyplot as plt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Column names for the header-less CSV written by the export cell.\n", | |
"MBOX_COLUMNS = ['subject', 'from', 'date', 'to', 'label', 'thread']\n", | |
"\n", | |
"def parse_utc(raw_date):\n", | |
"    \"\"\"Parse one date string to a UTC timestamp; unparseable values become NaT.\"\"\"\n", | |
"    return pd.to_datetime(raw_date, errors='coerce', utc=True)\n", | |
"\n", | |
"df = pd.read_csv(path + 'mbox.csv', names=MBOX_COLUMNS)\n", | |
"df['date'] = df['date'].apply(parse_utc)\n", | |
"# Keep only the rows whose date could be parsed.\n", | |
"df = df.dropna(subset=['date'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import re\n", | |
"\n", | |
"# Clean up\n", | |
"def get_email_address(string):\n", | |
"    \"\"\"Return the first e-mail address found in a raw header value.\n", | |
"\n", | |
"    Text inside angle brackets (``Name <user@host>``) is preferred; if there\n", | |
"    is none, the first whitespace-separated token containing ``@`` is used.\n", | |
"    Returns NaN for a missing value or when nothing looks like an address.\n", | |
"    \"\"\"\n", | |
"    if pd.isnull(string):\n", | |
"        return np.nan\n", | |
"    try:\n", | |
"        candidates = re.findall(r'<(.+?)>', string)\n", | |
"    except Exception:\n", | |
"        # Show the offending value, then re-raise with the original traceback.\n", | |
"        print(string)\n", | |
"        raise\n", | |
"    if not candidates:\n", | |
"        candidates = [token for token in string.split() if '@' in token]\n", | |
"    # np.nan, not np.NAN: the upper-case alias was removed in NumPy 2.0.\n", | |
"    return candidates[0] if candidates else np.nan\n", | |
"\n", | |
"# Reduce both address columns to bare e-mail addresses.\n", | |
"df['from'] = df['from'].apply(get_email_address)\n", | |
"df['to'] = df['to'].apply(get_email_address)\n", | |
"\n", | |
"# Own address with an optional +tag. The dot is escaped and the end anchored\n", | |
"# so that look-alike domains (e.g. gmail.community.org) are not counted as ours.\n", | |
"OWN_ADDRESS = re.compile(r'bkchaitan94(\\+.+)?@gmail\\.com$')\n", | |
"\n", | |
"def mail_direction(sender):\n", | |
"    \"\"\"Label a message 'out' when sent from our own address, otherwise 'in'.\"\"\"\n", | |
"    # A missing sender is NaN, which is truthy; check for it explicitly so\n", | |
"    # such rows are not mislabelled as outgoing.\n", | |
"    if pd.isnull(sender):\n", | |
"        return 'in'\n", | |
"    return 'out' if OWN_ADDRESS.match(sender) else 'in'\n", | |
"\n", | |
"df['inout'] = df['from'].apply(mail_direction)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Number of messages per direction ('in' / 'out').\n", | |
"direction_counts = df.groupby('inout').size()\n", | |
"direction_counts.nlargest(20)\n", | |
"#direction_counts.nlargest(20).plot(kind='barh', log=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# The 20 most frequent senders among incoming messages.\n", | |
"incoming = df[df['inout'] == 'in']\n", | |
"incoming.groupby('from').size().nlargest(20)\n", | |
"#incoming.groupby('from').size().nlargest(20).plot(kind='barh')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# The 20 most frequent recipients among outgoing messages.\n", | |
"outgoing = df[df['inout'] == 'out']\n", | |
"outgoing.groupby('to').size().nlargest(20)\n", | |
"#outgoing.groupby('to').size().nlargest(20).plot(kind='barh', log=True)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "basic_data_science_jupyter_kernel", | |
"language": "python", | |
"name": "basic_data_science_jupyter_kernel" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.15" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment