Created
January 11, 2016 01:55
-
-
Save tyleha/77580530af2411ebe4a9 to your computer and use it in GitHub Desktop.
Unified code needed to build a heatmap of your email data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import matplotlib.dates as dates | |
import matplotlib.gridspec as gridspec | |
from datetime import timedelta, datetime, date | |
import GmailAccount # my package | |
# --- Connect to Gmail and download message headers --------------------------
# NOTE(review): `password` is not bound anywhere in the visible code, and
# `GmailAccount` is imported above as a module but called here like a class --
# presumably both are resolved by the author's own package; confirm.
gmail = GmailAccount(username='you@gmail.com', password=password)
gmail.login()

# The search window is expressed in "days before today" at both ends.
daysback = 6000  # ~10yrs...make this whatever ya like
notsince = 0  # since now.

# Both bounds are rendered as day-month-year strings for the search query.
since = (date.today() - timedelta(days=daysback)).strftime("%d-%b-%Y")
before = (date.today() - timedelta(days=notsince)).strftime("%d-%b-%Y")

SEARCH = '(SENTSINCE {si} SENTBEFORE {bf})'.format(bf=before, si=since)
# Only these header fields are requested; message bodies are not fetched.
ALL_HEADERS = '(BODY.PEEK[HEADER.FIELDS (DATE TO CC FROM SUBJECT)])'

# Search and fetch emails!
received = gmail.load_parse_query(
    search_query=SEARCH,
    fetch_query=ALL_HEADERS,
    folder='"[Gmail]/All Mail"',
)
def scrub_email(headers):
    """Return the headers of one message as a dict keyed by lowercase name.

    IMAP sometimes returns header names with varying capitalization, so each
    name is lowercased to make lookups (and DataFrame columns) consistent.

    Args:
        headers: iterable of ``(name, value)`` pairs.

    Returns:
        dict mapping the lowercased header name to its value. If the same
        name occurs more than once, the last value wins.
    """
    return {title.lower(): value for title, value in headers}
# One row per fetched message; scrub_email supplies the lowercase header names
# that become the columns.
# NOTE(review): `_headers` is a private attribute of the message objects --
# presumably a list of (name, value) pairs; confirm against GmailAccount.
df = pd.DataFrame(
    [scrub_email(message._headers) for message in received]
)
# Parse date strings remaining naive across multiple timezones

# I moved from east coast to west coast in 2010, so automatically assume
# EST/PST before/after this instant.
_COAST_MOVE = pd.Timestamp('2010-09-01', tz='US/Eastern')


def try_parse_date(d):
    """Parse one Date header into a timezone-naive local ``pd.Timestamp``.

    The value is parsed with ``pd.Timestamp``. If it carries no timezone it
    is assumed to be UTC (IMAP is very much not perfect -- some date strings
    have no timezone). The instant is then converted to US/Eastern when it
    falls before 2010-09-01 (Eastern) and to US/Pacific otherwise, and the
    timezone is stripped so the result holds the local wall-clock time.

    Args:
        d: the raw date value, normally a string.

    Returns:
        A naive ``pd.Timestamp``, or ``np.nan`` when the value is missing or
        cannot be parsed, so pandas can drop that email later.
    """
    try:
        ts = pd.Timestamp(d)
        # Missing values (None / NaN) parse to NaT; report them as failures.
        if pd.isnull(ts):
            return np.nan
        if ts.tz is None:
            ts = ts.tz_localize('UTC')
        zone = 'US/Eastern' if ts < _COAST_MOVE else 'US/Pacific'
        # Here's the magic to use timezone-naive timestamps:
        # tz_localize(None) drops the timezone but keeps the local wall time.
        return ts.tz_convert(zone).tz_localize(None)
    except (ValueError, TypeError, OverflowError):
        # Only parsing failures are treated as "bad date". A blanket except
        # would also hide programming errors and turn every email into NaN.
        return np.nan
# Turn every Date header into a naive local timestamp (NaN where parsing failed).
df['timestamp'] = df['date'].map(try_parse_date)

# Remove any emails that Timestamp was unable to parse
df = df.dropna(subset=['timestamp'])

# Hour of day (0-23) of each message, taken from its local timestamp.
df['hour'] = df['timestamp'].map(lambda stamp: stamp.hour)

freq = 'M'  # could also be 'W' (week) or 'D' (day), but month looks nice.

# Index the frame by the period each message falls in, while keeping the
# full timestamp available as a regular column.
df = df.set_index('timestamp', drop=False)
df.index = df.index.to_period(freq)

# Every period between the earliest and the latest message, including
# periods that contain no mail at all.
mindate = df['timestamp'].min()
maxdate = df['timestamp'].max()
pr = pd.period_range(start=mindate, end=maxdate, freq=freq)
# Initialize a new HeatMap dataframe where the indices are actually Periods of time.
# Size the frame anticipating the correct number of rows (periods) and columns
# (hours in a day), starting from all zeros.
hm = pd.DataFrame(np.zeros([len(pr), 24]), index=pr)

for period in pr:
    # For this period (month), find the relevant emails and count how many
    # were received in each hour of the day. Periods without any email keep
    # their row of zeros.
    if period in df.index:
        # .loc replaces the removed .ix indexer. The list selector [period]
        # always yields a DataFrame, even when only one email matches.
        # Assigning the value_counts Series to a row aligns its index (the
        # hour) with hm's columns 0-23.
        hm.loc[period] = df.loc[[period]].hour.value_counts()

# Hours with no email in a period come out of the aligned assignment as NaN;
# fill those NaNs with zeros.
hm.fillna(0, inplace=True)
### Set up figure
fig = plt.figure(figsize=(12, 8))
# 2x2 grid: heatmap (top-left), colorbar (top-right), totals plot (bottom-left).
gs = gridspec.GridSpec(2, 2, height_ratios=[4, 1], width_ratios=[20, 1])
gs.update(wspace=0.05)

### Plot our heatmap
ax = plt.subplot(gs[0])

# pcolor wants the cell *edges*: one more edge than there are cells on each
# axis. x gets the start of every period plus the start of the period after
# the last one, so the final period is drawn instead of being dropped.
x = dates.date2num([p.start_time for p in pr] + [(pr[-1] + 1).start_time])
# y edges: the 24 hours of an arbitrary day plus midnight of the next day.
t = [datetime(2000, 1, 1, h, 0, 0) for h in range(24)]
t.append(datetime(2000, 1, 2, 0, 0, 0))  # add last fencepost
y = dates.date2num(t)

cm = plt.get_cmap('Oranges')
# Rows of the plotted matrix are hours, columns are periods.
# .values replaces the removed DataFrame.as_matrix().
heat = plt.pcolor(x, y, hm.transpose().values, cmap=cm)

### Now format our axes to be human-readable
ax.xaxis.set_major_formatter(dates.DateFormatter('%b %Y'))
ax.yaxis.set_major_formatter(dates.DateFormatter('%H:%M'))
# Ticks and limits use the same date numbers the data was plotted with.
ax.set_yticks(y[::2])
ax.set_xticks(x[:-1:12])  # every 12th period start; skip the extra fencepost
ax.set_xlim([x[0], x[-1]])
ax.set_ylim([y[0], y[-1]])
ax.tick_params(axis='x', pad=14, length=10, direction='inout')

### pcolor makes it sooo easy to add a color bar!
# Pass the mappable explicitly rather than relying on the "current image".
plt.colorbar(heat, cax=plt.subplot(gs[1]))
### Plot the total number of emails per period underneath the heatmap
ax2 = plt.subplot(gs[2])

# Number of emails in each period (the index level is the period).
total_email = df.groupby(level=0).hour.count()

# Convert each period to the date number of its start, the same way the
# heatmap's x positions are computed, and plot with plain `plot` instead of
# the deprecated `plot_date`.
period_starts = dates.date2num([p.start_time for p in total_email.index])
ax2.plot(period_starts, total_email.values, '-', linewidth=1.5, color=cm(0.999))
ax2.fill_between(period_starts, 0, total_email.values, color=cm(0.5))

# Ticks sit on the top edge, every 12th period, with no labels.
ax2.xaxis.tick_top()
out = ax2.set_xticks(period_starts[::12])
out = ax2.xaxis.set_ticklabels([])
ax2.tick_params(axis='x', pad=14, length=10, direction='inout')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment