Skip to content

Instantly share code, notes, and snippets.

@tyleha
Created January 11, 2016 01:55
Show Gist options
  • Save tyleha/77580530af2411ebe4a9 to your computer and use it in GitHub Desktop.
Save tyleha/77580530af2411ebe4a9 to your computer and use it in GitHub Desktop.
Unified code needed to build a heatmap of your email data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import matplotlib.gridspec as gridspec
from datetime import timedelta, datetime, date
import GmailAccount # my package
# --- Build the IMAP search window and the header fetch spec -----------------
daysback = 6000  # how far back to look, in days; adjust to taste
notsince = 0  # end of the window, in days before today (0 == up to today)
since = (date.today() - timedelta(days=daysback)).strftime("%d-%b-%Y")
before = (date.today() - timedelta(days=notsince)).strftime("%d-%b-%Y")
SEARCH = '(SENTSINCE {si} SENTBEFORE {bf})'.format(si=since, bf=before)
# Only the headers are fetched (DATE, TO, CC, FROM, SUBJECT), not the bodies.
ALL_HEADERS = '(BODY.PEEK[HEADER.FIELDS (DATE TO CC FROM SUBJECT)])'

# --- Log in and run the query ------------------------------------------------
# NOTE(review): `password` is not defined anywhere in the visible source; it
# must be supplied before this line runs.
# NOTE(review): `GmailAccount` is imported as a module above but is called
# here like a class -- confirm the import matches the package layout.
gmail = GmailAccount(username='you@gmail.com', password=password)
gmail.login()

# Search and fetch emails!
received = gmail.load_parse_query(
    search_query=SEARCH,
    fetch_query=ALL_HEADERS,
    folder='"[Gmail]/All Mail"',
)
def scrub_email(headers):
    """Return one email's headers as a dict keyed by lowercased header name.

    IMAP sometimes returns fields with varying capitalization, so every
    header name is lowercased to give all emails the same keys.

    Args:
        headers: iterable of ``(name, value)`` pairs.

    Returns:
        dict mapping ``name.lower()`` to ``value``. If two names collide
        after lowercasing, the value that appears last wins.
    """
    return {title.lower(): value for title, value in headers}
# One row per fetched message; the columns are the lowercased header names
# produced by scrub_email. (Assumes each message's `_headers` is a sequence of
# (name, value) pairs, which is what scrub_email iterates over.)
df = pd.DataFrame(
    list(map(scrub_email, (msg._headers for msg in received)))
)
# Parse each date string into a timezone-naive local timestamp, even though the emails span multiple timezones
def try_parse_date(d):
    """Parse an email date string into a timezone-naive local timestamp.

    The value is parsed with ``pd.Timestamp``. Strings that carry no
    timezone are assumed to be UTC. The timestamp is then converted to
    US/Eastern if it falls before 2010-09-01 (US/Eastern) and to US/Pacific
    otherwise, and finally the timezone is dropped so that only the local
    wall-clock time remains.

    Args:
        d: the raw date header value (normally a string).

    Returns:
        A timezone-naive ``pd.Timestamp`` holding the local wall-clock time,
        or ``np.nan`` if the value is missing or cannot be parsed, so the
        caller can drop the row with ``dropna``.
    """
    try:
        ts = pd.Timestamp(d)
        # A missing value (None / NaN) parses to NaT rather than raising;
        # report it the same way as any other unparseable date.
        if ts is pd.NaT:
            return np.nan
        # IMAP is very much not perfect...some emails have no timezone in
        # their date string, so treat those as UTC.
        if ts.tz is None:
            ts = ts.tz_localize('UTC')
        # The author moved from the east coast to the west coast in 2010, so
        # assume Eastern time before that date and Pacific time after it.
        if ts < pd.Timestamp('2010-09-01', tz='US/Eastern'):
            ts = ts.tz_convert('US/Eastern')
        else:
            ts = ts.tz_convert('US/Pacific')
        # tz_localize(None) strips the timezone while keeping the local
        # wall-clock time, which is what the hour-of-day heatmap needs.
        return ts.tz_localize(None)
    except Exception:
        # Deliberately best-effort: date headers in the wild are messy, so
        # any parse/convert failure yields NaN for later removal. Unlike a
        # bare `except:`, this does not trap KeyboardInterrupt/SystemExit.
        return np.nan
# Parse every date header. try_parse_date returns np.nan on failure, and mixing
# NaN with Timestamps can leave the column as object dtype, so coerce it to a
# real datetime column (failures become NaT) before using it as an index.
df['timestamp'] = pd.to_datetime(df.date.map(try_parse_date))
# Remove any emails whose date could not be parsed
df = df.dropna(subset=['timestamp'])
df['hour'] = df.timestamp.map(lambda ts: ts.hour)

freq = 'M'  # could also be 'W' (week) or 'D' (day), but month looks nice.
# Index the emails by the period (month) they fall in, keeping the exact
# timestamp available as a regular column.
df = df.set_index('timestamp', drop=False)
df.index = df.index.to_period(freq)

mindate = df.timestamp.min()
maxdate = df.timestamp.max()
pr = pd.period_range(mindate, maxdate, freq=freq)

# Initialize a new HeatMap dataframe where the indices are actually Periods of time.
# Size the frame anticipating the correct number of rows (periods) and columns
# (hours in a day).
hm = pd.DataFrame(np.zeros([len(pr), 24]), index=pr)
for period in pr:
    # For this period (month), find the relevant emails and count how many
    # were received in each hour of the day. Periods with no email at all are
    # skipped and keep their initial row of zeros.
    if period in df.index:
        # `.loc` replaces the removed `.ix` indexer. The list-style lookup
        # `[[period]]` always yields a DataFrame, even for a single email.
        # The value_counts() result is indexed by hour and is aligned to the
        # 0-23 columns on assignment; hours with no email come out as NaN.
        hm.loc[period] = df.loc[[period]].hour.value_counts()
# Fill the hours that had no email in a given period with zeros.
hm.fillna(0, inplace=True)
### Set up figure
fig = plt.figure(figsize=(12, 8))
# 2x2 grid: heatmap in cell 0, its colorbar in cell 1, and the per-period
# email totals in cell 2 (directly below the heatmap).
gs = gridspec.GridSpec(2, 2, height_ratios=[4, 1], width_ratios=[20, 1])
gs.update(wspace=0.05)

### Plot our heatmap
ax = plt.subplot(gs[0])
# pcolor takes cell *edges*, so N periods need N + 1 x values: the start of
# every period plus the start of the period after the last one. Without that
# closing fencepost the final period is not drawn (or pcolor rejects the
# mismatched shapes, depending on the matplotlib version).
x = dates.date2num(
    [p.start_time.to_pydatetime() for p in pr]
    + [(pr[-1] + 1).start_time.to_pydatetime()]
)
# Same idea vertically: 24 hourly cells need 25 edges. The date itself is an
# arbitrary placeholder; only the time of day is displayed.
t = [datetime(2000, 1, 1, h, 0, 0) for h in range(24)]
t.append(datetime(2000, 1, 2, 0, 0, 0))  # add last fencepost
y = dates.date2num(t)
cm = plt.get_cmap('Oranges')
# hm is (periods x hours); transpose so hours run along the y axis.
# `.values` replaces the removed `DataFrame.as_matrix()`.
heat = ax.pcolor(x, y, hm.transpose().values, cmap=cm)

### Now format our axes to be human-readable
ax.xaxis.set_major_formatter(dates.DateFormatter('%b %Y'))
ax.yaxis.set_major_formatter(dates.DateFormatter('%H:%M'))
# Tick positions and limits are given as date numbers, matching the data
# passed to pcolor, so no implicit unit conversion is needed.
ax.set_yticks(y[::2])
ax.set_xticks(x[:-1:12])  # every 12th period start; skip the closing fencepost
ax.set_xlim([x[0], x[-1]])
ax.set_ylim([y[0], y[-1]])
ax.tick_params(axis='x', pad=14, length=10, direction='inout')

### Add a color bar, tied explicitly to the heatmap
plt.colorbar(heat, cax=plt.subplot(gs[1]))

### Plot the total number of emails per period underneath the heatmap
ax2 = plt.subplot(gs[2])
total_email = df.groupby(level=0).hour.count()
# Convert the period index to date numbers at each period's start so that
# matplotlib is handed plain numbers rather than pandas Period objects.
period_starts = dates.date2num(total_email.index.to_timestamp().to_pydatetime())
ax2.plot(period_starts, total_email.values, '-', linewidth=1.5, color=cm(0.999))
ax2.fill_between(period_starts, 0, total_email.values, color=cm(0.5))
ax2.xaxis.tick_top()
# The return values are bound to `out`, presumably to keep an interactive
# session from echoing them.
out = ax2.set_xticks(period_starts[::12])
out = ax2.xaxis.set_ticklabels([])
ax2.tick_params(axis='x', pad=14, length=10, direction='inout')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment