tyleha/gmail_heatmap.py

## gmail_heatmap.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import matplotlib.gridspec as gridspec
from datetime import timedelta, datetime, date

import GmailAccount # my package

# NOTE(review): `password` is never assigned in this file, and `GmailAccount`
# is imported as a module yet called like a class here -- confirm both against
# the GmailAccount package before running.
gmail = GmailAccount(username='you@gmail.com', password=password)
gmail.login()

# Search window, expressed as day offsets counted back from today.
daysback = 6000  # roughly 16 years; make this whatever you like
notsince = 0  # 0 means the window ends today
since = (date.today() - timedelta(days=daysback)).strftime("%d-%b-%Y")
before = (date.today() - timedelta(days=notsince)).strftime("%d-%b-%Y")

# IMAP search on the sent date, fetching only the listed header fields.
SEARCH = '(SENTSINCE {si} SENTBEFORE {bf})'.format(si=since, bf=before)
ALL_HEADERS = '(BODY.PEEK[HEADER.FIELDS (DATE TO CC FROM SUBJECT)])'

# Run the search against the "All Mail" folder and keep the parsed messages.
received = gmail.load_parse_query(
    search_query=SEARCH,
    fetch_query=ALL_HEADERS,
    folder='"[Gmail]/All Mail"',
)


def scrub_email(headers):
    """Return a dict mapping each lowercased header name to its value.

    IMAP sometimes returns header names with varying capitalization, so the
    names are lowercased to give every message the same keys.

    Args:
        headers: iterable of ``(name, value)`` pairs.

    Returns:
        dict of ``name.lower() -> value``. When a name occurs more than once,
        the last value wins.
    """
    return {title.lower(): value for title, value in headers}

# One row per fetched message; the columns are the lowercased header names.
df = pd.DataFrame(
    [scrub_email(message._headers) for message in received]
)

# Parse date strings remaining naive across multiple timezones
def try_parse_date(d):
    """Parse an email Date header into a timezone-naive local-time Timestamp.

    The value is parsed with ``pd.Timestamp``; if it carries no timezone it is
    assumed to be UTC. It is then converted to US/Eastern when it falls before
    2010-09-01 (Eastern) and to US/Pacific otherwise, and finally the timezone
    is dropped so the result holds the local wall-clock time.

    Args:
        d: the raw date value (normally a string) from the message headers.

    Returns:
        A naive ``pd.Timestamp``, or ``np.nan`` when the value is missing or
        cannot be parsed, so the row can later be removed with ``dropna``.
    """
    try:
        ts = pd.Timestamp(d)
        if pd.isnull(ts):
            # None/NaN inputs parse to NaT instead of raising.
            return np.nan
        # Some Date headers carry no timezone at all; treat those as UTC.
        if ts.tz is None:
            ts = ts.tz_localize('UTC')
        # The author moved from the east coast to the west coast in 2010, so
        # use Eastern time before that date and Pacific time from then on.
        if ts < pd.Timestamp('2010-09-01', tz='US/Eastern'):
            ts = ts.tz_convert('US/Eastern')
        else:
            ts = ts.tz_convert('US/Pacific')
        # tz_localize(None) strips the timezone but keeps the local wall time.
        # It replaces Timestamp.to_datetime(), which was removed from pandas;
        # combined with the old bare except, that removal turned every single
        # date into NaN.
        return ts.tz_localize(None)
    except Exception:
        # Best effort: header dates are free-form text, so any parsing or
        # conversion failure yields NaN. Unlike a bare except, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return np.nan

# Parse every Date header. to_datetime guarantees a datetime64 column (failed
# parses become NaT), so the .dt accessor and to_period below always apply
# regardless of what dtype map() happened to infer.
df['timestamp'] = pd.to_datetime(df.date.map(try_parse_date))
# Remove any emails whose date could not be parsed.
df = df.dropna(subset=['timestamp'])

# Hour of day (0-23) of each message, in local wall-clock time.
df['hour'] = df.timestamp.dt.hour
freq = 'M'  # could also be 'W' (week) or 'D' (day), but month looks nice.
# Index the rows by the period containing each timestamp. drop=False keeps the
# column as well, because the min/max dates are read from it later.
df = df.set_index('timestamp', drop=False)
df.index = df.index.to_period(freq)


# Every period between the first and the last message, including periods that
# contain no messages at all.
mindate = df.timestamp.min()
maxdate = df.timestamp.max()
pr = pd.period_range(mindate, maxdate, freq=freq)
# Heat-map frame: one row per period, one column per hour of the day (0-23),
# pre-filled with zeros.
hm = pd.DataFrame(np.zeros([len(pr), 24]), index=pr)

for period in pr:
    # Count this period's messages per hour of day. Assigning the resulting
    # Series to the row aligns it on the hour labels (the column names).
    # .loc replaces the .ix indexer, which was removed in pandas 1.0.
    if period in df.index:
        hm.loc[period] = df.loc[[period], 'hour'].value_counts()

# Hours with no messages in a period come back as NaN from the alignment
# above; turn them into zero counts.
hm.fillna(0, inplace=True)


### Set up figure
fig = plt.figure(figsize=(12, 8))
# 2x2 grid: heat map (top-left), its colour bar (top-right), totals (bottom-left).
gs = gridspec.GridSpec(2, 2, height_ratios=[4, 1], width_ratios=[20, 1])
gs.update(wspace=0.05)

### Plot our heatmap
ax = plt.subplot(gs[0])
# pcolor takes cell *edges*: one more x value than columns and one more y value
# than rows. The final x edge used to be missing, so the most recent period's
# column was not drawn.
period_edges = [p.start_time for p in pr]
period_edges.append((pr[-1] + 1).start_time)  # add last fencepost
x = dates.date2num(period_edges)
t = [datetime(2000, 1, 1, h, 0, 0) for h in range(24)]
t.append(datetime(2000, 1, 2, 0, 0, 0))  # add last fencepost
y = dates.date2num(t)
cm = plt.get_cmap('Oranges')
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
mesh = ax.pcolor(x, y, hm.transpose().values, cmap=cm)

### Now format our axes to be human-readable
ax.xaxis.set_major_formatter(dates.DateFormatter('%b %Y'))
ax.yaxis.set_major_formatter(dates.DateFormatter('%H:%M'))
ax.set_yticks(y[::2])
# Tick every 12th period start; the slice skips the trailing fencepost.
ax.set_xticks(x[:-1:12])
ax.set_xlim([x[0], x[-1]])
ax.set_ylim([y[0], y[-1]])
ax.tick_params(axis='x', pad=14, length=10, direction='inout')

### Colour bar for the heat map, drawn in the narrow top-right cell.
fig.colorbar(mesh, cax=plt.subplot(gs[1]))

### Total messages per period, drawn underneath the heat map.
ax2 = plt.subplot(gs[2])
total_email = df.groupby(level=0).hour.count()
# Use date numbers for the start of each period rather than Period objects, so
# this axis is in the same units as the heat map's x axis.
total_x = dates.date2num(total_email.index.to_timestamp().to_pydatetime())
ax2.plot(total_x, total_email.values, '-', linewidth=1.5, color=cm(0.999))
ax2.fill_between(total_x, 0, total_email.values, color=cm(0.5))

# Ticks go on the top edge with no labels.
ax2.xaxis.tick_top()
out = ax2.set_xticks(total_x[::12])
out = ax2.xaxis.set_ticklabels([])
ax2.tick_params(axis='x', pad=14, length=10, direction='inout')
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	import matplotlib.dates as dates
	import matplotlib.gridspec as gridspec
	from datetime import timedelta, datetime, date

	import GmailAccount # my package

	# NOTE(review): `password` is never assigned in this file, and `GmailAccount`
	# is imported as a module yet called like a class here -- confirm both against
	# the GmailAccount package before running.
	gmail = GmailAccount(username='you@gmail.com', password=password)
	gmail.login()

	# Search window, expressed as day offsets counted back from today.
	daysback = 6000  # roughly 16 years; make this whatever you like
	notsince = 0  # 0 means the window ends today
	since = (date.today() - timedelta(days=daysback)).strftime("%d-%b-%Y")
	before = (date.today() - timedelta(days=notsince)).strftime("%d-%b-%Y")

	# IMAP search on the sent date, fetching only the listed header fields.
	SEARCH = '(SENTSINCE {si} SENTBEFORE {bf})'.format(si=since, bf=before)
	ALL_HEADERS = '(BODY.PEEK[HEADER.FIELDS (DATE TO CC FROM SUBJECT)])'

	# Run the search against the "All Mail" folder and keep the parsed messages.
	received = gmail.load_parse_query(
	    search_query=SEARCH,
	    fetch_query=ALL_HEADERS,
	    folder='"[Gmail]/All Mail"',
	)


	def scrub_email(headers):
	    """Return a dict mapping each lowercased header name to its value.

	    IMAP sometimes returns header names with varying capitalization, so the
	    names are lowercased to give every message the same keys.

	    Args:
	        headers: iterable of ``(name, value)`` pairs.

	    Returns:
	        dict of ``name.lower() -> value``. When a name occurs more than once,
	        the last value wins.
	    """
	    return {title.lower(): value for title, value in headers}

	# One row per fetched message; the columns are the lowercased header names.
	df = pd.DataFrame(
	    [scrub_email(message._headers) for message in received]
	)

	# Parse date strings remaining naive across multiple timezones
	def try_parse_date(d):
	    """Parse an email Date header into a timezone-naive local-time Timestamp.

	    The value is parsed with ``pd.Timestamp``; if it carries no timezone it is
	    assumed to be UTC. It is then converted to US/Eastern when it falls before
	    2010-09-01 (Eastern) and to US/Pacific otherwise, and finally the timezone
	    is dropped so the result holds the local wall-clock time.

	    Args:
	        d: the raw date value (normally a string) from the message headers.

	    Returns:
	        A naive ``pd.Timestamp``, or ``np.nan`` when the value is missing or
	        cannot be parsed, so the row can later be removed with ``dropna``.
	    """
	    try:
	        ts = pd.Timestamp(d)
	        if pd.isnull(ts):
	            # None/NaN inputs parse to NaT instead of raising.
	            return np.nan
	        # Some Date headers carry no timezone at all; treat those as UTC.
	        if ts.tz is None:
	            ts = ts.tz_localize('UTC')
	        # The author moved from the east coast to the west coast in 2010, so
	        # use Eastern time before that date and Pacific time from then on.
	        if ts < pd.Timestamp('2010-09-01', tz='US/Eastern'):
	            ts = ts.tz_convert('US/Eastern')
	        else:
	            ts = ts.tz_convert('US/Pacific')
	        # tz_localize(None) strips the timezone but keeps the local wall time.
	        # It replaces Timestamp.to_datetime(), which was removed from pandas;
	        # combined with the old bare except, that removal turned every single
	        # date into NaN.
	        return ts.tz_localize(None)
	    except Exception:
	        # Best effort: header dates are free-form text, so any parsing or
	        # conversion failure yields NaN. Unlike a bare except, this still lets
	        # KeyboardInterrupt and SystemExit propagate.
	        return np.nan

	# Parse every Date header. to_datetime guarantees a datetime64 column (failed
	# parses become NaT), so the .dt accessor and to_period below always apply
	# regardless of what dtype map() happened to infer.
	df['timestamp'] = pd.to_datetime(df.date.map(try_parse_date))
	# Remove any emails whose date could not be parsed.
	df = df.dropna(subset=['timestamp'])

	# Hour of day (0-23) of each message, in local wall-clock time.
	df['hour'] = df.timestamp.dt.hour
	freq = 'M'  # could also be 'W' (week) or 'D' (day), but month looks nice.
	# Index the rows by the period containing each timestamp. drop=False keeps the
	# column as well, because the min/max dates are read from it later.
	df = df.set_index('timestamp', drop=False)
	df.index = df.index.to_period(freq)


	# Every period between the first and the last message, including periods that
	# contain no messages at all.
	mindate = df.timestamp.min()
	maxdate = df.timestamp.max()
	pr = pd.period_range(mindate, maxdate, freq=freq)
	# Heat-map frame: one row per period, one column per hour of the day (0-23),
	# pre-filled with zeros.
	hm = pd.DataFrame(np.zeros([len(pr), 24]), index=pr)

	for period in pr:
	    # Count this period's messages per hour of day. Assigning the resulting
	    # Series to the row aligns it on the hour labels (the column names).
	    # .loc replaces the .ix indexer, which was removed in pandas 1.0.
	    if period in df.index:
	        hm.loc[period] = df.loc[[period], 'hour'].value_counts()

	# Hours with no messages in a period come back as NaN from the alignment
	# above; turn them into zero counts.
	hm.fillna(0, inplace=True)


	### Set up figure
	fig = plt.figure(figsize=(12, 8))
	# 2x2 grid: heat map (top-left), its colour bar (top-right), totals (bottom-left).
	gs = gridspec.GridSpec(2, 2, height_ratios=[4, 1], width_ratios=[20, 1])
	gs.update(wspace=0.05)

	### Plot our heatmap
	ax = plt.subplot(gs[0])
	# pcolor takes cell *edges*: one more x value than columns and one more y value
	# than rows. The final x edge used to be missing, so the most recent period's
	# column was not drawn.
	period_edges = [p.start_time for p in pr]
	period_edges.append((pr[-1] + 1).start_time)  # add last fencepost
	x = dates.date2num(period_edges)
	t = [datetime(2000, 1, 1, h, 0, 0) for h in range(24)]
	t.append(datetime(2000, 1, 2, 0, 0, 0))  # add last fencepost
	y = dates.date2num(t)
	cm = plt.get_cmap('Oranges')
	# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
	mesh = ax.pcolor(x, y, hm.transpose().values, cmap=cm)

	### Now format our axes to be human-readable
	ax.xaxis.set_major_formatter(dates.DateFormatter('%b %Y'))
	ax.yaxis.set_major_formatter(dates.DateFormatter('%H:%M'))
	ax.set_yticks(y[::2])
	# Tick every 12th period start; the slice skips the trailing fencepost.
	ax.set_xticks(x[:-1:12])
	ax.set_xlim([x[0], x[-1]])
	ax.set_ylim([y[0], y[-1]])
	ax.tick_params(axis='x', pad=14, length=10, direction='inout')

	### Colour bar for the heat map, drawn in the narrow top-right cell.
	fig.colorbar(mesh, cax=plt.subplot(gs[1]))

	### Total messages per period, drawn underneath the heat map.
	ax2 = plt.subplot(gs[2])
	total_email = df.groupby(level=0).hour.count()
	# Use date numbers for the start of each period rather than Period objects, so
	# this axis is in the same units as the heat map's x axis.
	total_x = dates.date2num(total_email.index.to_timestamp().to_pydatetime())
	ax2.plot(total_x, total_email.values, '-', linewidth=1.5, color=cm(0.999))
	ax2.fill_between(total_x, 0, total_email.values, color=cm(0.5))

	# Ticks go on the top edge with no labels.
	ax2.xaxis.tick_top()
	out = ax2.set_xticks(total_x[::12])
	out = ax2.xaxis.set_ticklabels([])
	ax2.tick_params(axis='x', pad=14, length=10, direction='inout')