Create a gist now

Instantly share code, notes, and snippets.

@jseabold /tufte.py
Last active Jan 11, 2016

Recreation of a Tufte-style weather graphic in Python, based on an R blog post (http://asbcllc.com/blog/2015/January/gotham_2014_weather/) and its accompanying gist (https://gist.github.com/abresler/46c36c1a88c849b94b07).
import os
import calendar
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FixedLocator, FixedFormatter
import pandas as pd
import seaborn as sns
def to_colors(x):
    """Scale an 8-bit colour channel (0-255) to the 0-1 float range."""
    return x / 255.


# Named colours as (r, g, b) tuples of floats in [0, 1]; the trailing comment
# on each line is the hex code of the same colour.
# They are built as tuples rather than bare ``map`` objects: on Python 3 a
# ``map`` is a one-shot iterator, so it is neither a valid colour sequence nor
# reusable by more than one plotting call.
blue3 = tuple(to_colors(c) for c in (24, 116, 205))  # 1874CD
wheat2 = tuple(to_colors(c) for c in (238, 216, 174))  # EED8AE
wheat3 = tuple(to_colors(c) for c in (205, 186, 150))  # CDBA96
wheat4 = tuple(to_colors(c) for c in (139, 126, 102))  # 8B7E66
firebrick3 = tuple(to_colors(c) for c in (205, 38, 38))  # CD2626
gray30 = tuple(to_colors(c) for c in (77, 77, 77))  # 4D4D4D
# Load the daily temperature table, caching it locally as "tufte.csv" so the
# remote file is only downloaded when no cached copy exists.
if not os.path.exists("tufte.csv"):
    # Columns in the remote text file are separated by runs of whitespace.
    # The separator must not be able to match the empty string (the previous
    # " *" could): since Python 3.7 a regex split also splits on empty
    # matches, which would break every line into single characters.
    dta = pd.read_table("http://academic.udayton.edu/kissock/http/"
                        "Weather/gsod95-current/NYNEWYOR.txt", sep=r"\s+",
                        names=["month", "day", "year", "temp"])
    dta.to_csv("tufte.csv", index=False)
else:
    dta = pd.read_csv("tufte.csv")
def calc_summary_stats(x, t_crit=2.101):
    """Summarise one group of observations.

    Parameters
    ----------
    x : pandas.DataFrame
        The observations of a single group (one value column per statistic
        row in the result).
    t_crit : float, optional
        Multiplier applied to the standard error to build the interval
        around the mean. The default, 2.101, matches the two-sided 95%
        Student-t critical value for 18 degrees of freedom (presumably
        chosen for 19 years of history per calendar day; confirm).

    Returns
    -------
    pandas.DataFrame
        One row per column of ``x`` with the columns ``lower`` (minimum),
        ``upper`` (maximum), ``avg`` (mean), ``std_err`` (standard
        deviation divided by sqrt(n)), ``ci_upper`` and ``ci_lower``
        (``avg`` plus/minus ``t_crit * std_err``).
    """
    lower = x.min()
    upper = x.max()
    avg = x.mean()
    std_err = x.std() / np.sqrt(len(x))
    margin = t_crit * std_err
    return pd.DataFrame.from_dict(dict(lower=lower, upper=upper,
                                       avg=avg, std_err=std_err,
                                       ci_upper=avg + margin,
                                       ci_lower=avg - margin))
# Index the records by calendar date, built from the year/month/day columns.
dta.set_index(pd.to_datetime(dta.year*10000 + dta.month*100 + dta.day,
                             format="%Y%m%d"), inplace=True)
# Keep only the temperature column and drop rows carrying the -99 value.
dta = dta[["temp"]].query("temp != -99")

# Everything before 2014 is the historical baseline, summarised per
# (month, day) of the calendar.
past = dta.query("index < 2014")
grouped = past.groupby(past.index.map(lambda x : (x.month, x.day)))
past_stats = grouped.apply(calc_summary_stats)
# apply() adds an inner index level coming from the per-group frames; drop it
# so past_stats is indexed by the (month, day) key only.
past_stats.set_index(past_stats.index.droplevel(1), inplace=True)

# 2014 onwards is the "present" series that gets compared to the baseline.
present = dta.query("index >= 2014")
grouped = present.groupby(present.index.map(lambda x : (x.month, x.day)))

# Days whose present value falls outside the historical range.
# ``.loc`` replaces the ``.ix`` indexer, which pandas has removed.
presentlows = grouped.temp.min()
presentlows = presentlows.loc[presentlows <
                              past_stats.loc[presentlows.index]["lower"]]
presenthighs = grouped.temp.max()
presenthighs = presenthighs.loc[presenthighs >
                                past_stats.loc[presenthighs.index]["upper"]]

# Positional x coordinates, one per row of past_stats.
idx = range(len(past_stats))
fig, ax = plt.subplots(figsize=(20, 8), facecolor='white')
# Paint the axes background through the background patch: the ``axisbg``
# axes keyword is no longer accepted by matplotlib, while ``ax.patch`` is
# available in old and new releases alike.
ax.patch.set_facecolor('white')

# plot the high-low bars
ax.vlines(idx, past_stats.lower, past_stats.upper, color=wheat3, alpha=.9,
          linewidth=1.5, zorder=-1)

# plot the confidence interval around the means
ax.vlines(idx, past_stats.ci_lower, past_stats.ci_upper, linewidth=1.5,
          color=wheat4, zorder=-1)

# plot the present year time-series
# x is passed explicitly as 0..n-1 so the line shares the positional axis of
# the bars; a bare ``ax.plot(present)`` may use the datetime index as x.
ax.plot(np.arange(len(present)), present["temp"].values, color='k', zorder=10)

# plot the highs and lows of the present year
# The record positions are looked up in past_stats, which contains a row for
# Feb 29, whereas the present series (2014) has no leap day. Every position
# after the leap-day row is therefore shifted back by one.
# NOTE(review): assumes past_stats holds one row per calendar day, including
# Feb 29, sorted by (month, day) - confirm.
leap_pos = 31 + 28  # 0-based row of Feb 29: 31 Jan days + 28 Feb days before it
x_highs = np.where(past_stats.index.isin(presenthighs.index))[0]
x_highs[x_highs > leap_pos] -= 1
ax.plot(x_highs, presenthighs, 'ro')

x_lows = np.where(past_stats.index.isin(presentlows.index))[0]
x_lows[x_lows > leap_pos] -= 1
ax.plot(x_lows, presentlows, 'bo')

# plot the made-up 2014 range. don't know what this was supposed to show.
# (hand-placed legend glyph: wide record bar, narrower normal-range bar,
# an error bar bracket and their text labels)
ax.vlines(idx[len(idx) // 2 + 2], -5, 30, linewidth=15, color=wheat2)
ax.vlines(idx[len(idx) // 2 + 2], 3, 19, linewidth=15, color=wheat4)
ax.errorbar(len(idx) // 2 + 7, 9, yerr=6, capsize=4, capthick=1,
            color='black')
ax.text(len(idx) // 2 + 8, 9, "Normal Range", verticalalignment='center')
ax.text(len(idx) // 2 + 7, 30, "Record High")
ax.text(len(idx) // 2 + 7, -5, "Record Low", verticalalignment='top')
ax.text(len(idx) // 2 - 1, 9, "2014 Temperature",
        horizontalalignment='right')
##############
## text data
#
# Call-outs pointing at one record low and one record high.
# ``.iloc`` is used to pick the n-th record by position: a plain integer key
# on a Series whose index is not integer-based is ambiguous between label and
# position lookup.
# The unused ``frac`` arrow property is omitted; the zero ``headwidth``
# already suppresses the arrow head.
# NOTE(review): the day counts in the two messages are hard-coded text, not
# derived from presentlows / presenthighs.
ax.annotate("We had 30 days that were the\ncoldest since 1995",
            xy=(x_lows[4], presentlows.iloc[4]), xytext=(50, -45),
            textcoords='offset points', arrowprops=dict(facecolor='blue',
                                                        width=2,
                                                        headwidth=0,
                                                        shrink=.05),
            color='blue', horizontalalignment='left')
ax.annotate("We had 5 days that were the\nhottest since 1995",
            xy=(x_highs[0], presenthighs.iloc[0]), xytext=(0, 40),
            textcoords='offset points', arrowprops=dict(facecolor='red',
                                                        width=2,
                                                        headwidth=0,
                                                        shrink=.05),
            color='red', horizontalalignment='center')

# Explanatory caption. A space was missing between "year" and "since 1995"
# where two adjacent string literals are joined.
# NOTE(review): the caption says 1975 while the call-outs say 1995 - confirm
# which year is intended.
ax.text(69, 94, u"Data represents average daily temperatures. Accessible "
        "data dates back to\nJanuary 1, 1975. Data for 2014 is only "
        "available through December 16.\nAverage temperature for"
        u" the year was 54.8\u00b0 making 2014 the 6th coldest\nyear "
        "since 1995", verticalalignment='top', horizontalalignment='center')
##############
## formatting
#

# --- y axis: a tick every 10 degrees, labelled with a degree sign ---
yticks = range(-10, 101, 10)
ax.yaxis.set_ticks(yticks)
ylabels = []
for tick in yticks:
    ylabels.append(str(tick) + u"\u00b0")
ax.yaxis.set_ticklabels(ylabels, fontsize=14)
ax.yaxis.grid(color='white', zorder=1)


# --- x axis: major ticks at the cumulative month lengths ---
def _last_day_of_month(month_rows):
    # largest day-of-month seen among the rows of one calendar month
    return month_rows.index.day.max()


month_lengths = past.groupby(past.index.month).apply(_last_day_of_month)
xticks = month_lengths.cumsum().values
ax.xaxis.set_ticks(xticks)

# visible, coloured left spine
left_spine = ax.spines['left']
left_spine.set_visible(True)
left_spine.set_color(wheat4)
left_spine.set_linewidth(2)

# Month names sit on minor ticks placed halfway between consecutive month
# boundaries (with 0 prepended as the start of January); the major ticks
# themselves carry no labels and only drive the dotted grid.
xticks = np.r_[0, xticks]
month_starts = xticks[:-1]
month_ends = xticks[1:]
minor_xticks = (month_ends + month_starts)/2
ax.xaxis.set_minor_locator(FixedLocator(minor_xticks))
month_labels = calendar.month_name[1:]
ax.xaxis.set_minor_formatter(FixedFormatter(month_labels))
ax.xaxis.set_ticklabels([])
ax.xaxis.grid(color=wheat3, linestyle='dotted')

# --- titles, limits and output ---
ax.set_title(" New York City's Weather in 2014", loc="left",
             fontsize=23)
ax.text(2, 97, " Temperature in Fahrenheit", fontsize=15,
        fontdict={'weight': 'bold'})
ax.set_xlim(0, len(idx))
ax.set_ylim(-10, 100)
fig.savefig("tufte.svg")
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment