Skip to content

Instantly share code, notes, and snippets.

@josef-pkt
Created April 2, 2012 01:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save josef-pkt/2279875 to your computer and use it in GitHub Desktop.
Save josef-pkt/2279875 to your computer and use it in GitHub Desktop.
Plots a scatter plot of vectors x and y, also showing marginal histograms and least squares line fit. Sparklines are optionally displayed (relevant if the data are time series). "shs" in the name is shorthand for "Scatter plot with Histograms and Sparklines".
#Copyright (c) 2011, Josh Hemann (hemann @ colorado . edu)
#All rights reserved.
#
#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the code's author, Josh Hemann, nor the
# names of its contributors, may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from __future__ import division
import warnings
import numpy as np
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import NullFormatter, MaxNLocator
# Module metadata: authorship, license, version and release status, exposed
# as the conventional dunder attributes.
# NOTE(review): the license header at the top of the file says 2011 while
# __copyright__ below says 2010 -- confirm which year is intended.
__author__ = "Josh Hemann"
__copyright__ = "Copyright 2010, Josh Hemann"
__credits__ = ["Josh Hemann; DASH Research Group, University of Colorado"]
__license__ = "BSD"
__version__ = "0.2"
__maintainer__ = "Josh Hemann"
__email__ = "hemann@colorado.edu"
__status__ = "Development"
def _padded_limits(lo, hi):
    """Return (lower, upper) axis limits padded 5% beyond the data range.

    Each bound is scaled by 0.95 or 1.05, chosen by its sign so that the
    limit always moves away from the data (a bound of exactly 0 stays 0).
    """
    lower = lo * 0.95 if lo > 0 else lo * 1.05
    upper = hi * 1.05 if hi > 0 else hi * 0.95
    return lower, upper


def _add_lowess_line(ax, x, y, fraction):
    """Overlay a Biopython lowess smooth of (x, y) on ax, or warn and skip."""
    try:
        from Bio.Statistics import lowess as biop
    except ImportError:
        warnings.warn('Warning: lowess smoothing relies on functionality'
                      ' in BioPython, which is not installed.\nIgnoring'
                      ' lowess plot...')
        return
    # The smooth is drawn as a line, so the points must be ordered by x.
    order = np.argsort(x)
    smooth = biop.lowess(x[order], y[order], iter=10, f=fraction)
    ax.plot(x[order], smooth, 'b-', linewidth=2, alpha=0.6,
            label='Lowess Smooth')


def plot_shs(x, y, xtitle=None, ytitle=None, window_title=None,
             xrange=None, yrange=None, filename=None, format='png',
             sparklines=False, lowess=None, histogram_color='gainsboro',
             sparkline_color='black'):
    """
    Purpose
    -------
    Plots a scatter plot of vectors x and y, also showing marginal histograms
    and least squares line fit. Sparklines are optionally displayed (relevant
    if the data are time series). "shs" in the name is shorthand for "Scatter
    plot with Histograms and Sparklines".

    Inputs
    ------
    x - A 1D sequence (converted to a float numpy array) containing the data
        for the independent variable.
    y - A 1D sequence (converted to a float numpy array) containing the data
        for the dependent variable. Must have the same length as x.

    Keywords
    --------
    filename        - A scalar string defining a path and file name to save
                      the plot. The default format is png.
    format          - A scalar string taking on one of the accepted
                      matplotlib image format strings, e.g. 'png', 'pdf'.
    histogram_color - A scalar string defining the facecolor of the
                      histograms. By default set to 'gainsboro'.
    lowess          - If set as a scalar float, a lowess smooth line is fit
                      to the data, with the float value passed to the
                      Biopython lowess() function's f parameter. Sensible
                      values are between 0.2 and 0.8. If Biopython cannot be
                      imported a warning is issued and the smooth is skipped.
    sparklines      - If True, sparklines are added to the plot, above/right
                      of the histograms.
    sparkline_color - A scalar string defining the line color of the
                      sparklines. By default set to 'black'.
    window_title    - A scalar string defining the title for the plot GUI
                      window. Defaults to 'Scatter Plot'.
    xrange          - A two-element list of floats defining the minimum and
                      maximum of the x-axis. By default the data range
                      padded by 5% is used.
    xtitle          - A scalar string defining the title for the x-axis
                      (abscissa). Defaults to 'X'.
    yrange          - A two-element list of floats defining the minimum and
                      maximum of the y-axis. By default the data range
                      padded by 5% is used.
    ytitle          - A scalar string defining the title for the y-axis
                      (ordinate). Defaults to 'Y'.

    Return
    ------
    None. If x and y differ in length, a message is printed and None is
    returned without plotting.

    Side Effects
    ------------
    Prints the linear regression parameters, optionally writes the image out
    to a file, and displays a matplotlib plot window.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(x) != len(y):
        print('Input arrays must be equal length.')
        return None

    # Fancy indexing (lowess) and reversed slicing (sparklines) need arrays.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    if xtitle is None:
        xtitle = 'X'
    if ytitle is None:
        ytitle = 'Y'
    if window_title is None:
        window_title = 'Scatter Plot'

    slope, intercept, r_value, _p_value, std_err = stats.linregress(x, y)
    print((' %s ' % window_title).center(78, '-'))
    print('Linear regression using stats.linregress')
    print('parameters: a=%.2f b=%.2f, std error= %.3f, R=%.3f'
          % (intercept, slope, std_err, r_value))

    n_points = len(x)
    if xrange is None:
        domain_min, domain_max = _padded_limits(np.min(x), np.max(x))
    else:
        domain_min, domain_max = xrange
    if yrange is None:
        range_min, range_max = _padded_limits(np.min(y), np.max(y))
    else:
        range_min, range_max = yrange

    domain = np.linspace(domain_min, domain_max, n_points)
    regression_line = intercept + (slope * domain)

    # Figure layout in figure-fraction coordinates: scatter panel, then the
    # marginal histograms, then (optionally) the sparklines outside those.
    left, width = 0.11, 0.7
    bottom, height = 0.1, 0.7
    gap = 0.01
    hist_h = 0.10
    sprk_h = 0.05
    bottom_h = bottom + height + gap
    left_h = left + width + gap
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, hist_h]
    rect_histy = [left_h, bottom, hist_h, height]
    rect_sparkx = [left, bottom_h + hist_h, width, sprk_h]
    rect_sparky = [left_h + hist_h, bottom, sprk_h, height]

    # Start with a square figure.
    fig = plt.figure(figsize=(8, 8))
    # Newer matplotlib only offers set_window_title on the figure manager;
    # fall back to the canvas method for old releases.
    manager = getattr(fig.canvas, 'manager', None)
    if manager is not None and hasattr(manager, 'set_window_title'):
        manager.set_window_title(window_title)
    else:
        fig.canvas.set_window_title(window_title)

    ax_scatter = fig.add_axes(rect_scatter)
    ax_scatter.set_ylabel(ytitle)
    ax_scatter.set_xlabel(xtitle)
    ax_scatter.xaxis.set_major_locator(MaxNLocator(8))
    ax_scatter.yaxis.set_major_locator(MaxNLocator(8))

    # The marginal histograms carry no tick labels of their own.
    null_fmt = NullFormatter()
    ax_histx = fig.add_axes(rect_histx)
    ax_histy = fig.add_axes(rect_histy)
    for ax_hist in (ax_histx, ax_histy):
        ax_hist.xaxis.set_major_formatter(null_fmt)
        ax_hist.yaxis.set_major_formatter(null_fmt)

    ax_scatter.scatter(x, y, edgecolors='red', facecolors='none',
                       clip_on=False)
    ax_scatter.plot(domain, regression_line, 'g-', linewidth=2,
                    label='Least Squares Fit, %s=%1.3f'
                    % ('$R^2$', r_value * r_value))
    if lowess is not None:
        _add_lowess_line(ax_scatter, x, y, lowess)

    ax_scatter.set_xlim(domain_min, domain_max)
    ax_scatter.set_ylim(range_min, range_max)
    leg = ax_scatter.legend(loc='best', borderpad=0.2, shadow=False,
                            prop=FontProperties(size='x-small'),
                            markerscale=0.1,
                            title='N=%i R=%1.3f' % (n_points, r_value))
    leg.get_frame().set_alpha(0.8)

    # `density` replaces the `normed` keyword removed from Axes.hist.
    ax_histx.hist(x, bins=40, density=True, facecolor=histogram_color)
    ax_histy.hist(y, bins=40, density=True, facecolor=histogram_color,
                  orientation='horizontal')
    ax_histx.set_xlim(ax_scatter.get_xlim())
    ax_histy.set_ylim(ax_scatter.get_ylim())

    if sparklines:
        ax_sparkx = fig.add_axes(rect_sparkx)
        ax_sparkx.plot(np.arange(n_points), x, color=sparkline_color,
                       linewidth=2)
        ax_sparkx.set_xlim(0, n_points)
        ax_sparkx.axis('off')

        # y is drawn reversed against its index on the vertical axis.
        ax_sparky = fig.add_axes(rect_sparky)
        ax_sparky.plot(y[::-1], np.arange(n_points), color=sparkline_color,
                       linewidth=2)
        ax_sparky.set_ylim(0, n_points)
        ax_sparky.axis('off')

    # Save before show(): once a blocking show() returns the figure may have
    # been closed, and saving afterwards would write an empty image.
    if filename is not None:
        fig.savefig(filename, format=format)
    plt.show()
    return None
if __name__ == '__main__':
    # Demo 1: noisy straight line.
    # Example taken from http://www.scipy.org/Cookbook/LinearRegression
    n = 50
    t = np.linspace(-5, 5, n)
    a = 0.8
    b = -4
    # np.polyval / np.random.randn replace the NumPy aliases that used to be
    # re-exported from the top-level scipy namespace.
    x = np.polyval([a, b], t)
    # Add some noise.
    xn = x + np.random.randn(n)
    plot_shs(t, xn, xtitle='np.linspace(-5,5)', ytitle='Data Value',
             window_title='Test 1 of scatter_hist_spark',
             lowess=0.5)

    # Demo 2: x and y are time series sharing a seasonal component, with
    # different noise levels; sparklines show each series over time.
    N = 100
    seasonality = np.cos(np.arange(N))
    x = seasonality + np.random.normal(loc=0, scale=0.2, size=N)
    y = seasonality + np.random.normal(loc=0, scale=2, size=N)
    plot_shs(x, y, xtitle='Series 1', ytitle='Series 2',
             window_title='Test 2 of scatter_hist_spark',
             lowess=0.5, sparklines=True, histogram_color='goldenrod',
             sparkline_color='fuchsia')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment