Created
April 2, 2012 01:15
-
-
Save josef-pkt/2279875 to your computer and use it in GitHub Desktop.
Plots a scatter plot of vectors x and y, also showing marginal histograms and least squares line fit. Sparklines are optionally displayed (relevant if the data are time series). "shs" in the name is shorthand for "Scatter plot with Histograms and Sparklines".
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Copyright (c) 2011, Josh Hemann (hemann @ colorado . edu) | |
#All rights reserved. | |
# | |
#Redistribution and use in source and binary forms, with or without | |
#modification, are permitted provided that the following conditions are met: | |
# * Redistributions of source code must retain the above copyright | |
# notice, this list of conditions and the following disclaimer. | |
# * Redistributions in binary form must reproduce the above copyright | |
# notice, this list of conditions and the following disclaimer in the | |
# documentation and/or other materials provided with the distribution. | |
# * Neither the name of the code's author, Josh Hemann, nor the | |
# names of its contributors, may be used to endorse or promote products | |
# derived from this software without specific prior written permission. | |
# | |
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
#DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY | |
#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
#ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
from __future__ import division

import warnings

import numpy as np
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.ticker import NullFormatter, MaxNLocator
# Module metadata (authorship, licensing and release status).
__author__ = "Josh Hemann"
__copyright__ = "Copyright 2010, Josh Hemann"
__credits__ = ["Josh Hemann; DASH Research Group, University of Colorado"]
__license__ = "BSD"
__version__ = "0.2"
__maintainer__ = "Josh Hemann"
__email__ = "hemann@colorado.edu"
__status__ = "Development"
def _padded_limits(lowest, highest):
    """Return (lower, upper) axis limits padded 5% outward from the data."""
    # The padding is multiplicative, so the factor depends on the sign: a
    # positive minimum is shrunk while a negative one is grown (and the
    # reverse for the maximum). A bound that is exactly 0 stays at 0.
    lower = lowest * 0.95 if lowest > 0 else lowest * 1.05
    upper = highest * 1.05 if highest > 0 else highest * 0.95
    return lower, upper


def plot_shs(x, y, xtitle=None, ytitle=None, window_title=None,
             xrange=None, yrange=None, filename=None, format='png',
             sparklines=False, lowess=None, histogram_color='gainsboro',
             sparkline_color='black'):
    """
    Purpose
    -------
    Plots a scatter plot of vectors x and y, also showing marginal histograms
    and least squares line fit. Sparklines are optionally displayed (relevant
    if the data are time series). "shs" in the name is shorthand for "Scatter
    plot with Histograms and Sparklines".

    Inputs
    ------
    x - A 1D array-like containing the data for the independent variable.
    y - A 1D array-like containing the data for the dependent variable.
        Must have the same length as x.

    Keywords
    --------
    filename        - A scalar string defining a path and file name to save
                      the displayed plot. The default format is png.
    format          - A scalar string taking on one of the accepted matplotlib
                      image format strings, e.g. 'png', 'pdf'.
    histogram_color - A scalar string defining the facecolor of the
                      histograms. By default set to 'gainsboro'.
    lowess          - If set as a scalar float, a lowess smooth line is fit to
                      the data, with the float value passed to the Biopython
                      lowess() function's f parameter. Sensible values are
                      between 0.2 and 0.8. If Biopython's lowess module cannot
                      be imported, a warning is issued and the smooth is
                      skipped.
    sparklines      - If True, sparklines are added to the plot, above/right
                      of the histograms.
    sparkline_color - A scalar string defining the line color of the
                      sparklines. By default set to 'black'.
    window_title    - A scalar string defining the title for the plot GUI
                      window. Defaults to 'Scatter Plot'.
    xrange          - A two-element list of floats defining the minimum and
                      maximum of the x-axis. By default, limits are estimated
                      by padding the data extremes by 5%.
    xtitle          - A scalar string defining the title for the x-axis
                      (abscissa). Defaults to 'X'.
    yrange          - A two-element list of floats defining the minimum and
                      maximum of the y-axis. By default, limits are estimated
                      by padding the data extremes by 5%.
    ytitle          - A scalar string defining the title for the y-axis
                      (ordinate). Defaults to 'Y'.

    Return
    ------
    None

    Side Effects
    ------------
    Prints the linear regression summary to stdout, optionally writes the
    image to `filename`, and displays a matplotlib plot window. If x and y
    differ in length, a message is printed and nothing is plotted.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    # The print-and-return contract of the original is kept for callers.
    if len(x) != len(y):
        print('Input arrays must be equal length.')
        return None

    # Fancy indexing (used for the lowess fit) requires ndarrays, so accept
    # plain sequences as well.
    x = np.asarray(x)
    y = np.asarray(y)

    if xtitle is None:
        xtitle = 'X'
    if ytitle is None:
        ytitle = 'Y'
    if window_title is None:
        window_title = 'Scatter Plot'

    slope, intercept, r_value, _p_value, stderr = stats.linregress(x, y)
    print((' %s ' % window_title).center(78, '-'))
    print('Linear regression using stats.linregress')
    print('parameters: a=%.2f b=%.2f, std error= %.3f, R=%.3f'
          % (intercept, slope, stderr, r_value))

    n_points = len(x)
    if xrange is None:
        domain_min, domain_max = _padded_limits(np.min(x), np.max(x))
    else:
        domain_min, domain_max = xrange
    if yrange is None:
        range_min, range_max = _padded_limits(np.min(y), np.max(y))
    else:
        range_min, range_max = yrange

    domain = np.linspace(domain_min, domain_max, n_points)
    regression_line = intercept + (slope * domain)

    # Figure layout in figure-fraction coordinates: [left, bottom, w, h].
    left, width = 0.11, 0.7
    bottom, height = 0.1, 0.7
    gap = 0.01
    # The original computed the top histogram's bottom edge as left + width,
    # which only matched the intended position by numeric coincidence.
    bottom_h = bottom + height + gap
    left_h = left + width + gap
    hist_h = 0.10
    sprk_h = 0.05
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, hist_h]
    rect_histy = [left_h, bottom, hist_h, height]
    rect_sparkx = [left, bottom_h + hist_h, width, sprk_h]
    rect_sparky = [left_h + hist_h, bottom, sprk_h, height]

    # Start with a square figure.
    fig = plt.figure(figsize=(8, 8))
    # The window title lives on the figure manager; the canvas-level
    # set_window_title was removed from matplotlib.
    manager = getattr(fig.canvas, 'manager', None)
    if manager is not None:
        manager.set_window_title(window_title)

    ax_scatter = fig.add_axes(rect_scatter)
    ax_scatter.set_ylabel(ytitle)
    ax_scatter.set_xlabel(xtitle)
    ax_scatter.xaxis.set_major_locator(MaxNLocator(8))
    ax_scatter.yaxis.set_major_locator(MaxNLocator(8))

    # The marginal histograms carry no tick labels.
    nullfmt = NullFormatter()
    ax_histx = fig.add_axes(rect_histx)
    ax_histx.xaxis.set_major_formatter(nullfmt)
    ax_histx.yaxis.set_major_formatter(nullfmt)
    ax_histy = fig.add_axes(rect_histy)
    ax_histy.xaxis.set_major_formatter(nullfmt)
    ax_histy.yaxis.set_major_formatter(nullfmt)

    ax_scatter.scatter(x, y, edgecolors='red', facecolors='none',
                       clip_on=False)
    ax_scatter.plot(domain, regression_line, 'g-', linewidth=2,
                    label='Least Squares Fit, %s=%1.3f'
                    % ('$R^2$', r_value * r_value))

    if lowess is not None:
        try:
            # NOTE(review): Bio.Statistics may be absent from recent
            # Biopython releases -- confirm against the installed version.
            from Bio.Statistics import lowess as biop
        except ImportError:
            warnings.warn('Warning: lowess smoothing relies on functionality'
                          ' in BioPython, which is not installed.\nIgnoring'
                          ' lowess plot...')
        else:
            # The smooth is drawn as a line, so the points must be ordered
            # by x first.
            sorted_indices = np.argsort(x)
            lowess_smooth = biop.lowess(x[sorted_indices], y[sorted_indices],
                                        iter=10, f=lowess)
            ax_scatter.plot(x[sorted_indices], lowess_smooth, 'b-',
                            linewidth=2, alpha=0.6, label='Lowess Smooth')

    ax_scatter.set_xlim(domain_min, domain_max)
    ax_scatter.set_ylim(range_min, range_max)
    leg = ax_scatter.legend(loc='best', borderpad=0.2, shadow=False,
                            prop=FontProperties(size='x-small'),
                            markerscale=0.1,
                            title='N=%i R=%1.3f' % (n_points, r_value))
    leg.get_frame().set_alpha(0.8)

    # `density` replaces the `normed` keyword that matplotlib removed.
    ax_histx.hist(x, bins=40, density=True, facecolor=histogram_color)
    ax_histy.hist(y, bins=40, density=True, facecolor=histogram_color,
                  orientation='horizontal')
    # Keep the histograms aligned with the scatter axes.
    ax_histx.set_xlim(ax_scatter.get_xlim())
    ax_histy.set_ylim(ax_scatter.get_ylim())

    if sparklines:
        ax_sparkx = fig.add_axes(rect_sparkx)
        ax_sparkx.plot(np.arange(n_points), x, color=sparkline_color,
                       linewidth=2)
        ax_sparkx.set_xlim(0, n_points)
        ax_sparkx.axis('off')

        # y is reversed so the first sample is drawn at the top of the
        # vertical sparkline.
        ax_sparky = fig.add_axes(rect_sparky)
        ax_sparky.plot(y[::-1], np.arange(len(y)), color=sparkline_color,
                       linewidth=2)
        ax_sparky.set_ylim(0, len(y))
        ax_sparky.axis('off')

    # Save before showing: once a blocking show() returns, the figure has
    # been closed and savefig would write an empty image.
    if filename is not None:
        fig.savefig(filename, format=format)
    plt.show()
    return None
if __name__ == '__main__':
    # Demo 1: noisy straight line.
    # Example taken from http://www.scipy.org/Cookbook/LinearRegression
    n = 50
    t = np.linspace(-5, 5, n)
    a = 0.8
    b = -4
    # np.polyval / np.random.randn replace the deprecated aliases that used
    # to be re-exported from the top-level scipy namespace.
    x = np.polyval([a, b], t)
    # Add standard-normal noise to the line.
    xn = x + np.random.randn(n)
    plot_shs(t, xn, xtitle='np.linspace(-5,5)', ytitle='Data Value',
             window_title='Test 1 of scatter_hist_spark',
             lowess=0.5)

    # Demo 2: x and y are time series sharing a seasonal component, with
    # sparklines enabled.
    N = 100
    seasonality = np.cos(range(N))
    x = seasonality + np.random.normal(loc=0, scale=0.2, size=N)
    y = seasonality + np.random.normal(loc=0, scale=2, size=N)
    plot_shs(x, y, xtitle='Series 1', ytitle='Series 2',
             window_title='Test 2 of scatter_hist_spark',
             lowess=0.5, sparklines=True, histogram_color='goldenrod',
             sparkline_color='fuchsia')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment