Skip to content

Instantly share code, notes, and snippets.

@cheesinglee
Created October 5, 2015 19:10
Show Gist options
  • Save cheesinglee/c9edd44cf75cb749b6fc to your computer and use it in GitHub Desktop.
Save cheesinglee/c9edd44cf75cb749b6fc to your computer and use it in GitHub Desktop.
Scripts for correlations blog post
#!/usr/bin/env python
from __future__ import print_function
from scipy.stats import pearsonr,spearmanr
"""
Edward Tufte uses this example from Anscombe to show 4 datasets of x
and y that have the same mean, standard deviation, and regression
line, but which are qualitatively different.
matplotlib fun for a rainy day
"""
from pylab import *
x = array([10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5])
y1 = array([8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68])
y2 = array([9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74])
y3 = array([7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73])
x4 = array([8,8,8,8,8,8,8,19,8,8,8])
y4 = array([6.58,5.76,7.71,8.84,8.47,7.04,5.25,12.50,5.56,7.91,6.89])
def fit(x):
return 3+0.5*x
xfit = array( [amin(x), amax(x) ] )
close('all')
subplot(221)
plot(x,y1,'ks', xfit, fit(xfit), 'r-', lw=2)
axis([2,20,2,14])
setp(gca(), xticklabels=[], yticks=(4,8,12), xticks=(0,10,20))
text(3,12, 'I', fontsize=20)
text(3,11, r'$r = %0.5f$' % pearsonr(x,y1)[0],fontsize=14)
text(3,10, r'$\rho = %0.5f$' % spearmanr(x,y1)[0],fontsize=14)
subplot(222)
plot(x,y2,'ks', xfit, fit(xfit), 'r-', lw=2)
axis([2,20,2,14])
setp(gca(), xticklabels=[], yticks=(4,8,12), yticklabels=[], xticks=(0,10,20))
text(3,12, 'II', fontsize=20)
text(3,11, r'$r = %0.5f$' % pearsonr(x,y2)[0],fontsize=14)
text(3,10, r'$\rho = %0.5f$' % spearmanr(x,y2)[0],fontsize=14)
subplot(223)
plot(x,y3,'ks', xfit, fit(xfit), 'r-', lw=2)
axis([2,20,2,14])
text(3,12, 'III', fontsize=20)
setp(gca(), yticks=(4,8,12), xticks=(0,10,20))
text(3,11, r'$r = %0.5f$' % pearsonr(x,y3)[0],fontsize=14)
text(3,10, r'$\rho = %0.5f$' % spearmanr(x,y3)[0],fontsize=14)
subplot(224)
xfit = array([amin(x4),amax(x4)])
plot(x4,y4,'ks', xfit, fit(xfit), 'r-', lw=2)
axis([2,20,2,14])
setp(gca(), yticklabels=[], yticks=(4,8,12), xticks=(0,10,20))
text(3,12, 'IV', fontsize=20)
text(3,11, r'$r = %0.5f$' % pearsonr(x4,y4)[0],fontsize=14)
text(3,10, r'$\rho = %0.5f$' % spearmanr(x4,y4)[0],fontsize=14)
#verify the stats
pairs = (x,y1), (x,y2), (x,y3), (x4,y4)
for x,y in pairs:
print ('mean=%1.2f, std=%1.2f, r=%1.2f'%(mean(y), std(y), corrcoef(x,y)[0][1]))
show()
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
from numpy import *
from scipy.stats import pearsonr, spearmanr, linregress
from matplotlib import pyplot
def regress_and_plot(x,y,xlabel,ylabel):
pyplot.figure()
pyplot.cla()
slope, intercept, r_value, p_value, std_err = linregress(x,y)
x1 = min(x)
x2 = max(x)
y1 = slope*x1 + intercept
y2 = slope*x2 + intercept
x_ann = (x1+x2)/2
y_ann = (y1+y2)/2
pyplot.plot(x,y,'o')
pyplot.plot([x1,x2],[y1,y2])
pyplot.annotate('r = %f' % r_value,
xy=(x_ann,y_ann),
xytext= (x_ann+1, y_ann - 5000),
xycoords='data')
pyplot.xlabel(xlabel)
pyplot.ylabel(ylabel)
pyplot.savefig(xlabel+'_vs_'+ylabel+'.png')
def loan_amount(years):
P = random.randn()*3000 + 35000
r = (0.1 + random.randn()*0.01)/12
N = random.uniform(120,180)
c = r*P/(1 -(1+r)**-N)
N = 12*years
return (P*(1+r)**N - (((1+r)**N-1)/r) * c)
N = 50
age = floor(random.uniform(25,66,N))
age.sort()
salary = [a*1000 + 20000 + random.randn()*a*150 for a in age]
debt = [floor(30000*exp((20-a)/5) + random.randn()*a*0) for a in age]
debt = [max(loan_amount(a-22),0) for a in age]
shoe_size = floor(2*(10 + random.randn(N)))/2
print(pearsonr(age,debt))
print(spearmanr(age,debt))
pyplot.close()
regress_and_plot(age,salary,'Age','Salary')
regress_and_plot(age,debt,'Age','Student Loan Debt')
regress_and_plot(shoe_size,salary,'Shoe Size','Salary')
pyplot.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment