Skip to content

Instantly share code, notes, and snippets.

@cheesinglee
Last active August 29, 2015 14:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cheesinglee/46669b71ad8253978eab to your computer and use it in GitHub Desktop.
Save cheesinglee/46669b71ad8253978eab to your computer and use it in GitHub Desktop.
Map-reduce Spearman's rho?
Columns: number of map-reduce blocks; rows: population size [200, 500, 1000, 5000, 10000]
1 3 5 10 20 30 50
================================================================================
uncorrelated data
0.020074 0.067028 0.150469 0.112782 0.260606 0.485714 0.400000
-0.018003 -0.095463 -0.102538 0.050660 0.031538 0.358824 0.333333
-0.011529 -0.000860 -0.019766 -0.164176 -0.203649 -0.167781 -0.183459
-0.004745 -0.021326 -0.014286 -0.075185 -0.091779 -0.063402 -0.051401
0.005333 0.000626 0.007783 0.024034 -0.040481 -0.017537 -0.115995
strongly correlated data
0.850227 0.815092 0.811069 0.849624 0.563636 0.371429 0.800000
0.853438 0.862345 0.822358 0.659352 0.490769 0.032353 0.115152
0.849022 0.850589 0.818749 0.729313 0.619688 0.276738 0.312782
0.835320 0.844361 0.818085 0.678766 0.494625 0.403399 0.248713
0.835845 0.847948 0.832904 0.684553 0.477683 0.336716 0.214730
weakly correlated data
-0.342108 -0.449953 -0.599062 -0.563910 -0.878788 -0.771429 -0.800000
-0.310755 -0.244976 -0.203840 -0.158127 -0.238462 -0.047059 -0.151515
-0.247143 -0.286354 -0.257244 -0.165665 -0.248403 -0.320187 -0.284211
-0.228843 -0.216734 -0.174657 -0.210940 -0.192367 -0.127055 -0.065455
-0.258024 -0.268665 -0.249632 -0.210766 -0.235960 -0.238336 -0.223411
>>>
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 21 10:40:47 2015
@author: cheesinglee
"""
from numpy import *
from scipy.stats import spearmanr
from matplotlib import pyplot
population_sizes = [200, 500, 1000, 5000, 10000]
n_batches = [1, 3, 5, 10, 20, 30, 50]
REPS = 1  # NOTE(review): unused in the visible code -- kept for compatibility


def mapreduce_spearman(x_all, y_all, n_splits):
    """Approximate Spearman's rho with a map-reduce style combine.

    Splits ``x_all``/``y_all`` into ``n_splits`` contiguous batches
    (``numpy.array_split``), computes Spearman's rho per batch ("map"),
    and returns the batch-size-weighted average of the per-batch
    coefficients ("reduce").

    BUG FIX vs. the original script: the original accumulated the
    weighted sum into ``rho_final`` but then reported the *last batch's*
    rho (``rho_list.append(rho)``); the weighted average is now actually
    returned.

    :param x_all: 1-D array-like of sample values
    :param y_all: 1-D array-like of sample values, same length as x_all
    :param n_splits: number of batches to split the data into
    :return: weighted-average Spearman coefficient (float)
    """
    total = len(x_all)
    rho_final = 0.0
    for x_batch, y_batch in zip(array_split(x_all, n_splits),
                                array_split(y_all, n_splits)):
        rho = spearmanr(x_batch, y_batch)[0]
        # weight each batch's coefficient by its share of the population
        rho_final += float(len(x_batch)) / total * rho
    return rho_final


def _uncorrelated(N):
    """Two independent uniform samples on [0, 100)."""
    return random.rand(N) * 100, random.rand(N) * 100


def _strongly_correlated(N):
    """Monotone cubic trend plus Gaussian noise."""
    x_all = linspace(-100, 100, N)
    y_all = (x_all / 25) ** 3 + 10 + random.randn(N) * 10
    return x_all, y_all


def _weakly_correlated(N):
    """Bivariate normal with mild negative covariance."""
    samples = random.multivariate_normal([0, 0],
                                         [[400, -100], [-100, 400]], N)
    return samples[:, 0], samples[:, 1]


def _run_experiment(title, make_data):
    """Print one row of rho estimates per population size and plot the data.

    Each printed row has one column per entry of ``n_batches``.
    """
    for N in population_sizes:
        x_all, y_all = make_data(N)
        rho_list = [mapreduce_spearman(x_all, y_all, b) for b in n_batches]
        print('\t'.join(['%0.6f' % r for r in rho_list]))
        # NOTE(review): the scraped original lost its indentation, so it is
        # ambiguous whether plotting sat inside the population loop or after
        # it; one scatter per population size is assumed here -- confirm.
        pyplot.figure()
        pyplot.plot(x_all, y_all, '.')
        pyplot.title(title)


def main():
    """Run the three map-reduce Spearman experiments and print the tables."""
    print('\t\t'.join(map(str, n_batches)))
    print('=' * 80)
    print('uncorrelated data')
    _run_experiment('uncorrelated', _uncorrelated)
    print('\n\nstrongly correlated data')
    _run_experiment('strongly correlated', _strongly_correlated)
    print('\n\nweakly correlated data')
    _run_experiment('weakly correlated', _weakly_correlated)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment