cheesinglee/output.txt

## output.txt
Number of map-reduce blocks on columns, and population size on rows [200,500,1000,5000,10000]

1		3		5		10		20		30		50
================================================================================
uncorrelated data
0.020074	0.067028	0.150469	0.112782	0.260606	0.485714	0.400000
-0.018003	-0.095463	-0.102538	0.050660	0.031538	0.358824	0.333333
-0.011529	-0.000860	-0.019766	-0.164176	-0.203649	-0.167781	-0.183459
-0.004745	-0.021326	-0.014286	-0.075185	-0.091779	-0.063402	-0.051401
0.005333	0.000626	0.007783	0.024034	-0.040481	-0.017537	-0.115995


strongly correlated data
0.850227	0.815092	0.811069	0.849624	0.563636	0.371429	0.800000
0.853438	0.862345	0.822358	0.659352	0.490769	0.032353	0.115152
0.849022	0.850589	0.818749	0.729313	0.619688	0.276738	0.312782
0.835320	0.844361	0.818085	0.678766	0.494625	0.403399	0.248713
0.835845	0.847948	0.832904	0.684553	0.477683	0.336716	0.214730


weakly correlated data
-0.342108	-0.449953	-0.599062	-0.563910	-0.878788	-0.771429	-0.800000
-0.310755	-0.244976	-0.203840	-0.158127	-0.238462	-0.047059	-0.151515
-0.247143	-0.286354	-0.257244	-0.165665	-0.248403	-0.320187	-0.284211
-0.228843	-0.216734	-0.174657	-0.210940	-0.192367	-0.127055	-0.065455
-0.258024	-0.268665	-0.249632	-0.210766	-0.235960	-0.238336	-0.223411
>>>

## spearmans.py
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 21 10:40:47 2015

@author: cheesinglee
"""

from numpy import *
from scipy.stats import spearmanr
from matplotlib import pyplot

population_sizes = [200,500,1000,5000,10000]
n_batches = [1,3,5,10,20,30,50]
REPS = 1

print('\t\t'.join(map(str,n_batches)))
print('='*80)

print('uncorrelated data')
for N in population_sizes:
#    print 'N = %d\n' % N
    x_all = random.rand(N)*100
    y_all = random.rand(N)*100
    rho_list = []
    for b in n_batches:
        x = array_split(x_all,b)
        y = array_split(y_all,b)
        updates = []
        for x_batch,y_batch in zip(x,y):
            rho = spearmanr(x_batch,y_batch)[0]
            updates.append([rho,len(x_batch)])
        rho_final = 0
        for rho,n in updates:
            rho_final += float(n)/N*rho
        rho_list.append(rho)
    print('\t'.join(['%0.6f' % x for x in rho_list]))
pyplot.figure()
pyplot.plot(x_all,y_all,'.')
pyplot.title('uncorrelated')


print('\n\nstrongly correlated data')
for N in population_sizes:
    x_all = linspace(-100,100,N)
    y_all = (x_all/25)**3 + 10
    noise = random.randn(N)*10
    y_all += noise
    rho_list = []
    for b in n_batches:
        x = array_split(x_all,b)
        y = array_split(y_all,b)
        updates = []
        for x_batch,y_batch in zip(x,y):
            rho = spearmanr(x_batch,y_batch)[0]
            updates.append([rho,len(x_batch)])
        rho_final = 0
        for rho,n in updates:
            rho_final += float(n)/N*rho
        rho_list.append(rho)
    print('\t'.join(['%0.6f' % x for x in rho_list]))
pyplot.figure()
pyplot.plot(x_all,y_all,'.')
pyplot.title('strongly correlated')

print('\n\nweakly correlated data')
for N in population_sizes:
    mean = [0,0]
    cov = [[400,-100],[-100,400]]
    samples = random.multivariate_normal(mean,cov,N)
    x_all = samples[:,0]
    y_all = samples[:,1]
    rho_list = []
    for b in n_batches:
        x = array_split(x_all,b)
        y = array_split(y_all,b)
        updates = []
        for x_batch,y_batch in zip(x,y):
            rho = spearmanr(x_batch,y_batch)[0]
            updates.append([rho,len(x_batch)])
        rho_final = 0
        for rho,n in updates:
            rho_final += float(n)/N*rho
        rho_list.append(rho)
    print('\t'.join(['%0.6f' % x for x in rho_list]))
pyplot.figure()
pyplot.plot(x_all,y_all,'.')
pyplot.title('weakly correlated')
	Number of map-reduce blocks on columns, and population size on rows [200,500,1000,5000,10000]

	1 3 5 10 20 30 50
	================================================================================
	uncorrelated data
	0.020074 0.067028 0.150469 0.112782 0.260606 0.485714 0.400000
	-0.018003 -0.095463 -0.102538 0.050660 0.031538 0.358824 0.333333
	-0.011529 -0.000860 -0.019766 -0.164176 -0.203649 -0.167781 -0.183459
	-0.004745 -0.021326 -0.014286 -0.075185 -0.091779 -0.063402 -0.051401
	0.005333 0.000626 0.007783 0.024034 -0.040481 -0.017537 -0.115995


	strongly correlated data
	0.850227 0.815092 0.811069 0.849624 0.563636 0.371429 0.800000
	0.853438 0.862345 0.822358 0.659352 0.490769 0.032353 0.115152
	0.849022 0.850589 0.818749 0.729313 0.619688 0.276738 0.312782
	0.835320 0.844361 0.818085 0.678766 0.494625 0.403399 0.248713
	0.835845 0.847948 0.832904 0.684553 0.477683 0.336716 0.214730


	weakly correlated data
	-0.342108 -0.449953 -0.599062 -0.563910 -0.878788 -0.771429 -0.800000
	-0.310755 -0.244976 -0.203840 -0.158127 -0.238462 -0.047059 -0.151515
	-0.247143 -0.286354 -0.257244 -0.165665 -0.248403 -0.320187 -0.284211
	-0.228843 -0.216734 -0.174657 -0.210940 -0.192367 -0.127055 -0.065455
	-0.258024 -0.268665 -0.249632 -0.210766 -0.235960 -0.238336 -0.223411
	>>>
	# -- coding: utf-8 --
	"""
	Created on Wed Jan 21 10:40:47 2015

	@author: cheesinglee
	"""

	from numpy import *
	from scipy.stats import spearmanr
	from matplotlib import pyplot

	population_sizes = [200,500,1000,5000,10000]
	n_batches = [1,3,5,10,20,30,50]
	REPS = 1

	print('\t\t'.join(map(str,n_batches)))
	print('='*80)

	print('uncorrelated data')
	for N in population_sizes:
	# print 'N = %d\n' % N
	x_all = random.rand(N)*100
	y_all = random.rand(N)*100
	rho_list = []
	for b in n_batches:
	x = array_split(x_all,b)
	y = array_split(y_all,b)
	updates = []
	for x_batch,y_batch in zip(x,y):
	rho = spearmanr(x_batch,y_batch)[0]
	updates.append([rho,len(x_batch)])
	rho_final = 0
	for rho,n in updates:
	rho_final += float(n)/N*rho
	rho_list.append(rho)
	print('\t'.join(['%0.6f' % x for x in rho_list]))
	pyplot.figure()
	pyplot.plot(x_all,y_all,'.')
	pyplot.title('uncorrelated')


	print('\n\nstrongly correlated data')
	for N in population_sizes:
	x_all = linspace(-100,100,N)
	y_all = (x_all/25)**3 + 10
	noise = random.randn(N)*10
	y_all += noise
	rho_list = []
	for b in n_batches:
	x = array_split(x_all,b)
	y = array_split(y_all,b)
	updates = []
	for x_batch,y_batch in zip(x,y):
	rho = spearmanr(x_batch,y_batch)[0]
	updates.append([rho,len(x_batch)])
	rho_final = 0
	for rho,n in updates:
	rho_final += float(n)/N*rho
	rho_list.append(rho)
	print('\t'.join(['%0.6f' % x for x in rho_list]))
	pyplot.figure()
	pyplot.plot(x_all,y_all,'.')
	pyplot.title('strongly correlated')

	print('\n\nweakly correlated data')
	for N in population_sizes:
	mean = [0,0]
	cov = [[400,-100],[-100,400]]
	samples = random.multivariate_normal(mean,cov,N)
	x_all = samples[:,0]
	y_all = samples[:,1]
	rho_list = []
	for b in n_batches:
	x = array_split(x_all,b)
	y = array_split(y_all,b)
	updates = []
	for x_batch,y_batch in zip(x,y):
	rho = spearmanr(x_batch,y_batch)[0]
	updates.append([rho,len(x_batch)])
	rho_final = 0
	for rho,n in updates:
	rho_final += float(n)/N*rho
	rho_list.append(rho)
	print('\t'.join(['%0.6f' % x for x in rho_list]))
	pyplot.figure()
	pyplot.plot(x_all,y_all,'.')
	pyplot.title('weakly correlated')