Created
September 10, 2015 10:42
-
-
Save j-faria/7f0a6671565e13586ac3 to your computer and use it in GitHub Desktop.
Bayesian correlation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/env/python | |
# -*- coding: utf-8 -*- | |
# Written by Pedro Figueira. | |
# Original version can be found at https://pedrofigueira@bitbucket.org/pedrofigueira/bayesiancorrelation | |
# Modified by João Faria | |
"""Bayesian Correlation. | |
Usage: | |
BayesCorr [-i FILE] [-rvo FILE] [-n iter] | |
BayesCorr [-h | --help] | |
Options: | |
-h --help Show this screen. | |
-i, --input FILE Input file with x,y data (in 2 columns). | |
-r, --rank Sample Spearman's rank correlation coefficient. | |
-n iter Specify number of iterations [default: 1000]. | |
-v, --verbose Verbose output. | |
-o, --output FILE Output posterior samples to file. | |
""" | |
import docopt | |
import os, sys | |
import numpy as np | |
from scipy.stats import rankdata | |
from scipy.stats import pearsonr, spearmanr | |
import pymc as pm | |
# from pymc.Matplot import plot as mcplot | |
def MCMC_model(data=None, niter=100, inputfile=None, spearmanrank=False, save_posterior=None, verbose=False): | |
""" | |
MCMC calculation for the Bivariate Normal distribution | |
Parameters | |
---------- | |
data: 2d-array | |
Observed x,y data. | |
niter: int, optional | |
Number of iterations to run MCMC for. Warmup is set to *niter*/10. | |
spearmanrank: bool, optional | |
Toggle Spearman's rank calculation. | |
save_posterior: str, optional | |
Filename where to save the samples as a pickle object. | |
verbose: bool, optional | |
Verbose sampling output. | |
""" | |
if data: | |
x_input, y_input = data | |
else: | |
if(inputfile is None): | |
print "Reading test data from file testdata_male.txt" | |
#example from http://sumsar.net/blog/2014/03/bayesian-first-aid-pearson-correlation-test/ | |
x_input, y_input = np.loadtxt("testdata_male.txt", unpack=True) | |
else: | |
print "Reading data from file %s" % inputfile | |
x_input, y_input = np.loadtxt(inputfile, unpack=True) | |
x_input = np.asarray(x_input) | |
y_input = np.asarray(y_input) | |
pearson = pearsonr(x_input, y_input) | |
spearman = spearmanr(x_input, y_input) | |
if verbose: | |
print("Pearson's coefficient is %.3f with a %.2e 2-sided p-value " % (pearson[0], pearson[1])) | |
print("Spearman's rank coefficient is %.3f with a %.2e 2-sided p-value " % (spearman[0], spearman[1])) | |
# rank variables for spearman's rank calculation | |
if(spearmanrank): | |
print ('Analysing ranked data') | |
x_input = rankdata(x_input) | |
y_input = rankdata(y_input) | |
# creating standartized data sets (this does not affect the value of the correlation coefficient) | |
x = (x_input - x_input.mean()) / x_input.std() | |
y = (y_input - y_input.mean()) / y_input.std() | |
#beginning of data analysis | |
data = np.array([x, y]) | |
# assume an uniformative prior for rho between -1 and 1 and starting at 0 for the sampling | |
rho = pm.Uniform('rho', lower=-1, upper=1, value=0.0) | |
# the priors on the means and variances, note the starting point derived from the data | |
mu1 = pm.Normal('mu1', 0.0, 1000.0, value = x.mean()) | |
mu2 = pm.Normal('mu2', 0.0, 1000.0, value = y.mean()) | |
sigma1 = pm.InverseGamma('sigma1', 11.0, 10.0, value = x.std()) | |
sigma2 = pm.InverseGamma('sigma2', 11.0, 10.0, value = y.std()) | |
# the mean and covariance of the bivariate normal are defined as deterministic functions | |
@pm.deterministic | |
def mean(mu1=mu1, mu2=mu2): | |
return np.array([mu1, mu2]) | |
@pm.deterministic | |
def cov(rho=rho, sigma1=sigma1, sigma2=sigma2): | |
ss1 = sigma1 * sigma1 | |
ss2 = sigma2 * sigma2 | |
rss = rho * sigma1 * sigma2 | |
return [[ss1, rss], [rss, ss2]] | |
# bivariate normal - the likelihood | |
xy = pm.MvNormalCov('xy', mu=mean, C=cov, value=data.T, observed=True) | |
# use the previously declared local variables as input to the model | |
model = pm.Model({'xy':xy, 'rho':rho, 'mu1':mu1, 'mu2':mu2, 'sigma1':sigma1, 'sigma2':sigma2}) | |
# call minimization routine to find maximum a posteriori | |
pm.MAP(model).fit() | |
# options | |
verb = 4 if verbose else 0 | |
pbar = verbose # whether to show a progress bar | |
if save_posterior: # save posterior samples? | |
print ('Saving posterior samples in "%s"' % save_posterior) | |
db = 'pickle' | |
dbname = save_posterior if save_posterior.endswith('.pickle') else save_posterior+'.pickle' | |
else: | |
db = 'ram' | |
dbname = None | |
# instantiate the MCMC and sample from the posterior | |
mcmc = pm.MCMC(model, verbose=verb, db=db, dbname=dbname) | |
mcmc.sample(iter=niter, burn=niter/10, progress_bar=pbar) | |
mcmc.rho.summary() | |
if db == 'pickle': | |
mcmc.db.close() | |
return model, mcmc | |
if __name__ == '__main__': | |
# Parse arguments, use file docstring as a parameter definition | |
try: | |
args = docopt.docopt(__doc__) | |
except docopt.DocoptExit as e: # Handle invalid options | |
print e.message | |
sys.exit(1) | |
# print args | |
infile = args['--input'] | |
if infile and not os.path.exists(infile): | |
print 'File "%s" does not seem to exist.' % infile | |
sys.exit(1) | |
model = MCMC_model(inputfile = infile, | |
spearmanrank = args['--rank'], | |
niter = int(args['-n']), | |
save_posterior = args['--output'], | |
verbose = args['--verbose']) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment