Skip to content

Instantly share code, notes, and snippets.

@j-faria
Created September 10, 2015 10:42
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save j-faria/7f0a6671565e13586ac3 to your computer and use it in GitHub Desktop.
Save j-faria/7f0a6671565e13586ac3 to your computer and use it in GitHub Desktop.
Bayesian correlation
#!/usr/env/python
# -*- coding: utf-8 -*-
# Written by Pedro Figueira.
# Original version can be found at https://pedrofigueira@bitbucket.org/pedrofigueira/bayesiancorrelation
# Modified by João Faria
"""Bayesian Correlation.
Usage:
BayesCorr [-i FILE] [-rvo FILE] [-n iter]
BayesCorr [-h | --help]
Options:
-h --help Show this screen.
-i, --input FILE Input file with x,y data (in 2 columns).
-r, --rank Sample Spearman's rank correlation coefficient.
-n iter Specify number of iterations [default: 1000].
-v, --verbose Verbose output.
-o, --output FILE Output posterior samples to file.
"""
import docopt
import os, sys
import numpy as np
from scipy.stats import rankdata
from scipy.stats import pearsonr, spearmanr
import pymc as pm
# from pymc.Matplot import plot as mcplot
def MCMC_model(data=None, niter=100, inputfile=None, spearmanrank=False, save_posterior=None, verbose=False):
"""
MCMC calculation for the Bivariate Normal distribution
Parameters
----------
data: 2d-array
Observed x,y data.
niter: int, optional
Number of iterations to run MCMC for. Warmup is set to *niter*/10.
spearmanrank: bool, optional
Toggle Spearman's rank calculation.
save_posterior: str, optional
Filename where to save the samples as a pickle object.
verbose: bool, optional
Verbose sampling output.
"""
if data:
x_input, y_input = data
else:
if(inputfile is None):
print "Reading test data from file testdata_male.txt"
#example from http://sumsar.net/blog/2014/03/bayesian-first-aid-pearson-correlation-test/
x_input, y_input = np.loadtxt("testdata_male.txt", unpack=True)
else:
print "Reading data from file %s" % inputfile
x_input, y_input = np.loadtxt(inputfile, unpack=True)
x_input = np.asarray(x_input)
y_input = np.asarray(y_input)
pearson = pearsonr(x_input, y_input)
spearman = spearmanr(x_input, y_input)
if verbose:
print("Pearson's coefficient is %.3f with a %.2e 2-sided p-value " % (pearson[0], pearson[1]))
print("Spearman's rank coefficient is %.3f with a %.2e 2-sided p-value " % (spearman[0], spearman[1]))
# rank variables for spearman's rank calculation
if(spearmanrank):
print ('Analysing ranked data')
x_input = rankdata(x_input)
y_input = rankdata(y_input)
# creating standartized data sets (this does not affect the value of the correlation coefficient)
x = (x_input - x_input.mean()) / x_input.std()
y = (y_input - y_input.mean()) / y_input.std()
#beginning of data analysis
data = np.array([x, y])
# assume an uniformative prior for rho between -1 and 1 and starting at 0 for the sampling
rho = pm.Uniform('rho', lower=-1, upper=1, value=0.0)
# the priors on the means and variances, note the starting point derived from the data
mu1 = pm.Normal('mu1', 0.0, 1000.0, value = x.mean())
mu2 = pm.Normal('mu2', 0.0, 1000.0, value = y.mean())
sigma1 = pm.InverseGamma('sigma1', 11.0, 10.0, value = x.std())
sigma2 = pm.InverseGamma('sigma2', 11.0, 10.0, value = y.std())
# the mean and covariance of the bivariate normal are defined as deterministic functions
@pm.deterministic
def mean(mu1=mu1, mu2=mu2):
return np.array([mu1, mu2])
@pm.deterministic
def cov(rho=rho, sigma1=sigma1, sigma2=sigma2):
ss1 = sigma1 * sigma1
ss2 = sigma2 * sigma2
rss = rho * sigma1 * sigma2
return [[ss1, rss], [rss, ss2]]
# bivariate normal - the likelihood
xy = pm.MvNormalCov('xy', mu=mean, C=cov, value=data.T, observed=True)
# use the previously declared local variables as input to the model
model = pm.Model({'xy':xy, 'rho':rho, 'mu1':mu1, 'mu2':mu2, 'sigma1':sigma1, 'sigma2':sigma2})
# call minimization routine to find maximum a posteriori
pm.MAP(model).fit()
# options
verb = 4 if verbose else 0
pbar = verbose # whether to show a progress bar
if save_posterior: # save posterior samples?
print ('Saving posterior samples in "%s"' % save_posterior)
db = 'pickle'
dbname = save_posterior if save_posterior.endswith('.pickle') else save_posterior+'.pickle'
else:
db = 'ram'
dbname = None
# instantiate the MCMC and sample from the posterior
mcmc = pm.MCMC(model, verbose=verb, db=db, dbname=dbname)
mcmc.sample(iter=niter, burn=niter/10, progress_bar=pbar)
mcmc.rho.summary()
if db == 'pickle':
mcmc.db.close()
return model, mcmc
if __name__ == '__main__':
# Parse arguments, use file docstring as a parameter definition
try:
args = docopt.docopt(__doc__)
except docopt.DocoptExit as e: # Handle invalid options
print e.message
sys.exit(1)
# print args
infile = args['--input']
if infile and not os.path.exists(infile):
print 'File "%s" does not seem to exist.' % infile
sys.exit(1)
model = MCMC_model(inputfile = infile,
spearmanrank = args['--rank'],
niter = int(args['-n']),
save_posterior = args['--output'],
verbose = args['--verbose'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment