j-faria/BayesCorr.py

## BayesCorr.py
#!/usr/env/python
# -*- coding: utf-8 -*-

# Written by Pedro Figueira.
#  Original version can be found at https://pedrofigueira@bitbucket.org/pedrofigueira/bayesiancorrelation
# Modified by João Faria

"""Bayesian Correlation.

Usage:
    BayesCorr [-i FILE] [-rvo FILE] [-n iter]
    BayesCorr [-h | --help]

Options:
    -h --help           Show this screen.
    -i, --input FILE    Input file with x,y data (in 2 columns).
    -r, --rank          Sample Spearman's rank correlation coefficient.
    -n iter             Specify number of iterations [default: 1000].
    -v, --verbose       Verbose output.
    -o, --output FILE   Output posterior samples to file.
"""


import docopt
import os, sys
import numpy as np

from scipy.stats import rankdata
from scipy.stats import pearsonr, spearmanr

import pymc as pm
# from pymc.Matplot import plot as mcplot

def MCMC_model(data=None, niter=100, inputfile=None, spearmanrank=False, save_posterior=None, verbose=False):
    """
    MCMC calculation for the Bivariate Normal distribution

    Parameters
    ----------

    data: 2d-array
          Observed x,y data.
    niter: int, optional
           Number of iterations to run MCMC for. Warmup is set to *niter*/10.
    spearmanrank: bool, optional
                  Toggle Spearman's rank calculation.
    save_posterior: str, optional
                    Filename where to save the samples as a pickle object.
    verbose: bool, optional
             Verbose sampling output.
    """

    if data:
        x_input, y_input = data
    else:
        if(inputfile is None):
            print "Reading test data from file testdata_male.txt"
            #example from http://sumsar.net/blog/2014/03/bayesian-first-aid-pearson-correlation-test/
            x_input, y_input = np.loadtxt("testdata_male.txt", unpack=True)
        else:
            print "Reading data from file %s" % inputfile
            x_input, y_input = np.loadtxt(inputfile, unpack=True)

    x_input = np.asarray(x_input)
    y_input = np.asarray(y_input)

    pearson = pearsonr(x_input, y_input)
    spearman = spearmanr(x_input, y_input)
    if verbose:
        print("Pearson's coefficient is %.3f with a %.2e 2-sided p-value " % (pearson[0], pearson[1]))
        print("Spearman's rank coefficient is %.3f with a %.2e 2-sided p-value " % (spearman[0], spearman[1]))

    # rank variables for spearman's rank calculation
    if(spearmanrank):
        print ('Analysing ranked data')
        x_input = rankdata(x_input)
        y_input = rankdata(y_input)

    # creating standartized data sets (this does not affect the value of the correlation coefficient)
    x = (x_input - x_input.mean()) / x_input.std()
    y = (y_input - y_input.mean()) / y_input.std()

    #beginning of data analysis
    data = np.array([x, y])

    # assume an uniformative prior for rho between -1 and 1 and starting at 0 for the sampling
    rho = pm.Uniform('rho', lower=-1, upper=1, value=0.0)
    # the priors on the means and variances, note the starting point derived from the data
    mu1 = pm.Normal('mu1', 0.0, 1000.0, value = x.mean())
    mu2 = pm.Normal('mu2', 0.0, 1000.0, value = y.mean())
    sigma1 = pm.InverseGamma('sigma1', 11.0, 10.0, value = x.std())
    sigma2 = pm.InverseGamma('sigma2', 11.0, 10.0, value = y.std())

    # the mean and covariance of the bivariate normal are defined as deterministic functions
    @pm.deterministic
    def mean(mu1=mu1, mu2=mu2):
        return np.array([mu1, mu2])

    @pm.deterministic
    def cov(rho=rho, sigma1=sigma1, sigma2=sigma2):
        ss1 = sigma1 * sigma1
        ss2 = sigma2 * sigma2
        rss = rho * sigma1 * sigma2
        return [[ss1, rss], [rss, ss2]]

    # bivariate normal - the likelihood
    xy = pm.MvNormalCov('xy', mu=mean, C=cov, value=data.T, observed=True)

    # use the previously declared local variables as input to the model
    model = pm.Model({'xy':xy, 'rho':rho, 'mu1':mu1, 'mu2':mu2, 'sigma1':sigma1, 'sigma2':sigma2})
    # call minimization routine to find maximum a posteriori
    pm.MAP(model).fit()

    # options
    verb = 4 if verbose else 0
    pbar = verbose  # whether to show a progress bar

    if save_posterior: # save posterior samples?
        print ('Saving posterior samples in "%s"' % save_posterior)
        db = 'pickle'
        dbname = save_posterior if save_posterior.endswith('.pickle') else save_posterior+'.pickle'
    else:
        db = 'ram'
        dbname = None

    # instantiate the MCMC and sample from the posterior
    mcmc = pm.MCMC(model, verbose=verb, db=db, dbname=dbname)
    mcmc.sample(iter=niter, burn=niter/10, progress_bar=pbar)

    mcmc.rho.summary()

    if db == 'pickle':
        mcmc.db.close()


    return model, mcmc


if __name__ == '__main__':
    # Parse arguments, use file docstring as a parameter definition
    try:
        args = docopt.docopt(__doc__)
    except docopt.DocoptExit as e: # Handle invalid options
        print e.message
        sys.exit(1)

    # print args

    infile = args['--input']
    if infile and not os.path.exists(infile):
        print 'File "%s" does not seem to exist.' % infile
        sys.exit(1)

    model = MCMC_model(inputfile = infile,
                       spearmanrank = args['--rank'],
                       niter = int(args['-n']),
                       save_posterior = args['--output'],
                       verbose = args['--verbose'])
	#!/usr/env/python
	# -- coding: utf-8 --

	# Written by Pedro Figueira.
	# Original version can be found at https://pedrofigueira@bitbucket.org/pedrofigueira/bayesiancorrelation
	# Modified by João Faria

	"""Bayesian Correlation.

	Usage:
	BayesCorr [-i FILE] [-rvo FILE] [-n iter]
	BayesCorr [-h \| --help]

	Options:
	-h --help Show this screen.
	-i, --input FILE Input file with x,y data (in 2 columns).
	-r, --rank Sample Spearman's rank correlation coefficient.
	-n iter Specify number of iterations [default: 1000].
	-v, --verbose Verbose output.
	-o, --output FILE Output posterior samples to file.
	"""


	import docopt
	import os, sys
	import numpy as np

	from scipy.stats import rankdata
	from scipy.stats import pearsonr, spearmanr

	import pymc as pm
	# from pymc.Matplot import plot as mcplot

	def MCMC_model(data=None, niter=100, inputfile=None, spearmanrank=False, save_posterior=None, verbose=False):
	"""
	MCMC calculation for the Bivariate Normal distribution

	Parameters
	----------

	data: 2d-array
	Observed x,y data.
	niter: int, optional
	Number of iterations to run MCMC for. Warmup is set to niter/10.
	spearmanrank: bool, optional
	Toggle Spearman's rank calculation.
	save_posterior: str, optional
	Filename where to save the samples as a pickle object.
	verbose: bool, optional
	Verbose sampling output.
	"""

	if data:
	x_input, y_input = data
	else:
	if(inputfile is None):
	print "Reading test data from file testdata_male.txt"
	#example from http://sumsar.net/blog/2014/03/bayesian-first-aid-pearson-correlation-test/
	x_input, y_input = np.loadtxt("testdata_male.txt", unpack=True)
	else:
	print "Reading data from file %s" % inputfile
	x_input, y_input = np.loadtxt(inputfile, unpack=True)

	x_input = np.asarray(x_input)
	y_input = np.asarray(y_input)

	pearson = pearsonr(x_input, y_input)
	spearman = spearmanr(x_input, y_input)
	if verbose:
	print("Pearson's coefficient is %.3f with a %.2e 2-sided p-value " % (pearson[0], pearson[1]))
	print("Spearman's rank coefficient is %.3f with a %.2e 2-sided p-value " % (spearman[0], spearman[1]))

	# rank variables for spearman's rank calculation
	if(spearmanrank):
	print ('Analysing ranked data')
	x_input = rankdata(x_input)
	y_input = rankdata(y_input)

	# creating standartized data sets (this does not affect the value of the correlation coefficient)
	x = (x_input - x_input.mean()) / x_input.std()
	y = (y_input - y_input.mean()) / y_input.std()

	#beginning of data analysis
	data = np.array([x, y])

	# assume an uniformative prior for rho between -1 and 1 and starting at 0 for the sampling
	rho = pm.Uniform('rho', lower=-1, upper=1, value=0.0)
	# the priors on the means and variances, note the starting point derived from the data
	mu1 = pm.Normal('mu1', 0.0, 1000.0, value = x.mean())
	mu2 = pm.Normal('mu2', 0.0, 1000.0, value = y.mean())
	sigma1 = pm.InverseGamma('sigma1', 11.0, 10.0, value = x.std())
	sigma2 = pm.InverseGamma('sigma2', 11.0, 10.0, value = y.std())

	# the mean and covariance of the bivariate normal are defined as deterministic functions
	@pm.deterministic
	def mean(mu1=mu1, mu2=mu2):
	return np.array([mu1, mu2])

	@pm.deterministic
	def cov(rho=rho, sigma1=sigma1, sigma2=sigma2):
	ss1 = sigma1 * sigma1
	ss2 = sigma2 * sigma2
	rss = rho * sigma1 * sigma2
	return [[ss1, rss], [rss, ss2]]

	# bivariate normal - the likelihood
	xy = pm.MvNormalCov('xy', mu=mean, C=cov, value=data.T, observed=True)

	# use the previously declared local variables as input to the model
	model = pm.Model({'xy':xy, 'rho':rho, 'mu1':mu1, 'mu2':mu2, 'sigma1':sigma1, 'sigma2':sigma2})
	# call minimization routine to find maximum a posteriori
	pm.MAP(model).fit()

	# options
	verb = 4 if verbose else 0
	pbar = verbose # whether to show a progress bar

	if save_posterior: # save posterior samples?
	print ('Saving posterior samples in "%s"' % save_posterior)
	db = 'pickle'
	dbname = save_posterior if save_posterior.endswith('.pickle') else save_posterior+'.pickle'
	else:
	db = 'ram'
	dbname = None

	# instantiate the MCMC and sample from the posterior
	mcmc = pm.MCMC(model, verbose=verb, db=db, dbname=dbname)
	mcmc.sample(iter=niter, burn=niter/10, progress_bar=pbar)

	mcmc.rho.summary()

	if db == 'pickle':
	mcmc.db.close()


	return model, mcmc


	if __name__ == '__main__':
	# Parse arguments, use file docstring as a parameter definition
	try:
	args = docopt.docopt(__doc__)
	except docopt.DocoptExit as e: # Handle invalid options
	print e.message
	sys.exit(1)

	# print args

	infile = args['--input']
	if infile and not os.path.exists(infile):
	print 'File "%s" does not seem to exist.' % infile
	sys.exit(1)

	model = MCMC_model(inputfile = infile,
	spearmanrank = args['--rank'],
	niter = int(args['-n']),
	save_posterior = args['--output'],
	verbose = args['--verbose'])