JohannesBuchner/overlapgauss.py

## overlapgauss.py
import numpy
import matplotlib.pyplot as plt
import scipy.stats

# two gaussian uncertainties with width sigma
# at distance delta
# what is the probability that they actually have the same value?

def compute_bayes(delta, border=5):
	a = scipy.stats.norm()
	b = scipy.stats.norm(loc=delta)

	# integration space starts from the left border of the first gaussian
	# and ends at the right edge of the second gaussian
	x = numpy.linspace(-border, border + delta, 4000)
	# which is where the prior is defined
	prior = scipy.stats.uniform(loc=-border, scale=2*border + delta)

	# model single-value:
	Z1 = numpy.trapz(x=x, y=a.pdf(x) * b.pdf(x) * prior.pdf(x))

	# model two-value:
	# the two integrals for the two values are independent
	int_x = numpy.trapz(x=x, y=a.pdf(x) * prior.pdf(x))
	int_y = numpy.trapz(x=x, y=b.pdf(x) * prior.pdf(x))
	Z2 = int_x * int_y

	return Z2 / (Z2 + Z1)

def compute_naive(delta):
	a = scipy.stats.norm()
	b = scipy.stats.norm(loc=delta)

	# fraction of overlapping area

	# at 5 sigma, the remaining area from each is negligible
	border = 5

	x = numpy.linspace(-border, border + delta, 4000)
	ap = a.pdf(x)
	bp = b.pdf(x)
	mul = numpy.min([ap, bp], axis=0)
	Z2 = numpy.trapz(x=x, y=mul)
	return 1 - Z2

def compute_aic(delta):
	a = scipy.stats.norm()
	b = scipy.stats.norm(loc=delta)
	loglike_1 = a.logpdf(0) + b.logpdf(delta)
	# best case is in the middle
	peak = delta / 2.
	loglike_2 = a.logpdf(peak) + b.logpdf(peak)
	deltachi2 = -2 * (loglike_2 - loglike_1)
	# parameters are just the location
	nparam2 = 2
	nparam1 = 1
	return deltachi2 - 2 * (nparam2 - nparam1)


def compute_bic(delta):
	a = scipy.stats.norm()
	b = scipy.stats.norm(loc=delta)
	loglike_1 = a.logpdf(0) + b.logpdf(delta)
	# best case is in the middle
	peak = delta / 2.
	loglike_2 = a.logpdf(peak) + b.logpdf(peak)
	deltachi2 = -2 * (loglike_2 - loglike_1)
	# parameters are just the location
	nparam2 = 2
	nparam1 = 1
	nsample = 2
	return deltachi2 - numpy.log(nsample) * (nparam2 - nparam1)

def compute_bic_prob(delta):
	a = scipy.stats.norm()
	b = scipy.stats.norm(loc=delta)
	loglike_1 = a.logpdf(0) + b.logpdf(delta)
	# best case is in the middle
	peak = delta / 2.
	loglike_2 = a.logpdf(peak) + b.logpdf(peak)
	# parameters are just the location
	nparam2 = 2
	nparam1 = 1
	nsample = 2
	bic1 = -2 * loglike_1 - numpy.log(nsample) * nparam1
	bic2 = -2 * loglike_2 - numpy.log(nsample) * nparam2

	Z1 = -0.5 * numpy.exp(bic1)
	Z2 = -0.5 * numpy.exp(bic2)
	return Z2 / (Z2 + Z1)

deltas = [0, 1, 2, 3, 4, 5]

def test_bayes():

	BICBs = [compute_bic_prob(delta) for delta in deltas]
	plt.plot(deltas, BICBs, ':', label='BIC')

	Bnaives = [compute_naive(delta) for delta in deltas]
	plt.plot(deltas, Bnaives, '-', label='1 - overlapping area')

	alpha = 1
	for border in 3, 5, 1, 100:
		Bs = [compute_bayes(delta, border=border) for delta in deltas]

		plt.plot(deltas, Bs, 'o-', label='prior up to $%s\sigma$' % border, alpha=alpha)
		alpha=0.1

	for d in deltas:
		if int(d) != d: continue
		s = 0.04
		a = scipy.stats.norm(d, s)
		b = scipy.stats.norm(d+s*d, s)
		x = numpy.linspace(d-3*s, d+s*d+3*s, 100)
		ap = a.pdf(x)
		bp = b.pdf(x)
		plt.fill(x, ap / ap.max() * 0.1, color='grey', alpha=0.5)
		plt.fill(x, bp / bp.max() * 0.1, color='blue', alpha=0.5)

	plt.xlabel('distance in units of $\sigma$')
	plt.ylabel('prob(different|D)')
	plt.xlim(-0.5, 5.5)
	plt.legend(loc='best')
	plt.savefig('overlapgauss_bayes.pdf', bbox_inches='tight')
	plt.savefig('overlapgauss_bayes.png', dpi=140, bbox_inches='tight')
	plt.close()

def test_aic():
	import matplotlib.patches as patches
	AICs = [compute_aic(delta) for delta in deltas]

	plt.plot(deltas, AICs, 's-')
	#plt.text(1, 8)
	plt.text(0, 10, 'Are two measurements distinct?', va='center')
	plt.text(1, 7, 'Different value', va='center')
	plt.text(2, -1.5, 'Same value', va='center')
	plt.gca().add_patch(patches.Rectangle((-0.5, 0), deltas[-1]+1, 6, alpha=0.1, linewidth=0, color='grey'))
	plt.gca().add_patch(patches.Rectangle((-0.5, 0), deltas[-1]+1, 5, alpha=0.2, linewidth=0, color='grey'))
	plt.gca().add_patch(patches.Rectangle((-0.5, 0), deltas[-1]+1, 4, alpha=0.3, linewidth=0, color='grey'))

	for d in deltas:
		if int(d) != d: continue
		s = 0.04
		a = scipy.stats.norm(d, s)
		b = scipy.stats.norm(d+s*d, s)
		x = numpy.linspace(d-3*s, d+s*d+3*s, 100)
		ap = a.pdf(x)
		bp = b.pdf(x)
		plt.fill(x, ap / ap.max() * 1 - 4, color='grey', alpha=0.5)
		plt.fill(x, bp / bp.max() * 1 - 4, color='blue', alpha=0.5)

	plt.ylabel('Akaike Information Criterion')
	plt.xlim(-0.5, 5.5)
	plt.xlabel('distance in units of $\sigma$')
	plt.savefig('overlapgauss_aic.pdf', bbox_inches='tight')
	plt.savefig('overlapgauss_aic.png', dpi=140, bbox_inches='tight')
	plt.close()

test_aic()
test_bayes()
	import numpy
	import matplotlib.pyplot as plt
	import scipy.stats

	# two gaussian uncertainties with width sigma
	# at distance delta
	# what is the probability that they actually have the same value?

	def compute_bayes(delta, border=5):
	a = scipy.stats.norm()
	b = scipy.stats.norm(loc=delta)

	# integration space starts from the left border of the first gaussian
	# and ends at the right edge of the second gaussian
	x = numpy.linspace(-border, border + delta, 4000)
	# which is where the prior is defined
	prior = scipy.stats.uniform(loc=-border, scale=2*border + delta)

	# model single-value:
	Z1 = numpy.trapz(x=x, y=a.pdf(x) * b.pdf(x) * prior.pdf(x))

	# model two-value:
	# the two integrals for the two values are independent
	int_x = numpy.trapz(x=x, y=a.pdf(x) * prior.pdf(x))
	int_y = numpy.trapz(x=x, y=b.pdf(x) * prior.pdf(x))
	Z2 = int_x * int_y

	return Z2 / (Z2 + Z1)

	def compute_naive(delta):
	a = scipy.stats.norm()
	b = scipy.stats.norm(loc=delta)

	# fraction of overlapping area

	# at 5 sigma, the remaining area from each is negligible
	border = 5

	x = numpy.linspace(-border, border + delta, 4000)
	ap = a.pdf(x)
	bp = b.pdf(x)
	mul = numpy.min([ap, bp], axis=0)
	Z2 = numpy.trapz(x=x, y=mul)
	return 1 - Z2

	def compute_aic(delta):
	a = scipy.stats.norm()
	b = scipy.stats.norm(loc=delta)
	loglike_1 = a.logpdf(0) + b.logpdf(delta)
	# best case is in the middle
	peak = delta / 2.
	loglike_2 = a.logpdf(peak) + b.logpdf(peak)
	deltachi2 = -2 * (loglike_2 - loglike_1)
	# parameters are just the location
	nparam2 = 2
	nparam1 = 1
	return deltachi2 - 2 * (nparam2 - nparam1)


	def compute_bic(delta):
	a = scipy.stats.norm()
	b = scipy.stats.norm(loc=delta)
	loglike_1 = a.logpdf(0) + b.logpdf(delta)
	# best case is in the middle
	peak = delta / 2.
	loglike_2 = a.logpdf(peak) + b.logpdf(peak)
	deltachi2 = -2 * (loglike_2 - loglike_1)
	# parameters are just the location
	nparam2 = 2
	nparam1 = 1
	nsample = 2
	return deltachi2 - numpy.log(nsample) * (nparam2 - nparam1)

	def compute_bic_prob(delta):
	a = scipy.stats.norm()
	b = scipy.stats.norm(loc=delta)
	loglike_1 = a.logpdf(0) + b.logpdf(delta)
	# best case is in the middle
	peak = delta / 2.
	loglike_2 = a.logpdf(peak) + b.logpdf(peak)
	# parameters are just the location
	nparam2 = 2
	nparam1 = 1
	nsample = 2
	bic1 = -2 * loglike_1 - numpy.log(nsample) * nparam1
	bic2 = -2 * loglike_2 - numpy.log(nsample) * nparam2

	Z1 = -0.5 * numpy.exp(bic1)
	Z2 = -0.5 * numpy.exp(bic2)
	return Z2 / (Z2 + Z1)

	deltas = [0, 1, 2, 3, 4, 5]

	def test_bayes():

	BICBs = [compute_bic_prob(delta) for delta in deltas]
	plt.plot(deltas, BICBs, ':', label='BIC')

	Bnaives = [compute_naive(delta) for delta in deltas]
	plt.plot(deltas, Bnaives, '-', label='1 - overlapping area')

	alpha = 1
	for border in 3, 5, 1, 100:
	Bs = [compute_bayes(delta, border=border) for delta in deltas]

	plt.plot(deltas, Bs, 'o-', label='prior up to $%s\sigma$' % border, alpha=alpha)
	alpha=0.1

	for d in deltas:
	if int(d) != d: continue
	s = 0.04
	a = scipy.stats.norm(d, s)
	b = scipy.stats.norm(d+s*d, s)
	x = numpy.linspace(d-3s, d+sd+3*s, 100)
	ap = a.pdf(x)
	bp = b.pdf(x)
	plt.fill(x, ap / ap.max() * 0.1, color='grey', alpha=0.5)
	plt.fill(x, bp / bp.max() * 0.1, color='blue', alpha=0.5)

	plt.xlabel('distance in units of $\sigma$')
	plt.ylabel('prob(different\|D)')
	plt.xlim(-0.5, 5.5)
	plt.legend(loc='best')
	plt.savefig('overlapgauss_bayes.pdf', bbox_inches='tight')
	plt.savefig('overlapgauss_bayes.png', dpi=140, bbox_inches='tight')
	plt.close()

	def test_aic():
	import matplotlib.patches as patches
	AICs = [compute_aic(delta) for delta in deltas]

	plt.plot(deltas, AICs, 's-')
	#plt.text(1, 8)
	plt.text(0, 10, 'Are two measurements distinct?', va='center')
	plt.text(1, 7, 'Different value', va='center')
	plt.text(2, -1.5, 'Same value', va='center')
	plt.gca().add_patch(patches.Rectangle((-0.5, 0), deltas[-1]+1, 6, alpha=0.1, linewidth=0, color='grey'))
	plt.gca().add_patch(patches.Rectangle((-0.5, 0), deltas[-1]+1, 5, alpha=0.2, linewidth=0, color='grey'))
	plt.gca().add_patch(patches.Rectangle((-0.5, 0), deltas[-1]+1, 4, alpha=0.3, linewidth=0, color='grey'))

	for d in deltas:
	if int(d) != d: continue
	s = 0.04
	a = scipy.stats.norm(d, s)
	b = scipy.stats.norm(d+s*d, s)
	x = numpy.linspace(d-3s, d+sd+3*s, 100)
	ap = a.pdf(x)
	bp = b.pdf(x)
	plt.fill(x, ap / ap.max() * 1 - 4, color='grey', alpha=0.5)
	plt.fill(x, bp / bp.max() * 1 - 4, color='blue', alpha=0.5)

	plt.ylabel('Akaike Information Criterion')
	plt.xlim(-0.5, 5.5)
	plt.xlabel('distance in units of $\sigma$')
	plt.savefig('overlapgauss_aic.pdf', bbox_inches='tight')
	plt.savefig('overlapgauss_aic.png', dpi=140, bbox_inches='tight')
	plt.close()

	test_aic()
	test_bayes()