zaltoprofen/likelihood_ratio.py

## likelihood_ratio.py
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats

modelF = lambda deg: Pipeline([
    ('poly', PolynomialFeatures(degree=deg)),
    ('linear', LinearRegression(fit_intercept=False))])

p = 11
beta = np.random.normal(0, 10, p)
def f(x):
    return sum(beta[i]*x**i for i in range(p))

X = (np.random.rand(30) * 10 - 5)
Y = f(X) + np.random.normal(0, 5, size=X.shape)

poor_model = modelF(7)
rich_model = modelF(8)

poor_model = poor_model.fit(X[:, np.newaxis], Y)
rich_model = rich_model.fit(X[:, np.newaxis], Y)

def plot_model(model, name):
    X = np.arange(-5, 5, 0.01)
    eY = model.predict(X[:, np.newaxis])
    plt.plot(X, eY, '-', label=name)

def likelihood(model, X, y):
    ey = model.predict(X[:, np.newaxis])
    rss = np.linalg.norm(y - ey)**2
    n = y.size
    return -n/2 * (np.log(2*np.pi) + 1 + np.log(rss/n))

def parametric_bootstrap(X, y, model1, model2, n=1000):
    def get_dd(X, y, model1, model2, sigma2):
        new_y = ey + np.random.normal(0, np.sqrt(sigma2), y.size)
        model1 = model1.fit(X[:, np.newaxis], new_y)
        model2 = model2.fit(X[:, np.newaxis], new_y)
        return -2 * (likelihood(model1, X, new_y) - likelihood(model2, X, new_y))

    model1 = model1.fit(X[:, np.newaxis], y)
    ey = model1.predict(X[:, np.newaxis])
    rss = np.linalg.norm(y - ey)**2
    sigma2 = rss/y.size
    return np.array([get_dd(X, y, model1, model2, sigma2) for k in range(n)])

L1 = likelihood(poor_model, X, Y)
print('poor model log-likelihood: %f' % L1)
L2 = likelihood(rich_model, X, Y)
print('rich model log-likelihood: %f' % L2)

DD12 = -2 * (L1 - L2)
print('Delta D(1,2): %f' % DD12)

samples_pb = 2000
dd12 = parametric_bootstrap(X, Y, poor_model, rich_model, samples_pb)
larger_than_DD12 = sum(dd12 >= DD12)
print('dd12 >= %f: %d' % (DD12, larger_than_DD12))
print('p(dd12 >= %f)=%f' % (DD12, float(larger_than_DD12)/samples_pb))
print('p(dd12\' >= %f)=%f' % (DD12, scipy.stats.chi2.sf(DD12, 1)))

plt.hist(dd12, bins=30, normed=True)
plt.savefig('plot.png')
plt.show()

## plot.png

      
    Raw
  

              plot.png
	from sklearn.preprocessing import PolynomialFeatures
	from sklearn.linear_model import LinearRegression, Lasso
	from sklearn.pipeline import Pipeline
	import numpy as np
	import matplotlib.pyplot as plt
	import scipy.stats

	modelF = lambda deg: Pipeline([
	('poly', PolynomialFeatures(degree=deg)),
	('linear', LinearRegression(fit_intercept=False))])

	p = 11
	beta = np.random.normal(0, 10, p)
	def f(x):
	return sum(beta[i]x*i for i in range(p))

	X = (np.random.rand(30) * 10 - 5)
	Y = f(X) + np.random.normal(0, 5, size=X.shape)

	poor_model = modelF(7)
	rich_model = modelF(8)

	poor_model = poor_model.fit(X[:, np.newaxis], Y)
	rich_model = rich_model.fit(X[:, np.newaxis], Y)

	def plot_model(model, name):
	X = np.arange(-5, 5, 0.01)
	eY = model.predict(X[:, np.newaxis])
	plt.plot(X, eY, '-', label=name)

	def likelihood(model, X, y):
	ey = model.predict(X[:, np.newaxis])
	rss = np.linalg.norm(y - ey)**2
	n = y.size
	return -n/2 * (np.log(2*np.pi) + 1 + np.log(rss/n))

	def parametric_bootstrap(X, y, model1, model2, n=1000):
	def get_dd(X, y, model1, model2, sigma2):
	new_y = ey + np.random.normal(0, np.sqrt(sigma2), y.size)
	model1 = model1.fit(X[:, np.newaxis], new_y)
	model2 = model2.fit(X[:, np.newaxis], new_y)
	return -2 * (likelihood(model1, X, new_y) - likelihood(model2, X, new_y))

	model1 = model1.fit(X[:, np.newaxis], y)
	ey = model1.predict(X[:, np.newaxis])
	rss = np.linalg.norm(y - ey)**2
	sigma2 = rss/y.size
	return np.array([get_dd(X, y, model1, model2, sigma2) for k in range(n)])

	L1 = likelihood(poor_model, X, Y)
	print('poor model log-likelihood: %f' % L1)
	L2 = likelihood(rich_model, X, Y)
	print('rich model log-likelihood: %f' % L2)

	DD12 = -2 * (L1 - L2)
	print('Delta D(1,2): %f' % DD12)

	samples_pb = 2000
	dd12 = parametric_bootstrap(X, Y, poor_model, rich_model, samples_pb)
	larger_than_DD12 = sum(dd12 >= DD12)
	print('dd12 >= %f: %d' % (DD12, larger_than_DD12))
	print('p(dd12 >= %f)=%f' % (DD12, float(larger_than_DD12)/samples_pb))
	print('p(dd12\' >= %f)=%f' % (DD12, scipy.stats.chi2.sf(DD12, 1)))

	plt.hist(dd12, bins=30, normed=True)
	plt.savefig('plot.png')
	plt.show()