oseiskar/bootstrap.m

## bootstrap.m
%% GNU Octave

% The model
function y = model(x, y0, theta, sigma)
    noise = randn(size(x))*sigma;
    y = y0 * exp(-theta*x) + noise;
endfunction

% Least squares-type fit to the model
function [y0, theta, sigma] = fit_model(x, y)
    y = abs(y);
    n = numel(x);
    m_theta_and_lny0 = ([x(:),ones(n,1)] \ log(y(:)));
    theta = -m_theta_and_lny0(1);
    y0 = exp(m_theta_and_lny0(2));
    yrec = model(x, y0, theta, 0);
    sigma = sqrt(sum((yrec - y).^2)/(n-1));
endfunction

% Example: two different populations, A and B
fprintf('--- Actual parameters \n');
x = linspace(0,5, 50);

% note: swapping A and B seem to significantly worsen the results
y0_a = 0.8
y0_b = 1.7

theta_a = 0.8
theta_b = 1.3

sigma_a = 0.1
sigma_b = 0.15

y_a = model(x, y0_a, theta_a, sigma_a);
y_b = model(x, y0_b, theta_b, sigma_b);

fprintf('--- Estimated parameters \n');
[y0_a_est, theta_a_est, sigma_a_est] = fit_model(x, y_a)
[y0_b_est, theta_b_est, sigma_b_est] = fit_model(x, y_b)


fprintf('--- Parametric bootstrap test \n');
% Define test statistic z (Euclidean distance of vectors)
function z = test_statistic(pop_a, pop_b)
    z = sqrt(sum((pop_a - pop_b).^2, 2));
endfunction

function z = bootstrap(x, y0, theta, sigma, n_samples)
    x_mat = repmat(x, n_samples, 1);

    pop_a = model(x_mat, y0, theta, sigma);
    pop_b = model(x_mat, y0, theta, sigma);

    z = test_statistic(pop_a, pop_b);
endfunction

% Parametric bootstrap: compute distribution of z under the null hypothesis:
% population B has the same distribution, i.e., model parameters as A
% (same as the estimated parameters)
n_samples = 100000
diffs = bootstrap(x, y0_a_est, theta_a_est, sigma_a_est, n_samples);

figure(1);
hist(diffs, bins=50);
title('Distribution of z');

% Find threshold of z for the given significance level
significance_level = 0.05
threshold = quantile(diffs, 1-significance_level)

% Compute the actual test statistic for A and B
z_value = test_statistic(y_a, y_b)
null_hypothesis_rejected = z_value > threshold

figure(3);
plot(x,y_a, 'ro');
hold on;
plot(x,y_b, 'bo');
plot(x, model(x, y0_a_est, theta_a_est, 0), 'r--');
plot(x, model(x, y0_b_est, theta_b_est, 0), 'b--');

plot(x, model(x, y0_a, theta_a, 0), 'r-');
plot(x, model(x, y0_b, theta_b, 0), 'b-');

hold off;
legend('sample from A', 'sample from B',
       'estimated A', 'estimated B',
       'actual A', 'actual B');

input('- press enter -');
	%% GNU Octave

	% The model
	function y = model(x, y0, theta, sigma)
	noise = randn(size(x))*sigma;
	y = y0 * exp(-theta*x) + noise;
	endfunction

	% Least squares-type fit to the model
	function [y0, theta, sigma] = fit_model(x, y)
	y = abs(y);
	n = numel(x);
	m_theta_and_lny0 = ([x(:),ones(n,1)] \ log(y(:)));
	theta = -m_theta_and_lny0(1);
	y0 = exp(m_theta_and_lny0(2));
	yrec = model(x, y0, theta, 0);
	sigma = sqrt(sum((yrec - y).^2)/(n-1));
	endfunction

	% Example: two different populations, A and B
	fprintf('--- Actual parameters \n');
	x = linspace(0,5, 50);

	% note: swapping A and B seem to significantly worsen the results
	y0_a = 0.8
	y0_b = 1.7

	theta_a = 0.8
	theta_b = 1.3

	sigma_a = 0.1
	sigma_b = 0.15

	y_a = model(x, y0_a, theta_a, sigma_a);
	y_b = model(x, y0_b, theta_b, sigma_b);

	fprintf('--- Estimated parameters \n');
	[y0_a_est, theta_a_est, sigma_a_est] = fit_model(x, y_a)
	[y0_b_est, theta_b_est, sigma_b_est] = fit_model(x, y_b)


	fprintf('--- Parametric bootstrap test \n');
	% Define test statistic z (Euclidean distance of vectors)
	function z = test_statistic(pop_a, pop_b)
	z = sqrt(sum((pop_a - pop_b).^2, 2));
	endfunction

	function z = bootstrap(x, y0, theta, sigma, n_samples)
	x_mat = repmat(x, n_samples, 1);

	pop_a = model(x_mat, y0, theta, sigma);
	pop_b = model(x_mat, y0, theta, sigma);

	z = test_statistic(pop_a, pop_b);
	endfunction

	% Parametric bootstrap: compute distribution of z under the null hypothesis:
	% population B has the same distribution, i.e., model parameters as A
	% (same as the estimated parameters)
	n_samples = 100000
	diffs = bootstrap(x, y0_a_est, theta_a_est, sigma_a_est, n_samples);

	figure(1);
	hist(diffs, bins=50);
	title('Distribution of z');

	% Find threshold of z for the given significance level
	significance_level = 0.05
	threshold = quantile(diffs, 1-significance_level)

	% Compute the actual test statistic for A and B
	z_value = test_statistic(y_a, y_b)
	null_hypothesis_rejected = z_value > threshold

	figure(3);
	plot(x,y_a, 'ro');
	hold on;
	plot(x,y_b, 'bo');
	plot(x, model(x, y0_a_est, theta_a_est, 0), 'r--');
	plot(x, model(x, y0_b_est, theta_b_est, 0), 'b--');

	plot(x, model(x, y0_a, theta_a, 0), 'r-');
	plot(x, model(x, y0_b, theta_b, 0), 'b-');

	hold off;
	legend('sample from A', 'sample from B',
	'estimated A', 'estimated B',
	'actual A', 'actual B');

	input('- press enter -');