GermanCM/bootstrap_CI_linear_reg.py

## bootstrap_CI_linear_reg.py
# bootstrap confidence intervals
import numpy as np
from numpy.random import seed, rand, randint, randn, std
from numpy import mean, median, percentile
from scipy.stats import linregress

# seed random number generator
seed(1)
# prepare data
x = 20 * randn(1000) + 100
y = x + (10 * randn(1000) + 50)


### LINEAR REGRESSION FIT WITH ALL THE DATASET AT ONCE
b1, b0, r_value, p_value, std_err = linregress(x, y)
yhat_whole_ds = b0 + b1 * x

### LINEAR REGRESSION FIT WITH BOOTSTRAP SAMPLES
regression_scores = list()
for _ in range(100):
    # bootstrap sample
    indices = randint(0, 1000, 550)
    x_sample = x[indices]
    y_sample = y[indices]

    # simple linear regression model
    b1_sample, b0_sample, r_value, p_value, std_err = linregress(x_sample, y_sample)
    regression_scores.append([b1_sample, b0_sample])

b0_scores = np.array(regression_scores)[:, 1]
b1_scores = np.array(regression_scores)[:, 0]
b0_estimate = median(b0_scores)
b1_estimate = median(b1_scores)

yhat_bootstrapped = b0_estimate + b1_estimate * x

### MODELS VALIDATION
from sklearn.metrics import mean_squared_error

yhat_whole_ds_rmse = mean_squared_error(y, yhat_whole_ds).round(2)
yhat_bootstrapped_rmse = mean_squared_error(y, yhat_bootstrapped).round(2)

b0_estimate = median(b0_scores)
print('b0 50th percentile (median) = %.3f' % median(b0_scores))
alpha = 5.0
lower_p = alpha / 2.0
perc_5th = percentile(b0_scores, lower_p)
upper_p = 100 - (alpha / 2.0)
perc_95th = percentile(b0_scores, upper_p)
print('confidence interval: ', [perc_5th.round(2), perc_95th.round(2)])
print('b0 estimate: ', [perc_5th.round(2), b0_estimate.round(2), perc_95th.round(2)])

b1_estimate = median(b1_scores)
print('b1 50th percentile (median) = %.3f' % median(b1_scores))
alpha = 5.0
lower_p = alpha / 2.0
perc_5th = percentile(b1_scores, lower_p)
upper_p = 100 - (alpha / 2.0)
perc_95th = percentile(b1_scores, upper_p)
print('confidence interval: ', [perc_5th.round(2), perc_95th.round(2)])
print('b1 estimate: ', [perc_5th.round(2), b1_estimate.round(2), perc_95th.round(2)])

# Those ocnfidence intervals are a rough estimate, to capture the variance of the interval alongo the x values, let's proceed as follows:
yhat_bootstrapped_predictions_low_values = []
yhat_bootstrapped_predictions_high_values = []

import pandas as pd
slope_intercept_pairs_df = pd.DataFrame({'b0_scores': b0_scores, 'b1_scores': b1_scores})
#slope_intercept_pairs = zip(b0_scores, b1_scores)

for x_value in x:
    #for each x value, we calculate all possible predictions from the bootstrapped estimates
    preds_for_this_x = []
    for index in slope_intercept_pairs_df.index:
        #print(x_value)
        interc = slope_intercept_pairs_df.iloc[index]['b0_scores']
        slope = slope_intercept_pairs_df.iloc[index]['b1_scores']

        y_pred = interc + slope*x_value
        preds_for_this_x.append(y_pred)

    perc_5th_value = percentile(preds_for_this_x, lower_p)
    perc_95th_value = percentile(preds_for_this_x, upper_p)

    yhat_bootstrapped_predictions_low_values.append(perc_5th_value)
    yhat_bootstrapped_predictions_high_values.append(perc_95th_value)

import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=yhat_bootstrapped_predictions_low_values,
    fill=None,
    mode='lines',
    line_color='indigo',
    ))

fig.add_trace(go.Scatter(x=x, y=yhat_whole_ds,
    fill='tonexty',
    mode='lines',
    line_color='red',
    ))

fig.add_trace(go.Scatter(x=x, y=yhat_bootstrapped_predictions_high_values,
    fill='tonexty', # fill area between yhat_bootstrapped_predictions_low_values and yhat_bootstrapped_predictions_high_values
    mode='lines', line_color='indigo'))

fig.update_layout(annotations=[dict(xref='paper',
                                    yref='paper',
                                    x=0.5, y=1,
                                    showarrow=False,
                                    text='95% confidence interval')])

fig.show()
	# bootstrap confidence intervals
	import numpy as np
	from numpy.random import seed, rand, randint, randn, std
	from numpy import mean, median, percentile
	from scipy.stats import linregress

	# seed random number generator
	seed(1)
	# prepare data
	x = 20 * randn(1000) + 100
	y = x + (10 * randn(1000) + 50)


	### LINEAR REGRESSION FIT WITH ALL THE DATASET AT ONCE
	b1, b0, r_value, p_value, std_err = linregress(x, y)
	yhat_whole_ds = b0 + b1 * x

	### LINEAR REGRESSION FIT WITH BOOTSTRAP SAMPLES
	regression_scores = list()
	for _ in range(100):
	# bootstrap sample
	indices = randint(0, 1000, 550)
	x_sample = x[indices]
	y_sample = y[indices]

	# simple linear regression model
	b1_sample, b0_sample, r_value, p_value, std_err = linregress(x_sample, y_sample)
	regression_scores.append([b1_sample, b0_sample])

	b0_scores = np.array(regression_scores)[:, 1]
	b1_scores = np.array(regression_scores)[:, 0]
	b0_estimate = median(b0_scores)
	b1_estimate = median(b1_scores)

	yhat_bootstrapped = b0_estimate + b1_estimate * x

	### MODELS VALIDATION
	from sklearn.metrics import mean_squared_error

	yhat_whole_ds_rmse = mean_squared_error(y, yhat_whole_ds).round(2)
	yhat_bootstrapped_rmse = mean_squared_error(y, yhat_bootstrapped).round(2)

	b0_estimate = median(b0_scores)
	print('b0 50th percentile (median) = %.3f' % median(b0_scores))
	alpha = 5.0
	lower_p = alpha / 2.0
	perc_5th = percentile(b0_scores, lower_p)
	upper_p = 100 - (alpha / 2.0)
	perc_95th = percentile(b0_scores, upper_p)
	print('confidence interval: ', [perc_5th.round(2), perc_95th.round(2)])
	print('b0 estimate: ', [perc_5th.round(2), b0_estimate.round(2), perc_95th.round(2)])

	b1_estimate = median(b1_scores)
	print('b1 50th percentile (median) = %.3f' % median(b1_scores))
	alpha = 5.0
	lower_p = alpha / 2.0
	perc_5th = percentile(b1_scores, lower_p)
	upper_p = 100 - (alpha / 2.0)
	perc_95th = percentile(b1_scores, upper_p)
	print('confidence interval: ', [perc_5th.round(2), perc_95th.round(2)])
	print('b1 estimate: ', [perc_5th.round(2), b1_estimate.round(2), perc_95th.round(2)])

	# Those ocnfidence intervals are a rough estimate, to capture the variance of the interval alongo the x values, let's proceed as follows:
	yhat_bootstrapped_predictions_low_values = []
	yhat_bootstrapped_predictions_high_values = []

	import pandas as pd
	slope_intercept_pairs_df = pd.DataFrame({'b0_scores': b0_scores, 'b1_scores': b1_scores})
	#slope_intercept_pairs = zip(b0_scores, b1_scores)

	for x_value in x:
	#for each x value, we calculate all possible predictions from the bootstrapped estimates
	preds_for_this_x = []
	for index in slope_intercept_pairs_df.index:
	#print(x_value)
	interc = slope_intercept_pairs_df.iloc[index]['b0_scores']
	slope = slope_intercept_pairs_df.iloc[index]['b1_scores']

	y_pred = interc + slope*x_value
	preds_for_this_x.append(y_pred)

	perc_5th_value = percentile(preds_for_this_x, lower_p)
	perc_95th_value = percentile(preds_for_this_x, upper_p)

	yhat_bootstrapped_predictions_low_values.append(perc_5th_value)
	yhat_bootstrapped_predictions_high_values.append(perc_95th_value)

	import plotly.graph_objects as go

	fig = go.Figure()
	fig.add_trace(go.Scatter(x=x, y=yhat_bootstrapped_predictions_low_values,
	fill=None,
	mode='lines',
	line_color='indigo',
	))

	fig.add_trace(go.Scatter(x=x, y=yhat_whole_ds,
	fill='tonexty',
	mode='lines',
	line_color='red',
	))

	fig.add_trace(go.Scatter(x=x, y=yhat_bootstrapped_predictions_high_values,
	fill='tonexty', # fill area between yhat_bootstrapped_predictions_low_values and yhat_bootstrapped_predictions_high_values
	mode='lines', line_color='indigo'))

	fig.update_layout(annotations=[dict(xref='paper',
	yref='paper',
	x=0.5, y=1,
	showarrow=False,
	text='95% confidence interval')])

	fig.show()