Created
June 19, 2020 15:54
-
-
Save rcassani/cbc395e9092d2088782fedb9154bf6ee to your computer and use it in GitHub Desktop.
Code for the simulation and figures in https://www.castoriscausa.com/posts/2020/06/19/multiple-comparisons-visual/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# This is the code for the simulation and figures described in the post: | |
# "Multiple comparisons problem, visual approach" in | |
# https://www.castoriscausa.com/posts/2020/06/19/multiple-comparisons-visual/ | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import scipy.stats as stats | |
from mpl_toolkits.mplot3d import Axes3D | |
import matplotlib.patches as patches | |
# set seed for reproducibility | |
np.random.seed(111) | |
n = 100 # number of samples from the population | |
m = 2 # variables in each population | |
t = 10000 # number of sampling processes | |
alpha = 0.05 # significance level | |
# as two-tailed | |
std_score = stats.norm.ppf(1 - alpha / 2) | |
# Bonferroni corrected significance level | |
alpha_corrB = alpha / m | |
std_score_corrB = stats.norm.ppf(1 - alpha_corrB / 2) | |
# X Mean = 0, X SD = 1 | |
mu_X = np.zeros(m) | |
Sigma_X = np.eye(m) | |
# Y Mean = 0, Y SD = 1 | |
mu_Y = np.zeros(m) | |
Sigma_Y = np.eye(m) | |
# matrix to save the results | |
std_scores = np.zeros((t, m)) | |
for ix_t in range(t): | |
# sampling X and Y populations | |
x = stats.multivariate_normal.rvs(mean=mu_X, cov=Sigma_X**2, size=n) | |
y = stats.multivariate_normal.rvs(mean=mu_Y, cov=Sigma_Y**2, size=n) | |
# sample mean and sample std | |
x_bar = np.mean(x, axis=0) | |
x_s = np.std(x, axis=0) | |
y_bar = np.mean(y, axis=0) | |
y_s = np.std(y, axis=0) | |
# mean of the sampling distribution of sample means | |
mu_X_bar = x_bar | |
mu_Y_bar = y_bar | |
# std of the sampling distribution of sample means | |
sigma_X_bar = np.sqrt((x_s**2) / n) | |
sigma_Y_bar = np.sqrt((y_s**2) / n) | |
# mean and std of Z, with Z = X_bar - Y_bar | |
mu_Z = mu_X_bar - mu_Y_bar | |
sigma_Z = np.sqrt(sigma_X_bar**2 + sigma_Y_bar**2) | |
# standard scores | |
std_scores[ix_t] = mu_Z / sigma_Z | |
# probability that at a significant difference between the mean is found just by chance | |
p_h1 = sum(np.any((abs(std_scores) > std_score), axis=-1)) / t | |
print("Probability of a significant difference just by change = " + str(p_h1 * 100) + '%') | |
if m > 1: | |
p_h1 = sum(np.any((abs(std_scores) > std_score_corrB), axis=-1)) / t | |
print("Probability (after Bonferroni correction) of a significant difference just by change = " + str(p_h1 * 100) + '%') | |
#%% plots | |
# proabability density function for m=1 and m=2 | |
std_score_axis = np.arange(-4, 4, 0.01) | |
if m == 1: | |
# theoretical distribution of Z | |
plt.figure(figsize=[10,5]) | |
plt.plot(std_score_axis, stats.norm.pdf(std_score_axis, 0, 1), linewidth=4) | |
# experimental distribution of Z | |
plt.hist(std_scores, density=True, bins=100, color='#1f77b4aa') | |
plt.xlabel('$Z$ in $\sigma_Z$ units') | |
plt.xlim(-4, 4) | |
plt.ylim(0, 0.5) | |
ymin, ymax = plt.ylim() | |
plt.vlines((-std_score, std_score), ymin, ymax, colors='r', linestyles=':', | |
linewidth = 2) | |
plt.ylim(ymin, ymax) | |
plt.text(-1.1, 0.46, r'$P(|Z| <1.96\sigma_Z ) = 1 - \alpha$', fontsize=14) | |
plt.text(2, 0.07, r'$P(Z > 1.96\sigma_Z ) = \alpha / 2$', fontsize=14) | |
plt.text(-3.9, 0.07, r'$P(Z < 1.96\sigma_Z ) = \alpha / 2$', fontsize=14) | |
print('Plot one variable') | |
None | |
elif m == 2: | |
# common parameters | |
std_score_grid_x, std_score_grid_y = np.meshgrid(std_score_axis, std_score_axis) | |
std_score_grid_xy = np.stack((std_score_grid_x, std_score_grid_y), axis=-1) | |
rv = stats.multivariate_normal([0, 0], [[1, 0], [0, 1]]) | |
# theoretical 3D plot | |
fig = plt.figure(figsize=[15,6]) | |
ax = fig.gca(projection='3d') | |
ax.view_init(60, -45) | |
ax.set_xlabel(r'$Z_1$ in $\sigma_{Z_1}$ units') | |
ax.set_ylabel(r'$Z_2$ in $\sigma_{Z_2}$ units') | |
ax.set_zlabel('Z axis') | |
plt.xlim(-4, 4) | |
plt.ylim(-4, 4) | |
# theoreical distribution of Z_1 | |
ax.plot(std_score_axis, 4+(std_score_axis*0), stats.norm.pdf(std_score_axis, 0, 1)) | |
# theoreical distribution of Z_2 | |
ax.plot(-4+(std_score_axis*0), std_score_axis, stats.norm.pdf(std_score_axis, 0, 1)) | |
# vertical lines | |
#zmin, zmax = ax.set_zlim() | |
ax.set_zlim(0, 0.5) | |
zmin, zmax = ax.set_zlim() | |
ax.plot([std_score, std_score], [4, 4], [zmin, zmax], 'r:', linewidth=2) | |
ax.plot([-std_score, -std_score], [4, 4], [zmin, zmax], 'r:', linewidth=2) | |
ax.plot([-4, -4], [std_score, std_score], [zmin, zmax], 'r:', linewidth=2) | |
ax.plot([-4, -4], [-std_score, -std_score], [zmin, zmax], 'r:', linewidth=2) | |
# theoreical distribution of Z | |
surf = ax.plot_surface(std_score_grid_x, std_score_grid_y, rv.pdf(std_score_grid_xy), linewidth=1, cmap='viridis') | |
# red box | |
ww, zz = np.meshgrid((-std_score, std_score), (zmin/4, zmax/2.2)) | |
ax.plot_surface(abs(ww), ww, zz, color='#FF0000FF', shade=True) | |
ax.plot_surface(-abs(ww), ww, zz, color='#FF0000FF', shade=True) | |
ax.plot_surface(ww, abs(ww), zz, color='#FF0000FF', shade=True) | |
ax.plot_surface(ww, -abs(ww), zz, color='#FF0000FF',shade=True) | |
fig.colorbar(surf) | |
surf.set_clim(vmin=0, vmax=0.16) | |
#%% # theoretical 2D plot | |
fig = plt.figure(figsize=(8, 8)) | |
gs = fig.add_gridspec(2, 2, width_ratios=(3, 5), height_ratios=(3, 5), | |
left=0.1, right=0.8, bottom=0.1, top=0.8, | |
wspace=0.2, hspace=0.2) | |
ax_2d = fig.add_subplot(gs[1, 1]) | |
im = plt.imshow(rv.pdf(std_score_grid_xy), extent=[-4,4,-4,4], origin='lower') | |
cbaxes = fig.add_axes([0.85, 0.1, 0.03, 0.395]) # This is the position for the colorbar | |
cb = plt.colorbar(im, cax = cbaxes) | |
plt.clim(0,0.16) | |
rect = patches.Rectangle((-std_score, -std_score), 2*std_score, 2*std_score, | |
edgecolor='r', linestyle=':', facecolor='none') | |
ax_2d.add_patch(rect) | |
rect = patches.Rectangle((-std_score_corrB, -std_score_corrB), 2*std_score_corrB, 2*std_score_corrB, | |
edgecolor='g', linestyle=':', facecolor='none') | |
ax_2d.add_patch(rect) | |
ax_2d.axis('off') | |
ax_top = fig.add_subplot(gs[0, 1], sharex=ax_2d) | |
ax_top.plot(std_score_axis, stats.norm.pdf(std_score_axis, 0, 1)) | |
ax_top.set_ylim([0, 0.5]) | |
ax_top.set_xlim([-4, 4]) | |
ymin, ymax = ax_top.get_ylim() | |
ax_top.vlines((-std_score, std_score), ymin, ymax, colors='r', linestyles=':', | |
linewidth = 2) | |
ax_top.vlines((-std_score_corrB, std_score_corrB), ymin, ymax, colors='g', linestyles=':', | |
linewidth = 2) | |
plt.yticks(np.arange(0,0.6,0.1)) | |
plt.xticks(np.arange(-4,4+1)) | |
ax_top.grid(True) | |
ax_top.set_xlabel(r'$Z_1$ in $\sigma_{Z_1}$ units') | |
ax_rig = fig.add_subplot(gs[1, 0], sharey=ax_2d) | |
ax = ax_rig.plot(stats.norm.pdf(std_score_axis, 0, 1), std_score_axis) | |
ax = ax_rig.plot(stats.norm.pdf(std_score_axis, 0, 1), std_score_axis) | |
ax_rig.set_xlim([0.5, 0]) | |
ax_rig.set_ylim([-4, 4]) | |
ax_rig.yaxis.tick_right() | |
plt.xticks(rotation=90) | |
plt.yticks(rotation=90) | |
ax_rig.hlines((-std_score, std_score), ymin, ymax, colors='r', linestyles=':', | |
linewidth = 2) | |
ax_rig.hlines((-std_score_corrB, std_score_corrB), ymin, ymax, colors='g', linestyles=':', | |
linewidth = 2) | |
plt.xticks(np.arange(0,0.6,0.1)) | |
plt.yticks(np.arange(-4,4+1)) | |
ax_rig.grid(True) | |
ax_rig.set_ylabel(r'$Z_2$ in $\sigma_{Z_2}$ units') | |
ax_rig.yaxis.set_label_position("right") | |
# experimental 2D plot | |
fig = plt.figure(figsize=(8, 8)) | |
gs = fig.add_gridspec(2, 2, width_ratios=(3, 5), height_ratios=(3, 5), | |
left=0.1, right=0.8, bottom=0.1, top=0.8, | |
wspace=0.2, hspace=0.2) | |
ax_2d = fig.add_subplot(gs[1, 1]) | |
ima, _, _ = np.histogram2d(std_scores[:,0], std_scores[:,1], bins=100, density=True) | |
im = plt.imshow(ima, extent=[-4,4,-4,4], origin='lower') | |
cbaxes = fig.add_axes([0.85, 0.1, 0.03, 0.395]) # This is the position for the colorbar | |
cb = plt.colorbar(im, cax = cbaxes) | |
plt.clim(0,0.16) | |
rect = patches.Rectangle((-std_score, -std_score), 2*std_score, 2*std_score, | |
edgecolor='r', linestyle=':', facecolor='none') | |
ax_2d.add_patch(rect) | |
rect = patches.Rectangle((-std_score_corrB, -std_score_corrB), 2*std_score_corrB, 2*std_score_corrB, | |
edgecolor='g', linestyle=':', facecolor='none') | |
ax_2d.add_patch(rect) | |
ax_2d.axis('off') | |
ax_top = fig.add_subplot(gs[0, 1], sharex=ax_2d) | |
ax_top.plot(std_score_axis, stats.norm.pdf(std_score_axis, 0, 1)) | |
ax_top.hist(std_scores[:,0], density=True, bins=100, color='#1f77b4aa') | |
ax_top.set_ylim([0, 0.5]) | |
ax_top.set_xlim([-4, 4]) | |
ymin, ymax = ax_top.get_ylim() | |
ax_top.vlines((-std_score, std_score), ymin, ymax, colors='r', linestyles=':', | |
linewidth = 2) | |
ax_top.vlines((-std_score_corrB, std_score_corrB), ymin, ymax, colors='g', linestyles=':', | |
linewidth = 2) | |
plt.yticks(np.arange(0,0.6,0.1)) | |
plt.xticks(np.arange(-4,4+1)) | |
ax_top.grid(True) | |
ax_top.set_xlabel(r'$Z_1$ in $\sigma_{Z_1}$ units') | |
ax_rig = fig.add_subplot(gs[1, 0], sharey=ax_2d) | |
ax = ax_rig.plot(stats.norm.pdf(std_score_axis, 0, 1), std_score_axis, color='#ff7f0e') | |
ax_rig.hist(std_scores[:,1], density=True, bins=100, color='#ff7f0eaa', orientation='horizontal') | |
ax_rig.set_xlim([0.5, 0]) | |
ax_rig.set_ylim([-4, 4]) | |
ax_rig.yaxis.tick_right() | |
plt.xticks(rotation=90) | |
plt.yticks(rotation=90) | |
ax_rig.hlines((-std_score, std_score), ymin, ymax, colors='r', linestyles=':', | |
linewidth = 2) | |
ax_rig.hlines((-std_score_corrB, std_score_corrB), ymin, ymax, colors='g', linestyles=':', | |
linewidth = 2) | |
plt.xticks(np.arange(0,0.6,0.1)) | |
plt.yticks(np.arange(-4,4+1)) | |
ax_rig.grid(True) | |
ax_rig.set_ylabel(r'$Z_2$ in $\sigma_{Z_2}$ units') | |
ax_rig.yaxis.set_label_position("right") | |
#%% Bonferroni correction | |
# significance level and Bonferroni corrected significance level levels vs number of comparison | |
alpha = 0.05 | |
m = 15 | |
fig = plt.figure(figsize=[15, 5]) | |
plt.plot(np.arange(m)+1, (1 - ((1-alpha) ** (np.arange(m)+1))) * 100, | |
'ro', label=r'Uncorrected $\alpha$') | |
plt.plot(np.arange(m)+1, (1 - ((1-alpha /(np.arange(m)+1)) ** (np.arange(m)+1))) * 100, | |
'g*', label=r'Bonferroni corrected $\alpha/m$') | |
plt.xlabel('number of multiple comparisons', fontsize=14) | |
plt.ylabel('probability of finding at least \n one significant result (%)', fontsize=14) | |
plt.yticks(np.arange(0,60,5)) | |
plt.xticks(np.arange(1,m+1,1)) | |
plt.legend(fontsize=14) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment