Skip to content

Instantly share code, notes, and snippets.

@rcassani
Created June 19, 2020 15:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rcassani/cbc395e9092d2088782fedb9154bf6ee to your computer and use it in GitHub Desktop.
Save rcassani/cbc395e9092d2088782fedb9154bf6ee to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This is the code for the simulation and figures described in the post:
# "Multiple comparisons problem, visual approach" in
# https://www.castoriscausa.com/posts/2020/06/19/multiple-comparisons-visual/
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.patches as patches
# set seed for reproducibility
np.random.seed(111)
n = 100 # number of samples from the population
m = 2 # variables in each population
t = 10000 # number of sampling processes
alpha = 0.05 # significance level
# as two-tailed
std_score = stats.norm.ppf(1 - alpha / 2)
# Bonferroni corrected significance level
alpha_corrB = alpha / m
std_score_corrB = stats.norm.ppf(1 - alpha_corrB / 2)
# X Mean = 0, X SD = 1
mu_X = np.zeros(m)
Sigma_X = np.eye(m)
# Y Mean = 0, Y SD = 1
mu_Y = np.zeros(m)
Sigma_Y = np.eye(m)
# matrix to save the results
std_scores = np.zeros((t, m))
for ix_t in range(t):
# sampling X and Y populations
x = stats.multivariate_normal.rvs(mean=mu_X, cov=Sigma_X**2, size=n)
y = stats.multivariate_normal.rvs(mean=mu_Y, cov=Sigma_Y**2, size=n)
# sample mean and sample std
x_bar = np.mean(x, axis=0)
x_s = np.std(x, axis=0)
y_bar = np.mean(y, axis=0)
y_s = np.std(y, axis=0)
# mean of the sampling distribution of sample means
mu_X_bar = x_bar
mu_Y_bar = y_bar
# std of the sampling distribution of sample means
sigma_X_bar = np.sqrt((x_s**2) / n)
sigma_Y_bar = np.sqrt((y_s**2) / n)
# mean and std of Z, with Z = X_bar - Y_bar
mu_Z = mu_X_bar - mu_Y_bar
sigma_Z = np.sqrt(sigma_X_bar**2 + sigma_Y_bar**2)
# standard scores
std_scores[ix_t] = mu_Z / sigma_Z
# probability that at a significant difference between the mean is found just by chance
p_h1 = sum(np.any((abs(std_scores) > std_score), axis=-1)) / t
print("Probability of a significant difference just by change = " + str(p_h1 * 100) + '%')
if m > 1:
p_h1 = sum(np.any((abs(std_scores) > std_score_corrB), axis=-1)) / t
print("Probability (after Bonferroni correction) of a significant difference just by change = " + str(p_h1 * 100) + '%')
#%% plots
# proabability density function for m=1 and m=2
std_score_axis = np.arange(-4, 4, 0.01)
if m == 1:
# theoretical distribution of Z
plt.figure(figsize=[10,5])
plt.plot(std_score_axis, stats.norm.pdf(std_score_axis, 0, 1), linewidth=4)
# experimental distribution of Z
plt.hist(std_scores, density=True, bins=100, color='#1f77b4aa')
plt.xlabel('$Z$ in $\sigma_Z$ units')
plt.xlim(-4, 4)
plt.ylim(0, 0.5)
ymin, ymax = plt.ylim()
plt.vlines((-std_score, std_score), ymin, ymax, colors='r', linestyles=':',
linewidth = 2)
plt.ylim(ymin, ymax)
plt.text(-1.1, 0.46, r'$P(|Z| <1.96\sigma_Z ) = 1 - \alpha$', fontsize=14)
plt.text(2, 0.07, r'$P(Z > 1.96\sigma_Z ) = \alpha / 2$', fontsize=14)
plt.text(-3.9, 0.07, r'$P(Z < 1.96\sigma_Z ) = \alpha / 2$', fontsize=14)
print('Plot one variable')
None
elif m == 2:
# common parameters
std_score_grid_x, std_score_grid_y = np.meshgrid(std_score_axis, std_score_axis)
std_score_grid_xy = np.stack((std_score_grid_x, std_score_grid_y), axis=-1)
rv = stats.multivariate_normal([0, 0], [[1, 0], [0, 1]])
# theoretical 3D plot
fig = plt.figure(figsize=[15,6])
ax = fig.gca(projection='3d')
ax.view_init(60, -45)
ax.set_xlabel(r'$Z_1$ in $\sigma_{Z_1}$ units')
ax.set_ylabel(r'$Z_2$ in $\sigma_{Z_2}$ units')
ax.set_zlabel('Z axis')
plt.xlim(-4, 4)
plt.ylim(-4, 4)
# theoreical distribution of Z_1
ax.plot(std_score_axis, 4+(std_score_axis*0), stats.norm.pdf(std_score_axis, 0, 1))
# theoreical distribution of Z_2
ax.plot(-4+(std_score_axis*0), std_score_axis, stats.norm.pdf(std_score_axis, 0, 1))
# vertical lines
#zmin, zmax = ax.set_zlim()
ax.set_zlim(0, 0.5)
zmin, zmax = ax.set_zlim()
ax.plot([std_score, std_score], [4, 4], [zmin, zmax], 'r:', linewidth=2)
ax.plot([-std_score, -std_score], [4, 4], [zmin, zmax], 'r:', linewidth=2)
ax.plot([-4, -4], [std_score, std_score], [zmin, zmax], 'r:', linewidth=2)
ax.plot([-4, -4], [-std_score, -std_score], [zmin, zmax], 'r:', linewidth=2)
# theoreical distribution of Z
surf = ax.plot_surface(std_score_grid_x, std_score_grid_y, rv.pdf(std_score_grid_xy), linewidth=1, cmap='viridis')
# red box
ww, zz = np.meshgrid((-std_score, std_score), (zmin/4, zmax/2.2))
ax.plot_surface(abs(ww), ww, zz, color='#FF0000FF', shade=True)
ax.plot_surface(-abs(ww), ww, zz, color='#FF0000FF', shade=True)
ax.plot_surface(ww, abs(ww), zz, color='#FF0000FF', shade=True)
ax.plot_surface(ww, -abs(ww), zz, color='#FF0000FF',shade=True)
fig.colorbar(surf)
surf.set_clim(vmin=0, vmax=0.16)
#%% # theoretical 2D plot
fig = plt.figure(figsize=(8, 8))
gs = fig.add_gridspec(2, 2, width_ratios=(3, 5), height_ratios=(3, 5),
left=0.1, right=0.8, bottom=0.1, top=0.8,
wspace=0.2, hspace=0.2)
ax_2d = fig.add_subplot(gs[1, 1])
im = plt.imshow(rv.pdf(std_score_grid_xy), extent=[-4,4,-4,4], origin='lower')
cbaxes = fig.add_axes([0.85, 0.1, 0.03, 0.395]) # This is the position for the colorbar
cb = plt.colorbar(im, cax = cbaxes)
plt.clim(0,0.16)
rect = patches.Rectangle((-std_score, -std_score), 2*std_score, 2*std_score,
edgecolor='r', linestyle=':', facecolor='none')
ax_2d.add_patch(rect)
rect = patches.Rectangle((-std_score_corrB, -std_score_corrB), 2*std_score_corrB, 2*std_score_corrB,
edgecolor='g', linestyle=':', facecolor='none')
ax_2d.add_patch(rect)
ax_2d.axis('off')
ax_top = fig.add_subplot(gs[0, 1], sharex=ax_2d)
ax_top.plot(std_score_axis, stats.norm.pdf(std_score_axis, 0, 1))
ax_top.set_ylim([0, 0.5])
ax_top.set_xlim([-4, 4])
ymin, ymax = ax_top.get_ylim()
ax_top.vlines((-std_score, std_score), ymin, ymax, colors='r', linestyles=':',
linewidth = 2)
ax_top.vlines((-std_score_corrB, std_score_corrB), ymin, ymax, colors='g', linestyles=':',
linewidth = 2)
plt.yticks(np.arange(0,0.6,0.1))
plt.xticks(np.arange(-4,4+1))
ax_top.grid(True)
ax_top.set_xlabel(r'$Z_1$ in $\sigma_{Z_1}$ units')
ax_rig = fig.add_subplot(gs[1, 0], sharey=ax_2d)
ax = ax_rig.plot(stats.norm.pdf(std_score_axis, 0, 1), std_score_axis)
ax = ax_rig.plot(stats.norm.pdf(std_score_axis, 0, 1), std_score_axis)
ax_rig.set_xlim([0.5, 0])
ax_rig.set_ylim([-4, 4])
ax_rig.yaxis.tick_right()
plt.xticks(rotation=90)
plt.yticks(rotation=90)
ax_rig.hlines((-std_score, std_score), ymin, ymax, colors='r', linestyles=':',
linewidth = 2)
ax_rig.hlines((-std_score_corrB, std_score_corrB), ymin, ymax, colors='g', linestyles=':',
linewidth = 2)
plt.xticks(np.arange(0,0.6,0.1))
plt.yticks(np.arange(-4,4+1))
ax_rig.grid(True)
ax_rig.set_ylabel(r'$Z_2$ in $\sigma_{Z_2}$ units')
ax_rig.yaxis.set_label_position("right")
# experimental 2D plot
fig = plt.figure(figsize=(8, 8))
gs = fig.add_gridspec(2, 2, width_ratios=(3, 5), height_ratios=(3, 5),
left=0.1, right=0.8, bottom=0.1, top=0.8,
wspace=0.2, hspace=0.2)
ax_2d = fig.add_subplot(gs[1, 1])
ima, _, _ = np.histogram2d(std_scores[:,0], std_scores[:,1], bins=100, density=True)
im = plt.imshow(ima, extent=[-4,4,-4,4], origin='lower')
cbaxes = fig.add_axes([0.85, 0.1, 0.03, 0.395]) # This is the position for the colorbar
cb = plt.colorbar(im, cax = cbaxes)
plt.clim(0,0.16)
rect = patches.Rectangle((-std_score, -std_score), 2*std_score, 2*std_score,
edgecolor='r', linestyle=':', facecolor='none')
ax_2d.add_patch(rect)
rect = patches.Rectangle((-std_score_corrB, -std_score_corrB), 2*std_score_corrB, 2*std_score_corrB,
edgecolor='g', linestyle=':', facecolor='none')
ax_2d.add_patch(rect)
ax_2d.axis('off')
ax_top = fig.add_subplot(gs[0, 1], sharex=ax_2d)
ax_top.plot(std_score_axis, stats.norm.pdf(std_score_axis, 0, 1))
ax_top.hist(std_scores[:,0], density=True, bins=100, color='#1f77b4aa')
ax_top.set_ylim([0, 0.5])
ax_top.set_xlim([-4, 4])
ymin, ymax = ax_top.get_ylim()
ax_top.vlines((-std_score, std_score), ymin, ymax, colors='r', linestyles=':',
linewidth = 2)
ax_top.vlines((-std_score_corrB, std_score_corrB), ymin, ymax, colors='g', linestyles=':',
linewidth = 2)
plt.yticks(np.arange(0,0.6,0.1))
plt.xticks(np.arange(-4,4+1))
ax_top.grid(True)
ax_top.set_xlabel(r'$Z_1$ in $\sigma_{Z_1}$ units')
ax_rig = fig.add_subplot(gs[1, 0], sharey=ax_2d)
ax = ax_rig.plot(stats.norm.pdf(std_score_axis, 0, 1), std_score_axis, color='#ff7f0e')
ax_rig.hist(std_scores[:,1], density=True, bins=100, color='#ff7f0eaa', orientation='horizontal')
ax_rig.set_xlim([0.5, 0])
ax_rig.set_ylim([-4, 4])
ax_rig.yaxis.tick_right()
plt.xticks(rotation=90)
plt.yticks(rotation=90)
ax_rig.hlines((-std_score, std_score), ymin, ymax, colors='r', linestyles=':',
linewidth = 2)
ax_rig.hlines((-std_score_corrB, std_score_corrB), ymin, ymax, colors='g', linestyles=':',
linewidth = 2)
plt.xticks(np.arange(0,0.6,0.1))
plt.yticks(np.arange(-4,4+1))
ax_rig.grid(True)
ax_rig.set_ylabel(r'$Z_2$ in $\sigma_{Z_2}$ units')
ax_rig.yaxis.set_label_position("right")
#%% Bonferroni correction
# significance level and Bonferroni corrected significance level levels vs number of comparison
alpha = 0.05
m = 15
fig = plt.figure(figsize=[15, 5])
plt.plot(np.arange(m)+1, (1 - ((1-alpha) ** (np.arange(m)+1))) * 100,
'ro', label=r'Uncorrected $\alpha$')
plt.plot(np.arange(m)+1, (1 - ((1-alpha /(np.arange(m)+1)) ** (np.arange(m)+1))) * 100,
'g*', label=r'Bonferroni corrected $\alpha/m$')
plt.xlabel('number of multiple comparisons', fontsize=14)
plt.ylabel('probability of finding at least \n one significant result (%)', fontsize=14)
plt.yticks(np.arange(0,60,5))
plt.xticks(np.arange(1,m+1,1))
plt.legend(fontsize=14)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment