Skip to content

Instantly share code, notes, and snippets.

@QuentinAndre
Created August 13, 2020 20:21
Show Gist options
  • Save QuentinAndre/7671c324ec6eb4466b64247c740ee439 to your computer and use it in GitHub Desktop.
Save QuentinAndre/7671c324ec6eb4466b64247c740ee439 to your computer and use it in GitHub Desktop.
Simulation of false-positive rates
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
def exclude_zscore_outliers(x):
"""
A small utility function to exclude *extreme outliers**:
values that are above or below 2 z-score.
"""
std = x.std()
m = x.mean()
high_cutoff = m+2*std
low_cutoff = m-2*std
return x[(x < high_cutoff) & (x > low_cutoff)]
def exclude_zscore_outliers_common(x, y):
"""
Same function, but applies a common cutoff to two vectors.
"""
xy = np.concatenate([x, y])
std = xy.std()
m = xy.mean()
high_cutoff = m+2*std
low_cutoff = m-2*std
return (
x[(x < high_cutoff) & (x > low_cutoff)],
y[(y < high_cutoff) & (y > low_cutoff)],
)
def compare_pvals(n=50):
"""
Compare two vectors of data of size n coming
from the same distribution, and return the p-values of the
comparison (1) before excluding extreme outliers, (2) after
excluding outliers using a common cutoff and (3) after
excluding outliers using a different cutoff for each vector.
"""
x = np.exp(np.random.normal(3, 1, n))
y = np.exp(np.random.normal(3, 1, n))
#x = np.random.normal(3, 1, n) # Uncomment for normal data
#y = np.random.normal(3, 1, n) # Uncomment for normal data
p = stats.ttest_ind(x, y).pvalue
x_common, y_common = exclude_zscore_outliers_common(x, y)
p_common = stats.ttest_ind(x_common, y_common).pvalue
x_diff = exclude_zscore_outliers(x)
y_diff = exclude_zscore_outliers(y)
p_diff = stats.ttest_ind(x_diff, y_diff).pvalue
return p, p_common, p_diff
# Let's repeat this experiment 10,000 times:
N = 5000
pvals = np.empty(shape=(3, N))
for i in range(N):
pvals[:, i] = compare_pvals()
pvals_no_excl, pvals_common_cutoff, pvals_diff_cutoffs = pvals
# Now let's visualize the p-values and false-positive rates:
hist_kws = dict(
bins=np.arange(0, 1.025, 0.025),
align="mid", density=True,
histtype="step", lw=1.5
)
alpha_no_excl = (pvals_no_excl < 0.05).mean()
alpha_common = (pvals_common_cutoff < 0.05).mean()
alpha_diff = (pvals_diff_cutoffs < 0.05).mean()
plt.hist(
pvals_no_excl,
**hist_kws,
label=f"No Exclusion ($\\alpha= {alpha_no_excl:.3f}$)"
)
plt.hist(
pvals_common_cutoff,
**hist_kws,
label=f"Common Cutoff ($\\alpha= {alpha_common:.3f}$)",
)
plt.hist(
pvals_diff_cutoffs,
**hist_kws,
label=f"Different Cutoff ($\\alpha= {alpha_diff:.3f}$)",
)
ax = plt.gca()
ax.legend(frameon=False)
ax.set_xlabel("p-value")
ax.set_ylabel("Density")
sns.despine()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment