Skip to content

Instantly share code, notes, and snippets.

@tyarkoni
Created April 3, 2016 15:59
Show Gist options
  • Save tyarkoni/9d33a97f3eda4675686f17994f89d2e9 to your computer and use it in GitHub Desktop.
Save tyarkoni/9d33a97f3eda4675686f17994f89d2e9 to your computer and use it in GitHub Desktop.
Matching on unreliable variables produces residual confounding
'''
A small simulation to demonstrate that matching trials does not solve the
problem of residual confounding. For description of original problem, see
http://dx.doi.org/10.1371/journal.pone.0152719
Here we simulate a situation where we match trials from two conditions that
differ in Y on an indicator M. By hypothesis, there is no difference in Y in
the population after controlling for M. But because of measurement error,
matching on M will, on average, leave a residual mean difference in the Y's.
Raising the reliability of M (REL_M) will decrease this difference, and setting
it to 1.0 will eliminate it completely, demonstrating that matching works just
as well as statistical control when the matching indicator is measured with
perfect reliability.
'''
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline # If running in notebook
# Number of trials to match
N = 100
# Number of simulation reps
REPS = 1000
# Difference (in sd) between conditions
X_DIFF = 0.5
# Correlation between M and Y
R_MY = 0.5
# Reliability of M: set to 1 to eliminate difference
REL_M = 0.5
# Pool to sample trials from
POOL = 10000
### Simulation ###
y_diffs = np.zeros(REPS)
matched_diffs = np.zeros(REPS)
for rep_i in range(REPS):
# Create variables
M = np.random.normal([0, X_DIFF], size=(POOL, 2))
Y = M * R_MY + np.sqrt(1 - R_MY ** 2) * np.random.normal(size=(POOL, 2))
# Introduce error to M
M = M * REL_M + np.sqrt(1 - REL_M ** 2) * np.random.normal(size=(POOL, 2))
# Compute zero-order difference in Y for same sample size
y_diffs[rep_i] = np.diff(Y[np.random.choice(POOL, N, False)].mean(0))
# Match on M
c1, c2 = np.c_[M[:, 0], Y[:, 0]], np.c_[M[:, 1], Y[:, 1]]
c1 = c1[np.argsort(c1[:, 0])]
c2 = c2[np.argsort(c2[:, 0])]
j = 0
matched = []
for i, v in enumerate(c1[:, 0]):
other = c2[j, 0]
# Call it a match if values are very close
if np.abs(v - other) < 0.0002:
matched.append([c1[i, 1], c2[j, 1]])
j += 1
elif v > other:
j += 1
matched = np.array(matched)
matched = matched[np.random.choice(len(matched), N, False)]
matched_diffs[rep_i] = np.diff(matched.mean(0))
print("Original difference in y:", y_diffs.mean().round(2))
print("Difference in y after matching:", matched_diffs.mean().round(2))
ax = sns.kdeplot(y_diffs)
ax = sns.kdeplot(matched_diffs, ax=ax)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment