tyarkoni/simulate_matching.py

## simulate_matching.py
'''
A small simulation to demonstrate that matching trials does not solve the
problem of residual confounding. For description of original problem, see
http://dx.doi.org/10.1371/journal.pone.0152719
Here we simulate a situation where we match trials from two conditions that
differ in Y on an indicator M. By hypothesis, there is no difference in Y in
the population after controlling for M. But because of measurement error,
matching on M will, on average, leave a residual mean difference in the Y's.
Raising the reliability of M (REL_M) will decrease this difference, and setting
it to 1.0 will eliminate it completely, demonstrating that matching works just
as well as statistical control when the matching indicator is measured with
perfect reliability.
'''
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline  # If running in notebook

# Number of trials to match
N = 100

# Number of simulation reps
REPS = 1000

# Difference (in sd) between conditions
X_DIFF = 0.5

# Correlation between M and Y
R_MY = 0.5

# Reliability of M: set to 1 to eliminate difference
REL_M = 0.5

# Pool to sample trials from
POOL = 10000


### Simulation ###
y_diffs = np.zeros(REPS)
matched_diffs = np.zeros(REPS)

for rep_i in range(REPS):

    # Create variables
    M = np.random.normal([0, X_DIFF], size=(POOL, 2))
    Y = M * R_MY + np.sqrt(1 - R_MY ** 2) * np.random.normal(size=(POOL, 2))

    # Introduce error to M
    M = M * REL_M + np.sqrt(1 - REL_M ** 2) * np.random.normal(size=(POOL, 2))

    # Compute zero-order difference in Y for same sample size
    y_diffs[rep_i] = np.diff(Y[np.random.choice(POOL, N, False)].mean(0))

    # Match on M
    c1, c2 = np.c_[M[:, 0], Y[:, 0]], np.c_[M[:, 1], Y[:, 1]]
    c1 = c1[np.argsort(c1[:, 0])]
    c2 = c2[np.argsort(c2[:, 0])]
    j = 0
    matched = []
    for i, v in enumerate(c1[:, 0]):
        other = c2[j, 0]
        # Call it a match if values are very close
        if np.abs(v - other) < 0.0002:
            matched.append([c1[i, 1], c2[j, 1]])
            j += 1
        elif v > other:
            j += 1

    matched = np.array(matched)
    matched = matched[np.random.choice(len(matched), N, False)]
    matched_diffs[rep_i] = np.diff(matched.mean(0))

print("Original difference in y:", y_diffs.mean().round(2))
print("Difference in y after matching:", matched_diffs.mean().round(2))
ax = sns.kdeplot(y_diffs)
ax = sns.kdeplot(matched_diffs, ax=ax)
	'''
	A small simulation to demonstrate that matching trials does not solve the
	problem of residual confounding. For description of original problem, see
	http://dx.doi.org/10.1371/journal.pone.0152719
	Here we simulate a situation where we match trials from two conditions that
	differ in Y on an indicator M. By hypothesis, there is no difference in Y in
	the population after controlling for M. But because of measurement error,
	matching on M will, on average, leave a residual mean difference in the Y's.
	Raising the reliability of M (REL_M) will decrease this difference, and setting
	it to 1.0 will eliminate it completely, demonstrating that matching works just
	as well as statistical control when the matching indicator is measured with
	perfect reliability.
	'''
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	# %matplotlib inline # If running in notebook

	# Number of trials to match
	N = 100

	# Number of simulation reps
	REPS = 1000

	# Difference (in sd) between conditions
	X_DIFF = 0.5

	# Correlation between M and Y
	R_MY = 0.5

	# Reliability of M: set to 1 to eliminate difference
	REL_M = 0.5

	# Pool to sample trials from
	POOL = 10000


	### Simulation ###
	y_diffs = np.zeros(REPS)
	matched_diffs = np.zeros(REPS)

	for rep_i in range(REPS):

	# Create variables
	M = np.random.normal([0, X_DIFF], size=(POOL, 2))
	Y = M * R_MY + np.sqrt(1 - R_MY ** 2) * np.random.normal(size=(POOL, 2))

	# Introduce error to M
	M = M * REL_M + np.sqrt(1 - REL_M ** 2) * np.random.normal(size=(POOL, 2))

	# Compute zero-order difference in Y for same sample size
	y_diffs[rep_i] = np.diff(Y[np.random.choice(POOL, N, False)].mean(0))

	# Match on M
	c1, c2 = np.c_[M[:, 0], Y[:, 0]], np.c_[M[:, 1], Y[:, 1]]
	c1 = c1[np.argsort(c1[:, 0])]
	c2 = c2[np.argsort(c2[:, 0])]
	j = 0
	matched = []
	for i, v in enumerate(c1[:, 0]):
	other = c2[j, 0]
	# Call it a match if values are very close
	if np.abs(v - other) < 0.0002:
	matched.append([c1[i, 1], c2[j, 1]])
	j += 1
	elif v > other:
	j += 1

	matched = np.array(matched)
	matched = matched[np.random.choice(len(matched), N, False)]
	matched_diffs[rep_i] = np.diff(matched.mean(0))

	print("Original difference in y:", y_diffs.mean().round(2))
	print("Difference in y after matching:", matched_diffs.mean().round(2))
	ax = sns.kdeplot(y_diffs)
	ax = sns.kdeplot(matched_diffs, ax=ax)