Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Last active May 17, 2022 21:14
Show Gist options
  • Save BioSciEconomist/b1e7308b1ffed12b4b5be4c7c030edf3 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/b1e7308b1ffed12b4b5be4c7c030edf3 to your computer and use it in GitHub Desktop.
Back of the Envelope Difference in Differences Simulation and Power Analysis for a Linear Probability Model
# *-----------------------------------------------------------------
# | PROGRAM NAME: DID LPM Power.py
# | DATE: 1/6/22
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: simulate data mimicking hypothesized DID scenario and assess power (2x2 DID)
# *----------------------------------------------------------------
# !!!!!! WARNING - UNDER CONSTRUCTION - THIS DOES NOT ACCOUNT FOR CORRELATION STRUCTURES
# AND RESAMPLING METHOD MAY NEED REVISION
# CONTACT matttbogard@gmail.com for feedback and suggestions to help improve this script
import pandas as pd
import numpy as np
from matplotlib import pyplot
import statsmodels.api as sm # import stastmodels
import statsmodels.formula.api as smf # this allows us to use an explicit formulation
import statsmodels
from statsmodels.stats import power
pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation
np.set_printoptions(suppress=True) # suppress sci notation
#------------------------------------------
# simulate data
#------------------------------------------
T1_TRT = .85 # treatment pre average outcome
T2_TRT = .83 # treatment post average outcome
T1_CTRL = .80 # control pre average outcome
T2_CTRL = .64 # control post average outcome
(T1_TRT - T2_TRT) - (T1_CTRL-T2_CTRL) # population DID treatment effect
# treatment pre post
trtPre = pd.DataFrame(columns=['ID','prob','treat','time'])
trtPre['ID'] =list(range(1,100001))
np.random.seed(123)
trtPre['prob'] = np.random.binomial(1, T1_TRT, 100000)
trtPre['treat'] = 1
trtPre['time'] = 0
trtPre['prob'].describe()
trtPost = pd.DataFrame(columns=['ID','prob','treat','time'])
trtPost['ID'] =list(range(1,100001))
np.random.seed(123)
trtPost['prob'] = np.random.binomial(1,T2_TRT,100000)
trtPost['treat'] = 1
trtPost['time'] = 1
trtPost['prob'].describe()
# control pre post
ctrlPre = pd.DataFrame(columns=['ID','prob','treat','time'])
ctrlPre['ID'] =list(range(100001,200001))
np.random.seed(123)
ctrlPre['prob'] = np.random.binomial(1,T1_CTRL,100000)
ctrlPre['treat'] = 0
ctrlPre['time'] = 0
ctrlPre['prob'].describe()
ctrlPost = pd.DataFrame(columns=['ID','prob','treat','time'])
ctrlPost['ID'] =list(range(100001,200001))
np.random.seed(123)
ctrlPost['prob'] = np.random.binomial(1,T2_CTRL,100000)
ctrlPost['treat'] = 0
ctrlPost['time'] = 1
ctrlPost['prob'].describe()
# combine data
df = pd.concat([trtPre,trtPost,ctrlPre,ctrlPost,],ignore_index=True)
(trtPre.prob.mean()-trtPost.prob.mean()) - (ctrlPre.prob.mean() - ctrlPost.prob.mean())
#----------------------------------------
# estimate DID model
#----------------------------------------
# Y = B0 + B1*Treat + B2*Post + B3*Treat*Post + e
# raw comparisons
results = smf.ols('prob ~ treat + time + treat*time', data=df).fit() # linear probability model
results.summary()
#------------------------------------
# test on a sample
#-----------------------------------
nSize = 150
np.random.seed(321)
sample_t = trtPre.sample(nSize, replace=False) # get 500 trt IDs
np.random.seed(321)
sample_c = ctrlPre.sample(nSize, replace=False) # get 500 ctrl IDs
ids = pd.concat([sample_t,sample_c,],ignore_index=True)
ids['rsample'] = 1
# flag and subset randomly sampled IDs
rs = pd.merge(df,ids[['rsample','ID']], on='ID', how='left')
mask = (rs['rsample'] == 1)
rs = rs[mask]
# did estimate
results = smf.ols('prob ~ treat + time + treat*time', data=rs).fit() # linear probability model
results.summary()
#--------------------------------
# power analysis - compute power for a given sample size
#--------------------------------
# define power analysis function
nSize = 150
def run_tests(trt_dat,ctrl_dat,size):
"""uses monte carlo type resampling of simulated data to construct power, trt_dat is simulated pre period data for the treatment group, ctrl_dat is simulated pre period data for the control group, size is the # of reps in the simualtion"""
# Create an empty array to store replicates
est = np.empty(size)
seeds = np.empty(size)
pvals = np.empty(size)
# pull bootstrapped samples
for i in range(size):
print (i)
# Create a bootstrap sample
np.random.seed(i)
sample_t = trt_dat.sample(nSize, replace=False) # get randome set of trt IDs
np.random.seed(i)
sample_c = ctrl_dat.sample(nSize, replace=False) # get random set of ctrl IDs
ids = pd.concat([sample_t,sample_c,],ignore_index=True)
ids['rsample'] = 1
# flag and subset randomly sampled IDs
rs = pd.merge(df,ids[['rsample','ID']], on='ID', how='left')
mask = (rs['rsample'] == 1)
rs = rs[mask]
# fit model
model = smf.ols('prob ~ treat + time + treat*time', data=rs).fit()
est[i] = model.params['treat:time']
pvals[i] = model.pvalues['treat:time']
#print(tmp.y1.mean())
seeds[i] = i
return est, seeds,pvals
# get simulated trt and control data
run_trt = trtPre
run_ctrl = ctrlPre
# estimate power using function defined above
results = run_tests(run_trt,run_ctrl,500)
report = pd.DataFrame(columns=['seed','est'])
report['seed'] = results[1]
report['est'] = results[0]
report['pvals'] = results[2]
report['sig'] = np.where(report['pvals'] < .10,1,0) # alpha = .10, change to desired level of confidence
# print(report)
sum(report['sig'])/len(report['sig']) # power
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment