BioSciEconomist/DID LPM Power.py

## DID LPM Power.py
# *-----------------------------------------------------------------
# | PROGRAM NAME: DID LPM Power.py
# | DATE: 1/6/22
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: simulate data mimicking hypothesized DID scenario and assess power (2x2 DID)
# *----------------------------------------------------------------

# !!!!!! WARNING - UNDER CONSTRUCTION - THIS DOES NOT ACCOUNT FOR CORRELATION STRUCTURES
#         AND RESAMPLING METHOD MAY NEED REVISION

# CONTACT matttbogard@gmail.com for feedback and suggestions to help improve this script

import pandas as pd
import numpy as np
from matplotlib import pyplot
import statsmodels.api as sm # import stastmodels
import statsmodels.formula.api as smf # this allows us to use an explicit formulation

import statsmodels
from statsmodels.stats import power

pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation
np.set_printoptions(suppress=True) # suppress sci notation

#------------------------------------------
# simulate data
#------------------------------------------


T1_TRT = .85 # treatment pre average outcome
T2_TRT = .83 # treatment post average outcome

T1_CTRL = .80 # control pre average outcome
T2_CTRL = .64 # control post average outcome

(T1_TRT - T2_TRT) - (T1_CTRL-T2_CTRL) # population DID treatment effect


# treatment pre post

trtPre = pd.DataFrame(columns=['ID','prob','treat','time'])
trtPre['ID'] =list(range(1,100001))
np.random.seed(123)
trtPre['prob'] = np.random.binomial(1, T1_TRT, 100000)
trtPre['treat'] = 1
trtPre['time'] = 0

trtPre['prob'].describe()


trtPost = pd.DataFrame(columns=['ID','prob','treat','time'])
trtPost['ID'] =list(range(1,100001))
np.random.seed(123)
trtPost['prob'] = np.random.binomial(1,T2_TRT,100000)
trtPost['treat'] = 1
trtPost['time'] = 1

trtPost['prob'].describe()


# control pre post


ctrlPre = pd.DataFrame(columns=['ID','prob','treat','time'])
ctrlPre['ID'] =list(range(100001,200001))
np.random.seed(123)
ctrlPre['prob'] = np.random.binomial(1,T1_CTRL,100000)
ctrlPre['treat'] = 0
ctrlPre['time'] = 0

ctrlPre['prob'].describe()


ctrlPost = pd.DataFrame(columns=['ID','prob','treat','time'])
ctrlPost['ID'] =list(range(100001,200001))
np.random.seed(123)
ctrlPost['prob'] = np.random.binomial(1,T2_CTRL,100000)
ctrlPost['treat'] = 0
ctrlPost['time'] = 1

ctrlPost['prob'].describe()


# combine data

df = pd.concat([trtPre,trtPost,ctrlPre,ctrlPost,],ignore_index=True)

(trtPre.prob.mean()-trtPost.prob.mean()) - (ctrlPre.prob.mean() - ctrlPost.prob.mean())


#----------------------------------------
# estimate DID model
#----------------------------------------


# Y = B0 + B1*Treat + B2*Post  + B3*Treat*Post + e

# raw comparisons
results = smf.ols('prob ~ treat + time + treat*time', data=df).fit() # linear probability model

results.summary()


#------------------------------------
# test on a sample
#-----------------------------------

nSize = 150


np.random.seed(321)
sample_t = trtPre.sample(nSize, replace=False) # get 500 trt IDs

np.random.seed(321)
sample_c = ctrlPre.sample(nSize, replace=False) # get 500 ctrl IDs

ids = pd.concat([sample_t,sample_c,],ignore_index=True)
ids['rsample'] = 1


# flag and subset randomly sampled IDs
rs = pd.merge(df,ids[['rsample','ID']], on='ID', how='left')
mask = (rs['rsample'] == 1)
rs = rs[mask]


# did estimate
results = smf.ols('prob ~ treat + time + treat*time', data=rs).fit() # linear probability model

results.summary()


#--------------------------------
# power analysis - compute power for a given sample size
#--------------------------------

# define power analysis function

nSize = 150

def run_tests(trt_dat,ctrl_dat,size):
    """uses monte carlo type resampling of simulated data to construct power, trt_dat is simulated pre period data for the treatment group, ctrl_dat is simulated pre period data for the control group, size is the # of reps in the simualtion"""
    # Create an empty array to store replicates
    est = np.empty(size)
    seeds = np.empty(size)
    pvals = np.empty(size)
    # pull bootstrapped samples
    for i in range(size):
        print (i)
        # Create a bootstrap sample
        np.random.seed(i)
        sample_t = trt_dat.sample(nSize, replace=False) # get randome set of trt IDs
        np.random.seed(i)
        sample_c = ctrl_dat.sample(nSize, replace=False) # get random set of ctrl IDs

        ids = pd.concat([sample_t,sample_c,],ignore_index=True)
        ids['rsample'] = 1


        # flag and subset randomly sampled IDs
        rs = pd.merge(df,ids[['rsample','ID']], on='ID', how='left')
        mask = (rs['rsample'] == 1)
        rs = rs[mask]

        # fit model
        model = smf.ols('prob ~ treat + time + treat*time', data=rs).fit()
        est[i] = model.params['treat:time']
        pvals[i] = model.pvalues['treat:time']
        #print(tmp.y1.mean())
        seeds[i] = i

    return est, seeds,pvals


# get simulated trt and control data

run_trt = trtPre
run_ctrl = ctrlPre

# estimate power using function defined above

results = run_tests(run_trt,run_ctrl,500)

report = pd.DataFrame(columns=['seed','est'])
report['seed'] = results[1]
report['est'] = results[0]
report['pvals'] = results[2]
report['sig'] = np.where(report['pvals'] < .10,1,0) # alpha = .10, change to desired level of confidence
# print(report)

sum(report['sig'])/len(report['sig']) # power
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: DID LPM Power.py
	# \| DATE: 1/6/22
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: simulate data mimicking hypothesized DID scenario and assess power (2x2 DID)
	# *----------------------------------------------------------------

	# !!!!!! WARNING - UNDER CONSTRUCTION - THIS DOES NOT ACCOUNT FOR CORRELATION STRUCTURES
	# AND RESAMPLING METHOD MAY NEED REVISION

	# CONTACT matttbogard@gmail.com for feedback and suggestions to help improve this script

	import pandas as pd
	import numpy as np
	from matplotlib import pyplot
	import statsmodels.api as sm # import stastmodels
	import statsmodels.formula.api as smf # this allows us to use an explicit formulation

	import statsmodels
	from statsmodels.stats import power

	pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation
	np.set_printoptions(suppress=True) # suppress sci notation

	#------------------------------------------
	# simulate data
	#------------------------------------------


	T1_TRT = .85 # treatment pre average outcome
	T2_TRT = .83 # treatment post average outcome

	T1_CTRL = .80 # control pre average outcome
	T2_CTRL = .64 # control post average outcome

	(T1_TRT - T2_TRT) - (T1_CTRL-T2_CTRL) # population DID treatment effect


	# treatment pre post

	trtPre = pd.DataFrame(columns=['ID','prob','treat','time'])
	trtPre['ID'] =list(range(1,100001))
	np.random.seed(123)
	trtPre['prob'] = np.random.binomial(1, T1_TRT, 100000)
	trtPre['treat'] = 1
	trtPre['time'] = 0

	trtPre['prob'].describe()


	trtPost = pd.DataFrame(columns=['ID','prob','treat','time'])
	trtPost['ID'] =list(range(1,100001))
	np.random.seed(123)
	trtPost['prob'] = np.random.binomial(1,T2_TRT,100000)
	trtPost['treat'] = 1
	trtPost['time'] = 1

	trtPost['prob'].describe()


	# control pre post


	ctrlPre = pd.DataFrame(columns=['ID','prob','treat','time'])
	ctrlPre['ID'] =list(range(100001,200001))
	np.random.seed(123)
	ctrlPre['prob'] = np.random.binomial(1,T1_CTRL,100000)
	ctrlPre['treat'] = 0
	ctrlPre['time'] = 0

	ctrlPre['prob'].describe()


	ctrlPost = pd.DataFrame(columns=['ID','prob','treat','time'])
	ctrlPost['ID'] =list(range(100001,200001))
	np.random.seed(123)
	ctrlPost['prob'] = np.random.binomial(1,T2_CTRL,100000)
	ctrlPost['treat'] = 0
	ctrlPost['time'] = 1

	ctrlPost['prob'].describe()


	# combine data

	df = pd.concat([trtPre,trtPost,ctrlPre,ctrlPost,],ignore_index=True)

	(trtPre.prob.mean()-trtPost.prob.mean()) - (ctrlPre.prob.mean() - ctrlPost.prob.mean())


	#----------------------------------------
	# estimate DID model
	#----------------------------------------


	# Y = B0 + B1Treat + B2Post + B3TreatPost + e

	# raw comparisons
	results = smf.ols('prob ~ treat + time + treat*time', data=df).fit() # linear probability model

	results.summary()


	#------------------------------------
	# test on a sample
	#-----------------------------------

	nSize = 150


	np.random.seed(321)
	sample_t = trtPre.sample(nSize, replace=False) # get 500 trt IDs

	np.random.seed(321)
	sample_c = ctrlPre.sample(nSize, replace=False) # get 500 ctrl IDs

	ids = pd.concat([sample_t,sample_c,],ignore_index=True)
	ids['rsample'] = 1


	# flag and subset randomly sampled IDs
	rs = pd.merge(df,ids[['rsample','ID']], on='ID', how='left')
	mask = (rs['rsample'] == 1)
	rs = rs[mask]


	# did estimate
	results = smf.ols('prob ~ treat + time + treat*time', data=rs).fit() # linear probability model

	results.summary()


	#--------------------------------
	# power analysis - compute power for a given sample size
	#--------------------------------

	# define power analysis function

	nSize = 150

	def run_tests(trt_dat,ctrl_dat,size):
	"""uses monte carlo type resampling of simulated data to construct power, trt_dat is simulated pre period data for the treatment group, ctrl_dat is simulated pre period data for the control group, size is the # of reps in the simualtion"""
	# Create an empty array to store replicates
	est = np.empty(size)
	seeds = np.empty(size)
	pvals = np.empty(size)
	# pull bootstrapped samples
	for i in range(size):
	print (i)
	# Create a bootstrap sample
	np.random.seed(i)
	sample_t = trt_dat.sample(nSize, replace=False) # get randome set of trt IDs
	np.random.seed(i)
	sample_c = ctrl_dat.sample(nSize, replace=False) # get random set of ctrl IDs

	ids = pd.concat([sample_t,sample_c,],ignore_index=True)
	ids['rsample'] = 1


	# flag and subset randomly sampled IDs
	rs = pd.merge(df,ids[['rsample','ID']], on='ID', how='left')
	mask = (rs['rsample'] == 1)
	rs = rs[mask]

	# fit model
	model = smf.ols('prob ~ treat + time + treat*time', data=rs).fit()
	est[i] = model.params['treat:time']
	pvals[i] = model.pvalues['treat:time']
	#print(tmp.y1.mean())
	seeds[i] = i

	return est, seeds,pvals


	# get simulated trt and control data

	run_trt = trtPre
	run_ctrl = ctrlPre

	# estimate power using function defined above

	results = run_tests(run_trt,run_ctrl,500)

	report = pd.DataFrame(columns=['seed','est'])
	report['seed'] = results[1]
	report['est'] = results[0]
	report['pvals'] = results[2]
	report['sig'] = np.where(report['pvals'] < .10,1,0) # alpha = .10, change to desired level of confidence
	# print(report)

	sum(report['sig'])/len(report['sig']) # power