bbdaniels/selection-interactive.do

## selection-interactive.do
 // Selection bias for normally-distributed effect with choice
// Setup
qui {
  clear all
  cls
  version 13

  // Reproducibility with bit.ly/stata-random
  set seed 422698	// Timestamp: 2021-01-22 16:41:57 UTC

// Request parameter values
  cap prog drop selection_setup
  prog def selection_setup

    di in smcl `"{ red : Welcome to the selection bias simulation program.}"'
    di in smcl `"{ red : Please enter the requested parameter values:}"'
    di `"Enter number of observations (integer)"'  _request(n)
    di `"Enter true effect size (real)"' _request(t)
    di `"Enter number of iterations (integer)"' _request(it)

  end
}

// Request parameters

  selection_setup

qui {
// Create simulation program
  cap prog drop selection
  prog def selection , rclass // Allow statistics output
  qui {
    syntax , [pause] [exit] [clear] //  Allow pausing and breaking

    if "`exit'" == "" preserve // Reset Stata state after finishing program run

    // Set parameters for simulation
    local n = ${n} // Number of observations
    local t = ${t} // True β of treatment effect
    local v = 1 // True standard deviation of treatment effect
    local s = 1 // True standard deviation of potential outcomes

    // Potential outcomes
    `clear'
    set obs `n' // Create simulation observations
      gen id = _n // ID variable
        lab var id "Unique ID"
      gen y_0 = rnormal(0,`s')         // Potential outcome 0: untreated
        lab var y_0 "Untreated Potential Outcome"
      gen y_1 = y_0 + rnormal(`t',`v') // Potential outcome 1: treated
        lab var y_1 "Treated Potential Outcome"

    // Allow choice if positive returns
    gen return = y_1 - y_0
      lab var return "Individual Gain to Education"
    gen education = (return > 0) // Choose education
      lab var education "Educated"
    gen y = y_0 // Observed outcome
      replace y = y_1 if (education == 1)
      lab var y "Observed Outcome"

    // Create randomization
    gen y_rand = y_0 // Observed outcome under randomization
      gen education_rand = (rnormal() > 0)
        lab var education_rand "Randomized Treatment"
      replace y_rand = y_1 if (education_rand == 1)
        lab var y_rand "Randomized Observed Outcome"

    // Regression time
    sum return // Investigate true β
      return scalar true = `r(mean)'
    reg y education // Investigate observed β
      mat results = r(table) // Get regression results
        local beta = results[1,1]
        local pval = results[4,1]

      // Push results to main memory
        return scalar sigb = (`pval' < 0.05)
        return scalar beta = `beta'
        return matrix results = results

    // Regression time (randomization)
    reg y_rand education_rand // Investigate observed β
      mat results = r(table) // Get regression results
        local beta_rand = results[1,1]
        local pval_rand = results[4,1]

      // Push results to main memory
        return scalar sigb_rand = (`pval_rand' < 0.05)
        return scalar beta_rand = `beta_rand'
        return matrix results_rand = results

    // Pause or here if we want to see microdata
    pause on
    `pause' // Type "BREAK" in console to quit, "q" to continue
    `exit' // Ends here if [exit] option called
  }
  end

// Multiple simulations

  clear
  tempfile sims
}
  simulate r(true) r(beta) r(beta_rand) r(sigb) r(sigb_rand) ///
    , reps(${it}) saving(`sims') seed(858488) /// Timestamp: 2021-01-22 17:25:50 UTC
    : selection , clear


qui {
  // Set up graphing
  noi di in smcl `"{ red : Compiling results visualization...}"' _newline _newline

  use `sims' , clear

    qui su _sim_2
      local m2 = r(min)
    qui su _sim_3
      local m3 = r(min)

    local min = min(`m2',`m3')

  // Visualize
  tw ///
    (histogram _sim_2 , w(0.05) lc(none) fc(red%75) barwidth(0.04) start(`min') freq) ///
    (histogram _sim_2 if (_sim_4 == 1), w(0.05) lc(black) fc(none) barwidth(0.04) start(`min') freq) ///
    (histogram _sim_1 , w(0.05) lc(none) fc(black%50) barwidth(0.04) start(`min')  freq) ///
  , legend(on order(0 "Effect Size With Choice:" 1 "Estimated Returns" 2 "Significant Estimates" 3 "True Returns") ///
      ring(1) pos(12) r(1) symxsize(small) region(lc(none))) ///
    nodraw saving(g1.gph , replace)

  // Visualize with randomization
  tw ///
    (histogram _sim_3 , w(0.05) lc(none) fc(red%75) barwidth(0.045) start(`min')  freq) ///
    (histogram _sim_3 if (_sim_5 == 1), w(0.05) lc(black) fc(none) barwidth(0.045) start(`min')  freq) ///
    (histogram _sim_1 , w(0.05) lc(none) fc(black%50) barwidth(0.04) start(`min')  freq) ///
  , legend(on order(0 "Effect Size With Randomization:" 1 "Estimated Returns" 2 "Significant Estimates" 3 "True Returns") ///
      ring(1) pos(12) r(1) symxsize(small) region(lc(none))) ///
    nodraw saving(g2.gph , replace)

  // Graph
  graph combine g1.gph g2.gph , c(1) xcom
  !rm g1.gph g2.gph

}
// End of dofile

## simple-selection.do
// Selection bias for normally-distributed effect with choice

// Setup
clear
cls
version 13

// Reproducibility with bit.ly/stata-random
set seed 422698	// Timestamp: 2021-01-22 16:41:57 UTC

// Create simulation program
cap prog drop selection
prog def selection , rclass // Allow statistics output

  syntax , [pause] [exit] [clear] //  Allow pausing and breaking

  if "`exit'" == "" preserve // Reset Stata state after finishing program run

  // Set parameters for simulation
  local n = 1000 // Number of observations
  local t = 0 // True β of treatment effect
  local v = 1 // True standard deviation of treatment effect
  local s = 1 // True standard deviation of potential outcomes

  // Potential outcomes
  `clear'
  set obs `n' // Create simulation observations
    gen id = _n // ID variable
      lab var id "Unique ID"
    gen y_0 = rnormal(0,`s')         // Potential outcome 0: untreated
      lab var y_0 "Untreated Potential Outcome"
    gen y_1 = y_0 + rnormal(`t',`v') // Potential outcome 1: treated
      lab var y_1 "Treated Potential Outcome"

  // Allow choice if positive returns
  gen return = y_1 - y_0
    lab var return "Individual Gain to Education"
  gen education = (return > 0) // Choose education
    lab var education "Educated"
  gen y = y_0 // Observed outcome
    replace y = y_1 if (education == 1)
    lab var y "Observed Outcome"

  // Create randomization
  gen y_rand = y_0 // Observed outcome under randomization
    gen education_rand = (rnormal() > 0)
      lab var education_rand "Randomized Treatment"
    replace y_rand = y_1 if (education_rand == 1)
      lab var y_rand "Randomized Observed Outcome"

  // Regression time
  sum return // Investigate true β
    return scalar true = `r(mean)'
  reg y education // Investigate observed β
    mat results = r(table) // Get regression results
      local beta = results[1,1]

    // Push results to main memory
      return scalar beta = `beta'
      return matrix results = results

  // Regression time (randomization)
  reg y_rand education_rand // Investigate observed β
    mat results = r(table) // Get regression results
      local beta_rand = results[1,1]

    // Push results to main memory
      return scalar beta_rand = `beta_rand'
      return matrix results_rand = results

  // Pause or here if we want to see microdata
  pause on
  `pause' // Type "BREAK" in console to quit, "q" to continue
  `exit' // Ends here if [exit] option called

end

// Single simulation run

  selection
    matlist r(results) // View returned estimates
    di r(beta) // View estimated beta
    di r(true) // View true beta

// Single simulation run with microdata

  selection, exit

// Multiple simulations

  clear
  tempfile sims

  simulate r(true) r(beta) r(beta_rand) ///
    , reps(100) saving(`sims') seed(858488) /// Timestamp: 2021-01-22 17:25:50 UTC
    : selection , clear

  use `sims' , clear

  // Visualize
  tw ///
    (histogram _sim_2 , w(0.05) lc(none) fc(red) barwidth(0.04) start(-0.2) freq) ///
    (histogram _sim_1 , w(0.05) lc(none) fc(gs12) barwidth(0.04) start(-0.2)  freq) ///
  , legend(on c(1) order(0 "With Choice:" 1 "Estimated Returns" 2 "True Returns") ring(0) pos(1))

  // Visualize with randomization
  tw ///
    (histogram _sim_3 , w(0.05) lc(none) fc(red%75) barwidth(0.045) start(-0.2)  freq) ///
    (histogram _sim_1 , w(0.05) lc(none) fc(black%75) barwidth(0.04) start(-0.2)  freq) ///
  , legend(on c(1) order(0 "With Randomization:" 1 "Estimated Returns" 2 "True Returns") ring(0) pos(1))

// End of dofile

## skill-selection.do
/*
Simulation 2:
All for a more structured wage equation.
Lets take a more complicated function,
so that your `skills’ are √sθ.
Here, θ is exogenous ability.
The benefits of being educated w^e=a+√sθ and,
in fact, s itself will depend on θ,
so that w^e=a+√(s(θ)θ).
So. people choose s to max. w^e
and the costs of education are given by rs,
where r is the interest rate. So, simulation does the following:

Step 1: Draw θ from a distribution
Step 2: Let the user define the interest rate between 0 and 1
Step 3: Assign an optimal s to everyone, which just max. w^e - rs, which implies that s*=θ/(4r^2)
Step 4: Construct a regression by adding some noise
Step 5: Regress w^e on s

Then, modifications and questions for understanding this better could be:
Q1. What would happen if s was allocated randomly?
  (problem here is that s is not either 0 or 1,
  so perhaps see if we can bound r to make things more sensible)
Q2. Would increasing the sample size solve the problem?
*/

// Start of dofile

// Setup
clear
cls
version 13

// Reproducibility with bit.ly/stata-random
set seed 155966	// Timestamp: 2021-01-22 18:25:30 UTC

  set obs 100 // allow x education levels
    gen education = _n-1
    tsset education
  gen r = 0.05 // Interest rate (1=100%)
  local n = 5 // number of individuals

  local colors = "maroon navy dkgreen black dkorange"

  forv i = 1/`n' {
    local theta`i' = runiform()
    gen payoff`i' = sqrt(education*`theta`i'')
    gen margin`i' = d.payoff`i'
    gen chosen`i' = margin`i' > r
      gen opt`i' = f.chosen`i' != chosen`i' & education != 99

    local color : word `i' of `colors'

    local graphs = "`graphs' " ///
      + "(line payoff`i' education , lc(`color'))" ///
      + "(line payoff`i' education if chosen`i' == 1 , lw(vthick) lc(`color'))" ///
      + "(scatter payoff`i' education if opt`i' == 1 , mc(black) msize(large))" ///

    local graphs2 = "`graphs2' " ///
      + "(line margin`i' education if chosen`i' == 1, lc(`color') lp(dash))" ///
      + "(line margin`i' education if chosen`i' == 0, lc(gs14) lp(dash))"
  }

  tempfile g1 g2

  tw `graphs' ///
    , title("Returns") xtit(" ") ///
      saving(`g1'.gph, replace) nodraw

  tw `graphs2' if education > 10  ///
   , title("Marginal Returns") xtit(" ") yline(0.05) ///
     saving(`g2'.gph, replace) nodraw ylab(0 "0" 0.05 "R")

  graph combine `g1'.gph `g2'.gph , c(1) xcom ysize(7)

// End of dofile
	// Selection bias for normally-distributed effect with choice
	// Setup
	qui {
	clear all
	cls
	version 13

	// Reproducibility with bit.ly/stata-random
	set seed 422698 // Timestamp: 2021-01-22 16:41:57 UTC

	// Request parameter values
	cap prog drop selection_setup
	prog def selection_setup

	di in smcl `"{ red : Welcome to the selection bias simulation program.}"'
	di in smcl `"{ red : Please enter the requested parameter values:}"'
	di `"Enter number of observations (integer)"' _request(n)
	di `"Enter true effect size (real)"' _request(t)
	di `"Enter number of iterations (integer)"' _request(it)

	end
	}

	// Request parameters

	selection_setup

	qui {
	// Create simulation program
	cap prog drop selection
	prog def selection , rclass // Allow statistics output
	qui {
	syntax , [pause] [exit] [clear] // Allow pausing and breaking

	if "`exit'" == "" preserve // Reset Stata state after finishing program run

	// Set parameters for simulation
	local n = ${n} // Number of observations
	local t = ${t} // True β of treatment effect
	local v = 1 // True standard deviation of treatment effect
	local s = 1 // True standard deviation of potential outcomes

	// Potential outcomes
	`clear'
	set obs `n' // Create simulation observations
	gen id = _n // ID variable
	lab var id "Unique ID"
	gen y_0 = rnormal(0,`s') // Potential outcome 0: untreated
	lab var y_0 "Untreated Potential Outcome"
	gen y_1 = y_0 + rnormal(`t',`v') // Potential outcome 1: treated
	lab var y_1 "Treated Potential Outcome"

	// Allow choice if positive returns
	gen return = y_1 - y_0
	lab var return "Individual Gain to Education"
	gen education = (return > 0) // Choose education
	lab var education "Educated"
	gen y = y_0 // Observed outcome
	replace y = y_1 if (education == 1)
	lab var y "Observed Outcome"

	// Create randomization
	gen y_rand = y_0 // Observed outcome under randomization
	gen education_rand = (rnormal() > 0)
	lab var education_rand "Randomized Treatment"
	replace y_rand = y_1 if (education_rand == 1)
	lab var y_rand "Randomized Observed Outcome"

	// Regression time
	sum return // Investigate true β
	return scalar true = `r(mean)'
	reg y education // Investigate observed β
	mat results = r(table) // Get regression results
	local beta = results[1,1]
	local pval = results[4,1]

	// Push results to main memory
	return scalar sigb = (`pval' < 0.05)
	return scalar beta = `beta'
	return matrix results = results

	// Regression time (randomization)
	reg y_rand education_rand // Investigate observed β
	mat results = r(table) // Get regression results
	local beta_rand = results[1,1]
	local pval_rand = results[4,1]

	// Push results to main memory
	return scalar sigb_rand = (`pval_rand' < 0.05)
	return scalar beta_rand = `beta_rand'
	return matrix results_rand = results

	// Pause or here if we want to see microdata
	pause on
	`pause' // Type "BREAK" in console to quit, "q" to continue
	`exit' // Ends here if [exit] option called
	}
	end

	// Multiple simulations

	clear
	tempfile sims
	}
	simulate r(true) r(beta) r(beta_rand) r(sigb) r(sigb_rand) ///
	, reps(${it}) saving(`sims') seed(858488) /// Timestamp: 2021-01-22 17:25:50 UTC
	: selection , clear



	qui {
	// Set up graphing
	noi di in smcl `"{ red : Compiling results visualization...}"' _newline _newline

	use `sims' , clear

	qui su _sim_2
	local m2 = r(min)
	qui su _sim_3
	local m3 = r(min)

	local min = min(`m2',`m3')

	// Visualize
	tw ///
	(histogram _sim_2 , w(0.05) lc(none) fc(red%75) barwidth(0.04) start(`min') freq) ///
	(histogram _sim_2 if (_sim_4 == 1), w(0.05) lc(black) fc(none) barwidth(0.04) start(`min') freq) ///
	(histogram _sim_1 , w(0.05) lc(none) fc(black%50) barwidth(0.04) start(`min') freq) ///
	, legend(on order(0 "Effect Size With Choice:" 1 "Estimated Returns" 2 "Significant Estimates" 3 "True Returns") ///
	ring(1) pos(12) r(1) symxsize(small) region(lc(none))) ///
	nodraw saving(g1.gph , replace)

	// Visualize with randomization
	tw ///
	(histogram _sim_3 , w(0.05) lc(none) fc(red%75) barwidth(0.045) start(`min') freq) ///
	(histogram _sim_3 if (_sim_5 == 1), w(0.05) lc(black) fc(none) barwidth(0.045) start(`min') freq) ///
	(histogram _sim_1 , w(0.05) lc(none) fc(black%50) barwidth(0.04) start(`min') freq) ///
	, legend(on order(0 "Effect Size With Randomization:" 1 "Estimated Returns" 2 "Significant Estimates" 3 "True Returns") ///
	ring(1) pos(12) r(1) symxsize(small) region(lc(none))) ///
	nodraw saving(g2.gph , replace)

	// Graph
	graph combine g1.gph g2.gph , c(1) xcom
	!rm g1.gph g2.gph

	}
	// End of dofile
	/*
	Simulation 2:
	All for a more structured wage equation.
	Lets take a more complicated function,
	so that your `skills’ are √sθ.
	Here, θ is exogenous ability.
	The benefits of being educated w^e=a+√sθ and,
	in fact, s itself will depend on θ,
	so that w^e=a+√(s(θ)θ).
	So. people choose s to max. w^e
	and the costs of education are given by rs,
	where r is the interest rate. So, simulation does the following:

	Step 1: Draw θ from a distribution
	Step 2: Let the user define the interest rate between 0 and 1
	Step 3: Assign an optimal s to everyone, which just max. w^e - rs, which implies that s*=θ/(4r^2)
	Step 4: Construct a regression by adding some noise
	Step 5: Regress w^e on s

	Then, modifications and questions for understanding this better could be:
	Q1. What would happen if s was allocated randomly?
	(problem here is that s is not either 0 or 1,
	so perhaps see if we can bound r to make things more sensible)
	Q2. Would increasing the sample size solve the problem?
	*/

	// Start of dofile

	// Setup
	clear
	cls
	version 13

	// Reproducibility with bit.ly/stata-random
	set seed 155966 // Timestamp: 2021-01-22 18:25:30 UTC

	set obs 100 // allow x education levels
	gen education = _n-1
	tsset education
	gen r = 0.05 // Interest rate (1=100%)
	local n = 5 // number of individuals

	local colors = "maroon navy dkgreen black dkorange"

	forv i = 1/`n' {
	local theta`i' = runiform()
	gen payoff`i' = sqrt(education*`theta`i'')
	gen margin`i' = d.payoff`i'
	gen chosen`i' = margin`i' > r
	gen opt`i' = f.chosen`i' != chosen`i' & education != 99

	local color : word `i' of `colors'

	local graphs = "`graphs' " ///
	+ "(line payoff`i' education , lc(`color'))" ///
	+ "(line payoff`i' education if chosen`i' == 1 , lw(vthick) lc(`color'))" ///
	+ "(scatter payoff`i' education if opt`i' == 1 , mc(black) msize(large))" ///

	local graphs2 = "`graphs2' " ///
	+ "(line margin`i' education if chosen`i' == 1, lc(`color') lp(dash))" ///
	+ "(line margin`i' education if chosen`i' == 0, lc(gs14) lp(dash))"
	}

	tempfile g1 g2

	tw `graphs' ///
	, title("Returns") xtit(" ") ///
	saving(`g1'.gph, replace) nodraw

	tw `graphs2' if education > 10 ///
	, title("Marginal Returns") xtit(" ") yline(0.05) ///
	saving(`g2'.gph, replace) nodraw ylab(0 "0" 0.05 "R")

	graph combine `g1'.gph `g2'.gph , c(1) xcom ysize(7)

	// End of dofile