statsccpr/exer_survey_analysis_stata.do Secret

## exer_survey_analysis_stata.do
***********************************************
* Step 0) Understand Survey Data
***********************************************

*load data
import delimited "https://raw.githubusercontent.com/coreysparks/data/master/brfss_11.csv", clear


*convert all NAs to "" (Stata missing for strings)
ds, has(type string)
local strvars "`r(varlist)'"
foreach var of local strvars {
replace `var' = "" if `var' == "NA"
}

gen statefip = string(state, "%02.0f")
gen cofip = string(cnty, "%03.0f")
gen cofips = statefip + cofip

gen weight = cntywt

*just for brevity, I just select TX respondents with non missing weights
drop if weight == .

*********************************************
*wrangle 'measurement' variables
*********************************************

*smoking currently
capture drop smoke
recode smoker3 (1/2=1) (3/4=0) (nonmissing = .), gen(smoke)

*low physical activity
capture drop lowact
recode pacat (1/2=0) (3/4=1) (nonmissing=.), gen(lowact)

*hc
capture drop hc
gen hc = .
replace hc = 1 if toldhi2 == "1"
replace hc = 0 if toldhi2 == "2"


*marital status
capture drop marst
label define marst 1 "married" 2 "divorced" 3 "widowed" 4 "separated" 5 "nm" 6 "cohab"
gen marst = marital
label values marst marst
replace marst = . if marst == 9

*income
capture drop inc
gen inc = incomg
replace inc = . if inc == 9


*age z-standardized
egen agez = std(age)


* parec
* what is cryptically named "parec" ?
* exercise: look up in codebook
summ parec, detail


* the 'measurement' variables to use
global names_measure_var agez hc marst lowact smoke inc

summ $names_measure_var, detail

li $names_measure_var in 1/10

*********************************************
*wrangle 'design' variables
*********************************************
*cntywt ok as is
desc cntywt

capture drop sampleid
gen sampleid = string(state) + " " + string(ststr)

* within each sampling unit, sum the weights
sort sampleid
by sampleid: gen sumwt = sum(cntywt)
by sampleid: replace sumwt = sumwt[_N]

*number of people per sampling unit
gen n = 1
by sampleid: gen jn = sum(n)
by sampleid: replace jn = jn[_N]


*In the new data set, multiply the original weight by the fraction of the
* sampling unit total population each person represents
gen swts = cntywt*jn/sumwt

hist swts

*the 'design' varaibles to use
global names_design_var ststr swts

*save data
save brfss_11, replace

*********************************************
* gather 'measurement' and 'design' variables
*********************************************

*keep $names_measure_var $names_design_var

keep $names_measure_var $names_design_var age


***********************************************
* Step 1) Declare the Survey Design
***********************************************

****************************************
* set survey design
* non-imputation example
****************************************

/* In Stata, to analysize survey data, you use the same commands as you
 would to analyze non-survey data, but specify the prefix "svy: "
 before the command, which will tell Stata to perform the command
 while accounting for the survey design characteristic specified
 in a "svyset" command */

*use brfss_11, clear

*Stata help for all survey commands
help survey

*help for just svyset
help svyset

*first we need to our survey design variables in svyset
*if a psu variable, specify immediately after svyset
*[pw=swts] is probability weights
*singleunit(centered) is single unit strata centered at grand mean
svyset [pw=swts], strata(ststr) singleunit(centered)

*example with jackknife replicate weights
*svyset [pw=weight], jkrweight(jkw_*) vce(jackknife)

*********************************************************
* Step 2a) Conduct Survey Analysis (without Missing Data)
*********************************************************

**************************
* descriptives
**************************

*Now when we precede a command with "svy:", Stata will incorporate
* the survey design specified in "svyset" in the estimation

*help for survey prefix for estimation commands
help svy

display _newline "Note: age does not have missing cases"
svy: mean age

display _newline "Note: smoke has missing cases, mean of smoke"
svy: mean smoke

*we will compare these results to results after imputation later
estimates store mean_smoke

display _newline "Note: smoke has missing cases, total of smoke"
svy: total smoke
estimates store total_smoke


*EXERCISE: try computing total counts of smokers, accounting for survey design

*********************************************************
* Regression
*********************************************************

* codebook says the cryptically named 'parec' variable is
* parec: Aerobic and Strengthening Guideline

svy: regress agez i.parec


* note: default omit missing from model
svy: logit smoke c.agez i.inc c.lowact

* store results to compare later
estimates store logit_smoke


*********************************************************
* Subpopulation (Subdomain) Analysis
*********************************************************

*for more help on subpopulation analysis
*go to help page for svy, scroll down to "Options" section, and click on blue link for
* [SVY] subpopulation estimation
help svy

di 'mean of age on whole sample'
svy: mean age

*subpop is an option of "svy: "
di 'mean of age on Age 50 and older sample'
svy, subpop(if age >= 50): mean age

message('mean of age on Age under 50 sample')
svy, subpop(if age < 50): mean age


*********************************************************
* Step 2b1) Impute Missing Data + Propagate into Design
*********************************************************
****************************************
* impute via chained equations
****************************************
*load data
use brfss_11, clear

global names_design_var ststr swts
global names_measure_var agez hc marst lowact smoke inc

di "design variables are $names_design_var"

di "measured variables are $names_measure_var"

di "dataset for imputation is " $names_design_var $names_measure_var


* help for multiple imputation in general
help mi
* help for multiple imputation with chained equations (mice)
help mi impute chained

*reduce dataset to just design and measure vars
keep $names_design_var $names_measure_var


/*steps in Stat multiple imputation
  1. mi set: declare data to be multiply-imputed, specify if imputations appear
     as new rows of data or as new columns
  2. mi register: tell Stata which variables need to be imputed
  3. mi impute [chained] : impute missing values
  4. mi estimate : perform estimation with mi data
*/
help mi set
help mi register
help mi impute
help mi estimate


*first need to mi set data, imputed data will be appended as new datasets
*syntax: mi set _style_, where _style_ is wide, mlong, flong, or flongsep
*flong means add new datasets, with all rows of data (whether or not row has
*  any missing)
mi set flong

*then need to declare which variables need to be imputed
*to check which variables have missing, first use misstable summarize
misstable summarize

*we need to impute for smoke, lowact, hc, marst, inc
* so we register these as imputed
mi register imputed smoke lowact hc marst inc

*impute using chained equations
*5 imputations
*R mice defaults:
*  predictive mean matching with 5 donors for continuous (numeric),
*  multinomial logistic for categorical with > 2 levels (factor)

*for imputation with chained equations:
* "mi impute chained"
* then list of variables to be imputed goes before "="
* specify imputation model (e.g. regress, logit, mlogit, pmm, etc.)
*    before each set
* variables with no missing used in imputation models go to the right of "="

*this won't work because of perfect prediction error
mi impute chained (pmm, knn(5)) smoke lowact hc (mlogit) marst inc = ststr swts, add(5)

*click on blue link
*"The issue of perfect prediction during imputation of categorical data"*
*in error returned for more information

*augment adds observations with tiny weights to prevent perfect prediction errors
mi impute chained (pmm, knn(5)) smoke lowact hc (mlogit) marst inc = ststr swts, add(5) augment

save imputed_all.dta, replace

****************************************
* use 'full' PredictorMatrix (default)
* eg X_j|X_{-j} for all j
* if 'design' variables part of j, then they are used to impute
****************************************

*impute again, but omitting design variables from imputation model
*NOT RECOMMENDED
use brfss_11, clear
mi set flong
mi register imputed smoke lowact hc marst inc
mi impute chained (pmm, knn(5)) smoke lowact hc (mlogit) marst inc , add(5) augment

save imputed_no_design.dta, replace


*********************************************************
* Propagate Imputation Uncertainty into Design via 'mitools'
*********************************************************

*********************************************************
* Step 2b2) Conduct Survey Analysis (with Missing Data)
*********************************************************

*load imputed data
use imputed_all.dta, clear

* for mi data, we need "mi svyset" instead of just "svyset"
help mi svyset

mi svyset [pw=swts], strata(ststr) singleunit(centered)

*for analysis of imputed, complex survey data, we need to use both
* mi estimate:
*   AND THEN
* svy:

*now to estimate
*notice n=7594, previously n = 7540
di as error _newline "proportion of smoke after imputation"
mi estimate: svy: mean smoke

di as error _newline "total counts of smoke after imputation"
mi estimate: svy: total smoke

*compare to results of non-imputed data (total changes a lot more, why?)
estimates replay mean_smoke
estimates replay total_smoke

**LOGISTIC REGRESSION
mi estimate: svy: logit smoke c.agez i.inc c.lowact

*to see individual regressions, add "noisily" option to mi estimate:
mi estimate, noisily: svy: logit smoke c.agez i.inc c.lowact

*final coefficient for lowact is mean of the 5 coefficients
di (.2858381 +  .3188377 +.3226212+.3425455+.3060013)/5

*compare to analysis of non-imputed data
estimates replay logit_smoke
	***********************************************
	* Step 0) Understand Survey Data
	***********************************************

	*load data
	import delimited "https://raw.githubusercontent.com/coreysparks/data/master/brfss_11.csv", clear


	*convert all NAs to "" (Stata missing for strings)
	ds, has(type string)
	local strvars "`r(varlist)'"
	foreach var of local strvars {
	replace `var' = "" if `var' == "NA"
	}

	gen statefip = string(state, "%02.0f")
	gen cofip = string(cnty, "%03.0f")
	gen cofips = statefip + cofip

	gen weight = cntywt

	*just for brevity, I just select TX respondents with non missing weights
	drop if weight == .

	*********************************************
	*wrangle 'measurement' variables
	*********************************************

	*smoking currently
	capture drop smoke
	recode smoker3 (1/2=1) (3/4=0) (nonmissing = .), gen(smoke)

	*low physical activity
	capture drop lowact
	recode pacat (1/2=0) (3/4=1) (nonmissing=.), gen(lowact)

	*hc
	capture drop hc
	gen hc = .
	replace hc = 1 if toldhi2 == "1"
	replace hc = 0 if toldhi2 == "2"


	*marital status
	capture drop marst
	label define marst 1 "married" 2 "divorced" 3 "widowed" 4 "separated" 5 "nm" 6 "cohab"
	gen marst = marital
	label values marst marst
	replace marst = . if marst == 9

	*income
	capture drop inc
	gen inc = incomg
	replace inc = . if inc == 9


	*age z-standardized
	egen agez = std(age)


	* parec
	* what is cryptically named "parec" ?
	* exercise: look up in codebook
	summ parec, detail


	* the 'measurement' variables to use
	global names_measure_var agez hc marst lowact smoke inc

	summ $names_measure_var, detail

	li $names_measure_var in 1/10

	*********************************************
	*wrangle 'design' variables
	*********************************************
	*cntywt ok as is
	desc cntywt

	capture drop sampleid
	gen sampleid = string(state) + " " + string(ststr)

	* within each sampling unit, sum the weights
	sort sampleid
	by sampleid: gen sumwt = sum(cntywt)
	by sampleid: replace sumwt = sumwt[_N]

	*number of people per sampling unit
	gen n = 1
	by sampleid: gen jn = sum(n)
	by sampleid: replace jn = jn[_N]


	*In the new data set, multiply the original weight by the fraction of the
	* sampling unit total population each person represents
	gen swts = cntywt*jn/sumwt

	hist swts

	*the 'design' varaibles to use
	global names_design_var ststr swts

	*save data
	save brfss_11, replace

	*********************************************
	* gather 'measurement' and 'design' variables
	*********************************************

	*keep $names_measure_var $names_design_var

	keep $names_measure_var $names_design_var age


	***********************************************
	* Step 1) Declare the Survey Design
	***********************************************

	****************************************
	* set survey design
	* non-imputation example
	****************************************

	/* In Stata, to analysize survey data, you use the same commands as you
	would to analyze non-survey data, but specify the prefix "svy: "
	before the command, which will tell Stata to perform the command
	while accounting for the survey design characteristic specified
	in a "svyset" command */

	*use brfss_11, clear

	*Stata help for all survey commands
	help survey

	*help for just svyset
	help svyset

	*first we need to our survey design variables in svyset
	*if a psu variable, specify immediately after svyset
	*[pw=swts] is probability weights
	*singleunit(centered) is single unit strata centered at grand mean
	svyset [pw=swts], strata(ststr) singleunit(centered)

	*example with jackknife replicate weights
	svyset [pw=weight], jkrweight(jkw_) vce(jackknife)

	*********************************************************
	* Step 2a) Conduct Survey Analysis (without Missing Data)
	*********************************************************

	**************************
	* descriptives
	**************************

	*Now when we precede a command with "svy:", Stata will incorporate
	* the survey design specified in "svyset" in the estimation

	*help for survey prefix for estimation commands
	help svy

	display _newline "Note: age does not have missing cases"
	svy: mean age

	display _newline "Note: smoke has missing cases, mean of smoke"
	svy: mean smoke

	*we will compare these results to results after imputation later
	estimates store mean_smoke

	display _newline "Note: smoke has missing cases, total of smoke"
	svy: total smoke
	estimates store total_smoke


	*EXERCISE: try computing total counts of smokers, accounting for survey design

	*********************************************************
	* Regression
	*********************************************************

	* codebook says the cryptically named 'parec' variable is
	* parec: Aerobic and Strengthening Guideline

	svy: regress agez i.parec


	* note: default omit missing from model
	svy: logit smoke c.agez i.inc c.lowact

	* store results to compare later
	estimates store logit_smoke


	*********************************************************
	* Subpopulation (Subdomain) Analysis
	*********************************************************

	*for more help on subpopulation analysis
	*go to help page for svy, scroll down to "Options" section, and click on blue link for
	* [SVY] subpopulation estimation
	help svy

	di 'mean of age on whole sample'
	svy: mean age

	*subpop is an option of "svy: "
	di 'mean of age on Age 50 and older sample'
	svy, subpop(if age >= 50): mean age

	message('mean of age on Age under 50 sample')
	svy, subpop(if age < 50): mean age


	*********************************************************
	* Step 2b1) Impute Missing Data + Propagate into Design
	*********************************************************
	****************************************
	* impute via chained equations
	****************************************
	*load data
	use brfss_11, clear

	global names_design_var ststr swts
	global names_measure_var agez hc marst lowact smoke inc

	di "design variables are $names_design_var"

	di "measured variables are $names_measure_var"

	di "dataset for imputation is " $names_design_var $names_measure_var


	* help for multiple imputation in general
	help mi
	* help for multiple imputation with chained equations (mice)
	help mi impute chained

	*reduce dataset to just design and measure vars
	keep $names_design_var $names_measure_var




	/*steps in Stat multiple imputation
	1. mi set: declare data to be multiply-imputed, specify if imputations appear
	as new rows of data or as new columns
	2. mi register: tell Stata which variables need to be imputed
	3. mi impute [chained] : impute missing values
	4. mi estimate : perform estimation with mi data
	*/
	help mi set
	help mi register
	help mi impute
	help mi estimate


	*first need to mi set data, imputed data will be appended as new datasets
	*syntax: mi set _style_, where _style_ is wide, mlong, flong, or flongsep
	*flong means add new datasets, with all rows of data (whether or not row has
	* any missing)
	mi set flong

	*then need to declare which variables need to be imputed
	*to check which variables have missing, first use misstable summarize
	misstable summarize

	*we need to impute for smoke, lowact, hc, marst, inc
	* so we register these as imputed
	mi register imputed smoke lowact hc marst inc

	*impute using chained equations
	*5 imputations
	*R mice defaults:
	* predictive mean matching with 5 donors for continuous (numeric),
	* multinomial logistic for categorical with > 2 levels (factor)

	*for imputation with chained equations:
	* "mi impute chained"
	* then list of variables to be imputed goes before "="
	* specify imputation model (e.g. regress, logit, mlogit, pmm, etc.)
	* before each set
	* variables with no missing used in imputation models go to the right of "="

	*this won't work because of perfect prediction error
	mi impute chained (pmm, knn(5)) smoke lowact hc (mlogit) marst inc = ststr swts, add(5)

	*click on blue link
	"The issue of perfect prediction during imputation of categorical data"
	*in error returned for more information

	*augment adds observations with tiny weights to prevent perfect prediction errors
	mi impute chained (pmm, knn(5)) smoke lowact hc (mlogit) marst inc = ststr swts, add(5) augment

	save imputed_all.dta, replace

	****************************************
	* use 'full' PredictorMatrix (default)
	* eg X_j\|X_{-j} for all j
	* if 'design' variables part of j, then they are used to impute
	****************************************

	*impute again, but omitting design variables from imputation model
	*NOT RECOMMENDED
	use brfss_11, clear
	mi set flong
	mi register imputed smoke lowact hc marst inc
	mi impute chained (pmm, knn(5)) smoke lowact hc (mlogit) marst inc , add(5) augment

	save imputed_no_design.dta, replace


	*********************************************************
	* Propagate Imputation Uncertainty into Design via 'mitools'
	*********************************************************

	*********************************************************
	* Step 2b2) Conduct Survey Analysis (with Missing Data)
	*********************************************************

	*load imputed data
	use imputed_all.dta, clear

	* for mi data, we need "mi svyset" instead of just "svyset"
	help mi svyset

	mi svyset [pw=swts], strata(ststr) singleunit(centered)

	*for analysis of imputed, complex survey data, we need to use both
	* mi estimate:
	* AND THEN
	* svy:

	*now to estimate
	*notice n=7594, previously n = 7540
	di as error _newline "proportion of smoke after imputation"
	mi estimate: svy: mean smoke

	di as error _newline "total counts of smoke after imputation"
	mi estimate: svy: total smoke

	*compare to results of non-imputed data (total changes a lot more, why?)
	estimates replay mean_smoke
	estimates replay total_smoke

	**LOGISTIC REGRESSION
	mi estimate: svy: logit smoke c.agez i.inc c.lowact

	*to see individual regressions, add "noisily" option to mi estimate:
	mi estimate, noisily: svy: logit smoke c.agez i.inc c.lowact

	*final coefficient for lowact is mean of the 5 coefficients
	di (.2858381 + .3188377 +.3226212+.3425455+.3060013)/5

	*compare to analysis of non-imputed data
	estimates replay logit_smoke