-
-
Save statsccpr/ba1163978afb8d8ddd3115b76c37d5c6 to your computer and use it in GitHub Desktop.
exer_survey_analysis_stata.do
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*********************************************** | |
* Step 0) Understand Survey Data | |
*********************************************** | |
*load data | |
import delimited "https://raw.githubusercontent.com/coreysparks/data/master/brfss_11.csv", clear | |
*convert all NAs to "" (Stata missing for strings) | |
ds, has(type string) | |
local strvars "`r(varlist)'" | |
foreach var of local strvars { | |
replace `var' = "" if `var' == "NA" | |
} | |
gen statefip = string(state, "%02.0f") | |
gen cofip = string(cnty, "%03.0f") | |
gen cofips = statefip + cofip | |
gen weight = cntywt | |
*just for brevity, I just select TX respondents with non missing weights | |
drop if weight == . | |
********************************************* | |
*wrangle 'measurement' variables | |
********************************************* | |
*smoking currently | |
capture drop smoke | |
recode smoker3 (1/2=1) (3/4=0) (nonmissing = .), gen(smoke) | |
*low physical activity | |
capture drop lowact | |
recode pacat (1/2=0) (3/4=1) (nonmissing=.), gen(lowact) | |
*hc | |
capture drop hc | |
gen hc = . | |
replace hc = 1 if toldhi2 == "1" | |
replace hc = 0 if toldhi2 == "2" | |
*marital status | |
capture drop marst | |
label define marst 1 "married" 2 "divorced" 3 "widowed" 4 "separated" 5 "nm" 6 "cohab" | |
gen marst = marital | |
label values marst marst | |
replace marst = . if marst == 9 | |
*income | |
capture drop inc | |
gen inc = incomg | |
replace inc = . if inc == 9 | |
*age z-standardized | |
egen agez = std(age) | |
* parec | |
* what is cryptically named "parec" ? | |
* exercise: look up in codebook | |
summ parec, detail | |
* the 'measurement' variables to use | |
global names_measure_var agez hc marst lowact smoke inc | |
summ $names_measure_var, detail | |
li $names_measure_var in 1/10 | |
********************************************* | |
*wrangle 'design' variables | |
********************************************* | |
*cntywt ok as is | |
desc cntywt | |
capture drop sampleid | |
gen sampleid = string(state) + " " + string(ststr) | |
* within each sampling unit, sum the weights | |
sort sampleid | |
by sampleid: gen sumwt = sum(cntywt) | |
by sampleid: replace sumwt = sumwt[_N] | |
*number of people per sampling unit | |
gen n = 1 | |
by sampleid: gen jn = sum(n) | |
by sampleid: replace jn = jn[_N] | |
*In the new data set, multiply the original weight by the fraction of the | |
* sampling unit total population each person represents | |
gen swts = cntywt*jn/sumwt | |
hist swts | |
*the 'design' varaibles to use | |
global names_design_var ststr swts | |
*save data | |
save brfss_11, replace | |
********************************************* | |
* gather 'measurement' and 'design' variables | |
********************************************* | |
*keep $names_measure_var $names_design_var | |
keep $names_measure_var $names_design_var age | |
*********************************************** | |
* Step 1) Declare the Survey Design | |
*********************************************** | |
**************************************** | |
* set survey design | |
* non-imputation example | |
**************************************** | |
/* In Stata, to analysize survey data, you use the same commands as you | |
would to analyze non-survey data, but specify the prefix "svy: " | |
before the command, which will tell Stata to perform the command | |
while accounting for the survey design characteristic specified | |
in a "svyset" command */ | |
*use brfss_11, clear | |
*Stata help for all survey commands | |
help survey | |
*help for just svyset | |
help svyset | |
*first we need to our survey design variables in svyset | |
*if a psu variable, specify immediately after svyset | |
*[pw=swts] is probability weights | |
*singleunit(centered) is single unit strata centered at grand mean | |
svyset [pw=swts], strata(ststr) singleunit(centered) | |
*example with jackknife replicate weights | |
*svyset [pw=weight], jkrweight(jkw_*) vce(jackknife) | |
********************************************************* | |
* Step 2a) Conduct Survey Analysis (without Missing Data) | |
********************************************************* | |
************************** | |
* descriptives | |
************************** | |
*Now when we precede a command with "svy:", Stata will incorporate | |
* the survey design specified in "svyset" in the estimation | |
*help for survey prefix for estimation commands | |
help svy | |
display _newline "Note: age does not have missing cases" | |
svy: mean age | |
display _newline "Note: smoke has missing cases, mean of smoke" | |
svy: mean smoke | |
*we will compare these results to results after imputation later | |
estimates store mean_smoke | |
display _newline "Note: smoke has missing cases, total of smoke" | |
svy: total smoke | |
estimates store total_smoke | |
*EXERCISE: try computing total counts of smokers, accounting for survey design | |
********************************************************* | |
* Regression | |
********************************************************* | |
* codebook says the cryptically named 'parec' variable is | |
* parec: Aerobic and Strengthening Guideline | |
svy: regress agez i.parec | |
* note: default omit missing from model | |
svy: logit smoke c.agez i.inc c.lowact | |
* store results to compare later | |
estimates store logit_smoke | |
********************************************************* | |
* Subpopulation (Subdomain) Analysis | |
********************************************************* | |
*for more help on subpopulation analysis | |
*go to help page for svy, scroll down to "Options" section, and click on blue link for | |
* [SVY] subpopulation estimation | |
help svy | |
di 'mean of age on whole sample' | |
svy: mean age | |
*subpop is an option of "svy: " | |
di 'mean of age on Age 50 and older sample' | |
svy, subpop(if age >= 50): mean age | |
message('mean of age on Age under 50 sample') | |
svy, subpop(if age < 50): mean age | |
********************************************************* | |
* Step 2b1) Impute Missing Data + Propagate into Design | |
********************************************************* | |
**************************************** | |
* impute via chained equations | |
**************************************** | |
*load data | |
use brfss_11, clear | |
global names_design_var ststr swts | |
global names_measure_var agez hc marst lowact smoke inc | |
di "design variables are $names_design_var" | |
di "measured variables are $names_measure_var" | |
di "dataset for imputation is " $names_design_var $names_measure_var | |
* help for multiple imputation in general | |
help mi | |
* help for multiple imputation with chained equations (mice) | |
help mi impute chained | |
*reduce dataset to just design and measure vars | |
keep $names_design_var $names_measure_var | |
/*steps in Stat multiple imputation | |
1. mi set: declare data to be multiply-imputed, specify if imputations appear | |
as new rows of data or as new columns | |
2. mi register: tell Stata which variables need to be imputed | |
3. mi impute [chained] : impute missing values | |
4. mi estimate : perform estimation with mi data | |
*/ | |
help mi set | |
help mi register | |
help mi impute | |
help mi estimate | |
*first need to mi set data, imputed data will be appended as new datasets | |
*syntax: mi set _style_, where _style_ is wide, mlong, flong, or flongsep | |
*flong means add new datasets, with all rows of data (whether or not row has | |
* any missing) | |
mi set flong | |
*then need to declare which variables need to be imputed | |
*to check which variables have missing, first use misstable summarize | |
misstable summarize | |
*we need to impute for smoke, lowact, hc, marst, inc | |
* so we register these as imputed | |
mi register imputed smoke lowact hc marst inc | |
*impute using chained equations | |
*5 imputations | |
*R mice defaults: | |
* predictive mean matching with 5 donors for continuous (numeric), | |
* multinomial logistic for categorical with > 2 levels (factor) | |
*for imputation with chained equations: | |
* "mi impute chained" | |
* then list of variables to be imputed goes before "=" | |
* specify imputation model (e.g. regress, logit, mlogit, pmm, etc.) | |
* before each set | |
* variables with no missing used in imputation models go to the right of "=" | |
*this won't work because of perfect prediction error | |
mi impute chained (pmm, knn(5)) smoke lowact hc (mlogit) marst inc = ststr swts, add(5) | |
*click on blue link | |
*"The issue of perfect prediction during imputation of categorical data"* | |
*in error returned for more information | |
*augment adds observations with tiny weights to prevent perfect prediction errors | |
mi impute chained (pmm, knn(5)) smoke lowact hc (mlogit) marst inc = ststr swts, add(5) augment | |
save imputed_all.dta, replace | |
**************************************** | |
* use 'full' PredictorMatrix (default) | |
* eg X_j|X_{-j} for all j | |
* if 'design' variables part of j, then they are used to impute | |
**************************************** | |
*impute again, but omitting design variables from imputation model | |
*NOT RECOMMENDED | |
use brfss_11, clear | |
mi set flong | |
mi register imputed smoke lowact hc marst inc | |
mi impute chained (pmm, knn(5)) smoke lowact hc (mlogit) marst inc , add(5) augment | |
save imputed_no_design.dta, replace | |
********************************************************* | |
* Propagate Imputation Uncertainty into Design via 'mitools' | |
********************************************************* | |
********************************************************* | |
* Step 2b2) Conduct Survey Analysis (with Missing Data) | |
********************************************************* | |
*load imputed data | |
use imputed_all.dta, clear | |
* for mi data, we need "mi svyset" instead of just "svyset" | |
help mi svyset | |
mi svyset [pw=swts], strata(ststr) singleunit(centered) | |
*for analysis of imputed, complex survey data, we need to use both | |
* mi estimate: | |
* AND THEN | |
* svy: | |
*now to estimate | |
*notice n=7594, previously n = 7540 | |
di as error _newline "proportion of smoke after imputation" | |
mi estimate: svy: mean smoke | |
di as error _newline "total counts of smoke after imputation" | |
mi estimate: svy: total smoke | |
*compare to results of non-imputed data (total changes a lot more, why?) | |
estimates replay mean_smoke | |
estimates replay total_smoke | |
**LOGISTIC REGRESSION | |
mi estimate: svy: logit smoke c.agez i.inc c.lowact | |
*to see individual regressions, add "noisily" option to mi estimate: | |
mi estimate, noisily: svy: logit smoke c.agez i.inc c.lowact | |
*final coefficient for lowact is mean of the 5 coefficients | |
di (.2858381 + .3188377 +.3226212+.3425455+.3060013)/5 | |
*compare to analysis of non-imputed data | |
estimates replay logit_smoke | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment