-
-
Save statsccpr/0febfa55853e895029d898ca67af68b1 to your computer and use it in GitHub Desktop.
exer_survey_analysis.r
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://www.cdc.gov/brfss/questionnaires/modules/state2017.htm | |
# https://www.cdc.gov/brfss/annual_data/2017/pdf/Complex-Smple-Weights-Prep-Module-Data-Analysis-2017-508.pdf | |
# https://www.cdc.gov/brfss/annual_data/2017/pdf/codebook17_llcp-508.pdf | |
# http://asdfree.com/behavioral-risk-factor-surveillance-system-brfss.html | |
# lowdown to dl data ----------------------------------------------------- | |
# | |
# | |
# # catalog thru ftp | |
# library(lodown) | |
# lodown( "brfss" , | |
# output_dir = here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS") ) | |
# | |
# | |
# # examine all available BRFSS microdata files | |
# brfss_cat <- | |
# get_catalog( "brfss" , | |
# output_dir = here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS/")) | |
# | |
# # get 2017 only | |
# brfss_cat <- subset( brfss_cat , year == 2017 ) | |
# # download the microdata to your local computer | |
# brfss_cat <- lodown( "brfss" , brfss_cat ) | |
# | |
# list.files(here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS/")) | |
# | |
# brfss_raw <- | |
# readRDS(here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS/2017 main.rds")) | |
# | |
# # lowdown appends prefix 'x' to all varnames | |
# | |
# | |
# # clean data -------------------------------------------------------------- | |
# | |
# | |
# brfss_17 = brfss_raw %>% dplyr::select(., | |
# contains("smoke"), | |
# contains("pacat"), | |
# contains("toldhi2"), | |
# contains("marital"), | |
# contains("incomg"), | |
# contains("act"), | |
# contains("age"), | |
# contains("cnty"), | |
# contains("state"), | |
# contains("cofip"), | |
# contains("cntywt"), | |
# contains("psu"), | |
# contains("ststr"), | |
# contains("pwt") | |
# ) %>% | |
# filter(xstate %in% c("6")) | |
# | |
# | |
# | |
# # wrangle 'measurement' variables | |
# | |
# | |
# # ?car::recode | |
# | |
# # smoking currently | |
# brfss_17$smoke = car::recode(brfss_17$xsmoker3, recodes="1:2=1; 3:4=0; else=NA") | |
# | |
# # low physical activity | |
# brfss_17$lowact = car::recode(brfss_17$xpacat, recodes="1:2=0; 3:4=1; else=NA") | |
# | |
# # high cholesterol | |
# brfss_17$hc = car::recode(brfss_17$toldhi2, recodes="1=1; 2=0; else=NA") | |
# | |
# # marital status | |
# brfss_17$marst = car::recode(brfss_17$marital, | |
# as.factor=TRUE, | |
# recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA") | |
# brfss_17$marst = relevel(brfss_17$marst, ref='married') | |
# | |
# # income | |
# brfss_17$inc = as.factor(ifelse(brfss_17$xincomg==9, NA, brfss_17$xincomg)) | |
# | |
# # age | |
# brfss_17$agez = brfss_17$xage80 | |
# | |
# # the 'measurement' variables to use | |
# names_measure_var = c("agez",'hc','marst','lowact','smoke','inc') | |
# | |
# brfss_17[,names_measure_var] %>% summary | |
# brfss_17[,names_measure_var] %>% head | |
# | |
# | |
# | |
# # wrangle 'design' variables | |
# | |
# # names(brfss_17) | |
# | |
# # note: from brfss doc | |
# # # # Create survey design | |
# # # brfssdsgn <- svydesign( | |
# # # id=~1, | |
# # # strata = ~ststr, | |
# # # weights = ~llcpwt, | |
# # # data = BRFSS) | |
# | |
# # rename to use as 'swts' 'ststr' | |
# brfss_17$swts = brfss_17$xllcpwt | |
# brfss_17$ststr = brfss_17$xststr | |
# | |
# # the 'design' variables to use | |
# # names_design_var = c("xststr","xllcpwt") | |
# names_design_var = c("ststr","swts") | |
# | |
# brfss_17[,names_design_var] %>% head | |
# brfss_17[,names_design_var] %>% summary | |
# | |
# | |
# ### | |
# # gather 'measurement' and 'design' variables | |
# ### | |
# brfss_17 = brfss_17[,c(names_design_var,names_measure_var)] | |
# | |
# | |
# # data for analysis going forward | |
# | |
# dat_use = brfss_17 | |
# | |
# summary(dat_use) | |
# saveRDS(dat_use,file=here::here("/mntcnt/workshop_survey_2018/dat_use_brfss.rds")) | |
# write_dta(dat_use,path='~/projects/survey_complex/data/proc/dat_use_brfss.DTA') | |
# # read_dta('~/projects/survey_complex/data/proc/dat_use_brfss.DTA') | |
## find /home/user{1..40} -maxdepth 0 -exec cp dat_use_brfss.rds {} \; | |
# Analysis ---------------------------------------------------------------- | |
dat_use = readRDS("/home/user100/dat_use_brfss.rds") | |
dim(dat_use) | |
summary(dat_use) | |
# non-impute example -------------------------------------------------- | |
options(survey.lonely.psu = "adjust") | |
# ?svydesign | |
des_non_impute = svydesign(ids=~1, | |
strata=~ststr, | |
weights=~swts, | |
data=dat_use) | |
attributes(des_non_impute) | |
# descriptives | |
# head(des_non_impute) | |
# str(des_non_impute) | |
# summary(des_non_impute$variables) | |
# ?svymean | |
message('Note: age does not have missing cases') | |
svymean(~agez, design = des_non_impute) | |
message('Note: smoke has missing cases, hence NA svymean() result') | |
svymean(~as.factor(smoke), design = des_non_impute) | |
message('Note: supply svymean(...,na.rm=TRUE) option to svymean() for case deletion') | |
svymean(~as.factor(smoke), design = des_non_impute,na.rm=TRUE) | |
message('Note: supply svytotal(...,na.rm=TRUE) option to svytotal() for case deletion') | |
# regression | |
# codebook says the cryptically named 'parec' variable is | |
# parec: Aerobic and Strengthening Guideline | |
fit_non_impute_age = svyglm(design=des_non_impute, | |
formula=as.numeric(inc) ~ agez, | |
family='gaussian') | |
summary(fit_non_impute_age) | |
# note: default omitted NAs | |
fit_non_impute_smoke = svyglm(design=des_non_impute, | |
formula=smoke ~ agez + inc + lowact, | |
family=quasibinomial()) | |
summary(fit_non_impute_smoke) | |
# subpopulation (subdomain) analysis | |
# library(survey) | |
methods(subset) | |
# ?subset | |
# ?subset.survey.design | |
# names(des_non_impute$variables) %>% sort | |
message('mean of age on whole sample') | |
svymean(~agez, design = des_non_impute) | |
message('mean of age on Age 50 and older sample') | |
des_sub1 = survey:::subset.survey.design(agez>=50,x=des_non_impute) | |
svymean(~agez,design=des_sub1) | |
message('mean of age on Age under 50 sample') | |
des_sub2 = survey:::subset.survey.design(agez<50,x=des_non_impute) | |
svymean(~agez,design=des_sub2) | |
# impute ------------------------------------------------------------------ | |
mice::md.pattern(dat_use) | |
# library(dplyr) | |
# gather 'survey design variables' and 'measure variables' | |
dim(dat_use) | |
summary(dat_use) | |
# note: later, we will even use vars in 'names_design_var' to impute | |
message('design variables') | |
names_design_var | |
message('measurement variables') | |
names_measure_var | |
message('dataset for imputation') | |
names(dat_use) | |
# use mice to impute and pool | |
# library(mice) | |
?mice | |
# mice::md.pattern(dat_use) | |
#### | |
# use 'full' PredictorMatrix (default) | |
# eg X_j|X_{-j} for all j | |
# if 'design' variables part of j, then they are used to impute | |
#### | |
imp_pred_mat_all <- mice(dat_use,m=5,seed=1234) | |
summary(imp_pred_mat_all) | |
# 'imp' on lhs will be what's used in design+analysis going forward | |
imp = imp_pred_mat_all | |
# note: interpret 1s/0s of 'PredictorMatrix' | |
#### | |
# exclude variables in column 1 and 2 | |
#### | |
# names(dat_use) | |
# # columns 1 and 2 were 'design' variables | |
# predictorMatrix_cust = (1 - diag(1, ncol(dat_use))) | |
# predictorMatrix_cust[,1] = 0 | |
# predictorMatrix_cust[,2] = 0 | |
# | |
# predictorMatrix_cust | |
# | |
# imp_pred_mat_custom <- mice(dat_use,n.imp=5,seed=1234, | |
# predictorMatrix=predictorMatrix_cust) | |
# summary(imp_pred_mat_custom) | |
# this is just a demo on how to toggle custom predictorMatrix | |
# should include design variables in imputation | |
# note: 'imp_pred_mat_custom' is not used going forward | |
#### | |
# middle man processeing step | |
# mice::complete + imputationList() | |
# 'survey' needs list structure | |
#### | |
# ?mice::complete # to concatenate 'observed' and 'imputed' items | |
# ?mitools::imputationList # to structure as list, where list item is a 'completed' dataset | |
library(mitools) | |
?mice::complete | |
?mitools::imputationList | |
data_imputed <- mitools::imputationList(complete(imp,action="all",include=FALSE)) | |
message('the list of imputations') | |
summary(data_imputed) | |
#### | |
# use library(survey) | |
# 1) respect the survey design | |
# 2) later: conduct survey analysis | |
#### | |
# library(survey) | |
# respect survey design | |
# ouptut of imputationList() 'data_imputed' as data argument | |
# svydesign(data=data_imputed,.) | |
# previously 'data_no_imputation' was data argument when no imputation used | |
# svydesign(data=data_no_imputation,.) | |
# ?survey::svydesign | |
options(survey.lonely.psu = "adjust") | |
# force completed data thru asserted design | |
des_imputed <- svydesign(data=data_imputed, | |
ids=~1, strata=~ststr, weights=~swts, | |
nest=TRUE) | |
message('the list of imputations propogated into the survey design') | |
class(des_imputed) | |
summary(des_imputed) | |
# descriptives | |
summary(des_imputed) | |
message('proportion of smoke after imputation') | |
MIcombine(with(des_imputed, svymean(~as.factor(smoke)))) | |
message('total counts of smoke after imputation') | |
MIcombine(with(des_imputed, svytotal(~as.factor(smoke)))) | |
message('compare with non-imputed design ') | |
message('proportion of smoke on non-imputed') | |
svymean(~as.factor(smoke),design=des_non_impute) | |
message('proportion of smoke on case-deleted') | |
svymean(~as.factor(smoke),design=des_non_impute,na.rm=TRUE) | |
message('total counts of smoke on case-deleted') | |
svytotal(~as.factor(smoke),design=des_non_impute,na.rm=TRUE) | |
#### | |
# smoke as outcome in logistic regression | |
# run model with svyglm using 5 imputed data sets contained in design | |
# NOTE: regression estimates / inference are 'design-based' | |
#### | |
fit_imputed_smoke <- with(des_imputed, svyglm(smoke ~ agez + inc + lowact, | |
family=quasibinomial)) | |
message('glm results from 1st imputed dataset') | |
fit_imputed_smoke[[1]] | |
message('glm results from 5th imputed dataset') | |
fit_imputed_smoke[[5]] | |
message('glm results from combining all imputed datasets') | |
summary(MIcombine(fit_imputed_smoke)) | |
message('compare glm results with non-imputed design') | |
message('note: default glm() behavior omits rows with NAs (case-deletion)') | |
fit_non_impute_smoke = svyglm(design=des_non_impute, | |
formula=smoke ~ agez + inc + lowact, | |
family=quasibinomial()) | |
summary(fit_non_impute_smoke) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment