Skip to content

Instantly share code, notes, and snippets.

@statsccpr
Last active February 1, 2022 01:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save statsccpr/0febfa55853e895029d898ca67af68b1 to your computer and use it in GitHub Desktop.
Save statsccpr/0febfa55853e895029d898ca67af68b1 to your computer and use it in GitHub Desktop.
exer_survey_analysis.r
# https://www.cdc.gov/brfss/questionnaires/modules/state2017.htm
# https://www.cdc.gov/brfss/annual_data/2017/pdf/Complex-Smple-Weights-Prep-Module-Data-Analysis-2017-508.pdf
# https://www.cdc.gov/brfss/annual_data/2017/pdf/codebook17_llcp-508.pdf
# http://asdfree.com/behavioral-risk-factor-surveillance-system-brfss.html
# lowdown to dl data -----------------------------------------------------
#
#
# # catalog thru ftp
# library(lodown)
# lodown( "brfss" ,
# output_dir = here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS") )
#
#
# # examine all available BRFSS microdata files
# brfss_cat <-
# get_catalog( "brfss" ,
# output_dir = here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS/"))
#
# # get 2017 only
# brfss_cat <- subset( brfss_cat , year == 2017 )
# # download the microdata to your local computer
# brfss_cat <- lodown( "brfss" , brfss_cat )
#
# list.files(here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS/"))
#
# brfss_raw <-
# readRDS(here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS/2017 main.rds"))
#
# # lowdown appends prefix 'x' to all varnames
#
#
# # clean data --------------------------------------------------------------
#
#
# brfss_17 = brfss_raw %>% dplyr::select(.,
# contains("smoke"),
# contains("pacat"),
# contains("toldhi2"),
# contains("marital"),
# contains("incomg"),
# contains("act"),
# contains("age"),
# contains("cnty"),
# contains("state"),
# contains("cofip"),
# contains("cntywt"),
# contains("psu"),
# contains("ststr"),
# contains("pwt")
# ) %>%
# filter(xstate %in% c("6"))
#
#
#
# # wrangle 'measurement' variables
#
#
# # ?car::recode
#
# # smoking currently
# brfss_17$smoke = car::recode(brfss_17$xsmoker3, recodes="1:2=1; 3:4=0; else=NA")
#
# # low physical activity
# brfss_17$lowact = car::recode(brfss_17$xpacat, recodes="1:2=0; 3:4=1; else=NA")
#
# # high cholesterol
# brfss_17$hc = car::recode(brfss_17$toldhi2, recodes="1=1; 2=0; else=NA")
#
# # marital status
# brfss_17$marst = car::recode(brfss_17$marital,
# as.factor=TRUE,
# recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA")
# brfss_17$marst = relevel(brfss_17$marst, ref='married')
#
# # income
# brfss_17$inc = as.factor(ifelse(brfss_17$xincomg==9, NA, brfss_17$xincomg))
#
# # age
# brfss_17$agez = brfss_17$xage80
#
# # the 'measurement' variables to use
# names_measure_var = c("agez",'hc','marst','lowact','smoke','inc')
#
# brfss_17[,names_measure_var] %>% summary
# brfss_17[,names_measure_var] %>% head
#
#
#
# # wrangle 'design' variables
#
# # names(brfss_17)
#
# # note: from brfss doc
# # # # Create survey design
# # # brfssdsgn <- svydesign(
# # # id=~1,
# # # strata = ~ststr,
# # # weights = ~llcpwt,
# # # data = BRFSS)
#
# # rename to use as 'swts' 'ststr'
# brfss_17$swts = brfss_17$xllcpwt
# brfss_17$ststr = brfss_17$xststr
#
# # the 'design' variables to use
# # names_design_var = c("xststr","xllcpwt")
# names_design_var = c("ststr","swts")
#
# brfss_17[,names_design_var] %>% head
# brfss_17[,names_design_var] %>% summary
#
#
# ###
# # gather 'measurement' and 'design' variables
# ###
# brfss_17 = brfss_17[,c(names_design_var,names_measure_var)]
#
#
# # data for analysis going forward
#
# dat_use = brfss_17
#
# summary(dat_use)
# saveRDS(dat_use,file=here::here("/mntcnt/workshop_survey_2018/dat_use_brfss.rds"))
# write_dta(dat_use,path='~/projects/survey_complex/data/proc/dat_use_brfss.DTA')
# # read_dta('~/projects/survey_complex/data/proc/dat_use_brfss.DTA')
## find /home/user{1..40} -maxdepth 0 -exec cp dat_use_brfss.rds {} \;
# Analysis ----------------------------------------------------------------
dat_use = readRDS("/home/user100/dat_use_brfss.rds")
dim(dat_use)
summary(dat_use)
# non-impute example --------------------------------------------------
options(survey.lonely.psu = "adjust")
# ?svydesign
des_non_impute = svydesign(ids=~1,
strata=~ststr,
weights=~swts,
data=dat_use)
attributes(des_non_impute)
# descriptives
# head(des_non_impute)
# str(des_non_impute)
# summary(des_non_impute$variables)
# ?svymean
message('Note: age does not have missing cases')
svymean(~agez, design = des_non_impute)
message('Note: smoke has missing cases, hence NA svymean() result')
svymean(~as.factor(smoke), design = des_non_impute)
message('Note: supply svymean(...,na.rm=TRUE) option to svymean() for case deletion')
svymean(~as.factor(smoke), design = des_non_impute,na.rm=TRUE)
message('Note: supply svytotal(...,na.rm=TRUE) option to svytotal() for case deletion')
# regression
# codebook says the cryptically named 'parec' variable is
# parec: Aerobic and Strengthening Guideline
fit_non_impute_age = svyglm(design=des_non_impute,
formula=as.numeric(inc) ~ agez,
family='gaussian')
summary(fit_non_impute_age)
# note: default omitted NAs
fit_non_impute_smoke = svyglm(design=des_non_impute,
formula=smoke ~ agez + inc + lowact,
family=quasibinomial())
summary(fit_non_impute_smoke)
# subpopulation (subdomain) analysis
# library(survey)
methods(subset)
# ?subset
# ?subset.survey.design
# names(des_non_impute$variables) %>% sort
message('mean of age on whole sample')
svymean(~agez, design = des_non_impute)
message('mean of age on Age 50 and older sample')
des_sub1 = survey:::subset.survey.design(agez>=50,x=des_non_impute)
svymean(~agez,design=des_sub1)
message('mean of age on Age under 50 sample')
des_sub2 = survey:::subset.survey.design(agez<50,x=des_non_impute)
svymean(~agez,design=des_sub2)
# impute ------------------------------------------------------------------
mice::md.pattern(dat_use)
# library(dplyr)
# gather 'survey design variables' and 'measure variables'
dim(dat_use)
summary(dat_use)
# note: later, we will even use vars in 'names_design_var' to impute
message('design variables')
names_design_var
message('measurement variables')
names_measure_var
message('dataset for imputation')
names(dat_use)
# use mice to impute and pool
# library(mice)
?mice
# mice::md.pattern(dat_use)
####
# use 'full' PredictorMatrix (default)
# eg X_j|X_{-j} for all j
# if 'design' variables part of j, then they are used to impute
####
imp_pred_mat_all <- mice(dat_use,m=5,seed=1234)
summary(imp_pred_mat_all)
# 'imp' on lhs will be what's used in design+analysis going forward
imp = imp_pred_mat_all
# note: interpret 1s/0s of 'PredictorMatrix'
####
# exclude variables in column 1 and 2
####
# names(dat_use)
# # columns 1 and 2 were 'design' variables
# predictorMatrix_cust = (1 - diag(1, ncol(dat_use)))
# predictorMatrix_cust[,1] = 0
# predictorMatrix_cust[,2] = 0
#
# predictorMatrix_cust
#
# imp_pred_mat_custom <- mice(dat_use,n.imp=5,seed=1234,
# predictorMatrix=predictorMatrix_cust)
# summary(imp_pred_mat_custom)
# this is just a demo on how to toggle custom predictorMatrix
# should include design variables in imputation
# note: 'imp_pred_mat_custom' is not used going forward
####
# middle man processeing step
# mice::complete + imputationList()
# 'survey' needs list structure
####
# ?mice::complete # to concatenate 'observed' and 'imputed' items
# ?mitools::imputationList # to structure as list, where list item is a 'completed' dataset
library(mitools)
?mice::complete
?mitools::imputationList
data_imputed <- mitools::imputationList(complete(imp,action="all",include=FALSE))
message('the list of imputations')
summary(data_imputed)
####
# use library(survey)
# 1) respect the survey design
# 2) later: conduct survey analysis
####
# library(survey)
# respect survey design
# ouptut of imputationList() 'data_imputed' as data argument
# svydesign(data=data_imputed,.)
# previously 'data_no_imputation' was data argument when no imputation used
# svydesign(data=data_no_imputation,.)
# ?survey::svydesign
options(survey.lonely.psu = "adjust")
# force completed data thru asserted design
des_imputed <- svydesign(data=data_imputed,
ids=~1, strata=~ststr, weights=~swts,
nest=TRUE)
message('the list of imputations propogated into the survey design')
class(des_imputed)
summary(des_imputed)
# descriptives
summary(des_imputed)
message('proportion of smoke after imputation')
MIcombine(with(des_imputed, svymean(~as.factor(smoke))))
message('total counts of smoke after imputation')
MIcombine(with(des_imputed, svytotal(~as.factor(smoke))))
message('compare with non-imputed design ')
message('proportion of smoke on non-imputed')
svymean(~as.factor(smoke),design=des_non_impute)
message('proportion of smoke on case-deleted')
svymean(~as.factor(smoke),design=des_non_impute,na.rm=TRUE)
message('total counts of smoke on case-deleted')
svytotal(~as.factor(smoke),design=des_non_impute,na.rm=TRUE)
####
# smoke as outcome in logistic regression
# run model with svyglm using 5 imputed data sets contained in design
# NOTE: regression estimates / inference are 'design-based'
####
fit_imputed_smoke <- with(des_imputed, svyglm(smoke ~ agez + inc + lowact,
family=quasibinomial))
message('glm results from 1st imputed dataset')
fit_imputed_smoke[[1]]
message('glm results from 5th imputed dataset')
fit_imputed_smoke[[5]]
message('glm results from combining all imputed datasets')
summary(MIcombine(fit_imputed_smoke))
message('compare glm results with non-imputed design')
message('note: default glm() behavior omits rows with NAs (case-deletion)')
fit_non_impute_smoke = svyglm(design=des_non_impute,
formula=smoke ~ agez + inc + lowact,
family=quasibinomial())
summary(fit_non_impute_smoke)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment