statsccpr/exer_survey_analysis.r Secret

## exer_survey_analysis.r
# https://www.cdc.gov/brfss/questionnaires/modules/state2017.htm
# https://www.cdc.gov/brfss/annual_data/2017/pdf/Complex-Smple-Weights-Prep-Module-Data-Analysis-2017-508.pdf
# https://www.cdc.gov/brfss/annual_data/2017/pdf/codebook17_llcp-508.pdf

# http://asdfree.com/behavioral-risk-factor-surveillance-system-brfss.html


# lowdown to dl data -----------------------------------------------------
#
#
# # catalog thru ftp
# library(lodown)
# lodown( "brfss" ,
#         output_dir = here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS") )
#
#
# # examine all available BRFSS microdata files
# brfss_cat <-
#   get_catalog( "brfss" ,
#                output_dir = here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS/"))
#
# # get 2017 only
# brfss_cat <- subset( brfss_cat , year == 2017 )
# # download the microdata to your local computer
# brfss_cat <- lodown( "brfss" , brfss_cat )
#
# list.files(here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS/"))
#
# brfss_raw <-
#   readRDS(here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS/2017 main.rds"))
#
# # lowdown appends prefix 'x' to all varnames
#
#
# # clean data --------------------------------------------------------------
#
#
# brfss_17 = brfss_raw %>% dplyr::select(.,
#                                        contains("smoke"),
#                                        contains("pacat"),
#                                        contains("toldhi2"),
#                                        contains("marital"),
#                                        contains("incomg"),
#                                        contains("act"),
#                                        contains("age"),
#                                        contains("cnty"),
#                                        contains("state"),
#                                        contains("cofip"),
#                                        contains("cntywt"),
#                                        contains("psu"),
#                                        contains("ststr"),
#                                        contains("pwt")
#                                        ) %>%
#   filter(xstate %in% c("6"))
#
#
#
# # wrangle 'measurement' variables
#
#
# # ?car::recode
#
# # smoking currently
# brfss_17$smoke = car::recode(brfss_17$xsmoker3, recodes="1:2=1; 3:4=0; else=NA")
#
# # low physical activity
# brfss_17$lowact = car::recode(brfss_17$xpacat, recodes="1:2=0; 3:4=1; else=NA")
#
# # high cholesterol
# brfss_17$hc = car::recode(brfss_17$toldhi2, recodes="1=1; 2=0; else=NA")
#
# # marital status
# brfss_17$marst = car::recode(brfss_17$marital,
#                              as.factor=TRUE,
#                              recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA")
# brfss_17$marst = relevel(brfss_17$marst, ref='married')
#
# # income
# brfss_17$inc = as.factor(ifelse(brfss_17$xincomg==9, NA, brfss_17$xincomg))
#
# # age
# brfss_17$agez = brfss_17$xage80
#
# # the 'measurement' variables to use
# names_measure_var = c("agez",'hc','marst','lowact','smoke','inc')
#
# brfss_17[,names_measure_var] %>% summary
# brfss_17[,names_measure_var] %>% head
#
#
#
# # wrangle 'design' variables
#
# # names(brfss_17)
#
# # note: from brfss doc
# # # # Create survey design
# # # brfssdsgn <- svydesign(
# # #   id=~1,
# # #   strata = ~ststr,
# # #   weights = ~llcpwt,
# # #   data = BRFSS)
#
# # rename to use as 'swts' 'ststr'
# brfss_17$swts = brfss_17$xllcpwt
# brfss_17$ststr = brfss_17$xststr
#
# # the 'design' variables to use
# # names_design_var = c("xststr","xllcpwt")
# names_design_var = c("ststr","swts")
#
# brfss_17[,names_design_var] %>% head
# brfss_17[,names_design_var] %>% summary
#
#
# ###
# # gather 'measurement' and 'design' variables
# ###
# brfss_17 = brfss_17[,c(names_design_var,names_measure_var)]
#
#
# # data for analysis going forward
#
# dat_use = brfss_17
#
# summary(dat_use)

# saveRDS(dat_use,file=here::here("/mntcnt/workshop_survey_2018/dat_use_brfss.rds"))

# write_dta(dat_use,path='~/projects/survey_complex/data/proc/dat_use_brfss.DTA')
# # read_dta('~/projects/survey_complex/data/proc/dat_use_brfss.DTA')


## find /home/user{1..40} -maxdepth 0 -exec cp dat_use_brfss.rds {} \;

# Analysis ----------------------------------------------------------------
dat_use = readRDS("/home/user100/dat_use_brfss.rds")
dim(dat_use)
summary(dat_use)


# non-impute example --------------------------------------------------


options(survey.lonely.psu = "adjust")

# ?svydesign

des_non_impute = svydesign(ids=~1,
                           strata=~ststr,
                           weights=~swts,
                           data=dat_use)

attributes(des_non_impute)


# descriptives


# head(des_non_impute)
# str(des_non_impute)

# summary(des_non_impute$variables)

# ?svymean
message('Note: age does not have missing cases')
svymean(~agez, design = des_non_impute)

message('Note: smoke has missing cases, hence NA svymean() result')
svymean(~as.factor(smoke), design = des_non_impute)

message('Note: supply svymean(...,na.rm=TRUE) option to svymean() for case deletion')
svymean(~as.factor(smoke), design = des_non_impute,na.rm=TRUE)


message('Note: supply svytotal(...,na.rm=TRUE) option to svytotal() for case deletion')


# regression


# codebook says the cryptically named 'parec' variable is
# parec: Aerobic and Strengthening Guideline

fit_non_impute_age = svyglm(design=des_non_impute,
                            formula=as.numeric(inc) ~ agez,
                            family='gaussian')
summary(fit_non_impute_age)

# note: default omitted NAs
fit_non_impute_smoke = svyglm(design=des_non_impute,
                              formula=smoke ~ agez + inc + lowact,
                              family=quasibinomial())

summary(fit_non_impute_smoke)


# subpopulation (subdomain) analysis


# library(survey)
methods(subset)
# ?subset

# ?subset.survey.design
# names(des_non_impute$variables) %>% sort

message('mean of age on whole sample')
svymean(~agez, design = des_non_impute)

message('mean of age on Age 50 and older sample')
des_sub1 = survey:::subset.survey.design(agez>=50,x=des_non_impute)
svymean(~agez,design=des_sub1)

message('mean of age on Age under 50 sample')
des_sub2 = survey:::subset.survey.design(agez<50,x=des_non_impute)
svymean(~agez,design=des_sub2)


# impute ------------------------------------------------------------------

mice::md.pattern(dat_use)

# library(dplyr)

# gather 'survey design variables' and 'measure variables'


dim(dat_use)
summary(dat_use)


# note: later, we will even use vars in 'names_design_var' to impute
message('design variables')
names_design_var

message('measurement variables')
names_measure_var

message('dataset for imputation')
names(dat_use)


# use mice to impute and pool
# library(mice)
?mice

# mice::md.pattern(dat_use)


####
# use 'full' PredictorMatrix (default)
# eg X_j|X_{-j} for all j
# if 'design' variables part of j, then they are used to impute
####

imp_pred_mat_all <- mice(dat_use,m=5,seed=1234)
summary(imp_pred_mat_all)

# 'imp' on lhs will be what's used in design+analysis going forward
imp = imp_pred_mat_all

# note: interpret 1s/0s of 'PredictorMatrix'

####
# exclude variables in column 1 and 2
####

# names(dat_use)
# # columns 1 and 2 were 'design' variables
# predictorMatrix_cust = (1 - diag(1, ncol(dat_use)))
# predictorMatrix_cust[,1] = 0
# predictorMatrix_cust[,2] = 0
#
# predictorMatrix_cust
#
# imp_pred_mat_custom <- mice(dat_use,n.imp=5,seed=1234,
#                             predictorMatrix=predictorMatrix_cust)
# summary(imp_pred_mat_custom)

# this is just a demo on how to toggle custom predictorMatrix
# should include design variables in imputation
# note: 'imp_pred_mat_custom' is not used going forward

####
# middle man processeing step
# mice::complete + imputationList()
# 'survey' needs list structure
####
# ?mice::complete  # to concatenate 'observed' and 'imputed' items
# ?mitools::imputationList  # to structure as list, where list item is a 'completed' dataset

library(mitools)

?mice::complete
?mitools::imputationList

data_imputed <- mitools::imputationList(complete(imp,action="all",include=FALSE))

message('the list of imputations')
summary(data_imputed)

####
# use library(survey)
# 1) respect the survey design
# 2) later: conduct survey analysis
####
# library(survey)

# respect survey design
# ouptut of imputationList() 'data_imputed' as data argument
# svydesign(data=data_imputed,.)
# previously 'data_no_imputation' was data argument when no imputation used
# svydesign(data=data_no_imputation,.)

# ?survey::svydesign

options(survey.lonely.psu = "adjust")

# force completed data thru asserted design
des_imputed <- svydesign(data=data_imputed,
                         ids=~1, strata=~ststr, weights=~swts,
                         nest=TRUE)

message('the list of imputations propogated into the survey design')

class(des_imputed)
summary(des_imputed)


# descriptives

summary(des_imputed)

message('proportion of smoke after imputation')
MIcombine(with(des_imputed, svymean(~as.factor(smoke))))

message('total counts of smoke after imputation')
MIcombine(with(des_imputed, svytotal(~as.factor(smoke))))

message('compare with non-imputed design ')

message('proportion of smoke on non-imputed')
svymean(~as.factor(smoke),design=des_non_impute)

message('proportion of smoke on case-deleted')
svymean(~as.factor(smoke),design=des_non_impute,na.rm=TRUE)

message('total counts of smoke on case-deleted')
svytotal(~as.factor(smoke),design=des_non_impute,na.rm=TRUE)


####
# smoke as outcome in logistic regression
# run model with svyglm using 5 imputed data sets contained in design
# NOTE: regression estimates / inference are 'design-based'
####

fit_imputed_smoke <- with(des_imputed, svyglm(smoke ~ agez + inc + lowact,
                                              family=quasibinomial))

message('glm results from 1st imputed dataset')
fit_imputed_smoke[[1]]

message('glm results from 5th imputed dataset')
fit_imputed_smoke[[5]]

message('glm results from combining all imputed datasets')
summary(MIcombine(fit_imputed_smoke))


message('compare glm results with non-imputed design')
message('note: default glm() behavior omits rows with NAs (case-deletion)')

fit_non_impute_smoke = svyglm(design=des_non_impute,
                              formula=smoke ~ agez + inc + lowact,
                              family=quasibinomial())

summary(fit_non_impute_smoke)
	# https://www.cdc.gov/brfss/questionnaires/modules/state2017.htm
	# https://www.cdc.gov/brfss/annual_data/2017/pdf/Complex-Smple-Weights-Prep-Module-Data-Analysis-2017-508.pdf
	# https://www.cdc.gov/brfss/annual_data/2017/pdf/codebook17_llcp-508.pdf

	# http://asdfree.com/behavioral-risk-factor-surveillance-system-brfss.html


	# lowdown to dl data -----------------------------------------------------
	#
	#
	# # catalog thru ftp
	# library(lodown)
	# lodown( "brfss" ,
	# output_dir = here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS") )
	#
	#
	# # examine all available BRFSS microdata files
	# brfss_cat <-
	# get_catalog( "brfss" ,
	# output_dir = here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS/"))
	#
	# # get 2017 only
	# brfss_cat <- subset( brfss_cat , year == 2017 )
	# # download the microdata to your local computer
	# brfss_cat <- lodown( "brfss" , brfss_cat )
	#
	# list.files(here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS/"))
	#
	# brfss_raw <-
	# readRDS(here::here("/home/rstudio/mntcnt/workshop_survey_2018/BRFSS/2017 main.rds"))
	#
	# # lowdown appends prefix 'x' to all varnames
	#
	#
	# # clean data --------------------------------------------------------------
	#
	#
	# brfss_17 = brfss_raw %>% dplyr::select(.,
	# contains("smoke"),
	# contains("pacat"),
	# contains("toldhi2"),
	# contains("marital"),
	# contains("incomg"),
	# contains("act"),
	# contains("age"),
	# contains("cnty"),
	# contains("state"),
	# contains("cofip"),
	# contains("cntywt"),
	# contains("psu"),
	# contains("ststr"),
	# contains("pwt")
	# ) %>%
	# filter(xstate %in% c("6"))
	#
	#
	#
	# # wrangle 'measurement' variables
	#
	#
	# # ?car::recode
	#
	# # smoking currently
	# brfss_17$smoke = car::recode(brfss_17$xsmoker3, recodes="1:2=1; 3:4=0; else=NA")
	#
	# # low physical activity
	# brfss_17$lowact = car::recode(brfss_17$xpacat, recodes="1:2=0; 3:4=1; else=NA")
	#
	# # high cholesterol
	# brfss_17$hc = car::recode(brfss_17$toldhi2, recodes="1=1; 2=0; else=NA")
	#
	# # marital status
	# brfss_17$marst = car::recode(brfss_17$marital,
	# as.factor=TRUE,
	# recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA")
	# brfss_17$marst = relevel(brfss_17$marst, ref='married')
	#
	# # income
	# brfss_17$inc = as.factor(ifelse(brfss_17$xincomg==9, NA, brfss_17$xincomg))
	#
	# # age
	# brfss_17$agez = brfss_17$xage80
	#
	# # the 'measurement' variables to use
	# names_measure_var = c("agez",'hc','marst','lowact','smoke','inc')
	#
	# brfss_17[,names_measure_var] %>% summary
	# brfss_17[,names_measure_var] %>% head
	#
	#
	#
	# # wrangle 'design' variables
	#
	# # names(brfss_17)
	#
	# # note: from brfss doc
	# # # # Create survey design
	# # # brfssdsgn <- svydesign(
	# # # id=~1,
	# # # strata = ~ststr,
	# # # weights = ~llcpwt,
	# # # data = BRFSS)
	#
	# # rename to use as 'swts' 'ststr'
	# brfss_17$swts = brfss_17$xllcpwt
	# brfss_17$ststr = brfss_17$xststr
	#
	# # the 'design' variables to use
	# # names_design_var = c("xststr","xllcpwt")
	# names_design_var = c("ststr","swts")
	#
	# brfss_17[,names_design_var] %>% head
	# brfss_17[,names_design_var] %>% summary
	#
	#
	# ###
	# # gather 'measurement' and 'design' variables
	# ###
	# brfss_17 = brfss_17[,c(names_design_var,names_measure_var)]
	#
	#
	# # data for analysis going forward
	#
	# dat_use = brfss_17
	#
	# summary(dat_use)

	# saveRDS(dat_use,file=here::here("/mntcnt/workshop_survey_2018/dat_use_brfss.rds"))

	# write_dta(dat_use,path='~/projects/survey_complex/data/proc/dat_use_brfss.DTA')
	# # read_dta('~/projects/survey_complex/data/proc/dat_use_brfss.DTA')


	## find /home/user{1..40} -maxdepth 0 -exec cp dat_use_brfss.rds {} \;

	# Analysis ----------------------------------------------------------------
	dat_use = readRDS("/home/user100/dat_use_brfss.rds")
	dim(dat_use)
	summary(dat_use)


	# non-impute example --------------------------------------------------


	options(survey.lonely.psu = "adjust")

	# ?svydesign

	des_non_impute = svydesign(ids=~1,
	strata=~ststr,
	weights=~swts,
	data=dat_use)

	attributes(des_non_impute)


	# descriptives


	# head(des_non_impute)
	# str(des_non_impute)

	# summary(des_non_impute$variables)

	# ?svymean
	message('Note: age does not have missing cases')
	svymean(~agez, design = des_non_impute)

	message('Note: smoke has missing cases, hence NA svymean() result')
	svymean(~as.factor(smoke), design = des_non_impute)

	message('Note: supply svymean(...,na.rm=TRUE) option to svymean() for case deletion')
	svymean(~as.factor(smoke), design = des_non_impute,na.rm=TRUE)


	message('Note: supply svytotal(...,na.rm=TRUE) option to svytotal() for case deletion')



	# regression


	# codebook says the cryptically named 'parec' variable is
	# parec: Aerobic and Strengthening Guideline

	fit_non_impute_age = svyglm(design=des_non_impute,
	formula=as.numeric(inc) ~ agez,
	family='gaussian')
	summary(fit_non_impute_age)

	# note: default omitted NAs
	fit_non_impute_smoke = svyglm(design=des_non_impute,
	formula=smoke ~ agez + inc + lowact,
	family=quasibinomial())

	summary(fit_non_impute_smoke)



	# subpopulation (subdomain) analysis


	# library(survey)
	methods(subset)
	# ?subset

	# ?subset.survey.design
	# names(des_non_impute$variables) %>% sort

	message('mean of age on whole sample')
	svymean(~agez, design = des_non_impute)

	message('mean of age on Age 50 and older sample')
	des_sub1 = survey:::subset.survey.design(agez>=50,x=des_non_impute)
	svymean(~agez,design=des_sub1)

	message('mean of age on Age under 50 sample')
	des_sub2 = survey:::subset.survey.design(agez<50,x=des_non_impute)
	svymean(~agez,design=des_sub2)



	# impute ------------------------------------------------------------------

	mice::md.pattern(dat_use)

	# library(dplyr)

	# gather 'survey design variables' and 'measure variables'


	dim(dat_use)
	summary(dat_use)


	# note: later, we will even use vars in 'names_design_var' to impute
	message('design variables')
	names_design_var

	message('measurement variables')
	names_measure_var

	message('dataset for imputation')
	names(dat_use)


	# use mice to impute and pool
	# library(mice)
	?mice

	# mice::md.pattern(dat_use)


	####
	# use 'full' PredictorMatrix (default)
	# eg X_j\|X_{-j} for all j
	# if 'design' variables part of j, then they are used to impute
	####

	imp_pred_mat_all <- mice(dat_use,m=5,seed=1234)
	summary(imp_pred_mat_all)

	# 'imp' on lhs will be what's used in design+analysis going forward
	imp = imp_pred_mat_all

	# note: interpret 1s/0s of 'PredictorMatrix'

	####
	# exclude variables in column 1 and 2
	####

	# names(dat_use)
	# # columns 1 and 2 were 'design' variables
	# predictorMatrix_cust = (1 - diag(1, ncol(dat_use)))
	# predictorMatrix_cust[,1] = 0
	# predictorMatrix_cust[,2] = 0
	#
	# predictorMatrix_cust
	#
	# imp_pred_mat_custom <- mice(dat_use,n.imp=5,seed=1234,
	# predictorMatrix=predictorMatrix_cust)
	# summary(imp_pred_mat_custom)

	# this is just a demo on how to toggle custom predictorMatrix
	# should include design variables in imputation
	# note: 'imp_pred_mat_custom' is not used going forward

	####
	# middle man processeing step
	# mice::complete + imputationList()
	# 'survey' needs list structure
	####
	# ?mice::complete # to concatenate 'observed' and 'imputed' items
	# ?mitools::imputationList # to structure as list, where list item is a 'completed' dataset

	library(mitools)

	?mice::complete
	?mitools::imputationList

	data_imputed <- mitools::imputationList(complete(imp,action="all",include=FALSE))

	message('the list of imputations')
	summary(data_imputed)

	####
	# use library(survey)
	# 1) respect the survey design
	# 2) later: conduct survey analysis
	####
	# library(survey)

	# respect survey design
	# ouptut of imputationList() 'data_imputed' as data argument
	# svydesign(data=data_imputed,.)
	# previously 'data_no_imputation' was data argument when no imputation used
	# svydesign(data=data_no_imputation,.)

	# ?survey::svydesign

	options(survey.lonely.psu = "adjust")

	# force completed data thru asserted design
	des_imputed <- svydesign(data=data_imputed,
	ids=~1, strata=~ststr, weights=~swts,
	nest=TRUE)

	message('the list of imputations propogated into the survey design')

	class(des_imputed)
	summary(des_imputed)



	# descriptives

	summary(des_imputed)

	message('proportion of smoke after imputation')
	MIcombine(with(des_imputed, svymean(~as.factor(smoke))))

	message('total counts of smoke after imputation')
	MIcombine(with(des_imputed, svytotal(~as.factor(smoke))))

	message('compare with non-imputed design ')

	message('proportion of smoke on non-imputed')
	svymean(~as.factor(smoke),design=des_non_impute)

	message('proportion of smoke on case-deleted')
	svymean(~as.factor(smoke),design=des_non_impute,na.rm=TRUE)

	message('total counts of smoke on case-deleted')
	svytotal(~as.factor(smoke),design=des_non_impute,na.rm=TRUE)


	####
	# smoke as outcome in logistic regression
	# run model with svyglm using 5 imputed data sets contained in design
	# NOTE: regression estimates / inference are 'design-based'
	####

	fit_imputed_smoke <- with(des_imputed, svyglm(smoke ~ agez + inc + lowact,
	family=quasibinomial))

	message('glm results from 1st imputed dataset')
	fit_imputed_smoke[[1]]

	message('glm results from 5th imputed dataset')
	fit_imputed_smoke[[5]]

	message('glm results from combining all imputed datasets')
	summary(MIcombine(fit_imputed_smoke))


	message('compare glm results with non-imputed design')
	message('note: default glm() behavior omits rows with NAs (case-deletion)')

	fit_non_impute_smoke = svyglm(design=des_non_impute,
	formula=smoke ~ agez + inc + lowact,
	family=quasibinomial())

	summary(fit_non_impute_smoke)