# ajdamico/using R to replicate the NHIS Multiple Imputation technique.R

## using R to replicate the NHIS Multiple Imputation technique.R
#page 118 of the NHIS document
#ftp://ftp.cdc.gov/pub/health_statistics/nchs/dataset_documentation/nhis/2010/srvydesc.pdf
#displays the R code to load the persons file into R as a survey object
#the code below creates a slightly different survey object, one that includes appropriately-imputed income.


#this R code:
#	reads the year 2000 personsx file into R
#	reads in all five imputed income files
#	merges the 2000 personsx file with the five imputed income files
#	creates a MI survey object
#	runs six example analyses with the multiply-imputed R survey object

#NOTE: mitools, survey, and sas7bdat each need to be installed once.
#on a machine that has never used them, run this line first:
#install.packages(c("mitools","survey","sas7bdat"))

#attach the three packages this script relies on:
#the multiple imputation package
library( mitools )
#the complex survey analysis package
library( survey )
#the .sas7bdat importation package
library( sas7bdat )

#print numbers with up to sixteen significant digits
options( digits = 16 )

#point the working directory at the folder where the 2000 personsx SAS data set is stored.
setwd( "//dcdata/v/National Health Interview Survey/Income Imputation/2000" )

###############################
#import NHIS 2000 PERSONSX file

#this can be done a number of different ways

#import the NHIS 2000 PERSONSX SAS file as a data frame named x  (this is a slow step)
x <- read.sas7bdat( "personsx.sas7bdat" )

#when read.sas7bdat is used, the four merge keys below arrive as factor variables.
#convert each one to numeric (going through its character label)
#so that the merge with the imputed income files can match on them
merge_keys <- c( "SRVY_YR" , "HHX" , "FMX" , "PX" )
x[ merge_keys ] <- lapply( x[ merge_keys ] , function( key ) as.numeric( as.character( key ) ) )

#end import of PERSONSX file
###############################


#loop through j = 1 through 5 and, for each imputed income file:
#	read it in, merge it onto the personsx data frame (x),
#	recode NOTCOV and POVERTYI, keep only the columns needed for analysis,
#	and store the result as z1, z2, z3, z4, or z5.  (this also takes a while)
for ( j in 1:5 ){
	#print the current iteration of the loop
	print( j )

	#set the current working directory to the location where the five 2000 imputed income SAS data sets are stored.
	setwd("//dcdata/v/National Health Interview Survey/Income Imputation/2000")

	#read in the SAS imputed income file number "j"
	y <- read.sas7bdat( paste0( "incmimp" , j , ".sas7bdat" ) )

	#dump RECTYPE variable, which is already on the PERSONSX file
	y$RECTYPE <- NULL

	#merge the personsx file with the imputed income file
	#(the person identifier is PX on personsx but FPX on the imputed income file)
	z <- merge( x , y , by.x=c("SRVY_YR","HHX","FMX","PX") , by.y=c("SRVY_YR","HHX","FMX","FPX") )

	#print the number of rows in the personsx data frame,
	#the imputed income data frame, and the merged data frame
	print( nrow( x ) )
	print( nrow( y ) )
	print( nrow( z ) )

	#the merged file should have exactly one row per personsx row.
	#rather than relying on someone reading the three printed numbers,
	#raise a warning whenever the merge dropped or duplicated records
	if ( nrow( z ) != nrow( x ) ){
		warning(
			"imputed income file " , j , ": merged data frame has " , nrow( z ) ,
			" rows but the personsx data frame has " , nrow( x ) ,
			call. = FALSE
		)
	}

	###########################
	#START OF VARIABLE RECODING
	#any new variables that the user would like to create should be constructed here

	#create the NOTCOV variable: values 7, 8, and 9 become missing, everything else is kept
	#shown on page 47 (PDF page 51) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf
	z <- transform( z , NOTCOV = ifelse( NOTCOV %in% 7:9 , NA , NOTCOV ))

	#create the POVERTYI variable by collapsing RAT_CATI into four groups
	#(1-3 -> 1, 4-7 -> 2, 8-11 -> 3, 12-14 -> 4, anything else -> missing)
	#shown on page 48 (PDF page 52) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf
	z <- transform( z , POVERTYI =
		ifelse( RAT_CATI %in% 1:3 , 1 ,
		ifelse( RAT_CATI %in% 4:7 , 2 ,
		ifelse( RAT_CATI %in% 8:11 , 3 ,
		ifelse( RAT_CATI %in% 12:14 , 4 , NA ) ) ) ) )

	#END OF VARIABLE RECODING
	#########################

	#########################
	#keep only the columns needed for the survey design and the analyses, to free up RAM
	mini_z <- z[ , c("PSU","STRATUM","WTFA","POVERTYI","NOTCOV") ]
	#end of column deletions
	#########################

	#save the trimmed data frame (mini_z) as z1, z2, z3, z4, or z5
	#depending on the current iteration of the j = 1 through 5 loop
	assign( paste0( "z" , j ) , mini_z )

	#remove the temporary data frames to free up RAM
	rm( y , z , mini_z )

	#garbage collection: this frees up RAM
	gc()
}
#when the loop has terminated, data frames z1 through z5 exist.
#each is the personsx file merged with one of the five imputed income files,
#reduced to the design columns plus the recoded NOTCOV and POVERTYI variables.


#using all five merged personsx-MI files,
#create the multiple imputation survey object.
#argument names are spelled out in full (ids, weights) instead of relying on
#partial matching, and TRUE is used instead of the reassignable shortcut T
nhissvy <-
	svydesign(
		ids = ~PSU ,
		strata = ~STRATUM ,
		weights = ~WTFA ,
		data = imputationList( list( z1 , z2 , z3 , z4 , z5 ) ) ,
		nest = TRUE
	)

#remove the personsx data frame (x) and the five merged data frames (z1 through z5) to free up RAM
rm( x , z1 , z2 , z3 , z4 , z5 )

#garbage collection: this frees up RAM
gc()


##################################################################
#now that the R survey object (nhissvy) has been constructed,
#analyses can be run.

#the following output matches PDF page 60 on http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf

#this displays the crosstab statistics..
#each statistic is computed on all five imputed designs with with()
#and the five results are then pooled by MIcombine()

	#not broken out by the POVERTYI variable
	#(records with a missing POVERTYI are excluded first)

#print the unweighted N
MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , unwtd.count( ~factor(NOTCOV) , na.rm = TRUE ) ) )
#print the weighted N
MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svytotal( ~factor(NOTCOV) , na.rm = TRUE ) ) )
#print the overall percents
MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svymean( ~factor(NOTCOV) , na.rm = TRUE ) ) )

	#broken out by the POVERTYI variable

#print the unweighted N
MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , unwtd.count , na.rm = TRUE ) ) )
#print the weighted N
MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svytotal , na.rm = TRUE ) ) )
#print the row percents
MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svymean , na.rm = TRUE ) ) )
	#page 118 of the NHIS document
	#ftp://ftp.cdc.gov/pub/health_statistics/nchs/dataset_documentation/nhis/2010/srvydesc.pdf
	#displays the R code to load the persons file into R as a survey object
	#the code below creates a slightly different survey object, one that includes appropriately-imputed income.


	#this R code:
	# reads the year 2000 personsx file into R
	# reads in all five imputed income files
	# merges the 2000 personsx file with the five imputed income files
	# creates a MI survey object
	# runs six example analyses with the multiply-imputed R survey object

	#NOTE: mitools, survey, and sas7bdat each need to be installed once.
	#on a machine that has never used them, run this line first:
	#install.packages(c("mitools","survey","sas7bdat"))

	#attach the three packages this script relies on:
	#the multiple imputation package
	library( mitools )
	#the complex survey analysis package
	library( survey )
	#the .sas7bdat importation package
	library( sas7bdat )

	#print numbers with up to sixteen significant digits
	options( digits = 16 )

	#point the working directory at the folder where the 2000 personsx SAS data set is stored.
	setwd( "//dcdata/v/National Health Interview Survey/Income Imputation/2000" )

	###############################
	#import NHIS 2000 PERSONSX file

	#this can be done a number of different ways

	#import the NHIS 2000 PERSONSX SAS file as a data frame named x (this is a slow step)
	x <- read.sas7bdat( "personsx.sas7bdat" )

	#when read.sas7bdat is used, the four merge keys below arrive as factor variables.
	#convert each one to numeric (going through its character label)
	#so that the merge with the imputed income files can match on them
	merge_keys <- c( "SRVY_YR" , "HHX" , "FMX" , "PX" )
	x[ merge_keys ] <- lapply( x[ merge_keys ] , function( key ) as.numeric( as.character( key ) ) )

	#end import of PERSONSX file
	###############################



	#loop through j = 1 through 5 and, for each imputed income file:
	#	read it in, merge it onto the personsx data frame (x),
	#	recode NOTCOV and POVERTYI, keep only the columns needed for analysis,
	#	and store the result as z1, z2, z3, z4, or z5. (this also takes a while)
	for ( j in 1:5 ){
		#print the current iteration of the loop
		print( j )

		#set the current working directory to the location where the five 2000 imputed income SAS data sets are stored.
		setwd("//dcdata/v/National Health Interview Survey/Income Imputation/2000")

		#read in the SAS imputed income file number "j"
		y <- read.sas7bdat( paste0( "incmimp" , j , ".sas7bdat" ) )

		#dump RECTYPE variable, which is already on the PERSONSX file
		y$RECTYPE <- NULL

		#merge the personsx file with the imputed income file
		#(the person identifier is PX on personsx but FPX on the imputed income file)
		z <- merge( x , y , by.x=c("SRVY_YR","HHX","FMX","PX") , by.y=c("SRVY_YR","HHX","FMX","FPX") )

		#print the number of rows in the personsx data frame,
		#the imputed income data frame, and the merged data frame
		print( nrow( x ) )
		print( nrow( y ) )
		print( nrow( z ) )

		#the merged file should have exactly one row per personsx row.
		#rather than relying on someone reading the three printed numbers,
		#raise a warning whenever the merge dropped or duplicated records
		if ( nrow( z ) != nrow( x ) ){
			warning(
				"imputed income file " , j , ": merged data frame has " , nrow( z ) ,
				" rows but the personsx data frame has " , nrow( x ) ,
				call. = FALSE
			)
		}

		###########################
		#START OF VARIABLE RECODING
		#any new variables that the user would like to create should be constructed here

		#create the NOTCOV variable: values 7, 8, and 9 become missing, everything else is kept
		#shown on page 47 (PDF page 51) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf
		z <- transform( z , NOTCOV = ifelse( NOTCOV %in% 7:9 , NA , NOTCOV ))

		#create the POVERTYI variable by collapsing RAT_CATI into four groups
		#(1-3 -> 1, 4-7 -> 2, 8-11 -> 3, 12-14 -> 4, anything else -> missing)
		#shown on page 48 (PDF page 52) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf
		z <- transform( z , POVERTYI =
			ifelse( RAT_CATI %in% 1:3 , 1 ,
			ifelse( RAT_CATI %in% 4:7 , 2 ,
			ifelse( RAT_CATI %in% 8:11 , 3 ,
			ifelse( RAT_CATI %in% 12:14 , 4 , NA ) ) ) ) )

		#END OF VARIABLE RECODING
		#########################

		#########################
		#keep only the columns needed for the survey design and the analyses, to free up RAM
		mini_z <- z[ , c("PSU","STRATUM","WTFA","POVERTYI","NOTCOV") ]
		#end of column deletions
		#########################

		#save the trimmed data frame (mini_z) as z1, z2, z3, z4, or z5
		#depending on the current iteration of the j = 1 through 5 loop
		assign( paste0( "z" , j ) , mini_z )

		#remove the temporary data frames to free up RAM
		rm( y , z , mini_z )

		#garbage collection: this frees up RAM
		gc()
	}
	#when the loop has terminated, data frames z1 through z5 exist.
	#each is the personsx file merged with one of the five imputed income files,
	#reduced to the design columns plus the recoded NOTCOV and POVERTYI variables.


	#using all five merged personsx-MI files,
	#create the multiple imputation survey object.
	#argument names are spelled out in full (ids, weights) instead of relying on
	#partial matching, and TRUE is used instead of the reassignable shortcut T
	nhissvy <-
		svydesign(
			ids = ~PSU ,
			strata = ~STRATUM ,
			weights = ~WTFA ,
			data = imputationList( list( z1 , z2 , z3 , z4 , z5 ) ) ,
			nest = TRUE
		)

	#remove the personsx data frame (x) and the five merged data frames (z1 through z5) to free up RAM
	rm( x , z1 , z2 , z3 , z4 , z5 )

	#garbage collection: this frees up RAM
	gc()


	##################################################################
	#now that the R survey object (nhissvy) has been constructed,
	#analyses can be run.

	#the following output matches PDF page 60 on http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf

	#this displays the crosstab statistics..
	#each statistic is computed on all five imputed designs with with()
	#and the five results are then pooled by MIcombine()

	#not broken out by the POVERTYI variable
	#(records with a missing POVERTYI are excluded first)

	#print the unweighted N
	MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , unwtd.count( ~factor(NOTCOV) , na.rm = TRUE ) ) )
	#print the weighted N
	MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svytotal( ~factor(NOTCOV) , na.rm = TRUE ) ) )
	#print the overall percents
	MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svymean( ~factor(NOTCOV) , na.rm = TRUE ) ) )

	#broken out by the POVERTYI variable

	#print the unweighted N
	MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , unwtd.count , na.rm = TRUE ) ) )
	#print the weighted N
	MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svytotal , na.rm = TRUE ) ) )
	#print the row percents
	MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svymean , na.rm = TRUE ) ) )