/using R to replicate the NHIS Multiple Imputation technique.R
Created Feb 12, 2012
replicate the national health interview survey's multiply imputed income technique using R instead of SUDAAN
#page 118 of the NHIS document | |
#ftp://ftp.cdc.gov/pub/health_statistics/nchs/dataset_documentation/nhis/2010/srvydesc.pdf | |
#displays the R code to load the persons file into R as a survey object | |
#the code below creates a slightly different survey object, one that includes appropriately-imputed income. | |
#this R code: | |
# reads the year 2000 personsx file into R | |
# reads in all five imputed income files | |
# merges the 2000 personsx file with the five imputed income files | |
# creates a MI survey object | |
# runs six example analyses with the multiply-imputed R survey object | |
#set the number of digits displayed | |
options(digits=16) | |
#NOTE that the three packages below must be installed (just once) | |
#if this is the first time using R with these packages, run this line: | |
#install.packages(c("mitools","survey","sas7bdat")) | |
#load the multiple imputation package | |
library(mitools) | |
#load the complex survey analysis package | |
library(survey) | |
#set the current working directory to the location where the 2000 personsx SAS data set is stored. | |
setwd("//dcdata/v/National Health Interview Survey/Income Imputation/2000") | |
#load the .sas7bdat importation package | |
library(sas7bdat) | |
############################### | |
#import NHIS 2000 PERSONSX file | |
#this can be done a number of different ways | |
#read in the NHIS 2000 PERSONSX file from a SAS file (this takes a while) | |
x <- read.sas7bdat( "personsx.sas7bdat" ) | |
#now the NHIS 2000 PERSONSX file is a data frame (x) stored in R! | |
#if using read.sas7bdat, these four columns will be factor variables | |
#and must be converted to numeric ones in order for the merge to work | |
for ( i in c("SRVY_YR","HHX","FMX","PX") ){ | |
x[ , i ] <- as.numeric( as.character( x[ , i ] ) ) | |
} | |
#end import of PERSONSX file | |
############################### | |
#loop through j = 1 through 5 and.. | |
#read in the five imputed income files. (this also takes a while) | |
for ( j in 1:5 ){ | |
#print the current iteration of the loop | |
print( j ) | |
#set the current working directory to the location where the five 2000 imputed income SAS data sets are stored. | |
setwd("//dcdata/v/National Health Interview Survey/Income Imputation/2000") | |
#read in the SAS imputed income file number "j" | |
y <- read.sas7bdat(paste( "incmimp" , j , ".sas7bdat", sep="")) | |
#dump RECTYPE variable, which is already on the PERSONSX file | |
y$RECTYPE <- NULL | |
#merge the personsx file with the imputed income file | |
z <- merge( x , y , by.x=c("SRVY_YR","HHX","FMX","PX") , by.y=c("SRVY_YR","HHX","FMX","FPX") ) | |
#print the number of rows in the personsx data frame and | |
#also the number of rows in the merged personsx-imputed income data frame | |
#to make sure they are the same! | |
print( nrow( x ) ) | |
print( nrow( y ) ) | |
print( nrow( z ) ) | |
########################### | |
#START OF VARIABLE RECODING | |
#any new variables that the user would like to create should be constructed here | |
#create the NOTCOV variable | |
#shown on page 47 (PDF page 51) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf | |
z <- transform( z , NOTCOV = ifelse( NOTCOV %in% 7:9 , NA , NOTCOV )) | |
#create the POVERTYI variable | |
#shown on page 48 (PDF page 52) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf | |
z <- transform( z , POVERTYI = | |
ifelse( RAT_CATI %in% 1:3 , 1 , | |
ifelse( RAT_CATI %in% 4:7 , 2 , | |
ifelse( RAT_CATI %in% 8:11 , 3 , | |
ifelse( RAT_CATI %in% 12:14 , 4 , NA ) ) ) ) ) | |
#END OF VARIABLE RECODING | |
######################### | |
######################### | |
#delete columns you don't need to free up RAM | |
mini_z <- z[ , c("PSU","STRATUM","WTFA","POVERTYI","NOTCOV") ] | |
#end of column deletions | |
######################### | |
#save the current data frame (z) as z1, z2, z3, z4, or z5 | |
#depending on the current iteration of the j = 1 through 5 loop | |
assign( paste( "z" , j , sep = "" ) , mini_z ) | |
#delete the y and z data frames to free up RAM | |
y <- z <- NULL | |
#garbage collection: this frees up RAM | |
gc() | |
} | |
#when the loop has terminated, data frames z1 through z5 exist | |
#each are the personsx file merged with one of the five imputed income files | |
#and each include all recoded variables. | |
#using all five merged personsx-MI files, | |
#create the multiple imputation survey object | |
nhissvy <- svydesign( id = ~PSU , strata=~STRATUM , weight=~WTFA , data=imputationList(list(z1,z2,z3,z4,z5)) , nest=T ) | |
#delete the y and z data frames to free up RAM | |
x <- z1 <- z2 <- z3 <- z4 <- z5 <- NULL | |
#garbage collection: this frees up RAM | |
gc() | |
################################################################## | |
#now that the R survey object (nhissvy) has been constructed, | |
#analyses can be run. | |
#the following output matches PDF page 60 on http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf | |
#this displays the crosstab statistics.. | |
#not broken out by the POVERTYI variable | |
#print the unweighted N | |
MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , unwtd.count( ~factor(NOTCOV) , na.rm=T ) ) ) | |
#print the weighted N | |
MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svytotal( ~factor(NOTCOV) , na.rm=T ) ) ) | |
#print the overall percents | |
MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svymean( ~factor(NOTCOV) , na.rm=T ) ) ) | |
#broken out by the POVERTYI variable | |
#print the unweighted N | |
MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , unwtd.count , na.rm=T ) ) ) | |
#print the weighted N | |
MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svytotal , na.rm=T ) ) ) | |
#print the row percents | |
MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svymean , na.rm=T ) ) ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment