Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
replicate the national health interview survey's multiply imputed income technique using R instead of SUDAAN
#page 118 of the NHIS document
#ftp://ftp.cdc.gov/pub/health_statistics/nchs/dataset_documentation/nhis/2010/srvydesc.pdf
#displays the R code to load the persons file into R as a survey object
#the code below creates a slightly different survey object, one that includes appropriately-imputed income.
#this R code:
# reads the year 2000 personsx file into R
# reads in all five imputed income files
# merges the 2000 personsx file with the five imputed income files
# creates a MI survey object
# runs six example analyses with the multiply-imputed R survey object
#set the number of digits displayed
options(digits=16)
#NOTE that the three packages below must be installed (just once)
#if this is the first time using R with these packages, run this line:
#install.packages(c("mitools","survey","sas7bdat"))
#load the multiple imputation package
library(mitools)
#load the complex survey analysis package
library(survey)
#set the current working directory to the location where the 2000 personsx SAS data set is stored.
setwd("//dcdata/v/National Health Interview Survey/Income Imputation/2000")
#load the .sas7bdat importation package
library(sas7bdat)
###############################
#import NHIS 2000 PERSONSX file
#this can be done a number of different ways
#read in the NHIS 2000 PERSONSX file from a SAS file (this takes a while)
x <- read.sas7bdat( "personsx.sas7bdat" )
#now the NHIS 2000 PERSONSX file is a data frame (x) stored in R!
#if using read.sas7bdat, these four columns will be factor variables
#and must be converted to numeric ones in order for the merge to work
for ( i in c("SRVY_YR","HHX","FMX","PX") ){
x[ , i ] <- as.numeric( as.character( x[ , i ] ) )
}
#end import of PERSONSX file
###############################
#loop through j = 1 through 5 and..
#read in the five imputed income files. (this also takes a while)
for ( j in 1:5 ){
#print the current iteration of the loop
print( j )
#set the current working directory to the location where the five 2000 imputed income SAS data sets are stored.
setwd("//dcdata/v/National Health Interview Survey/Income Imputation/2000")
#read in the SAS imputed income file number "j"
y <- read.sas7bdat(paste( "incmimp" , j , ".sas7bdat", sep=""))
#dump RECTYPE variable, which is already on the PERSONSX file
y$RECTYPE <- NULL
#merge the personsx file with the imputed income file
z <- merge( x , y , by.x=c("SRVY_YR","HHX","FMX","PX") , by.y=c("SRVY_YR","HHX","FMX","FPX") )
#print the number of rows in the personsx data frame and
#also the number of rows in the merged personsx-imputed income data frame
#to make sure they are the same!
print( nrow( x ) )
print( nrow( y ) )
print( nrow( z ) )
###########################
#START OF VARIABLE RECODING
#any new variables that the user would like to create should be constructed here
#create the NOTCOV variable
#shown on page 47 (PDF page 51) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf
z <- transform( z , NOTCOV = ifelse( NOTCOV %in% 7:9 , NA , NOTCOV ))
#create the POVERTYI variable
#shown on page 48 (PDF page 52) of http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf
z <- transform( z , POVERTYI =
ifelse( RAT_CATI %in% 1:3 , 1 ,
ifelse( RAT_CATI %in% 4:7 , 2 ,
ifelse( RAT_CATI %in% 8:11 , 3 ,
ifelse( RAT_CATI %in% 12:14 , 4 , NA ) ) ) ) )
#END OF VARIABLE RECODING
#########################
#########################
#delete columns you don't need to free up RAM
mini_z <- z[ , c("PSU","STRATUM","WTFA","POVERTYI","NOTCOV") ]
#end of column deletions
#########################
#save the current data frame (z) as z1, z2, z3, z4, or z5
#depending on the current iteration of the j = 1 through 5 loop
assign( paste( "z" , j , sep = "" ) , mini_z )
#delete the y and z data frames to free up RAM
y <- z <- NULL
#garbage collection: this frees up RAM
gc()
}
#when the loop has terminated, data frames z1 through z5 exist
#each are the personsx file merged with one of the five imputed income files
#and each include all recoded variables.
#using all five merged personsx-MI files,
#create the multiple imputation survey object
nhissvy <- svydesign( id = ~PSU , strata=~STRATUM , weight=~WTFA , data=imputationList(list(z1,z2,z3,z4,z5)) , nest=T )
#delete the y and z data frames to free up RAM
x <- z1 <- z2 <- z3 <- z4 <- z5 <- NULL
#garbage collection: this frees up RAM
gc()
##################################################################
#now that the R survey object (nhissvy) has been constructed,
#analyses can be run.
#the following output matches PDF page 60 on http://www.cdc.gov/nchs/data/nhis/tecdoc_2010.pdf
#this displays the crosstab statistics..
#not broken out by the POVERTYI variable
#print the unweighted N
MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , unwtd.count( ~factor(NOTCOV) , na.rm=T ) ) )
#print the weighted N
MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svytotal( ~factor(NOTCOV) , na.rm=T ) ) )
#print the overall percents
MIcombine( with( subset( nhissvy , !is.na(POVERTYI)) , svymean( ~factor(NOTCOV) , na.rm=T ) ) )
#broken out by the POVERTYI variable
#print the unweighted N
MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , unwtd.count , na.rm=T ) ) )
#print the weighted N
MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svytotal , na.rm=T ) ) )
#print the row percents
MIcombine( with( nhissvy , svyby(~factor(NOTCOV) , ~factor(POVERTYI) , svymean , na.rm=T ) ) )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.