Skip to content

Instantly share code, notes, and snippets.

@ajdamico
Created September 30, 2011 14:38
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ajdamico/1253937 to your computer and use it in GitHub Desktop.
Save ajdamico/1253937 to your computer and use it in GitHub Desktop.
download every file for every year of the National Health Interview Survey and convert them all to csv and stata files
#load necessary libraries
library(stringr)
library(foreign)
library(survey)
library(RCurl)
#set the temporary directory to download all files to!
setwd("s:/temp")
#main FTP directory with data sets..
nhis_file_ftp <- "ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NHIS/"
#choose which years to download
which_years_to_download <- 1963:2010
for ( year in which_years_to_download ){
#create the full string to the FTP folder of the current year
year_ftp_dir <- paste( nhis_file_ftp , year , "/" , sep="" )
#figure out what all of the files within that folder are named
filenames <- getURL( year_ftp_dir , dirlistonly=T )
filenames <- tolower( strsplit(filenames, "\r*\n")[[1]] )
#if a file.zip and a file.exe both exist, only take the file.exe!
exe_filenames <- filenames[ grepl( ".exe" , filenames ) ]
zip_filenames <- filenames[ grepl( ".zip" , filenames ) ]
exe_filenames <- gsub( ".exe" , "" , exe_filenames )
zip_filenames <- gsub( ".zip" , "" , zip_filenames )
exe_and_zip_filenames <- zip_filenames[ (zip_filenames %in% exe_filenames) ]
zip_filenames_with_exe_matches <- paste( exe_and_zip_filenames , ".zip" , sep = "" )
filenames <- filenames[ ! (filenames %in% zip_filenames_with_exe_matches) ]
#throw out readme.txt!
filenames <- filenames[ ! ( filenames %in% "readme.txt" ) ]
#throw out folders (files without a . in them)
filenames <- filenames[ grepl( "\\." , filenames ) ]
#MANUAL RECODE FOR A SUBSET OF FILES
#to only run a subset of files within a particular year, change the filenames line here!
#filenames <- c("personsx.exe","soa2puxx.exe","year2000.exe")
#END MANUAL RECODE FOR A SUBSET OF FILES
#manual recode to throw out a few files!
#skip 1988 mdevices file!
if ( year == 1988 ) filenames <- filenames[ ! ( filenames %in% "mdevices.exe" ) ]
#skip 1994 and 1995 dfs files!
if ( year %in% c(1994,1995) ) filenames <- filenames[ ! ( filenames %in% c("dfschild.exe","dfsadult.exe") ) ]
#skip the 1992 cancepid and all of the nursing home files!
if ( year == 1992 ) filenames <- filenames[ ! ( filenames %in% c("conditnh.exe","cancepid.exe","drvisinh.exe","hospitnh.exe","househnh.exe","personnh.exe") ) ]
#skip the 2007 alternative medicine and injury verbatim files!
if ( year == 2007 ) filenames <- filenames[ ! ( filenames %in% c("althealt.exe","injverbt.exe") ) ]
#skip the 1999 and 2000 injury verbatim file!
if ( year %in% c(1998:2000,2008,2009) ) filenames <- filenames[ ! ( filenames %in% "injverbt.exe" ) ]
#manual recode for 2004 - since everything is in its own directory,
#just go straight to the file!
if ( year == 2004 ){
filenames <- c(
"./familyfile/familyxx.exe",
"./household/househld.exe",
"./InjuryPoison/injpoiep.exe",
#note that this skips the injury verbatim file!
#"./InjuryVerbatim/injverbt.exe",
"./person/personsx.exe",
"./sampleadult/samadult.exe",
"./samplechild/samchild.exe" )
}
#end of manual recodes!
for ( filename in filenames ){
#print the current year and file name
print( "currently working on..")
print( year )
print( "file..")
print( filename )
#create a temporary file..
tf <- tempfile()
#create a temporary directory..
td <- tempdir()
#full file location
full_file_location <- paste( year_ftp_dir , filename , sep="" )
file_download <- download.file(
full_file_location ,
tf ,
mode = "wb"
)
#now there's a zipped file in the temporary directory.. unzip it
#this command does two things-- it unzips the ZIP file
#and it stores a vector of character strings in the file name (fn) variable, so we now know where the unzipped files landed.
fn <- unzip( tf , exdir = td , overwrite = T )
#find the filename before the .
fileval <- substr( filename , 1 , regexpr("\\.",filename) - 1 )
#path to the SAS read-in line
sas_ri <- paste( "ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Program_Code/NHIS/" , year , "/" , fileval , ".sas" , sep = "" )
#manual recode for 2004
if (year == 2004){
fileval <- substr( filename , gregexpr("\\/",filename)[[1]][2] + 1 , gregexpr("\\.",filename)[[1]][2] - 1)
sas_ri <- paste( "ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Program_Code/NHIS/" , year , "/" , substr( filename , 1 , gregexpr("\\.",filename)[[1]][2] - 1 ) , ".sas" , sep = "" )
}
#end manual recode for 2004
#############now download the SAS import instructions from the FTP site
#first store the FTP path to the SAS instructions in sas_ri
#wait ten seconds so as not to overwhelm the connection
Sys.sleep( 10 )
##and now actually pull the entire file into R from the FTP site, line-by-line
SASinput <- readLines( sas_ri )
##find the first line with the word INPUT in it, which is where the ASCII variable locations occur.
#lines that start with input
ip <- grep("INPUT",toupper(SASinput))
#lines that aren't commented out
nc <- which( ! grepl( "\\*" , SASinput ) )
firstline <- min( intersect( ip , nc ) )
##find the first semicolon ending that input line
a<-grep(";",toupper(SASinput))
lastline<-min(a[a>firstline])
#manual recode to account for ending semicolon on last input line instead of its own line
if ( year == 1999 & fileval %in% "injverbt" ) lastline <- lastline + 1
if ( year == 2007 & fileval %in% "ratcat07" ) lastline <- lastline + 1
#end of manual recodes!
##isolate the Fixed-Width File (FWF) input lines, throwing out the first line (which is just INPUT) and the last line (which is just the semicolon)
FWFlines<-SASinput[(firstline+1):(lastline-1)]
#throw out all lines that have been commented out
FWFlines <- FWFlines[ !grepl( "\\*" , FWFlines ) ]
#throw out all lines that are empty
FWFlines <- FWFlines[ which( FWFlines != "" ) ]
#manual recode
if (year == 1994 & fileval == "personsx") FWFlines <- gsub("-"," - " , FWFlines)
#end manual recode
##break apart all FWF lines
z<-strsplit(FWFlines," ",perl=T)
##create FWF structure file (x)
x<-NULL
#manually fix problems with the SAS input files
#the 1968 conditionf file should not have QUESTS2 QUESTS3 in the SAS input line!
if ( year == 1968 & fileval == "conditionf") z <- z[-41]
#the 1974 healthin file has WTBDD2W and WTBDD2WB in the wrong order
if ( year == 1974 & fileval == "healthin") z <- c( z[1:50] , z[56] , z[ 51:55 ] , z[ 57:83 ] )
#end manual fixes
#now cycle through every line of the FWFlines..
for ( j in 1:length(z) ){
#make a new variable with just one line
y<-z[[j]]
#get rid of dashes!
y <- gsub( "-" , "" , y )
#throw out empty strings
y<-y[! y %in% ""]
#if there's anything in the current FWF line and it's not commented out..
if ( length(y) > 0 & ! ("\\*" %in% z[[j]] )){
#create a SKIP variable to 'exit' the loop..
skip2 <- 0
#loop through all the different words in the specific FWF line..
for ( k in 1:length(y) ){
#..so long as the SKIP hasn't been triggered
if ( skip2 < k ) {
#if the SAS line is a character variable starting AT a location in the file..
if ( grepl( "\\@" , y[k] ) ){
#get rid of the at sign
pm <- sub( "\\@" , "" , y[k] )
#specify this variable to be a CHARACTER type ($s are character input formats in SAS)
fm <- "$"
#create a one-line table with the variable name, the variable's placement in the file, and character format
df <- data.frame( varname = y[k+1] , placement = as.numeric(pm) , format = "$" , divisor = 1 )
#add the one-line table to the full FWF structure file
x<-rbind(x,df)
#and skip the next two words in the line (which are the variable name and the placement location.
skip2 <- k+2
} else {
#if the current word isn't a number AND it's not a symbol, then assume it's the variable's name
if ( is.na( as.numeric( y[k]) ) & ! (y[k] %in% c("-","$") ) ) {
#set the default format to NUMERIC
fm <- ""
#take the word directly after the variable's name as the placement location
pm <- y[k+1]
#set the default divisor to one (assumes the numeric value doesn't need to be divided)
div <- 1
#if there's at least one word after the variable name, check if it's a $, indicating a character instead of numeric format..
if ( length( y ) >= k + 1 ){
#and if it is, set the format to character and look for the location-placement number in the 2nd word after the variable
if ( y[ k + 1 ] == "$" ) {
fm <- "$"
pm <- y[k+2]
}
}
#if there's more than two words after the variable name..
if ( length( y ) >= k + 3 ){
#check if the third word is a divisor
if( !is.na( as.numeric( y[ k + 3] ) ) ) {
if ( as.numeric( y[ k + 3 ] ) < 1 ) {
#and if it is, add it to the FWF structure file!
div = y[ k + 3 ]
}
}
}
#based on all of these options, create a one-row data frame..
df <- data.frame( varname = y[k] , placement = as.numeric(pm) , format = fm , divisor = div )
#and add it to the FWF structure file!
x<-rbind(x,df)
}
}
#save the final numeric value in the SASinput file so you know the length of the last field in the FWF
if ( !is.na( as.numeric( y[k] ) ) & as.numeric( y[k] ) >= 1 ){
last_numeric <- as.numeric( y[k] )
}
}
}
}
}
#cycle through the entire FWF structure file and subtract each placement from the previous placement, in order to calculate each variable's width (number of characters)
for ( j in 1:(nrow(x)-1) ){
x[ j , "width" ] <- x[ j + 1 , "placement" ] - x[ j , "placement" ]
}
#..and for the last field, use the last_numeric field (stored from above)
x[ j + 1 , "width" ] <- last_numeric - x[ j + 1 , "placement" ] + 1
##determine exact width of each variable
#x$width<-as.numeric(substr(x$format , ifelse(grepl("\\$",x$format),2,1) , regexpr("\\.",x$format)-1))
##determine number of decimals each variable should have
#x$decimals<-as.numeric(substr(x$format , regexpr("\\.",x$format)+1,length(x$format)))
##determine whether character or string
x$charvar<-grepl("\\$",x$format)
#if the FIRST placement of the FIRST variable is NOT 1, then add a blank variable so the file reads in properly
if ( x[ 1 , "placement" ] != 1 ) {
df <- data.frame( varname = "TOSS" , width = (x[ 1 , "placement" ] - 1) , placement = 1 , format = "" , divisor = 1 , charvar = F )
x <- rbind( df , x )
}
##################################################################################################
#now that we've got an appropriate FWF structure file, we can actually read the ASCII data into R!
##################################################################################################
#manually fix problems with the SAS input files
#the 1975 personsx file should have a smaller PER100 field in the SAS input line!
if ( year == 1975 & fileval == "personsx") {
x[ 58 , "width" ] <- 2
df <- data.frame( varname = "TOSS" , width = 4 , placement = 102 , format = "" , divisor = 1 , charvar = F )
x <- rbind( x[ 1:58 , ] , df , x[ 59:nrow(x) , ] )
}
#the 1993 immunize file should have a smaller RESPOND field in the SAS input line!
if ( year == 1993 & fileval == "immunize") {
x[ 49 , "width" ] <- 1
df1 <- data.frame( varname = "TOSS" , width = 6 , placement = 92 , format = "" , divisor = 1 , charvar = F )
x[ 80 , "width" ] <- 6
df2 <- data.frame( varname = "TOSS" , width = 123 , placement = 213 , format = "" , divisor = 1 , charvar = F )
x <- rbind( x[ 1:49 , ] , df1 , x[ 50:80 , ] , df2 , x[ 81:nrow(x) , ] )
x[ x$varname %in% "MEAS1MO" , "width" ] <- 2
x[ x$varname %in% "MEAS1MO" , "placement" ] <- 436
x[ x$varname %in% "MEASMMR1" , "width" ] <- 1
}
#the 1993 familyres file should have a smaller F_172 field in the SAS input line!
if ( year == 1993 & fileval == "famlyres") {
x[ 3 , "width" ] <- 12
df1 <- data.frame( varname = "TOSS" , width = 156 , placement = 17 , format = "" , divisor = 1 , charvar = F )
x[ 4 , "width" ] <- 6
df2 <- data.frame( varname = "TOSS" , width = 11 , placement = 178 , format = "" , divisor = 1 , charvar = F )
x <- rbind( x[ 1:3 , ] , df1 , x[4 , ] , df2 , x[ 5:nrow(x) , ] )
}
#the 1995 aidsknow file has a typo at 434
if ( year == 1995 & fileval == "aidsknow") {
x[ 119 , "width" ] <- 1
x[ 120 , "width" ] <- 1
x[ 120 , "placement" ] <- 434
}
#the 1999 injverbt file needs to end at 677
if ( year == 1999 & fileval == "injverbt") {
x <- x[ 1:(nrow(x)-1) , ]
x[ 18 , "width" ] <- 85
}
#the 2007 ratcat07 file needs to end at 14
if ( year == 2007 & fileval == "ratcat07") {
x <- x[ 1:(nrow(x)-1) , ]
x[ 4 , "width" ] <- 2
}
#end manual fixes
##input actual SAS data text-delimited file to read in
SASfile <- read.fwf( fn , x$width , col.names=x$varname)
##convert character strings appropriately within data frame
for (l in 1:nrow(x)){
if (x[l,"charvar"]) SASfile[,l]<-as.character(SASfile[,l])
else SASfile[,l]<-as.numeric(SASfile[,l])
if (x[l,"divisor"] != 1 ) SASfile[,l] <- as.character( SASfile[,l] * as.numeric( x[l,"divisor"]) )
}
#make all names lowercase
names( SASfile ) <- tolower( names( SASfile ) )
#throw out any TOSS variables
SASfile$toss <-NULL
SASfile$toss.1 <-NULL
#output directory
output_directory <- paste( "R:/National Health Interview Survey/Data/" , year , sep = "" )
#if the year path doesn't exist, make it!
try( dir.create( output_directory ) , silent = T )
setwd( output_directory )
write.csv( SASfile , paste( fileval , ".csv" , sep = "" ) , row.names = F )
write.dta( SASfile , paste( fileval , ".dta" , sep = "" ) )
#trashremoval
trash<-ls()[! ls() %in% c("trash","filename","filenames","year","which_years_to_download","nhis_file_ftp","year_ftp_dir")]
for (ij in 1:length(trash)) rm(list=trash[ij])
gc()
}
}
@kundupro
Copy link

Hi,

This is a great effort for downloading NHIS files. I am trying to download NHIS 2004 dataset using your code. I get the following error:

Error in download.file(full_file_location, tf, mode = "wb") :
cannot open URL 'ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NHIS/2004/./familyfile/familyxx.exe'

Can you help me on this?

Thanks,
Prosenjit

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment