Created September 30, 2011
Download every file for every year of the National Health Interview Survey (NHIS) and convert each one to CSV and Stata (.dta) files.
#load necessary libraries
#NOTE(review): no library() calls are visible here, yet the script later calls getURL() and write.dta()
#(presumably from the RCurl and foreign packages) -- confirm and restore the missing calls
#set the temporary directory to download all files to!
#NOTE(review): no statement follows the comment above in the visible source; tempfile() and tempdir()
#are called inside the per-file loop instead
#main FTP directory with data sets..
#NOTE(review): this is an empty string as shown, so every URL pasted together from it has no host part;
#the FTP address looks like it was lost -- TODO restore before running
nhis_file_ftp <- ""
#choose which years to download
which_years_to_download <- 1963:2010
#outer loop: process one survey year at a time
#NOTE(review): the closing brace of this loop is not visible anywhere in the source as shown
for ( year in which_years_to_download ){
#create the full string to the FTP folder of the current year
year_ftp_dir <- paste( nhis_file_ftp , year , "/" , sep="" )
#figure out what all of the files within that folder are named
#(getURL is presumably RCurl's; the listing is treated below as one string holding one name per line)
filenames <- getURL( year_ftp_dir , dirlistonly=T )
#split the listing into one entry per line (an optional carriage return before the newline is tolerated)
#and lowercase every name so the comparisons further down are case-insensitive
filenames <- tolower( strsplit(filenames, "\r*\n")[[1]] )
#if both a file.zip and a file.exe exist for the same stem, only take the file.exe!
#the dot is escaped and the pattern anchored with $ so that only a true trailing
#extension matches: an unescaped . is a regex wildcard, so the unanchored pattern
#would also hit names such as myexe.txt and strip text from the middle of a name
exe_filenames <- filenames[ grepl( "\\.exe$" , filenames ) ]
zip_filenames <- filenames[ grepl( "\\.zip$" , filenames ) ]
#strip the extension, leaving only the bare stems
exe_filenames <- sub( "\\.exe$" , "" , exe_filenames )
zip_filenames <- sub( "\\.zip$" , "" , zip_filenames )
#stems that are available in both formats
exe_and_zip_filenames <- zip_filenames[ zip_filenames %in% exe_filenames ]
#rebuild the names of the redundant .zip files and drop them from the download list
zip_filenames_with_exe_matches <- paste0( exe_and_zip_filenames , ".zip" )
filenames <- filenames[ ! ( filenames %in% zip_filenames_with_exe_matches ) ]
#throw out readme.txt!
filenames <- filenames[ ! ( filenames %in% "readme.txt" ) ]
#throw out folders (entries without a . in them)
filenames <- filenames[ grepl( "\\." , filenames ) ]
#to only run a subset of files within a particular year, change the filenames line here!
#filenames <- c("personsx.exe","soa2puxx.exe","year2000.exe")
#manual recodes: drop specific files for specific years
#(the reason each file is skipped is not stated in the source)
#skip 1988 mdevices file!
if ( year == 1988 ) filenames <- filenames[ ! ( filenames %in% "mdevices.exe" ) ]
#skip 1994 and 1995 dfs files!
if ( year %in% c(1994,1995) ) filenames <- filenames[ ! ( filenames %in% c("dfschild.exe","dfsadult.exe") ) ]
#skip the 1992 cancepid and all of the nursing home files!
if ( year == 1992 ) filenames <- filenames[ ! ( filenames %in% c("conditnh.exe","cancepid.exe","drvisinh.exe","hospitnh.exe","househnh.exe","personnh.exe") ) ]
#skip the 2007 alternative medicine and injury verbatim files!
if ( year == 2007 ) filenames <- filenames[ ! ( filenames %in% c("althealt.exe","injverbt.exe") ) ]
#skip the injury verbatim file for 1998 through 2000, 2008 and 2009!
if ( year %in% c(1998:2000,2008,2009) ) filenames <- filenames[ ! ( filenames %in% "injverbt.exe" ) ]
#manual recode for 2004 - since everything is in its own directory,
#just go straight to the file!
#NOTE(review): the file list is replaced outright for 2004; only one path is listed here and no
#closing brace for this if is visible -- further entries may have been lost, confirm against the original
if ( year == 2004 ){
filenames <- c(
#note that this skips the injury verbatim file!
"./samplechild/samchild.exe" )
#end of manual recodes!
#inner loop: download, parse and convert one data file of the current year
#NOTE(review): the closing brace of this loop is not visible in the source as shown
for ( filename in filenames ){
#print the current year and file name
print( "currently working on..")
print( year )
print( "file..")
print( filename )
#reserve a temporary file name for the downloaded archive..
tf <- tempfile()
#..and get the path of the session's temporary directory (tempdir() returns a path, it does not create a new folder)
td <- tempdir()
#full file location
full_file_location <- paste( year_ftp_dir , filename , sep="" )
#download in binary mode (wb) so the archive is written byte-for-byte
#NOTE(review): the closing parenthesis of this download.file call is missing in the source as shown
file_download <- download.file(
full_file_location ,
tf ,
mode = "wb"
#now there's a zipped file in the temporary directory.. unzip it
#this command does two things-- it unzips the ZIP file
#and it stores a vector of character strings in the file name (fn) variable, so we now know where the unzipped files landed.
fn <- unzip( tf , exdir = td , overwrite = T )
#find the filename before the first .
fileval <- substr( filename , 1 , regexpr("\\.",filename) - 1 )
#path to the SAS read-in line
#NOTE(review): the first pasted element is an empty string, so the base address of the SAS
#import scripts appears to have been lost -- TODO restore
sas_ri <- paste( "" , year , "/" , fileval , ".sas" , sep = "" )
#manual recode for 2004
#the 2004 names start with ./ and contain a sub-directory, so the first . is the leading one;
#fileval is instead taken from after the second slash up to the second dot,
#and the SAS path keeps everything before the second dot
#NOTE(review): no closing brace for this if is visible
if (year == 2004){
fileval <- substr( filename , gregexpr("\\/",filename)[[1]][2] + 1 , gregexpr("\\.",filename)[[1]][2] - 1)
sas_ri <- paste( "" , year , "/" , substr( filename , 1 , gregexpr("\\.",filename)[[1]][2] - 1 ) , ".sas" , sep = "" )
#end manual recode for 2004
#############now download the SAS import instructions from the FTP site
#the path to the SAS instructions is already stored in sas_ri
#wait ten seconds so as not to overwhelm the connection
Sys.sleep( 10 )
##and now actually pull the entire file into R from the FTP site, line-by-line
SASinput <- readLines( sas_ri )
##find the first line with the word INPUT in it, which is where the ASCII variable locations occur.
#lines that contain the word input anywhere (case-insensitive)
ip <- grep("INPUT",toupper(SASinput))
#lines that aren't commented out (any line containing an asterisk counts as commented out)
nc <- which( ! grepl( "\\*" , SASinput ) )
#the first uncommented line that mentions INPUT
firstline <- min( intersect( ip , nc ) )
##find the first semicolon ending that input line
#NOTE(review): lastline is adjusted below but never assigned in the visible source;
#the statement that locates the semicolon appears to be missing
#manual recode to account for ending semicolon on last input line instead of its own line
if ( year == 1999 & fileval %in% "injverbt" ) lastline <- lastline + 1
if ( year == 2007 & fileval %in% "ratcat07" ) lastline <- lastline + 1
#end of manual recodes!
##isolate the Fixed-Width File (FWF) input lines, throwing out the first line (which is just INPUT) and the last line (which is just the semicolon)
#NOTE(review): FWFlines is filtered below but never created in the visible source;
#the statement that subsets SASinput between firstline and lastline appears to be missing
#throw out all lines that have been commented out
FWFlines <- FWFlines[ !grepl( "\\*" , FWFlines ) ]
#throw out all lines that are empty
FWFlines <- FWFlines[ which( FWFlines != "" ) ]
#manual recode: put spaces around every dash in this one file, so the split on spaces below isolates the dashes
if (year == 1994 & fileval == "personsx") FWFlines <- gsub("-"," - " , FWFlines)
#end manual recode
##break apart all FWF lines into words, splitting on single spaces (z is a list with one character vector per line)
z<-strsplit(FWFlines," ",perl=T)
##create FWF structure file (x)
#NOTE(review): x is never initialised in the visible source
#manually fix problems with the SAS input files
#the 1968 conditionf file should not have QUESTS2 QUESTS3 in the SAS input line! (drops the 41st line)
if ( year == 1968 & fileval == "conditionf") z <- z[-41]
#the 1974 healthin file has WTBDD2W and WTBDD2WB in the wrong order (moves line 56 ahead of lines 51 to 55)
if ( year == 1974 & fileval == "healthin") z <- c( z[1:50] , z[56] , z[ 51:55 ] , z[ 57:83 ] )
#end manual fixes
#now cycle through every line of the FWFlines..
#NOTE(review): this section is visibly damaged -- none of the braces opened below are closed,
#and the statements that append each df row to x are not present
for ( j in 1:length(z) ){
#make a new variable with just one line
#NOTE(review): y is used below but never assigned in the visible source (presumably from z[[j]]) -- confirm
#get rid of dashes!
y <- gsub( "-" , "" , y )
#throw out empty strings
y<-y[! y %in% ""]
#if there's anything in the current FWF line and it's not commented out..
#NOTE(review): %in% does exact matching, so the second test looks for a word equal to the literal
#text backslash-asterisk rather than for words containing an asterisk -- confirm intent
if ( length(y) > 0 & ! ("\\*" %in% z[[j]] )){
#create a SKIP variable to 'exit' the loop..
skip2 <- 0
#loop through all the different words in the specific FWF line..
#NOTE(review): the words after the opening brace on the next line are not valid R; they look like
#the tail of a comment (as long as the SKIP hasn't been triggered) that lost its comment marker
for ( k in 1:length(y) ){ long as the SKIP hasn't been triggered
#only handle word k when it has not been consumed by an earlier word (skip2 holds the last consumed index)
if ( skip2 < k ) {
#if the SAS line is a character variable starting AT a location in the file..
if ( grepl( "\\@" , y[k] ) ){
#get rid of the at sign
pm <- sub( "\\@" , "" , y[k] )
#specify this variable to be a CHARACTER type ($s are character input formats in SAS)
#(fm is set here, but the data.frame below uses the literal $ directly)
fm <- "$"
#create a one-line table with the variable name, the variable's placement in the file, and character format
df <- data.frame( varname = y[k+1] , placement = as.numeric(pm) , format = "$" , divisor = 1 )
#add the one-line table to the full FWF structure file
#and skip the next two words in the line (which are the variable name and the placement location.
skip2 <- k+2
} else {
#if the current word isn't a number AND it's not a symbol, then assume it's the variable's name
#NOTE(review): the parentheses on the next line are unbalanced; a wrapping call around as.numeric
#(something testing that the conversion failed) appears to have been lost -- confirm
if ( as.numeric( y[k]) ) & ! (y[k] %in% c("-","$") ) ) {
#set the default format to NUMERIC
fm <- ""
#take the word directly after the variable's name as the placement location
pm <- y[k+1]
#set the default divisor to one (assumes the numeric value doesn't need to be divided)
div <- 1
#if there's at least one word after the variable name, check if it's a $, indicating a character instead of numeric format..
if ( length( y ) >= k + 1 ){
#and if it is, set the format to character and look for the location-placement number in the 2nd word after the variable
if ( y[ k + 1 ] == "$" ) {
fm <- "$"
pm <- y[k+2]
#if there's more than two words after the variable name..
if ( length( y ) >= k + 3 ){
#check if the third word is a divisor (a number smaller than one)
#NOTE(review): unbalanced parentheses on the next line as well -- same lost wrapper, confirm
if( ! as.numeric( y[ k + 3] ) ) ) {
if ( as.numeric( y[ k + 3 ] ) < 1 ) {
#and if it is, add it to the FWF structure file!
#(the divisor is stored as the original character word, not as a number)
div = y[ k + 3 ]
#based on all of these options, create a one-row data frame..
df <- data.frame( varname = y[k] , placement = as.numeric(pm) , format = fm , divisor = div )
#and add it to the FWF structure file!
#save the final numeric value in the SASinput file so you know the length of the last field in the FWF
#NOTE(review): unbalanced parentheses on the next line too -- confirm against the original script
if ( ! as.numeric( y[k] ) ) & as.numeric( y[k] ) >= 1 ){
last_numeric <- as.numeric( y[k] )
#cycle through the entire FWF structure file and subtract each placement from the previous placement, in order to calculate each variable's width (number of characters)
#NOTE(review): the closing brace of this loop is not visible, so it is unclear from the source
#whether the last-field assignment below runs inside or after the loop
for ( j in 1:(nrow(x)-1) ){
x[ j , "width" ] <- x[ j + 1 , "placement" ] - x[ j , "placement" ]
#..and for the last field, use the last_numeric field (the end column stored while parsing)
x[ j + 1 , "width" ] <- last_numeric - x[ j + 1 , "placement" ] + 1
#older approach, left disabled: derive the width from the format column
##determine exact width of each variable
#x$width<-as.numeric(substr(x$format , ifelse(grepl("\\$",x$format),2,1) , regexpr("\\.",x$format)-1))
##determine number of decimals each variable should have
#x$decimals<-as.numeric(substr(x$format , regexpr("\\.",x$format)+1,length(x$format)))
##determine whether character or string
#NOTE(review): no statement follows the comment above, yet a charvar column is used on the rows
#built below and when converting columns later -- the line creating it appears to be missing
#if the FIRST placement of the FIRST variable is NOT 1, then add a blank variable so the file reads in properly
#(the filler is named TOSS and covers the columns before the first variable)
#NOTE(review): no closing brace for this if is visible
if ( x[ 1 , "placement" ] != 1 ) {
df <- data.frame( varname = "TOSS" , width = (x[ 1 , "placement" ] - 1) , placement = 1 , format = "" , divisor = 1 , charvar = F )
x <- rbind( df , x )
#now that we've got an appropriate FWF structure file, we can actually read the ASCII data into R!
#manually fix problems with the SAS input files
#each fix below edits widths or placements by row position and, where needed, inserts TOSS filler rows
#NOTE(review): none of the if blocks in this section has a visible closing brace
#the 1975 personsx file should have a smaller PER100 field in the SAS input line!
#(row 58 is narrowed to 2 and a 4-wide filler is inserted after it)
if ( year == 1975 & fileval == "personsx") {
x[ 58 , "width" ] <- 2
df <- data.frame( varname = "TOSS" , width = 4 , placement = 102 , format = "" , divisor = 1 , charvar = F )
x <- rbind( x[ 1:58 , ] , df , x[ 59:nrow(x) , ] )
#the 1993 immunize file should have a smaller RESPOND field in the SAS input line!
#(fillers are inserted after rows 49 and 80, then the two MEAS variables are patched by name)
if ( year == 1993 & fileval == "immunize") {
x[ 49 , "width" ] <- 1
df1 <- data.frame( varname = "TOSS" , width = 6 , placement = 92 , format = "" , divisor = 1 , charvar = F )
x[ 80 , "width" ] <- 6
df2 <- data.frame( varname = "TOSS" , width = 123 , placement = 213 , format = "" , divisor = 1 , charvar = F )
x <- rbind( x[ 1:49 , ] , df1 , x[ 50:80 , ] , df2 , x[ 81:nrow(x) , ] )
x[ x$varname %in% "MEAS1MO" , "width" ] <- 2
x[ x$varname %in% "MEAS1MO" , "placement" ] <- 436
x[ x$varname %in% "MEASMMR1" , "width" ] <- 1
#the 1993 famlyres file should have a smaller F_172 field in the SAS input line!
#(fillers are inserted after rows 3 and 4)
if ( year == 1993 & fileval == "famlyres") {
x[ 3 , "width" ] <- 12
df1 <- data.frame( varname = "TOSS" , width = 156 , placement = 17 , format = "" , divisor = 1 , charvar = F )
x[ 4 , "width" ] <- 6
df2 <- data.frame( varname = "TOSS" , width = 11 , placement = 178 , format = "" , divisor = 1 , charvar = F )
x <- rbind( x[ 1:3 , ] , df1 , x[4 , ] , df2 , x[ 5:nrow(x) , ] )
#the 1995 aidsknow file has a typo at 434
if ( year == 1995 & fileval == "aidsknow") {
x[ 119 , "width" ] <- 1
x[ 120 , "width" ] <- 1
x[ 120 , "placement" ] <- 434
#the 1999 injverbt file needs to end at 677
#(the last row is dropped and row 18 is given a width of 85)
if ( year == 1999 & fileval == "injverbt") {
x <- x[ 1:(nrow(x)-1) , ]
x[ 18 , "width" ] <- 85
#the 2007 ratcat07 file needs to end at 14
#(the last row is dropped and row 4 is given a width of 2)
if ( year == 2007 & fileval == "ratcat07") {
x <- x[ 1:(nrow(x)-1) , ]
x[ 4 , "width" ] <- 2
#end manual fixes
##read the unzipped fixed-width ASCII file (fn) using the widths and variable names collected in x
SASfile <- read.fwf( fn , x$width , col.names=x$varname)
##convert character strings appropriately within data frame
#column l of the data corresponds to row l of x
#NOTE(review): the closing brace of this loop is not visible in the source as shown
for (l in 1:nrow(x)){
if (x[l,"charvar"]) SASfile[,l]<-as.character(SASfile[,l])
else SASfile[,l]<-as.numeric(SASfile[,l])
#when a divisor other than 1 was recorded, multiply the column by it and store the result as character
if (x[l,"divisor"] != 1 ) SASfile[,l] <- as.character( SASfile[,l] * as.numeric( x[l,"divisor"]) )
#make all names lowercase
names( SASfile ) <- tolower( names( SASfile ) )
#throw out any TOSS variables (toss.1 is presumably the de-duplicated name of a second filler column -- confirm)
SASfile$toss <-NULL
SASfile$toss.1 <-NULL
#output directory (hard-coded drive and folder, one sub-folder per year)
output_directory <- paste( "R:/National Health Interview Survey/Data/" , year , sep = "" )
#if the year path doesn't exist, make it! (any error from dir.create is silenced)
try( dir.create( output_directory ) , silent = T )
#NOTE(review): this changes the working directory for the rest of the session and never restores it
setwd( output_directory )
#write the table twice: as csv and as a Stata file (write.dta is presumably from the foreign package)
write.csv( SASfile , paste( fileval , ".csv" , sep = "" ) , row.names = F )
write.dta( SASfile , paste( fileval , ".dta" , sep = "" ) )
#clean up: remove every object in the workspace except the loop state and configuration listed here
#(trash itself is on the keep list, so it survives the removal)
trash<-ls()[! ls() %in% c("trash","filename","filenames","year","which_years_to_download","nhis_file_ftp","year_ftp_dir")]
for (ij in 1:length(trash)) rm(list=trash[ij])
This is a great effort for downloading NHIS files. I am trying to download the NHIS 2004 dataset using your code, and I get the following error:

Error in download.file(full_file_location, tf, mode = "wb") :
cannot open URL ''

Can you help me with this?


