Skip to content

Instantly share code, notes, and snippets.

@ajdamico
Created September 19, 2011 22:45
Show Gist options
  • Save ajdamico/1227815 to your computer and use it in GitHub Desktop.
Save ajdamico/1227815 to your computer and use it in GitHub Desktop.
download fifty years of National Health Interview Survey documentation PDFs
# Download every documentation file published for the National Health
# Interview Survey (NHIS), 1963 - 2010, from the CDC FTP site into one
# sub-folder per year beneath the output folder set below.

# install RCurl on your version of R if you don't already have it
# just run this once
# install.packages("RCurl")

# program start

# load the RCurl package (getURL is used to read FTP directory listings)
library(RCurl)

# set your output folder - this is where the pdfs will get saved
setwd("R:/National Health Interview Survey/documentation")

# main NHIS FTP directory with documentation
nhis_doc_ftp <- "ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Dataset_Documentation/NHIS/"

# loop through 1963 - 2010
for (year in 1963:2010) {

  # create the full string to the FTP folder of the current year
  year_ftp_dir <- paste0(nhis_doc_ftp, year, "/")

  # figure out what all of the files within that folder are named;
  # a year whose listing cannot be read is reported and skipped rather than
  # aborting the remaining years
  listing <- tryCatch(
    getURL(year_ftp_dir, dirlistonly = TRUE),
    error = function(e) {
      warning(
        "could not list ", year_ftp_dir, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(listing)) {
    next
  }

  # keep the names exactly as the server spells them: they are needed
  # verbatim to build the download URL (FTP paths can be case-sensitive)
  filenames <- strsplit(listing, "\r*\n")[[1]]

  # drop blank entries left over from splitting the listing
  filenames <- filenames[nzchar(filenames)]

  # as written, the program downloads EVERY file in each year's directory
  # however, if you only want to download files with "person" or "core"
  # in their filename, uncomment this line..
  # save only the files with the word "person" or "core" in them
  # filenames <- filenames[grepl("person|core", filenames, ignore.case = TRUE)]

  # nothing to fetch for this year, so don't create an empty folder
  if (length(filenames) == 0) {
    next
  }

  # determine the year directory (no trailing slash: file.exists() does not
  # reliably recognise directories written with one on Windows)
  pth <- file.path(".", year)

  # if the directory doesn't exist, make it! (once per year, not per file)
  if (!file.exists(pth)) {
    dir.create(pth)
  }

  # loop through all of those files and save them to the year directory
  for (i in filenames) {

    # local copies are saved under a lower-case name, as before
    destination <- file.path(pth, tolower(i))

    # one failed download (for example an entry that is really a
    # sub-directory) is reported and skipped instead of stopping the run
    tryCatch(
      download.file(paste0(year_ftp_dir, i), destination, mode = "wb"),
      error = function(e) {
        warning(
          "could not download ", year_ftp_dir, i, ": ", conditionMessage(e),
          call. = FALSE
        )
      }
    )
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment