Created
December 4, 2011 13:17
-
-
Save timriffe/1430171 to your computer and use it in GitHub Desktop.
HFDget(), a function for getting HFD data into R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# instructions: | |
#1) Register for the Human Fertility Database (http://www.humanfertility.org) | |
#2) Download the complete zip file from this page: http://www.humanfertility.org/cgi-bin/zipfiles.php (it's the #one labeled "All types of HFD data" | |
#3) make a folder whereever you keep your data, and call it something memorable, like, 'HFD' | |
#4) unpack the HFD zip file, which will have a couple nested folders, the last of which contains the text files #for each variable/series.- move these into your HFD directory. | |
# then the following function ought to work just fine for now: | |
HFDget <- function(abbrev = TRUE, countries = FALSE, years = FALSE, NoCohortNAs = TRUE, path) { | |
# where the files at? | |
if (missing(path)){ | |
path <- choose.dir() | |
} | |
# remove '+' and '-' | |
parseages1 <- function(x){ | |
gsub(pattern="\\-",replacement="",x) | |
} | |
parseages2 <- function(x){ | |
as.integer(gsub(pattern="\\+",replacement="",x)) | |
} | |
# descriptions | |
getdescription <- function(x){ | |
readLines(paste(path,x,sep=""),n=1) | |
} | |
HFDfiles <- list.files(path) | |
HFDshorts <- gsub(pattern = ".txt", replacement = "",HFDfiles) | |
if (abbrev==FALSE){ | |
fullnames <- unlist(sapply(HFDfiles, getdescription)) | |
selection <- HFDfiles[fullnames == select.list(fullnames)] | |
} else { | |
selection <- HFDfiles[HFDshorts == select.list(HFDshorts)] | |
} | |
DATA <- read.table(paste(path, "\\", selection, sep = ""), skip = 2, na.strings = ".", as.is = TRUE, header = TRUE) | |
# remove "+" and "-" from open ages | |
if ("Age" %in% colnames(DATA)){ | |
DATA$Age <- parseages2(parseages1(DATA$Age)) | |
} | |
# ------- select countries ----------- | |
# get a vector of country codes | |
if ("Code" %in% colnames(DATA)){ | |
ctries <- unique(DATA$Code) | |
# optional countries | |
if (is.logical(countries)){ | |
if (countries == TRUE){ | |
ctry <- select.list(ctries, multiple = TRUE) | |
ind <- DATA$Code %in% ctry | |
DATA <- DATA[ind, ] | |
} | |
} else { | |
if (is.character(countries)){ | |
ind <- DATA$Code %in% countries | |
DATA <- DATA[ind,] | |
} else { | |
cat("\n\'countries\' must be either a logical, if TRUE, a selection menu will pop up,\nor a character vector of HFD country codes\nNo special country selection will be made.") | |
} | |
} | |
} | |
# ------- select years ----------- | |
# get a vector of country codes | |
if ("Year" %in% colnames(DATA)){ | |
yrs <- unique(DATA$Year) | |
# optional countries | |
if (is.logical(years)){ | |
if (years==TRUE){ | |
yr <- select.list(yrs, multiple = TRUE) | |
ind <- DATA$Year %in% yr | |
DATA <- DATA[ind, ] | |
} | |
} else { | |
if (is.integer(years)){ | |
ind <- DATA$Year %in% years | |
DATA <- DATA[ind, ] | |
} else { | |
cat("\n\'years\' must be either a logical, if TRUE, a selection menu will pop up,\nor a numeric vector of years\nNo special country selection will be made.") | |
} | |
} | |
} | |
# in case user wants to impute values for cohorts in open age groups: | |
if (NoCohortNAs == TRUE){ | |
if ("Cohort" %in% colnames(DATA)){ | |
if (any(is.na(DATA$Cohort))){ | |
# maybe not the clearest code here: | |
ind <- DATA$Age == 12 & is.na(DATA$Cohort) | |
DATA$Cohort[ind] <- DATA$Cohort[c(FALSE,ind[-length(ind)])] | |
ind <- DATA$Age == 55 & is.na(DATA$Cohort) | |
DATA$Cohort[ind] <- DATA$Cohort[c(ind[-1],FALSE)] | |
} | |
} | |
} | |
return(DATA) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment