Skip to content

Instantly share code, notes, and snippets.

@leeper
Forked from jbryer/parse.codebook.r
Last active August 29, 2015 14:17
Show Gist options
  • Save leeper/415a53442b5ae9fc4f6f to your computer and use it in GitHub Desktop.
Save leeper/415a53442b5ae9fc4f6f to your computer and use it in GitHub Desktop.
#' Parse a codebook file with variable and level information.
#'
#' Parses a codebook file where lines starting at column zero (far left) represet
#' variable information (e.g. name, description, type) and indented lines
#' (i.e. lines beginning with white space, either tabs or spaces, etc.) represent factor
#' levels and labels.
#'
#' Note that white space at the beginning and end of each line is stripped before
#' processing that line.
#'
#' This function will return a data frame with two additional
#'
#' @param file codebook file name.
#' @param var.names the name of the columns for variable rows.
#' @param level.names the name of the columns for level rows.
#' @param var.sep the separator for variable rows.
#' @param level.sep the separator for level rows.
#' @param level.indent character vector providing character(s) at the beginning
#' of the line that indicate the line represents a factor level. Each
#' element should have 1 character as only the first character of the line
#' is compared.
#' @param var.name the name in \code{var.names} that represents the variable name.
#' This should be a valid R variable name as this will be the column name
#' in the corresponding data file, as well as the name used in the \code{list}
#' of levels stored as an attribute to the returned object.
#' @param ... currently unused.
#' @return the codebook as a data frame with one attribute, \code{levels}, that
#' is a list of data frames corresponding the levels and labels for each
#' varaible.
#' @author Jason Bryer <<jason@@bryer.org>>
#' @export
parse.codebook <- function(file,
var.names,
level.names,
level.indent=c(' ','\t'),
var.sep,
level.sep,
var.widths,
level.widths,
var.name = var.names[1],
...) {
stopifnot(var.name %in% var.names)
stripWhite <- function(x) {
sub('([[:space:]]+$)', '', sub("(^[[:space:]]+)", '', x, perl=TRUE), perl=TRUE)
}
codebook.raw <- readLines(file)
#Remove blank lines
blanklines <- which(nchar(stripWhite(codebook.raw)) == 0)
linenums <- which(!(substr(codebook.raw, 1, 1) %in% level.indent))
linenums <- linenums[!linenums %in% blanklines]
linenums.levels <- which(substr(codebook.raw, 1, 1) %in% level.indent)
linenums.levels <- linenums.levels[!linenums.levels %in% blanklines]
if(length(blanklines) > 0) {
codebook.raw <- codebook.raw[-blanklines]
}
rows <- which(!(substr(codebook.raw, 1, 1) %in% level.indent))
rows.levels <- which(substr(codebook.raw, 1, 1) %in% level.indent)
rowmapping <- data.frame(pre=linenums, post=rows)
rowmapping.levels <- data.frame(pre=linenums.levels, post=rows.levels)
codebook <- codebook.raw[rows]
if(!missing(var.sep)) { #Fields are delimited
split <- strsplit(codebook, var.sep, fixed=TRUE)
badrows <- codebook[sapply(split, length) != length(var.names)]
if(length(badrows) > 0) {
stop(paste("The following rows do not have ", length(var.names),
" values: ", paste(badrows, collapse=', '), sep=''))
}
codebook <- as.data.frame(matrix(stripWhite(
unlist(strsplit(codebook, var.sep, fixed=TRUE))),
ncol=length(var.names), byrow=TRUE,
dimnames=list(1:length(rows), var.names)),
stringsAsFactors=FALSE)
codebook$linenum <- rows
} else if(!missing(var.widths)) { #Fields are fixed with
stopifnot(length(var.names) == length(var.widths))
left <- 1
cb <- data.frame(linenum=linenums)
for(i in seq_along(var.widths)) {
cb[,var.names[i]] <- sapply(
codebook, FUN=function(x) {
stripWhite(substr(x, left, min(nchar(x), (left + var.widths[i]))))
})
left <- left + var.widths[i]
}
codebook <- cb
rm(cb)
} else {
stop("Must specify either var.sep or var.widths")
}
varsWithFactors <- which(sapply(1:(length(rows)-1),
FUN=function(x) { rows[x] + 1 != rows[x+1] }))
varlevels <- list()
for(i in seq_along(rows[varsWithFactors])) {
start <- rows[varsWithFactors][i]
end <- rows[which(rows == start) + 1]
levels.raw <- codebook.raw[ (start + 1):(end - 1) ]
if(!missing(level.widths)) { #Fixed with levels
levels.raw <- lapply(levels.raw, FUN=function(x) {
left <- 1
lc <- character()
for(i in seq_along(level.widths)) {
lc <- c(lc,
substr(x, left, min(nchar(x), (left + level.widths[[i]])))
)
left <- left + level.widths[i]
}
return(lc)
})
} else if(!missing(level.sep)) { #Delimited levels
levels.raw <- strsplit(sub('(^[[:space:]]+)', '', levels.raw, perl=TRUE),
level.sep, fixed=TRUE)
} else {
stop('Must specify either level.sep or level.widths')
}
levels.df <- data.frame(linenum=rowmapping.levels[rowmapping.levels$post > start &
rowmapping.levels$post < end, 'pre'])
for(i in seq_along(level.names)) {
levels.df[,level.names[i]] <- sapply(levels.raw, FUN=function(x) { stripWhite(x[i]) })
}
var <- codebook[codebook$linenum == rowmapping[start == rowmapping$post,'pre'], var.name]
varlevels[[var]] <- levels.df
}
codebook$isfactor <- codebook$var %in% names(varlevels)
attr(codebook, 'levels') <- varlevels
class(codebook) <- c('codebook', 'data.frame')
return(codebook)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment