Skip to content

Instantly share code, notes, and snippets.

@jbryer
Last active March 9, 2023 15:19
Show Gist options
  • Save jbryer/4497585 to your computer and use it in GitHub Desktop.
Save jbryer/4497585 to your computer and use it in GitHub Desktop.
Parses a codebook file where lines starting at column zero (far left) represet variable information (e.g. name, description, type) and indented lines (i.e. lines beginning with white space, either tabs or spaces, etc.) represent factor levels and labels.
#' Parse a codebook file with variable and level information.
#'
#' Parses a codebook file where lines starting at column zero (far left) represet
#' variable information (e.g. name, description, type) and indented lines
#' (i.e. lines beginning with white space, either tabs or spaces, etc.) represent factor
#' levels and labels.
#'
#' Note that white space at the beginning and end of each line is stripped before
#' processing that line.
#'
#' This function will return a data frame with two additional
#'
#' @param file codebook file name.
#' @param var.names the name of the columns for variable rows.
#' @param level.names the name of the columns for level rows.
#' @param var.sep the separator for variable rows.
#' @param level.sep the separator for level rows.
#' @param level.indent character vector providing character(s) at the beginning
#' of the line that indicate the line represents a factor level. Each
#' element should have 1 character as only the first character of the line
#' is compared.
#' @param var.name the name in \code{var.names} that represents the variable name.
#' This should be a valid R variable name as this will be the column name
#' in the corresponding data file, as well as the name used in the \code{list}
#' of levels stored as an attribute to the returned object.
#' @param ... currently unused.
#' @return the codebook as a data frame with one attribute, \code{levels}, that
#' is a list of data frames corresponding the levels and labels for each
#' varaible.
#' @author Jason Bryer <<jason@@bryer.org>>
#' @export
parse.codebook <- function(file,
var.names,
level.names,
level.indent=c(' ','\t'),
var.sep,
level.sep,
var.widths,
level.widths,
var.name = var.names[1],
...) {
stopifnot(var.name %in% var.names)
stripWhite <- function(x) {
sub('([[:space:]]+$)', '', sub("(^[[:space:]]+)", '', x, perl=TRUE), perl=TRUE)
}
codebook.raw <- readLines(file)
#Remove blank lines
blanklines <- which(nchar(stripWhite(codebook.raw)) == 0)
linenums <- which(!(substr(codebook.raw, 1, 1) %in% level.indent))
linenums <- linenums[!linenums %in% blanklines]
linenums.levels <- which(substr(codebook.raw, 1, 1) %in% level.indent)
linenums.levels <- linenums.levels[!linenums.levels %in% blanklines]
if(length(blanklines) > 0) {
codebook.raw <- codebook.raw[-blanklines]
}
rows <- which(!(substr(codebook.raw, 1, 1) %in% level.indent))
rows.levels <- which(substr(codebook.raw, 1, 1) %in% level.indent)
rowmapping <- data.frame(pre=linenums, post=rows)
rowmapping.levels <- data.frame(pre=linenums.levels, post=rows.levels)
codebook <- codebook.raw[rows]
if(!missing(var.sep)) { #Fields are delimited
split <- strsplit(codebook, var.sep, fixed=TRUE)
badrows <- codebook[sapply(split, length) != length(var.names)]
if(length(badrows) > 0) {
stop(paste("The following rows do not have ", length(var.names),
" values: ", paste(badrows, collapse=', '), sep=''))
}
codebook <- as.data.frame(matrix(stripWhite(
unlist(strsplit(codebook, var.sep, fixed=TRUE))),
ncol=length(var.names), byrow=TRUE,
dimnames=list(1:length(rows), var.names)),
stringsAsFactors=FALSE)
codebook$linenum <- rows
} else if(!missing(var.widths)) { #Fields are fixed with
stopifnot(length(var.names) == length(var.widths))
left <- 1
cb <- data.frame(linenum=linenums)
for(i in seq_along(var.widths)) {
cb[,var.names[i]] <- sapply(
codebook, FUN=function(x) {
stripWhite(substr(x, left, min(nchar(x), (left + var.widths[i]))))
})
left <- left + var.widths[i]
}
codebook <- cb
rm(cb)
} else {
stop("Must specify either var.sep or var.widths")
}
varsWithFactors <- which(sapply(1:(length(rows)-1),
FUN=function(x) { rows[x] + 1 != rows[x+1] }))
varlevels <- list()
for(i in seq_along(rows[varsWithFactors])) {
start <- rows[varsWithFactors][i]
end <- rows[which(rows == start) + 1]
levels.raw <- codebook.raw[ (start + 1):(end - 1) ]
if(!missing(level.widths)) { #Fixed with levels
levels.raw <- lapply(levels.raw, FUN=function(x) {
left <- 1
lc <- character()
for(i in seq_along(level.widths)) {
lc <- c(lc,
substr(x, left, min(nchar(x), (left + level.widths[[i]])))
)
left <- left + level.widths[i]
}
return(lc)
})
} else if(!missing(level.sep)) { #Delimited levels
levels.raw <- strsplit(sub('(^[[:space:]]+)', '', levels.raw, perl=TRUE),
level.sep, fixed=TRUE)
} else {
stop('Must specify either level.sep or level.widths')
}
levels.df <- data.frame(linenum=rowmapping.levels[rowmapping.levels$post > start &
rowmapping.levels$post < end, 'pre'])
for(i in seq_along(level.names)) {
levels.df[,level.names[i]] <- sapply(levels.raw, FUN=function(x) { stripWhite(x[i]) })
}
var <- codebook[codebook$linenum == rowmapping[start == rowmapping$post,'pre'], var.name]
varlevels[[var]] <- levels.df
}
codebook$isfactor <- codebook$var %in% names(varlevels)
attr(codebook, 'levels') <- varlevels
class(codebook) <- c('codebook', 'data.frame')
return(codebook)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment