public
Last active

Parses a codebook file where lines starting at column zero (far left) represet variable information (e.g. name, description, type) and indented lines (i.e. lines beginning with white space, either tabs or spaces, etc.) represent factor levels and labels.

  • Download Gist
parse.codebook.r
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
#' Parse a codebook file with variable and level information.
#'
#' Parses a codebook file where lines starting at column zero (far left) represet
#' variable information (e.g. name, description, type) and indented lines
#' (i.e. lines beginning with white space, either tabs or spaces, etc.) represent factor
#' levels and labels.
#'
#' Note that white space at the beginning and end of each line is stripped before
#' processing that line.
#'
#' This function will return a data frame with two additional
#'
#' @param file codebook file name.
#' @param var.names the name of the columns for variable rows.
#' @param level.names the name of the columns for level rows.
#' @param var.sep the separator for variable rows.
#' @param level.sep the separator for level rows.
#' @param level.indent character vector providing character(s) at the beginning
#' of the line that indicate the line represents a factor level. Each
#' element should have 1 character as only the first character of the line
#' is compared.
#' @param var.name the name in \code{var.names} that represents the variable name.
#' This should be a valid R variable name as this will be the column name
#' in the corresponding data file, as well as the name used in the \code{list}
#' of levels stored as an attribute to the returned object.
#' @param ... currently unused.
#' @return the codebook as a data frame with one attribute, \code{levels}, that
#' is a list of data frames corresponding the levels and labels for each
#' varaible.
#' @author Jason Bryer <<jason@@bryer.org>>
#' @export
parse.codebook <- function(file,
var.names,
level.names,
level.indent=c(' ','\t'),
var.sep,
level.sep,
var.widths,
level.widths,
var.name = var.names[1],
...) {
stopifnot(var.name %in% var.names)
stripWhite <- function(x) {
sub('([[:space:]]+$)', '', sub("(^[[:space:]]+)", '', x, perl=TRUE), perl=TRUE)
}
codebook.raw <- readLines(file)
#Remove blank lines
blanklines <- which(nchar(stripWhite(codebook.raw)) == 0)
linenums <- which(!(substr(codebook.raw, 1, 1) %in% level.indent))
linenums <- linenums[!linenums %in% blanklines]
linenums.levels <- which(substr(codebook.raw, 1, 1) %in% level.indent)
linenums.levels <- linenums.levels[!linenums.levels %in% blanklines]
if(length(blanklines) > 0) {
codebook.raw <- codebook.raw[-blanklines]
}
rows <- which(!(substr(codebook.raw, 1, 1) %in% level.indent))
rows.levels <- which(substr(codebook.raw, 1, 1) %in% level.indent)
rowmapping <- data.frame(pre=linenums, post=rows)
rowmapping.levels <- data.frame(pre=linenums.levels, post=rows.levels)
codebook <- codebook.raw[rows]
if(!missing(var.sep)) { #Fields are delimited
split <- strsplit(codebook, var.sep, fixed=TRUE)
badrows <- codebook[sapply(split, length) != length(var.names)]
if(length(badrows) > 0) {
stop(paste("The following rows do not have ", length(var.names),
" values: ", paste(badrows, collapse=', '), sep=''))
}
codebook <- as.data.frame(matrix(stripWhite(
unlist(strsplit(codebook, var.sep, fixed=TRUE))),
ncol=length(var.names), byrow=TRUE,
dimnames=list(1:length(rows), var.names)),
stringsAsFactors=FALSE)
codebook$linenum <- rows
} else if(!missing(var.widths)) { #Fields are fixed with
stopifnot(length(var.names) == length(var.widths))
left <- 1
cb <- data.frame(linenum=linenums)
for(i in seq_along(var.widths)) {
cb[,var.names[i]] <- sapply(
codebook, FUN=function(x) {
stripWhite(substr(x, left, min(nchar(x), (left + var.widths[i]))))
})
left <- left + var.widths[i]
}
codebook <- cb
rm(cb)
} else {
stop("Must specify either var.sep or var.widths")
}
varsWithFactors <- which(sapply(1:(length(rows)-1),
FUN=function(x) { rows[x] + 1 != rows[x+1] }))
varlevels <- list()
for(i in seq_along(rows[varsWithFactors])) {
start <- rows[varsWithFactors][i]
end <- rows[which(rows == start) + 1]
levels.raw <- codebook.raw[ (start + 1):(end - 1) ]
if(!missing(level.widths)) { #Fixed with levels
levels.raw <- lapply(levels.raw, FUN=function(x) {
left <- 1
lc <- character()
for(i in seq_along(level.widths)) {
lc <- c(lc,
substr(x, left, min(nchar(x), (left + level.widths[[i]])))
)
left <- left + level.widths[i]
}
return(lc)
})
} else if(!missing(level.sep)) { #Delimited levels
levels.raw <- strsplit(sub('(^[[:space:]]+)', '', levels.raw, perl=TRUE),
level.sep, fixed=TRUE)
} else {
stop('Must specify either level.sep or level.widths')
}
levels.df <- data.frame(linenum=rowmapping.levels[rowmapping.levels$post > start &
rowmapping.levels$post < end, 'pre'])
for(i in seq_along(level.names)) {
levels.df[,level.names[i]] <- sapply(levels.raw, FUN=function(x) { stripWhite(x[i]) })
}
var <- codebook[codebook$linenum == rowmapping[start == rowmapping$post,'pre'], var.name]
varlevels[[var]] <- levels.df
}
codebook$isfactor <- codebook$var %in% names(varlevels)
attr(codebook, 'levels') <- varlevels
class(codebook) <- c('codebook', 'data.frame')
return(codebook)
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.