HenrikBengtsson/readtable-ex.R

## readtable-ex.R
source("https://gist.githubusercontent.com/HenrikBengtsson/ed1eeb41a1b4d6c43b47/raw/ebe58f76e518dd014423bea466a5c93d2efd3c99/readtable-fix.R")

kkk <- c("a\tb",
         "3.14\tx")

colClasses <- c(a="numeric", b="character")
data <- read.table(textConnection(kkk),
                   sep="\t",
                   header = TRUE,
                   colClasses = colClasses)
str(data)

## Does not work
data <- read.table(textConnection(kkk),
                   sep="\t",
                   header = TRUE,
                   colClasses = rev(colClasses))
str(data)

## readtable-fix.R
C_readtablehead <- utils:::C_readtablehead

read.table <-
function(file, header = FALSE, sep = "", quote = "\"'", dec = ".",
	 numerals = c("allow.loss", "warn.loss", "no.loss"),
         row.names, col.names, as.is = !stringsAsFactors,
         na.strings = "NA", colClasses = NA,
         nrows = -1, skip = 0,
         check.names = TRUE, fill = !blank.lines.skip,
         strip.white = FALSE, blank.lines.skip = TRUE,
         comment.char = "#", allowEscapes = FALSE, flush = FALSE,
         stringsAsFactors = default.stringsAsFactors(),
         fileEncoding = "", encoding = "unknown", text, skipNul = FALSE)
{
    if (missing(file) && !missing(text)) {
	file <- textConnection(text, encoding = "UTF-8")
	encoding <- "UTF-8"
	on.exit(close(file))
    }
    if(is.character(file)) {
        file <- if(nzchar(fileEncoding))
            file(file, "rt", encoding = fileEncoding) else file(file, "rt")
        on.exit(close(file))
    }
    if(!inherits(file, "connection"))
        stop("'file' must be a character string or connection")
    if(!isOpen(file, "rt")) {
        open(file, "rt")
        on.exit(close(file))
    }
    pbEncoding <- if (encoding %in% c("", "bytes", "UTF-8")) encoding else "bytes"
    numerals <- match.arg(numerals)

    if(skip > 0L) readLines(file, skip)
    ## read a few lines to determine header, no of cols.
    nlines <- n0lines <- if (nrows < 0L) 5 else min(5L, (header + nrows))

    lines <- .External(C_readtablehead, file, nlines, comment.char,
                       blank.lines.skip, quote, sep, skipNul)
    if (encoding %in% c("UTF-8", "latin1")) Encoding(lines) <- encoding
    nlines <- length(lines)
    if(!nlines) {
        if(missing(col.names)) stop("no lines available in input")
        rlabp <- FALSE
        cols <- length(col.names)
    } else {
        if(all(!nzchar(lines))) stop("empty beginning of file")
        if(nlines < n0lines && file == 0L)  { # stdin() has reached EOF
            pushBack(c(lines, lines, ""), file, encoding = pbEncoding)
            on.exit((clearPushBack(stdin())))
        } else pushBack(c(lines, lines), file, encoding = pbEncoding)
        first <- scan(file, what = "", sep = sep, quote = quote,
                      nlines = 1, quiet = TRUE, skip = 0,
                      strip.white = TRUE,
                      blank.lines.skip = blank.lines.skip,
                      comment.char = comment.char, allowEscapes = allowEscapes,
                      encoding = encoding, skipNul = skipNul)
        col1 <- if(missing(col.names)) length(first) else length(col.names)
        col <- numeric(nlines - 1L)
        if (nlines > 1L)
            for (i in seq_along(col))
                col[i] <- length(scan(file, what = "", sep = sep,
                                      quote = quote,
                                      nlines = 1, quiet = TRUE, skip = 0,
                                      strip.white = strip.white,
                                      blank.lines.skip = blank.lines.skip,
                                      comment.char = comment.char,
                                      allowEscapes = allowEscapes,
				      encoding = encoding,
                                      skipNul = skipNul))
        cols <- max(col1, col)

        ##	basic column counting and header determination;
        ##	rlabp (logical) := it looks like we have column names

        rlabp <- (cols - col1) == 1L
        if(rlabp && missing(header))
            header <- TRUE
        if(!header) rlabp <- FALSE

        if (header) {
            ## skip over header
           .External(C_readtablehead, file, 1L, comment.char,
                     blank.lines.skip, quote, sep, skipNul)
            if(missing(col.names)) col.names <- first
            else if(length(first) != length(col.names))
                warning("header and 'col.names' are of different lengths")

        } else if (missing(col.names))
            col.names <- paste0("V", 1L:cols)
        if(length(col.names) + rlabp < cols)
            stop("more columns than column names")
        if(fill && length(col.names) > cols)
            cols <- length(col.names)
        if(!fill && cols > 0L && length(col.names) > cols)
            stop("more column names than columns")
        if(cols == 0L) stop("first five rows are empty: giving up")
    }

    if(check.names) col.names <- make.names(col.names, unique = TRUE)
    if (rlabp) col.names <- c("row.names", col.names)

    nmColClasses <- names(colClasses)
    if(length(colClasses) <= cols)
        if(is.null(nmColClasses)) {
            colClasses <- rep_len(colClasses, cols)
        } else {
            tmp <- rep_len(NA_character_, cols)
            names(tmp) <- col.names
            i <- match(nmColClasses, col.names, 0L)
            if(any(i <= 0L))
                warning("not all columns named in 'colClasses' exist")
            tmp[ i[i > 0L] ] <- colClasses
            colClasses <- tmp
        }


    ##	set up for the scan of the file.
    ##	we read unknown values as character strings and convert later.

    what <- rep.int(list(""), cols)
    names(what) <- col.names

    colClasses[colClasses %in% c("real", "double")] <- "numeric"
    known <- colClasses %in% c("logical", "integer", "numeric", "complex",
                               "character", "raw")
    what[known] <- sapply(colClasses[known], do.call, list(0))
    what[colClasses %in% "NULL"] <- list(NULL)
    keep <- !sapply(what, is.null)

    data <- scan(file = file, what = what, sep = sep, quote = quote,
                 dec = dec, nmax = nrows, skip = 0,
		 na.strings = na.strings, quiet = TRUE, fill = fill,
                 strip.white = strip.white,
                 blank.lines.skip = blank.lines.skip, multi.line = FALSE,
                 comment.char = comment.char, allowEscapes = allowEscapes,
                 flush = flush, encoding = encoding, skipNul = skipNul)

    nlines <- length(data[[ which.max(keep) ]])

    ##	now we have the data;
    ##	convert to numeric or factor variables
    ##	(depending on the specified value of "as.is").
    ##	we do this here so that columns match up

    if(cols != length(data)) { # this should never happen
	warning("cols = ", cols, " != length(data) = ", length(data),
                domain = NA)
	cols <- length(data)
    }

    if(is.logical(as.is)) {
	as.is <- rep_len(as.is, cols)
    } else if(is.numeric(as.is)) {
	if(any(as.is < 1 | as.is > cols))
	    stop("invalid numeric 'as.is' expression")
	i <- rep.int(FALSE, cols)
	i[as.is] <- TRUE
	as.is <- i
    } else if(is.character(as.is)) {
        i <- match(as.is, col.names, 0L)
        if(any(i <= 0L))
            warning("not all columns named in 'as.is' exist")
        i <- i[i > 0L]
        as.is <- rep.int(FALSE, cols)
        as.is[i] <- TRUE
    } else if (length(as.is) != cols)
	stop(gettextf("'as.is' has the wrong length %d  != cols = %d",
                     length(as.is), cols), domain = NA)

    do <- keep & !known # & !as.is
    if(rlabp) do[1L] <- FALSE # don't convert "row.names"
    for (i in (1L:cols)[do]) {
        data[[i]] <-
            if (is.na(colClasses[i]))
                type.convert(data[[i]], as.is = as.is[i], dec=dec,
			     numerals=numerals, na.strings = character(0L))
        ## as na.strings have already been converted to <NA>
            else if (colClasses[i] == "factor") as.factor(data[[i]])
            else if (colClasses[i] == "Date") as.Date(data[[i]])
            else if (colClasses[i] == "POSIXct") as.POSIXct(data[[i]])
            else methods::as(data[[i]], colClasses[i])
    }

    ##	now determine row names
    compactRN <- TRUE
    if (missing(row.names)) {
	if (rlabp) {
	    row.names <- data[[1L]]
	    data <- data[-1L]
            keep <- keep[-1L]
            compactRN <- FALSE
	}
	else row.names <- .set_row_names(as.integer(nlines))
    } else if (is.null(row.names)) {
	row.names <- .set_row_names(as.integer(nlines))
    } else if (is.character(row.names)) {
        compactRN <- FALSE
	if (length(row.names) == 1L) {
	    rowvar <- (1L:cols)[match(col.names, row.names, 0L) == 1L]
	    row.names <- data[[rowvar]]
	    data <- data[-rowvar]
            keep <- keep[-rowvar]
	}
    } else if (is.numeric(row.names) && length(row.names) == 1L) {
        compactRN <- FALSE
	rlabp <- row.names
	row.names <- data[[rlabp]]
	data <- data[-rlabp]
        keep <- keep[-rlabp]
    } else stop("invalid 'row.names' specification")
    data <- data[keep]

    ## rownames<- is interpreted, so avoid it for efficiency (it will copy)
    if(is.object(row.names) || !(is.integer(row.names)) )
        row.names <- as.character(row.names)
    if(!compactRN) {
        if (length(row.names) != nlines)
            stop("invalid 'row.names' length")
        if (anyDuplicated(row.names))
            stop("duplicate 'row.names' are not allowed")
        if (anyNA(row.names))
            stop("missing values in 'row.names' are not allowed")
    }

    ##	this is extremely underhanded
    ##	we should use the constructor function ...
    ##	don't try this at home kids

    class(data) <- "data.frame"
    attr(data, "row.names") <- row.names
    data
}
	source("https://gist.githubusercontent.com/HenrikBengtsson/ed1eeb41a1b4d6c43b47/raw/ebe58f76e518dd014423bea466a5c93d2efd3c99/readtable-fix.R")

	kkk <- c("a\tb",
	"3.14\tx")

	colClasses <- c(a="numeric", b="character")
	data <- read.table(textConnection(kkk),
	sep="\t",
	header = TRUE,
	colClasses = colClasses)
	str(data)

	## Does not work
	data <- read.table(textConnection(kkk),
	sep="\t",
	header = TRUE,
	colClasses = rev(colClasses))
	str(data)
	C_readtablehead <- utils:::C_readtablehead

	read.table <-
	function(file, header = FALSE, sep = "", quote = "\"'", dec = ".",
	numerals = c("allow.loss", "warn.loss", "no.loss"),
	row.names, col.names, as.is = !stringsAsFactors,
	na.strings = "NA", colClasses = NA,
	nrows = -1, skip = 0,
	check.names = TRUE, fill = !blank.lines.skip,
	strip.white = FALSE, blank.lines.skip = TRUE,
	comment.char = "#", allowEscapes = FALSE, flush = FALSE,
	stringsAsFactors = default.stringsAsFactors(),
	fileEncoding = "", encoding = "unknown", text, skipNul = FALSE)
	{
	if (missing(file) && !missing(text)) {
	file <- textConnection(text, encoding = "UTF-8")
	encoding <- "UTF-8"
	on.exit(close(file))
	}
	if(is.character(file)) {
	file <- if(nzchar(fileEncoding))
	file(file, "rt", encoding = fileEncoding) else file(file, "rt")
	on.exit(close(file))
	}
	if(!inherits(file, "connection"))
	stop("'file' must be a character string or connection")
	if(!isOpen(file, "rt")) {
	open(file, "rt")
	on.exit(close(file))
	}
	pbEncoding <- if (encoding %in% c("", "bytes", "UTF-8")) encoding else "bytes"
	numerals <- match.arg(numerals)

	if(skip > 0L) readLines(file, skip)
	## read a few lines to determine header, no of cols.
	nlines <- n0lines <- if (nrows < 0L) 5 else min(5L, (header + nrows))

	lines <- .External(C_readtablehead, file, nlines, comment.char,
	blank.lines.skip, quote, sep, skipNul)
	if (encoding %in% c("UTF-8", "latin1")) Encoding(lines) <- encoding
	nlines <- length(lines)
	if(!nlines) {
	if(missing(col.names)) stop("no lines available in input")
	rlabp <- FALSE
	cols <- length(col.names)
	} else {
	if(all(!nzchar(lines))) stop("empty beginning of file")
	if(nlines < n0lines && file == 0L) { # stdin() has reached EOF
	pushBack(c(lines, lines, ""), file, encoding = pbEncoding)
	on.exit((clearPushBack(stdin())))
	} else pushBack(c(lines, lines), file, encoding = pbEncoding)
	first <- scan(file, what = "", sep = sep, quote = quote,
	nlines = 1, quiet = TRUE, skip = 0,
	strip.white = TRUE,
	blank.lines.skip = blank.lines.skip,
	comment.char = comment.char, allowEscapes = allowEscapes,
	encoding = encoding, skipNul = skipNul)
	col1 <- if(missing(col.names)) length(first) else length(col.names)
	col <- numeric(nlines - 1L)
	if (nlines > 1L)
	for (i in seq_along(col))
	col[i] <- length(scan(file, what = "", sep = sep,
	quote = quote,
	nlines = 1, quiet = TRUE, skip = 0,
	strip.white = strip.white,
	blank.lines.skip = blank.lines.skip,
	comment.char = comment.char,
	allowEscapes = allowEscapes,
	encoding = encoding,
	skipNul = skipNul))
	cols <- max(col1, col)

	## basic column counting and header determination;
	## rlabp (logical) := it looks like we have column names

	rlabp <- (cols - col1) == 1L
	if(rlabp && missing(header))
	header <- TRUE
	if(!header) rlabp <- FALSE

	if (header) {
	## skip over header
	.External(C_readtablehead, file, 1L, comment.char,
	blank.lines.skip, quote, sep, skipNul)
	if(missing(col.names)) col.names <- first
	else if(length(first) != length(col.names))
	warning("header and 'col.names' are of different lengths")

	} else if (missing(col.names))
	col.names <- paste0("V", 1L:cols)
	if(length(col.names) + rlabp < cols)
	stop("more columns than column names")
	if(fill && length(col.names) > cols)
	cols <- length(col.names)
	if(!fill && cols > 0L && length(col.names) > cols)
	stop("more column names than columns")
	if(cols == 0L) stop("first five rows are empty: giving up")
	}

	if(check.names) col.names <- make.names(col.names, unique = TRUE)
	if (rlabp) col.names <- c("row.names", col.names)

	nmColClasses <- names(colClasses)
	if(length(colClasses) <= cols)
	if(is.null(nmColClasses)) {
	colClasses <- rep_len(colClasses, cols)
	} else {
	tmp <- rep_len(NA_character_, cols)
	names(tmp) <- col.names
	i <- match(nmColClasses, col.names, 0L)
	if(any(i <= 0L))
	warning("not all columns named in 'colClasses' exist")
	tmp[ i[i > 0L] ] <- colClasses
	colClasses <- tmp
	}


	## set up for the scan of the file.
	## we read unknown values as character strings and convert later.

	what <- rep.int(list(""), cols)
	names(what) <- col.names

	colClasses[colClasses %in% c("real", "double")] <- "numeric"
	known <- colClasses %in% c("logical", "integer", "numeric", "complex",
	"character", "raw")
	what[known] <- sapply(colClasses[known], do.call, list(0))
	what[colClasses %in% "NULL"] <- list(NULL)
	keep <- !sapply(what, is.null)

	data <- scan(file = file, what = what, sep = sep, quote = quote,
	dec = dec, nmax = nrows, skip = 0,
	na.strings = na.strings, quiet = TRUE, fill = fill,
	strip.white = strip.white,
	blank.lines.skip = blank.lines.skip, multi.line = FALSE,
	comment.char = comment.char, allowEscapes = allowEscapes,
	flush = flush, encoding = encoding, skipNul = skipNul)

	nlines <- length(data[[ which.max(keep) ]])

	## now we have the data;
	## convert to numeric or factor variables
	## (depending on the specified value of "as.is").
	## we do this here so that columns match up

	if(cols != length(data)) { # this should never happen
	warning("cols = ", cols, " != length(data) = ", length(data),
	domain = NA)
	cols <- length(data)
	}

	if(is.logical(as.is)) {
	as.is <- rep_len(as.is, cols)
	} else if(is.numeric(as.is)) {
	if(any(as.is < 1 \| as.is > cols))
	stop("invalid numeric 'as.is' expression")
	i <- rep.int(FALSE, cols)
	i[as.is] <- TRUE
	as.is <- i
	} else if(is.character(as.is)) {
	i <- match(as.is, col.names, 0L)
	if(any(i <= 0L))
	warning("not all columns named in 'as.is' exist")
	i <- i[i > 0L]
	as.is <- rep.int(FALSE, cols)
	as.is[i] <- TRUE
	} else if (length(as.is) != cols)
	stop(gettextf("'as.is' has the wrong length %d != cols = %d",
	length(as.is), cols), domain = NA)

	do <- keep & !known # & !as.is
	if(rlabp) do[1L] <- FALSE # don't convert "row.names"
	for (i in (1L:cols)[do]) {
	data[[i]] <-
	if (is.na(colClasses[i]))
	type.convert(data[[i]], as.is = as.is[i], dec=dec,
	numerals=numerals, na.strings = character(0L))
	## as na.strings have already been converted to <NA>
	else if (colClasses[i] == "factor") as.factor(data[[i]])
	else if (colClasses[i] == "Date") as.Date(data[[i]])
	else if (colClasses[i] == "POSIXct") as.POSIXct(data[[i]])
	else methods::as(data[[i]], colClasses[i])
	}

	## now determine row names
	compactRN <- TRUE
	if (missing(row.names)) {
	if (rlabp) {
	row.names <- data[[1L]]
	data <- data[-1L]
	keep <- keep[-1L]
	compactRN <- FALSE
	}
	else row.names <- .set_row_names(as.integer(nlines))
	} else if (is.null(row.names)) {
	row.names <- .set_row_names(as.integer(nlines))
	} else if (is.character(row.names)) {
	compactRN <- FALSE
	if (length(row.names) == 1L) {
	rowvar <- (1L:cols)[match(col.names, row.names, 0L) == 1L]
	row.names <- data[[rowvar]]
	data <- data[-rowvar]
	keep <- keep[-rowvar]
	}
	} else if (is.numeric(row.names) && length(row.names) == 1L) {
	compactRN <- FALSE
	rlabp <- row.names
	row.names <- data[[rlabp]]
	data <- data[-rlabp]
	keep <- keep[-rlabp]
	} else stop("invalid 'row.names' specification")
	data <- data[keep]

	## rownames<- is interpreted, so avoid it for efficiency (it will copy)
	if(is.object(row.names) \|\| !(is.integer(row.names)) )
	row.names <- as.character(row.names)
	if(!compactRN) {
	if (length(row.names) != nlines)
	stop("invalid 'row.names' length")
	if (anyDuplicated(row.names))
	stop("duplicate 'row.names' are not allowed")
	if (anyNA(row.names))
	stop("missing values in 'row.names' are not allowed")
	}

	## this is extremely underhanded
	## we should use the constructor function ...
	## don't try this at home kids

	class(data) <- "data.frame"
	attr(data, "row.names") <- row.names
	data
	}