leeper/code.R

## code.R
read.fwf2 <- function (file, widths, header = FALSE, sep = "\t", skip = 0,
    row.names, col.names, n = -1, ...)
{
    doone <- function(x) {
        x <- substring(x, first, last)
        x[!nzchar(x)] <- NA_character_
        paste0(x, collapse = sep)
    }
    if (is.list(widths)) {
        recordlength <- length(widths)
        widths <- do.call("c", widths)
    } else {
        recordlength <- 1L
    }
    drop <- (widths < 0L)
    widths <- abs(widths)
    if (is.character(file)) {
        file <- file(file, "rt")
        on.exit(close(file), add = TRUE)
    } else if (!isOpen(file)) {
        open(file, "rt")
        on.exit(close(file), add = TRUE)
    }
    if (skip)
        readLines(file, n = skip)
    if (header) {
        headerline <- readLines(file, n = 1L)
        text[1] <- headerline
    }
    raw <- readLines(file, n = n)
    nread <- length(raw)
    if (recordlength > 1L && nread%%recordlength) {
        raw <- raw[1L:(nread - nread%%recordlength)]
        warning(sprintf(ngettext(nread%%recordlength, "last record incomplete, %d line discarded",
            "last record incomplete, %d lines discarded"),
            nread%%recordlength), domain = NA)
    }
    if (recordlength > 1L) {
        raw <- matrix(raw, nrow = recordlength)
        raw <- apply(raw, 2L, paste, collapse = "")
    }
    st <- c(1L, 1L + cumsum(widths))
    first <- st[-length(st)][!drop]
    last <- cumsum(widths)[!drop]
    if(header)
        text <- c(headerline, sapply(raw, doone))
    else
        text <- sapply(raw, doone)
    read.table(text = text, header = header, sep = sep, row.names = row.names,
        col.names = col.names, quote = "", ...)
}

## microbenchmarks.R
library("microbenchmark")
set.seed(1)
ff <- tempfile()
dat <- sapply(1:1e4, function(x) paste0(sample(0:9), collapse = ""))
cat(file = ff, dat, sep = "\n")

identical(read.fwf(ff, widths = c(1,2,3,5)), read.fwf2(ff, widths = c(1,2,3,5)))
# [1] TRUE

library("readr")
identical(read.fwf(ff, widths = c(1,2,3,5)), read_fwf(ff, fwf_widths(c(1,2,3,5))))
# [1] FALSE

library("LaF")
identical(read.fwf(ff, widths = c(1,2,3,5)),
          laf_open_fwf(ff, column_types = rep("numeric", 4), column_widths = c(1,2,3,5)))
# [1] FALSE

utils <- function() read.fwf(ff, widths = c(1,2,3,5))
leeper <- function() read.fwf2(ff, widths = c(1,2,3,5))
hadley <- function() read_fwf(ff, fwf_widths(c(1,2,3,5)))
laf <- function() laf_open_fwf(ff, column_types = rep("numeric", 4), column_widths = c(1,2,3,5))
microbenchmark(utils(), leeper(), hadley(), laf(), times = 25)
# Unit: milliseconds
#      expr        min         lq       mean     median         uq        max neval cld
#   utils() 650.815590 675.114110 690.610924 690.093809 693.658223 760.670054    25   c
#  leeper() 395.106120 424.477414 431.464297 432.641197 437.710894 480.624730    25  b
#  hadley()   8.159549   9.066336  11.753338   9.719871  15.496214  18.416717    25 a
#     laf()   1.226242   1.551084   1.639269   1.628832   1.672708   2.783485    25 a


utils <- function() read.fwf(ff, colClasses = rep("numeric", 4), widths = c(1,2,3,5))
leeper <- function() read.fwf2(ff, colClasses = rep("numeric", 4), widths = c(1,2,3,5))
hadley <- function() read_fwf(ff, col_types = "iiii", fwf_widths(c(1,2,3,5)))
laf <- function() laf_open_fwf(ff, column_types = rep("numeric", 4), column_widths = c(1,2,3,5))
microbenchmark(utils(), leeper(), hadley(), laf(), times = 25)
# Unit: milliseconds
#      expr        min         lq       mean      median          uq         max neval  cld
#   utils() 648.555934 660.188679  684.65795  668.364392  686.839611  766.526839    25   c
#  leeper() 397.034390 411.351318  426.30090  419.492392  430.818385  534.639401    25  b
#  hadley() 928.281034 977.235245 1012.91864 1010.140934 1040.972865 1099.289389    25    d
#     laf()   1.270504   1.587264    1.59236    1.645381    1.686949    1.795871    25 a
	read.fwf2 <- function (file, widths, header = FALSE, sep = "\t", skip = 0,
	row.names, col.names, n = -1, ...)
	{
	doone <- function(x) {
	x <- substring(x, first, last)
	x[!nzchar(x)] <- NA_character_
	paste0(x, collapse = sep)
	}
	if (is.list(widths)) {
	recordlength <- length(widths)
	widths <- do.call("c", widths)
	} else {
	recordlength <- 1L
	}
	drop <- (widths < 0L)
	widths <- abs(widths)
	if (is.character(file)) {
	file <- file(file, "rt")
	on.exit(close(file), add = TRUE)
	} else if (!isOpen(file)) {
	open(file, "rt")
	on.exit(close(file), add = TRUE)
	}
	if (skip)
	readLines(file, n = skip)
	if (header) {
	headerline <- readLines(file, n = 1L)
	text[1] <- headerline
	}
	raw <- readLines(file, n = n)
	nread <- length(raw)
	if (recordlength > 1L && nread%%recordlength) {
	raw <- raw[1L:(nread - nread%%recordlength)]
	warning(sprintf(ngettext(nread%%recordlength, "last record incomplete, %d line discarded",
	"last record incomplete, %d lines discarded"),
	nread%%recordlength), domain = NA)
	}
	if (recordlength > 1L) {
	raw <- matrix(raw, nrow = recordlength)
	raw <- apply(raw, 2L, paste, collapse = "")
	}
	st <- c(1L, 1L + cumsum(widths))
	first <- st[-length(st)][!drop]
	last <- cumsum(widths)[!drop]
	if(header)
	text <- c(headerline, sapply(raw, doone))
	else
	text <- sapply(raw, doone)
	read.table(text = text, header = header, sep = sep, row.names = row.names,
	col.names = col.names, quote = "", ...)
	}
	library("microbenchmark")
	set.seed(1)
	ff <- tempfile()
	dat <- sapply(1:1e4, function(x) paste0(sample(0:9), collapse = ""))
	cat(file = ff, dat, sep = "\n")

	identical(read.fwf(ff, widths = c(1,2,3,5)), read.fwf2(ff, widths = c(1,2,3,5)))
	# [1] TRUE

	library("readr")
	identical(read.fwf(ff, widths = c(1,2,3,5)), read_fwf(ff, fwf_widths(c(1,2,3,5))))
	# [1] FALSE

	library("LaF")
	identical(read.fwf(ff, widths = c(1,2,3,5)),
	laf_open_fwf(ff, column_types = rep("numeric", 4), column_widths = c(1,2,3,5)))
	# [1] FALSE

	utils <- function() read.fwf(ff, widths = c(1,2,3,5))
	leeper <- function() read.fwf2(ff, widths = c(1,2,3,5))
	hadley <- function() read_fwf(ff, fwf_widths(c(1,2,3,5)))
	laf <- function() laf_open_fwf(ff, column_types = rep("numeric", 4), column_widths = c(1,2,3,5))
	microbenchmark(utils(), leeper(), hadley(), laf(), times = 25)
	# Unit: milliseconds
	# expr min lq mean median uq max neval cld
	# utils() 650.815590 675.114110 690.610924 690.093809 693.658223 760.670054 25 c
	# leeper() 395.106120 424.477414 431.464297 432.641197 437.710894 480.624730 25 b
	# hadley() 8.159549 9.066336 11.753338 9.719871 15.496214 18.416717 25 a
	# laf() 1.226242 1.551084 1.639269 1.628832 1.672708 2.783485 25 a


	utils <- function() read.fwf(ff, colClasses = rep("numeric", 4), widths = c(1,2,3,5))
	leeper <- function() read.fwf2(ff, colClasses = rep("numeric", 4), widths = c(1,2,3,5))
	hadley <- function() read_fwf(ff, col_types = "iiii", fwf_widths(c(1,2,3,5)))
	laf <- function() laf_open_fwf(ff, column_types = rep("numeric", 4), column_widths = c(1,2,3,5))
	microbenchmark(utils(), leeper(), hadley(), laf(), times = 25)
	# Unit: milliseconds
	# expr min lq mean median uq max neval cld
	# utils() 648.555934 660.188679 684.65795 668.364392 686.839611 766.526839 25 c
	# leeper() 397.034390 411.351318 426.30090 419.492392 430.818385 534.639401 25 b
	# hadley() 928.281034 977.235245 1012.91864 1010.140934 1040.972865 1099.289389 25 d
	# laf() 1.270504 1.587264 1.59236 1.645381 1.686949 1.795871 25 a