Skip to content

Instantly share code, notes, and snippets.

@leeper
Last active August 29, 2015 14:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save leeper/19b32b14ac746734cc91 to your computer and use it in GitHub Desktop.
Save leeper/19b32b14ac746734cc91 to your computer and use it in GitHub Desktop.
Reading fixed width format data
read.fwf2 <- function (file, widths, header = FALSE, sep = "\t", skip = 0,
row.names, col.names, n = -1, ...)
{
doone <- function(x) {
x <- substring(x, first, last)
x[!nzchar(x)] <- NA_character_
paste0(x, collapse = sep)
}
if (is.list(widths)) {
recordlength <- length(widths)
widths <- do.call("c", widths)
} else {
recordlength <- 1L
}
drop <- (widths < 0L)
widths <- abs(widths)
if (is.character(file)) {
file <- file(file, "rt")
on.exit(close(file), add = TRUE)
} else if (!isOpen(file)) {
open(file, "rt")
on.exit(close(file), add = TRUE)
}
if (skip)
readLines(file, n = skip)
if (header) {
headerline <- readLines(file, n = 1L)
text[1] <- headerline
}
raw <- readLines(file, n = n)
nread <- length(raw)
if (recordlength > 1L && nread%%recordlength) {
raw <- raw[1L:(nread - nread%%recordlength)]
warning(sprintf(ngettext(nread%%recordlength, "last record incomplete, %d line discarded",
"last record incomplete, %d lines discarded"),
nread%%recordlength), domain = NA)
}
if (recordlength > 1L) {
raw <- matrix(raw, nrow = recordlength)
raw <- apply(raw, 2L, paste, collapse = "")
}
st <- c(1L, 1L + cumsum(widths))
first <- st[-length(st)][!drop]
last <- cumsum(widths)[!drop]
if(header)
text <- c(headerline, sapply(raw, doone))
else
text <- sapply(raw, doone)
read.table(text = text, header = header, sep = sep, row.names = row.names,
col.names = col.names, quote = "", ...)
}
library("microbenchmark")
set.seed(1)
ff <- tempfile()
dat <- sapply(1:1e4, function(x) paste0(sample(0:9), collapse = ""))
cat(file = ff, dat, sep = "\n")
identical(read.fwf(ff, widths = c(1,2,3,5)), read.fwf2(ff, widths = c(1,2,3,5)))
# [1] TRUE
library("readr")
identical(read.fwf(ff, widths = c(1,2,3,5)), read_fwf(ff, fwf_widths(c(1,2,3,5))))
# [1] FALSE
library("LaF")
identical(read.fwf(ff, widths = c(1,2,3,5)),
laf_open_fwf(ff, column_types = rep("numeric", 4), column_widths = c(1,2,3,5)))
# [1] FALSE
utils <- function() read.fwf(ff, widths = c(1,2,3,5))
leeper <- function() read.fwf2(ff, widths = c(1,2,3,5))
hadley <- function() read_fwf(ff, fwf_widths(c(1,2,3,5)))
laf <- function() laf_open_fwf(ff, column_types = rep("numeric", 4), column_widths = c(1,2,3,5))
microbenchmark(utils(), leeper(), hadley(), laf(), times = 25)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# utils() 650.815590 675.114110 690.610924 690.093809 693.658223 760.670054 25 c
# leeper() 395.106120 424.477414 431.464297 432.641197 437.710894 480.624730 25 b
# hadley() 8.159549 9.066336 11.753338 9.719871 15.496214 18.416717 25 a
# laf() 1.226242 1.551084 1.639269 1.628832 1.672708 2.783485 25 a
utils <- function() read.fwf(ff, colClasses = rep("numeric", 4), widths = c(1,2,3,5))
leeper <- function() read.fwf2(ff, colClasses = rep("numeric", 4), widths = c(1,2,3,5))
hadley <- function() read_fwf(ff, col_types = "iiii", fwf_widths(c(1,2,3,5)))
laf <- function() laf_open_fwf(ff, column_types = rep("numeric", 4), column_widths = c(1,2,3,5))
microbenchmark(utils(), leeper(), hadley(), laf(), times = 25)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# utils() 648.555934 660.188679 684.65795 668.364392 686.839611 766.526839 25 c
# leeper() 397.034390 411.351318 426.30090 419.492392 430.818385 534.639401 25 b
# hadley() 928.281034 977.235245 1012.91864 1010.140934 1040.972865 1099.289389 25 d
# laf() 1.270504 1.587264 1.59236 1.645381 1.686949 1.795871 25 a
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment