This is a short exploration of the most efficient way to read a complete file
(including newlines) into R. Previously I'd used readLines() plus paste(),
but that's clearly the least efficient option.
Here are the options:
-
Use
readLines()
and paste()
# Read a file line by line, then glue the lines back together with newlines.
# A trailing "\n" is always appended, so the result ends in a newline even
# when the file on disk does not.
read_file1 <- function(path) {
  lines <- readLines(path)
  joined <- paste(lines, collapse = "\n")
  paste0(joined, "\n")
}
-
Find out the size of the file and then use
readChar()
# Read a whole file with a single readChar() call.
#
# The size reported by file.info() is used as the number of characters to
# request; useBytes = TRUE makes readChar() count that request in bytes.
read_file2 <- function(path) {
  size <- file.info(path)$size
  readChar(path, size, useBytes = TRUE)
}
-
As above, but using
readBin()
, then converting to a character vector. Unfortunately you can't read into a character vector directly because using type = "character"
is limited to 10000 characters.

# Read a whole file as raw bytes with one readBin() call, then convert the
# bytes to a single string. The byte count comes from file.info().
read_file3 <- function(path) {
  size <- file.info(path)$size
  rawToChar(readBin(path, "raw", size))
}
-
A safer approach that doesn't use a separate call to
file.info()
- this avoids race conditions where the file changes between asking for its size and reading it. (Suggested by @klmr)

# Read a whole file in fixed-size chunks until a short read signals the end.
#
# path:       file to read.
# chunk_size: number of bytes requested per readBin() call (default 1e4).
#             Fractional values are truncated to an integer, because the
#             end-of-file test compares the chunk length against it.
#
# Returns the file contents as a single string.
read_file4 <- function(path, chunk_size = 1e4) {
  stopifnot(
    is.numeric(chunk_size),
    length(chunk_size) == 1L,
    !is.na(chunk_size),
    chunk_size >= 1
  )
  chunk_size <- as.integer(chunk_size)

  con <- file(path, "rb", raw = TRUE)
  on.exit(close(con), add = TRUE)

  # The reported size is only a preallocation hint; the loop below relies
  # solely on what readBin() actually returns, so a stale size is harmless.
  size_hint <- file.info(path)$size
  n <- if (is.na(size_hint)) 1 else max(1, ceiling(size_hint / chunk_size))
  chunks <- vector("list", n)

  i <- 0L
  repeat {
    chunk <- readBin(con, "raw", n = chunk_size)
    i <- i + 1L
    chunks[[i]] <- chunk
    # A chunk shorter than requested means the connection is exhausted.
    if (length(chunk) < chunk_size) break
  }

  # Unused preallocated slots are NULL and are dropped by unlist().
  rawToChar(unlist(chunks, use.names = FALSE))
}
-
An alternative would be to use C++. This version was supplied by @tim_yates
# Attach Rcpp, then compile read-file.cpp and load the functions it exports.
library(Rcpp)
sourceCpp("read-file.cpp")
-
An alternative would be to use C++.
read_file_cpp1
came from @tim_yates, and read_file_cpp2
from @the_belial.

# Attach Rcpp, then compile read-file.cpp and load the functions it exports.
library(Rcpp)
sourceCpp("read-file.cpp")
-
An alternative in C.
library(inline)

# Read a whole file with POSIX open()/lseek()/read(), compiled through
# inline::cfunction() and called via .Call.
#
# Sfile: character vector; its first element is the path to read.
#
# Returns a length-one character vector holding the file contents. As with
# any C string, the contents end at the first NUL byte in the file.
# Every system call is checked, and the file descriptor is closed on all
# paths so repeated calls do not exhaust descriptors.
read_file_c <- cfunction(
  signature(Sfile = "character"),
  language = "C",
  convention = ".Call",
  includes = "
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
  ",
  body = "
    const char *file;
    int fd;
    char *filebuf;
    off_t filesize;
    ssize_t bytesread;
    SEXP ans;

    if (TYPEOF(Sfile) != STRSXP || LENGTH(Sfile) < 1)
      Rf_error(\"'Sfile' must be a non-empty character vector\");

    file = CHAR(STRING_ELT(Sfile, 0));
    fd = open(file, O_RDONLY);
    if (fd < 0)
      Rf_error(\"cannot open file '%s'\", file);

    /* Seek to the end to learn the size, then rewind for the read. */
    filesize = lseek(fd, 0, SEEK_END);
    if (filesize < 0 || lseek(fd, 0, SEEK_SET) < 0) {
      close(fd);
      Rf_error(\"cannot determine size of file '%s'\", file);
    }

    filebuf = malloc((size_t) filesize + 1);
    if (filebuf == NULL) {
      close(fd);
      Rf_error(\"cannot allocate buffer for file '%s'\", file);
    }

    bytesread = read(fd, filebuf, (size_t) filesize);
    close(fd);
    if (bytesread < 0) {
      free(filebuf);
      Rf_error(\"cannot read file '%s'\", file);
    }
    /* Terminate after the bytes actually read, not the size asked for. */
    filebuf[bytesread] = '\\0';

    PROTECT(ans = allocVector(STRSXP, 1));
    SET_STRING_ELT(ans, 0, mkChar(filebuf));
    UNPROTECT(1);
    free(filebuf);
    return ans;
  "
)
We'll compare the results on a file included with R:
# Benchmark input: the COPYING file found in R's "doc" directory.
path <- file.path(R.home(component = "doc"), "COPYING")
# Its size in KiB.
file.info(path)[["size"]] / 1024
# [1] 17.7
First we need to check they all return the same results. (They won't if the file doesn't include a trailing newline)
# Every reader must reproduce read_file1()'s output exactly. stopifnot()
# evaluates the conditions in order and halts at the first one that fails.
stopifnot(
  identical(read_file1(path), read_file2(path)),
  identical(read_file1(path), read_file3(path)),
  identical(read_file1(path), read_file4(path)),
  identical(read_file1(path), read_file_cpp1(path)),
  identical(read_file1(path), read_file_cpp2(path)),
  identical(read_file1(path), read_file_c(path))
)
The benchmarking results are clear: readChar()
is the best base R option, and is
about nine times faster than readLines() for this file. The safer approach using chunked readBin()
reads is about 50% slower than readChar(). The C++ functions are both fast (2-3x faster than readChar()
and 20-25x faster than readLines()
) and safe.
# Time every reader on the same file. The package is attached here because
# microbenchmark() is otherwise not in scope at this point.
library(microbenchmark)
microbenchmark(
  readLines = read_file1(path),
  readChar = read_file2(path),
  readBin = read_file3(path),
  chunked_read = read_file4(path),
  Rcpp = read_file_cpp1(path),
  Rcpp2 = read_file_cpp2(path),
  C = read_file_c(path)
)
# Unit: microseconds
# expr min lq median uq max neval
# readLines 1715.3 1728.1 1734.8 1745.6 1778.6 100
# readChar 186.0 190.9 195.1 200.2 231.4 100
# readBin 208.2 212.3 215.7 219.7 248.7 100
# chunked_read 286.1 293.1 301.6 313.0 2003.0 100
# Rcpp 71.2 78.5 87.6 94.6 102.7 100
# Rcpp2 63.1 64.4 69.2 76.4 86.5 100
# C 55.0 56.3 57.0 62.7 74.9 100