// read-file.cpp: two C++ readers, compiled from R with Rcpp::sourceCpp("read-file.cpp")
#include <fstream>
#include <sstream>
#include <string>
#include <Rcpp.h>
using namespace Rcpp;

// Slurp the whole file through a stringstream
// [[Rcpp::export]]
CharacterVector read_file_cpp1(std::string path) {
  std::ifstream t(path.c_str());
  std::stringstream ss;
  ss << t.rdbuf();
  return ss.str();
}

// Find the size with seekg()/tellg(), then read it in one call
// [[Rcpp::export]]
CharacterVector read_file_cpp2(std::string path) {
  std::ifstream in(path.c_str());
  std::string contents;
  in.seekg(0, std::ios::end);
  contents.resize(in.tellg());
  in.seekg(0, std::ios::beg);
  in.read(&contents[0], contents.size());
  in.close();
  return contents;
}

Reading a complete file with R

This is a short exploration of the most efficient way to read a complete file (including newlines) into R. Previously I'd used readLines() plus paste(), but that's clearly the least efficient option.

Here are the options:

  • Use readLines() and paste()

    read_file1 <- function(path) {
      paste0(paste0(readLines(path), collapse = "\n"), "\n")
    }
  • Find out the size of the file and then use readChar()

    read_file2 <- function(path) {
      size <- file.info(path)$size
      readChar(path, size, useBytes = TRUE)
    }
  • As above, but using readBin(), then converting to a character vector. Unfortunately you can't read into a character vector directly, because readBin() with type = "character" is limited to strings of 10000 characters.

    read_file3 <- function(path) {
      size <- file.info(path)$size
      rawToChar(readBin(path, "raw", size))
    }
  • A safer approach that doesn't trust the size reported by file.info() - the size is only used to guess the number of chunks, so this avoids race conditions where the file changes between asking for its size and reading it. (Suggested by @klmr.) A quick sanity check of the chunk logic appears below, once a test file has been chosen.

    read_file4 <- function(path, chunk_size = 1e4) {
      con <- file(path, "rb", raw = TRUE)
      on.exit(close(con))
      
      # Guess approximate number of chunks
      n <- file.info(path)$size / chunk_size
      chunks <- vector("list", n)
    
      i <- 1L
      chunks[[i]] <- readBin(con, "raw", n = chunk_size)
      while(length(chunks[[i]]) == chunk_size) {
        i <- i + 1L
        chunks[[i]] <- readBin(con, "raw", n = chunk_size)
      }
      
      rawToChar(unlist(chunks, use.names = FALSE))
    }
  • An alternative is to use C++ - the read-file.cpp shown at the top. read_file_cpp1 came from @tim_yates, and read_file_cpp2 from @the_belial.

    library(Rcpp)
    sourceCpp("read-file.cpp")
  • An alternative in C, using the inline package. (This sketch does no error checking, and mkChar() stops at the first embedded NUL byte, so it only suits text files.)

    library(inline)
    read_file_c <- cfunction(
        signature(Sfile = "character"), language = "C", convention = ".Call",
        includes =
    "
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    ",
        body =
    "
        const char *file;
        int fd;
        char *filebuf;
        off_t filesize;
        ssize_t bytesread;
        SEXP ans;

        file = CHAR(STRING_ELT(Sfile, 0));
        fd = open(file, O_RDONLY);
        /* find the size by seeking to the end, then rewind */
        filesize = lseek(fd, 0, SEEK_END);
        lseek(fd, 0, SEEK_SET);
        filebuf = malloc(filesize + 1);
        filebuf[filesize] = '\\0';
        bytesread = read(fd, filebuf, filesize);
        close(fd);
        PROTECT(ans = allocVector(STRSXP, 1));
        SET_STRING_ELT(ans, 0, mkChar(filebuf));
        UNPROTECT(1);
        free(filebuf);
        return ans;
    ")

We'll compare the results on a file included with R:

path <- file.path(R.home("doc"), "COPYING")
file.info(path)$size / 1024
# [1] 17.7
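
Since read_file4() stitches the file back together from chunks, it's worth forcing many tiny chunks to exercise the chunk-boundary logic; the result should match readChar() exactly:

chunked <- read_file4(path, chunk_size = 128)
stopifnot(identical(chunked, read_file2(path)))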

First we need to check they all return the same results. (They won't if the file doesn't end with a trailing newline, since read_file1() always appends one.)

stopifnot(identical(read_file1(path), read_file2(path)))
stopifnot(identical(read_file1(path), read_file3(path)))
stopifnot(identical(read_file1(path), read_file4(path)))
stopifnot(identical(read_file1(path), read_file_cpp1(path)))
stopifnot(identical(read_file1(path), read_file_cpp2(path)))
stopifnot(identical(read_file1(path), read_file_c(path)))
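
The trailing-newline caveat can be checked directly by reading the file's last byte. This little helper is just a sketch (it assumes a non-empty file):

ends_with_newline <- function(path) {
  con <- file(path, "rb")
  on.exit(close(con))
  seek(con, file.info(path)$size - 1)
  readBin(con, "raw", n = 1) == charToRaw("\n")
}
stopifnot(ends_with_newline(path))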

The benchmarking results are clear: readChar() is the best base R option - roughly nine times faster than readLines() for this file. The safer chunked readBin() approach is about 50% slower than readChar(). The C++ functions are both fast (two to three times faster than readChar(), and around twenty times faster than readLines()) and safe, and the inline C version is the fastest of all.

microbenchmark(
  readLines = read_file1(path),   
  readChar = read_file2(path),   
  readBin = read_file3(path),
  chunked_read = read_file4(path),
  Rcpp = read_file_cpp1(path),
  Rcpp2 = read_file_cpp2(path),
  C = read_file_c(path)
)
# Unit: microseconds
#          expr    min     lq median     uq    max neval
#     readLines 1715.3 1728.1 1734.8 1745.6 1778.6   100
#      readChar  186.0  190.9  195.1  200.2  231.4   100
#       readBin  208.2  212.3  215.7  219.7  248.7   100
#  chunked_read  286.1  293.1  301.6  313.0 2003.0   100
#          Rcpp   71.2   78.5   87.6   94.6  102.7   100
#         Rcpp2   63.1   64.4   69.2   76.4   86.5   100
#             C   55.0   56.3   57.0   62.7   74.9   100
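
microbenchmark() can also report timings relative to the fastest expression (unit = "relative"), which makes the ratios above explicit. A sketch, assuming the benchmark result is saved as res:

res <- microbenchmark(
  readLines = read_file1(path),
  readChar = read_file2(path),
  Rcpp2 = read_file_cpp2(path),
  C = read_file_c(path)
)
summary(res, unit = "relative")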

As a separate baseline, here is readLines() timed on its own for the same file:

library(microbenchmark)
path <- file.path(R.home("doc"), "COPYING")

microbenchmark(readLines(path), unit = "ms")
## Unit: milliseconds
##             expr    min     lq median     uq    max neval
##  readLines(path) 0.3829 0.3838 0.3847 0.3984 0.5088   100