Skip to content

Instantly share code, notes, and snippets.

@oseiskar
Created October 1, 2015 08:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save oseiskar/15c4a3fd9b6ec5856c89 to your computer and use it in GitHub Desktop.
Save oseiskar/15c4a3fd9b6ec5856c89 to your computer and use it in GitHub Desktop.
R fast CSV writing proof of concept
#include <R.h>
#include <errno.h>
#include <Rinternals.h>
void write_csv_fast(SEXP list_of_columns, SEXP filename) {
SEXP *columns = NULL;
int error_number = 0;
const char COL_SEP = ',';
const char *ROW_SEP = "\n";
FILE *f = fopen(CHAR(STRING_ELT(filename, 0)), "wb");
if (f == NULL) goto end;
R_xlen_t ncols = LENGTH(list_of_columns);
SEXP names = getAttrib(list_of_columns, R_NamesSymbol);
columns = malloc(ncols * sizeof(SEXP));
if (columns == NULL) goto end;
for (R_xlen_t i=0; i<ncols; ++i) {
/* write column name */
if (i > 0) fputc(COL_SEP, f);
fputs(CHAR(STRING_ELT(names, i)), f);
/* store column expression */
columns[i] = VECTOR_ELT(list_of_columns, i);
}
fputs(ROW_SEP, f);
for (R_xlen_t row_i = 0; row_i < LENGTH(columns[0]); ++row_i) {
for (int col_i = 0; col_i < ncols; ++col_i) {
if (col_i > 0) fputc(COL_SEP, f);
fputs(CHAR(STRING_ELT(columns[col_i], row_i)), f);
}
if (fputs(ROW_SEP, f) < 0) goto end;
}
end:
error_number = errno;
if (columns != NULL) free(columns);
if (f != NULL) fclose(f);
if (error_number) error(strerror(errno));
}
fwrite <- function(dt, file.path) {
stopifnot(is.data.frame(dt))
stopifnot(ncol(dt) > 0)
# convert data.table to a named list of columns,
# all of character type
l <- list()
for (c in names(dt)) {
str_col = as.character(dt[[c]])
str_col[is.na(str_col)] <- ''
l[[c]] <- str_col
}
stopifnot(length(l) == length(names(dt)))
.Call('write_csv_fast', l, file.path)
}
generating test table
fwrite 1
write.csv 1
fwrite 2
write.csv 2
fwrite 3
write.csv 3
speedup 3.34688884909892
fwrite write.csv
1: 1.823002 secs 5.805012 secs
2: 1.684003 secs 5.748010 secs
3: 1.682003 secs 5.814011 secs
performanceTest <- function () {
library(data.table)
message('generating test table')
x <- c(1:100000)
dt <- data.table(ints=x, floats=as.double(x))
for (col in as.character(c(1:100))) dt[, (col) := as.character(x)]
timeIt <- function(func) {
f <- tempfile()
t0 <- Sys.time()
func(dt, f)
time_diff <- Sys.time() - t0
file.remove(f)
time_diff
}
alternatives = list(
fwrite = fwrite,
write.csv = function(dt, f) {
write.csv(dt, f, quote=FALSE, row.names=FALSE)
}
)
n_rounds <- 3
results <- rbindlist(
lapply(1:n_rounds,
function (i) {
round <- list()
for (alt in names(alternatives)) {
message(alt, ' ', i)
round[[alt]] <- timeIt(alternatives[[alt]])
}
round
}
)
)
means <- results[, lapply(.SD, mean)][, lapply(.SD, as.numeric)]
message('speedup x ', means[, write.csv] / means[, fwrite])
results
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment