Created
October 1, 2015 08:08
-
-
Save oseiskar/15c4a3fd9b6ec5856c89 to your computer and use it in GitHub Desktop.
R fast CSV writing proof of concept
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <R.h> | |
#include <errno.h> | |
#include <Rinternals.h> | |
void write_csv_fast(SEXP list_of_columns, SEXP filename) { | |
SEXP *columns = NULL; | |
int error_number = 0; | |
const char COL_SEP = ','; | |
const char *ROW_SEP = "\n"; | |
FILE *f = fopen(CHAR(STRING_ELT(filename, 0)), "wb"); | |
if (f == NULL) goto end; | |
R_xlen_t ncols = LENGTH(list_of_columns); | |
SEXP names = getAttrib(list_of_columns, R_NamesSymbol); | |
columns = malloc(ncols * sizeof(SEXP)); | |
if (columns == NULL) goto end; | |
for (R_xlen_t i=0; i<ncols; ++i) { | |
/* write column name */ | |
if (i > 0) fputc(COL_SEP, f); | |
fputs(CHAR(STRING_ELT(names, i)), f); | |
/* store column expression */ | |
columns[i] = VECTOR_ELT(list_of_columns, i); | |
} | |
fputs(ROW_SEP, f); | |
for (R_xlen_t row_i = 0; row_i < LENGTH(columns[0]); ++row_i) { | |
for (int col_i = 0; col_i < ncols; ++col_i) { | |
if (col_i > 0) fputc(COL_SEP, f); | |
fputs(CHAR(STRING_ELT(columns[col_i], row_i)), f); | |
} | |
if (fputs(ROW_SEP, f) < 0) goto end; | |
} | |
end: | |
error_number = errno; | |
if (columns != NULL) free(columns); | |
if (f != NULL) fclose(f); | |
if (error_number) error(strerror(errno)); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fwrite <- function(dt, file.path) { | |
stopifnot(is.data.frame(dt)) | |
stopifnot(ncol(dt) > 0) | |
# convert data.table to a named list of columns, | |
# all of character type | |
l <- list() | |
for (c in names(dt)) { | |
str_col = as.character(dt[[c]]) | |
str_col[is.na(str_col)] <- '' | |
l[[c]] <- str_col | |
} | |
stopifnot(length(l) == length(names(dt))) | |
.Call('write_csv_fast', l, file.path) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
generating test table | |
fwrite 1 | |
write.csv 1 | |
fwrite 2 | |
write.csv 2 | |
fwrite 3 | |
write.csv 3 | |
speedup 3.34688884909892 | |
fwrite write.csv | |
1: 1.823002 secs 5.805012 secs | |
2: 1.684003 secs 5.748010 secs | |
3: 1.682003 secs 5.814011 secs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
performanceTest <- function () { | |
library(data.table) | |
message('generating test table') | |
x <- c(1:100000) | |
dt <- data.table(ints=x, floats=as.double(x)) | |
for (col in as.character(c(1:100))) dt[, (col) := as.character(x)] | |
timeIt <- function(func) { | |
f <- tempfile() | |
t0 <- Sys.time() | |
func(dt, f) | |
time_diff <- Sys.time() - t0 | |
file.remove(f) | |
time_diff | |
} | |
alternatives = list( | |
fwrite = fwrite, | |
write.csv = function(dt, f) { | |
write.csv(dt, f, quote=FALSE, row.names=FALSE) | |
} | |
) | |
n_rounds <- 3 | |
results <- rbindlist( | |
lapply(1:n_rounds, | |
function (i) { | |
round <- list() | |
for (alt in names(alternatives)) { | |
message(alt, ' ', i) | |
round[[alt]] <- timeIt(alternatives[[alt]]) | |
} | |
round | |
} | |
) | |
) | |
means <- results[, lapply(.SD, mean)][, lapply(.SD, as.numeric)] | |
message('speedup x ', means[, write.csv] / means[, fwrite]) | |
results | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment