Skip to content

Instantly share code, notes, and snippets.

@mrdwab
Created January 16, 2016 17:42
Show Gist options
  • Save mrdwab/fed6ef8df8e8ebff49d7 to your computer and use it in GitHub Desktop.
Save mrdwab/fed6ef8df8e8ebff49d7 to your computer and use it in GitHub Desktop.
library(tidyr)
library(data.table)
library(iotools)
library(microbenchmark)
###
### Sample data
###
# Eight sample lines, each an id made of digits followed by free text that
# itself contains spaces; only the first space separates the two fields.
# stringsAsFactors = FALSE keeps V1 a character column on every R version
# (before R 4.0 the default produced a factor), so the benchmarks time the
# split itself rather than a factor-to-character conversion.
mydf <- data.frame(
  V1 = c(
    "0001 This is text for 0001",
    "0002 This has spaces in between",
    "0003 Yet this is only supposed to be two columns",
    "0009 Why didnt they just comma delimit you may ask?",
    "001 Who knows",
    "0010 Or even use quotations?",
    "0012 But now Im here with his file",
    "0013 And hoping someone has an elegant solution?"
  ),
  stringsAsFactors = FALSE
)
# Stack copies of the sample until the target row counts are reached:
# 10,000 rows first, then 1,000,000 rows built from the 10,000-row frame.
m10k <- do.call(
  rbind,
  replicate(ceiling(10000 / nrow(mydf)), mydf, simplify = FALSE)
)
m1M <- do.call(
  rbind,
  replicate(ceiling(1000000 / nrow(m10k)), m10k, simplify = FALSE)
)
###
### Functions to split column in already read-in data
###
# tidyr approach: split column V1 of `mydf` on a space into the columns
# "nr" and "text". extra = "merge" stops splitting once both target columns
# are filled, so the remaining spaces stay inside "text".
fun1 <- function(mydf) {
  separate(
    mydf,
    V1,
    into = c("nr", "text"),
    sep = " ",
    extra = "merge"
  )
}
# iotools approach: hand the lines of V1 (coerced to character) to
# dstrsplit with a single character column type and a space as `nsep`.
# NOTE(review): per the iotools docs `nsep` is the separator that ends the
# leading row key, which is what peels the id off the text -- confirm.
fun2 <- function(mydf) {
  lines <- as.character(mydf$V1)
  dstrsplit(lines, col_types = "character", nsep = " ")
}
# data.table approach: convert `mydf` to a data.table and split V1 with
# tstrsplit on any run of whitespace that directly follows a digit. The
# look-behind in the pattern is why perl = TRUE is required.
# NOTE(review): the pattern matches at every digit-then-whitespace position,
# so text that itself contains such a position yields more than two columns.
fun3 <- function(mydf) {
  dt <- as.data.table(mydf)
  dt[, tstrsplit(V1, "(?<=\\d)\\s+", perl = TRUE)]
}
###
### Benchmarking
###
# Smallest input, default number of repetitions (disabled):
# microbenchmark(fun1(mydf), fun2(mydf), fun3(mydf))

# Time the three in-memory splitters on m10k, 10 evaluations each.
microbenchmark(
  fun1(m10k),
  fun2(m10k),
  fun3(m10k),
  times = 10
)
# Output recorded for the call above:
# Unit: milliseconds
# expr min lq mean median uq max neval
# fun1(m10k) 10.863449 10.966978 11.202731 11.160874 11.388236 11.812617 10
# fun2(m10k) 1.926897 1.986694 2.060191 2.032211 2.103611 2.251319 10
# fun3(m10k) 8.773226 8.949047 9.321084 9.172840 9.304260 10.981259 10

# Largest input (disabled):
# microbenchmark(fun1(m1M), fun2(m1M), fun3(m1M), times = 10)
###
### Sample data, presuming we are reading from a file
###
# One temporary text file per data size; every element of V1 is written as
# one line so the readers below can parse the files line by line.
x <- tempfile()
writeLines(as.character(mydf$V1), con = x)

x10k <- tempfile()
writeLines(as.character(m10k$V1), con = x10k)

x1M <- tempfile()
writeLines(as.character(m1M$V1), con = x1M)
###
### Functions to test splitting directly while reading
###
# fread + tidyr: read `file` without a header, then split column V1 on a
# space into "nr" and "text" (extra = "merge" keeps the rest in "text").
# The backtick separator does not occur in the sample lines, so each line
# presumably arrives whole in V1 -- the later use of V1 relies on that.
fun1b <- function(file) {
  lines_dt <- fread(file, header = FALSE, sep = "`")
  separate(
    lines_dt,
    V1,
    into = c("nr", "text"),
    sep = " ",
    extra = "merge"
  )
}
# iotools: read `file` through input.file, using dstrsplit as the formatter.
# The remaining arguments are forwarded to the formatter: one character
# column type and a space as `nsep`.
# NOTE(review): `nsep` is presumably the row-key separator, as in the
# in-memory dstrsplit variant -- confirm against the iotools docs.
fun2b <- function(file) {
  input.file(
    file,
    formatter = dstrsplit,
    col_types = "character",
    nsep = " "
  )
}
# data.table only: read `file` without a header, then split column V1 with
# tstrsplit on whitespace that directly follows a digit (the look-behind
# needs perl = TRUE). The backtick separator does not occur in the sample
# lines, so each line presumably arrives whole in V1.
# NOTE(review): the pattern matches at every digit-then-whitespace position,
# so text containing such a position yields more than two columns.
fun3b <- function(file) {
  lines_dt <- fread(file, header = FALSE, sep = "`")
  lines_dt[, tstrsplit(V1, "(?<=\\d)\\s+", perl = TRUE)]
}
###
### Benchmarking
###
# Smaller inputs (disabled):
# microbenchmark(fun1b(x), fun2b(x), fun3b(x))
# microbenchmark(fun1b(x10k), fun2b(x10k), fun3b(x10k), times = 10)

# Time the three read-and-split variants on the x1M file, 10 evaluations
# each.
microbenchmark(
  fun1b(x1M),
  fun2b(x1M),
  fun3b(x1M),
  times = 10
)
# Output recorded for the call above:
# Unit: milliseconds
# expr min lq mean median uq max neval
# fun1b(x1M) 1344.8792 1580.551 1671.8558 1654.6721 1816.0376 1905.5084 10
# fun2b(x1M) 343.9002 465.724 500.9675 518.1803 536.9665 603.1356 10
# fun3b(x1M) 1080.6441 1262.807 1350.0958 1347.0190 1505.3472 1540.2883 10
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment