## mrdwab/SO34829033.R

## SO34829033.R
library(tidyr)
library(data.table)
library(iotools)
library(microbenchmark)

###
### Sample data
###

# Eight sample records, each a numeric-looking key followed by free text.
# Note the fifth record separates key and text with two spaces.
mydf <- data.frame(
  V1 = c(
    "0001 This is text for 0001",
    "0002 This has spaces in between",
    "0003 Yet this is only supposed to be two columns",
    "0009 Why didnt they just comma delimit you may ask?",
    "001  Who knows",
    "0010 Or even use quotations?",
    "0012 But now Im here with his file",
    "0013 And hoping someone has an elegant solution?"
  )
)

# Stack enough copies of `mydf` to reach (at least) 10,000 rows, then enough
# copies of that result to reach (at least) 1,000,000 rows.
m10k <- do.call(rbind, rep(list(mydf), times = ceiling(10000 / nrow(mydf))))
m1M <- do.call(rbind, rep(list(m10k), times = ceiling(1000000 / nrow(m10k))))

###
### Functions to split column in already read-in data
###

# tidyr approach: split column V1 of `mydf` into "nr" and "text" on a single
# space; extra = "merge" keeps everything after the first split in "text".
fun1 <- function(mydf) {
  separate(
    mydf,
    V1,
    into = c("nr", "text"),
    sep = " ",
    extra = "merge"
  )
}
# iotools approach: coerce column V1 of `mydf` to character and hand it to
# dstrsplit() with a single space as `nsep` and character column types.
fun2 <- function(mydf) {
  lines <- as.character(mydf$V1)
  dstrsplit(lines, col_types = "character", nsep = " ")
}
# data.table approach: convert `mydf` to a data.table and split V1 with
# tstrsplit() on runs of whitespace that immediately follow a digit
# (Perl-style look-behind).
fun3 <- function(mydf) {
  dt <- as.data.table(mydf)
  dt[, tstrsplit(V1, "(?<=\\d)\\s+", perl = TRUE)]
}

###
### Benchmarking
###

# microbenchmark(fun1(mydf), fun2(mydf), fun3(mydf))

# Time the three in-memory splitters on `m10k`, 10 evaluations each.
microbenchmark(
  fun1(m10k),
  fun2(m10k),
  fun3(m10k),
  times = 10
)
# Output recorded from one run:
# Unit: milliseconds
#        expr       min        lq      mean    median        uq       max neval
#  fun1(m10k) 10.863449 10.966978 11.202731 11.160874 11.388236 11.812617    10
#  fun2(m10k)  1.926897  1.986694  2.060191  2.032211  2.103611  2.251319    10
#  fun3(m10k)  8.773226  8.949047  9.321084  9.172840  9.304260 10.981259    10

# microbenchmark(fun1(m1M), fun2(m1M), fun3(m1M), times = 10)

###
### Sample data, presuming we are reading from a file
###

# Write each sample set to its own temporary file, one record per line, so the
# "read and split" functions below have something to read.
x <- tempfile()
writeLines(as.character(mydf$V1), con = x)
x10k <- tempfile()
writeLines(as.character(m10k$V1), con = x10k)
x1M <- tempfile()
writeLines(as.character(m1M$V1), con = x1M)

###
### Functions to test splitting directly while reading
###

# Read `file` with fread() and split with tidyr::separate().
# The backtick separator does not occur in the sample lines, so each line is
# presumably read whole into the single column V1 before being split.
fun1b <- function(file) {
  raw_lines <- fread(file, header = FALSE, sep = "`")
  separate(raw_lines, V1, into = c("nr", "text"), sep = " ", extra = "merge")
}
# Read and split `file` in one step with iotools: input.file() loads the file
# and passes its contents to dstrsplit(), forwarding `nsep` and `col_types`.
fun2b <- function(file) {
  input.file(
    file,
    formatter = dstrsplit,
    col_types = "character",
    nsep = " "
  )
}
# Read `file` with fread() (backtick separator, no header) and split column V1
# with tstrsplit() on whitespace that immediately follows a digit.
fun3b <- function(file) {
  raw_lines <- fread(file, header = FALSE, sep = "`")
  raw_lines[, tstrsplit(V1, "(?<=\\d)\\s+", perl = TRUE)]
}

###
### Benchmarking
###

# microbenchmark(fun1b(x), fun2b(x), fun3b(x))
# microbenchmark(fun1b(x10k), fun2b(x10k), fun3b(x10k), times = 10)

# Time the three read-and-split functions on the file `x1M`, 10 evaluations
# each.
microbenchmark(
  fun1b(x1M),
  fun2b(x1M),
  fun3b(x1M),
  times = 10
)
# Output recorded from one run:
# Unit: milliseconds
#        expr       min       lq      mean    median        uq       max neval
#  fun1b(x1M) 1344.8792 1580.551 1671.8558 1654.6721 1816.0376 1905.5084    10
#  fun2b(x1M)  343.9002  465.724  500.9675  518.1803  536.9665  603.1356    10
#  fun3b(x1M) 1080.6441 1262.807 1350.0958 1347.0190 1505.3472 1540.2883    10
	library(tidyr)
	library(data.table)
	library(iotools)
	library(microbenchmark)

	###
	### Sample data
	###

	# Eight sample records, each a numeric-looking key followed by free text.
	mydf <- data.frame(
	  V1 = c(
	    "0001 This is text for 0001",
	    "0002 This has spaces in between",
	    "0003 Yet this is only supposed to be two columns",
	    "0009 Why didnt they just comma delimit you may ask?",
	    "001 Who knows",
	    "0010 Or even use quotations?",
	    "0012 But now Im here with his file",
	    "0013 And hoping someone has an elegant solution?"
	  )
	)
	# Stack enough copies of `mydf` to reach (at least) 10,000 rows, then enough
	# copies of that result to reach (at least) 1,000,000 rows.
	m10k <- do.call(rbind, rep(list(mydf), times = ceiling(10000 / nrow(mydf))))
	m1M <- do.call(rbind, rep(list(m10k), times = ceiling(1000000 / nrow(m10k))))

	###
	### Functions to split column in already read-in data
	###

	# tidyr approach: split column V1 of `mydf` into "nr" and "text" on a single
	# space; extra = "merge" keeps everything after the first split in "text".
	fun1 <- function(mydf) {
	  separate(
	    mydf,
	    V1,
	    into = c("nr", "text"),
	    sep = " ",
	    extra = "merge"
	  )
	}
	# iotools approach: coerce column V1 of `mydf` to character and hand it to
	# dstrsplit() with a single space as `nsep` and character column types.
	fun2 <- function(mydf) {
	  lines <- as.character(mydf$V1)
	  dstrsplit(lines, col_types = "character", nsep = " ")
	}
	# data.table approach: convert `mydf` to a data.table and split V1 with
	# tstrsplit() on runs of whitespace that immediately follow a digit
	# (Perl-style look-behind).
	fun3 <- function(mydf) {
	  dt <- as.data.table(mydf)
	  dt[, tstrsplit(V1, "(?<=\\d)\\s+", perl = TRUE)]
	}

	###
	### Benchmarking
	###

	# microbenchmark(fun1(mydf), fun2(mydf), fun3(mydf))

	# Time the three in-memory splitters on `m10k`, 10 evaluations each.
	microbenchmark(
	  fun1(m10k),
	  fun2(m10k),
	  fun3(m10k),
	  times = 10
	)
	# Output recorded from one run:
	# Unit: milliseconds
	#        expr       min        lq      mean    median        uq       max neval
	#  fun1(m10k) 10.863449 10.966978 11.202731 11.160874 11.388236 11.812617    10
	#  fun2(m10k)  1.926897  1.986694  2.060191  2.032211  2.103611  2.251319    10
	#  fun3(m10k)  8.773226  8.949047  9.321084  9.172840  9.304260 10.981259    10

	# microbenchmark(fun1(m1M), fun2(m1M), fun3(m1M), times = 10)

	###
	### Sample data, presuming we are reading from a file
	###

	# Write each sample set to its own temporary file, one record per line, so
	# the "read and split" functions below have something to read.
	x <- tempfile()
	writeLines(as.character(mydf$V1), con = x)
	x10k <- tempfile()
	writeLines(as.character(m10k$V1), con = x10k)
	x1M <- tempfile()
	writeLines(as.character(m1M$V1), con = x1M)

	###
	### Functions to test splitting directly while reading
	###

	# Read `file` with fread() and split with tidyr::separate().
	# The backtick separator does not occur in the sample lines, so each line is
	# presumably read whole into the single column V1 before being split.
	fun1b <- function(file) {
	  raw_lines <- fread(file, header = FALSE, sep = "`")
	  separate(raw_lines, V1, into = c("nr", "text"), sep = " ", extra = "merge")
	}
	# Read and split `file` in one step with iotools: input.file() loads the file
	# and passes its contents to dstrsplit(), forwarding `nsep` and `col_types`.
	fun2b <- function(file) {
	  input.file(
	    file,
	    formatter = dstrsplit,
	    col_types = "character",
	    nsep = " "
	  )
	}
	# Read `file` with fread() (backtick separator, no header) and split column
	# V1 with tstrsplit() on whitespace that immediately follows a digit.
	fun3b <- function(file) {
	  raw_lines <- fread(file, header = FALSE, sep = "`")
	  raw_lines[, tstrsplit(V1, "(?<=\\d)\\s+", perl = TRUE)]
	}

	###
	### Benchmarking
	###

	# microbenchmark(fun1b(x), fun2b(x), fun3b(x))
	# microbenchmark(fun1b(x10k), fun2b(x10k), fun3b(x10k), times = 10)

	# Time the three read-and-split functions on the file `x1M`, 10 evaluations
	# each.
	microbenchmark(
	  fun1b(x1M),
	  fun2b(x1M),
	  fun3b(x1M),
	  times = 10
	)
	# Output recorded from one run:
	# Unit: milliseconds
	#        expr       min       lq      mean    median        uq       max neval
	#  fun1b(x1M) 1344.8792 1580.551 1671.8558 1654.6721 1816.0376 1905.5084    10
	#  fun2b(x1M)  343.9002  465.724  500.9675  518.1803  536.9665  603.1356    10
	#  fun3b(x1M) 1080.6441 1262.807 1350.0958 1347.0190 1505.3472 1540.2883    10