Skip to content

Instantly share code, notes, and snippets.

@arunsrinivasan
Last active December 31, 2015 00:49
Show Gist options
  • Save arunsrinivasan/7909609 to your computer and use it in GitHub Desktop.
Save arunsrinivasan/7909609 to your computer and use it in GitHub Desktop.
Comparison of pandas with data.table (along with base:::merge and plyr:::join)
require(data.table)
# tested on 1.8.10 AND 1.8.11, results don't differ much at all.
require(plyr)
# 1_8 version
set.seed(1000) # for reproducibility
N <- 1e4
foo <- function() paste(sample(letters, 10), collapse="")
indices <- replicate(N, foo())
indices2 <- replicate(N, foo())
left <- data.frame(key=rep(indices[1:8000], 10), key2=rep(indices2[1:8000], 10), value=rnorm(80000))
right <- data.frame(key=indices[2001:10000], key2=indices2[2001:10000], value2=rnorm(8000))
right2 <- data.frame(key=rep(right$key, 2), key2=rep(right$key2, 2), value2=rnorm(16000))
left.dt <- data.table(left)
system.time(setkey(left.dt, key, key2))
# user system elapsed
# 0.020 0.001 0.021
right.dt <- data.table(right)
system.time(setkey(right.dt, key, key2))
# user system elapsed
# 0.004 0.000 0.004
right2.dt <- data.table(right2)
system.time(setkey(right2.dt, key, key2))
# user system elapsed
# 0.007 0.000 0.008
## Benchmarking function
## ---------------------
timeit <- function(func, niter=10) {
timing = rep(NA, niter)
for (i in 1:niter) {
gc()
timing[i] <- system.time(func())[3]
}
mean(timing)
}
## base:::merge
## ------------
left.join <- function(sort=FALSE) result <- base::merge(left, right, all.x=TRUE, sort=sort)
right.join <- function(sort=FALSE) result <- base::merge(left, right, all.y=TRUE, sort=sort)
outer.join <- function(sort=FALSE) result <- base::merge(left, right, all=TRUE, sort=sort)
inner.join <- function(sort=FALSE) result <- base::merge(left, right, all=FALSE, sort=sort)
## data.table merge/join
## ---------------------
left.join.dt <- function(sort=FALSE) result <- right.dt[left.dt]
right.join.dt <- function(sort=FALSE) result <- left.dt[right.dt]
outer.join.dt <- function(sort=FALSE) result <- merge(left.dt, right.dt, all=TRUE, sort=sort)
inner.join.dt <- function(sort=FALSE) result <- merge(left.dt, right.dt, all=FALSE, sort=sort)
## plyr merge/join
## ---------------
plyr.join <- function(type) result <- plyr::join(left, right, by=c("key", "key2"), type=type, match="first")
sort.options <- c(FALSE, TRUE)
## -----------
## many-to-one
## -----------
results <- matrix(nrow=4, ncol=3)
colnames(results) <- c("base::merge", "plyr", "data.table")
rownames(results) <- c("inner", "outer", "left", "right")
base.functions <- c(inner.join, outer.join, left.join, right.join)
plyr.functions <- c(function() plyr.join("inner"), function() plyr.join("full"),
function() plyr.join("left"), function() plyr.join("right"))
dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt)
for (i in 1:4) {
base.func <- base.functions[[i]]
plyr.func <- plyr.functions[[i]]
dt.func <- dt.functions[[i]]
results[i, 1] <- timeit(base.func)
results[i, 2] <- timeit(plyr.func)
results[i, 3] <- timeit(dt.func)
}
results
# base::merge plyr data.table
# inner 0.9587 0.6214 0.0734
# outer 1.4238 0.8191 0.2116
# left 1.1039 0.6241 0.0528
# right 1.1714 0.3678 0.0258
## ------------
## many-to-many
## ------------
left.join <- function(sort=FALSE) result <- base::merge(left, right2, all.x=TRUE, sort=sort)
right.join <- function(sort=FALSE) result <- base::merge(left, right2, all.y=TRUE, sort=sort)
outer.join <- function(sort=FALSE) result <- base::merge(left, right2, all=TRUE, sort=sort)
inner.join <- function(sort=FALSE) result <- base::merge(left, right2, all=FALSE, sort=sort)
left.join.dt <- function(sort=FALSE) result <- right2.dt[left.dt, allow.cartesian=TRUE]
right.join.dt <- function(sort=FALSE) result <- left.dt[right2.dt, allow.cartesian=TRUE]
outer.join.dt <- function(sort=FALSE) result <- merge(left.dt, right2.dt, all=TRUE, sort=sort, allow.cartesian=TRUE)
inner.join.dt <- function(sort=FALSE) result <- merge(left.dt, right2.dt, all=FALSE, sort=sort, allow.cartesian=TRUE)
sort.options <- c(FALSE, TRUE)
results <- matrix(nrow=4, ncol=3)
colnames(results) <- c("base::merge", "plyr", "data.table")
rownames(results) <- c("inner", "outer", "left", "right")
base.functions <- c(inner.join, outer.join, left.join, right.join)
plyr.functions <- c(function() plyr.join("inner"), function() plyr.join("full"),
function() plyr.join("left"), function() plyr.join("right"))
dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt)
for (i in 1:4) {
base.func <- base.functions[[i]]
plyr.func <- plyr.functions[[i]]
dt.func <- dt.functions[[i]]
results[i, 1] <- timeit(base.func)
results[i, 2] <- timeit(plyr.func)
results[i, 3] <- timeit(dt.func)
}
results
# base::merge plyr data.table
# inner 1.9289 0.6080 0.0917
# outer 2.8860 0.8485 0.3490
# left 2.1343 0.6279 0.0758
# right 2.3700 0.3745 0.0429
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment