## szilard/min_benchm_datatable_pandas.R

## min_benchm_datatable_pandas.R

## Minimal example of R's data.table vs pandas aggregation and join benchmark
## ( more detailed but still basic benchmark here:
##   http://datascience.la/dplyr-and-a-very-basic-benchmark/ )

## Just copy and paste each section into R and IPython, respectively
## Timings on a decent server with data.table 1.9.4 & pandas 0.15.1 (Nov 2014)


#### R:

library(data.table)
## Problem size: n rows in the big table, m distinct integer keys.
n <- 1e7
m <- 1e6

## Big table: x is a key drawn at random from 1..m (with replacement),
## y is a uniform random number.
d <- data.table(
  x = sample(m, size = n, replace = TRUE),
  y = runif(n)
)

## aggregation: mean of y within each x group (only timed, result not stored)
system.time(d[, mean(y), by = x])
# ~ 1 sec

## Lookup table whose x column is a random permutation of 1..m.
d2 <- data.table(x = sample(m))
## setkey() without column names keys (and sorts) d on all of its columns;
## the join below matches d2's x against the leading key column, x.
setkey(d)

## join: keyed join; nomatch = 0 drops rows of d2 that have no match in d
system.time(d[d2, nomatch = 0])
# ~ 0.5 sec


#### IPython:

import pandas as pd
import numpy as np
n = 10e6
m = 1e6
d = pd.DataFrame({"x": np.random.randint(0,m,n), "y": np.random.random(n)})

## aggregation
%time dd = d.groupby("x")["y"].mean()
## ~ 1.5 sec

## join
d2 = pd.DataFrame({"x": np.random.permutation(np.arange(m))})
d = d.sort_index(by = "x")

%time dd = pd.merge(d, d2)
## ~ 2.5 sec

	## Minimal example of R's data.table vs pandas aggregation and join benchmark
	## ( more detailed but still basic benchmark here:
	## http://datascience.la/dplyr-and-a-very-basic-benchmark/ )

	## Just copy and paste each section into R and IPython, respectively
	## Timings on a decent server with data.table 1.9.4 & pandas 0.15.1 (Nov 2014)



	#### R:

	library(data.table)
	## Problem size: n rows in the big table, m distinct integer keys.
	n <- 1e7
	m <- 1e6

	## Big table: x is a key drawn at random from 1..m (with replacement),
	## y is a uniform random number.
	d <- data.table(
	  x = sample(m, size = n, replace = TRUE),
	  y = runif(n)
	)

	## aggregation: mean of y within each x group (only timed, result not stored)
	system.time(d[, mean(y), by = x])
	# ~ 1 sec

	## Lookup table whose x column is a random permutation of 1..m.
	d2 <- data.table(x = sample(m))
	## setkey() without column names keys (and sorts) d on all of its columns;
	## the join below matches d2's x against the leading key column, x.
	setkey(d)

	## join: keyed join; nomatch = 0 drops rows of d2 that have no match in d
	system.time(d[d2, nomatch = 0])
	# ~ 0.5 sec



	#### IPython:

	import pandas as pd
	import numpy as np
	n = 10e6
	m = 1e6
	d = pd.DataFrame({"x": np.random.randint(0,m,n), "y": np.random.random(n)})

	## aggregation
	%time dd = d.groupby("x")["y"].mean()
	## ~ 1.5 sec

	## join
	d2 = pd.DataFrame({"x": np.random.permutation(np.arange(m))})
	d = d.sort_index(by = "x")

	%time dd = pd.merge(d, d2)
	## ~ 2.5 sec