# Generating some data first:
# R version 3.3.0
require(data.table) ## 1.9.7, commit 2433, github
require(dplyr) ## devel, commit 3189, github
# Checking for exact equality of floating-point (FP) numbers
require(dplyr)
# Six-row lookup table: `a` runs from 0 to 1 in steps of 0.2 (computed in
# floating point), `b` alternates 1, 2.
DF <- data.frame(
  a = seq(from = 0, to = 1, by = 0.2),
  b = rep(1:2, times = 3L)
)
# Left join of the literal 0.6 against DF. The key is compared exactly, and
# the generated value in DF is not bit-identical to 0.6, so `b` is NA.
merge(x = data.frame(a = 0.6), y = DF, all.x = TRUE)
#     a  b
# 1 0.6 NA
# The post with the benchmarking results is at the link given below:
# http://stackoverflow.com/questions/18003717/is-there-any-efficient-way-than-rbind-filllist/18004698#18004698
# This is the script with which the benchmarking and plots were generated,
# in case anyone else wants to replicate them.
# Note: it takes about 2-3 hours for the benchmarking to finish.

# Packages attached for the benchmark script.
require(plyr)
require(data.table)
require(ggplot2)
require(microbenchmark)
require(dplyr) | |
require(data.table) | |
foo <- function(N) { | |
group_sizes = 10^(1:(log10(N)-1L)) | |
uniqval <- unique(runif(2*N)) | |
fans <- vector("list", length(group_sizes)) | |
for (i in seq_along(group_sizes)) { |
require(data.table)

# Fix the RNG seed so the generated tables are reproducible.
set.seed(1L)

# 1e7-row test table: `x` is an integer permutation of 1..1e7, `y` is a second
# permutation stored as double, `z` is one random lower-case letter per row.
DT1 <- data.table(
  x = sample(1e7),
  y = as.numeric(sample(1e7)),
  z = sample(letters, 1e7, replace = TRUE)
)

# copy() gives DT2 its own storage, so by-reference changes to one table do
# not show up in the other.
DT2 <- copy(DT1)

# 1e7 uniform random numbers, one per row of the tables above.
val <- runif(1e7)
# 'set' seems faster when adding a single column
# ==============================================
# Here's some sample data to test it out.
require(data.table)
require(dplyr)

# Fix the RNG seed so `x` is the same on every run.
set.seed(45)

# 25 rows: `x` is a random group id in 1..3, `y` is 1..25, `z` is 26..50.
DF <- data.frame(x = sample(3, 25, replace = TRUE), y = 1:25, z = 26:50)
DP <- tbl_df(DF)      # for DPLYR data.frame object
# NOTE(review): tbl_df() is deprecated in current dplyr releases in favour of
# tibble::as_tibble() - confirm against the dplyr version this script targets.
DT <- data.table(DF)  # the same data as a data.table
# 1) row-wise subset (usually based on conditions): |
require(data.table)
# let's create a huge data.table
set.seed(1)  # fix the RNG seed for reproducibility
N <- 2e7     # size of DT
# generate a character vector of length about 1e5; foo() builds one element.
#
# foo() takes no arguments and returns a length-1 character vector: a random
# string of 5 to 9 lower-case letters. The length is drawn first, then that
# many letters are sampled with replacement and pasted together.
foo <- function() {
  n_chars <- sample(5:9, 1)
  paste(sample(letters, n_chars, replace = TRUE), collapse = "")
}
# Draw 1e5 random strings with foo() and keep only the distinct ones, so `ch`
# holds at most 1e5 unique strings.
ch <- unique(replicate(1e5, foo()))
# version 1.8.11
require(data.table)
# Loading required package: data.table
# data.table 1.8.11 For help type: help("data.table")

## create a huge data.table:
## -------------------------
set.seed(1)  # fix the RNG seed for reproducibility
N <- 2e7     # size of DT
# version 1.8.11 (commit 1048)
require(data.table)
# Loading required package: data.table
# data.table 1.8.11 For help type: help("data.table")

## create a huge data.table:
## -------------------------
set.seed(1)  # fix the RNG seed for reproducibility
N <- 2e7     # size of DT
from pandas import * | |
from pandas.util.testing import rands | |
import random | |
# Default row count; used as the default for the `n` parameter of
# get_test_data below.
N = 10000
# Module-level group count. NOTE(review): get_test_data has its own `ngroups`
# parameter (default 100); where this value is used is not visible here.
ngroups = 10
def get_test_data(ngroups=100, n=N): | |
unique_groups = range(ngroups) | |
arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) |