Arun Srinivasan arunsrinivasan

## tweet_reply.md

      
              1 file
            
          
              2 forks
            
          
              0 comments
            
          
              5 stars
            
          
                arunsrinivasan
                / tweet_reply.md
            
            
              Last active
              July 27, 2018 04:38
            
              
                automatic indexing vs between() on integer ranges
              
          
    Updated June 16 with latest devel

data.table's automatic indexing:

Generating some data first:
# R version 3.3.0
require(data.table) ## 1.9.7, commit 2433, github
require(dplyr) ## devel, commit 3189, github

  
## floating_points.md

      
              1 file
            
          
              0 forks
            
          
              1 comment
            
          
              0 stars
            
          
                arunsrinivasan
                / floating_points.md
            
            
              Last active
              July 18, 2018 08:58
            
              
                data.table, dplyr and R - floating point comparisons
              
          
    Checking for exact equality of FPs
require(dplyr)
# Six-row frame: `a` runs 0, 0.2, ..., 1 as generated by seq(); `b` recycles 1:2.
DF = data.frame(a=seq(0, 1, by=0.2), b=1:2)

# Keep every row of the left table (all.x=TRUE) and match on the shared column
# `a`. The literal 0.6 is not bit-identical to the 0.6 that seq() produced, so
# the exact comparison finds no partner row and `b` comes back as NA.
merge(data.frame(a=0.6), DF, all.x=TRUE)
#     a  b
# 1 0.6 NA

  
## rbind_fill_benchmarking
# The post with benchmarking results is the link given below:
# http://stackoverflow.com/questions/18003717/is-there-any-efficient-way-than-rbind-filllist/18004698#18004698

# This is the script with which the benchmarking and plots were generated in case anyone else wants to replicate it.
# Note: it takes about 2-3 hours for the benchmarking to finish.

require(plyr)
require(data.table)
require(ggplot2)
require(microbenchmark)

## SO_21308436.R
require(dplyr)
require(data.table)

foo <- function(N) {

    group_sizes = 10^(1:(log10(N)-1L))
    uniqval <- unique(runif(2*N))

    fans <- vector("list", length(group_sizes))
    for (i in seq_along(group_sizes)) {

## DT_comp_set.R
require(data.table)

# Fix the RNG seed so the sampled columns below are reproducible.
set.seed(1L)
# 1e7-row table: x is a random permutation of 1..1e7, y is another permutation
# stored as numeric, z is letters drawn with replacement.
DT1 <- data.table(x=sample(1e7), y=as.numeric(sample(1e7)), z=sample(letters, 1e7, TRUE))
# Second table made via copy() so that DT1 and DT2 can be modified separately.
DT2 <- copy(DT1)

# 1e7 uniform random values -- presumably the column added in the timing
# comparison that follows (not visible here; confirm against the full script).
val <- runif(1e7)

# 'set' seems faster when adding 1-column
# =======================================

## base_dplyr_datatable.R
# here's some sample data to test it out
require(data.table)
require(dplyr)
set.seed(45)
DF <- data.frame(x=sample(3, 25, TRUE), y=1:25, z=26:50)
DP <- tbl_df(DF) # for DPLYR data.frame object
DT <- data.table(DF)

# 1) row-wise subset (usually based on conditions):

## FR_5241.R
require(data.table)

# let's create a huge data.table
set.seed(1)
N <- 2e7 # size of DT

# generate a character vector of length about 1e5
# Build one random lowercase string whose length is drawn from 5..9.
foo <- function() {
  n_chars <- sample(5:9, 1)
  picked <- sample(letters, n_chars, replace = TRUE)
  paste(picked, collapse = "")
}
ch <- replicate(1e5, foo())
ch <- unique(ch)

## dplyr_data.table_mini_benchmark.R
# version 1.8.11
require(data.table)
# Loading required package: data.table
# data.table 1.8.11  For help type: help("data.table")

## create a huge data.table:
## -------------------------
set.seed(1)
N <- 2e7 # size of DT

## DT_1.8.10vs1.8.11.R
# version 1.8.11 (commit 1048)
require(data.table)
# Loading required package: data.table
# data.table 1.8.11  For help type: help("data.table")

## create a huge data.table:
## -------------------------
set.seed(1)
N <- 2e7 # size of DT

## pandas_data.table.py
from pandas import *
from pandas.util.testing import rands
import random

N = 10000
ngroups = 10

def get_test_data(ngroups=100, n=N):
    unique_groups = range(ngroups)
    arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object)
	# The post with benchmarking results is the link given below:
	# http://stackoverflow.com/questions/18003717/is-there-any-efficient-way-than-rbind-filllist/18004698#18004698

	# This is the script with which the benchmarking and plots were generated in case anyone else wants to replicate it.
	# Note: it takes about 2-3 hours for the benchmarking to finish.

	require(plyr)
	require(data.table)
	require(ggplot2)
	require(microbenchmark)
	require(dplyr)
	require(data.table)

	foo <- function(N) {

	group_sizes = 10^(1:(log10(N)-1L))
	uniqval <- unique(runif(2*N))

	fans <- vector("list", length(group_sizes))
	for (i in seq_along(group_sizes)) {
	require(data.table)

	set.seed(1L)
	DT1 <- data.table(x=sample(1e7), y=as.numeric(sample(1e7)), z=sample(letters, 1e7, TRUE))
	DT2 <- copy(DT1)

	val <- runif(1e7)

	# 'set' seems faster when adding 1-column
	# =======================================
	# here's some sample data to test it out
	require(data.table)
	require(dplyr)
	set.seed(45)
	DF <- data.frame(x=sample(3, 25, TRUE), y=1:25, z=26:50)
	DP <- tbl_df(DF) # for DPLYR data.frame object
	DT <- data.table(DF)

	# 1) row-wise subset (usually based on conditions):
	require(data.table)

	# let's create a huge data.table
	set.seed(1)
	N <- 2e7 # size of DT

	# generate a character vector of length about 1e5
	# Build one random lowercase string whose length is drawn from 5..9.
	foo <- function() {
	  n_chars <- sample(5:9, 1)
	  picked <- sample(letters, n_chars, replace = TRUE)
	  paste(picked, collapse = "")
	}
	ch <- replicate(1e5, foo())
	ch <- unique(ch)
	# version 1.8.11
	require(data.table)
	# Loading required package: data.table
	# data.table 1.8.11 For help type: help("data.table")

	## create a huge data.table:
	## -------------------------
	set.seed(1)
	N <- 2e7 # size of DT
	# version 1.8.11 (commit 1048)
	require(data.table)
	# Loading required package: data.table
	# data.table 1.8.11 For help type: help("data.table")

	## create a huge data.table:
	## -------------------------
	set.seed(1)
	N <- 2e7 # size of DT
	from pandas import *
	from pandas.util.testing import rands
	import random

	N = 10000
	ngroups = 10

	def get_test_data(ngroups=100, n=N):
	unique_groups = range(ngroups)
	arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object)