Skip to content

Instantly share code, notes, and snippets.

@romainfrancois
Last active December 26, 2015 11:49
Show Gist options
  • Save romainfrancois/7146128 to your computer and use it in GitHub Desktop.
Save romainfrancois/7146128 to your computer and use it in GitHub Desktop.
Benchmark sorted vs. unsorted grouped summarise
#include <dplyr.h>
// [[Rcpp::depends(BH,dplyr)]]
using namespace Rcpp ;
using namespace dplyr ;
class IndexHelper {
public:
IndexHelper( int pos_, int n_) : pos(pos_), n(n_){}
inline int size() const { return n ; }
inline int operator[](int i) const { return pos + i ; }
private:
int pos, n ;
} ;
// [[Rcpp::export]]
SEXP summarise_not_sorted( GroupedDataFrame gdf ){
DataFrame df = gdf.data() ;
int n = gdf.ngroups() ;
NumericVector out(n);
Language call( "mean_", Rf_install("AB") );
Language::Proxy proxy(call,1) ;
IntegerVector AB = df["AB"] ;
Armor<SEXP> res ;
for( int i=0; i<n; i++){
proxy = wrap_subset<INTSXP>(AB, gdf.group(i) ) ;
res = call.fast_eval() ;
out[i] = REAL(res)[0];
}
return out ;
}
// [[Rcpp::export]]
SEXP summarise_not_sorted_less_allocations( GroupedDataFrame gdf, int biggest ){
IntegerVector chunk( biggest );
int* chunk_start = chunk.begin() ;
DataFrame df = gdf.data() ;
int n = gdf.ngroups() ;
NumericVector out(n);
Language call( "mean_", chunk );
IntegerVector AB = df["AB"] ;
int* ptr = AB.begin() ;
Armor<SEXP> res;
for( int i=0; i<n; i++){
Index_0_based index = gdf.group(i) ;
int chunk_size = index.size() ;
SETLENGTH(chunk, chunk_size ) ;
for( int j=0; j<chunk_size; j++)
chunk_start[j] = ptr[index[j] ] ;
res = call.fast_eval() ;
out[i] = REAL(res)[0];
}
SETLENGTH(chunk, biggest) ;
return out ;
}
// [[Rcpp::export]]
SEXP summarise_not_sorted_internal( GroupedDataFrame gdf ){
DataFrame df = gdf.data() ;
int n = gdf.ngroups() ;
NumericVector out(n);
IntegerVector AB = df["AB"] ; int* ptr = AB.begin() ;
for( int i=0; i<n; i++){
out[i] = dplyr::internal::Mean_internal<INTSXP,false,Index_0_based>::process( ptr, gdf.group(i) ) ;
}
return out ;
}
// [[Rcpp::export]]
List helper_group_sizes( IntegerVector groups, int ngroups){
IntegerVector out(ngroups) ;
int n = groups.size() ;
int biggest = 0 ;
for( int i=0, j=0; i<ngroups; i++){
int current = groups[j] ;
int count = 0 ;
while( groups[j] == current){
j++;
count++ ;
}
out[i] = count ;
biggest = std::max(biggest, count) ;
}
return List::create( out, biggest ) ;
}
// [[Rcpp::export]]
SEXP summarise_sorted( DataFrame df, IntegerVector sizes, int biggest ){
IntegerVector chunk( biggest );
int* chunk_start = chunk.begin() ;
int n = sizes.size() ;
NumericVector out(n);
IntegerVector AB = df["AB"] ;
int* ptr = AB.begin() ;
Language call( "mean_", chunk );
int pos = 0 ;
Armor<SEXP> res ;
for( int i=0; i<n; i++){
SETLENGTH( chunk, sizes[i] ) ;
memcpy( chunk_start, ptr + pos, sizes[i] * sizeof(int) ) ;
res = call.fast_eval() ;
out[i] = REAL(res)[0] ;
pos += sizes[i] ;
}
SETLENGTH(chunk, biggest) ;
return out ;
}
// [[Rcpp::export]]
SEXP summarise_sorted_internal( DataFrame df, IntegerVector sizes, int biggest ){
int n = sizes.size() ;
NumericVector out(n);
IntegerVector AB = df["AB"] ;
int* ptr = AB.begin() ;
int pos = 0 ;
for( int i=0; i<n; i++){
out[i] = dplyr::internal::Mean_internal<INTSXP,false,IndexHelper>::process( ptr, IndexHelper(pos, sizes[i] ) ) ;
pos += sizes[i] ;
}
return out ;
}
/*** R
library(Rcpp)
library(dplyr)
library(microbenchmark)
library(data.table)
library(Lahman)
batting_cpp <- tbl_cpp(Batting)
players_cpp <- group_by(batting_cpp, playerID)
players_cpp_sorted <- arrange( batting_cpp, playerID)
mean_ <- function(.) .Internal(mean(.))
groups <- match(players_cpp_sorted$playerID, players_cpp_sorted$playerID)
ngroups <- length( attr( players_cpp, "index" ) )
helps <- helper_group_sizes( groups, ngroups )
group_sizes <- helps[[1]]
biggest <- helps[[2]]
res1 <- sort( summarise_not_sorted( players_cpp ) )
res2 <- sort( summarise_sorted( players_cpp_sorted, group_sizes, biggest ) )
identical(res1, res2)
options( width = 150, digits= 4 )
microbenchmark(
arrange = arrange( batting_cpp, playerID),
unsorted = summarise_not_sorted( players_cpp ),
unsorted_less_alloc = summarise_not_sorted_less_allocations( players_cpp, biggest ),
sorted = summarise_sorted( players_cpp_sorted, group_sizes, biggest ),
unsorted_internal = summarise_not_sorted_internal( players_cpp ),
sorted_internal = summarise_sorted_internal( players_cpp_sorted, group_sizes, biggest )
)
*/
> microbenchmark(arrange = arrange(batting_cpp, playerID),
+ unsorted = summarise_not_sorted(players_cpp), unsorted_less_alloc = summarise_not_so .... [TRUNCATED]
Unit: microseconds
expr min lq median uq max neval
arrange 10593.2 11060.1 11527.4 13168.2 50620.5 100
unsorted 12194.0 13007.9 13932.9 14681.5 50063.7 100
unsorted_less_alloc 10379.6 11053.8 11384.9 12434.8 49670.9 100
sorted 8908.2 9283.6 9426.6 10703.8 49739.7 100
unsorted_internal 877.9 1051.2 1200.2 1363.1 1459.1 100
sorted_internal 612.2 631.5 672.7 695.2 800.9 100
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment