
@romainfrancois
Created November 25, 2013 17:16
Decomposing `build_index_cpp`
require(dplyr)
require(Rcpp)
require(microbenchmark)
# from the cran-logs-dply repo
logs <- readRDS("logs.rds")
# system.time( { by_day <- group_by(logs, date) } )
sourceCpp( "timings.cpp" )
vars <- list( date = as.name("date" ) )
attr( logs, "vars" ) <- vars
attr( logs, "drop" ) <- TRUE
microbenchmark(
training_data_1(logs),
training_data_2(logs),
training_data_3(logs),
training_data_4(logs),
training_data_5(logs),
dplyr:::build_index_cpp(logs),
times = 5
)

`build_index_cpp` is the workhorse behind `group_by.tbl_cpp`.
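
For reference, the benchmark calls it directly by setting the attributes it reads by hand (the `vars` and `drop` lines above) instead of going through `group_by`. A minimal sketch of that call, assuming `build_index_cpp` returns the indexed data frame carrying the attributes that `training_data_5` below attaches:

# sketch: drive build_index_cpp directly, as the benchmark does
# (assumes it returns the data frame with the grouping attributes attached)
attr( logs, "vars" ) <- list( date = as.name("date") )
attr( logs, "drop" ) <- TRUE
by_day <- dplyr:::build_index_cpp( logs )
names( attributes( by_day ) )
# should include "group_sizes", "biggest_group_size" and "labels"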

> microbenchmark(
+   training_data_1(logs),
+   training_data_2(logs),
+   training_data_3(logs),
+   training_data_4(logs),
+   training_data_5(logs),
+   dplyr:::build_index_cpp(logs),
+   times = 5
+ )
Unit: seconds
                          expr      min       lq    median        uq       max
         training_data_1(logs) 1.285893 1.442886  1.462196  1.479791  1.515522
         training_data_2(logs) 1.263335 1.271388  1.277310  1.450229  1.505441
         training_data_3(logs) 1.363866 1.380815  1.382819  1.617304  1.773812
         training_data_4(logs) 5.888116 7.290060  7.420711  9.225859  9.700325
         training_data_5(logs) 7.282824 7.306273  7.532543  9.732684 17.104558
 dplyr:::build_index_cpp(logs) 5.462359 7.304617 10.042755 11.679634 28.748817

What they do, step by step (the incremental cost of each step is worked out in the sketch after this list):

  • `training_data_1` only trains a `ChunkIndexMap`.
  • `training_data_2` does `training_data_1` and extracts the labels.
  • `training_data_3` does `training_data_2` and calculates a vector of indices.
  • `training_data_4` does `training_data_3` and subsets the data with these indices.
  • `training_data_5` does `training_data_4` and reorders the labels.
  • `dplyr:::build_index_cpp` does the same as `training_data_5`.
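
Reading the medians off the benchmark output, the incremental cost of each step is easy to see. A quick sketch, with the numbers copied from the table above (in seconds):

# median timings (seconds) from the microbenchmark output above
medians <- c( step1 = 1.462196, step2 = 1.277310, step3 = 1.382819,
              step4 = 7.420711, step5 = 7.532543 )
round( diff( medians ), 2 )
# steps 1 to 3 all cost about the same; step 4, the subset, adds roughly 6 seconds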

So clearly these lines (the difference between `training_data_3` and `training_data_4`):

DataFrameVisitors all_variables_visitors(data, data.names() ) ;
data = all_variables_visitors.subset( indices, classes_grouped() ) ;

are where the big bump happens.
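
That subset touches every column of the full log data, not just the grouping column, and copies it all into a new data frame in grouped order. At the R level it is roughly equivalent to the following (a sketch of the cost, not what dplyr actually runs):

# roughly what the all-columns subset amounts to:
# a complete copy of every column of logs, rows rearranged into grouped order
grouped <- logs[ order(logs$date), , drop = FALSE ]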

// timings.cpp
#include <dplyr.h>
// [[Rcpp::depends(dplyr,BH)]]

using namespace Rcpp ;
using namespace dplyr ;

// append the whole of y at the end of x
template <typename TargetContainer, typename SourceContainer>
void push_back( TargetContainer& x, const SourceContainer& y ){
    x.insert( x.end(), y.begin(), y.end() ) ;
}

// append value n times at the end of x
template <typename Container>
void push_back( Container& x, typename Container::value_type value, int n ){
    for( int i=0; i<n; i++)
        x.push_back( value ) ;
}
// [[Rcpp::export]]
void training_data_1( DataFrame data){
    // step 1: hash the grouping variables into a ChunkIndexMap
    // (one entry per group, holding the row indices of that group)
    CharacterVector vars = Rf_getAttrib( data.attr( "vars" ), R_NamesSymbol ) ;
    DataFrameVisitors visitors(data, vars) ;
    ChunkIndexMap map( visitors ) ;
    train_push_back( map, data.nrows() ) ;
}
// [[Rcpp::export]]
void training_data_2( DataFrame data){
    // step 1: train the ChunkIndexMap
    CharacterVector vars = Rf_getAttrib( data.attr( "vars" ), R_NamesSymbol ) ;
    DataFrameVisitors visitors(data, vars) ;
    ChunkIndexMap map( visitors ) ;
    train_push_back( map, data.nrows() ) ;

    // step 2: extract the labels (one row per group) and compute their order
    DataFrame labels = visitors.subset( map, "data.frame") ;
    int ngroups = labels.nrows() ;
    OrderVisitors order_labels( labels, vars ) ;
    IntegerVector orders = order_labels.apply() ;
}
// [[Rcpp::export]]
void training_data_3( DataFrame data){
    // steps 1 and 2, as in training_data_2
    CharacterVector vars = Rf_getAttrib( data.attr( "vars" ), R_NamesSymbol ) ;
    DataFrameVisitors visitors(data, vars) ;
    ChunkIndexMap map( visitors ) ;
    train_push_back( map, data.nrows() ) ;
    DataFrame labels = visitors.subset( map, "data.frame") ;
    int ngroups = labels.nrows() ;
    OrderVisitors order_labels( labels, vars ) ;
    IntegerVector orders = order_labels.apply() ;

    // step 3: concatenate the per-group chunks, in label order,
    // into a single vector of row indices
    std::vector< const std::vector<int>* > chunks(ngroups) ;
    ChunkIndexMap::const_iterator it = map.begin() ;
    for( int i=0; i<ngroups; i++, ++it){
        chunks[ i ] = &it->second ;
    }
    IntegerVector group_sizes = no_init( ngroups );
    int biggest_group = 0 ;
    std::vector<int> indices ;
    indices.reserve( data.nrows() );
    for( int i=0; i<ngroups; i++){
        const std::vector<int>& chunk = *chunks[orders[i]] ;
        push_back( indices, chunk ) ;
        biggest_group = std::max( biggest_group, (int)chunk.size() );
        group_sizes[i] = chunk.size() ;
    }
}
// [[Rcpp::export]]
void training_data_4( DataFrame data){
    // steps 1 to 3, as in training_data_3
    CharacterVector vars = Rf_getAttrib( data.attr( "vars" ), R_NamesSymbol ) ;
    DataFrameVisitors visitors(data, vars) ;
    ChunkIndexMap map( visitors ) ;
    train_push_back( map, data.nrows() ) ;
    DataFrame labels = visitors.subset( map, "data.frame") ;
    int ngroups = labels.nrows() ;
    OrderVisitors order_labels( labels, vars ) ;
    IntegerVector orders = order_labels.apply() ;
    std::vector< const std::vector<int>* > chunks(ngroups) ;
    ChunkIndexMap::const_iterator it = map.begin() ;
    for( int i=0; i<ngroups; i++, ++it){
        chunks[ i ] = &it->second ;
    }
    IntegerVector group_sizes = no_init( ngroups );
    int biggest_group = 0 ;
    std::vector<int> indices ;
    indices.reserve( data.nrows() );
    for( int i=0; i<ngroups; i++){
        const std::vector<int>& chunk = *chunks[orders[i]] ;
        push_back( indices, chunk ) ;
        biggest_group = std::max( biggest_group, (int)chunk.size() );
        group_sizes[i] = chunk.size() ;
    }

    // step 4: subset every column of data with these indices (the expensive part)
    DataFrameVisitors all_variables_visitors(data, data.names() ) ;
    data = all_variables_visitors.subset( indices, classes_grouped() ) ;
}
// [[Rcpp::export]]
void training_data_5( DataFrame data){
    // steps 1 to 4, as in training_data_4
    CharacterVector vars = Rf_getAttrib( data.attr( "vars" ), R_NamesSymbol ) ;
    DataFrameVisitors visitors(data, vars) ;
    ChunkIndexMap map( visitors ) ;
    train_push_back( map, data.nrows() ) ;
    DataFrame labels = visitors.subset( map, "data.frame") ;
    int ngroups = labels.nrows() ;
    OrderVisitors order_labels( labels, vars ) ;
    IntegerVector orders = order_labels.apply() ;
    std::vector< const std::vector<int>* > chunks(ngroups) ;
    ChunkIndexMap::const_iterator it = map.begin() ;
    for( int i=0; i<ngroups; i++, ++it){
        chunks[ i ] = &it->second ;
    }
    IntegerVector group_sizes = no_init( ngroups );
    int biggest_group = 0 ;
    std::vector<int> indices ;
    indices.reserve( data.nrows() );
    for( int i=0; i<ngroups; i++){
        const std::vector<int>& chunk = *chunks[orders[i]] ;
        push_back( indices, chunk ) ;
        biggest_group = std::max( biggest_group, (int)chunk.size() );
        group_sizes[i] = chunk.size() ;
    }
    DataFrameVisitors all_variables_visitors(data, data.names() ) ;
    data = all_variables_visitors.subset( indices, classes_grouped() ) ;

    // step 5: reorder the labels and attach the grouping attributes
    // TODO: we own labels, so perhaps we can do an inplace sort,
    // to reuse its memory instead of creating a new data frame
    DataFrameVisitors labels_visitors( labels, vars) ;
    labels = labels_visitors.subset( orders, "data.frame" ) ;
    labels.attr( "vars" ) = R_NilValue ;
    data.attr( "group_sizes") = group_sizes ;
    data.attr( "biggest_group_size" ) = biggest_group ;
    data.attr( "labels" ) = labels ;
}