|
#include <dplyr.h> |
|
// [[Rcpp::depends(dplyr,BH)]] |
|
|
|
using namespace Rcpp ; |
|
using namespace dplyr ; |
|
|
|
// Appends every element of `y` to the end of `x`, preserving the order of `y`.
//
// x: destination container; must provide end() and a range insert().
// y: source container; must provide begin()/end() and a const_iterator typedef.
template <typename TargetContainer, typename SourceContainer>
void push_back( TargetContainer& x, const SourceContainer& y ){
    typename SourceContainer::const_iterator first = y.begin() ;
    typename SourceContainer::const_iterator last  = y.end() ;
    // Range insert at the tail: one call rather than element-wise appends.
    x.insert( x.end(), first, last ) ;
}
|
// Appends `n` copies of `value` to the end of `x`.
//
// x:     destination container; only push_back() is required of it.
// value: element to append.
// n:     number of copies; zero or a negative count appends nothing.
template <typename Container>
void push_back( Container& x, typename Container::value_type value, int n ){
    int remaining = n ;
    while( remaining > 0 ){
        x.push_back( value ) ;
        --remaining ;
    }
}
|
|
|
// [[Rcpp::export]] |
|
void training_data_1( DataFrame data){ |
|
CharacterVector vars = Rf_getAttrib( data.attr( "vars" ), R_NamesSymbol ) ; |
|
|
|
DataFrameVisitors visitors(data, vars) ; |
|
ChunkIndexMap map( visitors ) ; |
|
train_push_back( map, data.nrows() ) ; |
|
|
|
} |
|
|
|
// [[Rcpp::export]] |
|
void training_data_2( DataFrame data){ |
|
CharacterVector vars = Rf_getAttrib( data.attr( "vars" ), R_NamesSymbol ) ; |
|
|
|
DataFrameVisitors visitors(data, vars) ; |
|
ChunkIndexMap map( visitors ) ; |
|
train_push_back( map, data.nrows() ) ; |
|
|
|
DataFrame labels = visitors.subset( map, "data.frame") ; |
|
int ngroups = labels.nrows() ; |
|
|
|
OrderVisitors order_labels( labels, vars ) ; |
|
IntegerVector orders = order_labels.apply() ; |
|
|
|
} |
|
|
|
// [[Rcpp::export]] |
|
void training_data_3( DataFrame data){ |
|
CharacterVector vars = Rf_getAttrib( data.attr( "vars" ), R_NamesSymbol ) ; |
|
|
|
DataFrameVisitors visitors(data, vars) ; |
|
ChunkIndexMap map( visitors ) ; |
|
train_push_back( map, data.nrows() ) ; |
|
|
|
DataFrame labels = visitors.subset( map, "data.frame") ; |
|
int ngroups = labels.nrows() ; |
|
|
|
OrderVisitors order_labels( labels, vars ) ; |
|
IntegerVector orders = order_labels.apply() ; |
|
|
|
std::vector< const std::vector<int>* > chunks(ngroups) ; |
|
ChunkIndexMap::const_iterator it = map.begin() ; |
|
for( int i=0; i<ngroups; i++, ++it){ |
|
chunks[ i ] = &it->second ; |
|
} |
|
IntegerVector group_sizes = no_init( ngroups ); |
|
int biggest_group = 0 ; |
|
std::vector<int> indices ; |
|
indices.reserve( data.nrows() ); |
|
for( int i=0; i<ngroups; i++){ |
|
const std::vector<int>& chunk = *chunks[orders[i]] ; |
|
push_back( indices, chunk ) ; |
|
biggest_group = std::max( biggest_group, (int)chunk.size() ); |
|
group_sizes[i] = chunk.size() ; |
|
} |
|
|
|
} |
|
|
|
// [[Rcpp::export]] |
|
void training_data_4( DataFrame data){ |
|
CharacterVector vars = Rf_getAttrib( data.attr( "vars" ), R_NamesSymbol ) ; |
|
|
|
DataFrameVisitors visitors(data, vars) ; |
|
ChunkIndexMap map( visitors ) ; |
|
train_push_back( map, data.nrows() ) ; |
|
|
|
DataFrame labels = visitors.subset( map, "data.frame") ; |
|
int ngroups = labels.nrows() ; |
|
|
|
OrderVisitors order_labels( labels, vars ) ; |
|
IntegerVector orders = order_labels.apply() ; |
|
|
|
std::vector< const std::vector<int>* > chunks(ngroups) ; |
|
ChunkIndexMap::const_iterator it = map.begin() ; |
|
for( int i=0; i<ngroups; i++, ++it){ |
|
chunks[ i ] = &it->second ; |
|
} |
|
IntegerVector group_sizes = no_init( ngroups ); |
|
int biggest_group = 0 ; |
|
std::vector<int> indices ; |
|
indices.reserve( data.nrows() ); |
|
for( int i=0; i<ngroups; i++){ |
|
const std::vector<int>& chunk = *chunks[orders[i]] ; |
|
push_back( indices, chunk ) ; |
|
biggest_group = std::max( biggest_group, (int)chunk.size() ); |
|
group_sizes[i] = chunk.size() ; |
|
} |
|
|
|
DataFrameVisitors all_variables_visitors(data, data.names() ) ; |
|
data = all_variables_visitors.subset( indices, classes_grouped() ) ; |
|
|
|
} |
|
|
|
|
|
// [[Rcpp::export]] |
|
void training_data_5( DataFrame data){ |
|
CharacterVector vars = Rf_getAttrib( data.attr( "vars" ), R_NamesSymbol ) ; |
|
|
|
DataFrameVisitors visitors(data, vars) ; |
|
ChunkIndexMap map( visitors ) ; |
|
train_push_back( map, data.nrows() ) ; |
|
|
|
DataFrame labels = visitors.subset( map, "data.frame") ; |
|
int ngroups = labels.nrows() ; |
|
|
|
OrderVisitors order_labels( labels, vars ) ; |
|
IntegerVector orders = order_labels.apply() ; |
|
|
|
std::vector< const std::vector<int>* > chunks(ngroups) ; |
|
ChunkIndexMap::const_iterator it = map.begin() ; |
|
for( int i=0; i<ngroups; i++, ++it){ |
|
chunks[ i ] = &it->second ; |
|
} |
|
IntegerVector group_sizes = no_init( ngroups ); |
|
int biggest_group = 0 ; |
|
std::vector<int> indices ; |
|
indices.reserve( data.nrows() ); |
|
for( int i=0; i<ngroups; i++){ |
|
const std::vector<int>& chunk = *chunks[orders[i]] ; |
|
push_back( indices, chunk ) ; |
|
biggest_group = std::max( biggest_group, (int)chunk.size() ); |
|
group_sizes[i] = chunk.size() ; |
|
} |
|
|
|
DataFrameVisitors all_variables_visitors(data, data.names() ) ; |
|
data = all_variables_visitors.subset( indices, classes_grouped() ) ; |
|
|
|
// TODO: we own labels, so perhaps we can do an inplace sort, |
|
// to reuse its memory instead of creating a new data frame |
|
DataFrameVisitors labels_visitors( labels, vars) ; |
|
|
|
labels = labels_visitors.subset( orders, "data.frame" ) ; |
|
labels.attr( "vars" ) = R_NilValue ; |
|
|
|
data.attr( "group_sizes") = group_sizes ; |
|
data.attr( "biggest_group_size" ) = biggest_group ; |
|
data.attr( "labels" ) = labels ; |
|
} |