Skip to content

Instantly share code, notes, and snippets.

@romainfrancois
Created December 30, 2013 19:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save romainfrancois/8187193 to your computer and use it in GitHub Desktop.
Save romainfrancois/8187193 to your computer and use it in GitHub Desktop.
unordered pairs. This thread http://permalink.gmane.org/gmane.comp.lang.r.rcpp/6562 on Rcpp-devel
ndiff <- 2597797
f <- function(){
n <- round(runif( 1, min = 4, max = 10 ))
paste( sample(letters, n, replace = TRUE ), collapse = "" )
}
a <- replicate( ndiff, f() )
b <- replicate( ndiff, f() )
rows <- sample( 1:ndiff, 3018671, replace = T )
cols <- sample( c(T,F), 3018671, replace = T )
m <- cbind(
ifelse(cols, a, b),
ifelse(cols, b, a)
)[rows,]
#include <Rcpp.h>
using namespace Rcpp ;
class StringPairHash{
public:
StringPairHash( const StringMatrix& data_, int colA, int colB ) : data(data_){
int nr = data.nrow() ;
SEXP* start = Rcpp::internal::r_vector_start<STRSXP>(data) ;
columnA = start + nr*(colA-1) ;
columnB = start + nr*(colB-1) ;
}
size_t operator()(int i){
SEXP x = columnA[i] ;
SEXP y = columnB[i] ;
if( x < y ){
return hash(x,y) ;
} else {
return hash(y,x) ;
}
}
private:
inline size_t hash(SEXP first, SEXP second){
std::hash<SEXP> hasher ;
size_t seed = hasher(first) ;
seed ^= hasher(second) + 0x9e3779b9 + (seed<<6) + (seed>>2);
return seed ;
}
StringMatrix data ;
SEXP* columnA ;
SEXP* columnB ;
} ;
class StringPairEqual{
public:
StringPairEqual( const StringMatrix& data_, int colA, int colB ) : data(data_){
int nr = data.nrow() ;
SEXP* start = Rcpp::internal::r_vector_start<STRSXP>(data) ;
columnA = start + nr*(colA-1) ;
columnB = start + nr*(colB-1) ;
}
bool operator()(int i, int j){
SEXP xA = columnA[i], xB = columnB[i] ;
SEXP yA = columnA[j], yB = columnB[j] ;
return ( xA == yA && xB == yB ) || ( xA == yB && xB == yA ) ;
}
private:
StringMatrix data ;
SEXP* columnA ;
SEXP* columnB ;
} ;
typedef std::unordered_set<int,StringPairHash,StringPairEqual> Set ;
// [[Rcpp::export]]
SEXP unordered_pairs( StringMatrix input, int colA, int colB ){
StringPairHash hasher( input, colA, colB) ;
StringPairEqual equal( input, colA, colB) ;
Set set( 1024, hasher, equal ) ;
int nr = input.nrow() ;
for( int i=0; i<nr; i++)
set.insert(i) ;
int n = set.size() ;
StringMatrix out(n,2) ;
Set::const_iterator it = set.begin() ;
for( int i=0; i<n; i++, ++it){
int j = *it ;
out(i,0) = input(i,colA-1) ;
out(i,1) = input(i,colB-1) ;
}
return out ;
}
> microbenchmark( unordered_pairs( m, 1, 2 ), times=10 )
Unit: seconds
expr min lq median uq max neval
unordered_pairs(m, 1, 2) 2.534141 2.536889 2.556607 2.714883 2.804312 10
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment