Created
January 20, 2013 12:13
-
-
Save dobrokot/4578190 to your computer and use it in GitHub Desktop.
modified c++ code from http://lemire.me/blog/archives/2012/07/23/is-cc-worth-it/#comments
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Fast computations of cumulative sums. | |
* D. Lemire, July 2012 | |
* | |
* Best results with: | |
* | |
$ g++-4.7 -funroll-loops -O3 -o cumulsum cumulsum.cpp | |
$ ./unrolldeltas | |
but we want to avoid the -funroll-loops flag. | |
*/ | |
// This may require GCC 4.4/ | |
#include <sys/stat.h> | |
#include <sys/time.h> | |
#include <sys/types.h> | |
#include <iostream> | |
#include <cassert> | |
#include <vector> | |
#include <stdexcept> | |
using namespace std; | |
class WallClockTimer { | |
public: | |
struct timeval t1, t2; | |
WallClockTimer() : | |
t1(), t2() { | |
gettimeofday(&t1, 0); | |
t2 = t1; | |
} | |
void reset() { | |
gettimeofday(&t1, 0); | |
t2 = t1; | |
} | |
int elapsed() { | |
return ((t2.tv_sec - t1.tv_sec) * 1000) + ((t2.tv_usec - t1. tv_usec) | |
/ 1000); | |
} | |
int split() { | |
gettimeofday(&t2, 0); | |
return elapsed(); | |
} | |
}; | |
int* givemeanarray_simple(size_t N) { | |
int* bigarray; | |
bigarray=new int[N]; | |
for(unsigned int k=0;k<N;k++) | |
bigarray[k]=(k+k/3); | |
return bigarray; | |
} | |
vector<int> givemeanarray(size_t N) { | |
vector<int> bigarray; | |
bigarray.reserve(N); | |
for(unsigned int k = 0; k<N; ++k) | |
bigarray.push_back(k+k/3); | |
return bigarray; | |
} | |
void straightsum(int * data, size_t size) { | |
if(size == 0) return; | |
for (size_t i = 1; i != size; ++i) { | |
data[i] += data[i - 1] ; | |
} | |
} | |
void slowishSum(vector<int> & data) { | |
if(data.size() == 0) return; | |
for (size_t i = 1; i != data.size(); ++i) { | |
data[i] += data[i - 1] ; | |
} | |
} | |
// By David Stocking | |
// (reported as buggy?) | |
void iterSum(vector<int> & data) { | |
if(data.size() == 0) return; | |
for ( vector<int>::iterator i=data.begin(); i != data.end(); ++i ) { | |
*(i+1) += *i; | |
} | |
} | |
// by Nathanael Schaeffer | |
void sum2(vector<int> & data) { | |
size_t n = data.size(); | |
if(n == 0) return; | |
int s0 = 0; | |
int s1 = 0; | |
size_t i = 0; | |
for (; i < n - 1; i+=2) { // keep things aligned, starting at 0 (not 1) | |
int x0 = data[i]; | |
int x1 = data[i+1]; | |
s0 = s1+x0; | |
s1 += (x1+x0); | |
data[i] = s0; | |
data[i+1] = s1; | |
} | |
for (; i<n; i++) { | |
data[i] += data[i-1]; | |
} | |
} | |
// by Nathanael Schaeffer | |
void sum3(vector<int> & data) { | |
size_t n = data.size(); | |
if(n == 0) return; | |
int s0 = 0; | |
int s1 = 0; | |
int s2 = 0; | |
size_t i = 0; | |
for (; i < n - 2; i+=3) { // keep things aligned, starting at 0 (not 1) | |
int x0 = data[i]; | |
int x1 = data[i+1]; | |
int x2 = data[i+2]; | |
s0 = s2 + x0; // three independant adds | |
s1 = s2 + (x1+x0); // are performed in parallel | |
s2 = s0 + (x1+x2); // on modern x86. | |
data[i] = s0; | |
data[i+1] = s1; | |
data[i+2] = s2; | |
} | |
for (; i<n; i++) { | |
data[i] += data[i-1]; | |
} | |
} | |
// By Vasily Volkov, improved by Daniel Lemire | |
void fastSum(vector<int> & data) { | |
if(data.size() == 0) return; | |
const size_t UnrollQty = 4; | |
size_t sz0 = (data.size() / UnrollQty) * UnrollQty; // equal to 0, if data.size() < UnrollQty | |
size_t i = 1; | |
if (sz0>=UnrollQty) { | |
int a = data[0]; | |
for (; i < sz0 - UnrollQty ; i += UnrollQty) { | |
a = (data[i] += a); | |
a = (data[i + 1] += a); | |
a = (data[i + 2] += a); | |
a = (data[i + 3] += a); | |
} | |
} | |
for (; i != data.size(); ++i) { | |
data[i] += data[i- 1] ; | |
} | |
} | |
// loop unrolling helps avoid the need for -funroll-loops | |
void sum(vector<int> & data) { | |
if(data.size() == 0) return; | |
size_t i = 1; | |
for (; i < data.size() - 1; i+=2) { | |
data[i] += data[i - 1]; | |
data[i+1] += data[i ]; | |
} | |
for (; i != data.size(); ++i) { | |
data[i] += data[i - 1]; | |
} | |
} | |
void f1(std::vector<int> &data) { | |
if (data.empty()) return; | |
for (size_t i = 1; i != data.size(); ++i) { | |
data[i] += data[i - 1] ; | |
} | |
} | |
void f2(std::vector<int> &data) { | |
int a = 0; | |
int *data_ptr = data.empty() ? 0 : &data[0]; //should be container method... | |
for (size_t i = 0, i_end = data.size(); i != i_end; ++i) { | |
a += data_ptr[i]; | |
data_ptr[i] = a; | |
} | |
} | |
void f3(std::vector<int> &data) { | |
int a = 0; | |
int *data_ptr = data.empty() ? 0 : &data[0]; //should be container method... | |
size_t n = data.size(); | |
for (size_t i = 0, i_end = n&(~size_t(3)); i < i_end; i += 4) { | |
a += data_ptr[i]; | |
size_t a0 = a; | |
a += data_ptr[i+1]; | |
size_t a1 = a; | |
a += data_ptr[i+2]; | |
size_t a2 = a; | |
a += data_ptr[i+3]; | |
size_t a3 = a; | |
data_ptr[i] = a0; | |
data_ptr[i+1] = a1; | |
data_ptr[i+2] = a2; | |
data_ptr[i+3] = a3; | |
} | |
for (size_t i = n&(~size_t(3)); i < n; ++i) { | |
a += data_ptr[i]; | |
data_ptr[i] = a; | |
} | |
} | |
void test(size_t N ) { | |
WallClockTimer time; | |
int bogus = 0; | |
cout<<"N = "<<N<<endl; | |
for(int t = 0; t<15;++t) { | |
cout <<" test # "<< t<<endl; | |
vector<int> data = givemeanarray(N) ; | |
vector<int> copydata(data); | |
time.reset(); | |
straightsum(&data[0],N); | |
cout<<":straight sum (C-like) "<<N/(1000.0*time.split())<<endl; | |
bogus += data.back(); | |
data = copydata; | |
time.reset(); | |
slowishSum(data); | |
cout<<":basic sum (C++-like) "<<N/(1000.0*time.split())<<endl; | |
bogus += data.back(); | |
data = copydata; | |
time.reset(); | |
iterSum(data); | |
cout<<":iterator-based sum (C++-like) "<<N/(1000.0*time.split())<<endl; | |
bogus += data.back(); | |
data = copydata; | |
time.reset(); | |
sum2(data); | |
cout<<":smarter sum2 "<<N/(1000.0*time.split())<<endl; | |
time.reset(); | |
sum3(data); | |
cout<<":smarter sum3 "<<N/(1000.0*time.split())<<endl; | |
time.reset(); | |
sum(data); | |
cout<<":smarter sum "<<N/(1000.0*time.split())<<endl; | |
time.reset(); | |
f1(data); | |
cout<<":base line 1 "<<N/(1000.0*time.split())<<endl; | |
time.reset(); | |
f2(data); | |
cout<<":winnie sum 2 "<<N/(1000.0*time.split())<<endl; | |
time.reset(); | |
f3(data); | |
cout<<":winnie sum 3 "<<N/(1000.0*time.split())<<endl; | |
bogus += data.back(); | |
data = copydata; | |
time.reset(); | |
fastSum(data); | |
cout<<":fast sum "<<N/(1000.0*time.split())<<endl; | |
bogus += data.back(); | |
cout<<endl<<endl<<endl; | |
} | |
cout<<"ignore this: "<<bogus<<endl; | |
} | |
int main() { | |
cout << "reporting speed in million of integers per second "<<endl; | |
size_t N = 50 * 1000 * 1000 ; | |
for (int i = 0; i < 10; ++i) { | |
cout<<"============"<<endl; | |
test(N); | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment