Skip to content

Instantly share code, notes, and snippets.

@sehe
Created November 16, 2011 01:35
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sehe/1368992 to your computer and use it in GitHub Desktop.
Save sehe/1368992 to your computer and use it in GitHub Desktop.
#define MAGNITUDE 20
#define SAMPLES (1ul<<MAGNITUDE)
#define ITERATIONS 1024
#define VERIFICATION 1
#define VERBOSE 0
#define PROFILING 0
#define WORSTCASE 1
#define USE_FLOATS 0
#if !WORSTCASE
# define LIMITED_RANGE 1 // hide difference in output due to absense of overflows
#endif
/////////////////////////////
#if VERIFICATION
# include <boost/functional/hash.hpp>
#endif
#include <cassert>
#include <vector>
#include <algorithm>
#include <numeric>
#include <iterator>
#include <cstdint>
#include <random>
#include <iostream>
#include <chrono>
#if USE_FLOATS // UGLY hack, not?
typedef float element_t;
#define uniform_distribution uniform_real_distribution
#define xxx_ float_
#else
typedef int64_t element_t;
#define uniform_distribution uniform_int_distribution
#define xxx_ int_
#endif
#if VERBOSE
# include <boost/spirit/include/karma.hpp>
namespace
{
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
using namespace boost::spirit::karma;
return os << format("v(" << xxx_ % ",\t" << ")", v);
}
}
#endif
namespace original_code // the code from the Original Post
{
void theloop(const element_t in[], element_t out[], size_t N)
{
element_t max = in[0];
out[0] = max;
for(uint32_t i = 1; i < N; i++) {
element_t v = in[i];
max += v;
if (v > max) max = v;
out[i] = max;
}
}
template <typename C> void invoke(const C& data, C& output)
{
original_code::theloop(&data[0], &output[0], data.size());
}
}
namespace heuristic
{
inline element_t calc(element_t previous_max, element_t v)
{
previous_max += v;
if (v > previous_max)
previous_max = v;
return previous_max;
}
template <typename InIt, typename OutIt>
OutIt loopify(element_t &max, InIt& b, const InIt e, OutIt o)
{
do *o++ = (max = calc(max, *b++));
while (b!=e && (*b>max || *b<0));
return o;
}
template <typename C> void invoke(const C& data, C& output)
{
typedef typename C::const_iterator InIt;
typedef typename C::iterator OutIt;
InIt b = data.begin();
InIt e = data.end();
OutIt o = output.begin();
element_t max = 0l;
/*
*while (b!=e)
* o = loopify(max, b, e, o);
*/
#if PROFILING
size_t fastforward = 0, conventional = 0;
#endif
InIt m;
while (b!=e)
{
m = std::find_if(b,e, [](element_t i) { return i<0; });
#if PROFILING
if (b!=m) fastforward++;
#endif
//o = std::partial_sum(b,m,o, [&max](element_t prv, element_t v) { return max = calc(prv, v); });
//o = std::transform(b,m,o, [&max](element_t v) { return max = calc(max, v); });
o = std::transform(b,m,o, [&max](element_t v) { return max += v; });
if (m==e) break;
b = m;
m = e; // redundant
#if PROFILING
conventional++;
#endif
o = loopify(max,b,e,o);
}
#if PROFILING
if (m==e)
{
std::cerr << "heuristics: processed " << (m-data.begin()) << " out of " << data.size() << " (last region [" << (b-data.begin()) << ",+" << (m-b) << "), " << (e-m) << " remaining)" << std::endl;
std::cerr << "regions: " << conventional << " conventional, " << fastforward << " fastforward" << std::endl;
}
#endif
}
}
namespace measuring
{
auto ts = std::chrono::system_clock::now();
auto stopwatch_split() -> decltype(ts)
{
auto split = ts;
ts = std::chrono::system_clock::now();
return split;
}
size_t report(const std::string& msg, bool newline=true)
{
auto split = stopwatch_split();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(ts - split);
std::cout << msg << elapsed.count() << " msecs.";
if (newline)
std::cout << std::endl;
return elapsed.count();
}
template <typename C>
size_t run_benchmark(const C& data, C& output,
const std::string& label,
void(implementation)(const C&,C&),
size_t& reference_hash)
{
fill(output.begin(), output.end(), 0);
for (int i=0; i<ITERATIONS; ++i)
{
implementation(data, output);
}
#if VERIFICATION
size_t hash = boost::hash_range(output.begin(), output.end());
if (!reference_hash)
{
reference_hash = hash;
}
else
{
#if VERBOSE
if (reference_hash!=hash)
{
std::cerr << label << " hash mismatch! " << std::hex << hash << " (reference implementation: " << reference_hash << ")" << std::dec << std::endl;
std::cerr << "data: " << data << std::endl;
std::cerr << "output: " << output << std::endl;
original_code::invoke(data, output);
std::cerr << "ref output: " << output << std::endl;
}
else
{
//std::cerr << "data: " << data << std::endl;
//std::cerr << "output: " << output << std::endl;
//run_original_code(data, output);
//std::cerr << "ref output: " << output << std::endl;
}
#endif
#endif
}
size_t elapsed = report(label + " elapsed: ", false);
std::cout << " Hash " << std::hex << hash << std::dec << (hash==reference_hash? " ok" : " FAILED!") << std::endl;
return elapsed;
}
}
int main()
{
typedef std::vector<element_t> data_t;
data_t data(SAMPLES);
data_t output(data.size());
size_t total_op = 0ul,
total_heuristic = 0ul;
std::cout << "Benchmarking 100 x " << ITERATIONS << " iterations for data length " << data.size() << " (i.e. 100 different random seeds)" << std::endl;
for (size_t seed = 0; seed < 10; seed++)
{
std::mt19937 rng(seed);
std::uniform_distribution<element_t> gen; // uniform, unbiased
std::uniform_distribution<element_t> biased_worstcase(100,1115);
#if USE_FLOATS
std::uniform_distribution<element_t> prevent_overflow; // uniform, unbiased
std::uniform_distribution<element_t> negative_only(std::numeric_limits<element_t>::min(), 0);
std::uniform_distribution<element_t> positive_only(0, std::numeric_limits<int64_t>::max());
#else
std::uniform_distribution<element_t> prevent_overflow(
std::numeric_limits<element_t>::min()/SAMPLES,
std::numeric_limits<element_t>::max()/SAMPLES);
std::uniform_distribution<element_t> negative_only(std::numeric_limits<element_t>::min()/SAMPLES, 0);
std::uniform_distribution<element_t> positive_only(0, std::numeric_limits<element_t>::max()/SAMPLES);
#endif
#ifdef WORSTCASE
std::generate(data.begin(), data.end(), std::bind(positive_only, rng));
for (size_t i=0; i<SAMPLES; i += 2)
data[i] = -data[i];
#else
# if LIMITED_RANGE
std::generate(data.begin(), data.end(), std::bind(prevent_overflow, rng));
# else
std::generate(data.begin(), data.end(), std::bind(gen, rng));
# endif
#endif
measuring::report(" --- randomized from new seed --- ");
size_t reference_hash = 0ul;
total_op += measuring::run_benchmark(data, output, "OP's code ", original_code::invoke<data_t>, reference_hash);
total_heuristic += measuring::run_benchmark(data, output, "Heuristic code ", heuristic::invoke<data_t>, reference_hash);
}
std::cout << "total time in OP algorithm: " << total_op << " msecs" << std::endl;
std::cout << "total time in Heuristic algorithm: " << total_heuristic << " msecs" << std::endl;
std::cout << "speedup factor: " << (1.0*total_op)/(1.0*total_heuristic) << " x" << std::endl;
}
all:test heuristic
CPPFLAGS+= -std=c++0x
CPPFLAGS+= -Wall -g
CPPFLAGS+= -march=native -mtune=native
CPPFLAGS+= -Ofast
#CPPFLAGS+= -Ofast -ffast-math #-ffast-fp
# CPPFLAGS+=-D_GLIBCXX_PARALLEL -fopenmp
%:%.cpp
g++ $(CPPFLAGS) $^ -o $@
#define MAGNITUDE 20
#define ITERATIONS 1024
#define VERIFICATION 1
#define VERBOSE 0
#define LIMITED_RANGE 0 // hide difference in output due to absense of overflows
#define USE_FLOATS 0
/////////////////////////////
#if VERIFICATION
# include <boost/functional/hash.hpp>
#endif
#include <cassert>
#include <vector>
#include <algorithm>
#include <numeric>
#include <iterator>
#include <cstdint>
#include <random>
#include <iostream>
#include <chrono>
#if USE_FLOATS // UGLY hack, not?
#define int64_t float
#define uniform_int_distribution uniform_real_distribution
#endif
#if VERBOSE
# include <boost/spirit/include/karma.hpp>
namespace
{
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
using namespace boost::spirit::karma;
return os << format("v(" << auto_ % ", " << ")", v);
}
}
#endif
namespace stackoverflow // the code from the Original Post
{
void theloop(const int64_t in[], int64_t out[], size_t N)
{
int64_t max = in[0];
out[0] = max;
for(uint32_t i = 1; i < N; i++) {
int64_t v = in[i];
max += v;
if (v > max) max = v;
out[i] = max;
}
}
void unrolled_only(const int64_t in[], int64_t out[], size_t N)
{
int64_t max = 0;
assert(N % 4 == 0);
for(size_t i = 0; i < N; i+=4) {
int64_t t_in0 = in[i+0];
int64_t t_in1 = in[i+1];
int64_t t_in2 = in[i+2];
int64_t t_in3 = in[i+3];
max += t_in0;
if (t_in0 > max) max = t_in0;
out[i+0] = max;
max += t_in1;
if (t_in1 > max) max = t_in1;
out[i+1] = max;
max += t_in2;
if (t_in2 > max) max = t_in2;
out[i+2] = max;
max += t_in3;
if (t_in3 > max) max = t_in3;
out[i+3] = max;
}
}
void bitfiddle_hack(const int64_t in[], int64_t out[], size_t N)
{
#if !USE_FLOATS
int64_t max = 0;
assert(N % 4 == 0);
for(size_t i = 0; i < N; i++) {
int64_t v = in[i+0];
max &= -max >> (sizeof(int64_t)/sizeof(char)*8 - 1);
max += v;
out[i] = max;
}
#endif
}
}
template <typename C> void run_original_code(const C& data, C& output)
{
stackoverflow::theloop(&data[0], &output[0], data.size());
}
template <typename C> void unrolled_only(const C& data, C& output)
{
stackoverflow::unrolled_only(&data[0], &output[0], data.size());
}
template <typename C> void bitfiddle_hack(const C& data, C& output)
{
stackoverflow::bitfiddle_hack(&data[0], &output[0], data.size());
}
template <typename C> void partial_sum_correct(const C& data, C& output)
{
int64_t wrongmax = 0;
std::partial_sum(data.begin(), data.end(), output.begin(),
[&wrongmax](int64_t max, int64_t v) -> int64_t
{
max += v;
if (v > max) max = v;
wrongmax = wrongmax<0 ? v : wrongmax + v;
return max;
});
//int64_t offset = output.back() - wrongmax;
//std::cerr << "OFFSET is " << offset << std::endl;
}
template <typename C> void partial_sum_incorrect(const C& data, C& output)
{
std::partial_sum(data.begin(), data.end(), output.begin(),
[](int64_t max, int64_t v)
{
return max<0 ? v : max + v;
});
}
namespace measuring
{
auto ts = std::chrono::system_clock::now();
auto stopwatch_split() -> decltype(ts)
{
auto split = ts;
ts = std::chrono::system_clock::now();
return split;
}
void report(const std::string& msg, bool newline=true)
{
auto split = stopwatch_split();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(ts - split);
std::cout << msg << elapsed.count() << " msecs.";
if (newline)
std::cout << std::endl;
}
template <typename C>
void run_benchmark(const C& data, C& output,
const std::string& label,
void(implementation)(const C&,C&),
size_t& reference_hash)
{
for (int i=0; i<ITERATIONS; ++i)
{
implementation(data, output);
}
#if VERIFICATION
size_t hash = boost::hash_range(output.begin(), output.end());
if (reference_hash && (reference_hash!=hash))
{
#if VERBOSE
std::cerr << label << " hash mismatch! " << std::hex << hash << " (reference implementation: " << reference_hash << ")" << std::dec << std::endl;
std::cerr << "data: " << data << std::endl;
std::cerr << "output: " << output << std::endl;
run_original_code(data, output);
std::cerr << "ref output: " << output << std::endl;
#endif
}
else
{
reference_hash = hash;
}
#endif
report(label + " elapsed: ", false);
std::cout << " Hash " << std::hex << hash << std::dec << (hash==reference_hash? " ok" : " FAILED!") << std::endl;
}
}
int main()
{
typedef std::vector<int64_t> data_t;
data_t data(1ul << MAGNITUDE);
data_t output(data.size());
std::cout << "Benchmarking 100 x " << ITERATIONS << " iterations for data length " << data.size() << " (i.e. 100 different random seeds)" << std::endl;
for (size_t seed = 0; seed < 100; seed++)
{
std::mt19937 rng(seed);
#if LIMITED_RANGE
std::uniform_int_distribution<int64_t> gen(
std::numeric_limits<int64_t>::min() >> MAGNITUDE,
std::numeric_limits<int64_t>::max() >> MAGNITUDE);
std::generate(data.begin(), data.end(), std::bind(gen, rng));
#else
std::uniform_int_distribution<int64_t> gen;
std::generate(data.begin(), data.end(), std::bind(gen, rng));
#endif
measuring::report(" --- randomized from new seed --- ");
size_t reference_hash = 0ul;
measuring::run_benchmark(data, output, "OP's code ", run_original_code <data_t> , reference_hash);
// these handle overflow exactly like the original algorithm:
measuring::run_benchmark(data, output, "Unfold Only ", unrolled_only<data_t> , reference_hash);
measuring::run_benchmark(data, output, "STL correct ", partial_sum_correct<data_t> , reference_hash);
// these handle overflow differently from the original algorithm:
#if !USE_FLOATS
measuring::run_benchmark(data, output, "Bit Fiddle ", bitfiddle_hack<data_t> , reference_hash);
#endif
measuring::run_benchmark(data, output, "STL based ", partial_sum_incorrect <data_t>, reference_hash);
}
}
@sehe
Copy link
Author

sehe commented Nov 16, 2011

g++ -std=c++0x -Wall -g -march=native -mtune=native -O3 test.cpp -o test

Ouput fragment

Benchmarking 100 x 1024 iterations for data length 1048576 (i.e. 100 different random seeds)
 --- randomized from new seed --- 68 msecs.
OP's code    elapsed: 4972 msecs. Hash 72e2f1861319fb15 ok
Unfold Only  elapsed: 4962 msecs. Hash 72e2f1861319fb15 ok
STL correct  elapsed: 4990 msecs. Hash 72e2f1861319fb15 ok
Bit Fiddle   elapsed: 4961 msecs. Hash dda2e64a12de5d15 FAILED!
STL based    elapsed: 4953 msecs. Hash dda2e64a12de5d15 FAILED!
 --- randomized from new seed --- 59 msecs.
OP's code    elapsed: 5044 msecs. Hash 182b28b5ebd2f4a0 ok
Unfold Only  elapsed: 5099 msecs. Hash 182b28b5ebd2f4a0 ok
STL correct  elapsed: 5155 msecs. Hash 182b28b5ebd2f4a0 ok
Bit Fiddle   elapsed: 5303 msecs. Hash ceec63e494f99544 FAILED!
STL based    elapsed: 5331 msecs. Hash ceec63e494f99544 FAILED!
 --- randomized from new seed --- 59 msecs.
OP's code    elapsed: 5071 msecs. Hash dfa2bd5b64b9d54f ok
Unfold Only  elapsed: 5259 msecs. Hash dfa2bd5b64b9d54f ok
STL correct  elapsed: 5059 msecs. Hash dfa2bd5b64b9d54f ok
Bit Fiddle   elapsed: 5199 msecs. Hash 90dee249fb10d5db FAILED!
STL based    elapsed: 5114 msecs. Hash 90dee249fb10d5db FAILED!
 --- randomized from new seed --- 59 msecs.
OP's code    elapsed: 5276 msecs. Hash 3b90d4869d1c7646 ok
Unfold Only  elapsed: 5145 msecs. Hash 3b90d4869d1c7646 ok
STL correct  elapsed: 5003 msecs. Hash 3b90d4869d1c7646 ok
Bit Fiddle   elapsed: 5106 msecs. Hash 356ba74305eb3382 FAILED!
STL based    elapsed: 5010 msecs. Hash 356ba74305eb3382 FAILED!
 --- randomized from new seed --- 59 msecs.
OP's code    elapsed: 5008 msecs. Hash 6cd427292d79805 ok
Unfold Only  elapsed: 5005 msecs. Hash 6cd427292d79805 ok
STL correct  elapsed: 5005 msecs. Hash 6cd427292d79805 ok
Bit Fiddle   elapsed: 4954 msecs. Hash 950349f3a9ce4486 FAILED!
STL based    elapsed: 4984 msecs. Hash 950349f3a9ce4486 FAILED!
 --- randomized from new seed --- 59 msecs.

@sehe
Copy link
Author

sehe commented Nov 16, 2011

Floating point results

Benchmarking 100 x 1024 iterations for data length 1048576 (i.e. 100 different random seeds)
 --- randomized from new seed --- 12 msecs.
OP's code    elapsed: 2507 msecs. Hash d76092c36e269120 ok
Unfold Only  elapsed: 2737 msecs. Hash d76092c36e269120 ok
STL correct  elapsed: 3606 msecs. Hash d76092c36e269120 ok
STL based    elapsed: 2031 msecs. Hash d76092c36e269120 ok
 --- randomized from new seed --- 8 msecs.
OP's code    elapsed: 2507 msecs. Hash 84563f345e8ac8c1 ok
Unfold Only  elapsed: 2733 msecs. Hash 84563f345e8ac8c1 ok
STL correct  elapsed: 3611 msecs. Hash 84563f345e8ac8c1 ok
STL based    elapsed: 2036 msecs. Hash 84563f345e8ac8c1 ok
 --- randomized from new seed --- 8 msecs.
OP's code    elapsed: 2518 msecs. Hash 3f199e6aa4c42926 ok
Unfold Only  elapsed: 2738 msecs. Hash 3f199e6aa4c42926 ok
STL correct  elapsed: 3606 msecs. Hash 3f199e6aa4c42926 ok
STL based    elapsed: 2032 msecs. Hash 3f199e6aa4c42926 ok
 --- randomized from new seed --- 8 msecs.
OP's code    elapsed: 2510 msecs. Hash 98638e7f8637d05 ok
Unfold Only  elapsed: 2737 msecs. Hash 98638e7f8637d05 ok
STL correct  elapsed: 3608 msecs. Hash 98638e7f8637d05 ok
STL based    elapsed: 2019 msecs. Hash 98638e7f8637d05 ok
 --- randomized from new seed --- 8 msecs.
^C
real    0m44.074s
user    0m44.060s
sys 0m0.010s

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment