Created
November 16, 2011 01:35
-
-
Save sehe/1368992 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define MAGNITUDE 20 | |
#define SAMPLES (1ul<<MAGNITUDE) | |
#define ITERATIONS 1024 | |
#define VERIFICATION 1 | |
#define VERBOSE 0 | |
#define PROFILING 0 | |
#define WORSTCASE 1 | |
#define USE_FLOATS 0 | |
#if !WORSTCASE | |
# define LIMITED_RANGE 1 // hide difference in output due to absense of overflows | |
#endif | |
///////////////////////////// | |
#if VERIFICATION | |
# include <boost/functional/hash.hpp> | |
#endif | |
#include <cassert> | |
#include <vector> | |
#include <algorithm> | |
#include <numeric> | |
#include <iterator> | |
#include <cstdint> | |
#include <random> | |
#include <iostream> | |
#include <chrono> | |
#if USE_FLOATS // UGLY hack, not? | |
typedef float element_t; | |
#define uniform_distribution uniform_real_distribution | |
#define xxx_ float_ | |
#else | |
typedef int64_t element_t; | |
#define uniform_distribution uniform_int_distribution | |
#define xxx_ int_ | |
#endif | |
#if VERBOSE | |
# include <boost/spirit/include/karma.hpp> | |
namespace | |
{ | |
template <typename T> | |
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v) | |
{ | |
using namespace boost::spirit::karma; | |
return os << format("v(" << xxx_ % ",\t" << ")", v); | |
} | |
} | |
#endif | |
namespace original_code // the code from the Original Post | |
{ | |
void theloop(const element_t in[], element_t out[], size_t N) | |
{ | |
element_t max = in[0]; | |
out[0] = max; | |
for(uint32_t i = 1; i < N; i++) { | |
element_t v = in[i]; | |
max += v; | |
if (v > max) max = v; | |
out[i] = max; | |
} | |
} | |
template <typename C> void invoke(const C& data, C& output) | |
{ | |
original_code::theloop(&data[0], &output[0], data.size()); | |
} | |
} | |
namespace heuristic | |
{ | |
inline element_t calc(element_t previous_max, element_t v) | |
{ | |
previous_max += v; | |
if (v > previous_max) | |
previous_max = v; | |
return previous_max; | |
} | |
template <typename InIt, typename OutIt> | |
OutIt loopify(element_t &max, InIt& b, const InIt e, OutIt o) | |
{ | |
do *o++ = (max = calc(max, *b++)); | |
while (b!=e && (*b>max || *b<0)); | |
return o; | |
} | |
template <typename C> void invoke(const C& data, C& output) | |
{ | |
typedef typename C::const_iterator InIt; | |
typedef typename C::iterator OutIt; | |
InIt b = data.begin(); | |
InIt e = data.end(); | |
OutIt o = output.begin(); | |
element_t max = 0l; | |
/* | |
*while (b!=e) | |
* o = loopify(max, b, e, o); | |
*/ | |
#if PROFILING | |
size_t fastforward = 0, conventional = 0; | |
#endif | |
InIt m; | |
while (b!=e) | |
{ | |
m = std::find_if(b,e, [](element_t i) { return i<0; }); | |
#if PROFILING | |
if (b!=m) fastforward++; | |
#endif | |
//o = std::partial_sum(b,m,o, [&max](element_t prv, element_t v) { return max = calc(prv, v); }); | |
//o = std::transform(b,m,o, [&max](element_t v) { return max = calc(max, v); }); | |
o = std::transform(b,m,o, [&max](element_t v) { return max += v; }); | |
if (m==e) break; | |
b = m; | |
m = e; // redundant | |
#if PROFILING | |
conventional++; | |
#endif | |
o = loopify(max,b,e,o); | |
} | |
#if PROFILING | |
if (m==e) | |
{ | |
std::cerr << "heuristics: processed " << (m-data.begin()) << " out of " << data.size() << " (last region [" << (b-data.begin()) << ",+" << (m-b) << "), " << (e-m) << " remaining)" << std::endl; | |
std::cerr << "regions: " << conventional << " conventional, " << fastforward << " fastforward" << std::endl; | |
} | |
#endif | |
} | |
} | |
namespace measuring | |
{ | |
auto ts = std::chrono::system_clock::now(); | |
auto stopwatch_split() -> decltype(ts) | |
{ | |
auto split = ts; | |
ts = std::chrono::system_clock::now(); | |
return split; | |
} | |
size_t report(const std::string& msg, bool newline=true) | |
{ | |
auto split = stopwatch_split(); | |
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(ts - split); | |
std::cout << msg << elapsed.count() << " msecs."; | |
if (newline) | |
std::cout << std::endl; | |
return elapsed.count(); | |
} | |
template <typename C> | |
size_t run_benchmark(const C& data, C& output, | |
const std::string& label, | |
void(implementation)(const C&,C&), | |
size_t& reference_hash) | |
{ | |
fill(output.begin(), output.end(), 0); | |
for (int i=0; i<ITERATIONS; ++i) | |
{ | |
implementation(data, output); | |
} | |
#if VERIFICATION | |
size_t hash = boost::hash_range(output.begin(), output.end()); | |
if (!reference_hash) | |
{ | |
reference_hash = hash; | |
} | |
else | |
{ | |
#if VERBOSE | |
if (reference_hash!=hash) | |
{ | |
std::cerr << label << " hash mismatch! " << std::hex << hash << " (reference implementation: " << reference_hash << ")" << std::dec << std::endl; | |
std::cerr << "data: " << data << std::endl; | |
std::cerr << "output: " << output << std::endl; | |
original_code::invoke(data, output); | |
std::cerr << "ref output: " << output << std::endl; | |
} | |
else | |
{ | |
//std::cerr << "data: " << data << std::endl; | |
//std::cerr << "output: " << output << std::endl; | |
//run_original_code(data, output); | |
//std::cerr << "ref output: " << output << std::endl; | |
} | |
#endif | |
#endif | |
} | |
size_t elapsed = report(label + " elapsed: ", false); | |
std::cout << " Hash " << std::hex << hash << std::dec << (hash==reference_hash? " ok" : " FAILED!") << std::endl; | |
return elapsed; | |
} | |
} | |
int main() | |
{ | |
typedef std::vector<element_t> data_t; | |
data_t data(SAMPLES); | |
data_t output(data.size()); | |
size_t total_op = 0ul, | |
total_heuristic = 0ul; | |
std::cout << "Benchmarking 100 x " << ITERATIONS << " iterations for data length " << data.size() << " (i.e. 100 different random seeds)" << std::endl; | |
for (size_t seed = 0; seed < 10; seed++) | |
{ | |
std::mt19937 rng(seed); | |
std::uniform_distribution<element_t> gen; // uniform, unbiased | |
std::uniform_distribution<element_t> biased_worstcase(100,1115); | |
#if USE_FLOATS | |
std::uniform_distribution<element_t> prevent_overflow; // uniform, unbiased | |
std::uniform_distribution<element_t> negative_only(std::numeric_limits<element_t>::min(), 0); | |
std::uniform_distribution<element_t> positive_only(0, std::numeric_limits<int64_t>::max()); | |
#else | |
std::uniform_distribution<element_t> prevent_overflow( | |
std::numeric_limits<element_t>::min()/SAMPLES, | |
std::numeric_limits<element_t>::max()/SAMPLES); | |
std::uniform_distribution<element_t> negative_only(std::numeric_limits<element_t>::min()/SAMPLES, 0); | |
std::uniform_distribution<element_t> positive_only(0, std::numeric_limits<element_t>::max()/SAMPLES); | |
#endif | |
#ifdef WORSTCASE | |
std::generate(data.begin(), data.end(), std::bind(positive_only, rng)); | |
for (size_t i=0; i<SAMPLES; i += 2) | |
data[i] = -data[i]; | |
#else | |
# if LIMITED_RANGE | |
std::generate(data.begin(), data.end(), std::bind(prevent_overflow, rng)); | |
# else | |
std::generate(data.begin(), data.end(), std::bind(gen, rng)); | |
# endif | |
#endif | |
measuring::report(" --- randomized from new seed --- "); | |
size_t reference_hash = 0ul; | |
total_op += measuring::run_benchmark(data, output, "OP's code ", original_code::invoke<data_t>, reference_hash); | |
total_heuristic += measuring::run_benchmark(data, output, "Heuristic code ", heuristic::invoke<data_t>, reference_hash); | |
} | |
std::cout << "total time in OP algorithm: " << total_op << " msecs" << std::endl; | |
std::cout << "total time in Heuristic algorithm: " << total_heuristic << " msecs" << std::endl; | |
std::cout << "speedup factor: " << (1.0*total_op)/(1.0*total_heuristic) << " x" << std::endl; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
all:test heuristic | |
CPPFLAGS+= -std=c++0x | |
CPPFLAGS+= -Wall -g | |
CPPFLAGS+= -march=native -mtune=native | |
CPPFLAGS+= -Ofast | |
#CPPFLAGS+= -Ofast -ffast-math #-ffast-fp | |
# CPPFLAGS+=-D_GLIBCXX_PARALLEL -fopenmp | |
%:%.cpp | |
g++ $(CPPFLAGS) $^ -o $@ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define MAGNITUDE 20 | |
#define ITERATIONS 1024 | |
#define VERIFICATION 1 | |
#define VERBOSE 0 | |
#define LIMITED_RANGE 0 // hide difference in output due to absense of overflows | |
#define USE_FLOATS 0 | |
///////////////////////////// | |
#if VERIFICATION | |
# include <boost/functional/hash.hpp> | |
#endif | |
#include <cassert> | |
#include <vector> | |
#include <algorithm> | |
#include <numeric> | |
#include <iterator> | |
#include <cstdint> | |
#include <random> | |
#include <iostream> | |
#include <chrono> | |
#if USE_FLOATS // UGLY hack, not? | |
#define int64_t float | |
#define uniform_int_distribution uniform_real_distribution | |
#endif | |
#if VERBOSE | |
# include <boost/spirit/include/karma.hpp> | |
namespace | |
{ | |
template <typename T> | |
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v) | |
{ | |
using namespace boost::spirit::karma; | |
return os << format("v(" << auto_ % ", " << ")", v); | |
} | |
} | |
#endif | |
namespace stackoverflow // the code from the Original Post | |
{ | |
void theloop(const int64_t in[], int64_t out[], size_t N) | |
{ | |
int64_t max = in[0]; | |
out[0] = max; | |
for(uint32_t i = 1; i < N; i++) { | |
int64_t v = in[i]; | |
max += v; | |
if (v > max) max = v; | |
out[i] = max; | |
} | |
} | |
void unrolled_only(const int64_t in[], int64_t out[], size_t N) | |
{ | |
int64_t max = 0; | |
assert(N % 4 == 0); | |
for(size_t i = 0; i < N; i+=4) { | |
int64_t t_in0 = in[i+0]; | |
int64_t t_in1 = in[i+1]; | |
int64_t t_in2 = in[i+2]; | |
int64_t t_in3 = in[i+3]; | |
max += t_in0; | |
if (t_in0 > max) max = t_in0; | |
out[i+0] = max; | |
max += t_in1; | |
if (t_in1 > max) max = t_in1; | |
out[i+1] = max; | |
max += t_in2; | |
if (t_in2 > max) max = t_in2; | |
out[i+2] = max; | |
max += t_in3; | |
if (t_in3 > max) max = t_in3; | |
out[i+3] = max; | |
} | |
} | |
void bitfiddle_hack(const int64_t in[], int64_t out[], size_t N) | |
{ | |
#if !USE_FLOATS | |
int64_t max = 0; | |
assert(N % 4 == 0); | |
for(size_t i = 0; i < N; i++) { | |
int64_t v = in[i+0]; | |
max &= -max >> (sizeof(int64_t)/sizeof(char)*8 - 1); | |
max += v; | |
out[i] = max; | |
} | |
#endif | |
} | |
} | |
template <typename C> void run_original_code(const C& data, C& output) | |
{ | |
stackoverflow::theloop(&data[0], &output[0], data.size()); | |
} | |
template <typename C> void unrolled_only(const C& data, C& output) | |
{ | |
stackoverflow::unrolled_only(&data[0], &output[0], data.size()); | |
} | |
template <typename C> void bitfiddle_hack(const C& data, C& output) | |
{ | |
stackoverflow::bitfiddle_hack(&data[0], &output[0], data.size()); | |
} | |
template <typename C> void partial_sum_correct(const C& data, C& output) | |
{ | |
int64_t wrongmax = 0; | |
std::partial_sum(data.begin(), data.end(), output.begin(), | |
[&wrongmax](int64_t max, int64_t v) -> int64_t | |
{ | |
max += v; | |
if (v > max) max = v; | |
wrongmax = wrongmax<0 ? v : wrongmax + v; | |
return max; | |
}); | |
//int64_t offset = output.back() - wrongmax; | |
//std::cerr << "OFFSET is " << offset << std::endl; | |
} | |
template <typename C> void partial_sum_incorrect(const C& data, C& output) | |
{ | |
std::partial_sum(data.begin(), data.end(), output.begin(), | |
[](int64_t max, int64_t v) | |
{ | |
return max<0 ? v : max + v; | |
}); | |
} | |
namespace measuring | |
{ | |
auto ts = std::chrono::system_clock::now(); | |
auto stopwatch_split() -> decltype(ts) | |
{ | |
auto split = ts; | |
ts = std::chrono::system_clock::now(); | |
return split; | |
} | |
void report(const std::string& msg, bool newline=true) | |
{ | |
auto split = stopwatch_split(); | |
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(ts - split); | |
std::cout << msg << elapsed.count() << " msecs."; | |
if (newline) | |
std::cout << std::endl; | |
} | |
template <typename C> | |
void run_benchmark(const C& data, C& output, | |
const std::string& label, | |
void(implementation)(const C&,C&), | |
size_t& reference_hash) | |
{ | |
for (int i=0; i<ITERATIONS; ++i) | |
{ | |
implementation(data, output); | |
} | |
#if VERIFICATION | |
size_t hash = boost::hash_range(output.begin(), output.end()); | |
if (reference_hash && (reference_hash!=hash)) | |
{ | |
#if VERBOSE | |
std::cerr << label << " hash mismatch! " << std::hex << hash << " (reference implementation: " << reference_hash << ")" << std::dec << std::endl; | |
std::cerr << "data: " << data << std::endl; | |
std::cerr << "output: " << output << std::endl; | |
run_original_code(data, output); | |
std::cerr << "ref output: " << output << std::endl; | |
#endif | |
} | |
else | |
{ | |
reference_hash = hash; | |
} | |
#endif | |
report(label + " elapsed: ", false); | |
std::cout << " Hash " << std::hex << hash << std::dec << (hash==reference_hash? " ok" : " FAILED!") << std::endl; | |
} | |
} | |
int main() | |
{ | |
typedef std::vector<int64_t> data_t; | |
data_t data(1ul << MAGNITUDE); | |
data_t output(data.size()); | |
std::cout << "Benchmarking 100 x " << ITERATIONS << " iterations for data length " << data.size() << " (i.e. 100 different random seeds)" << std::endl; | |
for (size_t seed = 0; seed < 100; seed++) | |
{ | |
std::mt19937 rng(seed); | |
#if LIMITED_RANGE | |
std::uniform_int_distribution<int64_t> gen( | |
std::numeric_limits<int64_t>::min() >> MAGNITUDE, | |
std::numeric_limits<int64_t>::max() >> MAGNITUDE); | |
std::generate(data.begin(), data.end(), std::bind(gen, rng)); | |
#else | |
std::uniform_int_distribution<int64_t> gen; | |
std::generate(data.begin(), data.end(), std::bind(gen, rng)); | |
#endif | |
measuring::report(" --- randomized from new seed --- "); | |
size_t reference_hash = 0ul; | |
measuring::run_benchmark(data, output, "OP's code ", run_original_code <data_t> , reference_hash); | |
// these handle overflow exactly like the original algorithm: | |
measuring::run_benchmark(data, output, "Unfold Only ", unrolled_only<data_t> , reference_hash); | |
measuring::run_benchmark(data, output, "STL correct ", partial_sum_correct<data_t> , reference_hash); | |
// these handle overflow differently from the original algorithm: | |
#if !USE_FLOATS | |
measuring::run_benchmark(data, output, "Bit Fiddle ", bitfiddle_hack<data_t> , reference_hash); | |
#endif | |
measuring::run_benchmark(data, output, "STL based ", partial_sum_incorrect <data_t>, reference_hash); | |
} | |
} |
Floating point results
Benchmarking 100 x 1024 iterations for data length 1048576 (i.e. 100 different random seeds)
--- randomized from new seed --- 12 msecs.
OP's code elapsed: 2507 msecs. Hash d76092c36e269120 ok
Unfold Only elapsed: 2737 msecs. Hash d76092c36e269120 ok
STL correct elapsed: 3606 msecs. Hash d76092c36e269120 ok
STL based elapsed: 2031 msecs. Hash d76092c36e269120 ok
--- randomized from new seed --- 8 msecs.
OP's code elapsed: 2507 msecs. Hash 84563f345e8ac8c1 ok
Unfold Only elapsed: 2733 msecs. Hash 84563f345e8ac8c1 ok
STL correct elapsed: 3611 msecs. Hash 84563f345e8ac8c1 ok
STL based elapsed: 2036 msecs. Hash 84563f345e8ac8c1 ok
--- randomized from new seed --- 8 msecs.
OP's code elapsed: 2518 msecs. Hash 3f199e6aa4c42926 ok
Unfold Only elapsed: 2738 msecs. Hash 3f199e6aa4c42926 ok
STL correct elapsed: 3606 msecs. Hash 3f199e6aa4c42926 ok
STL based elapsed: 2032 msecs. Hash 3f199e6aa4c42926 ok
--- randomized from new seed --- 8 msecs.
OP's code elapsed: 2510 msecs. Hash 98638e7f8637d05 ok
Unfold Only elapsed: 2737 msecs. Hash 98638e7f8637d05 ok
STL correct elapsed: 3608 msecs. Hash 98638e7f8637d05 ok
STL based elapsed: 2019 msecs. Hash 98638e7f8637d05 ok
--- randomized from new seed --- 8 msecs.
^C
real 0m44.074s
user 0m44.060s
sys 0m0.010s
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Ouput fragment