public
Last active

  • Download Gist
Makefile
Makefile
1 2 3 4 5 6 7 8 9 10 11 12
all:test heuristic
 
CPPFLAGS+= -std=c++0x
CPPFLAGS+= -Wall -g
CPPFLAGS+= -march=native -mtune=native
CPPFLAGS+= -Ofast
#CPPFLAGS+= -Ofast -ffast-math #-ffast-fp
 
# CPPFLAGS+=-D_GLIBCXX_PARALLEL -fopenmp
 
%:%.cpp
g++ $(CPPFLAGS) $^ -o $@
heuristic.cpp
C++
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263
#define MAGNITUDE 20
#define SAMPLES (1ul<<MAGNITUDE)
#define ITERATIONS 1024
 
#define VERIFICATION 1
#define VERBOSE 0
#define PROFILING 0
 
#define WORSTCASE 1
#define USE_FLOATS 0
 
#if !WORSTCASE
# define LIMITED_RANGE 1 // hide difference in output due to absense of overflows
#endif
 
/////////////////////////////
#if VERIFICATION
# include <boost/functional/hash.hpp>
#endif
#include <cassert>
#include <vector>
#include <algorithm>
#include <numeric>
#include <iterator>
#include <cstdint>
#include <random>
#include <iostream>
#include <chrono>
 
#if USE_FLOATS // UGLY hack, not?
typedef float element_t;
#define uniform_distribution uniform_real_distribution
#define xxx_ float_
#else
typedef int64_t element_t;
#define uniform_distribution uniform_int_distribution
#define xxx_ int_
#endif
 
#if VERBOSE
# include <boost/spirit/include/karma.hpp>
 
namespace
{
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
using namespace boost::spirit::karma;
return os << format("v(" << xxx_ % ",\t" << ")", v);
}
}
#endif
 
namespace original_code // the code from the Original Post
{
void theloop(const element_t in[], element_t out[], size_t N)
{
element_t max = in[0];
out[0] = max;
 
for(uint32_t i = 1; i < N; i++) {
element_t v = in[i];
max += v;
if (v > max) max = v;
out[i] = max;
}
}
 
template <typename C> void invoke(const C& data, C& output)
{
original_code::theloop(&data[0], &output[0], data.size());
}
}
 
namespace heuristic
{
inline element_t calc(element_t previous_max, element_t v)
{
previous_max += v;
if (v > previous_max)
previous_max = v;
return previous_max;
}
 
template <typename InIt, typename OutIt>
OutIt loopify(element_t &max, InIt& b, const InIt e, OutIt o)
{
do *o++ = (max = calc(max, *b++));
while (b!=e && (*b>max || *b<0));
return o;
}
 
template <typename C> void invoke(const C& data, C& output)
{
typedef typename C::const_iterator InIt;
typedef typename C::iterator OutIt;
 
InIt b = data.begin();
InIt e = data.end();
OutIt o = output.begin();
 
element_t max = 0l;
 
/*
*while (b!=e)
* o = loopify(max, b, e, o);
*/
#if PROFILING
size_t fastforward = 0, conventional = 0;
#endif
 
InIt m;
while (b!=e)
{
m = std::find_if(b,e, [](element_t i) { return i<0; });
#if PROFILING
if (b!=m) fastforward++;
#endif
 
//o = std::partial_sum(b,m,o, [&max](element_t prv, element_t v) { return max = calc(prv, v); });
//o = std::transform(b,m,o, [&max](element_t v) { return max = calc(max, v); });
o = std::transform(b,m,o, [&max](element_t v) { return max += v; });
if (m==e) break;
b = m;
m = e; // redundant
#if PROFILING
conventional++;
#endif
o = loopify(max,b,e,o);
}
 
#if PROFILING
if (m==e)
{
std::cerr << "heuristics: processed " << (m-data.begin()) << " out of " << data.size() << " (last region [" << (b-data.begin()) << ",+" << (m-b) << "), " << (e-m) << " remaining)" << std::endl;
std::cerr << "regions: " << conventional << " conventional, " << fastforward << " fastforward" << std::endl;
}
#endif
}
 
}
 
namespace measuring
{
auto ts = std::chrono::system_clock::now();
 
auto stopwatch_split() -> decltype(ts)
{
auto split = ts;
ts = std::chrono::system_clock::now();
return split;
}
 
size_t report(const std::string& msg, bool newline=true)
{
auto split = stopwatch_split();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(ts - split);
 
std::cout << msg << elapsed.count() << " msecs.";
if (newline)
std::cout << std::endl;
 
return elapsed.count();
}
 
template <typename C>
size_t run_benchmark(const C& data, C& output,
const std::string& label,
void(implementation)(const C&,C&),
size_t& reference_hash)
{
fill(output.begin(), output.end(), 0);
for (int i=0; i<ITERATIONS; ++i)
{
implementation(data, output);
}
 
#if VERIFICATION
size_t hash = boost::hash_range(output.begin(), output.end());
 
if (!reference_hash)
{
reference_hash = hash;
}
else
{
#if VERBOSE
if (reference_hash!=hash)
{
std::cerr << label << " hash mismatch! " << std::hex << hash << " (reference implementation: " << reference_hash << ")" << std::dec << std::endl;
std::cerr << "data: " << data << std::endl;
std::cerr << "output: " << output << std::endl;
original_code::invoke(data, output);
std::cerr << "ref output: " << output << std::endl;
}
else
{
//std::cerr << "data: " << data << std::endl;
//std::cerr << "output: " << output << std::endl;
//run_original_code(data, output);
//std::cerr << "ref output: " << output << std::endl;
}
#endif
#endif
}
 
size_t elapsed = report(label + " elapsed: ", false);
std::cout << " Hash " << std::hex << hash << std::dec << (hash==reference_hash? " ok" : " FAILED!") << std::endl;
return elapsed;
}
}
 
int main()
{
typedef std::vector<element_t> data_t;
data_t data(SAMPLES);
data_t output(data.size());
 
size_t total_op = 0ul,
total_heuristic = 0ul;
 
std::cout << "Benchmarking 100 x " << ITERATIONS << " iterations for data length " << data.size() << " (i.e. 100 different random seeds)" << std::endl;
for (size_t seed = 0; seed < 10; seed++)
{
std::mt19937 rng(seed);
 
std::uniform_distribution<element_t> gen; // uniform, unbiased
std::uniform_distribution<element_t> biased_worstcase(100,1115);
#if USE_FLOATS
std::uniform_distribution<element_t> prevent_overflow; // uniform, unbiased
std::uniform_distribution<element_t> negative_only(std::numeric_limits<element_t>::min(), 0);
std::uniform_distribution<element_t> positive_only(0, std::numeric_limits<int64_t>::max());
#else
std::uniform_distribution<element_t> prevent_overflow(
std::numeric_limits<element_t>::min()/SAMPLES,
std::numeric_limits<element_t>::max()/SAMPLES);
std::uniform_distribution<element_t> negative_only(std::numeric_limits<element_t>::min()/SAMPLES, 0);
std::uniform_distribution<element_t> positive_only(0, std::numeric_limits<element_t>::max()/SAMPLES);
#endif
 
#ifdef WORSTCASE
std::generate(data.begin(), data.end(), std::bind(positive_only, rng));
for (size_t i=0; i<SAMPLES; i += 2)
data[i] = -data[i];
#else
# if LIMITED_RANGE
std::generate(data.begin(), data.end(), std::bind(prevent_overflow, rng));
# else
std::generate(data.begin(), data.end(), std::bind(gen, rng));
# endif
#endif
measuring::report(" --- randomized from new seed --- ");
 
size_t reference_hash = 0ul;
total_op += measuring::run_benchmark(data, output, "OP's code ", original_code::invoke<data_t>, reference_hash);
total_heuristic += measuring::run_benchmark(data, output, "Heuristic code ", heuristic::invoke<data_t>, reference_hash);
}
 
std::cout << "total time in OP algorithm: " << total_op << " msecs" << std::endl;
std::cout << "total time in Heuristic algorithm: " << total_heuristic << " msecs" << std::endl;
std::cout << "speedup factor: " << (1.0*total_op)/(1.0*total_heuristic) << " x" << std::endl;
}
test.cpp
C++
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
#define MAGNITUDE 20
#define ITERATIONS 1024
#define VERIFICATION 1
#define VERBOSE 0
 
#define LIMITED_RANGE 0 // hide difference in output due to absense of overflows
#define USE_FLOATS 0
 
/////////////////////////////
#if VERIFICATION
# include <boost/functional/hash.hpp>
#endif
#include <cassert>
#include <vector>
#include <algorithm>
#include <numeric>
#include <iterator>
#include <cstdint>
#include <random>
#include <iostream>
#include <chrono>
 
#if USE_FLOATS // UGLY hack, not?
#define int64_t float
#define uniform_int_distribution uniform_real_distribution
#endif
 
#if VERBOSE
# include <boost/spirit/include/karma.hpp>
 
namespace
{
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
using namespace boost::spirit::karma;
return os << format("v(" << auto_ % ", " << ")", v);
}
}
#endif
 
namespace stackoverflow // the code from the Original Post
{
void theloop(const int64_t in[], int64_t out[], size_t N)
{
int64_t max = in[0];
out[0] = max;
 
for(uint32_t i = 1; i < N; i++) {
int64_t v = in[i];
max += v;
if (v > max) max = v;
out[i] = max;
}
}
 
void unrolled_only(const int64_t in[], int64_t out[], size_t N)
{
int64_t max = 0;
 
assert(N % 4 == 0);
for(size_t i = 0; i < N; i+=4) {
int64_t t_in0 = in[i+0];
int64_t t_in1 = in[i+1];
int64_t t_in2 = in[i+2];
int64_t t_in3 = in[i+3];
 
max += t_in0;
if (t_in0 > max) max = t_in0;
out[i+0] = max;
 
max += t_in1;
if (t_in1 > max) max = t_in1;
out[i+1] = max;
 
max += t_in2;
if (t_in2 > max) max = t_in2;
out[i+2] = max;
 
max += t_in3;
if (t_in3 > max) max = t_in3;
out[i+3] = max;
}
}
 
void bitfiddle_hack(const int64_t in[], int64_t out[], size_t N)
{
#if !USE_FLOATS
int64_t max = 0;
 
assert(N % 4 == 0);
for(size_t i = 0; i < N; i++) {
int64_t v = in[i+0];
 
max &= -max >> (sizeof(int64_t)/sizeof(char)*8 - 1);
max += v;
out[i] = max;
}
#endif
}
}
 
template <typename C> void run_original_code(const C& data, C& output)
{
stackoverflow::theloop(&data[0], &output[0], data.size());
}
 
template <typename C> void unrolled_only(const C& data, C& output)
{
stackoverflow::unrolled_only(&data[0], &output[0], data.size());
}
 
template <typename C> void bitfiddle_hack(const C& data, C& output)
{
stackoverflow::bitfiddle_hack(&data[0], &output[0], data.size());
}
 
template <typename C> void partial_sum_correct(const C& data, C& output)
{
int64_t wrongmax = 0;
 
std::partial_sum(data.begin(), data.end(), output.begin(),
[&wrongmax](int64_t max, int64_t v) -> int64_t
{
max += v;
if (v > max) max = v;
 
wrongmax = wrongmax<0 ? v : wrongmax + v;
 
return max;
});
 
//int64_t offset = output.back() - wrongmax;
//std::cerr << "OFFSET is " << offset << std::endl;
}
 
template <typename C> void partial_sum_incorrect(const C& data, C& output)
{
std::partial_sum(data.begin(), data.end(), output.begin(),
[](int64_t max, int64_t v)
{
return max<0 ? v : max + v;
});
}
 
namespace measuring
{
auto ts = std::chrono::system_clock::now();
 
auto stopwatch_split() -> decltype(ts)
{
auto split = ts;
ts = std::chrono::system_clock::now();
return split;
}
 
void report(const std::string& msg, bool newline=true)
{
auto split = stopwatch_split();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(ts - split);
 
std::cout << msg << elapsed.count() << " msecs.";
if (newline)
std::cout << std::endl;
}
 
template <typename C>
void run_benchmark(const C& data, C& output,
const std::string& label,
void(implementation)(const C&,C&),
size_t& reference_hash)
{
for (int i=0; i<ITERATIONS; ++i)
{
implementation(data, output);
}
 
#if VERIFICATION
size_t hash = boost::hash_range(output.begin(), output.end());
 
if (reference_hash && (reference_hash!=hash))
{
#if VERBOSE
std::cerr << label << " hash mismatch! " << std::hex << hash << " (reference implementation: " << reference_hash << ")" << std::dec << std::endl;
std::cerr << "data: " << data << std::endl;
std::cerr << "output: " << output << std::endl;
run_original_code(data, output);
std::cerr << "ref output: " << output << std::endl;
#endif
}
else
{
reference_hash = hash;
}
#endif
 
report(label + " elapsed: ", false);
std::cout << " Hash " << std::hex << hash << std::dec << (hash==reference_hash? " ok" : " FAILED!") << std::endl;
}
}
 
int main()
{
typedef std::vector<int64_t> data_t;
data_t data(1ul << MAGNITUDE);
data_t output(data.size());
 
std::cout << "Benchmarking 100 x " << ITERATIONS << " iterations for data length " << data.size() << " (i.e. 100 different random seeds)" << std::endl;
for (size_t seed = 0; seed < 100; seed++)
{
std::mt19937 rng(seed);
#if LIMITED_RANGE
std::uniform_int_distribution<int64_t> gen(
std::numeric_limits<int64_t>::min() >> MAGNITUDE,
std::numeric_limits<int64_t>::max() >> MAGNITUDE);
std::generate(data.begin(), data.end(), std::bind(gen, rng));
#else
std::uniform_int_distribution<int64_t> gen;
std::generate(data.begin(), data.end(), std::bind(gen, rng));
#endif
measuring::report(" --- randomized from new seed --- ");
 
size_t reference_hash = 0ul;
measuring::run_benchmark(data, output, "OP's code ", run_original_code <data_t> , reference_hash);
 
// these handle overflow exactly like the original algorithm:
measuring::run_benchmark(data, output, "Unfold Only ", unrolled_only<data_t> , reference_hash);
measuring::run_benchmark(data, output, "STL correct ", partial_sum_correct<data_t> , reference_hash);
 
// these handle overflow differently from the original algorithm:
#if !USE_FLOATS
measuring::run_benchmark(data, output, "Bit Fiddle ", bitfiddle_hack<data_t> , reference_hash);
#endif
measuring::run_benchmark(data, output, "STL based ", partial_sum_incorrect <data_t>, reference_hash);
}
}
g++ -std=c++0x -Wall -g -march=native -mtune=native -O3 test.cpp -o test

Ouput fragment

Benchmarking 100 x 1024 iterations for data length 1048576 (i.e. 100 different random seeds)
 --- randomized from new seed --- 68 msecs.
OP's code    elapsed: 4972 msecs. Hash 72e2f1861319fb15 ok
Unfold Only  elapsed: 4962 msecs. Hash 72e2f1861319fb15 ok
STL correct  elapsed: 4990 msecs. Hash 72e2f1861319fb15 ok
Bit Fiddle   elapsed: 4961 msecs. Hash dda2e64a12de5d15 FAILED!
STL based    elapsed: 4953 msecs. Hash dda2e64a12de5d15 FAILED!
 --- randomized from new seed --- 59 msecs.
OP's code    elapsed: 5044 msecs. Hash 182b28b5ebd2f4a0 ok
Unfold Only  elapsed: 5099 msecs. Hash 182b28b5ebd2f4a0 ok
STL correct  elapsed: 5155 msecs. Hash 182b28b5ebd2f4a0 ok
Bit Fiddle   elapsed: 5303 msecs. Hash ceec63e494f99544 FAILED!
STL based    elapsed: 5331 msecs. Hash ceec63e494f99544 FAILED!
 --- randomized from new seed --- 59 msecs.
OP's code    elapsed: 5071 msecs. Hash dfa2bd5b64b9d54f ok
Unfold Only  elapsed: 5259 msecs. Hash dfa2bd5b64b9d54f ok
STL correct  elapsed: 5059 msecs. Hash dfa2bd5b64b9d54f ok
Bit Fiddle   elapsed: 5199 msecs. Hash 90dee249fb10d5db FAILED!
STL based    elapsed: 5114 msecs. Hash 90dee249fb10d5db FAILED!
 --- randomized from new seed --- 59 msecs.
OP's code    elapsed: 5276 msecs. Hash 3b90d4869d1c7646 ok
Unfold Only  elapsed: 5145 msecs. Hash 3b90d4869d1c7646 ok
STL correct  elapsed: 5003 msecs. Hash 3b90d4869d1c7646 ok
Bit Fiddle   elapsed: 5106 msecs. Hash 356ba74305eb3382 FAILED!
STL based    elapsed: 5010 msecs. Hash 356ba74305eb3382 FAILED!
 --- randomized from new seed --- 59 msecs.
OP's code    elapsed: 5008 msecs. Hash 6cd427292d79805 ok
Unfold Only  elapsed: 5005 msecs. Hash 6cd427292d79805 ok
STL correct  elapsed: 5005 msecs. Hash 6cd427292d79805 ok
Bit Fiddle   elapsed: 4954 msecs. Hash 950349f3a9ce4486 FAILED!
STL based    elapsed: 4984 msecs. Hash 950349f3a9ce4486 FAILED!
 --- randomized from new seed --- 59 msecs.

Floating point results

Benchmarking 100 x 1024 iterations for data length 1048576 (i.e. 100 different random seeds)
 --- randomized from new seed --- 12 msecs.
OP's code    elapsed: 2507 msecs. Hash d76092c36e269120 ok
Unfold Only  elapsed: 2737 msecs. Hash d76092c36e269120 ok
STL correct  elapsed: 3606 msecs. Hash d76092c36e269120 ok
STL based    elapsed: 2031 msecs. Hash d76092c36e269120 ok
 --- randomized from new seed --- 8 msecs.
OP's code    elapsed: 2507 msecs. Hash 84563f345e8ac8c1 ok
Unfold Only  elapsed: 2733 msecs. Hash 84563f345e8ac8c1 ok
STL correct  elapsed: 3611 msecs. Hash 84563f345e8ac8c1 ok
STL based    elapsed: 2036 msecs. Hash 84563f345e8ac8c1 ok
 --- randomized from new seed --- 8 msecs.
OP's code    elapsed: 2518 msecs. Hash 3f199e6aa4c42926 ok
Unfold Only  elapsed: 2738 msecs. Hash 3f199e6aa4c42926 ok
STL correct  elapsed: 3606 msecs. Hash 3f199e6aa4c42926 ok
STL based    elapsed: 2032 msecs. Hash 3f199e6aa4c42926 ok
 --- randomized from new seed --- 8 msecs.
OP's code    elapsed: 2510 msecs. Hash 98638e7f8637d05 ok
Unfold Only  elapsed: 2737 msecs. Hash 98638e7f8637d05 ok
STL correct  elapsed: 3608 msecs. Hash 98638e7f8637d05 ok
STL based    elapsed: 2019 msecs. Hash 98638e7f8637d05 ok
 --- randomized from new seed --- 8 msecs.
^C
real    0m44.074s
user    0m44.060s
sys 0m0.010s

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.