sehe/Makefile

## heuristic.cpp
#define MAGNITUDE     20
#define SAMPLES       (1ul<<MAGNITUDE)
#define ITERATIONS    1024

#define VERIFICATION  1
#define VERBOSE       0
#define PROFILING     0

#define WORSTCASE     1
#define USE_FLOATS    0

#if !WORSTCASE
#   define LIMITED_RANGE 1    // hide difference in output due to absense of overflows
#endif

/////////////////////////////
#if VERIFICATION
#   include <boost/functional/hash.hpp>
#endif
#include <cassert>
#include <vector>
#include <algorithm>
#include <numeric>
#include <iterator>
#include <cstdint>
#include <random>
#include <iostream>
#include <chrono>

#if USE_FLOATS // UGLY hack, not?
    typedef float element_t;
    #define uniform_distribution uniform_real_distribution
    #define xxx_ float_
#else
    typedef int64_t element_t;
    #define uniform_distribution uniform_int_distribution
    #define xxx_ int_
#endif

#if VERBOSE
#   include <boost/spirit/include/karma.hpp>

namespace
{
    template <typename T>
        std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
    {
        using namespace boost::spirit::karma;
        return os << format("v(" << xxx_ % ",\t" << ")", v);
    }
}
#endif

namespace original_code // the code from the Original Post
{
    void theloop(const element_t in[], element_t out[], size_t N)
    {
        element_t max = in[0];
        out[0] = max;

        for(uint32_t i = 1; i < N; i++) {
            element_t v = in[i];
            max += v;
            if (v > max) max = v;
            out[i] = max;
        }
    }

    template <typename C> void invoke(const C& data, C& output)
    {
        original_code::theloop(&data[0], &output[0], data.size());
    }
}

namespace heuristic
{
    inline element_t calc(element_t previous_max, element_t v)
    {
        previous_max += v;
        if (v > previous_max)
            previous_max = v;
        return previous_max;
    }

    template <typename InIt, typename OutIt>
    OutIt loopify(element_t &max, InIt& b, const InIt e, OutIt o)
    {
        do    *o++ = (max = calc(max, *b++));
        while (b!=e && (*b>max || *b<0));
        return o;
    }

    template <typename C> void invoke(const C& data, C& output)
    {
        typedef typename C::const_iterator InIt;
        typedef typename C::iterator OutIt;

        InIt  b = data.begin();
        InIt  e = data.end();
        OutIt o = output.begin();

        element_t max = 0l;

        /*
         *while (b!=e)
         *    o = loopify(max, b, e, o);
         */
#if PROFILING
        size_t fastforward = 0, conventional = 0;
#endif

        InIt m;
        while (b!=e)
        {
            m = std::find_if(b,e, [](element_t i) { return i<0; });
#if PROFILING
            if (b!=m) fastforward++;
#endif

          //o = std::partial_sum(b,m,o, [&max](element_t prv, element_t v) { return max = calc(prv, v); });
          //o = std::transform(b,m,o, [&max](element_t v) { return max = calc(max, v); });
            o = std::transform(b,m,o, [&max](element_t v) { return max += v; });
            if (m==e) break;
            b = m;
            m = e; // redundant
#if PROFILING
            conventional++;
#endif
            o = loopify(max,b,e,o);
        }

#if PROFILING
        if (m==e)
        {
            std::cerr << "heuristics: processed " << (m-data.begin()) << " out of " << data.size() << " (last region [" << (b-data.begin()) << ",+" << (m-b) << "), " << (e-m) << " remaining)" << std::endl;
            std::cerr << "regions:  " << conventional << " conventional, " << fastforward << " fastforward" << std::endl;
        }
#endif
    }

}

namespace measuring
{
    auto ts = std::chrono::system_clock::now();

    auto stopwatch_split() -> decltype(ts)
    {
        auto split = ts;
        ts = std::chrono::system_clock::now();
        return split;
    }

    size_t report(const std::string& msg, bool newline=true)
    {
        auto split = stopwatch_split();
        auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(ts - split);

        std::cout << msg << elapsed.count() << " msecs.";
        if (newline)
            std::cout << std::endl;

        return elapsed.count();
    }

    template <typename C>
    size_t run_benchmark(const C& data, C& output,
            const std::string& label,
            void(implementation)(const C&,C&),
            size_t& reference_hash)
    {
        fill(output.begin(), output.end(), 0);
        for (int i=0; i<ITERATIONS; ++i)
        {
            implementation(data, output);
        }

#if VERIFICATION
        size_t hash = boost::hash_range(output.begin(), output.end());

        if (!reference_hash)
        {
            reference_hash = hash;
        }
        else
        {
#if VERBOSE
            if (reference_hash!=hash)
            {
                std::cerr << label << " hash mismatch! " << std::hex << hash << " (reference implementation: " << reference_hash << ")" << std::dec << std::endl;
                std::cerr << "data:       " << data << std::endl;
                std::cerr << "output:     " << output << std::endl;
                original_code::invoke(data, output);
                std::cerr << "ref output: " << output << std::endl;
            }
            else
            {
                //std::cerr << "data:       " << data << std::endl;
                //std::cerr << "output:     " << output << std::endl;
                //run_original_code(data, output);
                //std::cerr << "ref output: " << output << std::endl;
            }
#endif
#endif
        }

        size_t elapsed = report(label + " elapsed: ", false);
        std::cout << " Hash " << std::hex << hash << std::dec << (hash==reference_hash? " ok" : " FAILED!") << std::endl;
        return elapsed;
    }

}

int main()
{
    typedef std::vector<element_t> data_t;
    data_t data(SAMPLES);
    data_t output(data.size());

    size_t total_op = 0ul,
           total_heuristic = 0ul;

    std::cout << "Benchmarking 100 x " << ITERATIONS << " iterations for data length " << data.size() << " (i.e. 100 different random seeds)" << std::endl;
    for (size_t seed = 0; seed < 10; seed++)
    {
        std::mt19937 rng(seed);

        std::uniform_distribution<element_t> gen; // uniform, unbiased
        std::uniform_distribution<element_t> biased_worstcase(100,1115);
#if USE_FLOATS
        std::uniform_distribution<element_t> prevent_overflow; // uniform, unbiased
        std::uniform_distribution<element_t> negative_only(std::numeric_limits<element_t>::min(), 0);
        std::uniform_distribution<element_t> positive_only(0, std::numeric_limits<int64_t>::max());
#else
        std::uniform_distribution<element_t> prevent_overflow(
                std::numeric_limits<element_t>::min()/SAMPLES,
                std::numeric_limits<element_t>::max()/SAMPLES);
        std::uniform_distribution<element_t> negative_only(std::numeric_limits<element_t>::min()/SAMPLES, 0);
        std::uniform_distribution<element_t> positive_only(0, std::numeric_limits<element_t>::max()/SAMPLES);
#endif

#ifdef WORSTCASE
        std::generate(data.begin(), data.end(), std::bind(positive_only, rng));
        for (size_t i=0; i<SAMPLES; i += 2)
            data[i] = -data[i];
#else
#   if LIMITED_RANGE
        std::generate(data.begin(), data.end(), std::bind(prevent_overflow, rng));
#   else
        std::generate(data.begin(), data.end(), std::bind(gen, rng));
#   endif
#endif
        measuring::report(" --- randomized from new seed --- ");

        size_t reference_hash = 0ul;
        total_op        += measuring::run_benchmark(data, output, "OP's code      ", original_code::invoke<data_t>, reference_hash);
        total_heuristic += measuring::run_benchmark(data, output, "Heuristic code ", heuristic::invoke<data_t>, reference_hash);
    }

    std::cout << "total time in OP algorithm: " << total_op << " msecs" << std::endl;
    std::cout << "total time in Heuristic algorithm: " << total_heuristic << " msecs" << std::endl;
    std::cout << "speedup factor: " << (1.0*total_op)/(1.0*total_heuristic) << " x" << std::endl;
}

## Makefile
all:test heuristic

CPPFLAGS+= -std=c++0x
CPPFLAGS+= -Wall -g
CPPFLAGS+= -march=native -mtune=native
CPPFLAGS+= -Ofast
#CPPFLAGS+= -Ofast -ffast-math #-ffast-fp

# CPPFLAGS+=-D_GLIBCXX_PARALLEL -fopenmp

%:%.cpp
	g++ $(CPPFLAGS) $^ -o $@

## test.cpp
#define MAGNITUDE     20
#define ITERATIONS    1024
#define VERIFICATION  1
#define VERBOSE       0

#define LIMITED_RANGE 0    // hide difference in output due to absense of overflows
#define USE_FLOATS    0

/////////////////////////////
#if VERIFICATION
#   include <boost/functional/hash.hpp>
#endif
#include <cassert>
#include <vector>
#include <algorithm>
#include <numeric>
#include <iterator>
#include <cstdint>
#include <random>
#include <iostream>
#include <chrono>

#if USE_FLOATS // UGLY hack, not?
    #define int64_t float
    #define uniform_int_distribution uniform_real_distribution
#endif

#if VERBOSE
#   include <boost/spirit/include/karma.hpp>

namespace
{
    template <typename T>
        std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
    {
        using namespace boost::spirit::karma;
        return os << format("v(" << auto_ % ", " << ")", v);
    }
}
#endif

namespace stackoverflow // the code from the Original Post
{
    void theloop(const int64_t in[], int64_t out[], size_t N)
    {
        int64_t max = in[0];
        out[0] = max;

        for(uint32_t i = 1; i < N; i++) {
            int64_t v = in[i];
            max += v;
            if (v > max) max = v;
            out[i] = max;
        }
    }

    void unrolled_only(const int64_t in[], int64_t out[], size_t N)
    {
        int64_t max = 0;

        assert(N % 4 == 0);
        for(size_t i = 0; i < N; i+=4) {
            int64_t t_in0 = in[i+0];
            int64_t t_in1 = in[i+1];
            int64_t t_in2 = in[i+2];
            int64_t t_in3 = in[i+3];

            max += t_in0;
            if (t_in0 > max) max = t_in0;
            out[i+0] = max;

            max += t_in1;
            if (t_in1 > max) max = t_in1;
            out[i+1] = max;

            max += t_in2;
            if (t_in2 > max) max = t_in2;
            out[i+2] = max;

            max += t_in3;
            if (t_in3 > max) max = t_in3;
            out[i+3] = max;
        }
    }

    void bitfiddle_hack(const int64_t in[], int64_t out[], size_t N)
    {
#if !USE_FLOATS
        int64_t max = 0;

        assert(N % 4 == 0);
        for(size_t i = 0; i < N; i++) {
            int64_t v = in[i+0];

            max &= -max >> (sizeof(int64_t)/sizeof(char)*8 - 1);
            max += v;
            out[i] = max;
        }
#endif
    }
}

template <typename C> void run_original_code(const C& data, C& output)
{
    stackoverflow::theloop(&data[0], &output[0], data.size());
}

template <typename C> void unrolled_only(const C& data, C& output)
{
    stackoverflow::unrolled_only(&data[0], &output[0], data.size());
}

template <typename C> void bitfiddle_hack(const C& data, C& output)
{
    stackoverflow::bitfiddle_hack(&data[0], &output[0], data.size());
}

template <typename C> void partial_sum_correct(const C& data, C& output)
{
    int64_t wrongmax = 0;

    std::partial_sum(data.begin(), data.end(), output.begin(),
            [&wrongmax](int64_t max, int64_t v) -> int64_t
            {
                max += v;
                if (v > max) max = v;

                wrongmax = wrongmax<0 ? v : wrongmax + v;

                return max;
            });

    //int64_t offset = output.back() - wrongmax;
    //std::cerr << "OFFSET is " << offset << std::endl;
}

template <typename C> void partial_sum_incorrect(const C& data, C& output)
{
    std::partial_sum(data.begin(), data.end(), output.begin(),
            [](int64_t max, int64_t v)
            {
                return max<0 ? v :  max + v;
            });
}

namespace measuring
{
    auto ts = std::chrono::system_clock::now();

    auto stopwatch_split() -> decltype(ts)
    {
        auto split = ts;
        ts = std::chrono::system_clock::now();
        return split;
    }

    void report(const std::string& msg, bool newline=true)
    {
        auto split = stopwatch_split();
        auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(ts - split);

        std::cout << msg << elapsed.count() << " msecs.";
        if (newline)
            std::cout << std::endl;
    }

    template <typename C>
    void run_benchmark(const C& data, C& output,
            const std::string& label,
            void(implementation)(const C&,C&),
            size_t& reference_hash)
    {
        for (int i=0; i<ITERATIONS; ++i)
        {
            implementation(data, output);
        }

#if VERIFICATION
        size_t hash = boost::hash_range(output.begin(), output.end());

        if (reference_hash && (reference_hash!=hash))
        {
#if VERBOSE
            std::cerr << label << " hash mismatch! " << std::hex << hash << " (reference implementation: " << reference_hash << ")" << std::dec << std::endl;
            std::cerr << "data:       " << data << std::endl;
            std::cerr << "output:     " << output << std::endl;
            run_original_code(data, output);
            std::cerr << "ref output: " << output << std::endl;
#endif
        }
        else
        {
            reference_hash = hash;
        }
#endif

        report(label + " elapsed: ", false);
        std::cout << " Hash " << std::hex << hash << std::dec << (hash==reference_hash? " ok" : " FAILED!") << std::endl;
    }

}

int main()
{
    typedef std::vector<int64_t> data_t;
    data_t data(1ul << MAGNITUDE);
    data_t output(data.size());

    std::cout << "Benchmarking 100 x " << ITERATIONS << " iterations for data length " << data.size() << " (i.e. 100 different random seeds)" << std::endl;
    for (size_t seed = 0; seed < 100; seed++)
    {
        std::mt19937 rng(seed);
#if LIMITED_RANGE
        std::uniform_int_distribution<int64_t> gen(
                std::numeric_limits<int64_t>::min() >> MAGNITUDE,
                std::numeric_limits<int64_t>::max() >> MAGNITUDE);
        std::generate(data.begin(), data.end(), std::bind(gen, rng));
#else
        std::uniform_int_distribution<int64_t> gen;
        std::generate(data.begin(), data.end(), std::bind(gen, rng));
#endif
        measuring::report(" --- randomized from new seed --- ");

        size_t reference_hash = 0ul;
        measuring::run_benchmark(data, output, "OP's code   ", run_original_code <data_t>    , reference_hash);

        // these handle overflow exactly like the original algorithm:
        measuring::run_benchmark(data, output, "Unfold Only ", unrolled_only<data_t>         , reference_hash);
        measuring::run_benchmark(data, output, "STL correct ", partial_sum_correct<data_t>   , reference_hash);

        // these handle overflow differently from the original algorithm:
#if !USE_FLOATS
        measuring::run_benchmark(data, output, "Bit Fiddle  ", bitfiddle_hack<data_t>        , reference_hash);
#endif
        measuring::run_benchmark(data, output, "STL based   ", partial_sum_incorrect <data_t>, reference_hash);
    }
}
	#define MAGNITUDE 20
	#define SAMPLES (1ul<<MAGNITUDE)
	#define ITERATIONS 1024

	#define VERIFICATION 1
	#define VERBOSE 0
	#define PROFILING 0

	#define WORSTCASE 1
	#define USE_FLOATS 0

	#if !WORSTCASE
	# define LIMITED_RANGE 1 // hide difference in output due to absense of overflows
	#endif

	/////////////////////////////
	#if VERIFICATION
	# include <boost/functional/hash.hpp>
	#endif
	#include <cassert>
	#include <vector>
	#include <algorithm>
	#include <numeric>
	#include <iterator>
	#include <cstdint>
	#include <random>
	#include <iostream>
	#include <chrono>

	#if USE_FLOATS // UGLY hack, not?
	typedef float element_t;
	#define uniform_distribution uniform_real_distribution
	#define xxx_ float_
	#else
	typedef int64_t element_t;
	#define uniform_distribution uniform_int_distribution
	#define xxx_ int_
	#endif

	#if VERBOSE
	# include <boost/spirit/include/karma.hpp>

	namespace
	{
	template <typename T>
	std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
	{
	using namespace boost::spirit::karma;
	return os << format("v(" << xxx_ % ",\t" << ")", v);
	}
	}
	#endif

	namespace original_code // the code from the Original Post
	{
	void theloop(const element_t in[], element_t out[], size_t N)
	{
	element_t max = in[0];
	out[0] = max;

	for(uint32_t i = 1; i < N; i++) {
	element_t v = in[i];
	max += v;
	if (v > max) max = v;
	out[i] = max;
	}
	}

	template <typename C> void invoke(const C& data, C& output)
	{
	original_code::theloop(&data[0], &output[0], data.size());
	}
	}

	namespace heuristic
	{
	inline element_t calc(element_t previous_max, element_t v)
	{
	previous_max += v;
	if (v > previous_max)
	previous_max = v;
	return previous_max;
	}

	template <typename InIt, typename OutIt>
	OutIt loopify(element_t &max, InIt& b, const InIt e, OutIt o)
	{
	do o++ = (max = calc(max, b++));
	while (b!=e && (b>max \|\| b<0));
	return o;
	}

	template <typename C> void invoke(const C& data, C& output)
	{
	typedef typename C::const_iterator InIt;
	typedef typename C::iterator OutIt;

	InIt b = data.begin();
	InIt e = data.end();
	OutIt o = output.begin();

	element_t max = 0l;

	/*
	*while (b!=e)
	* o = loopify(max, b, e, o);
	*/
	#if PROFILING
	size_t fastforward = 0, conventional = 0;
	#endif

	InIt m;
	while (b!=e)
	{
	m = std::find_if(b,e, [](element_t i) { return i<0; });
	#if PROFILING
	if (b!=m) fastforward++;
	#endif

	//o = std::partial_sum(b,m,o, [&max](element_t prv, element_t v) { return max = calc(prv, v); });
	//o = std::transform(b,m,o, [&max](element_t v) { return max = calc(max, v); });
	o = std::transform(b,m,o, [&max](element_t v) { return max += v; });
	if (m==e) break;
	b = m;
	m = e; // redundant
	#if PROFILING
	conventional++;
	#endif
	o = loopify(max,b,e,o);
	}

	#if PROFILING
	if (m==e)
	{
	std::cerr << "heuristics: processed " << (m-data.begin()) << " out of " << data.size() << " (last region [" << (b-data.begin()) << ",+" << (m-b) << "), " << (e-m) << " remaining)" << std::endl;
	std::cerr << "regions: " << conventional << " conventional, " << fastforward << " fastforward" << std::endl;
	}
	#endif
	}

	}

	namespace measuring
	{
	auto ts = std::chrono::system_clock::now();

	auto stopwatch_split() -> decltype(ts)
	{
	auto split = ts;
	ts = std::chrono::system_clock::now();
	return split;
	}

	size_t report(const std::string& msg, bool newline=true)
	{
	auto split = stopwatch_split();
	auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(ts - split);

	std::cout << msg << elapsed.count() << " msecs.";
	if (newline)
	std::cout << std::endl;

	return elapsed.count();
	}

	template <typename C>
	size_t run_benchmark(const C& data, C& output,
	const std::string& label,
	void(implementation)(const C&,C&),
	size_t& reference_hash)
	{
	fill(output.begin(), output.end(), 0);
	for (int i=0; i<ITERATIONS; ++i)
	{
	implementation(data, output);
	}

	#if VERIFICATION
	size_t hash = boost::hash_range(output.begin(), output.end());

	if (!reference_hash)
	{
	reference_hash = hash;
	}
	else
	{
	#if VERBOSE
	if (reference_hash!=hash)
	{
	std::cerr << label << " hash mismatch! " << std::hex << hash << " (reference implementation: " << reference_hash << ")" << std::dec << std::endl;
	std::cerr << "data: " << data << std::endl;
	std::cerr << "output: " << output << std::endl;
	original_code::invoke(data, output);
	std::cerr << "ref output: " << output << std::endl;
	}
	else
	{
	//std::cerr << "data: " << data << std::endl;
	//std::cerr << "output: " << output << std::endl;
	//run_original_code(data, output);
	//std::cerr << "ref output: " << output << std::endl;
	}
	#endif
	#endif
	}

	size_t elapsed = report(label + " elapsed: ", false);
	std::cout << " Hash " << std::hex << hash << std::dec << (hash==reference_hash? " ok" : " FAILED!") << std::endl;
	return elapsed;
	}

	}

	int main()
	{
	typedef std::vector<element_t> data_t;
	data_t data(SAMPLES);
	data_t output(data.size());

	size_t total_op = 0ul,
	total_heuristic = 0ul;

	std::cout << "Benchmarking 100 x " << ITERATIONS << " iterations for data length " << data.size() << " (i.e. 100 different random seeds)" << std::endl;
	for (size_t seed = 0; seed < 10; seed++)
	{
	std::mt19937 rng(seed);

	std::uniform_distribution<element_t> gen; // uniform, unbiased
	std::uniform_distribution<element_t> biased_worstcase(100,1115);
	#if USE_FLOATS
	std::uniform_distribution<element_t> prevent_overflow; // uniform, unbiased
	std::uniform_distribution<element_t> negative_only(std::numeric_limits<element_t>::min(), 0);
	std::uniform_distribution<element_t> positive_only(0, std::numeric_limits<int64_t>::max());
	#else
	std::uniform_distribution<element_t> prevent_overflow(
	std::numeric_limits<element_t>::min()/SAMPLES,
	std::numeric_limits<element_t>::max()/SAMPLES);
	std::uniform_distribution<element_t> negative_only(std::numeric_limits<element_t>::min()/SAMPLES, 0);
	std::uniform_distribution<element_t> positive_only(0, std::numeric_limits<element_t>::max()/SAMPLES);
	#endif

	#ifdef WORSTCASE
	std::generate(data.begin(), data.end(), std::bind(positive_only, rng));
	for (size_t i=0; i<SAMPLES; i += 2)
	data[i] = -data[i];
	#else
	# if LIMITED_RANGE
	std::generate(data.begin(), data.end(), std::bind(prevent_overflow, rng));
	# else
	std::generate(data.begin(), data.end(), std::bind(gen, rng));
	# endif
	#endif
	measuring::report(" --- randomized from new seed --- ");

	size_t reference_hash = 0ul;
	total_op += measuring::run_benchmark(data, output, "OP's code ", original_code::invoke<data_t>, reference_hash);
	total_heuristic += measuring::run_benchmark(data, output, "Heuristic code ", heuristic::invoke<data_t>, reference_hash);
	}

	std::cout << "total time in OP algorithm: " << total_op << " msecs" << std::endl;
	std::cout << "total time in Heuristic algorithm: " << total_heuristic << " msecs" << std::endl;
	std::cout << "speedup factor: " << (1.0total_op)/(1.0total_heuristic) << " x" << std::endl;
	}
	all:test heuristic

	CPPFLAGS+= -std=c++0x
	CPPFLAGS+= -Wall -g
	CPPFLAGS+= -march=native -mtune=native
	CPPFLAGS+= -Ofast
	#CPPFLAGS+= -Ofast -ffast-math #-ffast-fp

	# CPPFLAGS+=-D_GLIBCXX_PARALLEL -fopenmp

	%:%.cpp
	g++ $(CPPFLAGS) $^ -o $@