ACUVE/main.cpp Secret

## main.cpp
#include <algorithm>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <functional>
#include <iostream>
#include <limits>
#include <mutex>
#include <shared_mutex>
#include <thread>
#include <utility>
#include <vector>

#include <omp.h>

/// Measures how long a single invocation of `func` takes.
///
/// @param func  Callable to run; it is invoked exactly once with `args`.
/// @param args  Arguments perfectly forwarded to `func`.
/// @return      Elapsed time of the call, converted to nanoseconds.
///
/// The interval is taken from a monotonic clock, so adjustments of the system
/// time that happen while `func` runs cannot distort (or negate) the result.
template< typename FUNC, typename... Args >
inline std::chrono::nanoseconds time( FUNC &&func, Args &&... args )
{
    using clock = std::chrono::steady_clock;

    auto const start = clock::now();
    // Forward the callable as well, so its value category is preserved for the call.
    std::forward< FUNC >( func )( std::forward< Args >( args )... );
    auto const end = clock::now();

    return std::chrono::duration_cast< std::chrono::nanoseconds >( end - start );
}

/// Fixed-size pool that runs one C-style callback on all participants at once.
///
/// The pool owns `num_thread - 1` worker threads; the thread calling Do() takes
/// part as participant 0. Workers sleep on a condition variable until `counter`
/// changes, run the currently published callback and then report back by
/// incrementing `waiting_counter`.
class ThreadPool
{
public:
    /// Callback signature: user data pointer, index of the calling participant,
    /// total number of participants.
    using func_type = void (*)( void *data, unsigned int num, unsigned int thread_num );

private:
    // Callback published by Do(); a null value makes woken workers leave their loop.
    std::atomic< func_type >    func;
    // Data pointer passed as first argument to the callback.
    std::atomic< void * >       func_data;
    // Protects `counter` for the condition-variable wait.
    std::mutex                  mutex;
    // Number of workers that have announced they are (about to be) waiting since the last reset.
    std::atomic< unsigned int > waiting_counter;
    // Notified every time `counter` is incremented.
    std::condition_variable     cond;
    // Generation number: incremented once per Do() and once by the destructor.
    unsigned int                counter;
    // Total number of participants (workers plus the calling thread); never 0.
    unsigned int const          num_thread;
    // The `num_thread - 1` worker threads.
    std::vector< std::thread >  thread;

private:
    /// Spins, yielding the CPU, until every worker has incremented `waiting_counter`.
    void WaitAllWaiting()
    {
        unsigned int const len = num_thread - 1;
        while( waiting_counter.load() != len )
            std::this_thread::yield();
    }

public:
    /// Starts the workers and returns once all of them have announced they are waiting.
    ///
    /// @param hc  Desired number of participants; 0 is treated as 1, in which
    ///            case no worker thread is created.
    ThreadPool( unsigned int const hc = std::thread::hardware_concurrency() )
        : func( nullptr )
        , func_data( nullptr )
        , waiting_counter( 0 )
        , counter( 0 )
        , num_thread( [ & ]{ if( hc ) return hc; return 1u; }() )
    {
        unsigned int const len = num_thread - 1;
        for( unsigned int i = 0; i < len; ++i )
            thread.emplace_back( &ThreadPool::Thread, this, i );
        WaitAllWaiting();
    }
    /// Publishes a null callback, wakes all workers so they exit, and joins them.
    ~ThreadPool()
    {
        func.store( nullptr );
        {
            std::lock_guard< decltype( mutex ) > lock( mutex );
            counter += 1;
        }
        cond.notify_all();
        for( auto &&th : thread )
            th.join();
    }
    /// Worker loop executed by each pool thread.
    ///
    /// @param num  Zero-based worker index; the callback receives `num + 1`
    ///             because index 0 is used by the thread calling Do().
    void Thread( unsigned int const num )
    {
        // Last generation this worker has seen.
        unsigned int c = counter;
        unsigned int const num_t = num_thread;
        while( true )
        {
            // Announce "idle" before blocking; the wait predicate below still
            // catches a generation change that happens in between.
            waiting_counter.fetch_add( 1 );
            {
                std::unique_lock< decltype( mutex ) > lock( mutex );
                cond.wait( lock, [ & ] { return c != counter; } );
            }
            c = counter;
            auto f = func.load();
            // A null callback is the shutdown signal sent by the destructor.
            if( !f )
                break;
            f( func_data.load(), num + 1, num_t );
        }
    }
    /// Runs `f` once on every participant and returns when all workers are idle again.
    ///
    /// The calling thread executes `f( data, 0, num_thread )` itself while the
    /// workers execute `f( data, 1 .. num_thread - 1, num_thread )`.
    /// Does nothing when `f` is null.
    /// NOTE(review): nothing here serializes concurrent Do() calls; it looks
    /// like a single caller at a time is assumed — confirm.
    ///
    /// @param f     Callback to run.
    /// @param data  Opaque pointer forwarded to every invocation of `f`.
    void Do( func_type f, void *data )
    {
        if( f == nullptr )
            return;
        // These stores precede the locked `counter` update below; workers load
        // them only after waiting on that same mutex.
        func.store( f, std::memory_order_relaxed );
        func_data.store( data, std::memory_order_relaxed );
        waiting_counter.store( 0, std::memory_order_release );
        {
            std::lock_guard< decltype( mutex ) > lock( mutex );
            counter += 1;
        }
        cond.notify_all();
        f( data, 0, num_thread );
        WaitAllWaiting();
    }
};

/// Argument pack passed to the integration callback through a `void *`.
///
/// Both members have defaults so that a default-constructed instance never
/// holds indeterminate values.
struct ThreadData
{
    /// Total number of rectangles in the sum (shared by all participants).
    unsigned int c = 0;
    /// Accumulator each participant adds its partial sum to; not owned.
    std::atomic< double > *s = nullptr;
};

/// Pool callback: adds this participant's share of a left Riemann sum of
/// 1 / (x^2 + 1) on [0, 1] to the shared accumulator.
///
/// @param data        Pointer to a ThreadData; `c` is the total number of
///                    rectangles and `s` the accumulator to add to.
/// @param num         Zero-based index of the calling participant.
/// @param thread_num  Total number of participants splitting the range.
///
/// The `c` rectangles are split into `thread_num` contiguous ranges whose
/// sizes differ by at most one; the first `c % thread_num` participants get
/// the extra element. The call is a no-op when `data` or the accumulator is
/// null, or when `thread_num` is 0.
void aaa( void *data, unsigned int const num, unsigned int const thread_num )
{
    // A void* that came from a ThreadData* only needs static_cast.
    auto const td = static_cast< ThreadData * >( data );
    // Guards the dereferences and the integer division/modulo below.
    if( td == nullptr || td->s == nullptr || thread_num == 0 )
        return;

    unsigned int const c = td->c;
    std::atomic< double > *const ret = td->s;

    unsigned int const part = c / thread_num;
    unsigned int const rest = c % thread_num;
    unsigned int const s = part * num + ( num < rest ? num : rest );
    unsigned int const e = s + part + ( num < rest ? 1u : 0u );

    double sum = 0;
    double const delta = 1.0 / c;
    for( unsigned int i = s; i < e; ++i )
    {
        double const x = delta * i;
        sum += delta * 1.0 / ( x * x + 1.0 );
    }

    // std::atomic<double> has no fetch_add before C++20, so add via a CAS loop.
    // On failure `expected` is refreshed with the current value and the new
    // total is recomputed from it.
    double expected = ret->load();
    while( !ret->compare_exchange_weak( expected, expected + sum ) )
    {
    }
}

/// Computes a left Riemann sum of 1 / (x^2 + 1) on [0, 1] with `c` rectangles,
/// using an OpenMP `parallel for` with a `+` reduction over the loop.
///
/// @param v  Receives the total; its previous value is overwritten.
/// @param c  Number of rectangles, i.e. loop iterations.
void openmp( std::atomic< double > *const v, unsigned int const c )
{
    double const step = 1.0 / c;
    double total = 0;
    #pragma omp parallel for reduction ( + : total )
    for( unsigned int k = 0; k < c; ++k )
    {
        double const pos = step * k;
        total += step * 1.0 / (pos * pos + 1.0);
    }
    v->store( total );
}

int main( int argc, char **argv )
{
    constexpr unsigned int NUM_TEST = 100000;
    unsigned int const count = 10 * argc;
    unsigned long long int mintime_t = std::numeric_limits< decltype( mintime_t ) >::max();
    unsigned long long int mintime_o = std::numeric_limits< decltype( mintime_o ) >::max();
    unsigned long long int sumtime_t = 0, sumtime_o = 0;
    double minv_t = std::numeric_limits< decltype( minv_t ) >::max();
    double minv_o = std::numeric_limits< decltype( minv_o ) >::max();
    std::atomic< double > v;
    {
        ThreadPool pool;
        ThreadData td;
        td.c = count;
        td.s = &v;
        for( unsigned int i = 0; i < NUM_TEST; ++i )
        {
            v = 0;
            unsigned long long int const t = time( std::bind( &ThreadPool::Do, &pool, aaa, &td ) ).count();
            mintime_t = std::min( mintime_t, t );
            minv_t = std::min( minv_t, v.load() );
            sumtime_t += t;
        }
    }
    {
        for( unsigned int i = 0; i < NUM_TEST; ++i )
        {
            v = 0;
            unsigned long long int const t = time( openmp, &v, count ).count();
            mintime_o = std::min( mintime_o, t );
            minv_o = std::min( minv_o, v.load() );
            sumtime_o += t;
        }
    }
    {
        std::cout << "NUM_TSET: " << NUM_TEST << ", count: " << count << std::endl;
        std::cout << "Thread min time[ns]: " << mintime_t << ", OpenMP min time[ns]: " << mintime_o << std::endl;
        std::cout << "Thread mean time[ns]: " << static_cast< double >( sumtime_t ) / NUM_TEST << ", OpenMP mean time[ns]: " << static_cast< double >( sumtime_o ) / NUM_TEST << std::endl;
        std::cout << "Thread_v: " << minv_t << ", OpenMP_v: " << minv_o << std::endl;
    }
}
	#include <atomic>
	#include <chrono>
	#include <iostream>
	#include <limits>
	#include <condition_variable>
	#include <thread>
	#include <vector>
	#include <shared_mutex>
	#include <omp.h>

	/// Measures how long a single invocation of `func` takes.
	///
	/// @param func  Callable to run; it is invoked exactly once with `args`.
	/// @param args  Arguments perfectly forwarded to `func`.
	/// @return      Elapsed time of the call, converted to nanoseconds.
	///
	/// The interval is taken from a monotonic clock, so adjustments of the system
	/// time that happen while `func` runs cannot distort (or negate) the result.
	template< typename FUNC, typename... Args >
	inline std::chrono::nanoseconds time( FUNC &&func, Args &&... args )
	{
	    using clock = std::chrono::steady_clock;

	    auto const start = clock::now();
	    // Forward the callable as well, so its value category is preserved for the call.
	    std::forward< FUNC >( func )( std::forward< Args >( args )... );
	    auto const end = clock::now();

	    return std::chrono::duration_cast< std::chrono::nanoseconds >( end - start );
	}

	/// Fixed-size pool that runs one C-style callback on all participants at once.
	///
	/// The pool owns `num_thread - 1` worker threads; the thread calling Do() takes
	/// part as participant 0. Workers sleep on a condition variable until `counter`
	/// changes, run the currently published callback and then report back by
	/// incrementing `waiting_counter`.
	class ThreadPool
	{
	public:
	    /// Callback signature: user data pointer, index of the calling participant,
	    /// total number of participants.
	    using func_type = void (*)( void *data, unsigned int num, unsigned int thread_num );

	private:
	    // Callback published by Do(); a null value makes woken workers leave their loop.
	    std::atomic< func_type > func;
	    // Data pointer passed as first argument to the callback.
	    std::atomic< void * > func_data;
	    // Protects `counter` for the condition-variable wait.
	    std::mutex mutex;
	    // Number of workers that have announced they are (about to be) waiting since the last reset.
	    std::atomic< unsigned int > waiting_counter;
	    // Notified every time `counter` is incremented.
	    std::condition_variable cond;
	    // Generation number: incremented once per Do() and once by the destructor.
	    unsigned int counter;
	    // Total number of participants (workers plus the calling thread); never 0.
	    unsigned int const num_thread;
	    // The `num_thread - 1` worker threads.
	    std::vector< std::thread > thread;

	private:
	    /// Spins, yielding the CPU, until every worker has incremented `waiting_counter`.
	    void WaitAllWaiting()
	    {
	        unsigned int const len = num_thread - 1;
	        while( waiting_counter.load() != len )
	            std::this_thread::yield();
	    }

	public:
	    /// Starts the workers and returns once all of them have announced they are waiting.
	    ///
	    /// @param hc  Desired number of participants; 0 is treated as 1, in which
	    ///            case no worker thread is created.
	    ThreadPool( unsigned int const hc = std::thread::hardware_concurrency() )
	        : func( nullptr )
	        , func_data( nullptr )
	        , waiting_counter( 0 )
	        , counter( 0 )
	        , num_thread( [ & ]{ if( hc ) return hc; return 1u; }() )
	    {
	        unsigned int const len = num_thread - 1;
	        for( unsigned int i = 0; i < len; ++i )
	            thread.emplace_back( &ThreadPool::Thread, this, i );
	        WaitAllWaiting();
	    }

	    /// Publishes a null callback, wakes all workers so they exit, and joins them.
	    ~ThreadPool()
	    {
	        func.store( nullptr );
	        {
	            std::lock_guard< decltype( mutex ) > lock( mutex );
	            counter += 1;
	        }
	        cond.notify_all();
	        for( auto &&th : thread )
	            th.join();
	    }

	    /// Worker loop executed by each pool thread.
	    ///
	    /// @param num  Zero-based worker index; the callback receives `num + 1`
	    ///             because index 0 is used by the thread calling Do().
	    void Thread( unsigned int const num )
	    {
	        // Last generation this worker has seen.
	        unsigned int c = counter;
	        unsigned int const num_t = num_thread;
	        while( true )
	        {
	            // Announce "idle" before blocking; the wait predicate below still
	            // catches a generation change that happens in between.
	            waiting_counter.fetch_add( 1 );
	            {
	                std::unique_lock< decltype( mutex ) > lock( mutex );
	                cond.wait( lock, [ & ] { return c != counter; } );
	            }
	            c = counter;
	            auto f = func.load();
	            // A null callback is the shutdown signal sent by the destructor.
	            if( !f )
	                break;
	            f( func_data.load(), num + 1, num_t );
	        }
	    }

	    /// Runs `f` once on every participant and returns when all workers are idle again.
	    ///
	    /// The calling thread executes `f( data, 0, num_thread )` itself while the
	    /// workers execute `f( data, 1 .. num_thread - 1, num_thread )`.
	    /// Does nothing when `f` is null.
	    /// NOTE(review): nothing here serializes concurrent Do() calls; it looks
	    /// like a single caller at a time is assumed — confirm.
	    ///
	    /// @param f     Callback to run.
	    /// @param data  Opaque pointer forwarded to every invocation of `f`.
	    void Do( func_type f, void *data )
	    {
	        if( f == nullptr )
	            return;
	        // These stores precede the locked `counter` update below; workers load
	        // them only after waiting on that same mutex.
	        func.store( f, std::memory_order_relaxed );
	        func_data.store( data, std::memory_order_relaxed );
	        waiting_counter.store( 0, std::memory_order_release );
	        {
	            std::lock_guard< decltype( mutex ) > lock( mutex );
	            counter += 1;
	        }
	        cond.notify_all();
	        f( data, 0, num_thread );
	        WaitAllWaiting();
	    }
	};

	/// Argument pack passed to the integration callback through a `void *`.
	///
	/// Both members have defaults so that a default-constructed instance never
	/// holds indeterminate values.
	struct ThreadData
	{
	    /// Total number of rectangles in the sum (shared by all participants).
	    unsigned int c = 0;
	    /// Accumulator each participant adds its partial sum to; not owned.
	    std::atomic< double > *s = nullptr;
	};

	/// Pool callback: adds this participant's share of a left Riemann sum of
	/// 1 / (x^2 + 1) on [0, 1] to the shared accumulator.
	///
	/// @param data        Pointer to a ThreadData; `c` is the total number of
	///                    rectangles and `s` the accumulator to add to.
	/// @param num         Zero-based index of the calling participant.
	/// @param thread_num  Total number of participants splitting the range.
	///
	/// The `c` rectangles are split into `thread_num` contiguous ranges whose
	/// sizes differ by at most one; the first `c % thread_num` participants get
	/// the extra element. The call is a no-op when `data` or the accumulator is
	/// null, or when `thread_num` is 0.
	void aaa( void *data, unsigned int const num, unsigned int const thread_num )
	{
	    // A void* that came from a ThreadData* only needs static_cast.
	    auto const td = static_cast< ThreadData * >( data );
	    // Guards the dereferences and the integer division/modulo below.
	    if( td == nullptr || td->s == nullptr || thread_num == 0 )
	        return;

	    unsigned int const c = td->c;
	    std::atomic< double > *const ret = td->s;

	    unsigned int const part = c / thread_num;
	    unsigned int const rest = c % thread_num;
	    unsigned int const s = part * num + ( num < rest ? num : rest );
	    unsigned int const e = s + part + ( num < rest ? 1u : 0u );

	    double sum = 0;
	    double const delta = 1.0 / c;
	    for( unsigned int i = s; i < e; ++i )
	    {
	        double const x = delta * i;
	        sum += delta * 1.0 / ( x * x + 1.0 );
	    }

	    // std::atomic<double> has no fetch_add before C++20, so add via a CAS loop.
	    // On failure `expected` is refreshed with the current value and the new
	    // total is recomputed from it.
	    double expected = ret->load();
	    while( !ret->compare_exchange_weak( expected, expected + sum ) )
	    {
	    }
	}

	/// Computes a left Riemann sum of 1 / (x^2 + 1) on [0, 1] with `c` rectangles,
	/// using an OpenMP `parallel for` with a `+` reduction over the loop.
	///
	/// @param v  Receives the total; its previous value is overwritten.
	/// @param c  Number of rectangles, i.e. loop iterations.
	void openmp( std::atomic< double > *const v, unsigned int const c )
	{
	    double const step = 1.0 / c;
	    double total = 0;
	    #pragma omp parallel for reduction ( + : total )
	    for( unsigned int k = 0; k < c; ++k )
	    {
	        double const pos = step * k;
	        total += step * 1.0 / (pos * pos + 1.0);
	    }
	    v->store( total );
	}

	int main( int argc, char **argv )
	{
	constexpr unsigned int NUM_TEST = 100000;
	unsigned int const count = 10 * argc;
	unsigned long long int mintime_t = std::numeric_limits< decltype( mintime_t ) >::max();
	unsigned long long int mintime_o = std::numeric_limits< decltype( mintime_o ) >::max();
	unsigned long long int sumtime_t = 0, sumtime_o = 0;
	double minv_t = std::numeric_limits< decltype( minv_t ) >::max();
	double minv_o = std::numeric_limits< decltype( minv_o ) >::max();
	std::atomic< double > v;
	{
	ThreadPool pool;
	ThreadData td;
	td.c = count;
	td.s = &v;
	for( unsigned int i = 0; i < NUM_TEST; ++i )
	{
	v = 0;
	unsigned long long int const t = time( std::bind( &ThreadPool::Do, &pool, aaa, &td ) ).count();
	mintime_t = std::min( mintime_t, t );
	minv_t = std::min( minv_t, v.load() );
	sumtime_t += t;
	}
	}
	{
	for( unsigned int i = 0; i < NUM_TEST; ++i )
	{
	v = 0;
	unsigned long long int const t = time( openmp, &v, count ).count();
	mintime_o = std::min( mintime_o, t );
	minv_o = std::min( minv_o, v.load() );
	sumtime_o += t;
	}
	}
	{
	std::cout << "NUM_TSET: " << NUM_TEST << ", count: " << count << std::endl;
	std::cout << "Thread min time[ns]: " << mintime_t << ", OpenMP min time[ns]: " << mintime_o << std::endl;
	std::cout << "Thread mean time[ns]: " << static_cast< double >( sumtime_t ) / NUM_TEST << ", OpenMP mean time[ns]: " << static_cast< double >( sumtime_o ) / NUM_TEST << std::endl;
	std::cout << "Thread_v: " << minv_t << ", OpenMP_v: " << minv_o << std::endl;
	}
	}