coroutine benchmark

Benchmark Data

Updated 2019-09-29 with a new round of results, adding C++20 Coroutine numbers.

| Component (avg) | Switch cost (1 coroutine) | Create cost (1000 coroutines) | Switch cost (1000 coroutines) | Create cost (30000 coroutines) | Switch cost (30000 coroutines) |
| --- | --- | --- | --- | --- | --- |
| Stack size (if configurable) | 16 KB | 2 MB | 2 MB | 64 KB | 64 KB |
| C++20 Coroutine - Clang | 5 ns | 130 ns | 6 ns | 136 ns | 9 ns |
| C++20 Coroutine - MSVC | 10 ns | 407 ns | 14 ns | 369 ns | 28 ns |
| libcopp | 77 ns | 4.1 us | 105 ns | 3.8 us | 273 ns |
| libcopp + dynamic stack pool | 74 ns | 101 ns | 110 ns | 222 ns | 270 ns |
| libcopp + libcotask | 96 ns | 4.2 us | 156 ns | 4.2 us | 389 ns |
| libcopp + libcotask + dynamic stack pool | 96 ns | 197 ns | 153 ns | 329 ns | 371 ns |
| libco + static stack pool | 84 ns | 3.9 us | 168 ns | 4.2 us | 450 ns |
| libco (shared stack, 4K used) | 83 ns | 3.9 us | 529 ns | 3.9 us | 1073 ns |
| libco (shared stack, 8K used) | 86 ns | 4.0 us | 828 ns | 3.9 us | 1596 ns |
| libco (shared stack, 32K used) | - | 4.0 us | 9152 ns | 3.9 us | 11.5 us |
| libgo | 30 ns | 8.3 us | 32 us | 5.5 us | 150 ns |
| libgo (2018 version, with boost) | 197 ns | 5.3 us | 124 ns | 2.3 us | 441 ns |
| libgo (2018 version, with ucontext) | 539 ns | 7.0 us | 482 ns | 2.7 us | 921 ns |
| goroutine (golang) | 425 ns | 1.0 us | 710 ns | 1.0 us | 1047 ns |
| linux ucontext | 439 ns | 4.4 us | 505 ns | 4.8 us | 890 ns |

About libgo: libgo seems to have gone through a major rewrite since the version I first tested. Like libcopp, it now bundles boost.context and can no longer switch via ucontext. Its worker thread (the Processer) now pulls a batch of Tasks from the scheduler at once, so scheduling inside a Processer no longer has to be thread-safe, which removes a lot of synchronization and state-check overhead. One reason it can be designed this way is that the coroutine task interface is hidden from users, so it is rarely called directly and there is little room for misuse.

Its Task objects are also stored separately, just like libcopp v1. In a simple stress test this pushes the cache hit rate artificially high, which makes the numbers even less representative of real projects. That is why, in a simple switch benchmark, its performance approaches the cost of raw boost.context with no cache misses (raw boost.context switching does not even reach 30 ns once the cache misses; as I recall, I measured 60 ns+ in the past). For the same reason, once the coroutine count grows and the hit rate drops, its performance also degrades sharply. I tried slightly modifying libgo's code to increase cache misses, and the numbers dropped noticeably, although libgo's switch performance stays close to libcopp's. I have not looked into the details, so I do not know why its switch and creation costs change so drastically when the stack is enlarged; in theory only address space is reserved and never touched, so the difference should not be that large.

In addition, libgo's co::CoroutineOptions lets you install a custom global allocator, so you can hook up your own stack pool; otherwise every coroutine creation goes through an mmap system call and then triggers page faults, which makes creation fairly expensive. According to the libgo author's own tests, libgo is roughly 3-4x faster than goroutine (golang), which is close to what my old methodology measured on the earlier version, and also close to libcopp. That is why I also kept the earlier results (the old machine those were run on differs from the current one; its single-core switch performance was generally a bit higher).
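
The stack-pool idea mentioned above (and used by the "libcopp + dynamic stack pool" rows) boils down to recycling previously mapped stacks instead of paying mmap plus page faults on every creation. Below is a minimal sketch of that idea in isolation; it is not libgo's or libcopp's actual interface, and the names are purely illustrative:

#include <cstddef>
#include <vector>
#include <sys/mman.h>

// Illustrative only: a trivial free list of fixed-size coroutine stacks.
class stack_pool {
public:
    explicit stack_pool(std::size_t stack_size) : stack_size_(stack_size) {}
    ~stack_pool() {
        for (void* p : free_list_) ::munmap(p, stack_size_);
    }
    void* acquire() {
        if (!free_list_.empty()) {  // fast path: reuse a warm mapping, no syscall, pages already faulted in
            void* p = free_list_.back();
            free_list_.pop_back();
            return p;
        }
        void* p = ::mmap(nullptr, stack_size_, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);  // slow path: a fresh mapping
        return p == MAP_FAILED ? nullptr : p;
    }
    void release(void* p) { free_list_.push_back(p); }  // keep the mapping alive for the next coroutine

private:
    std::size_t stack_size_;
    std::vector<void*> free_list_;
};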

Build & run script

libcopp & libcotask

bash ./cmake_dev.sh -b RelWithDebInfo -us;
make -j4;
make benchmark;

goroutine

/usr/local/go/bin/go build -o goroutine_benchmark goroutine_benchmark.go;
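# args: <goroutine count> <switch rounds>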
./goroutine_benchmark 1 3000000;
./goroutine_benchmark 1000 1000;
./goroutine_benchmark 30000 100;

ucontext

g++ -O2 -g -DNDEBUG -ggdb   -Wall -Werror -fPIC ucontext_benchmark.cpp -o ucontext_benchmark ;
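# args: <coroutine count> <switch count per coroutine> <stack size in KB>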
./ucontext_benchmark 1 3000000 16;
./ucontext_benchmark 1000 1000 2048;
./ucontext_benchmark 30000 100 64;

libco

# build libco
mkdir build && cd build;
cmake ../libco;
make -j4;
cd ..;

# build exe
g++ -O2 -g -DNDEBUG -ggdb -Wall -Werror -fPIC libco_benchmark.cpp build/libcolib.a -o libco_benchmark -Ilibco -lpthread -lz -lm -ldl ;
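# args: <coroutine count> <switch count per coroutine> <stack size in KB> [<shared stack usage in KB>]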
# static stack pool
./libco_benchmark 1 3000000 16;
./libco_benchmark 1000 1000 2048;
./libco_benchmark 30000 100 64;

# shared stack=4K
./libco_benchmark 1 3000000 16 4;
./libco_benchmark 1000 1000 2048 4;
./libco_benchmark 30000 100 64 4;

# shared stack=8K
./libco_benchmark 1 3000000 16 8;
./libco_benchmark 1000 1000 2048 8;
./libco_benchmark 30000 100 64 8;

# shared stack=32K
./libco_benchmark 1000 1000 2048 32;
./libco_benchmark 30000 100 64 32;

libgo with boost

The latest libgo already bundles boost.context, so there is no longer any need to point it at a separate BOOST installation.

# build libgo
mkdir build && cd build;
cmake ../libgo -DCMAKE_BUILD_TYPE=RelWithDebInfo;
make -j4;
cd ..;

# build exe
g++ -O2 -g -DNDEBUG -ggdb -Wall -Werror -fPIC libgo_benchmark.cpp -o libgo_benchmark -Ilibgo -Ilibgo/libgo -Ilibgo/libgo/linux -Lbuild -llibgo -lrt -lpthread -ldl;
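# args: <coroutine count> <switch count per coroutine> <stack size in KB>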
./libgo_benchmark 1 3000000 16;
./libgo_benchmark 1000 1000 2048;
./libgo_benchmark 30000 100 64;

libgo with ucontext

The latest libgo bundles boost.context and no longer offers a ucontext option; this variant only applies to the older version.

# build libgo
mkdir build && cd build;
cmake ../libgo -DCMAKE_BUILD_TYPE=RelWithDebInfo -DENABLE_BOOST_CONTEXT=NO;
make -j4;
cd ..;

# build exe
g++ -O2 -g -DNDEBUG -ggdb -Wall -Werror -fPIC libgo_benchmark.cpp -o libgo_benchmark -Ilibgo -Ilibgo/libgo -Ilibgo/libgo/linux -Lbuild -llibgo -lrt -lpthread -ldl;
./libgo_benchmark 1 3000000 16;
./libgo_benchmark 1000 1000 2048;
./libgo_benchmark 30000 100 64;

C++20 Coroutine

clang++ -std=c++2a -O2 -g -ggdb -stdlib=libc++ -fcoroutines-ts -lc++ -lc++abi cxx20_coroutine.cpp -o cxx20_coroutine
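# args: <coroutine count> <switch count per coroutine>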
./cxx20_coroutine 1 3000000
./cxx20_coroutine 1000 1000
./cxx20_coroutine 30000 1000
cl /nologo /O2 /std:c++latest /Zi /MDd /Zc:__cplusplus /EHsc /await cxx20_coroutine.cpp
./cxx20_coroutine.exe 1 3000000
./cxx20_coroutine.exe 1000 1000
./cxx20_coroutine.exe 30000 1000
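
// ===== cxx20_coroutine.cpp (C++20 Coroutine benchmark source) =====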
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <inttypes.h>
#include <stdint.h>
#include <vector>
#include <memory>
#include <iostream>
#include <experimental/coroutine>
#include <chrono>
#define CALC_CLOCK_T std::chrono::system_clock::time_point
#define CALC_CLOCK_NOW() std::chrono::system_clock::now()
#define CALC_MS_CLOCK(x) static_cast<int>(std::chrono::duration_cast<std::chrono::milliseconds>(x).count())
#define CALC_NS_AVG_CLOCK(x, y) static_cast<long long>(std::chrono::duration_cast<std::chrono::nanoseconds>(x).count() / (y ? y : 1))
static std::vector<std::pair<int*, std::experimental::coroutine_handle<> > > g_test_rpc_manager;
struct test_custom_coroutine_data;
struct test_custom_coroutine {
using data_ptr = std::unique_ptr<test_custom_coroutine_data>;
struct promise_type {
data_ptr refer_data;
char fake_cache_miss_[64 - sizeof(test_custom_coroutine_data*)];
promise_type();
static test_custom_coroutine get_return_object_on_allocation_failure();
test_custom_coroutine get_return_object();
std::experimental::suspend_always initial_suspend();
std::experimental::suspend_always final_suspend();
void unhandled_exception();
// required to support co_return
void return_void();
// required to support co_yield
std::experimental::suspend_always yield_value(test_custom_coroutine_data*&);
};
// The interface below supports co_await test_rpc_generator in an intrusive way.
// MSVC currently also allows a non-intrusive implementation, but clang does not.
bool await_ready() noexcept;
void await_resume();
void await_suspend(std::experimental::coroutine_handle<promise_type>);
int resume();
void set_sum_times(int);
bool is_done() const;
test_custom_coroutine_data* data();
private:
test_custom_coroutine(test_custom_coroutine_data*);
test_custom_coroutine_data* data_;
char fake_cache_miss_[64 - sizeof(test_custom_coroutine_data*)];
};
struct test_custom_coroutine_data {
int sum_times;
int yield_times;
std::experimental::coroutine_handle<test_custom_coroutine::promise_type> handle;
};
test_custom_coroutine::promise_type::promise_type() {
refer_data = std::make_unique<test_custom_coroutine_data>();
refer_data->sum_times = 0;
refer_data->yield_times = 0;
}
test_custom_coroutine test_custom_coroutine::promise_type::get_return_object_on_allocation_failure() {
return test_custom_coroutine{ nullptr };
}
test_custom_coroutine test_custom_coroutine::promise_type::get_return_object() {
return test_custom_coroutine{ refer_data.get() };
}
std::experimental::suspend_always test_custom_coroutine::promise_type::initial_suspend() {
refer_data->handle = std::experimental::coroutine_handle<promise_type>::from_promise(*this);
return std::experimental::suspend_always{}; // the STL ships ready-made awaiters; in most cases we can simply use them instead of writing our own
}
std::experimental::suspend_always test_custom_coroutine::promise_type::final_suspend() {
return std::experimental::suspend_always{}; // same as above: an awaiter provided by the STL
}
void test_custom_coroutine::promise_type::unhandled_exception() {
std::terminate();
}
// required to support co_return
void test_custom_coroutine::promise_type::return_void() {
refer_data->handle = nullptr;
}
// required to support co_yield
std::experimental::suspend_always test_custom_coroutine::promise_type::yield_value(test_custom_coroutine_data*& coro_data) {
// executed on every co_yield: refresh the handle so the coroutine can be resumed later
if (nullptr != refer_data) {
refer_data->handle = std::experimental::coroutine_handle<promise_type>::from_promise(*this);
++refer_data->yield_times;
}
coro_data = refer_data.get();
return std::experimental::suspend_always{};
}
// The functions below support co_await test_custom_coroutine in an intrusive way; they are not actually used in the benchmark.
// MSVC currently also allows a non-intrusive implementation, but clang does not.
bool test_custom_coroutine::await_ready() noexcept {
// "ready" means the coroutine handle has already run to completion
return !data_ || !data_->handle || data_->handle.done();
}
void test_custom_coroutine::await_resume() {
// do nothing when benchmark
}
void test_custom_coroutine::await_suspend(std::experimental::coroutine_handle<promise_type>) {
// do nothing when benchmark
// When co_await-ed by an outer module, a complete coroutine task chain would record the caller's
// coroutine_handle into test_custom_coroutine and resume it after return_void. It is left empty here
// to keep the benchmark clean and avoid extra overhead.
}
int test_custom_coroutine::resume() {
if (!await_ready()) {
data_->handle.resume();
return 1;
}
return 0;
}
void test_custom_coroutine::set_sum_times(int times) {
if (data_) {
data_->sum_times = times;
}
}
bool test_custom_coroutine::is_done() const {
return !(data_ && data_->handle);
}
test_custom_coroutine_data* test_custom_coroutine::data() {
return data_;
}
test_custom_coroutine::test_custom_coroutine(test_custom_coroutine_data* d) : data_(d) {}
// the asynchronous coroutine body
test_custom_coroutine coroutine_start_main(test_custom_coroutine_data*& coro_data) {
// create done
// begin to yield
while (coro_data != nullptr && coro_data->yield_times < coro_data->sum_times) {
co_yield coro_data;
}
// finish all yield
co_return;
}
// simulate producing data here
bool coroutine_resume(std::vector<test_custom_coroutine>& in, long long& real_switch_times) {
bool ret = false;
for (auto& co : in) {
real_switch_times += co.resume();
if (!co.is_done()) {
ret = true;
}
}
return ret;
}
int main(int argc, char* argv[]) {
#ifdef __cpp_coroutines
std::cout << "__cpp_coroutines: " << __cpp_coroutines << std::endl;
#endif
puts("###################### C++20 coroutine ###################");
printf("########## Cmd:");
for (int i = 0; i < argc; ++i) {
printf(" %s", argv[i]);
}
puts("");
int switch_count = 100;
int max_coroutine_number = 100000; // number of coroutines
if (argc > 1) {
max_coroutine_number = atoi(argv[1]);
}
if (argc > 2) {
switch_count = atoi(argv[2]);
}
std::vector<test_custom_coroutine> co_arr;
std::vector<test_custom_coroutine_data*> co_data_arr;
co_arr.reserve(static_cast<size_t>(max_coroutine_number));
co_data_arr.resize(static_cast<size_t>(max_coroutine_number), nullptr);
time_t begin_time = time(NULL);
CALC_CLOCK_T begin_clock = CALC_CLOCK_NOW();
// create coroutines
for (int i = 0; i < max_coroutine_number; ++i) {
co_arr.emplace_back(coroutine_start_main(co_data_arr[i]));
co_arr.back().set_sum_times(switch_count);
co_data_arr[i] = co_arr.back().data();
}
time_t end_time = time(NULL);
CALC_CLOCK_T end_clock = CALC_CLOCK_NOW();
printf("create %d coroutine, cost time: %d s, clock time: %d ms, avg: %lld ns\n", max_coroutine_number,
static_cast<int>(end_time - begin_time), CALC_MS_CLOCK(end_clock - begin_clock),
CALC_NS_AVG_CLOCK(end_clock - begin_clock, max_coroutine_number));
begin_time = end_time;
begin_clock = end_clock;
// yield & resume from runner
long long real_switch_times = static_cast<long long>(0);
bool is_continue = true;
while (is_continue) {
is_continue = coroutine_resume(co_arr, real_switch_times);
}
// sub create - resume
real_switch_times -= max_coroutine_number;
end_time = time(NULL);
end_clock = CALC_CLOCK_NOW();
printf("switch %d coroutine contest %lld times, cost time: %d s, clock time: %d ms, avg: %lld ns\n", max_coroutine_number,
real_switch_times, static_cast<int>(end_time - begin_time), CALC_MS_CLOCK(end_clock - begin_clock),
CALC_NS_AVG_CLOCK(end_clock - begin_clock, real_switch_times));
begin_time = end_time;
begin_clock = end_clock;
co_arr.clear();
end_time = time(NULL);
end_clock = CALC_CLOCK_NOW();
printf("remove %d coroutine, cost time: %d s, clock time: %d ms, avg: %lld ns\n", max_coroutine_number,
static_cast<int>(end_time - begin_time), CALC_MS_CLOCK(end_clock - begin_clock),
CALC_NS_AVG_CLOCK(end_clock - begin_clock, max_coroutine_number));
return 0;
}
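
// ===== goroutine_benchmark.go (goroutine benchmark source) =====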
package main
import (
"fmt"
"os"
"strconv"
"time"
)
func runCallback(in, out chan int64) {
for n, ok := <-in; ok; n, ok = <-in {
out <- n
}
}
func runTest(round int, coroutineNum, switchTimes int64) {
fmt.Printf("##### Round: %v\n", round)
start := time.Now()
channelsIn, channelsOut := make([]chan int64, coroutineNum), make([]chan int64, coroutineNum)
for i := int64(0); i < coroutineNum; i++ {
channelsIn[i] = make(chan int64, 1)
channelsOut[i] = make(chan int64, 1)
}
end := time.Now()
fmt.Printf("Create %v goroutines and channels cost %vns, avg %vns\n", coroutineNum, end.Sub(start).Nanoseconds(), end.Sub(start).Nanoseconds()/coroutineNum)
start = time.Now()
for i := int64(0); i < coroutineNum; i++ {
go runCallback(channelsIn[i], channelsOut[i])
}
end = time.Now()
fmt.Printf("Start %v goroutines and channels cost %vns, avg %vns\n", coroutineNum, end.Sub(start).Nanoseconds(), end.Sub(start).Nanoseconds()/coroutineNum)
var sum int64 = 0
start = time.Now()
for i := int64(0); i < switchTimes; i++ {
for j := int64(0); j < coroutineNum; j++ {
channelsIn[j] <- 1
sum += <-channelsOut[j]
}
}
end = time.Now()
fmt.Printf("Switch %v goroutines for %v times cost %vns, avg %vns\n", coroutineNum, sum, end.Sub(start).Nanoseconds(), end.Sub(start).Nanoseconds()/sum)
start = time.Now()
for i := int64(0); i < coroutineNum; i++ {
close(channelsIn[i])
close(channelsOut[i])
}
end = time.Now()
fmt.Printf("Close %v goroutines cost %vns, avg %vns\n", coroutineNum, end.Sub(start).Nanoseconds(), end.Sub(start).Nanoseconds()/coroutineNum)
}
func main() {
var coroutineNum, switchTimes int64 = 30000, 1000
fmt.Printf("### Run: ")
for _, v := range os.Args {
fmt.Printf(" \"%s\"", v)
}
fmt.Printf("\n")
if (len(os.Args)) > 1 {
v, _ := strconv.Atoi(os.Args[1])
coroutineNum = int64(v)
}
if (len(os.Args)) > 2 {
v, _ := strconv.Atoi(os.Args[2])
switchTimes = int64(v)
}
for i := 1; i <= 5; i++ {
runTest(i, coroutineNum, switchTimes)
}
}
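
// ===== libco_benchmark.cpp (libco benchmark source) =====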
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <inttypes.h>
#include <stdint.h>
#include <vector>
#include <memory>
#include <co_routine.h>
#include <chrono>
#define CALC_CLOCK_T std::chrono::system_clock::time_point
#define CALC_CLOCK_NOW() std::chrono::system_clock::now()
#define CALC_MS_CLOCK(x) static_cast<int>(std::chrono::duration_cast<std::chrono::milliseconds>(x).count())
#define CALC_NS_AVG_CLOCK(x, y) static_cast<long long>(std::chrono::duration_cast<std::chrono::nanoseconds>(x).count() / (y ? y : 1))
class coroutine_context;
int switch_count = 100;
int max_coroutine_number = 100000; // number of coroutines
class coroutine_context {
public:
coroutine_context(const stCoRoutineAttr_t* libco_attr, int share_stack) {
callee_ctx_ = NULL;
share_stack_ = share_stack;
co_create(&callee_ctx_, libco_attr, &start_callback, this);
is_in_callback_ = false;
is_finished_ = false;
}
~coroutine_context() {
if (NULL != callee_ctx_) {
co_release(callee_ctx_);
}
}
void resume() {
if (is_in_callback_) {
return;
}
co_resume(callee_ctx_);
is_in_callback_ = false;
}
void yield() {
if(!is_in_callback_) {
return;
}
co_yield(callee_ctx_);
is_in_callback_ = true;
}
static void* start_callback(void* arg) {
coroutine_context* this_coroutine = reinterpret_cast<coroutine_context*>(arg);
this_coroutine->is_in_callback_ = true;
// optionally occupy part of the stack to measure the copy cost of the shared stack
void* stack_buffer = NULL;
if (this_coroutine->share_stack_ > 0) {
stack_buffer = alloca(static_cast<size_t>(this_coroutine->share_stack_));
memset(stack_buffer, 0, static_cast<size_t>(this_coroutine->share_stack_));
memcpy(stack_buffer, this_coroutine, sizeof(coroutine_context));
memcpy(static_cast<char*>(stack_buffer) + static_cast<size_t>(this_coroutine->share_stack_) - sizeof(coroutine_context),
this_coroutine, sizeof(coroutine_context));
}
int count = switch_count; // N switches per coroutine
while (count-- > 0) {
this_coroutine->yield();
}
this_coroutine->is_finished_ = true;
this_coroutine->yield();
return stack_buffer;
}
inline bool is_finished() const { return is_finished_; }
private:
stCoRoutine_t* callee_ctx_;
int share_stack_;
bool is_in_callback_;
bool is_finished_;
};
int main(int argc, char *argv[]) {
puts("###################### ucontext coroutine ###################");
printf("########## Cmd:");
for (int i = 0; i < argc; ++i) {
printf(" %s", argv[i]);
}
puts("");
if (argc > 1) {
max_coroutine_number = atoi(argv[1]);
}
if (argc > 2) {
switch_count = atoi(argv[2]);
}
size_t stack_size = 16 * 1024;
if (argc > 3) {
stack_size = atoi(argv[3]) * 1024;
}
int enable_share_stack = 0;
if (argc > 4) {
enable_share_stack = atoi(argv[4]) * 1024;
}
stCoRoutineAttr_t libco_attr;
libco_attr.stack_size = static_cast<int>(stack_size);
if (0 != enable_share_stack) {
libco_attr.share_stack = co_alloc_sharestack(1, libco_attr.stack_size);
} else {
libco_attr.share_stack = co_alloc_sharestack(max_coroutine_number, libco_attr.stack_size);
}
time_t begin_time = time(NULL);
CALC_CLOCK_T begin_clock = CALC_CLOCK_NOW();
// create coroutines
std::vector<std::unique_ptr<coroutine_context> > co_arr;
co_arr.resize(static_cast<size_t>(max_coroutine_number));
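// create the coroutines in a 64-way strided order, presumably to avoid an artificially high cache hit rate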
for (size_t i = 0; i < 64; ++ i) {
for (size_t j = 0; i + j * 64 < co_arr.size(); ++ j) {
co_arr[i + j * 64].reset(new coroutine_context(&libco_attr, enable_share_stack));
}
}
time_t end_time = time(NULL);
CALC_CLOCK_T end_clock = CALC_CLOCK_NOW();
printf("create %d coroutine, cost time: %d s, clock time: %d ms, avg: %lld ns\n", max_coroutine_number,
static_cast<int>(end_time - begin_time), CALC_MS_CLOCK(end_clock - begin_clock),
CALC_NS_AVG_CLOCK(end_clock - begin_clock, max_coroutine_number));
begin_time = end_time;
begin_clock = end_clock;
// yield & resume from runner
bool continue_flag = true;
long long real_switch_times = static_cast<long long>(0);
while (continue_flag) {
continue_flag = false;
for (int i = 0; i < max_coroutine_number; ++i) {
if (false == co_arr[i]->is_finished()) {
continue_flag = true;
++real_switch_times;
co_arr[i]->resume();
}
}
}
end_time = time(NULL);
end_clock = CALC_CLOCK_NOW();
printf("switch %d coroutine contest %lld times, cost time: %d s, clock time: %d ms, avg: %lld ns\n", max_coroutine_number,
real_switch_times, static_cast<int>(end_time - begin_time), CALC_MS_CLOCK(end_clock - begin_clock),
CALC_NS_AVG_CLOCK(end_clock - begin_clock, real_switch_times));
begin_time = end_time;
begin_clock = end_clock;
co_arr.clear();
end_time = time(NULL);
end_clock = CALC_CLOCK_NOW();
printf("remove %d coroutine, cost time: %d s, clock time: %d ms, avg: %lld ns\n", max_coroutine_number,
static_cast<int>(end_time - begin_time), CALC_MS_CLOCK(end_clock - begin_clock),
CALC_NS_AVG_CLOCK(end_clock - begin_clock, max_coroutine_number));
return 0;
}
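
// ===== libgo_benchmark.cpp (libgo benchmark source) =====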
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <inttypes.h>
#include <stdint.h>
#include <vector>
#include <memory>
#include <libgo/coroutine.h>
#include <chrono>
#define CALC_CLOCK_T std::chrono::system_clock::time_point
#define CALC_CLOCK_NOW() std::chrono::system_clock::now()
#define CALC_MS_CLOCK(x) static_cast<int>(std::chrono::duration_cast<std::chrono::milliseconds>(x).count())
#define CALC_NS_AVG_CLOCK(x, y) static_cast<long long>(std::chrono::duration_cast<std::chrono::nanoseconds>(x).count() / (y ? y : 1))
int switch_count = 100;
int max_coroutine_number = 100000; // number of coroutines
int main(int argc, char *argv[]) {
puts("###################### ucontext coroutine ###################");
printf("########## Cmd:");
for (int i = 0; i < argc; ++i) {
printf(" %s", argv[i]);
}
puts("");
if (argc > 1) {
max_coroutine_number = atoi(argv[1]);
}
if (argc > 2) {
switch_count = atoi(argv[2]);
}
size_t stack_size = 16 * 1024;
if (argc > 3) {
stack_size = atoi(argv[3]) * 1024;
}
time_t begin_time = time(NULL);
CALC_CLOCK_T begin_clock = CALC_CLOCK_NOW();
// create coroutines
int finish_count = 0;
for (int i = 0; i < max_coroutine_number; ++ i) {
go_stack(stack_size) [&finish_count]{
int left_count = switch_count;
while (left_count -- > 0) {
co_yield;
}
++ finish_count;
if (finish_count >= max_coroutine_number) {
co_sched.Stop();
}
};
}
time_t end_time = time(NULL);
CALC_CLOCK_T end_clock = CALC_CLOCK_NOW();
printf("create %d coroutine, cost time: %d s, clock time: %d ms, avg: %lld ns\n", max_coroutine_number,
static_cast<int>(end_time - begin_time), CALC_MS_CLOCK(end_clock - begin_clock),
CALC_NS_AVG_CLOCK(end_clock - begin_clock, max_coroutine_number));
begin_time = end_time;
begin_clock = end_clock;
// yield & resume from runner
co_sched.Start();
long long real_switch_times = max_coroutine_number * static_cast<long long>(switch_count);
end_time = time(NULL);
end_clock = CALC_CLOCK_NOW();
printf("switch %d coroutine contest %lld times, cost time: %d s, clock time: %d ms, avg: %lld ns\n", max_coroutine_number,
real_switch_times, static_cast<int>(end_time - begin_time), CALC_MS_CLOCK(end_clock - begin_clock),
CALC_NS_AVG_CLOCK(end_clock - begin_clock, real_switch_times));
begin_time = end_time;
begin_clock = end_clock;
end_time = time(NULL);
end_clock = CALC_CLOCK_NOW();
printf("remove %d coroutine, cost time: %d s, clock time: %d ms, avg: %lld ns\n", max_coroutine_number,
static_cast<int>(end_time - begin_time), CALC_MS_CLOCK(end_clock - begin_clock),
CALC_NS_AVG_CLOCK(end_clock - begin_clock, max_coroutine_number));
return 0;
}
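
// ===== ucontext_benchmark.cpp (linux ucontext benchmark source) =====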
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <inttypes.h>
#include <stdint.h>
#include <vector>
#include <memory>
extern "C" {
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#include <ucontext.h>
}
#include <chrono>
#define CALC_CLOCK_T std::chrono::system_clock::time_point
#define CALC_CLOCK_NOW() std::chrono::system_clock::now()
#define CALC_MS_CLOCK(x) static_cast<int>(std::chrono::duration_cast<std::chrono::milliseconds>(x).count())
#define CALC_NS_AVG_CLOCK(x, y) static_cast<long long>(std::chrono::duration_cast<std::chrono::nanoseconds>(x).count() / (y ? y : 1))
class coroutine_context;
int switch_count = 100;
thread_local coroutine_context* this_coroutine = nullptr;
int max_coroutine_number = 100000; // number of coroutines
class coroutine_context {
public:
coroutine_context(size_t stack_sz) {
// round the requested size up to a 4 KB page boundary and reserve one extra page for the guard page below
stack_sz = ((stack_sz + 4095) / 4096) * 4096 + 4096;
// conform to POSIX.4 (POSIX.1b-1993, _POSIX_C_SOURCE=199309L)
void *start_ptr =
#if defined(macintosh) || defined(__APPLE__) || defined(__APPLE_CC__)
::mmap(0, stack_sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
#else
::mmap(0, stack_sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
#endif
if (!start_ptr || MAP_FAILED == start_ptr) {
stack_ctx_.sp = NULL;
return;
}
// memset(start_ptr, 0, size_);
::mprotect(start_ptr, 4096, PROT_NONE);
stack_ctx_.size = stack_sz;
stack_ctx_.sp = static_cast<char *>(start_ptr) + stack_ctx_.size; // stack down
// ==============================================================
getcontext(&callee_ctx_);
callee_ctx_.uc_stack.ss_sp = static_cast<char *>(start_ptr) + 4096;
callee_ctx_.uc_stack.ss_size = stack_sz - 4096;
callee_ctx_.uc_link = &caller_ctx_;
makecontext(&callee_ctx_, start_callback, 0);
caller_coroutine_ = NULL;
is_in_callback_ = false;
is_finished_ = false;
}
~coroutine_context() {
if (NULL != stack_ctx_.sp) {
void *start_ptr = static_cast<char *>(stack_ctx_.sp) - stack_ctx_.size;
::munmap(start_ptr, stack_ctx_.size);
}
}
void resume() {
if (is_in_callback_) {
return;
}
is_in_callback_ = true;
caller_coroutine_ = this_coroutine;
this_coroutine = this;
swapcontext(&caller_ctx_, &callee_ctx_);
}
void yield() {
if(!is_in_callback_) {
return;
}
is_in_callback_ = false;
this_coroutine = caller_coroutine_;
caller_coroutine_ = NULL;
swapcontext(&callee_ctx_, &caller_ctx_);
}
static void start_callback() {
this_coroutine->is_finished_ = false;
int count = switch_count; // N switches per coroutine
while (count-- > 0)
this_coroutine->yield();
this_coroutine->is_finished_ = true;
this_coroutine->yield();
}
inline bool is_finished() const { return is_finished_; }
private:
struct stack_context {
size_t size; /** @brief stack size **/
void* sp; /** @brief stack end pointer **/
};
stack_context stack_ctx_;
ucontext_t callee_ctx_;
ucontext_t caller_ctx_;
coroutine_context* caller_coroutine_;
bool is_in_callback_;
bool is_finished_;
};
int main(int argc, char *argv[]) {
puts("###################### ucontext coroutine ###################");
printf("########## Cmd:");
for (int i = 0; i < argc; ++i) {
printf(" %s", argv[i]);
}
puts("");
if (argc > 1) {
max_coroutine_number = atoi(argv[1]);
}
if (argc > 2) {
switch_count = atoi(argv[2]);
}
size_t stack_size = 16 * 1024;
if (argc > 3) {
stack_size = atoi(argv[3]) * 1024;
}
time_t begin_time = time(NULL);
CALC_CLOCK_T begin_clock = CALC_CLOCK_NOW();
// create coroutines
std::vector<std::unique_ptr<coroutine_context> > co_arr;
co_arr.resize(static_cast<size_t>(max_coroutine_number));
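// 64-way strided creation order, same intent as in the libco benchmark (presumably to reduce artificial cache locality)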
for (size_t i = 0; i < 64; ++ i) {
for (size_t j = 0; i + j * 64 < co_arr.size(); ++ j) {
co_arr[i + j * 64].reset(new coroutine_context(stack_size));
}
}
time_t end_time = time(NULL);
CALC_CLOCK_T end_clock = CALC_CLOCK_NOW();
printf("create %d coroutine, cost time: %d s, clock time: %d ms, avg: %lld ns\n", max_coroutine_number,
static_cast<int>(end_time - begin_time), CALC_MS_CLOCK(end_clock - begin_clock),
CALC_NS_AVG_CLOCK(end_clock - begin_clock, max_coroutine_number));
begin_time = end_time;
begin_clock = end_clock;
// yield & resume from runner
bool continue_flag = true;
long long real_switch_times = static_cast<long long>(0);
while (continue_flag) {
continue_flag = false;
for (int i = 0; i < max_coroutine_number; ++i) {
if (false == co_arr[i]->is_finished()) {
continue_flag = true;
++real_switch_times;
co_arr[i]->resume();
}
}
}
end_time = time(NULL);
end_clock = CALC_CLOCK_NOW();
printf("switch %d coroutine contest %lld times, cost time: %d s, clock time: %d ms, avg: %lld ns\n", max_coroutine_number,
real_switch_times, static_cast<int>(end_time - begin_time), CALC_MS_CLOCK(end_clock - begin_clock),
CALC_NS_AVG_CLOCK(end_clock - begin_clock, real_switch_times));
begin_time = end_time;
begin_clock = end_clock;
co_arr.clear();
end_time = time(NULL);
end_clock = CALC_CLOCK_NOW();
printf("remove %d coroutine, cost time: %d s, clock time: %d ms, avg: %lld ns\n", max_coroutine_number,
static_cast<int>(end_time - begin_time), CALC_MS_CLOCK(end_clock - begin_clock),
CALC_NS_AVG_CLOCK(end_clock - begin_clock, max_coroutine_number));
return 0;
}
@tearshark

Interested in benchmarking librf as well?
https://github.com/tearshark/librf
Just run the resumable_main_resumable() function to get the results.

@owent (Author) commented Mar 10, 2020

@tearshark I see that your library covers quite a lot; I would suggest adding usage documentation and recommended patterns. Otherwise, benchmarking a bare co_yield there would be no different from benchmarking raw C++20 coroutines directly, right?

@owent (Author) commented May 19, 2020

@tearshark Hi, I ran your stress test and the numbers look odd: the switch cost with a single coroutine is higher than with more coroutines, which does not seem right. Could you take a look when you have time?
image.png

The environment is CentOS 7.2 with Clang 10.0.0.
Also, the latest GCC now supports C++20 coroutines. If you could add GCC support, I can run a GCC round as well. In my own tests of libcopp's C++20 coroutine integration, GCC performs far better than Clang.

@tearshark commented May 26, 2020 via email

@yystju commented Jul 11, 2020

I am currently using boost::fiber. I wonder if you could add fiber to the comparison when you have time. Boost's own benchmark numbers are based on tcmalloc, and they never really explain what the test scenario is. It would be great if you could pull fiber into the comparison. Many thanks!

Besides switch time and throughput, I personally think memory footprint is also an important comparison, especially for stackful approaches (and really for stackless too). With fiber I have found that, especially on Windows, the stack needs to be at least 4 MB to run fairly ordinary logic such as graph traversal... with a smaller stack it simply crashes. So resource usage should also be an important metric.

Also, grouping the tested libraries into stackful and stackless would make it easier for readers of the report to pick a suitable implementation.

These are just my personal opinions, for reference only. Building this kind of baseline is really not easy. Thanks again!

@owent (Author) commented Jul 12, 2020

@yystju Thanks for the feedback; these are good suggestions.

C++20 Coroutine is stackless; all the others are stackful. I will annotate that later.

I will find some time to rerun the benchmarks with fiber included. The latest libcopp already performs better than the numbers posted above, mainly thanks to some optimizations for code cache hit rate. The latest libcopp also supports a fiber mode; in my measurements, the fiber mode runs at roughly 70% of the fcontext-based mode.

As for memory footprint, it varies a lot by use case and is hard to cover exhaustively. Some usages simply do not need a large stack; in our game servers I usually configure 512 KB, and we try to avoid putting large objects on the stack. Other scenarios may only use 64 KB or even 16 KB. Also, a heavily used stack is only mapped into physical memory when it is actually touched, and the page faults at that point are very expensive, whereas merely enlarging the reserved space has almost no impact on switch or creation performance. What I provided above are just tests of some typical scenarios; things like memory usage can only be estimated against a concrete project's workload.

The malloc implementation has little impact on switch cost, and on GCC its impact on coroutine creation is also small. If you are interested, see the results from my other test run for the C++20 coroutine integration: 《libcopp接入C++20 Coroutine和一些过渡期的设计 - 压力测试》 (libcopp's C++20 Coroutine integration and some transitional designs - stress test).
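
For reference, a Boost.Fiber create/switch loop in the same shape as the benchmarks above could presumably look roughly like the sketch below; it is not measured here and the parameters are placeholders:

// Unmeasured sketch only; build roughly with: g++ -std=c++14 -O2 fiber_sketch.cpp -lboost_fiber -lboost_context -lpthread
#include <boost/fiber/all.hpp>
#include <cstdio>
#include <vector>

int main() {
    const int fiber_count = 1000;    // placeholder, mirroring the "1000 coroutines" column
    const int switch_count = 1000;   // placeholder switch rounds per fiber
    std::vector<boost::fibers::fiber> fibers;
    fibers.reserve(fiber_count);
    for (int i = 0; i < fiber_count; ++i) {
        fibers.emplace_back([=]() {
            for (int n = 0; n < switch_count; ++n) {
                boost::this_fiber::yield();  // cooperative switch back to the round-robin scheduler
            }
        });
    }
    for (auto& f : fibers) {
        f.join();  // drive every fiber to completion
    }
    std::puts("done");
    return 0;
}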
