Last active
December 4, 2018 07:16
-
-
Save FinalTheory/3a116245ccdfbd198b29d42fa6467775 to your computer and use it in GitHub Desktop.
Minimal framework internal cost demo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// g++ example.cpp -O2 -std=c++11 -o example
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
// Blocking completion counter built on a mutex and a condition variable.
//
// An instance is created for a fixed number of tasks (1 by default). Each
// finished task calls complete(); waiters are released once at least that
// many completions have been recorded.
class completion {
public:
    completion() = default;
    explicit completion(int num_tasks) : _num_tasks(num_tasks) {}

    // Blocks until the required number of completions has been recorded.
    void wait() {
        std::unique_lock<std::mutex> lock(_mtx);
        _cv.wait(lock, [this]() { return done(); });
    }

    // Blocks for at most `t`.
    // Returns true if the required number of completions was reached,
    // false if the timeout expired first.
    template<typename Rep, typename Period>
    bool wait(const std::chrono::duration<Rep, Period>& t) {
        std::unique_lock<std::mutex> lock(_mtx);
        return _cv.wait_for(lock, t, [this]() { return done(); });
    }

    // Records one finished task and wakes every waiter so it can re-check.
    void complete() {
        {
            std::lock_guard<std::mutex> lock(_mtx);
            ++_completed;
        }
        // Notify after releasing the mutex so woken waiters do not
        // immediately block on it again.
        _cv.notify_all();
    }

    // Clears the completion count so the object can be reused.
    // Takes the mutex: _completed is read under it by the wait predicates,
    // so an unlocked write here would be a data race.
    void reset() {
        std::lock_guard<std::mutex> lock(_mtx);
        _completed = 0;
    }

private:
    // Must be called with _mtx held. Uses >= rather than == so that a
    // surplus complete() call cannot leave waiters blocked forever.
    bool done() const { return _completed >= _num_tasks; }

    std::mutex _mtx;
    std::condition_variable _cv;
    const int _num_tasks{1};
    int _completed{0};
};
// Busy-waiting (polling) variant of the completion counter.
//
// Completions are counted in an atomic integer; waiters spin on it instead
// of sleeping on a condition variable.
class completion_pooling {
public:
    completion_pooling() = default;
    explicit completion_pooling(int num_tasks) : _num_tasks(num_tasks) {}

    // Spins until the required number of completions has been recorded.
    void wait() {
        while (!done()) {}
    }

    // Spins for at most `t`.
    // Returns true if the required number of completions was reached,
    // false if the timeout expired first.
    template<typename Rep, typename Period>
    bool wait(const std::chrono::duration<Rep, Period>& t) {
        // steady_clock is monotonic; high_resolution_clock may alias the
        // system clock, whose adjustments would distort the timeout.
        const auto start = std::chrono::steady_clock::now();
        while (!done()) {
            if (std::chrono::steady_clock::now() - start >= t) { return false; }
        }
        return true;
    }

    // Records one finished task.
    void complete() {
        ++_completed;
    }

    // Clears the completion count so the object can be reused.
    void reset() { _completed = 0; }

private:
    // Uses >= rather than == so that a surplus complete() call cannot make
    // waiters spin forever.
    bool done() const { return _completed.load() >= _num_tasks; }

    const int _num_tasks{1};
    std::atomic_int _completed{0};
};
// Time-point type produced by the high-resolution clock used for the measurements.
using wall_clock_t = std::chrono::high_resolution_clock::time_point;
// 这个函数的目的是统计发生在自己外面的那个worker()函数的开销,但是不统计自身的开销 | |
void mock_simulator(const std::string&, void*) { | |
static thread_local wall_clock_t wall_clk; | |
static thread_local bool wall_clk_inited{false}; | |
static thread_local double total_time{0.}; | |
if (!wall_clk_inited) { | |
wall_clk_inited = true; | |
wall_clk = std::chrono::high_resolution_clock::now(); | |
std::atexit([]() { | |
fprintf(stderr, "total = %.3f ms\n", total_time * 1e3); | |
}); | |
} | |
std::chrono::duration<double> delta = std::chrono::high_resolution_clock::now() - wall_clk; | |
total_time += delta.count(); | |
auto comp = std::make_shared<completion>(); | |
// 注释掉下面这一行,打印出来的总耗时减少3~5倍 | |
// 也就是说,由于这里的等待,导致worker()内部的开销变大了? | |
comp->wait(std::chrono::milliseconds(100)); | |
// e.g. total = 0.372 ms vs total = 0.100 ms | |
wall_clk = std::chrono::high_resolution_clock::now(); | |
} | |
// The reported total is therefore the time spent in worker()'s own context,
// excluding the time spent inside mock_simulator(). This is the basic
// principle by which the simulator accounts for framework overhead.
//
// Builds a vector of 32 pseudo-random parameters and hands a pointer to its
// storage to mock_simulator().
void worker() {
    std::vector<std::uint32_t> params;
    // params.reserve(32);
    for (int i = 0; i < 32; i++) { params.push_back(static_cast<std::uint32_t>(std::rand())); }
    // The original argument had been corrupted to "¶ms[0]" (an HTML-entity
    // mangling of "&params[0]"); data() yields the same pointer.
    mock_simulator("kernel_name", params.data());
}
void do_test() { | |
for (int i = 0; i < 100; i++) { worker(); } | |
} | |
// Entry point: runs the benchmark once. The accumulated total is printed by
// the atexit handler registered inside mock_simulator().
int main() {
    do_test();
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment