Skip to content

Instantly share code, notes, and snippets.

@FinalTheory
Last active December 4, 2018 07:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save FinalTheory/3a116245ccdfbd198b29d42fa6467775 to your computer and use it in GitHub Desktop.
Save FinalTheory/3a116245ccdfbd198b29d42fa6467775 to your computer and use it in GitHub Desktop.
Minimal framework internal cost demo
// g++ example.cpp -O2 -std=c++11 -o example
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
// Counting completion signal built on a mutex and a condition variable.
// Waiters are released once complete() has been called exactly `num_tasks`
// times since construction or since the last reset().
class completion {
public:
    // Expects a single complete() call.
    completion() = default;

    // Expects `num_tasks` complete() calls.
    explicit completion(int num_tasks) : _num_tasks(num_tasks) {}

    // Blocks until the completion count equals the expected task count.
    void wait() {
        std::unique_lock<std::mutex> lock(_mtx);
        _cv.wait(lock, [this]() { return _completed == _num_tasks; });
    }

    // Timed wait. Returns true if the completion count equals the expected
    // task count when the wait ends, false if the timeout `t` expired first.
    template<typename Rep, typename Period>
    bool wait(const std::chrono::duration<Rep, Period>& t) {
        std::unique_lock<std::mutex> lock(_mtx);
        return _cv.wait_for(lock, t, [this]() { return _completed == _num_tasks; });
    }

    // Records one finished task and wakes every waiter so it can re-check.
    void complete() {
        {
            std::lock_guard<std::mutex> lock(_mtx);
            ++_completed;
        }
        // Notify after releasing the lock so woken waiters do not immediately
        // block on the mutex again.
        _cv.notify_all();
    }

    // Clears the completion count so the object can be reused.
    // The counter is read under _mtx by wait() and written under _mtx by
    // complete(), so it must be written under the same mutex here as well;
    // an unlocked write would race with concurrent waiters/completers.
    void reset() {
        std::lock_guard<std::mutex> lock(_mtx);
        _completed = 0;
    }

private:
    std::mutex _mtx;               // guards _completed
    std::condition_variable _cv;   // signalled by complete()
    const int _num_tasks{1};       // number of complete() calls to wait for
    int _completed{0};             // number of complete() calls seen so far
};
// Busy-polling counterpart of `completion`: waiters spin on an atomic counter
// instead of sleeping on a condition variable. (The class name is kept as in
// the original file; the implementation is a polling loop.)
class completion_pooling {
public:
    // Expects a single complete() call.
    completion_pooling() = default;

    // Expects `num_tasks` complete() calls.
    explicit completion_pooling(int num_tasks) : _num_tasks(num_tasks) {}

    // Spins until the completion count equals the expected task count.
    void wait() {
        while (_completed != _num_tasks) {}
    }

    // Spins until the completion count equals the expected task count or the
    // timeout `t` has elapsed. Returns true on completion, false on timeout.
    template<typename Rep, typename Period>
    bool wait(const std::chrono::duration<Rep, Period>& t) {
        // steady_clock is monotonic; high_resolution_clock may be an alias of
        // system_clock, whose adjustments would distort the elapsed time.
        using clock = std::chrono::steady_clock;
        const clock::time_point start = clock::now();
        while (_completed != _num_tasks) {
            if (clock::now() - start >= t) { return false; }
        }
        return true;
    }

    // Records one finished task (atomic increment).
    void complete() {
        ++_completed;
    }

    // Clears the completion count so the object can be reused.
    void reset() { _completed = 0; }

private:
    const int _num_tasks{1};         // number of complete() calls to wait for
    std::atomic_int _completed{0};   // number of complete() calls seen so far
};
// Time-point type produced by std::chrono::high_resolution_clock::now().
using wall_clock_t = std::chrono::high_resolution_clock::time_point;
// Measures the time spent OUTSIDE this function -- i.e. in the surrounding
// worker() code between two consecutive calls on the same thread -- while
// excluding the time spent inside this function itself.
//
// Both parameters are unused. On the first call in a thread the reference
// timestamp is initialised and an exit handler is registered that prints the
// accumulated total (in milliseconds) to stderr.
void mock_simulator(const std::string&, void*) {
    static thread_local wall_clock_t wall_clk;            // timestamp taken when the previous call returned
    static thread_local bool wall_clk_inited{false};
    static thread_local double total_time{0.};            // accumulated outside-time, in seconds
    if (!wall_clk_inited) {
        wall_clk_inited = true;
        wall_clk = std::chrono::high_resolution_clock::now();
        // std::atexit returns non-zero when the handler could not be
        // registered; report that instead of silently losing the result.
        const int rc = std::atexit([]() {
            std::fprintf(stderr, "total = %.3f ms\n", total_time * 1e3);
        });
        if (rc != 0) {
            std::fprintf(stderr, "warning: failed to register exit handler; total will not be printed\n");
        }
    }
    // Time elapsed since the previous call returned (about zero on the first call).
    std::chrono::duration<double> delta = std::chrono::high_resolution_clock::now() - wall_clk;
    total_time += delta.count();
    auto comp = std::make_shared<completion>();
    // Original author's observation: commenting out the wait below reduces the
    // printed total by a factor of 3-5. In other words, does waiting here make
    // the work inside worker() more expensive?
    // Nothing in this function calls comp->complete().
    comp->wait(std::chrono::milliseconds(100));
    // e.g. total = 0.372 ms vs total = 0.100 ms
    // Restart the measurement only now, so the wait above is not counted.
    wall_clk = std::chrono::high_resolution_clock::now();
}
// In other words, the accumulated total is the time spent in worker()'s own
// context and does not include the time spent inside mock_simulator().
// This is also the basic principle by which the simulator measures the
// framework's overhead.
void worker() {
    std::vector<std::uint32_t> params;
    // params.reserve(32);
    // Append 32 pseudo-random values one at a time.
    for (int remaining = 32; remaining > 0; --remaining) {
        params.push_back(rand());
    }
    mock_simulator("kernel_name", params.data());
}
// Runs worker() 100 times in a row.
void do_test() {
    int remaining = 100;
    while (remaining-- > 0) {
        worker();
    }
}
// Program entry point: runs the experiment once and exits successfully.
int main() {
    do_test();
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment