Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save sandeepkumar-skb/03c2f7582fda454889e5f010e7e06e97 to your computer and use it in GitHub Desktop.
Save sandeepkumar-skb/03c2f7582fda454889e5f010e7e06e97 to your computer and use it in GitHub Desktop.
#include <chrono>
#include <iostream>
#include <vector>
#include <thread>
__global__ void do_nothing(int time_us, int clock_rate) {
clock_t start = clock64();
clock_t end;
for (;;) {
end = clock64();
// 1.12 is just an empirical correction for straggler threads, etc.
int elapsed_time_us =
static_cast<int>(static_cast<float>(end - start) / clock_rate * 1000000 * 1.12);
if (elapsed_time_us > time_us) {
break;
}
}
}
int main() {
using namespace std::chrono;
int num_calls = 10000;
int num_threads = 10;
int wait_us = 35;
int kernel_us = 5;
cudaStream_t default_stream = 0;
cudaDeviceProp props;
cudaGetDeviceProperties(&props, 0);
// clock is in kHz
int clock_rate = props.clockRate * 1000;
// Titan RTX has 4608 CUDA cores and 72 SMs. 4608 / 72 = 64
dim3 grid(72, 1, 1);
dim3 block(64, 1, 1);
auto f1 = [grid, block, kernel_us, wait_us, clock_rate](cudaStream_t stream) {
do_nothing<<<grid, block, 0, stream>>>(kernel_us, clock_rate);
std::this_thread::sleep_for(microseconds(wait_us));
};
auto t1 = high_resolution_clock::now();
for (int i = 0; i < num_calls; ++i) {
f1(default_stream);
}
cudaDeviceSynchronize();
auto t2 = high_resolution_clock::now();
auto f2 =
[f1, num_calls, num_threads]() {
cudaStream_t local_stream;
cudaStreamCreateWithFlags(&local_stream, cudaStreamNonBlocking);
for (int i = 0; i < num_calls / num_threads; ++i) {
f1(local_stream);
}
cudaStreamSynchronize(local_stream);
cudaStreamDestroy(local_stream);
};
std::vector<std::thread> threads;
auto t3 = high_resolution_clock::now();
for (int i = 0; i < num_threads; ++i) {
threads.push_back(std::move(std::thread(f2)));
}
for (int i = 0; i < num_threads; ++i) {
if (threads[i].joinable()) {
threads[i].join();
}
}
auto t4 = high_resolution_clock::now();
auto single_dur = duration_cast<duration<double, std::micro>>(t2 - t1).count();
auto multi_dur = duration_cast<duration<double, std::micro>>(t4 - t3).count();
std::cout << "Single-threaded time: " << single_dur << " us" << std::endl;
std::cout << "Multi-threaded time: " << multi_dur << " us" << std::endl;
std::cout << "Multi-threaded speed-up: " << (1.0 * single_dur / multi_dur) << "x" << std::endl;
return 0;
}
@sandeepkumar-skb
Copy link
Author

Single Stream:
image
Multi Stream:
image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment