Multi-Streaming Experiments
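Three small CUDA programs comparing ways of issuing work to the GPU: launching the same kernel from multiple host threads, launching it repeatedly on the default stream, and launching it on separate CUDA streams. Each variant times the launches with std::chrono and prints the elapsed time per iteration.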
// Experiment 1: launch the kernel from multiple host threads, all on the default stream.
#include <stdio.h>
#include <array>
#include <chrono>
#include <iostream>
#include <thread>

const int N = 1 << 20;

// Worker kernel: each thread strides over the array and writes a computed value.
__global__ void kernel(float *x, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
        x[i] = sqrt(pow(3.14159, i));
    }
}

// Allocate a buffer, launch the kernel on the default stream, and wait for it to finish.
void launch_kernel()
{
    float *data;
    cudaMalloc(&data, N * sizeof(float));
    kernel<<<1, 64>>>(data, N);
    cudaStreamSynchronize(0);
}

int main()
{
    const int num_threads = 8;
    std::array<std::thread, num_threads> workers;
    std::chrono::high_resolution_clock::time_point gpu_start;
    std::chrono::high_resolution_clock::time_point gpu_end;
    std::chrono::duration<double> gpu_span;
    int count = 100;
    while (count > 0) {
        gpu_start = std::chrono::high_resolution_clock::now();
        // Launch one kernel from each host thread, then wait for all of them.
        for (int i = 0; i < num_threads; i++) {
            workers[i] = std::thread(launch_kernel);
        }
        for (int i = 0; i < num_threads; i++) {
            workers[i].join();
        }
        cudaDeviceReset();
        gpu_end = std::chrono::high_resolution_clock::now();
        gpu_span = std::chrono::duration_cast<std::chrono::duration<double>>(gpu_end - gpu_start);
        std::cout << "gpu time: " << gpu_span.count() * 1000 << " ms" << std::endl;
        --count;
    }
    return 0;
}
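In this multi-threaded variant every host thread launches into the default stream. Unless the program is compiled with nvcc's --default-stream per-thread option, those launches all serialize on the single legacy default stream. A minimal sketch of an alternative (not part of the original gist) that gives each host thread its own non-blocking stream:

void launch_kernel_on_own_stream()
{
    float *data;
    cudaStream_t stream;
    // cudaStreamNonBlocking keeps this stream from synchronizing with the legacy default stream.
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cudaMalloc(&data, N * sizeof(float));
    kernel<<<1, 64, 0, stream>>>(data, N);
    cudaStreamSynchronize(stream);
    cudaFree(data);
    cudaStreamDestroy(stream);
}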
// Experiment 2: launch all kernels sequentially on the default stream (no extra streams).
#include <chrono>
#include <iostream>

const int N = 1 << 20;

// Worker kernel: each thread strides over the array and writes a computed value.
__global__ void kernel(float *x, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
        x[i] = sqrt(pow(3.14159, i));
    }
}

int main()
{
    const int num_streams = 8;
    cudaStream_t streams[num_streams];
    float *data[num_streams];
    std::chrono::high_resolution_clock::time_point gpu_start;
    std::chrono::high_resolution_clock::time_point gpu_end;
    std::chrono::duration<double> gpu_span;
    int count = 25;
    while (count > 0) {
        gpu_start = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < num_streams; i++) {
            //cudaStreamCreate(&streams[i]);
            cudaMalloc(&data[i], N * sizeof(float));
            // launch one worker kernel per stream
            //kernel<<<1, 64, 0, streams[i]>>>(data[i], N);
            // Baseline: every kernel goes to the default stream, so they run back to back.
            kernel<<<1, 64>>>(data[i], N);
            // launch a dummy kernel on the default stream
            //kernel<<<1, 1>>>(0, 0);
        }
        cudaDeviceReset();
        gpu_end = std::chrono::high_resolution_clock::now();
        gpu_span = std::chrono::duration_cast<std::chrono::duration<double>>(gpu_end - gpu_start);
        std::cout << "gpu time: " << gpu_span.count() * 1000 << " ms" << std::endl;
        --count;
    }
    return 0;
}
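Kernel launches are asynchronous with respect to the host, so the end timestamp here can be taken before the kernels themselves have finished, depending on what cudaDeviceReset() waits for. One way to make the measurement unambiguous (an addition, not in the original gist) is to synchronize explicitly before reading the clock:

        cudaDeviceSynchronize();  // wait for all queued kernels to complete
        gpu_end = std::chrono::high_resolution_clock::now();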
// Experiment 3: launch one kernel per explicitly created CUDA stream.
#include <chrono>
#include <iostream>

const int N = 1 << 20;

// Worker kernel: each thread strides over the array and writes a computed value.
__global__ void kernel(float *x, int n)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
        x[i] = sqrt(pow(3.14159, i));
    }
}

int main()
{
    const int num_streams = 8;
    cudaStream_t streams[num_streams];
    float *data[num_streams];
    std::chrono::high_resolution_clock::time_point gpu_start;
    std::chrono::high_resolution_clock::time_point gpu_end;
    std::chrono::duration<double> gpu_span;
    int count = 25;
    while (count > 0) {
        gpu_start = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < num_streams; i++) {
            cudaStreamCreate(&streams[i]);
            cudaMalloc(&data[i], N * sizeof(float));
            // launch one worker kernel per stream
            kernel<<<1, 64, 0, streams[i]>>>(data[i], N);
            //kernel<<<1, 64 >>>(data[i], N);
            // launch a dummy kernel on the default stream
            //kernel<<<1, 1>>>(0, 0);
        }
        cudaDeviceReset();
        gpu_end = std::chrono::high_resolution_clock::now();
        gpu_span = std::chrono::duration_cast<std::chrono::duration<double>>(gpu_end - gpu_start);
        std::cout << "gpu time: " << gpu_span.count() * 1000 << " ms" << std::endl;
        --count;
    }
    return 0;
}
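The streams created in the loop are never destroyed; cudaDeviceReset() tears everything down instead. A sketch of explicit cleanup (not part of the original gist) that could replace the reset inside the loop:

        for (int i = 0; i < num_streams; i++) {
            cudaStreamSynchronize(streams[i]);  // wait for the kernel queued on this stream
            cudaFree(data[i]);
            cudaStreamDestroy(streams[i]);
        }

Whether the per-stream kernels actually overlap on the device can be checked on a profiler timeline, for example with nvprof or Nsight Systems.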