Skip to content

Instantly share code, notes, and snippets.

@ShigekiKarita
Last active December 26, 2016 17:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ShigekiKarita/61b9ef537f9f5c4d3a1b28cda3ef4195 to your computer and use it in GitHub Desktop.
CUDA8のマルチストリーム/スレッド処理にC++11のfuture/threadを使う ref: http://qiita.com/ShigekiKarita/items/a3891f13f3acc3575c08
$ nvcc ./stream_test.cu -o ./stream_legacy
$ nvvp ./stream_legacy
#include <array>
#include <future>
#include "cuda_check.hpp"
const int N = 1 << 20;
// Dummy workload: fills x[0..n) with sqrt(pi^i).
// Uses a grid-stride loop, so any <<<blocks, threads>>> configuration
// covers the whole range (including a single-block debug launch).
// Safe to call with x == nullptr when n == 0: the loop body never runs.
__global__ void kernel(float *x, int n) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
        // Single-precision intrinsics: the original sqrt(pow(3.14159, i))
        // used double literals/overloads, silently promoting the whole
        // expression to double on every thread.
        x[i] = sqrtf(powf(3.14159f, (float)i));
    }
}
// Launches one worker kernel per std::async task (std::launch::async), each
// on the legacy default stream, interleaved with dummy launches from the
// main thread — the "legacy stream" profile target for nvvp.
int main() {
    const int num_streams = 8;
    std::future<cudaError_t> fs[num_streams];
    float* data[num_streams] = {};  // one device buffer per worker

    for (int i = 0; i < num_streams; i++) {
        // Capture `data` by reference: the original [=] mutable capture
        // copied the whole pointer array, so cudaMalloc wrote into the
        // lambda's private copy — main's data[] stayed unset and every
        // allocation leaked.
        fs[i] = std::async(std::launch::async, [&data, i]() {
            CUDA_CHECK(cudaMalloc(&data[i], N * sizeof(float)));
            // launch one worker kernel per task (legacy default stream)
            kernel<<<1, 64, 0>>>(data[i], N);
            CUDA_CHECK(cudaPeekAtLastError());
            return cudaSuccess;
        });
        // launch a dummy kernel on the default stream
        // (nullptr/0 is safe: the kernel's loop never executes for n == 0)
        kernel<<<1, 1>>>(0, 0);
        CUDA_CHECK(cudaPeekAtLastError());
    }
    for (auto& f : fs) {
        CUDA_CHECK(f.get());
    }
    // Drain all outstanding kernels before releasing their buffers.
    CUDA_CHECK(cudaDeviceSynchronize());
    for (auto* p : data) {
        if (p) CUDA_CHECK(cudaFree(p));
    }
    CUDA_CHECK(cudaDeviceReset());
}
// Deferred variant: each worker lambda is built with std::launch::deferred,
// so it does NOT run inside the setup loop — it runs later, on the thread
// that calls f.get(). Each worker gets its own non-default stream.
int main() {
    const int num_streams = 8;
    cudaStream_t streams[num_streams];
    std::future<cudaError_t> fs[num_streams];
    float* data[num_streams] = {};

    for (int i = 0; i < num_streams; i++) {
        // This lambda is deferred: it executes only when f.get() is called
        // below, not inside this for-loop.
        // Capture `streams`/`data` by reference: the original [=] mutable
        // capture copied both arrays, so cudaStreamCreate and cudaMalloc
        // wrote into private copies — main's arrays stayed uninitialized
        // and every stream/buffer leaked.
        fs[i] = std::async(std::launch::deferred, [&streams, &data, i]() {
            CUDA_CHECK(cudaStreamCreate(&streams[i]));
            CUDA_CHECK(cudaMalloc(&data[i], N * sizeof(float)));
            kernel<<<1, 64, 0, streams[i]>>>(data[i], N);
            CUDA_CHECK(cudaPeekAtLastError());
            return cudaSuccess;
        });
        // This dummy launch runs here, before any deferred lambda above.
        // (nullptr/0 is safe: the kernel's loop never executes for n == 0)
        kernel<<<1, 1>>>(0, 0);
        CUDA_CHECK(cudaPeekAtLastError());
    }
    for (auto& f : fs) {
        // The deferred lambda finally executes on this get() call.
        CUDA_CHECK(f.get());
    }
    // Drain all streams, then release per-stream resources.
    CUDA_CHECK(cudaDeviceSynchronize());
    for (int i = 0; i < num_streams; i++) {
        CUDA_CHECK(cudaStreamDestroy(streams[i]));
        if (data[i]) CUDA_CHECK(cudaFree(data[i]));
    }
    CUDA_CHECK(cudaDeviceReset());
}
#include <future>
#include <thread>
#include <iostream>
#include "cuda_check.hpp"
const int N = 1 << 20;
// Dummy workload: fills x[0..n) with sqrt(pi^i).
// Grid-stride loop: correct for any <<<blocks, threads>>> configuration.
__global__ void kernel(float *x, int n) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
        // Single-precision intrinsics: the original sqrt(pow(3.14159, i))
        // used double literals/overloads, forcing double math per thread.
        x[i] = sqrtf(powf(3.14159f, (float)i));
    }
}
// Allocates a scratch buffer, runs one worker kernel on the calling thread's
// default stream, waits for completion, and releases the buffer.
// Returns "ok" on success; CUDA_CHECK aborts the process on any CUDA error.
std::string launch_kernel() {
    float *data = nullptr;
    CUDA_CHECK(cudaMalloc(&data, N * sizeof(float)));
    kernel<<<1, 64>>>(data, N);
    CUDA_CHECK(cudaPeekAtLastError());
    // Block until the kernel finishes so the buffer can be freed safely.
    CUDA_CHECK(cudaStreamSynchronize(0));
    // The original leaked one N*sizeof(float) buffer per call (8 per run).
    CUDA_CHECK(cudaFree(data));
    return "ok";
}
// Thread variant: one detached std::thread per future. Each thread runs
// launch_kernel via a std::packaged_task, delivering its result (or any
// exception it threw) back through the task's future.
int main() {
    const int num_threads = 8;
    std::future<std::string> fs[num_threads];

    for (auto& f : fs) {
        try {
            std::packaged_task<std::string()> task(launch_kernel);
            f = task.get_future();
            // Detaching is safe here: the f.get() loop below blocks until
            // each task has produced its result.
            std::thread(std::move(task)).detach();
        } catch (const std::exception& e) {
            std::cerr << "Error creating thread: " << e.what() << std::endl;
        }
    }
    for (auto& f : fs) {
        try {
            // get() waits for the worker and rethrows any exception the
            // task raised on its thread.
            std::cout << f.get() << std::endl;
        } catch (const std::exception& e) {
            std::cerr << "Error joining thread: " << e.what() << std::endl;
        }
    }
    // Checked for consistency with every other CUDA call in this file
    // (the original ignored cudaDeviceReset's return value).
    CUDA_CHECK(cudaDeviceReset());
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment