Skip to content

Instantly share code, notes, and snippets.

@fwyzard
Last active November 23, 2021 14:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fwyzard/1aac6214c8303b4a74abcf170c61b7b9 to your computer and use it in GitHub Desktop.
Save fwyzard/1aac6214c8303b4a74abcf170c61b7b9 to your computer and use it in GitHub Desktop.
Test program to check what CUDA operations can be performed on a device different than the current one
#include <cuda_runtime.h>
#include "cudaCheck.h"
// Test program: exercises CUDA runtime calls on resources (memory, stream, event)
// created while the first device is current, first with that same device current
// and then with the second device current. Every call is wrapped in CUDA_CHECK,
// so any call that is rejected is reported through that macro.
//
// Requires at least two CUDA devices: cudaSetDevice(1) is called unconditionally.
int main(void) {
  // 3D buffer layout
  constexpr size_t pitch = 32;
  constexpr size_t width = 20;
  static_assert(width <= pitch, "The 3D buffer `width` cannot be larger than the `pitch`.");
  constexpr size_t height = 8;
  constexpr size_t slices = 4;
  const auto extent = make_cudaExtent(width, height, slices);

  // linear buffer layout: the same bytes viewed as a flat array
  constexpr size_t size = pitch * height * slices;

  // page-locked host buffer, used as the host side of every copy below;
  // allocated before any cudaSetDevice call, i.e. with the default device current
  char* host_buffer = nullptr;
  CUDA_CHECK(cudaMallocHost(&host_buffer, size));

  // device buffers: one from cudaMalloc, one from the stream-ordered allocator
  char* buffer = nullptr;
  char* async_buffer = nullptr;

  // CUDA streams and events
  cudaStream_t stream;
  cudaEvent_t event;

  // ===========================================================================
  // select the first device
  CUDA_CHECK(cudaSetDevice(0));

  // create a CUDA stream and a CUDA event on the first device
  CUDA_CHECK(cudaStreamCreate(&stream));
  CUDA_CHECK(cudaEventCreate(&event));

  // allocate memory on the first device
  CUDA_CHECK(cudaMalloc(&buffer, size));
  CUDA_CHECK(cudaMallocAsync(&async_buffer, size, stream));

  // cudaMemset variants need to be called with the correct device
  CUDA_CHECK(cudaMemset(buffer, 0x00, size));
  CUDA_CHECK(cudaMemset2D(buffer, pitch, 0x42, width, height * slices));
  CUDA_CHECK(cudaMemset3D(make_cudaPitchedPtr(buffer, pitch, width, height), 0x99, extent));

  CUDA_CHECK(cudaMemset(async_buffer, 0x00, size));
  CUDA_CHECK(cudaMemset2D(async_buffer, pitch, 0x42, width, height * slices));
  CUDA_CHECK(cudaMemset3D(make_cudaPitchedPtr(async_buffer, pitch, width, height), 0x99, extent));

  // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device
  // see https://github.com/fwyzard/nvidia_bug_3446335
  CUDA_CHECK(cudaMemcpy(async_buffer, host_buffer, size, cudaMemcpyDefault));
  CUDA_CHECK(cudaMemcpy(host_buffer, async_buffer, size, cudaMemcpyDefault));

  // cudaMemcpyAsync variants on cudaMallocAsync'ed memory need to be called with the correct device
  // see https://github.com/fwyzard/nvidia_bug_3446335
  CUDA_CHECK(cudaMemcpyAsync(async_buffer, host_buffer, size, cudaMemcpyDefault, stream));
  CUDA_CHECK(cudaMemcpyAsync(host_buffer, async_buffer, size, cudaMemcpyDefault, stream));

  // ===========================================================================
  // select the second device; everything below operates on resources that were
  // created while the first device was current
  CUDA_CHECK(cudaSetDevice(1));

  // probe: synchronous cudaMemcpy on cudaMalloc'ed memory with a different device current
  CUDA_CHECK(cudaMemcpy(host_buffer, buffer, size, cudaMemcpyDefault));
  CUDA_CHECK(cudaMemcpy(buffer, host_buffer, size, cudaMemcpyDefault));

  // cudaMemcpyAsync variants on cudaMalloc'ed memory can be called with a different device
  CUDA_CHECK(cudaMemcpyAsync(host_buffer, buffer, size, cudaMemcpyDefault, stream));
  CUDA_CHECK(cudaMemcpyAsync(buffer, host_buffer, size, cudaMemcpyDefault, stream));

  // cudaMemsetAsync variants can be called with a different device
  CUDA_CHECK(cudaMemsetAsync(buffer, 0x00, size, stream));
  CUDA_CHECK(cudaMemset2DAsync(buffer, pitch, 0x42, width, height * slices, stream));
  CUDA_CHECK(cudaMemset3DAsync(make_cudaPitchedPtr(buffer, pitch, width, height), 0x99, extent, stream));

  CUDA_CHECK(cudaMemsetAsync(async_buffer, 0x00, size, stream));
  CUDA_CHECK(cudaMemset2DAsync(async_buffer, pitch, 0x42, width, height * slices, stream));
  CUDA_CHECK(cudaMemset3DAsync(make_cudaPitchedPtr(async_buffer, pitch, width, height), 0x99, extent, stream));

  // cudaFree and cudaFreeAsync can be called with a different device
  CUDA_CHECK(cudaFree(buffer));
  CUDA_CHECK(cudaFreeAsync(async_buffer, stream));

  // cudaEventRecord can be called with a different device
  // (as long as stream and event refer to the same device)
  CUDA_CHECK(cudaEventRecord(event, stream));

  // cudaStreamSynchronize and cudaStreamDestroy can be called with a different device
  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaStreamDestroy(stream));

  // cudaEventQuery and cudaEventDestroy can be called with a different device
  CUDA_CHECK(cudaEventQuery(event));
  CUDA_CHECK(cudaEventDestroy(event));

  // ===========================================================================
  // release the page-locked host buffer, which was previously leaked.
  // The cudaStreamSynchronize above has already waited for every asynchronous
  // copy that reads or writes `host_buffer`, so nothing is still using it.
  // Switch back to the first device beforehand so that this cleanup does not
  // itself depend on the cross-device behaviour probed above.
  CUDA_CHECK(cudaSetDevice(0));
  CUDA_CHECK(cudaFreeHost(host_buffer));

  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment