fwyzard/test.cu

## test.cu
#include <cuda_runtime.h>
#include "cudaCheck.h"

// Exercises CUDA runtime memory, stream and event calls on resources that are
// created while device 0 is current, first with device 0 still current and
// then again after switching to device 1.
//
// Requires at least two CUDA devices: cudaSetDevice(1) is called unconditionally.
// Every runtime call is wrapped in CUDA_CHECK (declared in "cudaCheck.h").
int main(void) {
  // 3D buffer layout
  constexpr size_t pitch = 32;
  constexpr size_t width = 20;
  static_assert(width <= pitch, "The 3D buffer `width` cannot be larger than the `pitch`.");
  constexpr size_t height = 8;
  constexpr size_t slices = 4;
  const auto extent = make_cudaExtent(width, height, slices);

  // linear buffer layout: the same memory viewed as `slices` planes of `height` rows of `pitch` bytes
  constexpr size_t size = pitch * height * slices;

  // host buffer allocated through the CUDA runtime (instead of a plain stack array);
  // it is released with cudaFreeHost at the end of the test
  //char host_buffer[size];
  char* host_buffer = nullptr;
  CUDA_CHECK(cudaMallocHost(&host_buffer, size));

  // device buffers: one from cudaMalloc, one from the stream-ordered cudaMallocAsync
  char* buffer = nullptr;
  char* async_buffer = nullptr;

  // CUDA streams and events
  cudaStream_t stream;
  cudaEvent_t event;

  // ===========================================================================
  // select the first device
  CUDA_CHECK(cudaSetDevice(0));

  // create a CUDA stream and a CUDA event on the first device
  CUDA_CHECK(cudaStreamCreate(&stream));
  CUDA_CHECK(cudaEventCreate(&event));

  // allocate memory on the first device
  CUDA_CHECK(cudaMalloc(&buffer, size));
  CUDA_CHECK(cudaMallocAsync(&async_buffer, size, stream));

  // cudaMemset variants need to be called with the correct device
  CUDA_CHECK(cudaMemset(buffer, 0x00, size));
  CUDA_CHECK(cudaMemset2D(buffer, pitch, 0x42, width, height * slices));
  CUDA_CHECK(cudaMemset3D(
    make_cudaPitchedPtr(buffer, pitch, width, height),
    0x99, extent));

  CUDA_CHECK(cudaMemset(async_buffer, 0x00, size));
  CUDA_CHECK(cudaMemset2D(async_buffer, pitch, 0x42, width, height * slices));
  CUDA_CHECK(cudaMemset3D(
    make_cudaPitchedPtr(async_buffer, pitch, width, height),
    0x99, extent));

  // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device
  // see https://github.com/fwyzard/nvidia_bug_3446335
  CUDA_CHECK(cudaMemcpy(async_buffer, host_buffer, size, cudaMemcpyDefault));
  CUDA_CHECK(cudaMemcpy(host_buffer, async_buffer, size, cudaMemcpyDefault));

  // cudaMemcpyAsync variants on cudaMallocAsync'ed memory need to be called with the correct device
  // see https://github.com/fwyzard/nvidia_bug_3446335
  CUDA_CHECK(cudaMemcpyAsync(async_buffer, host_buffer, size, cudaMemcpyDefault, stream));
  CUDA_CHECK(cudaMemcpyAsync(host_buffer, async_buffer, size, cudaMemcpyDefault, stream));

  // ===========================================================================
  // select the second device
  CUDA_CHECK(cudaSetDevice(1));

  // blocking cudaMemcpy on cudaMalloc'ed memory, issued while a different device is current
  CUDA_CHECK(cudaMemcpy(host_buffer, buffer, size, cudaMemcpyDefault));
  CUDA_CHECK(cudaMemcpy(buffer, host_buffer, size, cudaMemcpyDefault));

  // cudaMemcpyAsync variants on cudaMalloc'ed memory can be called with a different device
  CUDA_CHECK(cudaMemcpyAsync(host_buffer, buffer, size, cudaMemcpyDefault, stream));
  CUDA_CHECK(cudaMemcpyAsync(buffer, host_buffer, size, cudaMemcpyDefault, stream));

  // cudaMemsetAsync variants can be called with a different device
  CUDA_CHECK(cudaMemsetAsync(buffer, 0x00, size, stream));
  CUDA_CHECK(cudaMemset2DAsync(buffer, pitch, 0x42, width, height * slices, stream));
  CUDA_CHECK(cudaMemset3DAsync(
    make_cudaPitchedPtr(buffer, pitch, width, height),
    0x99, extent, stream));

  CUDA_CHECK(cudaMemsetAsync(async_buffer, 0x00, size, stream));
  CUDA_CHECK(cudaMemset2DAsync(async_buffer, pitch, 0x42, width, height * slices, stream));
  CUDA_CHECK(cudaMemset3DAsync(
    make_cudaPitchedPtr(async_buffer, pitch, width, height),
    0x99, extent, stream));

  // cudaFree and cudaFreeAsync can be called with a different device
  CUDA_CHECK(cudaFree(buffer));
  CUDA_CHECK(cudaFreeAsync(async_buffer, stream));

  // cudaEventRecord can be called with a different device
  // (as long as stream and event refer to the same device)
  CUDA_CHECK(cudaEventRecord(event, stream));

  // cudaStreamSynchronize and cudaStreamDestroy can be called with a different device
  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaStreamDestroy(stream));

  // cudaEventQuery and cudaEventDestroy can be called with a different device
  CUDA_CHECK(cudaEventQuery(event));
  CUDA_CHECK(cudaEventDestroy(event));

  // ===========================================================================
  // release the host buffer, which was previously leaked; all work that used it
  // was either blocking or has completed with the cudaStreamSynchronize above.
  // Switch back to the first device before freeing, so the release does not
  // rely on cross-device behaviour that this test has not established.
  CUDA_CHECK(cudaSetDevice(0));
  CUDA_CHECK(cudaFreeHost(host_buffer));
}
	#include <cuda_runtime.h>
	#include "cudaCheck.h"

	// Exercises CUDA runtime memory, stream and event calls on resources that are
	// created while device 0 is current, first with device 0 still current and
	// then again after switching to device 1.
	//
	// Requires at least two CUDA devices: cudaSetDevice(1) is called unconditionally.
	// Every runtime call is wrapped in CUDA_CHECK (declared in "cudaCheck.h").
	int main(void) {
	  // 3D buffer layout
	  constexpr size_t pitch = 32;
	  constexpr size_t width = 20;
	  static_assert(width <= pitch, "The 3D buffer `width` cannot be larger than the `pitch`.");
	  constexpr size_t height = 8;
	  constexpr size_t slices = 4;
	  const auto extent = make_cudaExtent(width, height, slices);

	  // linear buffer layout: the same memory viewed as `slices` planes of `height` rows of `pitch` bytes
	  constexpr size_t size = pitch * height * slices;

	  // host buffer allocated through the CUDA runtime (instead of a plain stack array);
	  // it is released with cudaFreeHost at the end of the test
	  //char host_buffer[size];
	  char* host_buffer = nullptr;
	  CUDA_CHECK(cudaMallocHost(&host_buffer, size));

	  // device buffers: one from cudaMalloc, one from the stream-ordered cudaMallocAsync
	  char* buffer = nullptr;
	  char* async_buffer = nullptr;

	  // CUDA streams and events
	  cudaStream_t stream;
	  cudaEvent_t event;

	  // ===========================================================================
	  // select the first device
	  CUDA_CHECK(cudaSetDevice(0));

	  // create a CUDA stream and a CUDA event on the first device
	  CUDA_CHECK(cudaStreamCreate(&stream));
	  CUDA_CHECK(cudaEventCreate(&event));

	  // allocate memory on the first device
	  CUDA_CHECK(cudaMalloc(&buffer, size));
	  CUDA_CHECK(cudaMallocAsync(&async_buffer, size, stream));

	  // cudaMemset variants need to be called with the correct device
	  CUDA_CHECK(cudaMemset(buffer, 0x00, size));
	  CUDA_CHECK(cudaMemset2D(buffer, pitch, 0x42, width, height * slices));
	  CUDA_CHECK(cudaMemset3D(
	    make_cudaPitchedPtr(buffer, pitch, width, height),
	    0x99, extent));

	  CUDA_CHECK(cudaMemset(async_buffer, 0x00, size));
	  CUDA_CHECK(cudaMemset2D(async_buffer, pitch, 0x42, width, height * slices));
	  CUDA_CHECK(cudaMemset3D(
	    make_cudaPitchedPtr(async_buffer, pitch, width, height),
	    0x99, extent));

	  // cudaMemcpy variants on cudaMallocAsync'ed memory need to be called with the correct device
	  // see https://github.com/fwyzard/nvidia_bug_3446335
	  CUDA_CHECK(cudaMemcpy(async_buffer, host_buffer, size, cudaMemcpyDefault));
	  CUDA_CHECK(cudaMemcpy(host_buffer, async_buffer, size, cudaMemcpyDefault));

	  // cudaMemcpyAsync variants on cudaMallocAsync'ed memory need to be called with the correct device
	  // see https://github.com/fwyzard/nvidia_bug_3446335
	  CUDA_CHECK(cudaMemcpyAsync(async_buffer, host_buffer, size, cudaMemcpyDefault, stream));
	  CUDA_CHECK(cudaMemcpyAsync(host_buffer, async_buffer, size, cudaMemcpyDefault, stream));

	  // ===========================================================================
	  // select the second device
	  CUDA_CHECK(cudaSetDevice(1));

	  // blocking cudaMemcpy on cudaMalloc'ed memory, issued while a different device is current
	  CUDA_CHECK(cudaMemcpy(host_buffer, buffer, size, cudaMemcpyDefault));
	  CUDA_CHECK(cudaMemcpy(buffer, host_buffer, size, cudaMemcpyDefault));

	  // cudaMemcpyAsync variants on cudaMalloc'ed memory can be called with a different device
	  CUDA_CHECK(cudaMemcpyAsync(host_buffer, buffer, size, cudaMemcpyDefault, stream));
	  CUDA_CHECK(cudaMemcpyAsync(buffer, host_buffer, size, cudaMemcpyDefault, stream));

	  // cudaMemsetAsync variants can be called with a different device
	  CUDA_CHECK(cudaMemsetAsync(buffer, 0x00, size, stream));
	  CUDA_CHECK(cudaMemset2DAsync(buffer, pitch, 0x42, width, height * slices, stream));
	  CUDA_CHECK(cudaMemset3DAsync(
	    make_cudaPitchedPtr(buffer, pitch, width, height),
	    0x99, extent, stream));

	  CUDA_CHECK(cudaMemsetAsync(async_buffer, 0x00, size, stream));
	  CUDA_CHECK(cudaMemset2DAsync(async_buffer, pitch, 0x42, width, height * slices, stream));
	  CUDA_CHECK(cudaMemset3DAsync(
	    make_cudaPitchedPtr(async_buffer, pitch, width, height),
	    0x99, extent, stream));

	  // cudaFree and cudaFreeAsync can be called with a different device
	  CUDA_CHECK(cudaFree(buffer));
	  CUDA_CHECK(cudaFreeAsync(async_buffer, stream));

	  // cudaEventRecord can be called with a different device
	  // (as long as stream and event refer to the same device)
	  CUDA_CHECK(cudaEventRecord(event, stream));

	  // cudaStreamSynchronize and cudaStreamDestroy can be called with a different device
	  CUDA_CHECK(cudaStreamSynchronize(stream));
	  CUDA_CHECK(cudaStreamDestroy(stream));

	  // cudaEventQuery and cudaEventDestroy can be called with a different device
	  CUDA_CHECK(cudaEventQuery(event));
	  CUDA_CHECK(cudaEventDestroy(event));

	  // ===========================================================================
	  // release the host buffer, which was previously leaked; all work that used it
	  // was either blocking or has completed with the cudaStreamSynchronize above.
	  // Switch back to the first device before freeing, so the release does not
	  // rely on cross-device behaviour that this test has not established.
	  CUDA_CHECK(cudaSetDevice(0));
	  CUDA_CHECK(cudaFreeHost(host_buffer));
	}