@neoblizz
Last active October 23, 2022 03:21
C++ wrapper around cooperative groups launch API.
#pragma once

// CUDA includes.
#include <cuda_runtime.h>
#include <cooperative_groups.h>

#include <utility>

namespace cg = cooperative_groups;

// Helper functions: recursively collect the address of every kernel argument
// into a flat array of void* pointers, which is the format expected by
// cudaLaunchCooperativeKernel.
inline void for_each_argument_address(void**) {}

template <typename arg_t, typename... args_t>
inline void for_each_argument_address(void** collected_addresses,
                                      arg_t&& arg,
                                      args_t&&... args) {
  collected_addresses[0] = const_cast<void*>(static_cast<const void*>(&arg));
  for_each_argument_address(collected_addresses + 1,
                            ::std::forward<args_t>(args)...);
}
/**
 * @brief Launch a given kernel using the cudaLaunchCooperativeKernel API for
 * Cooperative Groups (CG). This is a C++ wrapper that makes the C-based API of
 * CG more accessible.
 *
 * @note For example usage, see the comments below this gist.
 * @note GodBolt.org link: https://tinyurl.com/cgeg289hw
 *
 * @tparam func_t type of the kernel function
 * @tparam args_t types of the kernel arguments (variadic)
 * @param stream CUDA stream to launch on
 * @param f kernel function name/identifier
 * @param block_dimensions block dimensions (dim3)
 * @param grid_dimensions grid dimensions (dim3)
 * @param shared_memory_bytes dynamic shared memory to allocate, in bytes
 * @param args all the arguments to the kernel f (in order)
 */
template <typename func_t, typename... args_t>
void launch_cooperative(cudaStream_t& stream,
                        const func_t& f,
                        dim3 block_dimensions,
                        dim3 grid_dimensions,
                        std::size_t shared_memory_bytes,
                        args_t&&... args) {
  // Zero-sized arrays are not valid C++, so reserve at least one slot even
  // when the kernel takes no arguments.
  constexpr const auto non_zero_num_params =
      sizeof...(args_t) == 0 ? 1 : sizeof...(args_t);
  void* argument_ptrs[non_zero_num_params];
  for_each_argument_address(argument_ptrs, ::std::forward<args_t>(args)...);

  cudaLaunchCooperativeKernel<func_t>(&f, grid_dimensions, block_dimensions,
                                      argument_ptrs, shared_memory_bytes,
                                      stream);
}
// EOF
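
The point of launching through cudaLaunchCooperativeKernel rather than the usual <<<...>>> syntax is that the kernel can then synchronize across the entire grid via cooperative groups. Below is a minimal sketch of such a kernel (not part of the gist; grid_sync_kernel is a hypothetical name, and it assumes launch.cuh above is included so the cg alias is in scope):

__global__ void grid_sync_kernel(int* data, int n) {
  cg::grid_group grid = cg::this_grid();
  unsigned long long i = grid.thread_rank();  // Global thread index across the whole grid.

  // Phase 1: every thread writes its own element.
  if (i < n)
    data[i] = (int)i;

  // Wait for every block in the grid to finish phase 1. This barrier is only
  // valid when the kernel is launched through cudaLaunchCooperativeKernel.
  grid.sync();

  // Phase 2: it is now safe to read elements written by other blocks.
  if (i + 1 < n)
    data[i] += data[i + 1];
}

Such a kernel would be launched the same way as dummy_kernel in the example below, e.g. launch_cooperative(stream, grid_sync_kernel, blockDims, gridDims, 0, d_data, n), with a grid small enough that every block is resident on the device at once.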
neoblizz commented Oct 23, 2022

Example code that uses the wrapper above to launch a simple kernel: https://tinyurl.com/cgeg289hw

#include <thrust/device_vector.h>
#include "launch.cuh"  // include the file in this gist.

__global__ void dummy_kernel(int* x, int* y, int N) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < N)
        y[i] = x[i] * i + i;
}

int main(int argc, char const* argv[]) {
    // Some problem to use for the kernel.
    constexpr int N = 1 << 20;
    thrust::device_vector<int> x(N, 1);
    thrust::device_vector<int> y(N);

    // Set up block & grid dimensions.
    // Ideally, you want the grid dimension equal to the number of SMs
    // (or 2x the SMs) and have all blocks always be resident
    // (persistent kernel). Note that a cooperative launch requires every
    // block of the grid to be co-resident on the device, so a grid this
    // large may need to be capped (see the sketch below).
    dim3 blockDims(128);
    dim3 gridDims((N + blockDims.x - 1) / blockDims.x);

    // Create a CUDA stream for the kernel.
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Launch the kernel using cooperative launch:
    launch_cooperative(stream,        // cuda stream
                       dummy_kernel,  // kernel's function name
                       blockDims,     // block dimensions
                       gridDims,      // grid dimensions
                       0,             // dynamic shared memory in bytes
                       // arguments to the kernel function (in order)
                       x.data().get(), y.data().get(), N);

    // Wait for the kernel to finish and clean up the stream.
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}
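
For completeness, here is a hedged sketch (not from the gist) of how one could check for cooperative-launch support and cap the grid at the number of blocks that can be co-resident, using the occupancy API. The helper name max_cooperative_grid_size is hypothetical, and it reuses dummy_kernel and the block size from the example above.

// A minimal sketch, assuming the same dummy_kernel and block size as above.
// Returns the largest grid size that can be co-resident for this kernel,
// or 0 if the device does not support cooperative launches.
int max_cooperative_grid_size(int block_size, std::size_t shared_memory_bytes = 0) {
    int device = 0;
    cudaGetDevice(&device);

    // Cooperative launches must be supported by the device.
    int supports_coop = 0;
    cudaDeviceGetAttribute(&supports_coop, cudaDevAttrCooperativeLaunch, device);
    if (!supports_coop)
        return 0;

    // Number of SMs, and how many blocks of this kernel fit per SM.
    int num_sms = 0;
    cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device);
    int blocks_per_sm = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, dummy_kernel,
                                                  block_size, shared_memory_bytes);
    return num_sms * blocks_per_sm;
}

// Usage: cap the grid and let each block loop over multiple tiles of work
// (requires <algorithm> for std::min), e.g.
//   dim3 gridDims(std::min((int)((N + blockDims.x - 1) / blockDims.x),
//                          max_cooperative_grid_size(blockDims.x)));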
