@hageboeck
Last active December 7, 2020 18:38
Invoking kernels that are class functions
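A minimal, self-contained example: a templated free-function kernel (`startKernel`) either constructs an object of the given class on the device and calls its `compute()` member ("Method 1"), or simply calls the class's static `compute()` ("Method 2"). A host-side helper (`submitKernel`) computes the launch configuration and starts the kernel. A small `std::span` stand-in is defined because nvcc does not yet support C++20.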
#include <vector>
#include <cuda.h>
#include <numeric>
#include <iostream>
#include <cstdio> // for printf on the host path of compute()
#ifndef __CUDACC__
#include <span>
#else
namespace std {
// Minimal stand-in because nvcc doesn't support C++20's <span> yet.
// For this example it only needs to carry a pointer and a size.
template<typename T>
class span {
public:
  span(T* data, std::size_t size) : _data{data}, _size{size} { }
  std::size_t size() const { return _size; }
  T* data() const { return _data; }
private:
  T* _data;
  std::size_t _size;
};
}
#endif
// Implement as usual
struct Computer {
  static __host__ __device__ void compute(const double* in, double* out, std::size_t n, const double parameter) {
    // The #ifdef magic here is optional. If you rewrite the code anyway, don't even bother.
    // (Note: __CUDACC__ is defined for both the host and the device pass of nvcc;
    //  use __CUDA_ARCH__ instead if the loop version should also run on the host under nvcc.)
#ifdef __CUDACC__
    const auto idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= n)
      return;
    {
#else
    for (std::size_t idx = 0; idx < n; ++idx)
    {
#endif
      out[idx] = parameter * in[idx] + 0.2222222222;
      if (idx == 0 || idx == n - 1) {
        printf("In is %f %f %f ... %f\n", in[0], in[1], in[2], in[n-1]);
        printf("Out[%lu] is %f ...\n", static_cast<unsigned long>(idx), out[idx]);
      }
    }
  }
};
// Kernel invoked from the host. A __global__ kernel cannot be a member function, hence a free function.
template<typename T, typename... Parameters>
__global__ void startKernel(const double* in, double* out, std::size_t n, Parameters... pars) {
  // Method 1: Create a full object on the GPU ...
  T instance;
  // ... set it up here if needed ...
  // ... and invoke the member function:
  instance.compute(in, out, n, pars...);

  // Method 2: Only call the static function.
  // (Both methods are invoked here for demonstration, so compute() runs twice.)
  T::compute(in, out, n, pars...);
}
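// Hypothetical sketch (not part of the original gist): a variant with per-instance
// state, to illustrate when Method 1 (constructing the object inside the kernel and
// setting it up) is actually needed. The name OffsetComputer and its members are
// made up for illustration; it is never instantiated below.
struct OffsetComputer {
  double offset = 0.;  // per-instance state that startKernel could set before calling compute()
  __host__ __device__ void compute(const double* in, double* out, std::size_t n, const double parameter) const {
#ifdef __CUDACC__
    const auto idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n)
#else
    for (std::size_t idx = 0; idx < n; ++idx)
#endif
      out[idx] = parameter * in[idx] + offset;
  }
};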
constexpr std::size_t threadsPerBlock = 32;

// Free function that invokes whatever kernel we want.
template<typename T, typename... Parameters>
void submitKernel(std::span<const double> in, std::span<double> out, Parameters... pars) {
  // Do all memcpy here if necessary, e.g.
  //   if (<data not on device>)
  //     cudaMemcpy(...);
  // (a host-staging variant is sketched below)
  // Round the number of blocks up so that all elements are covered:
  const auto nBlock = in.size() / threadsPerBlock + (in.size() % threadsPerBlock != 0);
  startKernel<T><<<nBlock, threadsPerBlock>>>(in.data(), out.data(), in.size(), pars...);
}
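// Hypothetical sketch (not part of the original gist) of the "do all memcpy here if
// necessary" idea: a convenience wrapper that stages host data on the device, runs
// the kernel, and copies the result back. The name submitKernelFromHost is made up.
template<typename T, typename... Parameters>
std::vector<double> submitKernelFromHost(const std::vector<double>& hostIn, Parameters... pars) {
  double* dev_in = nullptr;
  double* dev_out = nullptr;
  cudaMalloc(&dev_in, hostIn.size() * sizeof(double));
  cudaMalloc(&dev_out, hostIn.size() * sizeof(double));
  cudaMemcpy(dev_in, hostIn.data(), hostIn.size() * sizeof(double), cudaMemcpyHostToDevice);

  submitKernel<T>(std::span<const double>(dev_in, hostIn.size()),
                  std::span<double>(dev_out, hostIn.size()), pars...);

  std::vector<double> result(hostIn.size());
  cudaMemcpy(result.data(), dev_out, result.size() * sizeof(double), cudaMemcpyDeviceToHost);
  cudaFree(dev_in);
  cudaFree(dev_out);
  return result;
}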
int main() {
  double* dev_in;
  double* dev_out;
  cudaMalloc(&dev_in, 1000 * sizeof(double));
  cudaMalloc(&dev_out, 1000 * sizeof(double));
  std::span<const double> in(dev_in, 1000);
  std::span<double> out(dev_out, 1000);

  std::vector<double> values(in.size());
  std::iota(values.begin(), values.end(), 0.);
  cudaMemcpy(dev_in, values.data(), values.size() * sizeof(double), cudaMemcpyHostToDevice);

  submitKernel<Computer>(in, out, 2.);

  std::vector<double> results(out.size());
  cudaMemcpy(results.data(), out.data(), out.size() * sizeof(double), cudaMemcpyDeviceToHost);
  std::cout << "End result:\n\t" << results[0] << "\t" << results[1] << "\t...\t" << results.back() << std::endl;

  return 0;
}
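Design note: Method 2 avoids constructing an object in every thread, while Method 1 is the route to take when compute() needs per-instance state (as in the hypothetical OffsetComputer sketch above). To keep the example short, none of the CUDA calls check their return values; production code would inspect the returned cudaError_t and call cudaGetLastError() after the kernel launch.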