@hageboeck
Last active December 7, 2020 18:38
Invoking kernels that are class functions
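A minimal, self-contained example: a templated free-function kernel (`startKernel`) either constructs an object of the given class on the device and calls its `compute()` member ("Method 1"), or simply calls the class's static `compute()` ("Method 2"). A host-side helper (`submitKernel`) computes the launch configuration and starts the kernel. A small `std::span` stand-in is defined because nvcc does not yet support C++20.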
#include <vector>
#include <cuda.h>
#include <numeric>
#include <iostream>
#include <cstdio> // for printf on the host path of compute()
#ifndef __CUDACC__
#include <span>
#else
namespace std {
// Minimal stand-in because nvcc doesn't support C++20's <span> yet.
// For this example it only needs to carry a pointer and a size.
template<typename T>
class span {
public:
  span(T* data, std::size_t size) : _data{data}, _size{size} { }
  std::size_t size() const { return _size; }
  T* data() const { return _data; }
private:
  T* _data;
  std::size_t _size;
};
}
#endif
// Implement as usual
struct Computer {
  static __host__ __device__ void compute(const double* in, double* out, std::size_t n, const double parameter) {
    // The #ifdef magic here is optional. If you rewrite the code anyway, don't even bother.
    // (Note: __CUDACC__ is defined for both the host and the device pass of nvcc;
    //  use __CUDA_ARCH__ instead if the loop version should also run on the host under nvcc.)
#ifdef __CUDACC__
    const auto idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= n)
      return;
    {
#else
    for (std::size_t idx = 0; idx < n; ++idx)
    {
#endif
      out[idx] = parameter * in[idx] + 0.2222222222;
      if (idx == 0 || idx == n - 1) {
        printf("In is %f %f %f ... %f\n", in[0], in[1], in[2], in[n-1]);
        printf("Out[%lu] is %f ...\n", static_cast<unsigned long>(idx), out[idx]);
      }
    }
  }
};
// Kernel invoked from the host. A __global__ kernel cannot be a member function, hence a free function.
template<typename T, typename... Parameters>
__global__ void startKernel(const double* in, double* out, std::size_t n, Parameters... pars) {
  // Method 1: Create a full object on the GPU ...
  T instance;
  // ... set it up here if needed ...
  // ... and invoke the member function:
  instance.compute(in, out, n, pars...);

  // Method 2: Only call the static function.
  // (Both methods are invoked here for demonstration, so compute() runs twice.)
  T::compute(in, out, n, pars...);
}
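// Hypothetical sketch (not part of the original gist): a variant with per-instance
// state, to illustrate when Method 1 (constructing the object inside the kernel and
// setting it up) is actually needed. The name OffsetComputer and its members are
// made up for illustration; it is never instantiated below.
struct OffsetComputer {
  double offset = 0.;  // per-instance state that startKernel could set before calling compute()
  __host__ __device__ void compute(const double* in, double* out, std::size_t n, const double parameter) const {
#ifdef __CUDACC__
    const auto idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n)
#else
    for (std::size_t idx = 0; idx < n; ++idx)
#endif
      out[idx] = parameter * in[idx] + offset;
  }
};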
constexpr std::size_t threadsPerBlock = 32;

// Free function that invokes whatever kernel we want.
template<typename T, typename... Parameters>
void submitKernel(std::span<const double> in, std::span<double> out, Parameters... pars) {
  // Do all memcpy here if necessary, e.g.
  //   if (<data not on device>)
  //     cudaMemcpy(...);
  // (a host-staging variant is sketched below)
  // Round the number of blocks up so that all elements are covered:
  const auto nBlock = in.size() / threadsPerBlock + (in.size() % threadsPerBlock != 0);
  startKernel<T><<<nBlock, threadsPerBlock>>>(in.data(), out.data(), in.size(), pars...);
}
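// Hypothetical sketch (not part of the original gist) of the "do all memcpy here if
// necessary" idea: a convenience wrapper that stages host data on the device, runs
// the kernel, and copies the result back. The name submitKernelFromHost is made up.
template<typename T, typename... Parameters>
std::vector<double> submitKernelFromHost(const std::vector<double>& hostIn, Parameters... pars) {
  double* dev_in = nullptr;
  double* dev_out = nullptr;
  cudaMalloc(&dev_in, hostIn.size() * sizeof(double));
  cudaMalloc(&dev_out, hostIn.size() * sizeof(double));
  cudaMemcpy(dev_in, hostIn.data(), hostIn.size() * sizeof(double), cudaMemcpyHostToDevice);

  submitKernel<T>(std::span<const double>(dev_in, hostIn.size()),
                  std::span<double>(dev_out, hostIn.size()), pars...);

  std::vector<double> result(hostIn.size());
  cudaMemcpy(result.data(), dev_out, result.size() * sizeof(double), cudaMemcpyDeviceToHost);
  cudaFree(dev_in);
  cudaFree(dev_out);
  return result;
}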
int main() {
  double* dev_in;
  double* dev_out;
  cudaMalloc(&dev_in, 1000 * sizeof(double));
  cudaMalloc(&dev_out, 1000 * sizeof(double));
  std::span<const double> in(dev_in, 1000);
  std::span<double> out(dev_out, 1000);

  std::vector<double> values(in.size());
  std::iota(values.begin(), values.end(), 0.);
  cudaMemcpy(dev_in, values.data(), values.size() * sizeof(double), cudaMemcpyHostToDevice);

  submitKernel<Computer>(in, out, 2.);

  std::vector<double> results(out.size());
  cudaMemcpy(results.data(), out.data(), out.size() * sizeof(double), cudaMemcpyDeviceToHost);
  std::cout << "End result:\n\t" << results[0] << "\t" << results[1] << "\t...\t" << results.back() << std::endl;

  return 0;
}
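Design note: Method 2 avoids constructing an object in every thread, while Method 1 is the route to take when compute() needs per-instance state (as in the hypothetical OffsetComputer sketch above). To keep the example short, none of the CUDA calls check their return values; production code would inspect the returned cudaError_t and call cudaGetLastError() after the kernel launch.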