Last active
December 7, 2020 18:38
-
-
Save hageboeck/cbddec072b97b3c9ac782549d5d1986d to your computer and use it in GitHub Desktop.
Invoking kernels that are class functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <vector>
#include <cuda.h>
#include <numeric>
#include <iostream>
#ifndef __CUDACC__
#include <span>
#else
namespace std {
  // Minimal std::span stand-in because nvcc (pre-C++20) has no <span>.
  // Unlike a pure stub, it must retain the pointer and element count:
  // submitKernel() calls size()/data() in *host* code that nvcc compiles,
  // so discarding them would launch zero blocks with a null pointer.
  template<typename T>
  class span {
  public:
    span(T* ptr, std::size_t count) : ptr_(ptr), count_(count) { }
    std::size_t size() const { return count_; }
    T* data() const { return ptr_; }
  private:
    T* ptr_;
    std::size_t count_;
  };
}
#endif
// Implement as usual | |
struct Computer { | |
static __host__ __device__ void compute(const double* in, double* out, std::size_t n, const double parameter) { | |
// The #ifdef magic here is optional. If you rewrite the code anyway, don't even bother. | |
#ifdef __CUDACC__ | |
auto idx = threadIdx.x + blockIdx.x * blockDim.x; | |
if (idx >= n) | |
return; | |
{ | |
#else | |
for (std::size_t idx=0; idx < n; ++idx) | |
{ | |
#endif | |
out[idx] = parameter * in[idx] + 0.2222222222; | |
if (idx == 0 || idx == n - 1) { | |
printf("In is %f %f %f ... %f\n", in[0], in[1], in[2], in[n-1]); | |
printf("Out[%d] is %f ... \n", idx, out[idx]); | |
} | |
} | |
} | |
}; | |
// Kernel being invoked from host. Cannot be a class function, hence free func. | |
// Kernel being invoked from host. Cannot be a class function, hence free func.
// Demonstrates two ways of reaching T's compute() from device code; both are
// executed, writing the same values to `out` twice.
template<typename T, typename... Parameters>
__global__ void startKernel(const double* in, double* out, std::size_t n, Parameters... pars) {
  // Option A: default-construct a T on the device and call through the
  // object. Any per-instance setup would go between these two lines.
  T worker;
  worker.compute(in, out, n, pars...);
  // Option B: no object at all — call the static member directly.
  T::compute(in, out, n, pars...);
}
constexpr std::size_t threadsPerBlock = 32;
// Free function that invokes whatever kernel we want.
// `in` and `out` must already point to device memory of equal length
// (main() copies the host data up front). The original placeholder lines
// "if (data not on device) memcpy ...." were pseudocode and did not compile;
// any required host-to-device copies belong here.
template<typename T, typename... Parameters>
void submitKernel(std::span<const double> in, std::span<double> out, Parameters... pars) {
  // A 0-block launch is a CUDA error; nothing to do for empty input anyway.
  if (in.size() == 0)
    return;
  // Ceil-divide so a partially filled final block covers the tail elements.
  const auto nBlock = in.size() / threadsPerBlock + (in.size() % threadsPerBlock != 0);
  startKernel<T><<<nBlock, threadsPerBlock>>>(in.data(), out.data(), in.size(), pars...);
}
// Allocates device buffers, fills the input with 0..999, runs Computer's
// transform (parameter = 2.0) through submitKernel, and prints a few results.
int main() {
  constexpr std::size_t nElements = 1000;
  double* dev_in = nullptr;
  double* dev_out = nullptr;
  cudaMalloc(&dev_in, nElements * sizeof(double));
  cudaMalloc(&dev_out, nElements * sizeof(double));

  std::span<const double> in(dev_in, nElements);
  std::span<double> out(dev_out, nElements);

  std::vector<double> values(nElements);
  std::iota(values.begin(), values.end(), 0.);
  cudaMemcpy(dev_in, values.data(), values.size() * sizeof(double), cudaMemcpyHostToDevice);

  submitKernel<Computer>(in, out, 2.);
  // Kernel launches don't return errors directly; check for bad launch
  // configurations here. The blocking memcpy below synchronizes and would
  // surface asynchronous execution errors.
  const cudaError_t launchErr = cudaGetLastError();
  if (launchErr != cudaSuccess) {
    std::cerr << "Kernel launch failed: " << cudaGetErrorString(launchErr) << std::endl;
    return 1;
  }

  std::vector<double> results(nElements);
  // Copy back via the raw device pointer and element count. The original used
  // out.data()/out.size(), which under nvcc's span stub were nullptr/0 — so
  // no bytes were ever copied back.
  cudaMemcpy(results.data(), dev_out, results.size() * sizeof(double), cudaMemcpyDeviceToHost);
  std::cout << "End result:\n\t" << results[0] << "\t" << results[1] << "\t...\t" << results.back() << std::endl;

  // The original leaked both allocations.
  cudaFree(dev_in);
  cudaFree(dev_out);
  return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment