Skip to content

Instantly share code, notes, and snippets.

@robertmaynard
Last active August 16, 2018 13:18
Show Gist options
  • Save robertmaynard/8e39dba15b9e3232f7bcf52c1df63f9f to your computer and use it in GitHub Desktop.
Save robertmaynard/8e39dba15b9e3232f7bcf52c1df63f9f to your computer and use it in GitHub Desktop.
cuRAND and Thrust example: compile error with relocatable device code (-dc)
nvcc -std=c++11 -arch=sm_30 -c main.cu [works]
nvcc -std=c++11 -arch=sm_32 -c main.cu [works]
nvcc -std=c++11 -arch=sm_35 -c main.cu [works]
nvcc -std=c++11 -arch=sm_50 -c main.cu [works]
nvcc -std=c++11 -arch=sm_60 -c main.cu [works]
nvcc -std=c++11 -arch=sm_70 -c main.cu [works]
nvcc -std=c++11 -arch=sm_30 -dc main.cu [works]
nvcc -std=c++11 -arch=sm_32 -dc main.cu [works]
nvcc -std=c++11 -arch=sm_35 -dc main.cu [fails]
nvcc -std=c++11 -arch=sm_50 -dc main.cu [fails]
nvcc -std=c++11 -arch=sm_60 -dc main.cu [fails]
nvcc -std=c++11 -arch=sm_70 -dc main.cu [fails]
/usr/local/cuda/bin/..//include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh(446): error: cannot pass an argument with a user-provided copy-constructor to a device-side kernel launch
detected during:
instantiation of "cudaError_t thrust::cuda_cub::cub::DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>::InvokeSingleTile<ActivePolicyT,SingleTileKernelT>(SingleTileKernelT) [with InputIteratorT=thrust::cuda_cub::transform_input_iterator_t<float, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, estimate_pi>, OutputIteratorT=float *, OffsetT=int, ReductionOpT=thrust::plus<float>, OutputT=float, ActivePolicyT=thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy130, SingleTileKernelT=void (*)(thrust::cuda_cub::transform_input_iterator_t<float, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, estimate_pi>, float *, int, thrust::plus<float>, float)]"
(599): here
instantiation of "cudaError_t thrust::cuda_cub::cub::DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>::Invoke<ActivePolicyT>() [with InputIteratorT=thrust::cuda_cub::transform_input_iterator_t<float, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, estimate_pi>, OutputIteratorT=float *, OffsetT=int, ReductionOpT=thrust::plus<float>, OutputT=float, ActivePolicyT=thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy130]"
/usr/local/cuda/bin/..//include/thrust/system/cuda/detail/cub/block/../iterator/../util_device.cuh(332): here
instantiation of "cudaError_t thrust::cuda_cub::cub::ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>::Invoke(int, FunctorT &) [with PTX_VERSION=130, PolicyT=thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy130, FunctorT=thrust::cuda_cub::cub::DispatchReduce<thrust::cuda_cub::transform_input_iterator_t<float, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, estimate_pi>, float *, int, thrust::plus<float>, float>]"
/usr/local/cuda/bin/..//include/thrust/system/cuda/detail/cub/block/../iterator/../util_device.cuh(315): here
instantiation of "cudaError_t thrust::cuda_cub::cub::ChainedPolicy<PTX_VERSION, PolicyT, PrevPolicyT>::Invoke(int, FunctorT &) [with PTX_VERSION=200, PolicyT=thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy200, PrevPolicyT=thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy130, FunctorT=thrust::cuda_cub::cub::DispatchReduce<thrust::cuda_cub::transform_input_iterator_t<float, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, estimate_pi>, float *, int, thrust::plus<float>, float>]"
/usr/local/cuda/bin/..//include/thrust/system/cuda/detail/cub/block/../iterator/../util_device.cuh(315): here
instantiation of "cudaError_t thrust::cuda_cub::cub::ChainedPolicy<PTX_VERSION, PolicyT, PrevPolicyT>::Invoke(int, FunctorT &) [with PTX_VERSION=300, PolicyT=thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy300, PrevPolicyT=thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy200, FunctorT=thrust::cuda_cub::cub::DispatchReduce<thrust::cuda_cub::transform_input_iterator_t<float, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, estimate_pi>, float *, int, thrust::plus<float>, float>]"
/usr/local/cuda/bin/..//include/thrust/system/cuda/detail/cub/block/../iterator/../util_device.cuh(315): here
[ 3 instantiation contexts not shown ]
instantiation of "cudaError_t thrust::cuda_cub::cub::DeviceReduce::Reduce(void *, size_t &, InputIteratorT, OutputIteratorT, int, ReductionOpT, T, cudaStream_t, __nv_bool) [with InputIteratorT=thrust::cuda_cub::transform_input_iterator_t<float, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, estimate_pi>, OutputIteratorT=float *, ReductionOpT=thrust::plus<float>, T=float]"
/usr/local/cuda/bin/..//include/thrust/system/cuda/detail/reduce.h(950): here
instantiation of "T thrust::cuda_cub::reduce_n(thrust::cuda_cub::execution_policy<Derived> &, InputIt, Size, T, BinaryOp) [with Derived=thrust::cuda_cub::tag, InputIt=thrust::cuda_cub::transform_input_iterator_t<float, thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, estimate_pi>, Size=signed long, T=float, BinaryOp=thrust::plus<float>]"
/usr/local/cuda/bin/..//include/thrust/system/cuda/detail/transform_reduce.h(62): here
instantiation of "T thrust::cuda_cub::transform_reduce(thrust::cuda_cub::execution_policy<Derived> &, InputIt, InputIt, TransformOp, T, ReduceOp) [with Derived=thrust::cuda_cub::tag, InputIt=thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, TransformOp=estimate_pi, T=float, ReduceOp=thrust::plus<float>]"
/usr/local/cuda/bin/..//include/thrust/detail/transform_reduce.inl(47): here
instantiation of "OutputType thrust::transform_reduce(const thrust::detail::execution_policy_base<DerivedPolicy> &, InputIterator, InputIterator, UnaryFunction, OutputType, BinaryFunction) [with DerivedPolicy=thrust::cuda_cub::tag, InputIterator=thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, UnaryFunction=estimate_pi, OutputType=float, BinaryFunction=thrust::plus<float>]"
/usr/local/cuda/bin/..//include/thrust/detail/transform_reduce.inl(67): here
instantiation of "OutputType thrust::transform_reduce(InputIterator, InputIterator, UnaryFunction, OutputType, BinaryFunction) [with InputIterator=thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, UnaryFunction=estimate_pi, OutputType=float, BinaryFunction=thrust::plus<float>]"
#include <curand_kernel.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_reduce.h>
#include <iomanip>
#include <iostream>
// Example of using cuRAND in device code to test out device linking
// Unary functor that turns one seed value into one Monte Carlo estimate of pi.
//
// Each invocation seeds its own cuRAND generator state, draws a fixed number
// of (x, y) points, counts how many lie within distance 1 of the origin and
// returns 4 * hits / samples.
struct estimate_pi : public thrust::unary_function<unsigned int, float>
{
  // Explicitly defaulted special members.
  // NOTE(review): presumably kept on purpose for the device-linking
  // reproduction this example was written for -- confirm before removing.
  estimate_pi() = default;
  estimate_pi(const estimate_pi&) = default;

  // thread_id: value used directly as the generator seed.
  // Returns the pi estimate computed from this seed's samples.
  __device__ float operator()(unsigned int thread_id)
  {
    // number of sample points drawn per invocation
    const unsigned int samples = 10000;

    // per-invocation generator state, seeded with thread_id
    // (sequence number 0, offset 0)
    curandState rng;
    curand_init(thread_id, 0, 0, &rng);

    // number of points that landed inside the quarter circle
    unsigned int hits = 0;
    for (unsigned int drawn = 0; drawn != samples; ++drawn) {
      // one point: x is drawn first, then y
      const float x = curand_uniform(&rng);
      const float y = curand_uniform(&rng);

      // a point counts when its distance from the origin is at most 1
      hits += (sqrtf(x * x + y * y) <= 1.0f) ? 1u : 0u;
    }

    // quarter-circle fraction scaled by 4 gives the full-circle area;
    // hits <= 10000, so the float conversion and the product are exact
    return (4.0f * static_cast<float>(hits)) / samples;
  }
};
// Selects the first CUDA device whose compute capability is at least 3.5 and
// makes it the current device.
//
// Returns 0 when a device was selected. Returns 1 when the device count or a
// device's properties cannot be queried, or when no device with compute
// capability >= 3.5 could be selected. Every failure prints a diagnostic that
// includes the CUDA runtime's description of the error.
int choose_cuda_device()
{
  int nDevices = 0;
  cudaError_t err = cudaGetDeviceCount(&nDevices);
  if (err != cudaSuccess) {
    std::cerr << "Failed to retrieve the number of CUDA enabled devices"
              << " (" << cudaGetErrorString(err) << ")" << std::endl;
    return 1;
  }

  for (int i = 0; i < nDevices; ++i) {
    cudaDeviceProp prop;
    // reuse the single status variable instead of shadowing it in the loop
    err = cudaGetDeviceProperties(&prop, i);
    if (err != cudaSuccess) {
      std::cerr << "Could not retrieve properties from CUDA device " << i
                << " (" << cudaGetErrorString(err) << ")" << std::endl;
      return 1;
    }

    // skip devices below compute capability 3.5
    const bool supported =
      prop.major > 3 || (prop.major == 3 && prop.minor >= 5);
    if (!supported) {
      continue;
    }

    err = cudaSetDevice(i);
    if (err == cudaSuccess) {
      return 0;
    }

    std::cout << "Could not select CUDA device " << i << " ("
              << cudaGetErrorString(err) << ")" << std::endl;
    // A failed runtime call stays recorded as this thread's last error.
    // Reset it before trying the next device so that work issued after a
    // later, successful selection does not observe this stale failure.
    (void)cudaGetLastError();
  }

  std::cout << "Could not find a CUDA enabled card supporting compute >=3.5"
            << std::endl;
  return 1;
}
// Program entry point: picks a CUDA device, computes one pi estimate per seed
// with estimate_pi, averages the estimates and prints the result with three
// significant digits.
//
// argc/argv are not used.
int main(int argc, char** argv)
{
  // NOTE(review): the process exits with status 0 even when no suitable
  // device is found -- presumably so the example is skipped rather than
  // reported as failed on machines without a capable GPU; confirm.
  if (choose_cuda_device() != 0) {
    return 0;
  }

  // use 30K independent seeds: 0, 1, ..., seed_count - 1
  constexpr int seed_count = 30000;
  const thrust::counting_iterator<int> first_seed(0);
  const thrust::counting_iterator<int> last_seed(seed_count);

  // sum of the per-seed estimates, starting from 0.0f
  float estimate = thrust::transform_reduce(first_seed, last_seed,
                                            estimate_pi(), 0.0f,
                                            thrust::plus<float>());

  // mean over all seeds
  estimate /= seed_count;

  std::cout << std::setprecision(3) << "pi is approximately " << estimate
            << std::endl;
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment