Skip to content

Instantly share code, notes, and snippets.

@pkit
Created March 23, 2018 12:02
Show Gist options
  • Save pkit/e06c2d23046a265a4f0fea302a5ce539 to your computer and use it in GitHub Desktop.
Patch tensorflow 1.6.0 to dynamically load libcuda.so.1
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 08961fc..5963c8a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -16,6 +16,9 @@ limitations under the License.
#ifdef GOOGLE_CUDA
#include "cuda/include/cuda.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/core/platform/default/logging.h"
#endif // GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
@@ -25,6 +28,41 @@ limitations under the License.
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/platform/stream_executor.h"
+namespace dynload {
+
+namespace gpu = ::perftools::gputools;
+
+#define CUDAMALLOC_LIBCUDA_WRAP(__name) \
+ struct DynLoadShim__##__name { \
+ static const char *kName; \
+ using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \
+ static void *GetDsoHandle() { \
+ static auto status = gpu::internal::CachedDsoLoader::GetLibcudaDsoHandle(); \
+ return status.ValueOrDie(); \
+ } \
+ static FuncPointerT LoadOrDie() { \
+ void *f; \
+ gpu::port::Status s = gpu::port::Env::Default()->GetSymbolFromLibrary( \
+ GetDsoHandle(), kName, &f); \
+ CHECK(s.ok()) << "could not find " << kName \
+ << " in libcuda DSO; dlerror: " << s.error_message(); \
+ return reinterpret_cast<FuncPointerT>(f); \
+ } \
+ static FuncPointerT DynLoad() { \
+ static FuncPointerT f = LoadOrDie(); \
+ return f; \
+ } \
+ template <typename... Args> \
+ CUresult operator()(Args... args) { \
+ return DynLoad()(args...); \
+ } \
+ } __name; \
+ const char *DynLoadShim__##__name::kName = #__name;
+
+ CUDAMALLOC_LIBCUDA_WRAP(cuMemAlloc_v2);
+ CUDAMALLOC_LIBCUDA_WRAP(cuMemFree_v2);
+} // namespace dynload
+
namespace tensorflow {
GPUcudaMallocAllocator::GPUcudaMallocAllocator(VisitableAllocator* allocator,
@@ -40,7 +78,7 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
// allocate with cudaMalloc
gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
CUdeviceptr rv = 0;
- CUresult res = cuMemAlloc(&rv, num_bytes);
+ CUresult res = dynload::cuMemAlloc_v2(&rv, num_bytes);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "cuMemAlloc failed to allocate " << num_bytes;
return nullptr;
@@ -53,7 +91,7 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) {
#ifdef GOOGLE_CUDA
// free with cudaFree
- CUresult res = cuMemFree(reinterpret_cast<CUdeviceptr>(ptr));
+ CUresult res = dynload::cuMemFree_v2(reinterpret_cast<CUdeviceptr>(ptr));
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "cuMemFree failed to free " << ptr;
}
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 1865240..50b2930 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -76,7 +76,6 @@ cc_library(
] + if_cuda_is_configured([
"//tensorflow/core:cuda",
"@local_config_cuda//cuda:cublas",
- "@local_config_cuda//cuda:cuda_driver",
"@local_config_cuda//cuda:cudnn",
"@local_config_cuda//cuda:cufft",
"@local_config_cuda//cuda:curand",
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index a017ff6..7c5147a 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -21,7 +21,9 @@ limitations under the License.
#include <set>
#include <utility>
+#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
+#include "tensorflow/stream_executor/dso_loader.h"
#include "tensorflow/stream_executor/lib/casts.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
@@ -57,6 +59,107 @@ namespace perftools {
namespace gputools {
namespace cuda {
+namespace dynload {
+
+#define PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(__name) \
+ struct DynLoadShim__##__name { \
+ static const char *kName; \
+ using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \
+ static void *GetDsoHandle() { \
+ static auto status = internal::CachedDsoLoader::GetLibcudaDsoHandle(); \
+ return status.ValueOrDie(); \
+ } \
+ static FuncPointerT LoadOrDie() { \
+ void *f; \
+ port::Status s = port::Env::Default()->GetSymbolFromLibrary( \
+ GetDsoHandle(), kName, &f); \
+ CHECK(s.ok()) << "could not find " << kName \
+ << " in libcuda DSO; dlerror: " << s.error_message(); \
+ return reinterpret_cast<FuncPointerT>(f); \
+ } \
+ static FuncPointerT DynLoad() { \
+ static FuncPointerT f = LoadOrDie(); \
+ return f; \
+ } \
+ template <typename... Args> \
+ CUresult operator()(Args... args) { \
+ return DynLoad()(args...); \
+ } \
+ } __name; \
+ const char *DynLoadShim__##__name::kName = #__name;
+
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxCreate_v2);
+#if CUDA_VERSION >= 7000
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxRetain);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxRelease);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxSetFlags);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxGetState);
+#endif
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxDestroy);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxEnablePeerAccess);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetCurrent);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetDevice);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetSharedMemConfig);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxPopCurrent_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetCurrent);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetSharedMemConfig);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSynchronize);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceComputeCapability);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceCanAccessPeer);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGet);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetAttribute);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetCount);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetName);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetPCIBusId);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetProperties);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceTotalMem);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDriverGetVersion);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventCreate);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventDestroy_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventElapsedTime);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventQuery);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventRecord);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventSynchronize);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncGetAttribute);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncSetCacheConfig);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorName);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorString);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuInit);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuLaunchKernel);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemAlloc_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoD_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoH_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoD_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoDAsync_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoHAsync_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoDAsync_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetAddressRange_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFree_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFreeHost);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetInfo_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostAlloc);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostRegister_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostUnregister);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32Async);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD8_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD8Async);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetFunction);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetGlobal_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadDataEx);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadFatBinary);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleUnload);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuOccupancyMaxActiveBlocksPerMultiprocessor);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuPointerGetAttribute);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamAddCallback);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamCreate);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamDestroy_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamQuery);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamSynchronize);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamWaitEvent);
+
+} // namespace dynload
+
namespace {
// Manages the singleton map of contexts that we've created, mapping
@@ -273,7 +376,7 @@ namespace {
// Call cuCtxtSynchronize and crash if it doesn't succeed.
void SynchronizeOrDie() {
- auto res = cuCtxSynchronize();
+ auto res = dynload::cuCtxSynchronize();
if (res != CUDA_SUCCESS) {
LOG(FATAL) << "Synchronize found "
<< ToString(res) << " :: " << port::CurrentStackTrace();
@@ -309,7 +412,7 @@ ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) {
to_restore_ = (tls->depth == 1 ? nullptr : tls->context);
// Set the context and update thread local.
- CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(cuda_context->context()));
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(cuda_context->context()));
tls->id = cuda_context->id();
tls->context = cuda_context;
}
@@ -334,7 +437,7 @@ ScopedActivateContext::~ScopedActivateContext() {
}
// Set context and update thread local.
- CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(to_restore_->context()));
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(to_restore_->context()));
tls->id = to_restore_->id();
tls->context = to_restore_;
}
@@ -395,8 +498,10 @@ static port::Status InternalInit() {
CUresult res = CUDA_ERROR_NO_DEVICE;
if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
LOG(ERROR) << "injecting CUDA init error; initialization will fail";
- } else {
- res = cuInit(0 /* = flags */);
+ } else if (internal::CachedDsoLoader::GetLibcudaDsoHandle().ok()) {
+ // We only call cuInit if we can dynload libcuda.
+
+ res = dynload::cuInit(0 /* = flags */);
}
if (res == CUDA_SUCCESS) {
@@ -429,7 +534,7 @@ static port::Status InternalInit() {
/* static */ port::Status CUDADriver::GetDevice(int device_ordinal,
CUdevice *device) {
- CUresult res = cuDeviceGet(device, device_ordinal);
+ CUresult res = dynload::cuDeviceGet(device, device_ordinal);
if (res == CUDA_SUCCESS) {
return port::Status::OK();
}
@@ -443,7 +548,8 @@ static port::Status InternalInit() {
string *device_name) {
static const size_t kCharLimit = 64;
port::InlinedVector<char, 4> chars(kCharLimit);
- CUresult res = cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
+ CUresult res =
+ dynload::cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to get device name for " << device << ": "
<< ToString(res);
@@ -499,8 +605,8 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
unsigned int former_primary_context_flags;
int former_primary_context_is_active;
CHECK_EQ(CUDA_SUCCESS,
- cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
- &former_primary_context_is_active));
+ dynload::cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
+ &former_primary_context_is_active));
if (former_primary_context_flags != flags) {
if (former_primary_context_is_active) {
LOG(ERROR)
@@ -508,16 +614,16 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
<< former_primary_context_flags << ") than the desired flag set ("
<< flags << ").";
} else {
- CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuDevicePrimaryCtxSetFlags(device, flags));
}
}
}
former_context = CUDADriver::CurrentContextOrDie();
- res = cuDevicePrimaryCtxRetain(&new_context, device);
+ res = dynload::cuDevicePrimaryCtxRetain(&new_context, device);
if (former_context != nullptr) {
CUdevice former_device;
- if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
+ if (dynload::cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
if (former_device == device) {
if (former_context == new_context) {
VLOG(2) << "The primary context " << former_context
@@ -544,10 +650,10 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
<< "creating context when one is currently active; existing: "
<< former_context;
}
- res = cuCtxCreate(&new_context, flags, device);
+ res = dynload::cuCtxCreate(&new_context, flags, device);
#endif
}
- CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(former_context));
if (res == CUDA_SUCCESS) {
*context = CreatedContexts::Add(new_context);
@@ -580,14 +686,14 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
}
#if CUDA_VERSION >= 7000
CUcontext former_context = CurrentContext();
- CUresult res = cuCtxSetCurrent(context->context());
+ CUresult res = dynload::cuCtxSetCurrent(context->context());
CUdevice device;
- cuCtxGetDevice(&device);
- cuCtxSetCurrent(former_context);
+ dynload::cuCtxGetDevice(&device);
+ dynload::cuCtxSetCurrent(former_context);
- res = cuDevicePrimaryCtxRelease(device);
+ res = dynload::cuDevicePrimaryCtxRelease(device);
#else
- CUresult res = cuCtxDestroy(context->context());
+ CUresult res = dynload::cuCtxDestroy_v2(context->context());
#endif
if (res != CUDA_SUCCESS) {
@@ -600,7 +706,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute,
CUfunction func,
int *attribute_value) {
- CUresult res = cuFuncGetAttribute(attribute_value, attribute, func);
+ CUresult res = dynload::cuFuncGetAttribute(attribute_value, attribute, func);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query kernel attribute. kernel: " << func
<< ", attribute: " << attribute;
@@ -611,7 +717,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function,
CUfunc_cache cache_config) {
- CUresult res = cuFuncSetCacheConfig(function, cache_config);
+ CUresult res = dynload::cuFuncSetCacheConfig(function, cache_config);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function
<< ", config: " << cache_config << ", result: " << ToString(res);
@@ -625,10 +731,10 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUsharedconfig shared_mem_config;
ScopedActivateContext activation{context};
- CUresult result = cuCtxGetSharedMemConfig(&shared_mem_config);
+ CUresult result = dynload::cuCtxGetSharedMemConfig(&shared_mem_config);
if (result != CUDA_SUCCESS) {
CUdevice device;
- cuCtxGetDevice(&device);
+ dynload::cuCtxGetDevice(&device);
LOG(ERROR) << "failed to get CUDA device shared memory config. "
<< "Context device ID: " << device
<< ", result: " << ToString(result);
@@ -642,10 +748,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ port::Status CUDADriver::ContextSetSharedMemConfig(
CudaContext* context, CUsharedconfig shared_mem_config) {
ScopedActivateContext activation{context};
- CUresult result = cuCtxSetSharedMemConfig(shared_mem_config);
+ CUresult result = dynload::cuCtxSetSharedMemConfig(shared_mem_config);
if (result != CUDA_SUCCESS) {
CUdevice device;
- cuCtxGetDevice(&device);
+ dynload::cuCtxGetDevice(&device);
LOG(ERROR) << "failed to set CUDA device shared memory config. "
<< "Context device ID: " << device
<< ", config: " << shared_mem_config
@@ -668,9 +774,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
<< " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
<< " bdx: " << block_dim_x << " bdy: " << block_dim_y
<< " bdz: " << block_dim_z;
- CUresult res = cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z,
- block_dim_x, block_dim_y, block_dim_z,
- shared_mem_bytes, stream, kernel_params, extra);
+ CUresult res = dynload::cuLaunchKernel(
+ function, grid_dim_x, grid_dim_y, grid_dim_z,
+ block_dim_x, block_dim_y, block_dim_z,
+ shared_mem_bytes, stream, kernel_params, extra);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to launch CUDA kernel: " << function
<< "; result: " << ToString(res);
@@ -684,7 +791,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
const char *cubin_bytes,
CUmodule *module) {
ScopedActivateContext activation{context};
- CUresult result = cuModuleLoadFatBinary(module, cubin_bytes);
+ CUresult result = dynload::cuModuleLoadFatBinary(module, cubin_bytes);
if (result != CUDA_SUCCESS) {
return port::Status{port::error::INTERNAL,
"failed to load in-memory CUBIN: " + ToString(result)};
@@ -727,8 +834,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
// TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
// module loading: see http://b/13248943
- res = cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options), options,
- option_values);
+ res = dynload::cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options),
+ options, option_values);
}
// The PTX JIT mutates the values in the option values array to reflect the
@@ -767,7 +874,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUdeviceptr location,
uint8 value, size_t size) {
ScopedActivateContext activation{context};
- CUresult res = cuMemsetD8(location, value, size);
+ CUresult res = dynload::cuMemsetD8_v2(location, value, size);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to memset memory: " << ToString(res);
return false;
@@ -780,7 +887,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
uint32 value,
size_t uint32_count) {
ScopedActivateContext activation{context};
- CUresult res = cuMemsetD32(location, value, uint32_count);
+ CUresult res = dynload::cuMemsetD32_v2(location, value, uint32_count);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to memset memory: " << ToString(res);
return false;
@@ -794,7 +901,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
size_t uint32_count,
CUstream stream) {
ScopedActivateContext activation{context};
- CUresult res = cuMemsetD8Async(location, value, uint32_count, stream);
+ CUresult res =
+ dynload::cuMemsetD8Async(location, value, uint32_count, stream);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
return false;
@@ -809,7 +917,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
size_t uint32_count,
CUstream stream) {
ScopedActivateContext activation{context};
- CUresult res = cuMemsetD32Async(location, value, uint32_count, stream);
+ CUresult res =
+ dynload::cuMemsetD32Async(location, value, uint32_count, stream);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
return false;
@@ -823,7 +932,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
StreamCallback callback,
void *data) {
// Note: flags param is required to be zero according to CUDA 6.0.
- CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
+ CUresult res =
+ dynload::cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "unable to add host callback: " << ToString(res);
return false;
@@ -837,7 +947,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUfunction *function) {
ScopedActivateContext activated{context};
CHECK(module != nullptr && kernel_name != nullptr);
- CUresult res = cuModuleGetFunction(function, module, kernel_name);
+ CUresult res = dynload::cuModuleGetFunction(function, module, kernel_name);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
<< "\" from module: " << ToString(res);
@@ -855,7 +965,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
ScopedActivateContext activated{context};
CHECK(module != nullptr && symbol_name != nullptr &&
(dptr != nullptr || bytes != nullptr));
- CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
+ CUresult res =
+ dynload::cuModuleGetGlobal_v2(dptr, bytes, module, symbol_name);
if (res != CUDA_SUCCESS) {
// symbol may not be found in the current module, but it may reside in
// another module.
@@ -870,7 +981,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ void CUDADriver::UnloadModule(CudaContext *context,
CUmodule module) {
ScopedActivateContext activated{context};
- CUresult res = cuModuleUnload(module);
+ CUresult res = dynload::cuModuleUnload(module);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to unload module " << module
<< "; leaking: " << ToString(res);
@@ -881,7 +992,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CudaContext* context) {
ScopedActivateContext activated{context};
CUdevice device = -1;
- CUresult result = cuCtxGetDevice(&device);
+ CUresult result = dynload::cuCtxGetDevice(&device);
if (result == CUDA_SUCCESS) {
return device;
}
@@ -897,7 +1008,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
// up synchronization with respect to memsets and any other things that have
// to occur on the default stream?
ScopedActivateContext activated{context};
- CUresult res = cuStreamCreate(out, 0);
+ CUresult res = dynload::cuStreamCreate(out, 0);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "could not allocate CUDA stream for context " << context
<< ": " << ToString(res);
@@ -916,7 +1027,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
ScopedActivateContext activated{context};
- CUresult res = cuStreamDestroy(*stream);
+ CUresult res = dynload::cuStreamDestroy_v2(*stream);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to destroy CUDA stream for context " << context
<< ": " << ToString(res);
@@ -931,7 +1042,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
uint64 bytes) {
ScopedActivateContext activated{context};
CUdeviceptr result = 0;
- CUresult res = cuMemAlloc(&result, bytes);
+ CUresult res = dynload::cuMemAlloc_v2(&result, bytes);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to allocate "
<< port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
@@ -948,7 +1059,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
void *location) {
ScopedActivateContext activation{context};
CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location);
- CUresult res = cuMemFree(pointer);
+ CUresult res = dynload::cuMemFree_v2(pointer);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to free device memory at " << location
<< "; result: " << ToString(res);
@@ -962,7 +1073,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
ScopedActivateContext activation{context};
void *host_mem = nullptr;
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
- CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
+ CUresult res =
+ dynload::cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to alloc " << bytes
<< " bytes on host: " << ToString(res);
@@ -973,7 +1085,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ void CUDADriver::HostDeallocate(CudaContext* context,
void *location) {
ScopedActivateContext activation{context};
- CUresult res = cuMemFreeHost(location);
+ CUresult res = dynload::cuMemFreeHost(location);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "error deallocating host memory at " << location << ": "
<< ToString(res);
@@ -985,7 +1097,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
ScopedActivateContext activation{context};
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
CUresult res =
- cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
+ dynload::cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "error registering host memory at " << location << ": "
<< ToString(res);
@@ -997,7 +1109,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ bool CUDADriver::HostUnregister(CudaContext* context,
void *location) {
ScopedActivateContext activation{context};
- CUresult res = cuMemHostUnregister(location);
+ CUresult res = dynload::cuMemHostUnregister(location);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "error unregistering host memory at " << location << ": "
<< ToString(res);
@@ -1014,7 +1126,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
ScopedActivateContext activated{context};
- CUresult res = cuEventDestroy(*event);
+ CUresult res = dynload::cuEventDestroy_v2(*event);
*event = nullptr;
switch (res) {
@@ -1038,7 +1150,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUevent event,
CUstream stream) {
ScopedActivateContext activated{context};
- CUresult res = cuEventRecord(event, stream);
+ CUresult res = dynload::cuEventRecord(event, stream);
switch (res) {
case CUDA_SUCCESS:
return port::Status::OK();
@@ -1059,7 +1171,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent(
CudaContext *context, CUevent event) {
ScopedActivateContext activated{context};
- CUresult res = cuEventQuery(event);
+ CUresult res = dynload::cuEventQuery(event);
if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
return port::Status{
port::error::INTERNAL,
@@ -1075,12 +1187,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
ScopedActivateContext activated{context};
// The stop event must have completed in order for cuEventElapsedTime to
// work.
- CUresult res = cuEventSynchronize(stop);
+ CUresult res = dynload::cuEventSynchronize(stop);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
return false;
}
- res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
+ res = dynload::cuEventElapsedTime(elapsed_milliseconds, start, stop);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to get elapsed time between events: "
<< ToString(res);
@@ -1094,7 +1206,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUstream stream,
CUevent event) {
ScopedActivateContext activation{context};
- CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
+ CUresult res = dynload::cuStreamWaitEvent(stream, event, 0 /* = flags */);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "could not wait stream on event: " << ToString(res);
return false;
@@ -1105,7 +1217,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ bool CUDADriver::SynchronizeContext(CudaContext* context) {
ScopedActivateContext activation{context};
- CUresult res = cuCtxSynchronize();
+ CUresult res = dynload::cuCtxSynchronize();
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
<< " :: " << port::CurrentStackTrace();
@@ -1119,7 +1231,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUstream stream) {
ScopedActivateContext activated{context};
CHECK(stream != nullptr);
- CUresult res = cuStreamSynchronize(stream);
+ CUresult res = dynload::cuStreamSynchronize(stream);
if (res != CUDA_SUCCESS) {
port::Status status = port::InternalError(
port::StrCat("could not synchronize on CUDA stream: ", ToString(res)));
@@ -1135,7 +1247,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUstream stream) {
ScopedActivateContext activated{context};
CHECK(stream != nullptr);
- CUresult res = cuStreamQuery(stream);
+ CUresult res = dynload::cuStreamQuery(stream);
if (res == CUDA_SUCCESS) {
return true;
}
@@ -1151,7 +1263,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUdeviceptr gpu_src,
uint64 size) {
ScopedActivateContext activation{context};
- CUresult res = cuMemcpyDtoH(host_dst, gpu_src, size);
+ CUresult res = dynload::cuMemcpyDtoH_v2(host_dst, gpu_src, size);
if (res != CUDA_SUCCESS) {
return port::InternalError(
port::Printf("failed to synchronous memcpy from device to host: %s; "
@@ -1169,7 +1281,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
const void *host_src,
uint64 size) {
ScopedActivateContext activation{context};
- CUresult res = cuMemcpyHtoD(gpu_dst, host_src, size);
+ CUresult res = dynload::cuMemcpyHtoD_v2(gpu_dst, host_src, size);
if (res != CUDA_SUCCESS) {
return port::InternalError(port::Printf(
"failed to synchronous memcpy from host to device: %s; GPU dst: %p;"
@@ -1186,7 +1298,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUdeviceptr gpu_src,
uint64 size) {
ScopedActivateContext activation{context};
- CUresult res = cuMemcpyDtoD(gpu_dst, gpu_src, size);
+ CUresult res = dynload::cuMemcpyDtoD_v2(gpu_dst, gpu_src, size);
if (res != CUDA_SUCCESS) {
return port::InternalError(port::Printf(
"failed to synchronous memcpy from host to device: %s; GPU dst: %p; "
@@ -1204,7 +1316,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
uint64 size,
CUstream stream) {
ScopedActivateContext activation{context};
- CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
+ CUresult res = dynload::cuMemcpyDtoHAsync_v2(host_dst, gpu_src, size, stream);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << port::Printf(
"failed to enqueue async memcpy from device to host: %s; host dst: %p; "
@@ -1224,7 +1336,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
uint64 size,
CUstream stream) {
ScopedActivateContext activation{context};
- CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
+ CUresult res = dynload::cuMemcpyHtoDAsync_v2(gpu_dst, host_src, size, stream);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << port::Printf(
"failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
@@ -1243,7 +1355,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
uint64 size,
CUstream stream) {
ScopedActivateContext activation{context};
- CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
+ CUresult result =
+ dynload::cuMemcpyDtoDAsync_v2(gpu_dst, gpu_src, size, stream);
if (result != CUDA_SUCCESS) {
LOG(ERROR) << port::Printf(
"failed to enqueue async memcpy from device to device: %s"
@@ -1279,7 +1392,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
ScopedActivateContext activated{context};
- CUresult res = cuEventCreate(result, cuflags);
+ CUresult res = dynload::cuEventCreate(result, cuflags);
if (res == CUDA_SUCCESS) {
return port::Status::OK();
@@ -1295,7 +1408,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ int CUDADriver::GetDeviceCount() {
int device_count = 0;
- CUresult res = cuDeviceGetCount(&device_count);
+ CUresult res = dynload::cuDeviceGetCount(&device_count);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
return 0;
@@ -1310,8 +1423,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ port::StatusOr<CudaContext*> CUDADriver::GetPointerContext(
CUdeviceptr pointer) {
CudaContext* context = nullptr;
- CUresult result =
- cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
+ CUresult result = dynload::cuPointerGetAttribute(
+ &context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
if (result == CUDA_SUCCESS) {
CHECK(context != nullptr) << "success should entail non-null context";
return context;
@@ -1326,8 +1439,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace(
CUdeviceptr pointer) {
unsigned int value;
- CUresult result =
- cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
+ CUresult result = dynload::cuPointerGetAttribute(
+ &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
if (result == CUDA_SUCCESS) {
switch (value) {
case CU_MEMORYTYPE_DEVICE:
@@ -1350,7 +1463,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr,
CUdeviceptr *base,
size_t *size) {
- CUresult result = cuMemGetAddressRange(base, size, dptr);
+ CUresult result = dynload::cuMemGetAddressRange(base, size, dptr);
if (result == CUDA_SUCCESS) {
return port::Status::OK();
} else if (result == CUDA_ERROR_NOT_FOUND) {
@@ -1384,7 +1497,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUdevice device) {
*cc_major = 0;
*cc_minor = 0;
- CUresult result = cuDeviceComputeCapability(cc_major, cc_minor, device);
+ CUresult result =
+ dynload::cuDeviceComputeCapability(cc_major, cc_minor, device);
if (result == CUDA_SUCCESS) {
return port::Status::OK();
}
@@ -1401,7 +1515,7 @@ template <typename T>
static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
CUdevice_attribute attribute) {
int value = -1;
- CUresult result = cuDeviceGetAttribute(&value, attribute, device);
+ CUresult result = dynload::cuDeviceGetAttribute(&value, attribute, device);
if (result != CUDA_SUCCESS) {
return port::Status{
port::error::NOT_FOUND,
@@ -1456,24 +1570,24 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z,
CUdevice device) {
int value;
- CUresult res =
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
+ CUresult res = dynload::cuDeviceGetAttribute(
+ &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
return false;
}
*x = value;
- res =
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
+ res = dynload::cuDeviceGetAttribute(
+ &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
return false;
}
*y = value;
- res =
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
+ res = dynload::cuDeviceGetAttribute(
+ &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
return false;
@@ -1483,7 +1597,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
}
/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) {
- CUresult res = cuDriverGetVersion(driver_version);
+ CUresult res = dynload::cuDriverGetVersion(driver_version);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query driver version: " << ToString(res);
return false;
@@ -1494,7 +1608,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
/* static */ bool CUDADriver::GetDeviceProperties(CUdevprop *device_properties,
int device_ordinal) {
- CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
+ CUresult res =
+ dynload::cuDeviceGetProperties(device_properties, device_ordinal);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query device properties: " << ToString(res);
return false;
@@ -1505,8 +1620,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
int value = -1;
- CUresult res =
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
+ CUresult res = dynload::cuDeviceGetAttribute(
+ &value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query ECC status: " << ToString(res);
return false;
@@ -1522,7 +1637,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
ScopedActivateContext activation{context};
size_t free = 0;
size_t total = 0;
- CUresult res = cuMemGetInfo(&free, &total);
+ CUresult res = dynload::cuMemGetInfo_v2(&free, &total);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query device memory info: " << ToString(res);
return false;
@@ -1536,7 +1651,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device,
uint64 *result) {
size_t value = -1;
- CUresult res = cuDeviceTotalMem(&value, device);
+ CUresult res = dynload::cuDeviceTotalMem_v2(&value, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query total available memory: " << ToString(res);
return false;
@@ -1551,7 +1666,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
static const int kBufferSize = 64;
port::InlinedVector<char, 4> chars(kBufferSize);
chars[kBufferSize - 1] = '\0';
- CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
+ CUresult res =
+ dynload::cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
return pci_bus_id;
@@ -1579,7 +1695,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
<< to_device.status();
return false;
}
- CUresult res = cuDeviceCanAccessPeer(
+ CUresult res = dynload::cuDeviceCanAccessPeer(
&can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie());
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
@@ -1596,7 +1712,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
}
ScopedActivateContext activated{from};
- CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
+ CUresult result =
+ dynload::cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
if (result != CUDA_SUCCESS &&
result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
return port::Status{
@@ -1614,7 +1731,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
ScopedActivateContext activation{context};
int max_blocks;
- CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
+ CUresult result = dynload::cuOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes);
if (result != CUDA_SUCCESS) {
return port::Status{
@@ -1628,7 +1745,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
/* static */ CUcontext CUDADriver::CurrentContextOrDie() {
CUcontext current = nullptr;
- CUresult result = cuCtxGetCurrent(&current);
+ CUresult result = dynload::cuCtxGetCurrent(&current);
if (result != CUDA_SUCCESS) {
LOG(FATAL) << "failed to query current context: " << ToString(result);
}
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 4bbd531..e840e6c 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -24,7 +24,9 @@ limitations under the License.
#else
#include <unistd.h>
#endif
+#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
+#include "tensorflow/stream_executor/dso_loader.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_event.h"
#include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
@@ -1159,6 +1161,19 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
namespace gpu = ::perftools::gputools;
void initialize_cuda_gpu_executor() {
+ port::StatusOr<void *> status =
+ gpu::internal::CachedDsoLoader::GetLibcudaDsoHandle();
+ if (!status.ok()) {
+ gpu::cuda::Diagnostician::LogDriverVersionInformation();
+ LOG(INFO) << "LD_LIBRARY_PATH: " << getenv("LD_LIBRARY_PATH");
+ LOG(INFO) << "failed to find libcuda.so on this system: "
+ << status.status();
+ }
+
+ // TODO(b/22689637): Temporary until users are migrated off of PlatformKind.
+ gpu::PluginRegistry::Instance()->MapPlatformKindToId(
+ gpu::PlatformKind::kCuda, gpu::cuda::kCudaPlatformId);
+
*gpu::internal::MakeCUDAExecutorImplementation() = [](
const gpu::PluginConfig &config) {
return new gpu::cuda::CUDAExecutor{config};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment