Skip to content

Instantly share code, notes, and snippets.

@pkit
Created March 23, 2018 12:02
Show Gist options
  • Save pkit/e06c2d23046a265a4f0fea302a5ce539 to your computer and use it in GitHub Desktop.
Patch tensorflow 1.6.0 to dynamically load libcuda.so.1
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 08961fc..5963c8a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -16,6 +16,9 @@ limitations under the License.
#ifdef GOOGLE_CUDA
#include "cuda/include/cuda.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/dso_loader.h"
+#include "tensorflow/core/platform/default/logging.h"
#endif // GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
@@ -25,6 +28,41 @@ limitations under the License.
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/platform/stream_executor.h"
+namespace dynload {
+
+namespace gpu = ::perftools::gputools;
+
+#define CUDAMALLOC_LIBCUDA_WRAP(__name) \
+ struct DynLoadShim__##__name { \
+ static const char *kName; \
+ using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \
+ static void *GetDsoHandle() { \
+ static auto status = gpu::internal::CachedDsoLoader::GetLibcudaDsoHandle(); \
+ return status.ValueOrDie(); \
+ } \
+ static FuncPointerT LoadOrDie() { \
+ void *f; \
+ gpu::port::Status s = gpu::port::Env::Default()->GetSymbolFromLibrary( \
+ GetDsoHandle(), kName, &f); \
+ CHECK(s.ok()) << "could not find " << kName \
+ << " in libcuda DSO; dlerror: " << s.error_message(); \
+ return reinterpret_cast<FuncPointerT>(f); \
+ } \
+ static FuncPointerT DynLoad() { \
+ static FuncPointerT f = LoadOrDie(); \
+ return f; \
+ } \
+ template <typename... Args> \
+ CUresult operator()(Args... args) { \
+ return DynLoad()(args...); \
+ } \
+ } __name; \
+ const char *DynLoadShim__##__name::kName = #__name;
+
+ CUDAMALLOC_LIBCUDA_WRAP(cuMemAlloc_v2);
+ CUDAMALLOC_LIBCUDA_WRAP(cuMemFree_v2);
+} // namespace dynload
+
namespace tensorflow {
GPUcudaMallocAllocator::GPUcudaMallocAllocator(VisitableAllocator* allocator,
@@ -40,7 +78,7 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
// allocate with cudaMalloc
gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
CUdeviceptr rv = 0;
- CUresult res = cuMemAlloc(&rv, num_bytes);
+ CUresult res = dynload::cuMemAlloc_v2(&rv, num_bytes);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "cuMemAlloc failed to allocate " << num_bytes;
return nullptr;
@@ -53,7 +91,7 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) {
#ifdef GOOGLE_CUDA
// free with cudaFree
- CUresult res = cuMemFree(reinterpret_cast<CUdeviceptr>(ptr));
+ CUresult res = dynload::cuMemFree_v2(reinterpret_cast<CUdeviceptr>(ptr));
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "cuMemFree failed to free " << ptr;
}
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 1865240..50b2930 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -76,7 +76,6 @@ cc_library(
] + if_cuda_is_configured([
"//tensorflow/core:cuda",
"@local_config_cuda//cuda:cublas",
- "@local_config_cuda//cuda:cuda_driver",
"@local_config_cuda//cuda:cudnn",
"@local_config_cuda//cuda:cufft",
"@local_config_cuda//cuda:curand",
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index a017ff6..7c5147a 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -21,7 +21,9 @@ limitations under the License.
#include <set>
#include <utility>
+#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
+#include "tensorflow/stream_executor/dso_loader.h"
#include "tensorflow/stream_executor/lib/casts.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
@@ -57,6 +59,107 @@ namespace perftools {
namespace gputools {
namespace cuda {
+namespace dynload {
+
+#define PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(__name) \
+ struct DynLoadShim__##__name { \
+ static const char *kName; \
+ using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \
+ static void *GetDsoHandle() { \
+ static auto status = internal::CachedDsoLoader::GetLibcudaDsoHandle(); \
+ return status.ValueOrDie(); \
+ } \
+ static FuncPointerT LoadOrDie() { \
+ void *f; \
+ port::Status s = port::Env::Default()->GetSymbolFromLibrary( \
+ GetDsoHandle(), kName, &f); \
+ CHECK(s.ok()) << "could not find " << kName \
+ << " in libcuda DSO; dlerror: " << s.error_message(); \
+ return reinterpret_cast<FuncPointerT>(f); \
+ } \
+ static FuncPointerT DynLoad() { \
+ static FuncPointerT f = LoadOrDie(); \
+ return f; \
+ } \
+ template <typename... Args> \
+ CUresult operator()(Args... args) { \
+ return DynLoad()(args...); \
+ } \
+ } __name; \
+ const char *DynLoadShim__##__name::kName = #__name;
+
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxCreate_v2);
+#if CUDA_VERSION >= 7000
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxRetain);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxRelease);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxSetFlags);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDevicePrimaryCtxGetState);
+#endif
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxDestroy);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxEnablePeerAccess);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetCurrent);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetDevice);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxGetSharedMemConfig);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxPopCurrent_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetCurrent);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSetSharedMemConfig);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuCtxSynchronize);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceComputeCapability);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceCanAccessPeer);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGet);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetAttribute);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetCount);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetName);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetPCIBusId);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceGetProperties);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDeviceTotalMem);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuDriverGetVersion);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventCreate);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventDestroy_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventElapsedTime);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventQuery);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventRecord);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventSynchronize);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncGetAttribute);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncSetCacheConfig);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorName);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorString);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuInit);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuLaunchKernel);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemAlloc_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoD_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoH_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoD_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoDAsync_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyDtoHAsync_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemcpyHtoDAsync_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetAddressRange_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFree_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemFreeHost);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemGetInfo_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostAlloc);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostRegister_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemHostUnregister);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD32Async);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD8_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuMemsetD8Async);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetFunction);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleGetGlobal_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadDataEx);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleLoadFatBinary);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuModuleUnload);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuOccupancyMaxActiveBlocksPerMultiprocessor);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuPointerGetAttribute);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamAddCallback);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamCreate);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamDestroy_v2);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamQuery);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamSynchronize);
+PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuStreamWaitEvent);
+
+} // namespace dynload
+
namespace {
// Manages the singleton map of contexts that we've created, mapping
@@ -273,7 +376,7 @@ namespace {
// Call cuCtxtSynchronize and crash if it doesn't succeed.
void SynchronizeOrDie() {
- auto res = cuCtxSynchronize();
+ auto res = dynload::cuCtxSynchronize();
if (res != CUDA_SUCCESS) {
LOG(FATAL) << "Synchronize found "
<< ToString(res) << " :: " << port::CurrentStackTrace();
@@ -309,7 +412,7 @@ ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) {
to_restore_ = (tls->depth == 1 ? nullptr : tls->context);
// Set the context and update thread local.
- CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(cuda_context->context()));
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(cuda_context->context()));
tls->id = cuda_context->id();
tls->context = cuda_context;
}
@@ -334,7 +437,7 @@ ScopedActivateContext::~ScopedActivateContext() {
}
// Set context and update thread local.
- CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(to_restore_->context()));
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(to_restore_->context()));
tls->id = to_restore_->id();
tls->context = to_restore_;
}
@@ -395,8 +498,10 @@ static port::Status InternalInit() {
CUresult res = CUDA_ERROR_NO_DEVICE;
if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
LOG(ERROR) << "injecting CUDA init error; initialization will fail";
- } else {
- res = cuInit(0 /* = flags */);
+ } else if (internal::CachedDsoLoader::GetLibcudaDsoHandle().ok()) {
+ // We only call cuInit if we can dynload libcuda.
+
+ res = dynload::cuInit(0 /* = flags */);
}
if (res == CUDA_SUCCESS) {
@@ -429,7 +534,7 @@ static port::Status InternalInit() {
/* static */ port::Status CUDADriver::GetDevice(int device_ordinal,
CUdevice *device) {
- CUresult res = cuDeviceGet(device, device_ordinal);
+ CUresult res = dynload::cuDeviceGet(device, device_ordinal);
if (res == CUDA_SUCCESS) {
return port::Status::OK();
}
@@ -443,7 +548,8 @@ static port::Status InternalInit() {
string *device_name) {
static const size_t kCharLimit = 64;
port::InlinedVector<char, 4> chars(kCharLimit);
- CUresult res = cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
+ CUresult res =
+ dynload::cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to get device name for " << device << ": "
<< ToString(res);
@@ -499,8 +605,8 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
unsigned int former_primary_context_flags;
int former_primary_context_is_active;
CHECK_EQ(CUDA_SUCCESS,
- cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
- &former_primary_context_is_active));
+ dynload::cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
+ &former_primary_context_is_active));
if (former_primary_context_flags != flags) {
if (former_primary_context_is_active) {
LOG(ERROR)
@@ -508,16 +614,16 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
<< former_primary_context_flags << ") than the desired flag set ("
<< flags << ").";
} else {
- CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuDevicePrimaryCtxSetFlags(device, flags));
}
}
}
former_context = CUDADriver::CurrentContextOrDie();
- res = cuDevicePrimaryCtxRetain(&new_context, device);
+ res = dynload::cuDevicePrimaryCtxRetain(&new_context, device);
if (former_context != nullptr) {
CUdevice former_device;
- if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
+ if (dynload::cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
if (former_device == device) {
if (former_context == new_context) {
VLOG(2) << "The primary context " << former_context
@@ -544,10 +650,10 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
<< "creating context when one is currently active; existing: "
<< former_context;
}
- res = cuCtxCreate(&new_context, flags, device);
+ res = dynload::cuCtxCreate(&new_context, flags, device);
#endif
}
- CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));
+ CHECK_EQ(CUDA_SUCCESS, dynload::cuCtxSetCurrent(former_context));
if (res == CUDA_SUCCESS) {
*context = CreatedContexts::Add(new_context);
@@ -580,14 +686,14 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
}
#if CUDA_VERSION >= 7000
CUcontext former_context = CurrentContext();
- CUresult res = cuCtxSetCurrent(context->context());
+ CUresult res = dynload::cuCtxSetCurrent(context->context());
CUdevice device;
- cuCtxGetDevice(&device);
- cuCtxSetCurrent(former_context);
+ dynload::cuCtxGetDevice(&device);
+ dynload::cuCtxSetCurrent(former_context);
- res = cuDevicePrimaryCtxRelease(device);
+ res = dynload::cuDevicePrimaryCtxRelease(device);
#else
- CUresult res = cuCtxDestroy(context->context());
+ CUresult res = dynload::cuCtxDestroy_v2(context->context());
#endif
if (res != CUDA_SUCCESS) {
@@ -600,7 +706,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute,
CUfunction func,
int *attribute_value) {
- CUresult res = cuFuncGetAttribute(attribute_value, attribute, func);
+ CUresult res = dynload::cuFuncGetAttribute(attribute_value, attribute, func);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query kernel attribute. kernel: " << func
<< ", attribute: " << attribute;
@@ -611,7 +717,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function,
CUfunc_cache cache_config) {
- CUresult res = cuFuncSetCacheConfig(function, cache_config);
+ CUresult res = dynload::cuFuncSetCacheConfig(function, cache_config);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function
<< ", config: " << cache_config << ", result: " << ToString(res);
@@ -625,10 +731,10 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUsharedconfig shared_mem_config;
ScopedActivateContext activation{context};
- CUresult result = cuCtxGetSharedMemConfig(&shared_mem_config);
+ CUresult result = dynload::cuCtxGetSharedMemConfig(&shared_mem_config);
if (result != CUDA_SUCCESS) {
CUdevice device;
- cuCtxGetDevice(&device);
+ dynload::cuCtxGetDevice(&device);
LOG(ERROR) << "failed to get CUDA device shared memory config. "
<< "Context device ID: " << device
<< ", result: " << ToString(result);
@@ -642,10 +748,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ port::Status CUDADriver::ContextSetSharedMemConfig(
CudaContext* context, CUsharedconfig shared_mem_config) {
ScopedActivateContext activation{context};
- CUresult result = cuCtxSetSharedMemConfig(shared_mem_config);
+ CUresult result = dynload::cuCtxSetSharedMemConfig(shared_mem_config);
if (result != CUDA_SUCCESS) {
CUdevice device;
- cuCtxGetDevice(&device);
+ dynload::cuCtxGetDevice(&device);
LOG(ERROR) << "failed to set CUDA device shared memory config. "
<< "Context device ID: " << device
<< ", config: " << shared_mem_config
@@ -668,9 +774,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
<< " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
<< " bdx: " << block_dim_x << " bdy: " << block_dim_y
<< " bdz: " << block_dim_z;
- CUresult res = cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z,
- block_dim_x, block_dim_y, block_dim_z,
- shared_mem_bytes, stream, kernel_params, extra);
+ CUresult res = dynload::cuLaunchKernel(
+ function, grid_dim_x, grid_dim_y, grid_dim_z,
+ block_dim_x, block_dim_y, block_dim_z,
+ shared_mem_bytes, stream, kernel_params, extra);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to launch CUDA kernel: " << function
<< "; result: " << ToString(res);
@@ -684,7 +791,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
const char *cubin_bytes,
CUmodule *module) {
ScopedActivateContext activation{context};
- CUresult result = cuModuleLoadFatBinary(module, cubin_bytes);
+ CUresult result = dynload::cuModuleLoadFatBinary(module, cubin_bytes);
if (result != CUDA_SUCCESS) {
return port::Status{port::error::INTERNAL,
"failed to load in-memory CUBIN: " + ToString(result)};
@@ -727,8 +834,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
// TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
// module loading: see http://b/13248943
- res = cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options), options,
- option_values);
+ res = dynload::cuModuleLoadDataEx(module, ptx_data, ARRAYSIZE(options),
+ options, option_values);
}
// The PTX JIT mutates the values in the option values array to reflect the
@@ -767,7 +874,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUdeviceptr location,
uint8 value, size_t size) {
ScopedActivateContext activation{context};
- CUresult res = cuMemsetD8(location, value, size);
+ CUresult res = dynload::cuMemsetD8_v2(location, value, size);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to memset memory: " << ToString(res);
return false;
@@ -780,7 +887,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
uint32 value,
size_t uint32_count) {
ScopedActivateContext activation{context};
- CUresult res = cuMemsetD32(location, value, uint32_count);
+ CUresult res = dynload::cuMemsetD32_v2(location, value, uint32_count);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to memset memory: " << ToString(res);
return false;
@@ -794,7 +901,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
size_t uint32_count,
CUstream stream) {
ScopedActivateContext activation{context};
- CUresult res = cuMemsetD8Async(location, value, uint32_count, stream);
+ CUresult res =
+ dynload::cuMemsetD8Async(location, value, uint32_count, stream);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
return false;
@@ -809,7 +917,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
size_t uint32_count,
CUstream stream) {
ScopedActivateContext activation{context};
- CUresult res = cuMemsetD32Async(location, value, uint32_count, stream);
+ CUresult res =
+ dynload::cuMemsetD32Async(location, value, uint32_count, stream);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
return false;
@@ -823,7 +932,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
StreamCallback callback,
void *data) {
// Note: flags param is required to be zero according to CUDA 6.0.
- CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
+ CUresult res =
+ dynload::cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "unable to add host callback: " << ToString(res);
return false;
@@ -837,7 +947,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUfunction *function) {
ScopedActivateContext activated{context};
CHECK(module != nullptr && kernel_name != nullptr);
- CUresult res = cuModuleGetFunction(function, module, kernel_name);
+ CUresult res = dynload::cuModuleGetFunction(function, module, kernel_name);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
<< "\" from module: " << ToString(res);
@@ -855,7 +965,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
ScopedActivateContext activated{context};
CHECK(module != nullptr && symbol_name != nullptr &&
(dptr != nullptr || bytes != nullptr));
- CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
+ CUresult res =
+ dynload::cuModuleGetGlobal_v2(dptr, bytes, module, symbol_name);
if (res != CUDA_SUCCESS) {
// symbol may not be found in the current module, but it may reside in
// another module.
@@ -870,7 +981,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ void CUDADriver::UnloadModule(CudaContext *context,
CUmodule module) {
ScopedActivateContext activated{context};
- CUresult res = cuModuleUnload(module);
+ CUresult res = dynload::cuModuleUnload(module);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to unload module " << module
<< "; leaking: " << ToString(res);
@@ -881,7 +992,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CudaContext* context) {
ScopedActivateContext activated{context};
CUdevice device = -1;
- CUresult result = cuCtxGetDevice(&device);
+ CUresult result = dynload::cuCtxGetDevice(&device);
if (result == CUDA_SUCCESS) {
return device;
}
@@ -897,7 +1008,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
// up synchronization with respect to memsets and any other things that have
// to occur on the default stream?
ScopedActivateContext activated{context};
- CUresult res = cuStreamCreate(out, 0);
+ CUresult res = dynload::cuStreamCreate(out, 0);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "could not allocate CUDA stream for context " << context
<< ": " << ToString(res);
@@ -916,7 +1027,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
ScopedActivateContext activated{context};
- CUresult res = cuStreamDestroy(*stream);
+ CUresult res = dynload::cuStreamDestroy_v2(*stream);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to destroy CUDA stream for context " << context
<< ": " << ToString(res);
@@ -931,7 +1042,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
uint64 bytes) {
ScopedActivateContext activated{context};
CUdeviceptr result = 0;
- CUresult res = cuMemAlloc(&result, bytes);
+ CUresult res = dynload::cuMemAlloc_v2(&result, bytes);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to allocate "
<< port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
@@ -948,7 +1059,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
void *location) {
ScopedActivateContext activation{context};
CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location);
- CUresult res = cuMemFree(pointer);
+ CUresult res = dynload::cuMemFree_v2(pointer);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to free device memory at " << location
<< "; result: " << ToString(res);
@@ -962,7 +1073,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
ScopedActivateContext activation{context};
void *host_mem = nullptr;
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
- CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
+ CUresult res =
+ dynload::cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to alloc " << bytes
<< " bytes on host: " << ToString(res);
@@ -973,7 +1085,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ void CUDADriver::HostDeallocate(CudaContext* context,
void *location) {
ScopedActivateContext activation{context};
- CUresult res = cuMemFreeHost(location);
+ CUresult res = dynload::cuMemFreeHost(location);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "error deallocating host memory at " << location << ": "
<< ToString(res);
@@ -985,7 +1097,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
ScopedActivateContext activation{context};
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
CUresult res =
- cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
+ dynload::cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "error registering host memory at " << location << ": "
<< ToString(res);
@@ -997,7 +1109,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ bool CUDADriver::HostUnregister(CudaContext* context,
void *location) {
ScopedActivateContext activation{context};
- CUresult res = cuMemHostUnregister(location);
+ CUresult res = dynload::cuMemHostUnregister(location);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "error unregistering host memory at " << location << ": "
<< ToString(res);
@@ -1014,7 +1126,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
ScopedActivateContext activated{context};
- CUresult res = cuEventDestroy(*event);
+ CUresult res = dynload::cuEventDestroy_v2(*event);
*event = nullptr;
switch (res) {
@@ -1038,7 +1150,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUevent event,
CUstream stream) {
ScopedActivateContext activated{context};
- CUresult res = cuEventRecord(event, stream);
+ CUresult res = dynload::cuEventRecord(event, stream);
switch (res) {
case CUDA_SUCCESS:
return port::Status::OK();
@@ -1059,7 +1171,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent(
CudaContext *context, CUevent event) {
ScopedActivateContext activated{context};
- CUresult res = cuEventQuery(event);
+ CUresult res = dynload::cuEventQuery(event);
if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
return port::Status{
port::error::INTERNAL,
@@ -1075,12 +1187,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
ScopedActivateContext activated{context};
// The stop event must have completed in order for cuEventElapsedTime to
// work.
- CUresult res = cuEventSynchronize(stop);
+ CUresult res = dynload::cuEventSynchronize(stop);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
return false;
}
- res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
+ res = dynload::cuEventElapsedTime(elapsed_milliseconds, start, stop);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to get elapsed time between events: "
<< ToString(res);
@@ -1094,7 +1206,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUstream stream,
CUevent event) {
ScopedActivateContext activation{context};
- CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
+ CUresult res = dynload::cuStreamWaitEvent(stream, event, 0 /* = flags */);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "could not wait stream on event: " << ToString(res);
return false;
@@ -1105,7 +1217,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ bool CUDADriver::SynchronizeContext(CudaContext* context) {
ScopedActivateContext activation{context};
- CUresult res = cuCtxSynchronize();
+ CUresult res = dynload::cuCtxSynchronize();
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
<< " :: " << port::CurrentStackTrace();
@@ -1119,7 +1231,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUstream stream) {
ScopedActivateContext activated{context};
CHECK(stream != nullptr);
- CUresult res = cuStreamSynchronize(stream);
+ CUresult res = dynload::cuStreamSynchronize(stream);
if (res != CUDA_SUCCESS) {
port::Status status = port::InternalError(
port::StrCat("could not synchronize on CUDA stream: ", ToString(res)));
@@ -1135,7 +1247,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUstream stream) {
ScopedActivateContext activated{context};
CHECK(stream != nullptr);
- CUresult res = cuStreamQuery(stream);
+ CUresult res = dynload::cuStreamQuery(stream);
if (res == CUDA_SUCCESS) {
return true;
}
@@ -1151,7 +1263,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUdeviceptr gpu_src,
uint64 size) {
ScopedActivateContext activation{context};
- CUresult res = cuMemcpyDtoH(host_dst, gpu_src, size);
+ CUresult res = dynload::cuMemcpyDtoH_v2(host_dst, gpu_src, size);
if (res != CUDA_SUCCESS) {
return port::InternalError(
port::Printf("failed to synchronous memcpy from device to host: %s; "
@@ -1169,7 +1281,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
const void *host_src,
uint64 size) {
ScopedActivateContext activation{context};
- CUresult res = cuMemcpyHtoD(gpu_dst, host_src, size);
+ CUresult res = dynload::cuMemcpyHtoD_v2(gpu_dst, host_src, size);
if (res != CUDA_SUCCESS) {
return port::InternalError(port::Printf(
"failed to synchronous memcpy from host to device: %s; GPU dst: %p;"
@@ -1186,7 +1298,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUdeviceptr gpu_src,
uint64 size) {
ScopedActivateContext activation{context};
- CUresult res = cuMemcpyDtoD(gpu_dst, gpu_src, size);
+ CUresult res = dynload::cuMemcpyDtoD_v2(gpu_dst, gpu_src, size);
if (res != CUDA_SUCCESS) {
return port::InternalError(port::Printf(
"failed to synchronous memcpy from host to device: %s; GPU dst: %p; "
@@ -1204,7 +1316,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
uint64 size,
CUstream stream) {
ScopedActivateContext activation{context};
- CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
+ CUresult res = dynload::cuMemcpyDtoHAsync_v2(host_dst, gpu_src, size, stream);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << port::Printf(
"failed to enqueue async memcpy from device to host: %s; host dst: %p; "
@@ -1224,7 +1336,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
uint64 size,
CUstream stream) {
ScopedActivateContext activation{context};
- CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
+ CUresult res = dynload::cuMemcpyHtoDAsync_v2(gpu_dst, host_src, size, stream);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << port::Printf(
"failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
@@ -1243,7 +1355,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
uint64 size,
CUstream stream) {
ScopedActivateContext activation{context};
- CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
+ CUresult result =
+ dynload::cuMemcpyDtoDAsync_v2(gpu_dst, gpu_src, size, stream);
if (result != CUDA_SUCCESS) {
LOG(ERROR) << port::Printf(
"failed to enqueue async memcpy from device to device: %s"
@@ -1279,7 +1392,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
ScopedActivateContext activated{context};
- CUresult res = cuEventCreate(result, cuflags);
+ CUresult res = dynload::cuEventCreate(result, cuflags);
if (res == CUDA_SUCCESS) {
return port::Status::OK();
@@ -1295,7 +1408,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ int CUDADriver::GetDeviceCount() {
int device_count = 0;
- CUresult res = cuDeviceGetCount(&device_count);
+ CUresult res = dynload::cuDeviceGetCount(&device_count);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
return 0;
@@ -1310,8 +1423,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ port::StatusOr<CudaContext*> CUDADriver::GetPointerContext(
CUdeviceptr pointer) {
CudaContext* context = nullptr;
- CUresult result =
- cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
+ CUresult result = dynload::cuPointerGetAttribute(
+ &context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
if (result == CUDA_SUCCESS) {
CHECK(context != nullptr) << "success should entail non-null context";
return context;
@@ -1326,8 +1439,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace(
CUdeviceptr pointer) {
unsigned int value;
- CUresult result =
- cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
+ CUresult result = dynload::cuPointerGetAttribute(
+ &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
if (result == CUDA_SUCCESS) {
switch (value) {
case CU_MEMORYTYPE_DEVICE:
@@ -1350,7 +1463,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
/* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr,
CUdeviceptr *base,
size_t *size) {
- CUresult result = cuMemGetAddressRange(base, size, dptr);
+ CUresult result = dynload::cuMemGetAddressRange(base, size, dptr);
if (result == CUDA_SUCCESS) {
return port::Status::OK();
} else if (result == CUDA_ERROR_NOT_FOUND) {
@@ -1384,7 +1497,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
CUdevice device) {
*cc_major = 0;
*cc_minor = 0;
- CUresult result = cuDeviceComputeCapability(cc_major, cc_minor, device);
+ CUresult result =
+ dynload::cuDeviceComputeCapability(cc_major, cc_minor, device);
if (result == CUDA_SUCCESS) {
return port::Status::OK();
}
@@ -1401,7 +1515,7 @@ template <typename T>
static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
CUdevice_attribute attribute) {
int value = -1;
- CUresult result = cuDeviceGetAttribute(&value, attribute, device);
+ CUresult result = dynload::cuDeviceGetAttribute(&value, attribute, device);
if (result != CUDA_SUCCESS) {
return port::Status{
port::error::NOT_FOUND,
@@ -1456,24 +1570,24 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z,
CUdevice device) {
int value;
- CUresult res =
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
+ CUresult res = dynload::cuDeviceGetAttribute(
+ &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
return false;
}
*x = value;
- res =
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
+ res = dynload::cuDeviceGetAttribute(
+ &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
return false;
}
*y = value;
- res =
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
+ res = dynload::cuDeviceGetAttribute(
+ &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
return false;
@@ -1483,7 +1597,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
}
/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) {
- CUresult res = cuDriverGetVersion(driver_version);
+ CUresult res = dynload::cuDriverGetVersion(driver_version);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query driver version: " << ToString(res);
return false;
@@ -1494,7 +1608,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
/* static */ bool CUDADriver::GetDeviceProperties(CUdevprop *device_properties,
int device_ordinal) {
- CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
+ CUresult res =
+ dynload::cuDeviceGetProperties(device_properties, device_ordinal);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query device properties: " << ToString(res);
return false;
@@ -1505,8 +1620,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
int value = -1;
- CUresult res =
- cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
+ CUresult res = dynload::cuDeviceGetAttribute(
+ &value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query ECC status: " << ToString(res);
return false;
@@ -1522,7 +1637,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
ScopedActivateContext activation{context};
size_t free = 0;
size_t total = 0;
- CUresult res = cuMemGetInfo(&free, &total);
+ CUresult res = dynload::cuMemGetInfo_v2(&free, &total);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query device memory info: " << ToString(res);
return false;
@@ -1536,7 +1651,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device,
uint64 *result) {
size_t value = -1;
- CUresult res = cuDeviceTotalMem(&value, device);
+ CUresult res = dynload::cuDeviceTotalMem_v2(&value, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query total available memory: " << ToString(res);
return false;
@@ -1551,7 +1666,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
static const int kBufferSize = 64;
port::InlinedVector<char, 4> chars(kBufferSize);
chars[kBufferSize - 1] = '\0';
- CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
+ CUresult res =
+ dynload::cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
return pci_bus_id;
@@ -1579,7 +1695,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
<< to_device.status();
return false;
}
- CUresult res = cuDeviceCanAccessPeer(
+ CUresult res = dynload::cuDeviceCanAccessPeer(
&can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie());
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
@@ -1596,7 +1712,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
}
ScopedActivateContext activated{from};
- CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
+ CUresult result =
+ dynload::cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
if (result != CUDA_SUCCESS &&
result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
return port::Status{
@@ -1614,7 +1731,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
ScopedActivateContext activation{context};
int max_blocks;
- CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
+ CUresult result = dynload::cuOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes);
if (result != CUDA_SUCCESS) {
return port::Status{
@@ -1628,7 +1745,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
/* static */ CUcontext CUDADriver::CurrentContextOrDie() {
CUcontext current = nullptr;
- CUresult result = cuCtxGetCurrent(&current);
+ CUresult result = dynload::cuCtxGetCurrent(&current);
if (result != CUDA_SUCCESS) {
LOG(FATAL) << "failed to query current context: " << ToString(result);
}
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 4bbd531..e840e6c 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -24,7 +24,9 @@ limitations under the License.
#else
#include <unistd.h>
#endif
+#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
+#include "tensorflow/stream_executor/dso_loader.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_event.h"
#include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
@@ -1159,6 +1161,19 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
namespace gpu = ::perftools::gputools;
void initialize_cuda_gpu_executor() {
+ port::StatusOr<void *> status =
+ gpu::internal::CachedDsoLoader::GetLibcudaDsoHandle();
+ if (!status.ok()) {
+ gpu::cuda::Diagnostician::LogDriverVersionInformation();
+ LOG(INFO) << "LD_LIBRARY_PATH: " << getenv("LD_LIBRARY_PATH");
+ LOG(INFO) << "failed to find libcuda.so on this system: "
+ << status.status();
+ }
+
+ // TODO(b/22689637): Temporary until users are migrated off of PlatformKind.
+ gpu::PluginRegistry::Instance()->MapPlatformKindToId(
+ gpu::PlatformKind::kCuda, gpu::cuda::kCudaPlatformId);
+
*gpu::internal::MakeCUDAExecutorImplementation() = [](
const gpu::PluginConfig &config) {
return new gpu::cuda::CUDAExecutor{config};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment