Last active
July 8, 2020 01:48
-
-
Save dusty-nv/8a8bbf52d4f0c0999d07436484bb2988 to your computer and use it in GitHub Desktop.
PyTorch patch for building on JetPack 4.2 / 4.3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index e48c020b03..0ecc111c4b 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
 void initDeviceProperty(DeviceIndex device_index) {
   cudaDeviceProp device_prop;
   AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+  // patch for "too many resources requested for launch"
+  device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
   device_properties[device_index] = device_prop;
 }
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
   for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
 // Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
 // CUDA: number of blocks for threads.
 inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..cec1fa2698 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
 "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
 // Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+const int CUDA_NUM_THREADS = 512;
 // CUDA: number of blocks for threads.
 inline int GET_BLOCKS(const int N)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment