@dusty-nv
Last active April 15, 2024 18:31
PyTorch patches for building on JetPack >= 4.4
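This gist holds one patch file per PyTorch release (the comments below reference names such as pytorch-1.9-jetpack-4.5.1.patch); the diffs that follow are the concatenated contents of those files, which is why several hunks repeat with different blob hashes. Assuming you have checked out the matching PyTorch source tree, each patch is applied from its root, e.g. with git apply pytorch-1.9-jetpack-4.5.1.patch.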
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
index 2aac442d21..f2321dad7a 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
@@ -26,6 +26,9 @@ namespace {
// Most likely we will do aarch32 support with inline asm.
#if defined(__aarch64__)
+// See https://github.com/pytorch/pytorch/issues/47098
+#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
+
#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
#endif
@@ -713,6 +716,7 @@ Vectorized<float> inline fmadd(const Vectorized<float>& a, const Vectorized<floa
return Vectorized<float>(r0, r1);
}
+#endif /* defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) */
#endif /* defined(aarch64) */
}}}
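The guard above compiles the NEON-vectorized float path only under clang or a GCC newer than 8.3, because the GCC 7/8 toolchains that ship with JetPack 4 miscompile these intrinsics (see pytorch/pytorch#47098). A minimal standalone sketch (not part of the patch) to see which path your toolchain would take:

// check_neon_gate.cpp -- standalone sketch; build with your JetPack compiler
#include <cstdio>

int main() {
#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
  std::printf("vectorized NEON path would be compiled\n");
#else
  std::printf("NEON path disabled: GCC <= 8.3 workaround is active\n");
#endif
  return 0;
}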
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 1751128f1a..03e74f5ac2 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
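Halving the reported maxThreadsPerBlock makes PyTorch size its launches for 512 threads per block, which is the fix for "too many resources requested for launch" on Jetson: a kernel's per-thread register count times the block size must fit in the SM's register file. A back-of-envelope sketch with assumed numbers (a 64K-entry register file is typical of the sm_53/sm_62/sm_72 Jetson GPUs; the per-thread register count is hypothetical):

// register_budget.cpp -- illustrative arithmetic only, not from the patch
#include <cstdio>

int main() {
  const long regs_per_block_limit = 65536;  // 64K 32-bit registers per SM
  const long regs_per_thread = 80;          // hypothetical kernel usage
  const long thread_counts[] = {1024, 512};
  for (long threads : thread_counts) {
    long needed = regs_per_thread * threads;
    std::printf("%4ld threads/block -> %6ld registers (%s)\n", threads, needed,
                needed <= regs_per_block_limit ? "fits" : "launch fails");
  }
  return 0;
}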
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 91a61b04b8..5e9c128eed 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,9 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) {
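CUDA_NUM_THREADS feeds GET_BLOCKS, which rounds the element count up so that blocks * threads covers every element; lowering the constant simply trades smaller blocks for a wider grid. A minimal sketch of the arithmetic:

// get_blocks_demo.cpp -- mirrors the patched constants
#include <cstdint>
#include <cstdio>

constexpr int CUDA_NUM_THREADS = 512;

inline int GET_BLOCKS(const int64_t N) {
  return (int)((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
}

int main() {
  // a 1,000,000-element launch: 1954 blocks of 512 threads
  // (versus 977 blocks of 1024 threads before the patch)
  std::printf("%d\n", GET_BLOCKS(1000000));
  return 0;
}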
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 00e6d5d45e..2ee832051a 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -1595,7 +1595,7 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
])
supported_arches = ['3.5', '3.7', '5.0', '5.2', '5.3', '6.0', '6.1', '6.2',
- '7.0', '7.2', '7.5', '8.0', '8.6']
+ '7.0', '7.2', '7.5', '8.0', '8.6', '8.7']
valid_arch_strings = supported_arches + [s + "+PTX" for s in supported_arches]
# The default is sm_30 for CUDA 9.x and 10.x
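Compute capability 8.7 is the Ampere-generation Jetson GPU (the Orin family); adding it to supported_arches lets torch.utils.cpp_extension emit sm_87 code when building extensions on those boards instead of rejecting the architecture string.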
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index e48c020b03..0ecc111c4b 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..1d3f5383d4 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/caffe2/operators/rnn/recurrent_op_cudnn.cc b/caffe2/operators/rnn/recurrent_op_cudnn.cc
index 8f69944dcf..3679c9d2a7 100644
--- a/caffe2/operators/rnn/recurrent_op_cudnn.cc
+++ b/caffe2/operators/rnn/recurrent_op_cudnn.cc
@@ -99,7 +99,7 @@ void RecurrentBaseOp<T>::initialize(
// RNN setup
{
#if CUDNN_VERSION_MIN(7, 0, 0)
- CUDNN_ENFORCE(cudnnSetRNNDescriptor(
+ CUDNN_ENFORCE(cudnnSetRNNDescriptor_v6(
cudnn_wrapper_.inline_cudnn_handle(),
rnnDesc_,
hiddenSize,
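cuDNN 8, which JetPack 4.4 and newer ship, drops the legacy cudnnSetRNNDescriptor entry point with this argument list; cudnnSetRNNDescriptor_v6 is the surviving (deprecated) spelling of the same call, so the rename keeps the Caffe2 RNN op compiling against the newer headers.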
diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
index a5c50b90df..22d738e0c1 100644
--- a/cmake/public/cuda.cmake
+++ b/cmake/public/cuda.cmake
@@ -147,7 +147,7 @@ endif()
# ---[ Extract versions
if(CAFFE2_USE_CUDNN)
# Get cuDNN version
- file(READ ${CUDNN_INCLUDE_PATH}/cudnn.h CUDNN_HEADER_CONTENTS)
+ file(READ ${CUDNN_INCLUDE_PATH}/cudnn_version.h CUDNN_HEADER_CONTENTS)
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_VERSION_MAJOR "${CUDNN_HEADER_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
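Likewise, cuDNN 8 moved the CUDNN_MAJOR/CUDNN_MINOR/CUDNN_PATCHLEVEL defines out of cudnn.h into the new cudnn_version.h, so without this change the version-extraction regexes above come up empty and CMake misdetects the cuDNN version.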
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..1d3f5383d4 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/caffe2/operators/rnn/recurrent_op_cudnn.cc b/caffe2/operators/rnn/recurrent_op_cudnn.cc
index 8f69944dcf..3679c9d2a7 100644
--- a/caffe2/operators/rnn/recurrent_op_cudnn.cc
+++ b/caffe2/operators/rnn/recurrent_op_cudnn.cc
@@ -99,7 +99,7 @@ void RecurrentBaseOp<T>::initialize(
// RNN setup
{
#if CUDNN_VERSION_MIN(7, 0, 0)
- CUDNN_ENFORCE(cudnnSetRNNDescriptor(
+ CUDNN_ENFORCE(cudnnSetRNNDescriptor_v6(
cudnn_wrapper_.inline_cudnn_handle(),
rnnDesc_,
hiddenSize,
diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
index 545d6cd924..235beaeda8 100644
--- a/cmake/public/cuda.cmake
+++ b/cmake/public/cuda.cmake
@@ -145,7 +145,7 @@ endif()
# ---[ Extract versions
if(CAFFE2_USE_CUDNN)
# Get cuDNN version
- file(READ ${CUDNN_INCLUDE_PATH}/cudnn.h CUDNN_HEADER_CONTENTS)
+ file(READ ${CUDNN_INCLUDE_PATH}/cudnn_version.h CUDNN_HEADER_CONTENTS)
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_VERSION_MAJOR "${CUDNN_HEADER_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..c50c5506ca 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..c50c5506ca 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/ATen/cpu/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
index cfe6b0ea0f..d1e75ab9af 100644
--- a/aten/src/ATen/cpu/vec256/vec256_float_neon.h
+++ b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
@@ -25,6 +25,8 @@ namespace {
// https://bugs.llvm.org/show_bug.cgi?id=45824
// Most likely we will do aarch32 support with inline asm.
#if defined(__aarch64__)
+// See https://github.com/pytorch/pytorch/issues/47098
+#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
#endif
@@ -665,6 +667,7 @@ Vec256<float> inline fmadd(const Vec256<float>& a, const Vec256<float>& b, const
return Vec256<float>(r0, r1);
}
-#endif
+#endif /* defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) */
+#endif /* defined(aarch64) */
}}}
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 45056ab996..81a0246ceb 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -22,7 +22,10 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N) {
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 69b7f3a4d3..85b0b1305f 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N)
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 1751128f1a..03e74f5ac2 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 45056ab996..81a0246ceb 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -22,7 +22,10 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N) {
diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
index 4e9c799986..12c1453073 100644
--- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@@ -24,7 +24,13 @@ using namespace vec256;
// copysign faster for the half-precision types
template<typename T>
T copysign(T a, T b) {
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
return std::copysign(a, b);
+#else
+ return std::signbit(b) ? -std::abs(a) : std::abs(a);
+#endif
}
// Implement copysign for half precision floats using bit ops
@@ -149,6 +155,18 @@ void div_trunc_kernel(TensorIterator& iter) {
}
}
+// this is a function because MSVC does not like us to use #if inside AT_DISPATCH
+template <typename scalar_t>
+static inline scalar_t signed_zero(scalar_t sign) {
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ return std::copysign(scalar_t(0), sign);
+#else
+ return std::signbit(sign) ? -scalar_t(0) : scalar_t(0);
+#endif
+}
+
// NOTE: [Floor Division in Python]
// Python's __floordiv__ operator is more complicated than just floor(a / b).
// It aims to maintain the property: a == (a // b) * b + remainder(a, b)
@@ -201,7 +219,7 @@ void div_floor_kernel(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
- floordiv = copysign(scalar_t(0), a / b);
+ floordiv = signed_zero(a / b);
}
return floordiv;
});
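The signed_zero/copysign fallback above avoids std::copysign, which ICEs or segfaults with GCC 7/8 on arm64 (PyTorch PR #51834), by deriving the sign from std::signbit instead. A standalone sketch (not from the patch) showing the two agree on the signed-zero case the floor-division kernel needs, on a toolchain where std::copysign itself works:

// signed_zero_check.cpp -- host-side comparison of the two spellings
#include <cmath>
#include <cstdio>

template <typename T>
static T signed_zero_fallback(T sign) {
  return std::signbit(sign) ? -T(0) : T(0);
}

int main() {
  const float samples[] = {2.5f, -2.5f, 0.0f, -0.0f};
  for (float s : samples) {
    float ref = std::copysign(0.0f, s);
    float alt = signed_zero_fallback(s);
    // signbit distinguishes -0.0 from +0.0, so both columns match
    std::printf("s=%+.1f  copysign=%+.1f  fallback=%+.1f\n", s, ref, alt);
  }
  return 0;
}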
diff --git a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
index e3ac2665a4..77e866b7f3 100644
--- a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
+++ b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
@@ -1,10 +1,11 @@
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
+#include <ATen/native/BinaryOps.h>
#include <ATen/native/DispatchStub.h>
-#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/TensorIterator.h>
-#include <ATen/native/BinaryOps.h>
#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAMathCompat.h>
+#include <ATen/native/cuda/Loops.cuh>
// NOTE: CUDA on Windows requires that the enclosing function
// of a __device__ lambda not have internal linkage.
@@ -139,7 +140,9 @@ void div_floor_kernel_cuda(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
- floordiv = std::copysign(scalar_t(0), a * inv_b);
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ floordiv = c10::cuda::compat::copysign(scalar_t(0), a * inv_b);
}
return floordiv;
});
@@ -160,7 +163,9 @@ void div_floor_kernel_cuda(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
- floordiv = std::copysign(scalar_t(0), a / b);
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ floordiv = c10::cuda::compat::copysign(scalar_t(0), a / b);
}
return floordiv;
});
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 69b7f3a4d3..85b0b1305f 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N)
diff --git a/c10/cuda/CUDAMathCompat.h b/c10/cuda/CUDAMathCompat.h
index 1fb0c3ec29..a4c6655859 100644
--- a/c10/cuda/CUDAMathCompat.h
+++ b/c10/cuda/CUDAMathCompat.h
@@ -42,11 +42,80 @@ __MATH_FUNCTIONS_DECL__ double ceil(double x) {
return ::ceil(x);
}
+__MATH_FUNCTIONS_DECL__ float fp32_from_bits(uint32_t w) {
+#if defined(__OPENCL_VERSION__)
+ return as_float(w);
+#elif defined(__CUDA_ARCH__)
+ return __uint_as_float((unsigned int)w);
+#elif defined(__INTEL_COMPILER)
+ return _castu32_f32(w);
+#else
+ union {
+ uint32_t as_bits;
+ float as_value;
+ } fp32 = {w};
+ return fp32.as_value;
+#endif
+}
+
+__MATH_FUNCTIONS_DECL__ uint32_t fp32_to_bits(float f) {
+#if defined(__OPENCL_VERSION__)
+ return as_uint(f);
+#elif defined(__CUDA_ARCH__)
+ return (uint32_t)__float_as_uint(f);
+#elif defined(__INTEL_COMPILER)
+ return _castf32_u32(f);
+#else
+ union {
+ float as_value;
+ uint32_t as_bits;
+ } fp32 = {f};
+ return fp32.as_bits;
+#endif
+}
+
+__MATH_FUNCTIONS_DECL__ double fp64_from_bits(uint64_t w) {
+#if defined(__CUDA_ARCH__)
+ return __longlong_as_double(w);
+#else
+ union {
+ uint64_t as_bits;
+ double as_value;
+ } fp64 = {w};
+ return fp64.as_value;
+#endif
+}
+
+__MATH_FUNCTIONS_DECL__ uint64_t fp64_to_bits(double f) {
+#if defined(__CUDA_ARCH__)
+ return __double_as_longlong(f);
+#else
+ union {
+ double as_value;
+ int64_t as_bits;
+ } fp64 = {f};
+ return fp64.as_bits;
+#endif
+}
+
__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) {
- return ::copysignf(x, y);
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ return ::copysignf(x, y);
+#else
+ return fp32_from_bits(
+ (fp32_to_bits(x) & 0x7fffffffu) | (fp32_to_bits(y) & 0x80000000u));
+#endif
}
__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) {
- return ::copysign(x, y);
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ return ::copysign(x, y);
+#else
+ return fp64_from_bits(
+ (fp64_to_bits(x) & 0x7fffffffffffffffull) |
+ (fp64_to_bits(y) & 0x8000000000000000ull));
+#endif
}
__MATH_FUNCTIONS_DECL__ float floor(float x) {
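The fallback added above implements copysign directly on the IEEE-754 bit patterns: clear x's sign bit, OR in y's. A host-only sketch of the same masking (using memcpy for the type pun, an assumption of this sketch; the patch itself uses unions and CUDA intrinsics as shown):

// copysign_bits.cpp -- illustrative host-side version of the bit trick
#include <cstdint>
#include <cstdio>
#include <cstring>

static float copysign_bits(float x, float y) {
  uint32_t xb, yb;
  std::memcpy(&xb, &x, sizeof xb);
  std::memcpy(&yb, &y, sizeof yb);
  const uint32_t r = (xb & 0x7fffffffu)    // magnitude of x
                   | (yb & 0x80000000u);   // sign of y
  float out;
  std::memcpy(&out, &r, sizeof out);
  return out;
}

int main() {
  std::printf("%+.1f %+.1f\n",
              copysign_bits(3.0f, -1.0f),   // -3.0
              copysign_bits(-3.0f, 1.0f));  // +3.0
  return 0;
}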
diff --git a/aten/src/ATen/cpu/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
index 5f6134112c..15b311d4d6 100644
--- a/aten/src/ATen/cpu/vec256/vec256_float_neon.h
+++ b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
@@ -26,6 +26,9 @@ namespace {
// Most likely we will do aarch32 support with inline asm.
#if defined(__aarch64__)
+// See https://github.com/pytorch/pytorch/issues/47098
+#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
+
#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
#endif
@@ -710,6 +713,7 @@ Vec256<float> inline fmadd(const Vec256<float>& a, const Vec256<float>& b, const
return Vec256<float>(r0, r1);
}
+#endif /* defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) */
#endif /* defined(aarch64) */
}}}
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 1751128f1a..03e74f5ac2 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index e707e94cc1..4c6fcf3e5e 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -22,7 +22,10 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N) {
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 69b7f3a4d3..54455ab4b0 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N)
@rajb245 commented Aug 27, 2021

Is this idea of cutting the number of threads in half still needed with torch 1.8.1 / 1.8.2 and JetPack 4.6?

@rajb245 commented Aug 31, 2021

Never mind: the patch for 1.8 applies cleanly to 1.8.1, and the JetPack version doesn't seem to matter; at least this still works on the new 4.6.

@maaaxac commented Apr 15, 2024

Hi @dusty-nv, does pytorch-1.9-jetpack-4.5.1.patch work for JetPack 4.6?

@dusty-nv (Author) commented Apr 15, 2024

@maaaxac It's been a while since I built PyTorch for JetPack 4, but these patches were not really specific to individual versions of JetPack 4; rather, they track the version of PyTorch (1.9 in this case).

@maaaxac commented Apr 15, 2024

@dusty-nv Thank you.
