@dusty-nv
Last active April 15, 2024 18:31
PyTorch patches for building on JetPack >= 4.4
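This gist holds one patch file per PyTorch release (the comments below reference names such as pytorch-1.9-jetpack-4.5.1.patch); the diffs that follow are the concatenated contents of those files, which is why several hunks repeat with different blob hashes. Assuming you have checked out the matching PyTorch source tree, each patch is applied from its root, e.g. with git apply pytorch-1.9-jetpack-4.5.1.patch.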
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
index 2aac442d21..f2321dad7a 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
@@ -26,6 +26,9 @@ namespace {
// Most likely we will do aarch32 support with inline asm.
#if defined(__aarch64__)
+// See https://github.com/pytorch/pytorch/issues/47098
+#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
+
#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
#endif
@@ -713,6 +716,7 @@ Vectorized<float> inline fmadd(const Vectorized<float>& a, const Vectorized<floa
return Vectorized<float>(r0, r1);
}
+#endif /* defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) */
#endif /* defined(aarch64) */
}}}
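The guard above compiles the NEON-vectorized float path only under clang or a GCC newer than 8.3, because the GCC 7/8 toolchains that ship with JetPack 4 miscompile these intrinsics (see pytorch/pytorch#47098). A minimal standalone sketch (not part of the patch) to see which path your toolchain would take:

// check_neon_gate.cpp -- standalone sketch; build with your JetPack compiler
#include <cstdio>

int main() {
#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
  std::printf("vectorized NEON path would be compiled\n");
#else
  std::printf("NEON path disabled: GCC <= 8.3 workaround is active\n");
#endif
  return 0;
}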
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 1751128f1a..03e74f5ac2 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
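Halving the reported maxThreadsPerBlock makes PyTorch size its launches for 512 threads per block, which is the fix for "too many resources requested for launch" on Jetson: a kernel's per-thread register count times the block size must fit in the SM's register file. A back-of-envelope sketch with assumed numbers (a 64K-entry register file is typical of the sm_53/sm_62/sm_72 Jetson GPUs; the per-thread register count is hypothetical):

// register_budget.cpp -- illustrative arithmetic only, not from the patch
#include <cstdio>

int main() {
  const long regs_per_block_limit = 65536;  // 64K 32-bit registers per SM
  const long regs_per_thread = 80;          // hypothetical kernel usage
  const long thread_counts[] = {1024, 512};
  for (long threads : thread_counts) {
    long needed = regs_per_thread * threads;
    std::printf("%4ld threads/block -> %6ld registers (%s)\n", threads, needed,
                needed <= regs_per_block_limit ? "fits" : "launch fails");
  }
  return 0;
}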
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 91a61b04b8..5e9c128eed 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,9 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) {
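CUDA_NUM_THREADS feeds GET_BLOCKS, which rounds the element count up so that blocks * threads covers every element; lowering the constant simply trades smaller blocks for a wider grid. A minimal sketch of the arithmetic:

// get_blocks_demo.cpp -- mirrors the patched constants
#include <cstdint>
#include <cstdio>

constexpr int CUDA_NUM_THREADS = 512;

inline int GET_BLOCKS(const int64_t N) {
  return (int)((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
}

int main() {
  // a 1,000,000-element launch: 1954 blocks of 512 threads
  // (versus 977 blocks of 1024 threads before the patch)
  std::printf("%d\n", GET_BLOCKS(1000000));
  return 0;
}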
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 00e6d5d45e..2ee832051a 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -1595,7 +1595,7 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
])
supported_arches = ['3.5', '3.7', '5.0', '5.2', '5.3', '6.0', '6.1', '6.2',
- '7.0', '7.2', '7.5', '8.0', '8.6']
+ '7.0', '7.2', '7.5', '8.0', '8.6', '8.7']
valid_arch_strings = supported_arches + [s + "+PTX" for s in supported_arches]
# The default is sm_30 for CUDA 9.x and 10.x
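Compute capability 8.7 is the Ampere-generation Jetson GPU (the Orin family); adding it to supported_arches lets torch.utils.cpp_extension emit sm_87 code when building extensions on those boards instead of rejecting the architecture string.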
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index e48c020b03..0ecc111c4b 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..1d3f5383d4 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/caffe2/operators/rnn/recurrent_op_cudnn.cc b/caffe2/operators/rnn/recurrent_op_cudnn.cc
index 8f69944dcf..3679c9d2a7 100644
--- a/caffe2/operators/rnn/recurrent_op_cudnn.cc
+++ b/caffe2/operators/rnn/recurrent_op_cudnn.cc
@@ -99,7 +99,7 @@ void RecurrentBaseOp<T>::initialize(
// RNN setup
{
#if CUDNN_VERSION_MIN(7, 0, 0)
- CUDNN_ENFORCE(cudnnSetRNNDescriptor(
+ CUDNN_ENFORCE(cudnnSetRNNDescriptor_v6(
cudnn_wrapper_.inline_cudnn_handle(),
rnnDesc_,
hiddenSize,
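cuDNN 8, which JetPack 4.4 and newer ship, drops the legacy cudnnSetRNNDescriptor entry point with this argument list; cudnnSetRNNDescriptor_v6 is the surviving (deprecated) spelling of the same call, so the rename keeps the Caffe2 RNN op compiling against the newer headers.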
diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
index a5c50b90df..22d738e0c1 100644
--- a/cmake/public/cuda.cmake
+++ b/cmake/public/cuda.cmake
@@ -147,7 +147,7 @@ endif()
# ---[ Extract versions
if(CAFFE2_USE_CUDNN)
# Get cuDNN version
- file(READ ${CUDNN_INCLUDE_PATH}/cudnn.h CUDNN_HEADER_CONTENTS)
+ file(READ ${CUDNN_INCLUDE_PATH}/cudnn_version.h CUDNN_HEADER_CONTENTS)
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_VERSION_MAJOR "${CUDNN_HEADER_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
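Likewise, cuDNN 8 moved the CUDNN_MAJOR/CUDNN_MINOR/CUDNN_PATCHLEVEL defines out of cudnn.h into the new cudnn_version.h, so without this change the version-extraction regexes above come up empty and CMake misdetects the cuDNN version.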
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..1d3f5383d4 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/caffe2/operators/rnn/recurrent_op_cudnn.cc b/caffe2/operators/rnn/recurrent_op_cudnn.cc
index 8f69944dcf..3679c9d2a7 100644
--- a/caffe2/operators/rnn/recurrent_op_cudnn.cc
+++ b/caffe2/operators/rnn/recurrent_op_cudnn.cc
@@ -99,7 +99,7 @@ void RecurrentBaseOp<T>::initialize(
// RNN setup
{
#if CUDNN_VERSION_MIN(7, 0, 0)
- CUDNN_ENFORCE(cudnnSetRNNDescriptor(
+ CUDNN_ENFORCE(cudnnSetRNNDescriptor_v6(
cudnn_wrapper_.inline_cudnn_handle(),
rnnDesc_,
hiddenSize,
diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
index 545d6cd924..235beaeda8 100644
--- a/cmake/public/cuda.cmake
+++ b/cmake/public/cuda.cmake
@@ -145,7 +145,7 @@ endif()
# ---[ Extract versions
if(CAFFE2_USE_CUDNN)
# Get cuDNN version
- file(READ ${CUDNN_INCLUDE_PATH}/cudnn.h CUDNN_HEADER_CONTENTS)
+ file(READ ${CUDNN_INCLUDE_PATH}/cudnn_version.h CUDNN_HEADER_CONTENTS)
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_VERSION_MAJOR "${CUDNN_HEADER_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..c50c5506ca 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index af788ff8f8..fb27ab808c 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -19,7 +19,10 @@ namespace at { namespace cuda { namespace detail {
for (int i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 61cd90cdd6..c50c5506ca 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N)
diff --git a/aten/src/ATen/cpu/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
index cfe6b0ea0f..d1e75ab9af 100644
--- a/aten/src/ATen/cpu/vec256/vec256_float_neon.h
+++ b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
@@ -25,6 +25,8 @@ namespace {
// https://bugs.llvm.org/show_bug.cgi?id=45824
// Most likely we will do aarch32 support with inline asm.
#if defined(__aarch64__)
+// See https://github.com/pytorch/pytorch/issues/47098
+#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
#endif
@@ -665,6 +667,7 @@ Vec256<float> inline fmadd(const Vec256<float>& a, const Vec256<float>& b, const
return Vec256<float>(r0, r1);
}
-#endif
+#endif /* defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) */
+#endif /* defined(aarch64) */
}}}
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index fd51cc45e7..e3be2fd3bc 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 45056ab996..81a0246ceb 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -22,7 +22,10 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N) {
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 69b7f3a4d3..85b0b1305f 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N)
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 1751128f1a..03e74f5ac2 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 45056ab996..81a0246ceb 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -22,7 +22,10 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N) {
diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
index 4e9c799986..12c1453073 100644
--- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@@ -24,7 +24,13 @@ using namespace vec256;
// copysign faster for the half-precision types
template<typename T>
T copysign(T a, T b) {
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
return std::copysign(a, b);
+#else
+ return std::signbit(b) ? -std::abs(a) : std::abs(a);
+#endif
}
// Implement copysign for half precision floats using bit ops
@@ -149,6 +155,18 @@ void div_trunc_kernel(TensorIterator& iter) {
}
}
+// this is a function because MSVC does not like us to use #if inside AT_DISPATCH
+template <typename scalar_t>
+static inline scalar_t signed_zero(scalar_t sign) {
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ return std::copysign(scalar_t(0), sign);
+#else
+ return std::signbit(sign) ? -scalar_t(0) : scalar_t(0);
+#endif
+}
+
// NOTE: [Floor Division in Python]
// Python's __floordiv__ operator is more complicated than just floor(a / b).
// It aims to maintain the property: a == (a // b) * b + remainder(a, b)
@@ -201,7 +219,7 @@ void div_floor_kernel(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
- floordiv = copysign(scalar_t(0), a / b);
+ floordiv = signed_zero(a / b);
}
return floordiv;
});
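The signed_zero/copysign fallback above avoids std::copysign, which ICEs or segfaults with GCC 7/8 on arm64 (PyTorch PR #51834), by deriving the sign from std::signbit instead. A standalone sketch (not from the patch) showing the two agree on the signed-zero case the floor-division kernel needs, on a toolchain where std::copysign itself works:

// signed_zero_check.cpp -- host-side comparison of the two spellings
#include <cmath>
#include <cstdio>

template <typename T>
static T signed_zero_fallback(T sign) {
  return std::signbit(sign) ? -T(0) : T(0);
}

int main() {
  const float samples[] = {2.5f, -2.5f, 0.0f, -0.0f};
  for (float s : samples) {
    float ref = std::copysign(0.0f, s);
    float alt = signed_zero_fallback(s);
    // signbit distinguishes -0.0 from +0.0, so both columns match
    std::printf("s=%+.1f  copysign=%+.1f  fallback=%+.1f\n", s, ref, alt);
  }
  return 0;
}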
diff --git a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
index e3ac2665a4..77e866b7f3 100644
--- a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
+++ b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
@@ -1,10 +1,11 @@
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
+#include <ATen/native/BinaryOps.h>
#include <ATen/native/DispatchStub.h>
-#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/TensorIterator.h>
-#include <ATen/native/BinaryOps.h>
#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAMathCompat.h>
+#include <ATen/native/cuda/Loops.cuh>
// NOTE: CUDA on Windows requires that the enclosing function
// of a __device__ lambda not have internal linkage.
@@ -139,7 +140,9 @@ void div_floor_kernel_cuda(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
- floordiv = std::copysign(scalar_t(0), a * inv_b);
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ floordiv = c10::cuda::compat::copysign(scalar_t(0), a * inv_b);
}
return floordiv;
});
@@ -160,7 +163,9 @@ void div_floor_kernel_cuda(TensorIterator& iter) {
floordiv += scalar_t(1.0);
}
} else {
- floordiv = std::copysign(scalar_t(0), a / b);
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ floordiv = c10::cuda::compat::copysign(scalar_t(0), a / b);
}
return floordiv;
});
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 69b7f3a4d3..85b0b1305f 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N)
diff --git a/c10/cuda/CUDAMathCompat.h b/c10/cuda/CUDAMathCompat.h
index 1fb0c3ec29..a4c6655859 100644
--- a/c10/cuda/CUDAMathCompat.h
+++ b/c10/cuda/CUDAMathCompat.h
@@ -42,11 +42,80 @@ __MATH_FUNCTIONS_DECL__ double ceil(double x) {
return ::ceil(x);
}
+__MATH_FUNCTIONS_DECL__ float fp32_from_bits(uint32_t w) {
+#if defined(__OPENCL_VERSION__)
+ return as_float(w);
+#elif defined(__CUDA_ARCH__)
+ return __uint_as_float((unsigned int)w);
+#elif defined(__INTEL_COMPILER)
+ return _castu32_f32(w);
+#else
+ union {
+ uint32_t as_bits;
+ float as_value;
+ } fp32 = {w};
+ return fp32.as_value;
+#endif
+}
+
+__MATH_FUNCTIONS_DECL__ uint32_t fp32_to_bits(float f) {
+#if defined(__OPENCL_VERSION__)
+ return as_uint(f);
+#elif defined(__CUDA_ARCH__)
+ return (uint32_t)__float_as_uint(f);
+#elif defined(__INTEL_COMPILER)
+ return _castf32_u32(f);
+#else
+ union {
+ float as_value;
+ uint32_t as_bits;
+ } fp32 = {f};
+ return fp32.as_bits;
+#endif
+}
+
+__MATH_FUNCTIONS_DECL__ double fp64_from_bits(uint64_t w) {
+#if defined(__CUDA_ARCH__)
+ return __longlong_as_double(w);
+#else
+ union {
+ uint64_t as_bits;
+ double as_value;
+ } fp64 = {w};
+ return fp64.as_value;
+#endif
+}
+
+__MATH_FUNCTIONS_DECL__ uint64_t fp64_to_bits(double f) {
+#if defined(__CUDA_ARCH__)
+ return __double_as_longlong(f);
+#else
+ union {
+ double as_value;
+ int64_t as_bits;
+ } fp64 = {f};
+ return fp64.as_bits;
+#endif
+}
+
__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) {
- return ::copysignf(x, y);
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ // std::copysign gets ICE/Segfaults with gcc 7/8 on arm64
+ // (e.g. Jetson), see PyTorch PR #51834
+ return ::copysignf(x, y);
+#else
+ return fp32_from_bits(
+ (fp32_to_bits(x) & 0x7fffffffu) | (fp32_to_bits(y) & 0x80000000u));
+#endif
}
__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) {
- return ::copysign(x, y);
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+ return ::copysign(x, y);
+#else
+ return fp64_from_bits(
+ (fp64_to_bits(x) & 0x7fffffffffffffffull) |
+ (fp64_to_bits(y) & 0x8000000000000000ull));
+#endif
}
__MATH_FUNCTIONS_DECL__ float floor(float x) {
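The fallback added above implements copysign directly on the IEEE-754 bit patterns: clear x's sign bit, OR in y's. A host-only sketch of the same masking (using memcpy for the type pun, an assumption of this sketch; the patch itself uses unions and CUDA intrinsics as shown):

// copysign_bits.cpp -- illustrative host-side version of the bit trick
#include <cstdint>
#include <cstdio>
#include <cstring>

static float copysign_bits(float x, float y) {
  uint32_t xb, yb;
  std::memcpy(&xb, &x, sizeof xb);
  std::memcpy(&yb, &y, sizeof yb);
  const uint32_t r = (xb & 0x7fffffffu)    // magnitude of x
                   | (yb & 0x80000000u);   // sign of y
  float out;
  std::memcpy(&out, &r, sizeof out);
  return out;
}

int main() {
  std::printf("%+.1f %+.1f\n",
              copysign_bits(3.0f, -1.0f),   // -3.0
              copysign_bits(-3.0f, 1.0f));  // +3.0
  return 0;
}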
diff --git a/aten/src/ATen/cpu/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
index 5f6134112c..15b311d4d6 100644
--- a/aten/src/ATen/cpu/vec256/vec256_float_neon.h
+++ b/aten/src/ATen/cpu/vec256/vec256_float_neon.h
@@ -26,6 +26,9 @@ namespace {
// Most likely we will do aarch32 support with inline asm.
#if defined(__aarch64__)
+// See https://github.com/pytorch/pytorch/issues/47098
+#if defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3))
+
#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
#endif
@@ -710,6 +713,7 @@ Vec256<float> inline fmadd(const Vec256<float>& a, const Vec256<float>& b, const
return Vec256<float>(r0, r1);
}
+#endif /* defined(__clang__) || (__GNUC__ > 8 || (__GNUC__ == 8 && __GNUC_MINOR__ > 3)) */
#endif /* defined(aarch64) */
}}}
diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 1751128f1a..03e74f5ac2 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
void initDeviceProperty(DeviceIndex device_index) {
cudaDeviceProp device_prop;
AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+ // patch for "too many resources requested for launch"
+ device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
device_properties[device_index] = device_prop;
}
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index e707e94cc1..4c6fcf3e5e 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -22,7 +22,10 @@ namespace at { namespace cuda { namespace detail {
// Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N) {
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 69b7f3a4d3..54455ab4b0 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int64_t N)
@rajb245 commented Aug 27, 2021

Is this idea of cutting the number of threads in half still needed with torch 1.8.1 / 1.8.2 and JetPack 4.6?

@rajb245 commented Aug 31, 2021

Never mind: the patch for 1.8 applies cleanly to 1.8.1, and the JetPack version doesn't seem to matter; at least this still works on the new 4.6.

@maaaxac commented Apr 15, 2024

Hi @dusty-nv, does pytorch-1.9-jetpack-4.5.1.patch work for JetPack 4.6?

@dusty-nv (Author) commented Apr 15, 2024

@maaaxac It's been a while since I built PyTorch for JetPack 4, but these patches were not really specific to individual versions of JetPack 4; rather, they track the version of PyTorch (1.9 in this case).

@maaaxac commented Apr 15, 2024

@dusty-nv Thank you.
