chengscott/ait.patch

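## ait.patch

Patch for the AITemplate CUDA backend: it forces `op.tile_description.stages = 2` in `emit_instance` (conv2d/common.py) and adds `-DCUTLASS_NVCC_ARCHS=70` to the nvcc options of the `CUDA` and `FBCUDA` targets (target_def.py).
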
diff --git a/python/aitemplate/backend/cuda/conv2d/common.py b/python/aitemplate/backend/cuda/conv2d/common.py
index 8cf7fb2..ca13a72 100644
--- a/python/aitemplate/backend/cuda/conv2d/common.py
+++ b/python/aitemplate/backend/cuda/conv2d/common.py
@@ -501,6 +501,7 @@ def emit_instance(op):
         emiter = cutlass_lib.conv2d_operation.EmitConv2dWithBroadcastInstance()
     else:
         emiter = cutlass_lib.conv2d_operation.EmitConv2dInstance()
+    op.tile_description.stages = 2
     op_def = emiter.emit(op)
     return op_def

diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
index 81d883e..9134500 100644
--- a/python/aitemplate/backend/cuda/target_def.py
+++ b/python/aitemplate/backend/cuda/target_def.py
@@ -153,6 +153,7 @@ class CUDA(Target):
         options = [
             "-t=0",
             "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
+            "-DCUTLASS_NVCC_ARCHS=70",
             "-w",
             f"-gencode=arch=compute_{self._arch},code=[{','.join(code)}]",
             environ.get_compiler_opt_level(),
@@ -429,6 +430,7 @@ class FBCUDA(CUDA):
                     "-Xcompiler -fPIC",
                     "-Xcompiler -fvisibility=hidden",
                     "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
+                    "-DCUTLASS_NVCC_ARCHS=70",
                     "-w",
                     "--expt-relaxed-constexpr",
                     f"-gencode=arch=compute_{nvcc_arch},code=[sm_{nvcc_arch},compute_{nvcc_arch}]",
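(Note: the lines below are a whitespace-mangled duplicate of the patch above, with indentation and diff prefixes lost; use the first copy when applying.)
	diff --git a/python/aitemplate/backend/cuda/conv2d/common.py b/python/aitemplate/backend/cuda/conv2d/common.py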
	index 8cf7fb2..ca13a72 100644
	--- a/python/aitemplate/backend/cuda/conv2d/common.py
	+++ b/python/aitemplate/backend/cuda/conv2d/common.py
	@@ -501,6 +501,7 @@ def emit_instance(op):
	emiter = cutlass_lib.conv2d_operation.EmitConv2dWithBroadcastInstance()
	else:
	emiter = cutlass_lib.conv2d_operation.EmitConv2dInstance()
	+ op.tile_description.stages = 2
	op_def = emiter.emit(op)
	return op_def

	diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
	index 81d883e..9134500 100644
	--- a/python/aitemplate/backend/cuda/target_def.py
	+++ b/python/aitemplate/backend/cuda/target_def.py
	@@ -153,6 +153,7 @@ class CUDA(Target):
	options = [
	"-t=0",
	"-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
	+ "-DCUTLASS_NVCC_ARCHS=70",
	"-w",
	f"-gencode=arch=compute_{self._arch},code=[{','.join(code)}]",
	environ.get_compiler_opt_level(),
	@@ -429,6 +430,7 @@ class FBCUDA(CUDA):
	"-Xcompiler -fPIC",
	"-Xcompiler -fvisibility=hidden",
	"-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
	+ "-DCUTLASS_NVCC_ARCHS=70",
	"-w",
	"--expt-relaxed-constexpr",
	f"-gencode=arch=compute_{nvcc_arch},code=[sm_{nvcc_arch},compute_{nvcc_arch}]",