import torch

# CUDA kernel with inline PTX
kernel_source = """
__global__ void vector_add(const float* a, const float* b, float* c, int n) {
    int idx;
    asm("mov.u32 %0, %%ctaid.x;" : "=r"(idx));
    int tid;
    asm("mov.u32 %0, %%tid.x;" : "=r"(tid));
    int ntid;
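The gist is cut off mid-kernel above. Below is a self-contained sketch of the same idea, reading %ctaid.x, %ntid.x, and %tid.x through inline PTX instead of the usual blockIdx/blockDim/threadIdx built-ins, compiled with torch.utils.cpp_extension.load_inline. The continuation of the kernel, the wrapper function, the module name, and the launch configuration are assumptions for illustration, not the original gist's code.

import torch
from torch.utils.cpp_extension import load_inline

cuda_source = """
__global__ void vector_add(const float* a, const float* b, float* c, int n) {
    int idx;
    asm("mov.u32 %0, %%ctaid.x;" : "=r"(idx));   // blockIdx.x
    int tid;
    asm("mov.u32 %0, %%tid.x;" : "=r"(tid));     // threadIdx.x
    int ntid;
    asm("mov.u32 %0, %%ntid.x;" : "=r"(ntid));   // blockDim.x
    int i = idx * ntid + tid;                    // global element index (assumed continuation)
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

at::Tensor vector_add_cuda(at::Tensor a, at::Tensor b) {
    auto c = at::empty_like(a);
    int n = a.numel();
    int threads = 256;
    int blocks = (n + threads - 1) / threads;
    vector_add<<<blocks, threads>>>(a.data_ptr<float>(), b.data_ptr<float>(),
                                    c.data_ptr<float>(), n);
    return c;
}
"""

# load_inline declares vector_add_cuda in the generated C++ file and binds it to Python.
mod = load_inline(
    name="inline_ptx_vector_add",   # hypothetical module name
    cpp_sources="at::Tensor vector_add_cuda(at::Tensor a, at::Tensor b);",
    cuda_sources=cuda_source,
    functions=["vector_add_cuda"],
)

a = torch.randn(1024, device="cuda")
b = torch.randn(1024, device="cuda")
print(torch.allclose(mod.vector_add_cuda(a, b), a + b))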
pytorch_operator,base_name,overload,folder_name,is_mapped
aten._adaptive_avg_pool2d.default,_adaptive_avg_pool2d,default,_adaptive_avg_pool2d,True
aten._adaptive_avg_pool2d_backward.default,_adaptive_avg_pool2d_backward,default,_adaptive_avg_pool2d_backward,True
aten._cudnn_rnn.default,_cudnn_rnn,default,_cudnn_rnn,True
aten._log_softmax.default,_log_softmax,default,_log_softmax,True
aten._log_softmax_backward_data.default,_log_softmax_backward_data,default,_log_softmax_backward_data,True
aten._softmax.default,_softmax,default,_softmax,True
aten._softmax_backward_data.default,_softmax_backward_data,default,_softmax_backward_data,True
aten._sparse_coo_tensor_with_dims_and_tensors.default,_sparse_coo_tensor_with_dims_and_tensors,default,_sparse_coo_tensor_with_dims_and_tensors,True
aten._to_copy.default,_to_copy,default,_to_copy,True
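One straightforward way to consume a mapping CSV like this is to count how many ATen overloads are marked as mapped. The filename below is a placeholder; the column names come from the header above.

import csv

# "operator_mapping.csv" is a placeholder name for the file shown above.
with open("operator_mapping.csv") as f:
    rows = list(csv.DictReader(f))

mapped = [r["pytorch_operator"] for r in rows if r["is_mapped"] == "True"]
print(f"{len(mapped)}/{len(rows)} operators are mapped")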
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 938cb7dd97a..d3ac1369e6a 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -7443,6 +7443,57 @@ def reference_inputs_clone_contiguous(op, device, dtype, requires_grad, **kwargs
         yield SampleInput(a, kwargs={'memory_format': torch.channels_last_3d})
+def sample_inputs_copy(op_info, device, dtype, requires_grad, **kwargs):
+    """Sample inputs for copy and copy_ operations.
op_name | is_core | is_in_opinfo | is_in_torchbench
---|---|---|---
__and__ | No | No | No
__iand__ | No | No | No
__ilshift__ | No | No | No
__ior__ | No | No | No
__irshift__ | No | No | No
__ixor__ | No | No | No
__lshift__ | No | No | No
__or__ | No | No | No
__rshift__ | No | No | No
import torch
from torch import nn
from torch.distributed.tensor.placement_types import Replicate, Shard
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor
from torch.distributed.tensor.parallel import parallelize_module

def dist_print(*args, **kwargs):
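The gist stops at the dist_print signature. A plausible, self-contained continuation is sketched below: a rank-prefixed print helper plus a toy two-layer model parallelized over a 1-D device mesh. The helper body, the model, and the parallelization plan are assumptions for illustration, not the original gist's code.

from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel

def dist_print(*args, **kwargs):
    # Prefix each message with the calling rank so interleaved output stays readable.
    rank = dist.get_rank() if dist.is_initialized() else 0
    print(f"[rank {rank}]", *args, **kwargs)

def main():
    # Intended to be launched with torchrun, one GPU per rank on a single node.
    dist.init_process_group("nccl")
    torch.cuda.set_device(dist.get_rank())
    mesh = init_device_mesh("cuda", (dist.get_world_size(),))

    model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16)).cuda()
    # Shard the first linear column-wise and the second row-wise across the mesh.
    model = parallelize_module(model, mesh, {"0": ColwiseParallel(), "2": RowwiseParallel()})

    x = torch.randn(8, 16, device="cuda")
    out = model(x)
    dist_print("output shape:", out.shape)

    dist.destroy_process_group()

if __name__ == "__main__":
    main()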
import torch
from torch.utils.cpp_extension import _get_cuda_arch_flags

def test_fix():
    print("Testing CUDA arch flags fix...")
    user_arch_flags = ['-gencode=arch=compute_86,code=sm_86']
    result = _get_cuda_arch_flags(user_arch_flags)
    print(f"User provided: {user_arch_flags}")
# Stop all GPU monitoring services that block ncu
sudo systemctl stop nvidia-dcgm.service dynologd.service

# Verify they're stopped
sudo systemctl list-units --state=active | grep -E "(nvidia|dynolog)"

# Check GPU is clear
sudo lsof /dev/nvidia7 | grep -v python

# Now run ncu
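The snippet ends before the actual profiler invocation. A typical Nsight Compute command at this point might look like the following; the metric set, output name, and profiled script are placeholders, not the original commands.

sudo ncu --set full --target-processes all -o kernel_profile python benchmark_kernel.py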
""" | |
Limitations | |
1. Cannot do heavy templating, cannot use thrust for reductions | |
2. Cannot import any host includes | |
Thank you @malfet! | |
""" | |
import ctypes | |
import torch |
(pt) ➜ examples git:(msaroufim/noheader) ✗ python tensor_base_example.py
Clearing existing build directory: /home/marksaroufim/pytorch/examples/custom_extension_build
Created build directory: /home/marksaroufim/pytorch/examples/custom_extension_build
Compiling TensorBase CUDA extension with no_header=True...
Using /home/marksaroufim/.cache/torch_extensions/py310_cu124 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/marksaroufim/.cache/torch_extensions/py310_cu124/tensor_base_example/build.ninja...
Building extension module tensor_base_example...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/2] /home/marksaroufim/.conda/envs/pt/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -ccbin /home/marksaroufim/.conda/envs/pt/bin/x86_64-conda-linux-gnu-cc -DTORCH_EXTENSION_NAME=tensor_base_example -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYB
""" | |
Minimal example that: | |
- Only includes <ATen/core/Tensor.h> (for at::Tensor) | |
and <ATen/Functions.h> (for at::empty). | |
- Avoids <torch/extension.h> or <torch/types.h>. | |
- Uses <torch/csrc/utils/pybind.h> so PyBind can cast torch.Tensor <-> at::Tensor. | |
- Demonstrates a custom CUDA kernel that adds x + y + 1. | |
- Uses no_implicit_headers=True to reduce compile overhead. | |
""" |