Skip to content

Instantly share code, notes, and snippets.

from __future__ import annotations
import os
import torch
import helion
import helion.language as hl
import triton
import triton.language as tl
from torch._inductor.runtime.triton_compat import libdevice
from helion.runtime import default_launcher as _default_launcher
(pytorch) [shunting@devgpu011.ldc3 ~/ws/helion (all-reduce)]$ TEST_WORLD_SIZE=4 DISTRIBUTED_TESTS_DEFAULT_TIMEOUT=3600 python test/test_distributed.py -k test_allreduce_bias_rmsnorm_kernel_name_two_shot_allreduce_bias_rmsnorm_kernel_autotuner_fixed
INFO: Started process 0 with pid 1557599
INFO: Started process 1 with pid 1557600
INFO: Started process 2 with pid 1557601
INFO: Started process 3 with pid 1557602
NCCL version 2.28.9+cuda12.9
Rank1: n_max_threads 384
ERROR: Caught exception:
Traceback (most recent call last):
File "/home/shunting/ws/pytorch/torch/testing/_internal/common_distributed.py", line 942, in run_test
Generation 2: exploring neighbors 50% ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 65/131 3.8 configs/sRank1: before the sync for acc run 203
Rank3: before the sync for acc run 203
Rank2: before the sync for acc run 203
Rank0: before the sync for acc run 203
Generation 2: exploring neighbors 50% ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 65/131 3.8 configs/sRank1: Running benchmark for helion.Config(block_sizes=[4], indexing=['pointer', 'tensor_descriptor', 'pointer', 'tensor_descriptor', 'pointer', 'pointer', 'tensor_descriptor', 'tensor_descriptor', 'tensor_descriptor', 'pointer', 'pointer', 'pointer', 'pointer', 'tensor_descriptor', 'tensor_descriptor', 'tensor_descriptor', 'tensor_descriptor', 'tensor_descriptor', 'tensor_descriptor', 'pointer', 'tensor_descriptor'], load_eviction_policies=['last', '', 'last', 'first', 'last', 'last', 'last', 'first', '', 'first', '', 'first', 'first'], maxnreg=256, num_sm_mult
diff --git a/torch/_inductor/codegen/simd.py b/torch/_inductor/codegen/simd.py
index 1dd9430baa8..7e961222d16 100644
--- a/torch/_inductor/codegen/simd.py
+++ b/torch/_inductor/codegen/simd.py
@@ -1672,6 +1672,9 @@ class SIMDScheduling(BaseScheduling):
)
converted_nodes = []
+ for subnode in node1.get_nodes():
+ subnode.cancel_reduction_split()
import torch
torch.set_default_device("cuda")
@torch.compile
def f(x, w):
y = torch.nn.functional.rms_norm(x, x.shape[-1:], weight=None)
return y * w
B, H, D = 32 * 1024, 2, 1024
Error: Failed to get process executable name. Check that the process is running.
Reason: No such file or directory (os error 2)
Reason: No such file or directory (os error 2)
(pytorch) [shunting@devgpu011.ldc3 ~/ws/helion (all-reduce)]$ CUDA_LAUNCH_BLOCKING=1 python test/test_distributed.py -k TestDistributed.test_matmul_reduce_scatter_autotuner_LFBOTreeSearch
INFO: Started process 0 with pid 2081469
INFO: Started process 1 with pid 2081470
INFO: Started process 2 with pid 2081471
INFO: Started process 3 with pid 2081472
NCCL version 2.28.9+cuda12.9
[0s] Starting autotuning process, this may take a while...
[0s] Autotune random seed: 3824896922
[rank0]:[W312 14:37:03.774057936 CUDASymmetricMemory.cu:804] Warning: Pointer not within any SymmetricMemory allocation, is the tensor allocated from SymmetricMemory? (function rendezvous)
[rank0]:[W312 14:37:03.774080630 CUDASymmetricMemory.cu:804] Warning: Pointer not within any SymmetricMemory allocation, is the tensor allocated from SymmetricMemory? (function rendezvous)
(pytorch) [shunting@devgpu011.ldc3 ~/ws/helion (all-reduce)]$ python test/test_distributed.py -k TestDistributed.test_matmul_reduce_scatter_autotuner_LFBOTreeSearch
INFO: Started process 0 with pid 3868749
INFO: Started process 1 with pid 3868753
INFO: Started process 2 with pid 3868754
INFO: Started process 3 with pid 3868755
NCCL version 2.28.9+cuda12.9
[0s] Starting autotuning process, this may take a while...
[0s] Autotune random seed: 3823185645
[rank0]:[W312 14:08:31.497898362 CUDASymmetricMemory.cu:804] Warning: Pointer not within any SymmetricMemory allocation, is the tensor allocated from SymmetricMemory? (function rendezvous)
[rank0]:[W312 14:08:31.497917882 CUDASymmetricMemory.cu:804] Warning: Pointer not within any SymmetricMemory allocation, is the tensor allocated from SymmetricMemory? (function rendezvous)
(pytorch) [shunting@devgpu011.ldc3 ~/ws/helion (all-reduce)]$ python test/test_distributed.py -k TestDistributed.test_matmul_reduce_scatter_autotuner_LFBOTreeSearch
INFO: Started process 0 with pid 2143318
INFO: Started process 1 with pid 2143323
INFO: Started process 2 with pid 2143330
INFO: Started process 3 with pid 2143331
NCCL version 2.28.9+cuda12.9
[rank2]:[W310 23:33:39.020723478 CUDASymmetricMemory.cu:804] Warning: Pointer not within any SymmetricMemory allocation, is the tensor allocated from SymmetricMemory? (function rendezvous)
[rank2]:[W310 23:33:39.020748826 CUDASymmetricMemory.cu:804] Warning: Pointer not within any SymmetricMemory allocation, is the tensor allocated from SymmetricMemory? (function rendezvous)
[rank1]:[W310 23:33:39.030396315 CUDASymmetricMemory.cu:804] Warning: Pointer not within any SymmetricMemory allocation, is the tensor allocated from SymmetricMemory? (function rendezvous)
[rank1]:[W310 23:33:39.030415704 CUDASymmetricMemory.cu:804] Warning: Pointer not within any Symmet
from __future__ import annotations
import torch
import helion.language as hl
import triton
import triton.language as tl
from torch._inductor.runtime.triton_compat import libdevice
from helion.runtime import default_launcher as _default_launcher
import __main__ as _source_module