import torch
import math
from torchinductor.compile_fx import compile_fx
import torchdynamo
import torchinductor
torchinductor.config.debug = True
torchinductor.config.triton.cudagraphs = False

def _gelu_python(x):
    # standard erf-based GELU: x * 0.5 * (1 + erf(x / sqrt(2)))
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
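
A rough usage sketch, assuming the standalone (pre-PyTorch-2.0) torchdynamo API in which a backend callable such as compile_fx is passed to torchdynamo.optimize; the wrapper name and input size are illustrative:

@torchdynamo.optimize(compile_fx)
def gelu_compiled(x):
    return _gelu_python(x)

x = torch.randn(1024, device="cuda")
print(gelu_compiled(x))  # first call triggers compilation; with debug=True inductor dumps the generated code
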
import torch
import triton
import triton.language as tl
#@triton.jit
def mul_sum_kernel(
    output_ptr, input_ptr0, input_ptr1,
    si00, si01, si02, si03,
import torch
import triton
import triton.language as tl
from itertools import product
@triton.jit
def copy_kernel(
    output_ptr, input_ptr,
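
Both Triton kernels above are truncated in this capture. For orientation, a minimal self-contained copy kernel in the same style; this 1-D version and its launch parameters are an illustrative sketch, not the original kernel:

@triton.jit
def copy_kernel_1d(output_ptr, input_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements          # guard the ragged last block
    x = tl.load(input_ptr + offsets, mask=mask)
    tl.store(output_ptr + offsets, x, mask=mask)

inp = torch.randn(4096, device="cuda")
out = torch.empty_like(inp)
grid = (triton.cdiv(inp.numel(), 1024),)
copy_kernel_1d[grid](out, inp, inp.numel(), BLOCK_SIZE=1024)
assert torch.equal(out, inp)
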
ngimel / bench.sh
Last active January 24, 2022 17:21
python perf_lab_bcast.py --dtype_str double --op_str torch.eq
python perf_lab_bcast.py --dtype_str float --op_str torch.eq
python perf_lab_bcast.py --dtype_str half --op_str torch.eq
python perf_lab_bcast.py --dtype_str int --op_str torch.eq
python perf_lab_bcast.py --dtype_str uint8 --op_str torch.eq
python perf_lab_bcast.py --dtype_str double --op_str torch.add
python perf_lab_bcast.py --dtype_str float --op_str torch.add
python perf_lab_bcast.py --dtype_str half --op_str torch.add
python perf_lab_bcast.py --dtype_str int --op_str torch.add
python perf_lab_bcast.py --dtype_str uint8 --op_str torch.add
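
perf_lab_bcast.py itself is not part of this capture; below is a hypothetical sketch of the interface these invocations imply. The flag parsing is inferred from the command lines, and the broadcast benchmark body is a guess:

import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument("--dtype_str", default="float")
parser.add_argument("--op_str", default="torch.add")
args = parser.parse_args()

dtypes = {"double": torch.double, "float": torch.float, "half": torch.half,
          "int": torch.int, "uint8": torch.uint8}
op = getattr(torch, args.op_str.split(".")[-1])  # "torch.eq" -> torch.eq

# broadcast a 1-D row against a 2-D tensor, as the script name suggests
a = torch.ones(1024, 1024, dtype=dtypes[args.dtype_str], device="cuda")
b = torch.ones(1024, dtype=dtypes[args.dtype_str], device="cuda")
print(op(a, b).shape)
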
[--------------------- topK ---------------------]
                    |  topk_new  |  topk_old
1 threads: ---------------------------------------
    10, 2048, 100   |    21.2    |    19.1
    10, 2048, 1000  |    20.9    |    18.1
    10, 2048, 2000  |    23.0    |    19.9
    10, 2048, 2048  |    19.2    |    17.1
    10, 4096, 100   |    30.8    |    28.6
    10, 4096, 1000  |    30.7    |    28.3
    10, 4096, 2000  |    31.2    |    28.6
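
This layout matches the output of torch.utils.benchmark's Compare. As a sketch of the kind of harness that produces such a table (sizes mirror the rows above; the new/old comparison would come from running the same Timer under two builds, so only one column is generated here):

import torch
import torch.utils.benchmark as benchmark

results = []
for rows, cols, k in [(10, 2048, 100), (10, 2048, 1000), (10, 4096, 100)]:
    x = torch.randn(rows, cols, device="cuda")
    timer = benchmark.Timer(
        stmt="torch.topk(x, k)",
        globals={"x": x, "k": k},
        label="topK",
        sub_label=f"{rows}, {cols}, {k}",
        description="topk_new",  # the old build would use description="topk_old"
    )
    results.append(timer.blocked_autorange())
benchmark.Compare(results).print()
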
int cublasSgemm_v2(addr, int, int, int, int, int, float*, addr, int, addr, int, float*, addr, int);
int cublasGemmEx(addr, int, int, int, int, int, float*, addr, int, int, addr, int, int, float*, addr, int, int, int, int);
int cublasGemmBatchedEx(addr, int, int, int, int, int, float*, addr, int, int, addr, int, int, float*, addr, int, int, int, int, int);
int cublasSgemmStridedBatched(addr, int, int, int, int, int, float*, addr, int, int, addr, int, int, float*, addr, int, int, int);
int cublasGemmStridedBatchedEx(addr, int, int, int, int, int, float*, addr, int, int, int, addr, int, int, int, float*, addr, int, int, int, int, int, int);
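
These read like C prototype declarations for tracing GEMM calls, with pointer-sized arguments collapsed to addr. As a hedged illustration of the first signature, a direct ctypes call into cuBLAS from Python; the library name, the enum value, and the layout handling are assumptions:

import ctypes
import torch

libcublas = ctypes.CDLL("libcublas.so")  # assumption: the unversioned soname resolves
handle = ctypes.c_void_p()
libcublas.cublasCreate_v2(ctypes.byref(handle))

n = 4
a = torch.randn(n, n, device="cuda")
b = torch.randn(n, n, device="cuda")
c = torch.zeros(n, n, device="cuda")
alpha, beta = ctypes.c_float(1.0), ctypes.c_float(0.0)

# 0 == CUBLAS_OP_N; cuBLAS is column-major, so reading c row-major yields b @ a
libcublas.cublasSgemm_v2(handle, 0, 0, n, n, n,
                         ctypes.byref(alpha),
                         ctypes.c_void_p(a.data_ptr()), n,
                         ctypes.c_void_p(b.data_ptr()), n,
                         ctypes.byref(beta),
                         ctypes.c_void_p(c.data_ptr()), n)
print(torch.allclose(c, b @ a))
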
import torch
import time
nlayers = 10
params = []
size = 1024
for _ in range(nlayers):
    params.append(torch.randn(size, device="cuda", requires_grad=True))
    params.append(torch.randn((size, size), device="cuda", requires_grad=True))
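
The snippet ends at the parameter setup; a hypothetical continuation in the same spirit, timing optimizer steps over these parameters (SGD and the iteration count are assumptions, not from the original):

for p in params:
    p.grad = torch.randn_like(p)  # dummy gradients so step() has work to do
opt = torch.optim.SGD(params, lr=0.1)

torch.cuda.synchronize()
start = time.time()
for _ in range(100):
    opt.step()
torch.cuda.synchronize()
print((time.time() - start) / 100, "s per step")
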
Thread 1 "python" hit Breakpoint 4, 0x00007ffff57a8fc0 in cudaGetDevice ()
from /usr/local/cuda/lib64/libcudart.so.8.0
(gdb) bt
#0 0x00007ffff57a8fc0 in cudaGetDevice () from /usr/local/cuda/lib64/libcudart.so.8.0
#1 0x00007ffff646fc25 in AutoGPU::setDevice (device=<optimized out>, this=0x7fffffffd280)
at /tmp/pip-yqg9zt2i-build/torch/csrc/utils/auto_gpu.h:32
#2 AutoGPU::AutoGPU (device=<optimized out>, this=0x7fffffffd280)
at /tmp/pip-yqg9zt2i-build/torch/csrc/utils/auto_gpu.h:15
#3 THCPAutoGPU::THCPAutoGPU (this=0x7fffffffd280, args=0x7ffff7e07048, self=<optimized out>)
at torch/csrc/cuda/AutoGPU.cpp:65