import torch
import math
from torchinductor.compile_fx import compile_fx
import torchdynamo
import torchinductor
torchinductor.config.debug = True
torchinductor.config.triton.cudagraphs = False

def _gelu_python(x):
    # standard erf-based GELU: x * 0.5 * (1 + erf(x / sqrt(2)))
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
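
A rough usage sketch, assuming the standalone (pre-PyTorch-2.0) torchdynamo API in which a backend callable such as compile_fx is passed to torchdynamo.optimize; the wrapper name and input size are illustrative:

@torchdynamo.optimize(compile_fx)
def gelu_compiled(x):
    return _gelu_python(x)

x = torch.randn(1024, device="cuda")
print(gelu_compiled(x))  # first call triggers compilation; with debug=True inductor dumps the generated code
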
import torch
import triton
import triton.language as tl
#@triton.jit
def mul_sum_kernel(
    output_ptr, input_ptr0, input_ptr1,
    si00, si01, si02, si03,
import torch
import triton
import triton.language as tl
from itertools import product
@triton.jit
def copy_kernel(
    output_ptr, input_ptr,
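
Both Triton kernels above are truncated in this capture. For orientation, a minimal self-contained copy kernel in the same style; this 1-D version and its launch parameters are an illustrative sketch, not the original kernel:

@triton.jit
def copy_kernel_1d(output_ptr, input_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements          # guard the ragged last block
    x = tl.load(input_ptr + offsets, mask=mask)
    tl.store(output_ptr + offsets, x, mask=mask)

inp = torch.randn(4096, device="cuda")
out = torch.empty_like(inp)
grid = (triton.cdiv(inp.numel(), 1024),)
copy_kernel_1d[grid](out, inp, inp.numel(), BLOCK_SIZE=1024)
assert torch.equal(out, inp)
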
ngimel / bench.sh
Last active January 24, 2022 17:21
python perf_lab_bcast.py --dtype_str double --op_str torch.eq
python perf_lab_bcast.py --dtype_str float --op_str torch.eq
python perf_lab_bcast.py --dtype_str half --op_str torch.eq
python perf_lab_bcast.py --dtype_str int --op_str torch.eq
python perf_lab_bcast.py --dtype_str uint8 --op_str torch.eq
python perf_lab_bcast.py --dtype_str double --op_str torch.add
python perf_lab_bcast.py --dtype_str float --op_str torch.add
python perf_lab_bcast.py --dtype_str half --op_str torch.add
python perf_lab_bcast.py --dtype_str int --op_str torch.add
python perf_lab_bcast.py --dtype_str uint8 --op_str torch.add
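
perf_lab_bcast.py itself is not part of this capture; below is a hypothetical sketch of the interface these invocations imply. The flag parsing is inferred from the command lines, and the broadcast benchmark body is a guess:

import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument("--dtype_str", default="float")
parser.add_argument("--op_str", default="torch.add")
args = parser.parse_args()

dtypes = {"double": torch.double, "float": torch.float, "half": torch.half,
          "int": torch.int, "uint8": torch.uint8}
op = getattr(torch, args.op_str.split(".")[-1])  # "torch.eq" -> torch.eq

# broadcast a 1-D row against a 2-D tensor, as the script name suggests
a = torch.ones(1024, 1024, dtype=dtypes[args.dtype_str], device="cuda")
b = torch.ones(1024, dtype=dtypes[args.dtype_str], device="cuda")
print(op(a, b).shape)
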
[--------------------- topK ---------------------]
                    |  topk_new  |  topk_old
1 threads: ---------------------------------------
    10, 2048, 100   |    21.2    |    19.1
    10, 2048, 1000  |    20.9    |    18.1
    10, 2048, 2000  |    23.0    |    19.9
    10, 2048, 2048  |    19.2    |    17.1
    10, 4096, 100   |    30.8    |    28.6
    10, 4096, 1000  |    30.7    |    28.3
    10, 4096, 2000  |    31.2    |    28.6
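
This layout matches the output of torch.utils.benchmark's Compare. As a sketch of the kind of harness that produces such a table (sizes mirror the rows above; the new/old comparison would come from running the same Timer under two builds, so only one column is generated here):

import torch
import torch.utils.benchmark as benchmark

results = []
for rows, cols, k in [(10, 2048, 100), (10, 2048, 1000), (10, 4096, 100)]:
    x = torch.randn(rows, cols, device="cuda")
    timer = benchmark.Timer(
        stmt="torch.topk(x, k)",
        globals={"x": x, "k": k},
        label="topK",
        sub_label=f"{rows}, {cols}, {k}",
        description="topk_new",  # the old build would use description="topk_old"
    )
    results.append(timer.blocked_autorange())
benchmark.Compare(results).print()
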
int cublasSgemm_v2(addr, int, int, int, int, int, float*, addr, int, addr, int, float*, addr, int);
int cublasGemmEx(addr, int, int, int, int, int, float*, addr, int, int, addr, int, int, float*, addr, int, int, int, int);
int cublasGemmBatchedEx(addr, int, int, int, int, int, float*, addr, int, int, addr, int, int, float*, addr, int, int, int, int, int);
int cublasSgemmStridedBatched(addr, int, int, int, int, int, float*, addr, int, int, addr, int, int, float*, addr, int, int, int);
int cublasGemmStridedBatchedEx(addr, int, int, int, int, int, float*, addr, int, int, int, addr, int, int, int, float*, addr, int, int, int, int, int, int);
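
These read like C prototype declarations for tracing GEMM calls, with pointer-sized arguments collapsed to addr. As a hedged illustration of the first signature, a direct ctypes call into cuBLAS from Python; the library name, the enum value, and the layout handling are assumptions:

import ctypes
import torch

libcublas = ctypes.CDLL("libcublas.so")  # assumption: the unversioned soname resolves
handle = ctypes.c_void_p()
libcublas.cublasCreate_v2(ctypes.byref(handle))

n = 4
a = torch.randn(n, n, device="cuda")
b = torch.randn(n, n, device="cuda")
c = torch.zeros(n, n, device="cuda")
alpha, beta = ctypes.c_float(1.0), ctypes.c_float(0.0)

# 0 == CUBLAS_OP_N; cuBLAS is column-major, so reading c row-major yields b @ a
libcublas.cublasSgemm_v2(handle, 0, 0, n, n, n,
                         ctypes.byref(alpha),
                         ctypes.c_void_p(a.data_ptr()), n,
                         ctypes.c_void_p(b.data_ptr()), n,
                         ctypes.byref(beta),
                         ctypes.c_void_p(c.data_ptr()), n)
print(torch.allclose(c, b @ a))
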
import torch
import time
nlayers = 10
params = []
size = 1024
for _ in range(nlayers):
    params.append(torch.randn(size, device="cuda", requires_grad=True))
    params.append(torch.randn((size, size), device="cuda", requires_grad=True))
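
The snippet ends at the parameter setup; a hypothetical continuation in the same spirit, timing optimizer steps over these parameters (SGD and the iteration count are assumptions, not from the original):

for p in params:
    p.grad = torch.randn_like(p)  # dummy gradients so step() has work to do
opt = torch.optim.SGD(params, lr=0.1)

torch.cuda.synchronize()
start = time.time()
for _ in range(100):
    opt.step()
torch.cuda.synchronize()
print((time.time() - start) / 100, "s per step")
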
Thread 1 "python" hit Breakpoint 4, 0x00007ffff57a8fc0 in cudaGetDevice ()
from /usr/local/cuda/lib64/libcudart.so.8.0
(gdb) bt
#0 0x00007ffff57a8fc0 in cudaGetDevice () from /usr/local/cuda/lib64/libcudart.so.8.0
#1 0x00007ffff646fc25 in AutoGPU::setDevice (device=<optimized out>, this=0x7fffffffd280)
at /tmp/pip-yqg9zt2i-build/torch/csrc/utils/auto_gpu.h:32
#2 AutoGPU::AutoGPU (device=<optimized out>, this=0x7fffffffd280)
at /tmp/pip-yqg9zt2i-build/torch/csrc/utils/auto_gpu.h:15
#3 THCPAutoGPU::THCPAutoGPU (this=0x7fffffffd280, args=0x7ffff7e07048, self=<optimized out>)
at torch/csrc/cuda/AutoGPU.cpp:65