David Berard (davidberard98)

  • Facebook
  • Menlo Park, CA
/home/dberard/local/pytorch/torch/backends/cudnn/__init__.py:106: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.
warnings.warn(
/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/z3/z3core.py:5: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html
import pkg_resources
/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('ruamel')`.
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See ht
E
======================================================================
ERROR: test_torch_function_call_to_size_within_aot_autograd_graph (__main__.TestNestedTensor.test_torch_function_call_to_size_within_aot_autograd_graph)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/data/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 2739, in wrapper
    method(*args, **kwargs)
  File "/data/users/dberard/pytorch/test/dynamo/test_subclasses.py", line 1403, in test_torch_function_call_to_size_within_aot_autograd_graph
    compiled_fn(x, y)
  File "/data/users/dberard/pytorch/torch/_dynamo/eval_frame.py", line 451, in _fn
davidberard98 / code.cpp
Last active March 14, 2024 17:21
bad hashing with flat_hash_map
/*
* g++ code.cpp -o code -std=c++17
*
* this demonstrates exponential memory usage with flat_hash_map
*/
#include <iostream>
#include "flat_hash_map.hpp"
struct BadHash {
// -----// IR Dump Before Inliner (inline) ('builtin.module' operation) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)
#loc64 = loc(unknown)
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)
#loc87 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)
module {
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility
loading model: 0it [00:00, ?it/s]WARNING:common:Model detectron2_fcos_r_50_fpn does not support bfloat16, running with amp instead
loading model: 0it [00:08, ?it/s]
WARNING:common:Model detectron2_fcos_r_50_fpn does not support bfloat16, running with amp instead
cuda eval detectron2_fcos_r_50_fpn
WARNING:common:Model detectron2_fcos_r_50_fpn does not support bfloat16, running with amp instead
[2023-11-01 20:50:22,157] torch._dynamo.eval_frame: [DEBUG] skipping helper /home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py
[2023-11-01 20:50:22,157] torch._dynamo.eval_frame: [DEBUG] skipping __init__ /home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py
[2023-11-01 20:50:22,157] torch._dynamo.eval_frame: [DEBUG] skipping __enter__ /home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/contextlib.py
[2023-11-01 20:50:22,157] torch._dynamo.eval_frame: [DEBUG] skipping backend_cache_wrapper /home/dberard/local/pytorch/torch/_dynamo/eval_frame.py
[2023-11-01 20:
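The benchmark log notes that detectron2_fcos_r_50_fpn does not support bfloat16 and falls back to amp. A minimal sketch of that kind of fallback, using a stand-in module rather than the detectron2 model:

import torch

# Stand-in module; the real run uses detectron2_fcos_r_50_fpn.
model = torch.nn.Linear(64, 64).cuda()
x = torch.randn(8, 64, device="cuda")

# amp fallback: run the forward pass under autocast instead of casting the
# model weights to bfloat16 directly.
with torch.autocast(device_type="cuda", dtype=torch.float16):
    out = model(x)
print(out.dtype)  # torch.float16: matmul-heavy ops ran in reduced precision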
/data/users/dberard/scripts/oncall/112502.py:7: UserWarning: An output with one or more elements was resized since it had shape [9, 10], which does not match the required output shape [9]. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0). (Triggered internally at ../aten/src/ATen/native/Resize.cpp:28.)
x = torch.diag(input=x, diagonal=0,out=torch.rand([9, 10], dtype=torch.float32).to('cpu'))
build succeeded
/data/users/dberard/pytorch/torch/_prims_common/wrappers.py:159: UserWarning: An output with one or more elements was resized since it had shape torch.Size([9, 10]) which does not match the required output shape {str(shape)}. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.re
/data/users/dberard/scripts/oncall/112494.py:6: UserWarning: An output with one or more elements was resized since it had shape [10, 9, 8], which does not match the required output shape [1, 9, 8]. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0). (Triggered internally at ../aten/src/ATen/native/Resize.cpp:28.)
x = torch.var(correction=4, dim=0, input=x, keepdim=True, out=torch.rand_like(x))
/data/users/dberard/pytorch/torch/_prims_common/wrappers.py:159: UserWarning: An output with one or more elements was resized since it had shape torch.Size([s0, s1, s2]) which does not match the required output shape {str(shape)}. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0).
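Both warnings point at the same fix: reuse an out= tensor by first resizing it to zero elements so the op is free to resize it. A minimal sketch of that pattern, with an input shape assumed to match the first warning:

import torch

# Assumed input: a 2D tensor whose diagonal has 9 elements, matching the
# "required output shape [9]" in the first warning.
x = torch.randn(9, 10)

out = torch.rand(9, 10)             # wrong shape for the result of torch.diag(x)
out.resize_(0)                      # shrink to zero elements, as the warning suggests
torch.diag(x, diagonal=0, out=out)  # out is resized silently to shape [9], no warning
print(out.shape)                    # torch.Size([9])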
Traceback (most recent call last):
  File "/data/users/dberard/scripts/oncall/112489.py", line 8, in <module>
    fn_opt(*inputs)
  File "/data/users/dberard/pytorch/torch/_dynamo/eval_frame.py", line 411, in _fn
    return fn(*args, **kwargs)
  File "/data/users/dberard/pytorch/torch/_dynamo/eval_frame.py", line 559, in catch_errors
    return callback(frame, cache_entry, hooks, frame_state)
  File "/data/users/dberard/pytorch/torch/_dynamo/convert_frame.py", line 687, in _convert_frame
    result = inner_convert(frame, cache_entry, hooks, frame_state)
  File "/data/users/dberard/pytorch/torch/_dynamo/convert_frame.py", line 148, in _fn
import torch

def fn(x, y):
    return torch.cat([x + y, y]).sin()

a = torch.ones((1024, 256), dtype=torch.float32)
b = torch.ones((1024, 256), dtype=torch.float32) * 2

with torch.profiler.profile(schedule=torch.profiler.schedule(wait=2, warmup=2, repeat=1, active=2), record_shapes=True) as prof:
    for _ in range(8):
        # assumed loop body (the gist preview cuts off here): run fn and step the profiler schedule
        fn(a, b)
        prof.step()
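With wait=2, warmup=2, active=2 and repeat=1, only two of the eight iterations fall in the active window and get recorded. A common way to inspect the captured steps afterwards (not shown in the gist; the sort key and row limit are illustrative choices):

# key_averages() summarizes the recorded ops across the active steps.
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))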
import torch
import triton
import triton.language as tl
@triton.jit
def dense_to_jagged_triton(
    inverse_offsets_ptr, offsets_ptr, dense_ptr, out_ptr0, xnumel, XBLOCK: tl.constexpr
):
    # xnumel = 33106688
    xoffset = tl.program_id(0) * XBLOCK