Skip to content

Instantly share code, notes, and snippets.

View davidberard98's full-sized avatar

David Berard davidberard98

  • PyTorch
  • Menlo Park, CA
View GitHub Profile
TORCHINDUCTOR_FORCE_DISABLE_CACHES=1
# Cleanup handler (installed elsewhere via `trap finish EXIT`): reverts the
# locally modified build script so the triton checkout is left clean when the
# script terminates, however it exits.
# NOTE(review): pushd changes the cwd and is never popped — presumably
# acceptable only because this runs in an EXIT trap while the shell is
# already terminating; confirm it is not called mid-script.
function finish() {
pushd /home/dberard/local/pytorch-env7/triton
git checkout -- scripts/build-llvm-project.sh
}
# Make sure the checkout is restored no matter how this script exits.
trap finish EXIT
# Apply the local diff before building.
# FIX: `git patch` is not a git subcommand and would fail with
# "git: 'patch' is not a git command"; `git apply` is the correct verb
# for applying a raw diff file to the working tree.
git apply /home/dberard/local/pytorch-env7/diff.patch
make dev-install-llvm
# Capture the build's exit status for the bisect good/bad decision below.
code=$?
if [ $code -ne 0 ]
# USAGE:
# Put this in your triton repo directory.
# 1. Update the [BUILD COMMAND]
# 2. Update the [PYTORCH PATH]
# 3. Update the [TEST COMMAND]
# 4. Run the bisect:
# $ git bisect start
# $ git checkout [known good commit]
# $ git bisect good
# $ git checkout [known bad commit]
//
// Generated by LLVM NVPTX Back-End
//
.version 8.7
.target sm_90a
.address_size 64
// .globl _layer_norm_backward_kernel // -- Begin function _layer_norm_backward_kernel
.extern .shared .align 16 .b8 global_smem[];
//
// Generated by LLVM NVPTX Back-End
//
.version 8.7
.target sm_90a
.address_size 64
// .globl _layer_norm_backward_kernel // -- Begin function _layer_norm_backward_kernel
.extern .shared .align 16 .b8 global_smem[];
#!/usr/bin/env python3
import argparse
import os
import sys
import stat
import subprocess
import re
def parse_glibcxx_version(version_string):
We can make this file beautiful and searchable if this error is corrected: Unclosed quoted field in line 9.
metric_id,Samples (3.3),Samples (3.4),speedup (3.3),speedup (3.4),speedup ((new-old)/old),speedup (delta)
tritonbench_ragged_attention_bwd[hstu]-tflops-avg,0,1,0,98.806091308594,-1,-98.806091308594
"tritonbench_ragged_attention_bwd[x_(128, 4, 1024, 128, 128, 1.0, 20, 0)-hstu]_tflops",0,1,0,132.42012023926,-1,-132.42012023926
"tritonbench_ragged_attention_bwd[x_(128, 4, 256, 128, 128, 1.0, 20, 0)-hstu]_tflops",0,1,0,65.420997619629,-1,-65.420997619629
"tritonbench_ragged_attention_bwd[x_(128, 4, 512, 128, 128, 1.0, 20, 0)-hstu]_tflops",0,1,0,98.577156066895,-1,-98.577156066895
tritonbench_ragged_attention_bwd[x_average-hstu]_tflops,0,1,0,98.806091308594,-1,-98.806091308594
"tritonbench_fp8_gemm_blockwise_fwd[x_(128, 2304, 6656)-_triton]_speedup",1,1,0.60214412212372,0.79111462831497,-0.23886615090629,-0.18897050619125
"tritonbench_fp8_gemm_blockwise_fwd[x_(128, 2304, 6656)-_triton]_tflops",1,1,73.069320678711,95.622283935547,-0.23585468081935,-22.552963256836
"tritonbench_int4_gemm_fwd[x_(16, 1, 8192,
We can make this file beautiful and searchable if this error is corrected: Unclosed quoted field in line 9.
metric_id,Samples (3.3),Samples (3.4),speedup (3.3),speedup (3.4),speedup difference ((new-old)/old),speedup (delta)
tritonbench_fused_linear_jsd_bwd-pass,1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_fwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-compiled]_speedup",1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_fwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-compiled]_tflops",1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_fwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-eager]_tflops",1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_fwd[x_(8, 16, 16384, 16, 16384, 128) | noop-compiled]_speedup",1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_fwd[x_(8, 16, 16384, 16, 16384, 128) | noop-compiled]_tflops",1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_fwd[x_(8, 16, 16384, 16, 16384, 128) | noop-eager]_tflops",1,1,0,0,3.4028235e+38,0
"tritonbench_flex_attention_bwd[x_ (8, 16, 8192, 16, 8192, 128) | noop-compile
import argparse
import torch
import triton # @manual=//triton:triton
import triton.language as tl # @manual=//triton:triton
# best config selected: BLOCK_SIZE_M: 128, BLOCK_SIZE_N: 256, BLOCK_SIZE_K: 128, GROUP_SIZE_M: 8, num_warps: 8, num_ctas: 1, num_stages: 3, maxnreg: None;
def get_cuda_autotune_config():
#loc = loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6pyu3d4c6muk2u.py":17:0)
module {
tt.func public @triton_tem_fused_zeros_7(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6pyu3d4c6muk2u.py":17:0), %arg1: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6pyu3d4c6muk2u.py":17:0), %arg2: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6pyu3d4c6muk2u.py":17:0), %arg3: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6pyu3d4c6muk2u.py":17:0), %arg4: !tt.ptr<f32> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6pyu3d4c6muk2u.py":17:0), %arg5: !tt.ptr<bf16> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/gy/cgy5zvgvr3m2yfnc7jim2n7k35hawsgtqmr2ap6
# AOT ID: ['0_backward']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from cmath import nanj
from torch._inductor.hooks import run_intermediate_hooks