Skip to content

Instantly share code, notes, and snippets.

View antiagainst's full-sized avatar

Lei Zhang antiagainst

View GitHub Profile
# Build: docker build -f triton-torch-hip.Dockerfile -t triton-torch-hip .
# Run: docker run -it --rm --device /dev/kfd --device /dev/dri triton-torch-hip
FROM ubuntu:22.04
ARG TORCH_VERSION=2.4.0.dev20240530
# Setup ROCm package signing key
# Build with `docker build . -t sdxl-repro --build-arg DOCKER_USERID=$(id -u) --build-arg DOCKER_GROUPID=$(id -g)`
# Run with `docker run -it --rm --network=host --device=/dev/kfd --device=/dev/dri --group-add video --group-add $(getent group render | cut -d: -f3)
# --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v /path/to/downloaded/sdxl/weights:/weights sdxl-repro`
# To benchmark inside docker: `./ N /weights`
FROM rocm/dev-ubuntu-22.04
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# Disable apt-key parse warning
# Use manylinux that ships with many Python versions for wheels. manylinux_2_28
# is AlmaLinux 8 based and is binary-compatible with Red Hat Enterprise Linux.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
######## Setup Python #######
# Choose our default Python version
ENV PATH="/opt/python/cp311-cp311/bin:${PATH}"
#loc = loc(unknown)
module attributes {"triton_gpu.compute-capability" = 90 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.shared = 0 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
llvm.func @__ockl_printf_append_args(i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64 loc(#loc)
llvm.func @__ockl_printf_append_string_n(i64, !llvm.ptr, i64, i32) -> i64 loc(#loc)
llvm.func @__ockl_printf_begin(i64) -> i64 loc(#loc) internal constant @printfFormat_0("pid (%u, %u, %u) idx (%3u) x: %u\0A\00") {addr_space = 0 : i32} loc(#loc) internal constant @printfPrefix_0(" x: ") {addr_space = 0 : i32} loc(#loc) external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8> loc(#loc)
llvm.func @kernel_device_print_0d1d(%arg0: !llvm.ptr<1> {tt.divisibility = 16 : i32} loc(unknown), %arg1: !llvm.ptr<1> {tt.divisibility = 16 : i32} loc(unknown)) attributes {noinline = false, nvvm.kernel = 1 :
0 -> Top Stat
1 -> System Info
2 -> System Speed-of-Light
2.1 -> Speed-of-Light
2.1.0 -> VALU_FLOPs
hal.executable public @main$async_dispatch_205 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mfma_layout<F16_16x16x16_F32>, #iree_gpu.mfma_layout<F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>) {
hal.executable.export public @main$async_dispatch_205_conv_2d_nhwc_hwcf_2x32x32x1280x3x3x1280_f16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 3, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>], subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute, {mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mfma_layout<F16_16x16x16_F32>, subgroup_m_count = 1, subgroup_n_count = 4, subgroup_m_tile_count = 2, subgroup_n_tile_count = 4, subgroup_k_tile_count = 2>}>, workgroup_size = [256 :
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @conv_dispatch_1_conv_2d_nchw_fchw_2x8x16x16x8x3x3_f16() {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x8x33x33xf16>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x8x3x3xf16>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8xf16>>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x8x16x16xf16>>
%workgroup_id_z =[2] : index
module { external @__dynamic_shared_memory__() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8> private @__shared_memory___0() {addr_space = 3 : i32, alignment = 2 : i64} : !llvm.array<8 x array<40 x f16>> private @__shared_memory__() {addr_space = 3 : i32, alignment = 2 : i64} : !llvm.array<32 x array<16 x f16>>
llvm.func @matmul_accumulate_512x128xf16_times_128x512xf16_into_512x512xf32_for_LLVMGPUVectorDistribute_32_32_8_64_1_1_dispatch_0_matmul_512x512x128_f16xf16xf32(%arg0: !llvm.ptr<1> {llvm.align = 16 : i32, llvm.noalias, llvm.readonly}, %arg1: !llvm.ptr<1> {llvm.align = 16 : i32, llvm.noalias, llvm.readonly}, %arg2: !llvm.ptr<1> {llvm.align = 16 : i32, llvm.noalias}) {
%0 = llvm.mlir.constant(0 : i32) : i32
%1 = llvm.mlir.constant(3 : i64) : i64
%2 = llvm.mlir.constant(2 : i64) : i64
%3 = llvm.mlir.constant(1 : i64) : i64
%4 = llvm.mlir.constant(63 : index) : i64
FROM ubuntu:22.04
SHELL ["/bin/bash", "-e", "-u", "-o", "pipefail", "-c"]
# Disable apt-key parse warning
# Basic development environment
RUN apt-get update && apt-get install -y \
curl wget \
// Generated by LLVM NVPTX Back-End
.version 7.6
.target sm_80
.address_size 64
// .globl matmul_3456x1024x2048_f32t_f32t_f32t_tile_config_default_dispatch_0_matmul_3456x1024x2048_f32
.extern .shared .align 16 .b8 __dynamic_shared_memory__[];