Max191 / igemm_conv.mlir
Created June 11, 2024 16:36
igemm conv with im2col op
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
  #hal.descriptor_set.layout<0, bindings = [
    #hal.descriptor_set.binding<0, storage_buffer>,
    #hal.descriptor_set.binding<1, storage_buffer>
  ]>
]>
#config_mm = #iree_gpu.lowering_config<{workgroup = [1, 64, 64, 0], reduction = [0, 0, 0, 4], thread = [1, 8, 4, 0]}>
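The maps and lowering config above describe the batch-matmul ("implicit GEMM") form the convolution takes after the im2col rewrite: (B, M, K) x (K, N) -> (B, M, N), with d3 as the reduction dimension. Below is a minimal standalone sketch of that GEMM as a plain linalg.generic using the same indexing maps; the shapes (B = 2, M = 14*14 = 196, K = 3*3*32 = 288, N = 64) and the f16 x f16 -> f32 accumulation are assumptions for illustration, not the gist's actual sizes.

#lhs_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#rhs_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
#out_map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
func.func @igemm_gemm_sketch(%lhs: tensor<2x196x288xf16>, %rhs: tensor<288x64xf16>) -> tensor<2x196x64xf32> {
  // Zero-filled f32 accumulator for the f16 x f16 -> f32 GEMM.
  %cst = arith.constant 0.000000e+00 : f32
  %empty = tensor.empty() : tensor<2x196x64xf32>
  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<2x196x64xf32>) -> tensor<2x196x64xf32>
  // Batch matmul over the im2col'd LHS; d3 is the K (reduction) dimension.
  %gemm = linalg.generic {indexing_maps = [#lhs_map, #rhs_map, #out_map], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%lhs, %rhs : tensor<2x196x288xf16>, tensor<288x64xf16>) outs(%fill : tensor<2x196x64xf32>) {
  ^bb0(%l: f16, %r: f16, %acc: f32):
    %le = arith.extf %l : f16 to f32
    %re = arith.extf %r : f16 to f32
    %mul = arith.mulf %le, %re : f32
    %add = arith.addf %acc, %mul : f32
    linalg.yield %add : f32
  } -> tensor<2x196x64xf32>
  return %gemm : tensor<2x196x64xf32>
}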
Max191 / before_after_elementwise_fusion.mlir
Last active June 11, 2024 13:56
transpose->dequant->extract->generic
// -----// IR Dump After CSE (cse) //----- //
util.func public @jit_eval_174(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @jit_eval_174(%input0: tensor<1024x7x7x2xi8>) -> (%output0: tensor<1024x7x7x2xi8>)"}} {
  %cst = arith.constant 1.000000e+00 : f32
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant 4.000000e+00 : f32
  %cst_2 = arith.constant 2.000000e+00 : f32
  %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<1024x7x7x2xi8>
  %1 = tensor.empty() : tensor<2x1024x7x7xi8>
  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d3, d0)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : tensor<1024x7x7x2xi8>) outs(%1 : tensor<2x1024x7x7xi8>) {
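In the transpose->dequant->extract->generic chain named above, the transposed i8 tensor is consumed by a dequantization generic before the final elementwise op. A minimal standalone sketch of such a dequant step follows; the shape mirrors the transpose result above, but the scale/zero-point values and their roles are assumptions, not the gist's actual constants.

func.func @dequant_sketch(%arg0: tensor<2x1024x7x7xi8>) -> tensor<2x1024x7x7xf32> {
  // Assumed quantization parameters; the gist's actual scale/zero point are not shown.
  %scale = arith.constant 4.000000e+00 : f32
  %zp = arith.constant 2.000000e+00 : f32
  %empty = tensor.empty() : tensor<2x1024x7x7xf32>
  %dq = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<2x1024x7x7xi8>) outs(%empty : tensor<2x1024x7x7xf32>) {
  ^bb0(%in: i8, %out: f32):
    // i8 -> f32, shift by the zero point, scale.
    %f = arith.sitofp %in : i8 to f32
    %shifted = arith.subf %f, %zp : f32
    %scaled = arith.mulf %shifted, %scale : f32
    linalg.yield %scaled : f32
  } -> tensor<2x1024x7x7xf32>
  return %dq : tensor<2x1024x7x7xf32>
}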
Max191 / pad_winograd.mlir
Created May 30, 2024 17:10
Pad + winograd input transform
module {
  func.func @main$async_dispatch_1_winograd_input_transform_11x11x16x8x8xf32() attributes {translation_info = #iree_codegen.translation_info<CPULinalgExtTileAndVectorize>} {
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %c262144 = arith.constant 262144 : index
    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x64x64xf32>>
    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c262144) : !flow.dispatch.tensor<writeonly:tensor<11x11x16x8x8xbf16>>
    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 64, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x64x64xf32>> -> tensor<16x64x64xf32>
    %3 = tensor.empty() : tensor<11x11x16x8x8xbf16>
    %4 = tensor.empty() : tensor<11x11x16x8x8xf32>
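The dispatch above pads the 16x64x64 input before the winograd input transform. A minimal sketch of just the pad step, assuming 8x8 input tiles that advance by the 6x6 output tile, so the 11 tiles per image dimension need each 64-wide dimension padded up to 10*6 + 8 = 68:

func.func @pad_for_winograd_sketch(%input: tensor<16x64x64xf32>) -> tensor<16x68x68xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  // Pad only the two image dimensions on the high side so every 8x8 input tile is in bounds.
  %padded = tensor.pad %input low[0, 0, 0] high[0, 4, 4] {
  ^bb0(%i: index, %j: index, %k: index):
    tensor.yield %cst : f32
  } : tensor<16x64x64xf32> to tensor<16x68x68xf32>
  return %padded : tensor<16x68x68xf32>
}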
Max191 / README.md
Created May 28, 2024 21:49
VAE Numerics testing
Max191 / elementwise_pad.mlir
Created May 22, 2024 20:26
elementwise -> pad
%expanded_127 = tensor.expand_shape %collapsed_126 [[0], [1], [2, 3]] output_shape [1, 512, 128, 128] : tensor<1x512x16384xf32> into tensor<1x512x128x128xf32>
%23 = tensor.empty() : tensor<512x128x128xf32>
%expanded_128 = tensor.expand_shape %cst_89 [[0, 1]] output_shape [1, 512] : tensor<512xf32> into tensor<1x512xf32>
%expanded_129 = tensor.expand_shape %cst_90 [[0, 1]] output_shape [1, 512] : tensor<512xf32> into tensor<1x512xf32>
%24 = tensor.empty() : tensor<1x512x128x128xf32>
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_127, %expanded_128, %expanded_129 : tensor<1x512x128x128xf32>, tensor<1x512xf32>, tensor<1x512xf32>) outs(%24 : tensor<1x512x128x128xf32>) {
^bb0(%in: f32, %in_741: f32, %in_742: f32, %out: f32):
  %436 = arith.mulf
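A minimal standalone sketch of the elementwise -> pad pattern this snippet is truncated from: an elementwise linalg.generic whose result feeds a tensor.pad. The generic body (a single per-channel multiply) and the padding amounts are assumptions for illustration; only the input shapes mirror the snippet above.

func.func @elementwise_then_pad_sketch(%x: tensor<1x512x128x128xf32>, %scale: tensor<1x512xf32>) -> tensor<1x512x130x130xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %empty = tensor.empty() : tensor<1x512x128x128xf32>
  // Elementwise generic: broadcast the per-channel scale over the spatial dims.
  %ew = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%x, %scale : tensor<1x512x128x128xf32>, tensor<1x512xf32>) outs(%empty : tensor<1x512x128x128xf32>) {
  ^bb0(%in: f32, %s: f32, %out: f32):
    %m = arith.mulf %in, %s : f32
    linalg.yield %m : f32
  } -> tensor<1x512x128x128xf32>
  // Pad the spatial dims by 1 on each side (assumed padding amounts).
  %padded = tensor.pad %ew low[0, 0, 1, 1] high[0, 0, 1, 1] {
  ^bb0(%i: index, %j: index, %k: index, %l: index):
    tensor.yield %cst : f32
  } : tensor<1x512x128x128xf32> to tensor<1x512x130x130xf32>
  return %padded : tensor<1x512x130x130xf32>
}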
Max191 / README.md
Last active May 21, 2024 21:09
IR reference for winograd ops
Max191 / packing.mlir
Created May 21, 2024 19:33
Bad pack and unpack codegen with outer_dims_perm
module {
  func.func @pack_bad(%arg0: tensor<29241x128x64xbf16>) -> tensor<64x1828x64x16x2xbf16> {
    %cst = arith.constant 0.000000e+00 : bf16
    %4 = tensor.empty() : tensor<64x1828x64x16x2xbf16>
    %pack = tensor.pack %arg0 padding_value(%cst : bf16) outer_dims_perm = [2, 0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %4 : tensor<29241x128x64xbf16> -> tensor<64x1828x64x16x2xbf16>
    return %pack : tensor<64x1828x64x16x2xbf16>
  }
  func.func @pack_good(%arg0: tensor<64x29241x128xbf16>) -> tensor<64x1828x64x16x2xbf16> {
    %cst = arith.constant 0.000000e+00 : bf16
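For contrast with @pack_bad, a sketch of a pack on a pre-transposed source that needs no outer_dims_perm and only tiles the trailing dims. This is an assumption about what @pack_good looks like, not the gist's actual body; the shapes are taken from the signature above (29241 pads up to 1828 * 16, hence the padding_value).

func.func @pack_without_outer_dims_perm(%arg0: tensor<64x29241x128xbf16>) -> tensor<64x1828x64x16x2xbf16> {
  %cst = arith.constant 0.000000e+00 : bf16
  %empty = tensor.empty() : tensor<64x1828x64x16x2xbf16>
  // Tiles dims 1 and 2 by 16x2; 29241 is padded up to 1828 * 16 = 29248.
  %pack = tensor.pack %arg0 padding_value(%cst : bf16) inner_dims_pos = [1, 2] inner_tiles = [16, 2] into %empty : tensor<64x29241x128xbf16> -> tensor<64x1828x64x16x2xbf16>
  return %pack : tensor<64x1828x64x16x2xbf16>
}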
Max191 / pure_unpack.mlir
Created May 21, 2024 15:53
Bad pure unpack codegen compared to unpack + transpose
module attributes {hal.device.targets = [#hal.device.target<"local", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr,+evex512", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "mmt4d,pack"}>]>]} {
  hal.executable private @main$async_dispatch_330 {
    hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4",
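For pure_unpack.mlir above, a minimal standalone sketch (assumed small shapes, not the gist's IR) of the two forms being compared: a single tensor.unpack whose source outer dims are permuted relative to the destination, versus a decomposition into a linalg.transpose of the packed outer dims followed by a plain tensor.unpack; the order of the decomposition is also an assumption.

// Form 1: one unpack with outer_dims_perm; the source outer dims (8, 4) are swapped w.r.t. the destination's (4, 8).
func.func @unpack_permuted_sketch(%src: tensor<8x4x16x2xbf16>) -> tensor<64x16xbf16> {
  %empty = tensor.empty() : tensor<64x16xbf16>
  %unpack = tensor.unpack %src outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %empty : tensor<8x4x16x2xbf16> -> tensor<64x16xbf16>
  return %unpack : tensor<64x16xbf16>
}
// Form 2: transpose the packed outer dims first, then a plain unpack with no outer_dims_perm.
func.func @transpose_then_unpack_sketch(%src: tensor<8x4x16x2xbf16>) -> tensor<64x16xbf16> {
  %t_init = tensor.empty() : tensor<4x8x16x2xbf16>
  %t = linalg.transpose ins(%src : tensor<8x4x16x2xbf16>) outs(%t_init : tensor<4x8x16x2xbf16>) permutation = [1, 0, 2, 3]
  %empty = tensor.empty() : tensor<64x16xbf16>
  %unpack = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %empty : tensor<4x8x16x2xbf16> -> tensor<64x16xbf16>
  return %unpack : tensor<64x16xbf16>
}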
module attributes {hal.device.targets = [#hal.device.target<"rocm", {legacy_sync}, [#hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none", waves_per_eu = 2 : i64}>]>]} {
  hal.executable private @conv_2d_nchw_fchw_dispatch_1 {
    hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none", waves_per_eu = 2 : i64}>) {
      hal.executable.export public @conv_2d_nchw_fchw_dispatch_1_batch_matmul_64x242x1280x1280_f16xf16xf32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.
hal.executable public @conv_2d_nchw_fchw_dispatch_1 {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none", waves_per_eu = 2 : i64}>) {
    hal.executable.export public @conv_2d_nchw_fchw_dispatch_1_batch_matmul_64x72x1280x1280_f16xf16xf32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @conv_2d_nchw_fchw_dispatch_1_batch_matmul_64x72x1280x1280_f16xf16xf32() {
        %cst = arith.constant 0.000000e+00 : f