benvanik/unidirectional_lstm vulkan error

## unidirectional_lstm vulkan error
          %25 = xla_hlo.minimum %23, %24 : tensor<1x10xf32>
          %26 = "xla_hlo.broadcast_in_dim"(%cst_0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<1x10xf32>
          %27 = xla_hlo.maximum %25, %26 : tensor<1x10xf32>
          %28 = "xla_hlo.slice"(%8) {limit_indices = dense<[1, 40]> : tensor<2xi64>, start_indices = dense<[0, 30]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x40xf32>) -> tensor<1x10xf32>
          %29 = xla_hlo.multiply %6, %28 : tensor<1x10xf32>
          %30 = "xla_hlo.tanh"(%29) : (tensor<1x10xf32>) -> tensor<1x10xf32>
          %31 = xla_hlo.multiply %6, %30 : tensor<1x10xf32>
          %32 = xla_hlo.add %6, %31 : tensor<1x10xf32>
          %33 = "xla_hlo.tanh"(%27) : (tensor<1x10xf32>) -> tensor<1x10xf32>
          %34 = xla_hlo.multiply %32, %33 : tensor<1x10xf32>
          %35 = "xla_hlo.select"(%5, %arg0, %34) : (tensor<1x10xi1>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32>
          %36 = "xla_hlo.reshape"(%35) : (tensor<1x10xf32>) -> tensor<1x1x10xf32>
          return %35, %36 : tensor<1x10xf32>, tensor<1x1x10xf32>
        }
        hal.interface @legacy_io attributes {sym_visibility = "private"} {
          hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
          hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
          hal.interface.binding @arg2, set=0, binding=2, type="StorageBuffer", access="Read"
          hal.interface.binding @arg3, set=0, binding=3, type="StorageBuffer", access="Read"
          hal.interface.binding @arg4, set=0, binding=4, type="StorageBuffer", access="Read"
          hal.interface.binding @arg5, set=0, binding=5, type="StorageBuffer", access="Read"
          hal.interface.binding @arg6, set=0, binding=6, type="StorageBuffer", access="Read"
          hal.interface.binding @ret0, set=0, binding=7, type="StorageBuffer", access="Write|Discard"
          hal.interface.binding @ret1, set=0, binding=8, type="StorageBuffer", access="Write|Discard"
        }
      }
    }
  }
  // Exported entry point (iree.module.export): takes tensor<1x5xf32> and
  // tensor<1x5x2x2xf32> and returns tensor<5x1x10xf32>.
  // Control flow as written below:
  //   entry -> setup stream fragment (dispatches 0..12), then br ^bb1
  //   ^bb1  -> loop header: dispatch 13 yields an i1 that selects ^bb2 or ^bb3
  //   ^bb2  -> loop body: dispatches 14..19 plus flow.tensor.update, then back to ^bb1
  //   ^bb3  -> exit: returns the tensor<5x1x10xf32> block argument
  func @main(%arg0: tensor<1x5xf32> {iree.reflection = {}}, %arg1: tensor<1x5x2x2xf32> {iree.reflection = {}}) -> (tensor<5x1x10xf32> {iree.reflection = {}}) attributes {iree.module.export, iree.reflection = {f = "I19!B5!d1d5B9!d1d5d2d2R11!B8!d5d1d10", fv = "1"}} {
    %c0 = constant 0 : index
    %c64 = constant 64 : index
    %c74 = constant 74 : index
    %cst = constant dense<4.200000e-01> : tensor<74x40xf32>
    %cst_0 = constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
    %c1 = constant 1 : index
    %c10 = constant 10 : index
    %c20 = constant 20 : index
    %c320 = constant 320 : index
    %c5 = constant 5 : index
    %c40 = constant 40 : index
    %c50 = constant 50 : index
    // Setup fragment: the index operands captured here are passed as the
    // bracketed [%argN : index] operand of each flow.dispatch. Seven results
    // are returned and forwarded into the ^bb1 block arguments below.
    %0:7 = flow.ex.stream.fragment(%arg2 = %c10 : index, %arg3 = %c20 : index, %arg4 = %arg1 : tensor<1x5x2x2xf32>, %arg5 = %c320 : index, %arg6 = %c5 : index, %arg7 = %arg0 : tensor<1x5xf32>, %arg8 = %cst_0 : tensor<5xi32>, %arg9 = %c1 : index, %arg10 = %c40 : index, %arg11 = %c50 : index) -> (tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<5x1x10xf32>) {
      %25 = flow.dispatch @main_ex_dispatch_0::@main_ex_dispatch_0[%arg2 : index]() : () -> tensor<1x10xf32>
      %26 = flow.dispatch @main_ex_dispatch_1::@main_ex_dispatch_1[%arg3 : index](%arg4) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
      %27 = flow.dispatch @main_ex_dispatch_2::@main_ex_dispatch_2[%arg5 : index](%26) : (tensor<1x5x4xf32>) -> tensor<1x5x64xf32>
      %28 = flow.dispatch @main_ex_dispatch_3::@main_ex_dispatch_3[%arg5 : index](%27) : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
      %29 = flow.dispatch @main_ex_dispatch_4::@main_ex_dispatch_4[%arg6 : index](%arg7) : (tensor<1x5xf32>) -> tensor<5x1x1xf32>
      %30 = flow.dispatch @main_ex_dispatch_5::@main_ex_dispatch_5[%arg6 : index](%29) : (tensor<5x1x1xf32>) -> tensor<5xf32>
      %31:2 = flow.dispatch @main_ex_dispatch_6::@main_ex_dispatch_6[%arg6 : index](%30, %arg8) : (tensor<5xf32>, tensor<5xi32>) -> (tensor<5xi32>, tensor<5xi32>)
      %32 = flow.dispatch @main_ex_dispatch_7::@main_ex_dispatch_7[%arg9 : index](%31#0) : (tensor<5xi32>) -> tensor<i32>
      %33 = flow.dispatch @main_ex_dispatch_8::@main_ex_dispatch_8[%arg9 : index](%32) : (tensor<i32>) -> tensor<i32>
      %34 = flow.dispatch @main_ex_dispatch_9::@main_ex_dispatch_9[%arg9 : index](%31#1) : (tensor<5xi32>) -> tensor<i32>
      %35 = flow.dispatch @main_ex_dispatch_10::@main_ex_dispatch_10[%arg9 : index](%34, %32) : (tensor<i32>, tensor<i32>) -> tensor<i32>
      %36 = flow.dispatch @main_ex_dispatch_11::@main_ex_dispatch_11[%arg10 : index]() : () -> tensor<40xf32>
      %37 = flow.dispatch @main_ex_dispatch_12::@main_ex_dispatch_12[%arg11 : index]() : () -> tensor<5x1x10xf32>
      flow.return %25, %28, %29, %33, %35, %36, %37 : tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<5x1x10xf32>
    }
    // Note that %0#0 (the dispatch 0 result) is passed twice, seeding both
    // tensor<1x10xf32> block arguments (%5 and %6) of ^bb1.
    br ^bb1(%0#4, %0#3, %0#5, %cst, %0#0, %0#0, %0#1, %0#2, %0#6 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
  ^bb1(%1: tensor<i32>, %2: tensor<i32>, %3: tensor<40xf32>, %4: tensor<74x40xf32>, %5: tensor<1x10xf32>, %6: tensor<1x10xf32>, %7: tensor<5x1x64xf32>, %8: tensor<5x1x1xf32>, %9: tensor<5x1x10xf32>):  // 2 preds: ^bb0, ^bb2
    // Loop condition: dispatch 13 combines the two tensor<i32> values %1 and %2
    // into a tensor<i1>, which is loaded to a scalar for cond_br.
    %10 = flow.ex.stream.fragment(%arg2 = %c1 : index, %arg3 = %1 : tensor<i32>, %arg4 = %2 : tensor<i32>) -> tensor<i1> {
      %25 = flow.dispatch @main_ex_dispatch_13::@main_ex_dispatch_13[%arg2 : index](%arg3, %arg4) : (tensor<i32>, tensor<i32>) -> tensor<i1>
      flow.return %25 : tensor<i1>
    }
    %11 = flow.tensor.load %10 : tensor<i1>
    cond_br %11, ^bb2(%1, %2, %3, %4, %5, %6, %7, %8, %9 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>), ^bb3(%9 : tensor<5x1x10xf32>)
  ^bb2(%12: tensor<i32>, %13: tensor<i32>, %14: tensor<40xf32>, %15: tensor<74x40xf32>, %16: tensor<1x10xf32>, %17: tensor<1x10xf32>, %18: tensor<5x1x64xf32>, %19: tensor<5x1x1xf32>, %20: tensor<5x1x10xf32>):  // pred: ^bb1
    // The tensor<i32> value %12 is read back as a scalar and cast to index; it
    // becomes %arg16, the leading index of the flow.tensor.update below.
    %21 = flow.tensor.load %12 : tensor<i32>
    %22 = index_cast %21 : i32 to index
    // Loop body fragment. It captures %0#0 from the setup fragment directly
    // (as %arg13) in addition to the loop-carried block arguments.
    %23:4 = flow.ex.stream.fragment(%arg2 = %c1 : index, %arg3 = %12 : tensor<i32>, %arg4 = %c64 : index, %arg5 = %18 : tensor<5x1x64xf32>, %arg6 = %c74 : index, %arg7 = %17 : tensor<1x10xf32>, %arg8 = %c40 : index, %arg9 = %15 : tensor<74x40xf32>, %arg10 = %c10 : index, %arg11 = %16 : tensor<1x10xf32>, %arg12 = %14 : tensor<40xf32>, %arg13 = %0#0 : tensor<1x10xf32>, %arg14 = %19 : tensor<5x1x1xf32>, %arg15 = %20 : tensor<5x1x10xf32>, %arg16 = %22 : index, %arg17 = %c0 : index) -> (tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>) {
      %25 = flow.dispatch @main_ex_dispatch_14::@main_ex_dispatch_14[%arg2 : index](%arg3) : (tensor<i32>) -> tensor<i32>
      %26 = flow.dispatch @main_ex_dispatch_15::@main_ex_dispatch_15[%arg4 : index](%arg5, %arg3) : (tensor<5x1x64xf32>, tensor<i32>) -> tensor<1x64xf32>
      %27 = flow.dispatch @main_ex_dispatch_16::@main_ex_dispatch_16[%arg6 : index](%26, %arg7) : (tensor<1x64xf32>, tensor<1x10xf32>) -> tensor<1x74xf32>
      %28 = flow.dispatch @main_ex_dispatch_17::@main_ex_dispatch_17[%arg8 : index](%27, %arg9) : (tensor<1x74xf32>, tensor<74x40xf32>) -> tensor<1x40xf32>
      %29 = flow.dispatch @main_ex_dispatch_18::@main_ex_dispatch_18[%arg10 : index](%arg11, %28, %arg12, %arg13, %arg14, %arg3) : (tensor<1x10xf32>, tensor<1x40xf32>, tensor<40xf32>, tensor<1x10xf32>, tensor<5x1x1xf32>, tensor<i32>) -> tensor<1x10xf32>
      %30:2 = flow.dispatch @main_ex_dispatch_19::@main_ex_dispatch_19[%arg10 : index](%arg7, %arg11, %28, %arg12, %arg13, %arg14, %arg3) : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x40xf32>, tensor<40xf32>, tensor<1x10xf32>, tensor<5x1x1xf32>, tensor<i32>) -> (tensor<1x10xf32>, tensor<1x1x10xf32>)
      // The 1x1x10 result of dispatch 19 is combined with the 5x1x10 tensor
      // %arg15 at indices [%arg16, 0, 0] to produce the new 5x1x10 value.
      %31 = flow.tensor.update %30#1, %arg15[%arg16, %arg17, %arg17] : tensor<1x1x10xf32> -> tensor<5x1x10xf32>
      flow.return %25, %29, %30#0, %31 : tensor<i32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x10xf32>
    }
    // Back edge: only the first tensor<i32>, the two tensor<1x10xf32> values and
    // the tensor<5x1x10xf32> value change; the other arguments pass through.
    br ^bb1(%23#0, %13, %14, %15, %23#1, %23#2, %18, %19, %23#3 : tensor<i32>, tensor<i32>, tensor<40xf32>, tensor<74x40xf32>, tensor<1x10xf32>, tensor<1x10xf32>, tensor<5x1x64xf32>, tensor<5x1x1xf32>, tensor<5x1x10xf32>)
  ^bb3(%24: tensor<5x1x10xf32>):  // pred: ^bb1
    return %24 : tensor<5x1x10xf32>
  }
}

// *** IR Dump After Inliner ***
// Dispatch 0 on its own, inside a module that carries the SPIR-V target
// environment (v1.3, Shader capability, 128 max workgroup invocations).
// The function broadcasts the scalar constant 0.0 to a 1x10 tensor and stores
// it to the single @ret0 binding at offset 0.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_0() {
    %c0 = constant 0 : index
    %cst = constant dense<0.000000e+00> : tensor<f32>
    // Empty broadcast_dimensions: the operand is rank-0, so no operand
    // dimension is mapped to a result dimension.
    %0 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<1x10xf32>
    hal.interface.store.tensor %0, @legacy_io::@ret0, offset = %c0 : tensor<1x10xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::TieDynamicShapesPass ***
// The function text matches the function in the "After Inliner" dump: every
// shape here is static, and this pass left the ops as they were.
func @main_ex_dispatch_0() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  %0 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<1x10xf32>
  hal.interface.store.tensor %0, @legacy_io::@ret0, offset = %c0 : tensor<1x10xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::MaterializeShapeCalculationsPass ***
// Still the same four ops as in the previous dump (constant index, constant
// tensor, broadcast, store); no shape-calculation ops appear in this function.
func @main_ex_dispatch_0() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  %0 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<1x10xf32>
  hal.interface.store.tensor %0, @legacy_io::@ret0, offset = %c0 : tensor<1x10xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::HoistShapeCalculations ***
// Function text is identical to the previous dump; op order is unchanged.
func @main_ex_dispatch_0() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  %0 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<1x10xf32>
  hal.interface.store.tensor %0, @legacy_io::@ret0, offset = %c0 : tensor<1x10xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::DecomposeHLOClampPass ***
// Function text is identical to the previous dump; this function contains no
// clamp op, only the broadcast and the store.
func @main_ex_dispatch_0() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  %0 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<1x10xf32>
  hal.interface.store.tensor %0, @legacy_io::@ret0, offset = %c0 : tensor<1x10xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnTensorsPass ***
// The xla_hlo.broadcast_in_dim op is now a linalg.generic on tensors with one
// input and one output over two parallel iterators.
func @main_ex_dispatch_0() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  // Input map (d0, d1) -> () reads the rank-0 operand at every iteration;
  // output map is the identity. The region just yields the input element.
  %0 = linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} %cst {
  ^bb0(%arg0: f32):  // no predecessors
    linalg.yield %arg0 : f32
  }: tensor<f32> -> tensor<1x10xf32>
  hal.interface.store.tensor %0, @legacy_io::@ret0, offset = %c0 : tensor<1x10xf32>
  return
}

// *** IR Dump After LinalgFusionOfTensorOps ***
// Compared with the previous dump, the constant is now a scalar f32 rather
// than a tensor<f32>, and the linalg.generic takes no inputs (args_in = 0):
// its region yields the scalar constant directly.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_0() {
    %c0 = constant 0 : index
    %cst = constant 0.000000e+00 : f32
    // Only the identity output map remains in indexing_maps.
    %0 = linalg.generic {args_in = 0 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]}  {
      linalg.yield %cst : f32
    }:  -> tensor<1x10xf32>
    hal.interface.store.tensor %0, @legacy_io::@ret0, offset = %c0 : tensor<1x10xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnBuffersPass ***
// The function now works on buffers: the tensor result and the
// hal.interface.store.tensor op are gone, and linalg.generic writes into a
// memref placeholder bound to @legacy_io::@ret0.
func @main_ex_dispatch_0() {
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
  // %c0 was the store offset in earlier dumps; nothing in this function uses it.
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  // The region argument %arg0 corresponds to the output buffer element; the
  // body ignores it and yields the constant.
  linalg.generic {args_in = 0 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} %0 {
  ^bb0(%arg0: f32):  // no predecessors
    linalg.yield %cst : f32
  }: memref<1x10xf32>
  return
}

// *** IR Dump After Canonicalizer ***
// Relative to the previous dump, the unused %c0 index constant is gone and the
// f32 constant now precedes the placeholder; the linalg.generic is unchanged.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_0() {
    %cst = constant 0.000000e+00 : f32
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
    linalg.generic {args_in = 0 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} %0 {
    ^bb0(%arg0: f32):  // no predecessors
      linalg.yield %cst : f32
    }: memref<1x10xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Module text is identical to the preceding Canonicalizer dump: there are no
// duplicate expressions in this function for CSE to merge.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_0() {
    %cst = constant 0.000000e+00 : f32
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
    linalg.generic {args_in = 0 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} %0 {
    ^bb0(%arg0: f32):  // no predecessors
      linalg.yield %cst : f32
    }: memref<1x10xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::LinalgTileAndFusePass ***
// The function now carries spv.entry_point_abi with local_size = [32, 4, 1],
// and the linalg.generic is wrapped in an scf.parallel that walks the 1x10
// buffer in steps of 4 (dim 0) and 32 (dim 1).
func @main_ex_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}} {
  %cst = constant 0.000000e+00 : f32
  %c0 = constant 0 : index
  %c4 = constant 4 : index
  %c32 = constant 32 : index
  %c10 = constant 10 : index
  %c1 = constant 1 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
  scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c1, %c10) step (%c4, %c32) {
    // Subview sizes: min(4, 1 - %arg0) and min(32, 10 - %arg1). The first map
    // operand (d0) does not appear in either map result.
    %1 = affine.min affine_map<(d0, d1, d2) -> (4, d1 - d2)>(%c4, %c1, %arg0)
    %2 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c10, %arg1)
    %3 = subview %0[%arg0, %arg1] [%1, %2] [%c1, %c1]  : memref<1x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>
    // The generic op now operates on the subview and is tagged "workitem".
    linalg.generic {args_in = 0 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} {__internal_linalg_transform__ = "workitem"} %3 {
    ^bb0(%arg2: f32):  // no predecessors
      linalg.yield %cst : f32
    }: memref<?x?xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>
    scf.yield
  }
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::SplitDispatchFunctionPass ***
// Module-level dump: the module still holds a single function, and its body is
// the same as in the preceding LinalgTileAndFusePass dump.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}} {
    %cst = constant 0.000000e+00 : f32
    %c0 = constant 0 : index
    %c4 = constant 4 : index
    %c32 = constant 32 : index
    %c10 = constant 10 : index
    %c1 = constant 1 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
    scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c1, %c10) step (%c4, %c32) {
      %1 = affine.min affine_map<(d0, d1, d2) -> (4, d1 - d2)>(%c4, %c1, %arg0)
      %2 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c10, %arg1)
      %3 = subview %0[%arg0, %arg1] [%1, %2] [%c1, %c1]  : memref<1x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>
      linalg.generic {args_in = 0 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} {__internal_linalg_transform__ = "workitem"} %3 {
      ^bb0(%arg2: f32):  // no predecessors
        linalg.yield %cst : f32
      }: memref<?x?xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>
      scf.yield
    }
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::LinalgTileAndFusePass ***
// Second dump labelled LinalgTileAndFusePass; the function text matches the
// first one (same local_size, same scf.parallel, same "workitem" generic).
func @main_ex_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}} {
  %cst = constant 0.000000e+00 : f32
  %c0 = constant 0 : index
  %c4 = constant 4 : index
  %c32 = constant 32 : index
  %c10 = constant 10 : index
  %c1 = constant 1 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
  scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c1, %c10) step (%c4, %c32) {
    %1 = affine.min affine_map<(d0, d1, d2) -> (4, d1 - d2)>(%c4, %c1, %arg0)
    %2 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c10, %arg1)
    %3 = subview %0[%arg0, %arg1] [%1, %2] [%c1, %c1]  : memref<1x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>
    linalg.generic {args_in = 0 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} {__internal_linalg_transform__ = "workitem"} %3 {
    ^bb0(%arg2: f32):  // no predecessors
      linalg.yield %cst : f32
    }: memref<?x?xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>
    scf.yield
  }
  return
}

// *** IR Dump After Canonicalizer ***
// Constant operands have been folded into the affine maps and the subview:
// the affine.min ops take only the induction variable, the subview strides are
// the static [1, 1], and the result layout uses the static row stride 10.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}} {
    %cst = constant 0.000000e+00 : f32
    %c0 = constant 0 : index
    %c4 = constant 4 : index
    %c32 = constant 32 : index
    %c10 = constant 10 : index
    %c1 = constant 1 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
    // The index constants above are still used as the loop bounds and steps.
    scf.parallel (%arg0, %arg1) = (%c0, %c0) to (%c1, %c10) step (%c4, %c32) {
      %1 = affine.min affine_map<(d0) -> (4, -d0 + 1)>(%arg0)
      %2 = affine.min affine_map<(d0) -> (32, -d0 + 10)>(%arg1)
      %3 = subview %0[%arg0, %arg1] [%1, %2] [1, 1]  : memref<1x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
      linalg.generic {args_in = 0 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} {__internal_linalg_transform__ = "workitem"} %3 {
      ^bb0(%arg2: f32):  // no predecessors
        linalg.yield %cst : f32
      }: memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
      scf.yield
    }
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertToGPUPass ***
// The scf.parallel and the linalg.generic are both replaced by scf.for nests.
// Outer nest: start and step are derived from gpu.block_id / gpu.grid_dim
// scaled by 4 (y, dim 0) and 32 (x, dim 1).
// Inner nest: start and step are derived from gpu.thread_id / gpu.block_dim,
// with upper bounds taken from the subview's dimensions.
func @main_ex_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}} {
  %cst = constant 0.000000e+00 : f32
  %c0 = constant 0 : index
  %c4 = constant 4 : index
  %c32 = constant 32 : index
  %c10 = constant 10 : index
  %c1 = constant 1 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
  %1 = "gpu.block_id"() {dimension = "x"} : () -> index
  %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %3 = "gpu.block_id"() {dimension = "y"} : () -> index
  %4 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  // Dim 0: lower bound 0 + 4 * block_id.y, step 4 * grid_dim.y.
  %5 = muli %c4, %3 : index
  %6 = addi %c0, %5 : index
  %7 = muli %c4, %4 : index
  // Dim 1: lower bound 0 + 32 * block_id.x, step 32 * grid_dim.x.
  %8 = muli %c32, %1 : index
  %9 = addi %c0, %8 : index
  %10 = muli %c32, %2 : index
  scf.for %arg0 = %6 to %c1 step %7 {
    scf.for %arg1 = %9 to %c10 step %10 {
      %11 = affine.min affine_map<(d0) -> (4, -d0 + 1)>(%arg0)
      %12 = affine.min affine_map<(d0) -> (32, -d0 + 10)>(%arg1)
      %13 = subview %0[%arg0, %arg1] [%11, %12] [1, 1]  : memref<1x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
      %14 = dim %13, 0 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
      %15 = dim %13, 1 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
      // Identity affine.apply ops: %16 and %17 equal %14 and %15.
      %16 = affine.apply affine_map<(d0) -> (d0)>(%14)
      %17 = affine.apply affine_map<(d0) -> (d0)>(%15)
      %c0_0 = constant 0 : index
      %c1_1 = constant 1 : index
      %c0_2 = constant 0 : index
      %c1_3 = constant 1 : index
      %18 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %19 = "gpu.block_dim"() {dimension = "x"} : () -> index
      %20 = "gpu.thread_id"() {dimension = "y"} : () -> index
      %21 = "gpu.block_dim"() {dimension = "y"} : () -> index
      // Dim 0 within the subview: lower bound 0 + 1 * thread_id.y, step 1 * block_dim.y.
      %22 = muli %c1_1, %20 : index
      %23 = addi %c0_0, %22 : index
      %24 = muli %c1_1, %21 : index
      // Dim 1 within the subview: lower bound 0 + 1 * thread_id.x, step 1 * block_dim.x.
      %25 = muli %c1_3, %18 : index
      %26 = addi %c0_2, %25 : index
      %27 = muli %c1_3, %19 : index
      scf.for %arg2 = %23 to %16 step %24 {
        scf.for %arg3 = %26 to %17 step %27 {
          %28 = affine.apply affine_map<(d0) -> (d0)>(%arg2)
          %29 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
          // The loaded value %30 is not used by any op in this function; only
          // the store of %cst below has an effect on the buffer.
          %30 = load %13[%28, %29] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
          %31 = affine.apply affine_map<(d0) -> (d0)>(%arg2)
          %32 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
          store %cst, %13[%31, %32] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
        }
      }
    }
  }
  return
}

// *** IR Dump After ConvertAffineToStandard ***
// The affine ops are gone: each affine.min is expanded into
// muli-by-(-1) / addi / cmpi "slt" / select, and the identity affine.apply ops
// have been replaced by their operands.
func @main_ex_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}} {
  %cst = constant 0.000000e+00 : f32
  %c0 = constant 0 : index
  %c4 = constant 4 : index
  %c32 = constant 32 : index
  %c10 = constant 10 : index
  %c1 = constant 1 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
  %1 = "gpu.block_id"() {dimension = "x"} : () -> index
  %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %3 = "gpu.block_id"() {dimension = "y"} : () -> index
  %4 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %5 = muli %c4, %3 : index
  %6 = addi %c0, %5 : index
  %7 = muli %c4, %4 : index
  %8 = muli %c32, %1 : index
  %9 = addi %c0, %8 : index
  %10 = muli %c32, %2 : index
  scf.for %arg0 = %6 to %c1 step %7 {
    scf.for %arg1 = %9 to %c10 step %10 {
      // %14 = min(4, 1 - %arg0), computed with a signed compare and select.
      %c4_0 = constant 4 : index
      %c-1 = constant -1 : index
      %11 = muli %arg0, %c-1 : index
      %c1_1 = constant 1 : index
      %12 = addi %11, %c1_1 : index
      %13 = cmpi "slt", %c4_0, %12 : index
      %14 = select %13, %c4_0, %12 : index
      // %18 = min(32, 10 - %arg1), computed the same way.
      %c32_2 = constant 32 : index
      %c-1_3 = constant -1 : index
      %15 = muli %arg1, %c-1_3 : index
      %c10_4 = constant 10 : index
      %16 = addi %15, %c10_4 : index
      %17 = cmpi "slt", %c32_2, %16 : index
      %18 = select %17, %c32_2, %16 : index
      %19 = subview %0[%arg0, %arg1] [%14, %18] [1, 1]  : memref<1x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
      %20 = dim %19, 0 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
      %21 = dim %19, 1 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
      %c0_5 = constant 0 : index
      %c1_6 = constant 1 : index
      %c0_7 = constant 0 : index
      %c1_8 = constant 1 : index
      %22 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %23 = "gpu.block_dim"() {dimension = "x"} : () -> index
      %24 = "gpu.thread_id"() {dimension = "y"} : () -> index
      %25 = "gpu.block_dim"() {dimension = "y"} : () -> index
      %26 = muli %c1_6, %24 : index
      %27 = addi %c0_5, %26 : index
      %28 = muli %c1_6, %25 : index
      %29 = muli %c1_8, %22 : index
      %30 = addi %c0_7, %29 : index
      %31 = muli %c1_8, %23 : index
      scf.for %arg2 = %27 to %20 step %28 {
        scf.for %arg3 = %30 to %21 step %31 {
          // %32 is still loaded here but has no users in this function.
          %32 = load %19[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
          store %cst, %19[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
        }
      }
    }
  }
  return
}

// *** IR Dump After Canonicalizer ***
// Cleanup relative to the ConvertAffineToStandard dump:
//   - the duplicated index constants are merged into one set at function top;
//   - the multiply-by-1 and add-0 ops are gone, so loop bounds use the GPU ids
//     and dims directly;
//   - the dim ops are gone, and the inner loops are bounded by the select
//     results %12 and %16;
//   - the unused load has been removed, leaving only the store.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}} {
    %cst = constant 0.000000e+00 : f32
    %c4 = constant 4 : index
    %c1 = constant 1 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c10 = constant 10 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
    %1 = "gpu.block_id"() {dimension = "x"} : () -> index
    %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %3 = "gpu.block_id"() {dimension = "y"} : () -> index
    %4 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %5 = muli %3, %c4 : index
    %6 = muli %4, %c4 : index
    %7 = muli %1, %c32 : index
    %8 = muli %2, %c32 : index
    scf.for %arg0 = %5 to %c1 step %6 {
      scf.for %arg1 = %7 to %c10 step %8 {
        // %12 = min(4, 1 - %arg0); %16 = min(32, 10 - %arg1).
        %9 = muli %arg0, %c-1 : index
        %10 = addi %9, %c1 : index
        %11 = cmpi "slt", %c4, %10 : index
        %12 = select %11, %c4, %10 : index
        %13 = muli %arg1, %c-1 : index
        %14 = addi %13, %c10 : index
        %15 = cmpi "slt", %c32, %14 : index
        %16 = select %15, %c32, %14 : index
        %17 = subview %0[%arg0, %arg1] [%12, %16] [1, 1]  : memref<1x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
        %18 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %19 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %20 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %21 = "gpu.block_dim"() {dimension = "y"} : () -> index
        scf.for %arg2 = %20 to %12 step %21 {
          scf.for %arg3 = %18 to %16 step %19 {
            store %cst, %17[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
          }
        }
      }
    }
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Module text is identical to the preceding Canonicalizer dump; the constants
// were already unique, so there is nothing further to merge here.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}} {
    %cst = constant 0.000000e+00 : f32
    %c4 = constant 4 : index
    %c1 = constant 1 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c10 = constant 10 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
    %1 = "gpu.block_id"() {dimension = "x"} : () -> index
    %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %3 = "gpu.block_id"() {dimension = "y"} : () -> index
    %4 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %5 = muli %3, %c4 : index
    %6 = muli %4, %c4 : index
    %7 = muli %1, %c32 : index
    %8 = muli %2, %c32 : index
    scf.for %arg0 = %5 to %c1 step %6 {
      scf.for %arg1 = %7 to %c10 step %8 {
        %9 = muli %arg0, %c-1 : index
        %10 = addi %9, %c1 : index
        %11 = cmpi "slt", %c4, %10 : index
        %12 = select %11, %c4, %10 : index
        %13 = muli %arg1, %c-1 : index
        %14 = addi %13, %c10 : index
        %15 = cmpi "slt", %c32, %14 : index
        %16 = select %15, %c32, %14 : index
        %17 = subview %0[%arg0, %arg1] [%12, %16] [1, 1]  : memref<1x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
        %18 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %19 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %20 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %21 = "gpu.block_dim"() {dimension = "y"} : () -> index
        scf.for %arg2 = %20 to %12 step %21 {
          scf.for %arg3 = %18 to %16 step %19 {
            store %cst, %17[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
          }
        }
      }
    }
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ResolveShapeOpsPass ***
// Function text matches the function in the preceding CSE dump. The store
// still goes through the subview %17 at this stage.
func @main_ex_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}} {
  %cst = constant 0.000000e+00 : f32
  %c4 = constant 4 : index
  %c1 = constant 1 : index
  %c32 = constant 32 : index
  %c-1 = constant -1 : index
  %c10 = constant 10 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
  %1 = "gpu.block_id"() {dimension = "x"} : () -> index
  %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %3 = "gpu.block_id"() {dimension = "y"} : () -> index
  %4 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %5 = muli %3, %c4 : index
  %6 = muli %4, %c4 : index
  %7 = muli %1, %c32 : index
  %8 = muli %2, %c32 : index
  scf.for %arg0 = %5 to %c1 step %6 {
    scf.for %arg1 = %7 to %c10 step %8 {
      %9 = muli %arg0, %c-1 : index
      %10 = addi %9, %c1 : index
      %11 = cmpi "slt", %c4, %10 : index
      %12 = select %11, %c4, %10 : index
      %13 = muli %arg1, %c-1 : index
      %14 = addi %13, %c10 : index
      %15 = cmpi "slt", %c32, %14 : index
      %16 = select %15, %c32, %14 : index
      %17 = subview %0[%arg0, %arg1] [%12, %16] [1, 1]  : memref<1x10xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
      %18 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %19 = "gpu.block_dim"() {dimension = "x"} : () -> index
      %20 = "gpu.thread_id"() {dimension = "y"} : () -> index
      %21 = "gpu.block_dim"() {dimension = "y"} : () -> index
      scf.for %arg2 = %20 to %12 step %21 {
        scf.for %arg3 = %18 to %16 step %19 {
          store %cst, %17[%arg2, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 10 + s0 + d1)>>
        }
      }
    }
  }
  return
}

// *** IR Dump After LegalizeStandardForSPIRV ***
// Module after LegalizeStandardForSPIRV. Compared with the preceding
// ResolveShapeOpsPass dump, the subview op is gone: the store now addresses the
// interface buffer %0 directly using (%arg0 + %arg2, %arg1 + %arg3).
// The module attribute carries the SPIR-V target environment (v1.3, Shader,
// SPV_KHR_storage_buffer_storage_class, plus workgroup limits).
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}} {
    %cst = constant 0.000000e+00 : f32
    %c4 = constant 4 : index
    %c1 = constant 1 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c10 = constant 10 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
    %1 = "gpu.block_id"() {dimension = "x"} : () -> index
    %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %3 = "gpu.block_id"() {dimension = "y"} : () -> index
    %4 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    // Workgroup-level loop start/step: dim 0 from block y (x4), dim 1 from block x (x32).
    %5 = muli %3, %c4 : index
    %6 = muli %4, %c4 : index
    %7 = muli %1, %c32 : index
    %8 = muli %2, %c32 : index
    scf.for %arg0 = %5 to %c1 step %6 {
      scf.for %arg1 = %7 to %c10 step %8 {
        // %12 = min(4, 1 - %arg0) and %16 = min(32, 10 - %arg1): clamped tile extents.
        %9 = muli %arg0, %c-1 : index
        %10 = addi %9, %c1 : index
        %11 = cmpi "slt", %c4, %10 : index
        %12 = select %11, %c4, %10 : index
        %13 = muli %arg1, %c-1 : index
        %14 = addi %13, %c10 : index
        %15 = cmpi "slt", %c32, %14 : index
        %16 = select %15, %c32, %14 : index
        %17 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %18 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %19 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %20 = "gpu.block_dim"() {dimension = "y"} : () -> index
        // Thread-level loops: start at thread_id, step by block_dim, bounded by the tile extents.
        scf.for %arg2 = %19 to %12 step %20 {
          scf.for %arg3 = %17 to %16 step %18 {
            // Tile offset + in-tile offset gives the index into the full 1x10 buffer.
            %21 = addi %arg0, %arg2 : index
            %22 = addi %arg1, %arg3 : index
            store %cst, %0[%21, %22] : memref<1x10xf32>
          }
        }
      }
    }
    return
  }
  // Interface description: a single write-only storage buffer at set 0, binding 0.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After Canonicalizer ***
// Module after the Canonicalizer. The IR is textually identical to the
// preceding LegalizeStandardForSPIRV dump: the function still zero-fills the
// 1x10 buffer @ret0 via two workgroup-level and two thread-level scf.for loops.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}} {
    %cst = constant 0.000000e+00 : f32
    %c4 = constant 4 : index
    %c1 = constant 1 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c10 = constant 10 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
    %1 = "gpu.block_id"() {dimension = "x"} : () -> index
    %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %3 = "gpu.block_id"() {dimension = "y"} : () -> index
    %4 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    // %5/%6: start/step for dim 0 (block_id.y * 4, grid_dim.y * 4).
    // %7/%8: start/step for dim 1 (block_id.x * 32, grid_dim.x * 32).
    %5 = muli %3, %c4 : index
    %6 = muli %4, %c4 : index
    %7 = muli %1, %c32 : index
    %8 = muli %2, %c32 : index
    scf.for %arg0 = %5 to %c1 step %6 {
      scf.for %arg1 = %7 to %c10 step %8 {
        // Upper bounds of the thread loops: %12 = min(4, 1 - %arg0), %16 = min(32, 10 - %arg1).
        %9 = muli %arg0, %c-1 : index
        %10 = addi %9, %c1 : index
        %11 = cmpi "slt", %c4, %10 : index
        %12 = select %11, %c4, %10 : index
        %13 = muli %arg1, %c-1 : index
        %14 = addi %13, %c10 : index
        %15 = cmpi "slt", %c32, %14 : index
        %16 = select %15, %c32, %14 : index
        %17 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %18 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %19 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %20 = "gpu.block_dim"() {dimension = "y"} : () -> index
        scf.for %arg2 = %19 to %12 step %20 {
          scf.for %arg3 = %17 to %16 step %18 {
            // Element written: row %arg0 + %arg2, column %arg1 + %arg3.
            %21 = addi %arg0, %arg2 : index
            %22 = addi %arg1, %arg3 : index
            store %cst, %0[%21, %22] : memref<1x10xf32>
          }
        }
      }
    }
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Module after CSE. The IR is textually identical to the preceding
// Canonicalizer dump. This is the last dump in this section that still uses
// std/scf/gpu ops; the next one is expressed in the SPIR-V dialect.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_0() attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}} {
    %cst = constant 0.000000e+00 : f32
    %c4 = constant 4 : index
    %c1 = constant 1 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c10 = constant 10 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x10xf32>
    %1 = "gpu.block_id"() {dimension = "x"} : () -> index
    %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %3 = "gpu.block_id"() {dimension = "y"} : () -> index
    %4 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    // Per-workgroup start offsets (%5, %7) and strides (%6, %8) of the outer loops.
    %5 = muli %3, %c4 : index
    %6 = muli %4, %c4 : index
    %7 = muli %1, %c32 : index
    %8 = muli %2, %c32 : index
    // Outer loops iterate over dim 0 (bound 1) and dim 1 (bound 10) of the buffer.
    scf.for %arg0 = %5 to %c1 step %6 {
      scf.for %arg1 = %7 to %c10 step %8 {
        // min(4, 1 - %arg0)
        %9 = muli %arg0, %c-1 : index
        %10 = addi %9, %c1 : index
        %11 = cmpi "slt", %c4, %10 : index
        %12 = select %11, %c4, %10 : index
        // min(32, 10 - %arg1)
        %13 = muli %arg1, %c-1 : index
        %14 = addi %13, %c10 : index
        %15 = cmpi "slt", %c32, %14 : index
        %16 = select %15, %c32, %14 : index
        %17 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %18 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %19 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %20 = "gpu.block_dim"() {dimension = "y"} : () -> index
        // Inner loops distribute the clamped tile over threads (y over dim 0, x over dim 1).
        scf.for %arg2 = %19 to %12 step %20 {
          scf.for %arg3 = %17 to %16 step %18 {
            %21 = addi %arg0, %arg2 : index
            %22 = addi %arg1, %arg3 : index
            store %cst, %0[%21, %22] : memref<1x10xf32>
          }
        }
      }
    }
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertToSPIRVPass ***
// Module after ConvertToSPIRVPass. The dispatch function is now a spv.func
// nested in a spv.module. Compared with the preceding CSE dump:
//   * the interface buffer is the global @__resource_var_0_0__ at bind(0, 0),
//     typed as a struct wrapping a 10-element f32 array with stride 4;
//   * gpu.block_id / gpu.grid_dim / gpu.thread_id are loads of the WorkgroupId /
//     NumWorkgroups / LocalInvocationId built-in variables;
//   * the gpu.block_dim values are the constants 32 and 4 (%39, %43);
//   * each scf.for is a spv.loop whose ^bb1 tests the induction value, ^bb2 is
//     the body (which also computes the next induction value) and ^bb3 merges.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  spv.module Logical GLSL450 {
    spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
    spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
    spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
    spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
    spv.func @main_ex_dispatch_0() "None" attributes {spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>}} {
      %0 = spv.constant 0.000000e+00 : f32
      %1 = spv.constant 4 : i32
      %2 = spv.constant 1 : i32
      %3 = spv.constant 32 : i32
      %4 = spv.constant -1 : i32
      %5 = spv.constant 10 : i32
      %6 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
      // %9 = WorkgroupId[0], %12 = NumWorkgroups[0], %15 = WorkgroupId[1], %18 = NumWorkgroups[1].
      %7 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %8 = spv.Load "Input" %7 : vector<3xi32>
      %9 = spv.CompositeExtract %8[0 : i32] : vector<3xi32>
      %10 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %11 = spv.Load "Input" %10 : vector<3xi32>
      %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
      %13 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %14 = spv.Load "Input" %13 : vector<3xi32>
      %15 = spv.CompositeExtract %14[1 : i32] : vector<3xi32>
      %16 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %17 = spv.Load "Input" %16 : vector<3xi32>
      %18 = spv.CompositeExtract %17[1 : i32] : vector<3xi32>
      // %19/%20: start/step of the outermost loop; %21/%22: start/step of the second loop.
      %19 = spv.IMul %15, %1 : i32
      %20 = spv.IMul %18, %1 : i32
      %21 = spv.IMul %9, %3 : i32
      %22 = spv.IMul %12, %3 : i32
      // Outermost loop: %23 runs from %19 while %23 < 1, stepping by %20.
      spv.loop {
        spv.Branch ^bb1(%19 : i32)
      ^bb1(%23: i32):  // 2 preds: ^bb0, ^bb2
        %24 = spv.SLessThan %23, %2 : i32
        spv.BranchConditional %24, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Second loop: %26 runs from %21 while %26 < 10, stepping by %22.
        spv.loop {
          spv.Branch ^bb1(%21 : i32)
        ^bb1(%26: i32):  // 2 preds: ^bb0, ^bb2
          %27 = spv.SLessThan %26, %5 : i32
          spv.BranchConditional %27, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // %31 = min(4, 1 - %23) and %35 = min(32, 10 - %26): bounds of the two thread loops.
          %28 = spv.IMul %23, %4 : i32
          %29 = spv.IAdd %28, %2 : i32
          %30 = spv.SLessThan %1, %29 : i32
          %31 = spv.Select %30, %1, %29 : i1, i32
          %32 = spv.IMul %26, %4 : i32
          %33 = spv.IAdd %32, %5 : i32
          %34 = spv.SLessThan %3, %33 : i32
          %35 = spv.Select %34, %3, %33 : i1, i32
          // %38 = LocalInvocationId[0], %42 = LocalInvocationId[1]; %39 and %43 are the thread-loop steps.
          %36 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %37 = spv.Load "Input" %36 : vector<3xi32>
          %38 = spv.CompositeExtract %37[0 : i32] : vector<3xi32>
          %39 = spv.constant 32 : i32
          %40 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %41 = spv.Load "Input" %40 : vector<3xi32>
          %42 = spv.CompositeExtract %41[1 : i32] : vector<3xi32>
          %43 = spv.constant 4 : i32
          // Third loop: %45 runs from %42 while %45 < %31, stepping by 4.
          spv.loop {
            spv.Branch ^bb1(%42 : i32)
          ^bb1(%45: i32):  // 2 preds: ^bb0, ^bb2
            %46 = spv.SLessThan %45, %31 : i32
            spv.BranchConditional %46, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Innermost loop: %48 runs from %38 while %48 < %35, stepping by 32.
            spv.loop {
              spv.Branch ^bb1(%38 : i32)
            ^bb1(%48: i32):  // 2 preds: ^bb0, ^bb2
              %49 = spv.SLessThan %48, %35 : i32
              spv.BranchConditional %49, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Row %50 and column %51 are linearized as 0 + 10 * row + 1 * col (%59),
              // then used to index the array held in struct member 0 (%52).
              %50 = spv.IAdd %23, %45 : i32
              %51 = spv.IAdd %26, %48 : i32
              %52 = spv.constant 0 : i32
              %53 = spv.constant 0 : i32
              %54 = spv.constant 10 : i32
              %55 = spv.IMul %54, %50 : i32
              %56 = spv.IAdd %53, %55 : i32
              %57 = spv.constant 1 : i32
              %58 = spv.IMul %57, %51 : i32
              %59 = spv.IAdd %56, %58 : i32
              %60 = spv.AccessChain %6[%52, %59] : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
              spv.Store "StorageBuffer" %60, %0 : f32
              %61 = spv.IAdd %48, %39 : i32
              spv.Branch ^bb1(%61 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %47 = spv.IAdd %45, %43 : i32
            spv.Branch ^bb1(%47 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %44 = spv.IAdd %26, %22 : i32
          spv.Branch ^bb1(%44 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %25 = spv.IAdd %23, %20 : i32
        spv.Branch ^bb1(%25 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      spv.Return
    }
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After SPIRVLowerABIAttributes ***
// spv.module after SPIRVLowerABIAttributes. Compared with the preceding
// ConvertToSPIRVPass dump, spv.func no longer carries the spv.entry_point_abi
// attribute; instead the module ends with spv.EntryPoint ("GLCompute", listing
// the three built-in interface variables) and spv.ExecutionMode
// ("LocalSize", 32, 4, 1). The function body is unchanged. This dump prints
// only the spv.module, without the enclosing module or hal.interface.
spv.module Logical GLSL450 {
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_0() "None" {
    %0 = spv.constant 0.000000e+00 : f32
    %1 = spv.constant 4 : i32
    %2 = spv.constant 1 : i32
    %3 = spv.constant 32 : i32
    %4 = spv.constant -1 : i32
    %5 = spv.constant 10 : i32
    %6 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
    // %9 = WorkgroupId[0], %12 = NumWorkgroups[0], %15 = WorkgroupId[1], %18 = NumWorkgroups[1].
    %7 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %8 = spv.Load "Input" %7 : vector<3xi32>
    %9 = spv.CompositeExtract %8[0 : i32] : vector<3xi32>
    %10 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %11 = spv.Load "Input" %10 : vector<3xi32>
    %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
    %13 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %14 = spv.Load "Input" %13 : vector<3xi32>
    %15 = spv.CompositeExtract %14[1 : i32] : vector<3xi32>
    %16 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %17 = spv.Load "Input" %16 : vector<3xi32>
    %18 = spv.CompositeExtract %17[1 : i32] : vector<3xi32>
    // Loop start/step values: (%19, %20) for the outermost loop, (%21, %22) for the second.
    %19 = spv.IMul %15, %1 : i32
    %20 = spv.IMul %18, %1 : i32
    %21 = spv.IMul %9, %3 : i32
    %22 = spv.IMul %12, %3 : i32
    // Outermost loop over %23, bounded by 1.
    spv.loop {
      spv.Branch ^bb1(%19 : i32)
    ^bb1(%23: i32):  // 2 preds: ^bb0, ^bb2
      %24 = spv.SLessThan %23, %2 : i32
      spv.BranchConditional %24, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Second loop over %26, bounded by 10.
      spv.loop {
        spv.Branch ^bb1(%21 : i32)
      ^bb1(%26: i32):  // 2 preds: ^bb0, ^bb2
        %27 = spv.SLessThan %26, %5 : i32
        spv.BranchConditional %27, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // %31 = min(4, 1 - %23), %35 = min(32, 10 - %26).
        %28 = spv.IMul %23, %4 : i32
        %29 = spv.IAdd %28, %2 : i32
        %30 = spv.SLessThan %1, %29 : i32
        %31 = spv.Select %30, %1, %29 : i1, i32
        %32 = spv.IMul %26, %4 : i32
        %33 = spv.IAdd %32, %5 : i32
        %34 = spv.SLessThan %3, %33 : i32
        %35 = spv.Select %34, %3, %33 : i1, i32
        // %38 = LocalInvocationId[0], %42 = LocalInvocationId[1].
        %36 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
        %37 = spv.Load "Input" %36 : vector<3xi32>
        %38 = spv.CompositeExtract %37[0 : i32] : vector<3xi32>
        %39 = spv.constant 32 : i32
        %40 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
        %41 = spv.Load "Input" %40 : vector<3xi32>
        %42 = spv.CompositeExtract %41[1 : i32] : vector<3xi32>
        %43 = spv.constant 4 : i32
        // Third loop over %45, from %42 up to %31 in steps of 4.
        spv.loop {
          spv.Branch ^bb1(%42 : i32)
        ^bb1(%45: i32):  // 2 preds: ^bb0, ^bb2
          %46 = spv.SLessThan %45, %31 : i32
          spv.BranchConditional %46, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Innermost loop over %48, from %38 up to %35 in steps of 32.
          spv.loop {
            spv.Branch ^bb1(%38 : i32)
          ^bb1(%48: i32):  // 2 preds: ^bb0, ^bb2
            %49 = spv.SLessThan %48, %35 : i32
            spv.BranchConditional %49, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Linear index %59 = 0 + 10 * %50 + 1 * %51; store 0.0 at that array element.
            %50 = spv.IAdd %23, %45 : i32
            %51 = spv.IAdd %26, %48 : i32
            %52 = spv.constant 0 : i32
            %53 = spv.constant 0 : i32
            %54 = spv.constant 10 : i32
            %55 = spv.IMul %54, %50 : i32
            %56 = spv.IAdd %53, %55 : i32
            %57 = spv.constant 1 : i32
            %58 = spv.IMul %57, %51 : i32
            %59 = spv.IAdd %56, %58 : i32
            %60 = spv.AccessChain %6[%52, %59] : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
            spv.Store "StorageBuffer" %60, %0 : f32
            %61 = spv.IAdd %48, %39 : i32
            spv.Branch ^bb1(%61 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %47 = spv.IAdd %45, %43 : i32
          spv.Branch ^bb1(%47 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %44 = spv.IAdd %26, %22 : i32
        spv.Branch ^bb1(%44 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %25 = spv.IAdd %23, %20 : i32
      spv.Branch ^bb1(%25 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // Entry point declaration and workgroup size.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_0 "LocalSize", 32, 4, 1
}

// *** IR Dump After Canonicalizer ***
// spv.module after the Canonicalizer. Compared with the preceding
// SPIRVLowerABIAttributes dump:
//   * all spv.constant ops now sit at function entry, one per distinct value
//     (0.0, 1, -1, 32, 4, 0, 10);
//   * the linear index is computed as row * 10 + col (%51, %52), without the
//     former "0 +" and "1 *" operations.
// The loop structure and the duplicated built-in address_of/load sequences remain.
spv.module Logical GLSL450 {
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_0() "None" {
    %0 = spv.constant 0.000000e+00 : f32
    %1 = spv.constant 1 : i32
    %2 = spv.constant -1 : i32
    %3 = spv.constant 32 : i32
    %4 = spv.constant 4 : i32
    %5 = spv.constant 0 : i32
    %6 = spv.constant 10 : i32
    %7 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
    // %10 = WorkgroupId[0], %13 = NumWorkgroups[0], %16 = WorkgroupId[1], %19 = NumWorkgroups[1].
    %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %9 = spv.Load "Input" %8 : vector<3xi32>
    %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
    %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %12 = spv.Load "Input" %11 : vector<3xi32>
    %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
    %14 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %15 = spv.Load "Input" %14 : vector<3xi32>
    %16 = spv.CompositeExtract %15[1 : i32] : vector<3xi32>
    %17 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %18 = spv.Load "Input" %17 : vector<3xi32>
    %19 = spv.CompositeExtract %18[1 : i32] : vector<3xi32>
    // Loop start/step values: (%20, %21) for the outermost loop, (%22, %23) for the second.
    %20 = spv.IMul %16, %4 : i32
    %21 = spv.IMul %19, %4 : i32
    %22 = spv.IMul %10, %3 : i32
    %23 = spv.IMul %13, %3 : i32
    // Outermost loop over %24, bounded by 1.
    spv.loop {
      spv.Branch ^bb1(%20 : i32)
    ^bb1(%24: i32):  // 2 preds: ^bb0, ^bb2
      %25 = spv.SLessThan %24, %1 : i32
      spv.BranchConditional %25, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Second loop over %27, bounded by 10.
      spv.loop {
        spv.Branch ^bb1(%22 : i32)
      ^bb1(%27: i32):  // 2 preds: ^bb0, ^bb2
        %28 = spv.SLessThan %27, %6 : i32
        spv.BranchConditional %28, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // %32 = min(4, 1 - %24), %36 = min(32, 10 - %27).
        %29 = spv.IMul %24, %2 : i32
        %30 = spv.IAdd %29, %1 : i32
        %31 = spv.SLessThan %4, %30 : i32
        %32 = spv.Select %31, %4, %30 : i1, i32
        %33 = spv.IMul %27, %2 : i32
        %34 = spv.IAdd %33, %6 : i32
        %35 = spv.SLessThan %3, %34 : i32
        %36 = spv.Select %35, %3, %34 : i1, i32
        // %39 = LocalInvocationId[0], %42 = LocalInvocationId[1].
        %37 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
        %38 = spv.Load "Input" %37 : vector<3xi32>
        %39 = spv.CompositeExtract %38[0 : i32] : vector<3xi32>
        %40 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
        %41 = spv.Load "Input" %40 : vector<3xi32>
        %42 = spv.CompositeExtract %41[1 : i32] : vector<3xi32>
        // Third loop over %44, from %42 up to %32 in steps of 4 (%4).
        spv.loop {
          spv.Branch ^bb1(%42 : i32)
        ^bb1(%44: i32):  // 2 preds: ^bb0, ^bb2
          %45 = spv.SLessThan %44, %32 : i32
          spv.BranchConditional %45, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Innermost loop over %47, from %39 up to %36 in steps of 32 (%3).
          spv.loop {
            spv.Branch ^bb1(%39 : i32)
          ^bb1(%47: i32):  // 2 preds: ^bb0, ^bb2
            %48 = spv.SLessThan %47, %36 : i32
            spv.BranchConditional %48, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Linear index %52 = %49 * 10 + %50; store 0.0 into member 0, element %52.
            %49 = spv.IAdd %24, %44 : i32
            %50 = spv.IAdd %27, %47 : i32
            %51 = spv.IMul %49, %6 : i32
            %52 = spv.IAdd %51, %50 : i32
            %53 = spv.AccessChain %7[%5, %52] : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
            spv.Store "StorageBuffer" %53, %0 : f32
            %54 = spv.IAdd %47, %3 : i32
            spv.Branch ^bb1(%54 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %46 = spv.IAdd %44, %4 : i32
          spv.Branch ^bb1(%46 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %43 = spv.IAdd %27, %23 : i32
        spv.Branch ^bb1(%43 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %26 = spv.IAdd %24, %21 : i32
      spv.Branch ^bb1(%26 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  spv.EntryPoint "GLCompute" @main_ex_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_0 "LocalSize", 32, 4, 1
}

// *** IR Dump After CSE ***
// spv.module after CSE. Compared with the preceding Canonicalizer dump, each
// built-in variable's address is now taken once per region (%8 WorkgroupId,
// %11 NumWorkgroups, %35 LocalInvocationId) and reused; the spv.Load ops on
// those addresses are still emitted twice (once per extracted component).
spv.module Logical GLSL450 {
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_0() "None" {
    %0 = spv.constant 0.000000e+00 : f32
    %1 = spv.constant 1 : i32
    %2 = spv.constant -1 : i32
    %3 = spv.constant 32 : i32
    %4 = spv.constant 4 : i32
    %5 = spv.constant 0 : i32
    %6 = spv.constant 10 : i32
    %7 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
    // %10 = WorkgroupId[0], %13 = NumWorkgroups[0], %15 = WorkgroupId[1], %17 = NumWorkgroups[1].
    %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %9 = spv.Load "Input" %8 : vector<3xi32>
    %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
    %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %12 = spv.Load "Input" %11 : vector<3xi32>
    %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
    %14 = spv.Load "Input" %8 : vector<3xi32>
    %15 = spv.CompositeExtract %14[1 : i32] : vector<3xi32>
    %16 = spv.Load "Input" %11 : vector<3xi32>
    %17 = spv.CompositeExtract %16[1 : i32] : vector<3xi32>
    // Loop start/step values: (%18, %19) for the outermost loop, (%20, %21) for the second.
    %18 = spv.IMul %15, %4 : i32
    %19 = spv.IMul %17, %4 : i32
    %20 = spv.IMul %10, %3 : i32
    %21 = spv.IMul %13, %3 : i32
    // Outermost loop over %22, bounded by 1.
    spv.loop {
      spv.Branch ^bb1(%18 : i32)
    ^bb1(%22: i32):  // 2 preds: ^bb0, ^bb2
      %23 = spv.SLessThan %22, %1 : i32
      spv.BranchConditional %23, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Second loop over %25, bounded by 10.
      spv.loop {
        spv.Branch ^bb1(%20 : i32)
      ^bb1(%25: i32):  // 2 preds: ^bb0, ^bb2
        %26 = spv.SLessThan %25, %6 : i32
        spv.BranchConditional %26, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // %30 = min(4, 1 - %22), %34 = min(32, 10 - %25).
        %27 = spv.IMul %22, %2 : i32
        %28 = spv.IAdd %27, %1 : i32
        %29 = spv.SLessThan %4, %28 : i32
        %30 = spv.Select %29, %4, %28 : i1, i32
        %31 = spv.IMul %25, %2 : i32
        %32 = spv.IAdd %31, %6 : i32
        %33 = spv.SLessThan %3, %32 : i32
        %34 = spv.Select %33, %3, %32 : i1, i32
        // %37 = LocalInvocationId[0], %39 = LocalInvocationId[1].
        %35 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
        %36 = spv.Load "Input" %35 : vector<3xi32>
        %37 = spv.CompositeExtract %36[0 : i32] : vector<3xi32>
        %38 = spv.Load "Input" %35 : vector<3xi32>
        %39 = spv.CompositeExtract %38[1 : i32] : vector<3xi32>
        // Third loop over %41, from %39 up to %30 in steps of 4 (%4).
        spv.loop {
          spv.Branch ^bb1(%39 : i32)
        ^bb1(%41: i32):  // 2 preds: ^bb0, ^bb2
          %42 = spv.SLessThan %41, %30 : i32
          spv.BranchConditional %42, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Innermost loop over %44, from %37 up to %34 in steps of 32 (%3).
          spv.loop {
            spv.Branch ^bb1(%37 : i32)
          ^bb1(%44: i32):  // 2 preds: ^bb0, ^bb2
            %45 = spv.SLessThan %44, %34 : i32
            spv.BranchConditional %45, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Linear index %49 = %46 * 10 + %47; store 0.0 into member 0, element %49.
            %46 = spv.IAdd %22, %41 : i32
            %47 = spv.IAdd %25, %44 : i32
            %48 = spv.IMul %46, %6 : i32
            %49 = spv.IAdd %48, %47 : i32
            %50 = spv.AccessChain %7[%5, %49] : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
            spv.Store "StorageBuffer" %50, %0 : f32
            %51 = spv.IAdd %44, %3 : i32
            spv.Branch ^bb1(%51 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %43 = spv.IAdd %41, %4 : i32
          spv.Branch ^bb1(%43 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %40 = spv.IAdd %25, %21 : i32
        spv.Branch ^bb1(%40 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %24 = spv.IAdd %22, %19 : i32
      spv.Branch ^bb1(%24 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  spv.EntryPoint "GLCompute" @main_ex_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_0 "LocalSize", 32, 4, 1
}

// *** IR Dump After SPIRVUpdateVCE ***
// spv.module after SPIRVUpdateVCE. Compared with the preceding CSE dump, the
// only visible change is the added `requires #spv.vce<v1.0, [Shader],
// [SPV_KHR_storage_buffer_storage_class]>` clause on the spv.module op; the
// globals, function body, entry point and execution mode are unchanged.
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_0() "None" {
    %0 = spv.constant 0.000000e+00 : f32
    %1 = spv.constant 1 : i32
    %2 = spv.constant -1 : i32
    %3 = spv.constant 32 : i32
    %4 = spv.constant 4 : i32
    %5 = spv.constant 0 : i32
    %6 = spv.constant 10 : i32
    %7 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
    // %10 = WorkgroupId[0], %13 = NumWorkgroups[0], %15 = WorkgroupId[1], %17 = NumWorkgroups[1].
    %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %9 = spv.Load "Input" %8 : vector<3xi32>
    %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
    %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %12 = spv.Load "Input" %11 : vector<3xi32>
    %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
    %14 = spv.Load "Input" %8 : vector<3xi32>
    %15 = spv.CompositeExtract %14[1 : i32] : vector<3xi32>
    %16 = spv.Load "Input" %11 : vector<3xi32>
    %17 = spv.CompositeExtract %16[1 : i32] : vector<3xi32>
    // Loop start/step values: (%18, %19) for the outermost loop, (%20, %21) for the second.
    %18 = spv.IMul %15, %4 : i32
    %19 = spv.IMul %17, %4 : i32
    %20 = spv.IMul %10, %3 : i32
    %21 = spv.IMul %13, %3 : i32
    // Outermost loop over %22, bounded by 1.
    spv.loop {
      spv.Branch ^bb1(%18 : i32)
    ^bb1(%22: i32):  // 2 preds: ^bb0, ^bb2
      %23 = spv.SLessThan %22, %1 : i32
      spv.BranchConditional %23, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Second loop over %25, bounded by 10.
      spv.loop {
        spv.Branch ^bb1(%20 : i32)
      ^bb1(%25: i32):  // 2 preds: ^bb0, ^bb2
        %26 = spv.SLessThan %25, %6 : i32
        spv.BranchConditional %26, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // %30 = min(4, 1 - %22), %34 = min(32, 10 - %25).
        %27 = spv.IMul %22, %2 : i32
        %28 = spv.IAdd %27, %1 : i32
        %29 = spv.SLessThan %4, %28 : i32
        %30 = spv.Select %29, %4, %28 : i1, i32
        %31 = spv.IMul %25, %2 : i32
        %32 = spv.IAdd %31, %6 : i32
        %33 = spv.SLessThan %3, %32 : i32
        %34 = spv.Select %33, %3, %32 : i1, i32
        // %37 = LocalInvocationId[0], %39 = LocalInvocationId[1].
        %35 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
        %36 = spv.Load "Input" %35 : vector<3xi32>
        %37 = spv.CompositeExtract %36[0 : i32] : vector<3xi32>
        %38 = spv.Load "Input" %35 : vector<3xi32>
        %39 = spv.CompositeExtract %38[1 : i32] : vector<3xi32>
        // Third loop over %41, from %39 up to %30 in steps of 4 (%4).
        spv.loop {
          spv.Branch ^bb1(%39 : i32)
        ^bb1(%41: i32):  // 2 preds: ^bb0, ^bb2
          %42 = spv.SLessThan %41, %30 : i32
          spv.BranchConditional %42, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Innermost loop over %44, from %37 up to %34 in steps of 32 (%3).
          spv.loop {
            spv.Branch ^bb1(%37 : i32)
          ^bb1(%44: i32):  // 2 preds: ^bb0, ^bb2
            %45 = spv.SLessThan %44, %34 : i32
            spv.BranchConditional %45, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Linear index %49 = %46 * 10 + %47; store 0.0 into member 0, element %49.
            %46 = spv.IAdd %22, %41 : i32
            %47 = spv.IAdd %25, %44 : i32
            %48 = spv.IMul %46, %6 : i32
            %49 = spv.IAdd %48, %47 : i32
            %50 = spv.AccessChain %7[%5, %49] : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
            spv.Store "StorageBuffer" %50, %0 : f32
            %51 = spv.IAdd %44, %3 : i32
            spv.Branch ^bb1(%51 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %43 = spv.IAdd %41, %4 : i32
          spv.Branch ^bb1(%43 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %40 = spv.IAdd %25, %21 : i32
        spv.Branch ^bb1(%40 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %24 = spv.IAdd %22, %19 : i32
      spv.Branch ^bb1(%24 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  spv.EntryPoint "GLCompute" @main_ex_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_0 "LocalSize", 32, 4, 1
}

// *** IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass ***
// Dispatch 0 after executable translation: the "vulkan*" target now carries an
// spv.module next to the HAL interface. The entry point signature is
// () -> tensor<1x10xf32>, the only binding is @ret0, and the SPIR-V function
// stores the constant 0.0 into the 10 x f32 array behind bind(0, 0).
hal.executable @main_ex_dispatch_0 attributes {sym_visibility = "private"} {
  hal.interface @legacy_io {
    hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
  }
  hal.executable.entry_point @main_ex_dispatch_0 attributes {interface = @legacy_io, ordinal = 0 : i32, signature = () -> tensor<1x10xf32>}
  hal.executable.target "vulkan*" {
    module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
      spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
        spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
        spv.func @main_ex_dispatch_0() "None" {
          %0 = spv.constant 0.000000e+00 : f32
          %1 = spv.constant 1 : i32
          %2 = spv.constant -1 : i32
          %3 = spv.constant 32 : i32
          %4 = spv.constant 4 : i32
          %5 = spv.constant 0 : i32
          %6 = spv.constant 10 : i32
          %7 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
          %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
          %9 = spv.Load "Input" %8 : vector<3xi32>
          %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
          %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
          %12 = spv.Load "Input" %11 : vector<3xi32>
          %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
          %14 = spv.Load "Input" %8 : vector<3xi32>
          %15 = spv.CompositeExtract %14[1 : i32] : vector<3xi32>
          %16 = spv.Load "Input" %11 : vector<3xi32>
          %17 = spv.CompositeExtract %16[1 : i32] : vector<3xi32>
          // Workgroup-level loop starts and steps:
          //   %18 = WorkgroupId[1] * 4,  %19 = NumWorkgroups[1] * 4
          //   %20 = WorkgroupId[0] * 32, %21 = NumWorkgroups[0] * 32
          %18 = spv.IMul %15, %4 : i32
          %19 = spv.IMul %17, %4 : i32
          %20 = spv.IMul %10, %3 : i32
          %21 = spv.IMul %13, %3 : i32
          // Outermost loop: %22 starts at %18, runs while %22 < 1, advances by %19.
          spv.loop {
            spv.Branch ^bb1(%18 : i32)
          ^bb1(%22: i32):  // 2 preds: ^bb0, ^bb2
            %23 = spv.SLessThan %22, %1 : i32
            spv.BranchConditional %23, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Second loop: %25 starts at %20, runs while %25 < 10, advances by %21.
            spv.loop {
              spv.Branch ^bb1(%20 : i32)
            ^bb1(%25: i32):  // 2 preds: ^bb0, ^bb2
              %26 = spv.SLessThan %25, %6 : i32
              spv.BranchConditional %26, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop bounds as signed minimums:
              //   %30 = min(4, 1 - %22), %34 = min(32, 10 - %25).
              %27 = spv.IMul %22, %2 : i32
              %28 = spv.IAdd %27, %1 : i32
              %29 = spv.SLessThan %4, %28 : i32
              %30 = spv.Select %29, %4, %28 : i1, i32
              %31 = spv.IMul %25, %2 : i32
              %32 = spv.IAdd %31, %6 : i32
              %33 = spv.SLessThan %3, %32 : i32
              %34 = spv.Select %33, %3, %32 : i1, i32
              // %37 = LocalInvocationId[0], %39 = LocalInvocationId[1].
              %35 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
              %36 = spv.Load "Input" %35 : vector<3xi32>
              %37 = spv.CompositeExtract %36[0 : i32] : vector<3xi32>
              %38 = spv.Load "Input" %35 : vector<3xi32>
              %39 = spv.CompositeExtract %38[1 : i32] : vector<3xi32>
              // Third loop: %41 starts at %39, runs while %41 < %30, advances by 4.
              spv.loop {
                spv.Branch ^bb1(%39 : i32)
              ^bb1(%41: i32):  // 2 preds: ^bb0, ^bb2
                %42 = spv.SLessThan %41, %30 : i32
                spv.BranchConditional %42, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Innermost loop: %44 starts at %37, runs while %44 < %34, advances by 32.
                spv.loop {
                  spv.Branch ^bb1(%37 : i32)
                ^bb1(%44: i32):  // 2 preds: ^bb0, ^bb2
                  %45 = spv.SLessThan %44, %34 : i32
                  spv.BranchConditional %45, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  // Linear element index %49 = (%22 + %41) * 10 + (%25 + %44);
                  // the constant 0.0 (%0) is stored at that element.
                  %46 = spv.IAdd %22, %41 : i32
                  %47 = spv.IAdd %25, %44 : i32
                  %48 = spv.IMul %46, %6 : i32
                  %49 = spv.IAdd %48, %47 : i32
                  %50 = spv.AccessChain %7[%5, %49] : !spv.ptr<!spv.struct<!spv.array<10 x f32, stride=4> [0]>, StorageBuffer>
                  spv.Store "StorageBuffer" %50, %0 : f32
                  %51 = spv.IAdd %44, %3 : i32
                  spv.Branch ^bb1(%51 : i32)
                ^bb3:  // pred: ^bb1
                  spv._merge
                }
                %43 = spv.IAdd %41, %4 : i32
                spv.Branch ^bb1(%43 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %40 = spv.IAdd %25, %21 : i32
              spv.Branch ^bb1(%40 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %24 = spv.IAdd %22, %19 : i32
            spv.Branch ^bb1(%24 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          spv.Return
        }
        // Compute entry point with a declared local size of 32 x 4 x 1.
        spv.EntryPoint "GLCompute" @main_ex_dispatch_0, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
        spv.ExecutionMode @main_ex_dispatch_0 "LocalSize", 32, 4, 1
      }
      hal.interface @legacy_io attributes {sym_visibility = "private"} {
        hal.interface.binding @ret0, set=0, binding=0, type="StorageBuffer", access="Write|Discard"
      }
    }
  }
}

// *** IR Dump After Inliner ***
// Dispatch 1 at the start of its pipeline: load tensor<1x5x2x2xf32> from @arg0,
// reshape it to tensor<1x5x4xf32>, and store the result to @ret0 (both at offset %c0).
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_1() {
    %c0 = constant 0 : index
    %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x2x2xf32>
    %1 = "xla_hlo.reshape"(%0) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
    hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x4xf32>
    return
  }
  // Two storage-buffer bindings in set 0: @arg0 (binding 0, read) and @ret0 (binding 1, write/discard).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::TieDynamicShapesPass ***
// Function-scoped dump; the ops are the same load / xla_hlo.reshape / store
// sequence shown in the preceding Inliner dump.
func @main_ex_dispatch_1() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x2x2xf32>
  %1 = "xla_hlo.reshape"(%0) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x4xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::MaterializeShapeCalculationsPass ***
// Same ops as the previous dump: every type here is statically shaped and the
// function text is unchanged.
func @main_ex_dispatch_1() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x2x2xf32>
  %1 = "xla_hlo.reshape"(%0) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x4xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::HoistShapeCalculations ***
// Function text is unchanged from the previous dump (load, reshape 1x5x2x2 -> 1x5x4, store).
func @main_ex_dispatch_1() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x2x2xf32>
  %1 = "xla_hlo.reshape"(%0) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x4xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::DecomposeHLOClampPass ***
// Function text is unchanged from the previous dump; the only xla_hlo op present
// is the reshape.
func @main_ex_dispatch_1() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x2x2xf32>
  %1 = "xla_hlo.reshape"(%0) : (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x4xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnTensorsPass ***
// The xla_hlo.reshape is now a linalg.tensor_reshape. Its three maps group the
// source dimensions as (d0), (d1), (d2, d3), i.e. the last two source dimensions
// (2 x 2) fold into the single result dimension of size 4.
func @main_ex_dispatch_1() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x2x2xf32>
  %1 = linalg.tensor_reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x4xf32>
  return
}

// *** IR Dump After LinalgFusionOfTensorOps ***
// Module-scoped dump: the function still holds the single linalg.tensor_reshape
// between the interface load and store, and the @legacy_io interface is shown again.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_1() {
    %c0 = constant 0 : index
    %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x2x2xf32>
    %1 = linalg.tensor_reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : tensor<1x5x2x2xf32> into tensor<1x5x4xf32>
    hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x4xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnBuffersPass ***
// Buffer form of dispatch 1: tensors are replaced by memref placeholders tied to
// the interface bindings. The reshape is applied to the @ret0 buffer
// (memref<1x5x4xf32> viewed as memref<1x5x2x2xf32>), and @arg0 is copied into that view.
func @main_ex_dispatch_1() {
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
  %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
  // %c0 has no remaining uses in this function.
  %c0 = constant 0 : index
  %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
  linalg.copy(%2, %1) : memref<1x5x2x2xf32>, memref<1x5x2x2xf32>
  return
}

// *** IR Dump After Canonicalizer ***
// After canonicalization the unused index constant is gone; what remains is the
// two interface placeholders, the reshape of the @ret0 buffer, and the copy.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_1() {
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
    %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
    %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
    linalg.copy(%2, %1) : memref<1x5x2x2xf32>, memref<1x5x2x2xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Module text is identical to the preceding Canonicalizer dump: placeholder for
// @ret0, reshape to 1x5x2x2, placeholder for @arg0, copy from @arg0 into the view.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_1() {
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
    %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
    %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
    linalg.copy(%2, %1) : memref<1x5x2x2xf32>, memref<1x5x2x2xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::LinalgTileAndFusePass ***
// Tiled form of the copy. The function now carries spv.entry_point_abi with
// local_size = [32, 2, 2], and the copy sits inside an scf.parallel over the first
// three dimensions (upper bounds 1, 5, 2) with steps (2, 2, 32).
func @main_ex_dispatch_1() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c5 = constant 5 : index
  %c32 = constant 32 : index
  %c0 = constant 0 : index
  %c2 = constant 2 : index
  %c1 = constant 1 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
  %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
  %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
  scf.parallel (%arg0, %arg1, %arg2) = (%c0, %c0, %c0) to (%c1, %c5, %c2) step (%c2, %c2, %c32) {
    // Each affine.min yields min(step, upper bound - induction variable); only d1
    // and d2 of the map inputs appear in the result expressions.
    %3 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg0)
    %4 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg1)
    %5 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c2, %arg2)
    // Source tile: subview of the @arg0 buffer; the last dimension is taken whole (size %c2).
    %6 = subview %2[%arg0, %arg1, %arg2, %c0] [%3, %4, %5, %c2] [%c1, %c1, %c1, %c1]  : memref<1x5x2x2xf32> to memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
    // The same three sizes are recomputed for the destination tile.
    %7 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg0)
    %8 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg1)
    %9 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c2, %arg2)
    // Destination tile: subview of the reshaped @ret0 buffer at the same offsets.
    %10 = subview %1[%arg0, %arg1, %arg2, %c0] [%7, %8, %9, %c2] [%c1, %c1, %c1, %c1]  : memref<1x5x2x2xf32> to memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
    // The per-tile copy is tagged with the "workitem" transform marker.
    linalg.copy(%6, %10) {__internal_linalg_transform__ = "workitem"} : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
    scf.yield
  }
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::SplitDispatchFunctionPass ***
// Module-scoped dump: still a single function @main_ex_dispatch_1 containing the
// tiled copy from the preceding dump, followed by the @legacy_io interface.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_1() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c32 = constant 32 : index
    %c0 = constant 0 : index
    %c2 = constant 2 : index
    %c1 = constant 1 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
    %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
    %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
    // Tile loop over bounds (1, 5, 2) with steps (2, 2, 32).
    scf.parallel (%arg0, %arg1, %arg2) = (%c0, %c0, %c0) to (%c1, %c5, %c2) step (%c2, %c2, %c32) {
      %3 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg0)
      %4 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg1)
      %5 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c2, %arg2)
      %6 = subview %2[%arg0, %arg1, %arg2, %c0] [%3, %4, %5, %c2] [%c1, %c1, %c1, %c1]  : memref<1x5x2x2xf32> to memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
      %7 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg0)
      %8 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg1)
      %9 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c2, %arg2)
      %10 = subview %1[%arg0, %arg1, %arg2, %c0] [%7, %8, %9, %c2] [%c1, %c1, %c1, %c1]  : memref<1x5x2x2xf32> to memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
      linalg.copy(%6, %10) {__internal_linalg_transform__ = "workitem"} : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
      scf.yield
    }
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::LinalgTileAndFusePass ***
// Second LinalgTileAndFusePass dump of this function; the text matches the first
// one (scf.parallel with steps (2, 2, 32), two subviews, "workitem" copy).
func @main_ex_dispatch_1() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c5 = constant 5 : index
  %c32 = constant 32 : index
  %c0 = constant 0 : index
  %c2 = constant 2 : index
  %c1 = constant 1 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
  %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
  %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
  scf.parallel (%arg0, %arg1, %arg2) = (%c0, %c0, %c0) to (%c1, %c5, %c2) step (%c2, %c2, %c32) {
    %3 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg0)
    %4 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg1)
    %5 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c2, %arg2)
    %6 = subview %2[%arg0, %arg1, %arg2, %c0] [%3, %4, %5, %c2] [%c1, %c1, %c1, %c1]  : memref<1x5x2x2xf32> to memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
    %7 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg0)
    %8 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg1)
    %9 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c2, %arg2)
    %10 = subview %1[%arg0, %arg1, %arg2, %c0] [%7, %8, %9, %c2] [%c1, %c1, %c1, %c1]  : memref<1x5x2x2xf32> to memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
    linalg.copy(%6, %10) {__internal_linalg_transform__ = "workitem"} : memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>, memref<?x?x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)>>
    scf.yield
  }
  return
}

// *** IR Dump After Canonicalizer ***
// Canonicalized tiled copy. Constant operands are folded into the IR: the
// affine.min maps take only the induction variable, the subviews use static
// offset 0 / size 2 / unit strides where those were constants, and the subview
// result type has static strides 20, 4, 2, 1 with only the offset s0 symbolic.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_1() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c32 = constant 32 : index
    %c0 = constant 0 : index
    %c2 = constant 2 : index
    %c1 = constant 1 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
    %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
    %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
    scf.parallel (%arg0, %arg1, %arg2) = (%c0, %c0, %c0) to (%c1, %c5, %c2) step (%c2, %c2, %c32) {
      // Tile sizes: min(2, 1 - %arg0), min(2, 5 - %arg1), min(32, 2 - %arg2).
      %3 = affine.min affine_map<(d0) -> (2, -d0 + 1)>(%arg0)
      %4 = affine.min affine_map<(d0) -> (2, -d0 + 5)>(%arg1)
      %5 = affine.min affine_map<(d0) -> (32, -d0 + 2)>(%arg2)
      %6 = subview %2[%arg0, %arg1, %arg2, 0] [%3, %4, %5, 2] [1, 1, 1, 1]  : memref<1x5x2x2xf32> to memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
      %7 = affine.min affine_map<(d0) -> (2, -d0 + 1)>(%arg0)
      %8 = affine.min affine_map<(d0) -> (2, -d0 + 5)>(%arg1)
      %9 = affine.min affine_map<(d0) -> (32, -d0 + 2)>(%arg2)
      %10 = subview %1[%arg0, %arg1, %arg2, 0] [%7, %8, %9, 2] [1, 1, 1, 1]  : memref<1x5x2x2xf32> to memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
      linalg.copy(%6, %10) {__internal_linalg_transform__ = "workitem"} : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>, memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
      scf.yield
    }
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertToGPUPass ***
// GPU-mapped form. The scf.parallel is replaced by three nested scf.for loops
// whose start and step come from gpu.block_id / gpu.grid_dim, and the linalg.copy
// is replaced by four nested scf.for loops around a scalar load/store, three of
// them driven by gpu.thread_id / gpu.block_dim.
func @main_ex_dispatch_1() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c5 = constant 5 : index
  %c32 = constant 32 : index
  %c0 = constant 0 : index
  %c2 = constant 2 : index
  %c1 = constant 1 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
  %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
  %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
  %3 = "gpu.block_id"() {dimension = "x"} : () -> index
  %4 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %5 = "gpu.block_id"() {dimension = "y"} : () -> index
  %6 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %7 = "gpu.block_id"() {dimension = "z"} : () -> index
  %8 = "gpu.grid_dim"() {dimension = "z"} : () -> index
  // Loop starts and steps per dimension:
  //   %arg0: start 0 + 2 * block_id.z,  step 2 * grid_dim.z
  //   %arg1: start 0 + 2 * block_id.y,  step 2 * grid_dim.y
  //   %arg2: start 0 + 32 * block_id.x, step 32 * grid_dim.x
  %9 = muli %c2, %7 : index
  %10 = addi %c0, %9 : index
  %11 = muli %c2, %8 : index
  %12 = muli %c2, %5 : index
  %13 = addi %c0, %12 : index
  %14 = muli %c2, %6 : index
  %15 = muli %c32, %3 : index
  %16 = addi %c0, %15 : index
  %17 = muli %c32, %4 : index
  scf.for %arg0 = %10 to %c1 step %11 {
    scf.for %arg1 = %13 to %c5 step %14 {
      scf.for %arg2 = %16 to %c2 step %17 {
        // Source tile %21 (from @arg0) and destination tile %25 (from the reshaped
        // @ret0 buffer), each sized by the same three affine.min expressions.
        %18 = affine.min affine_map<(d0) -> (2, -d0 + 1)>(%arg0)
        %19 = affine.min affine_map<(d0) -> (2, -d0 + 5)>(%arg1)
        %20 = affine.min affine_map<(d0) -> (32, -d0 + 2)>(%arg2)
        %21 = subview %2[%arg0, %arg1, %arg2, 0] [%18, %19, %20, 2] [1, 1, 1, 1]  : memref<1x5x2x2xf32> to memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %22 = affine.min affine_map<(d0) -> (2, -d0 + 1)>(%arg0)
        %23 = affine.min affine_map<(d0) -> (2, -d0 + 5)>(%arg1)
        %24 = affine.min affine_map<(d0) -> (32, -d0 + 2)>(%arg2)
        %25 = subview %1[%arg0, %arg1, %arg2, 0] [%22, %23, %24, 2] [1, 1, 1, 1]  : memref<1x5x2x2xf32> to memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        // Dimensions of the source tile; these feed the inner loop upper bounds below.
        %26 = dim %21, 0 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %27 = dim %21, 1 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %28 = dim %21, 2 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %29 = dim %21, 3 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        // Dimensions of the destination tile; %30..%33 have no uses in this function.
        %30 = dim %25, 0 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %31 = dim %25, 1 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %32 = dim %25, 2 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %33 = dim %25, 3 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        // Identity maps: %34..%37 equal %26..%29.
        %34 = affine.apply affine_map<(d0) -> (d0)>(%26)
        %35 = affine.apply affine_map<(d0) -> (d0)>(%27)
        %36 = affine.apply affine_map<(d0) -> (d0)>(%28)
        %37 = affine.apply affine_map<()[s0] -> (s0)>()[%29]
        %c0_0 = constant 0 : index
        %c1_1 = constant 1 : index
        %c0_2 = constant 0 : index
        %c1_3 = constant 1 : index
        %c0_4 = constant 0 : index
        %c1_5 = constant 1 : index
        %c0_6 = constant 0 : index
        %c1_7 = constant 1 : index
        %38 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %39 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %40 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %41 = "gpu.block_dim"() {dimension = "y"} : () -> index
        %42 = "gpu.thread_id"() {dimension = "z"} : () -> index
        %43 = "gpu.block_dim"() {dimension = "z"} : () -> index
        // Per-thread loop starts and steps (all multipliers are the constant 1, offsets 0):
        //   %arg3: start thread_id.z, step block_dim.z
        //   %arg4: start thread_id.y, step block_dim.y
        //   %arg5: start thread_id.x, step block_dim.x
        %44 = muli %c1_1, %42 : index
        %45 = addi %c0_0, %44 : index
        %46 = muli %c1_1, %43 : index
        %47 = muli %c1_3, %40 : index
        %48 = addi %c0_2, %47 : index
        %49 = muli %c1_3, %41 : index
        %50 = muli %c1_5, %38 : index
        %51 = addi %c0_4, %50 : index
        %52 = muli %c1_5, %39 : index
        scf.for %arg3 = %45 to %34 step %46 {
          scf.for %arg4 = %48 to %35 step %49 {
            scf.for %arg5 = %51 to %36 step %52 {
              // Innermost dimension is a plain sequential loop from 0 to %37 with step 1.
              scf.for %arg6 = %c0_6 to %37 step %c1_7 {
                %53 = load %21[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
                store %53, %25[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
              }
            }
          }
        }
      }
    }
  }
  return
}

// *** IR Dump After ConvertAffineToStandard ***
// Affine ops lowered to standard ops. Each affine.min is now a
// muli-by-(-1) / addi / cmpi "slt" / select sequence, and the identity
// affine.apply ops are gone, so the inner loop bounds use the dim results directly.
func @main_ex_dispatch_1() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c5 = constant 5 : index
  %c32 = constant 32 : index
  %c0 = constant 0 : index
  %c2 = constant 2 : index
  %c1 = constant 1 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
  %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
  %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
  %3 = "gpu.block_id"() {dimension = "x"} : () -> index
  %4 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %5 = "gpu.block_id"() {dimension = "y"} : () -> index
  %6 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %7 = "gpu.block_id"() {dimension = "z"} : () -> index
  %8 = "gpu.grid_dim"() {dimension = "z"} : () -> index
  // Outer loop starts/steps: 2 * block_id.z / 2 * grid_dim.z, 2 * block_id.y /
  // 2 * grid_dim.y, 32 * block_id.x / 32 * grid_dim.x (each start has 0 added).
  %9 = muli %c2, %7 : index
  %10 = addi %c0, %9 : index
  %11 = muli %c2, %8 : index
  %12 = muli %c2, %5 : index
  %13 = addi %c0, %12 : index
  %14 = muli %c2, %6 : index
  %15 = muli %c32, %3 : index
  %16 = addi %c0, %15 : index
  %17 = muli %c32, %4 : index
  scf.for %arg0 = %10 to %c1 step %11 {
    scf.for %arg1 = %13 to %c5 step %14 {
      scf.for %arg2 = %16 to %c2 step %17 {
        // %21 = min(2, 1 - %arg0)
        %c2_0 = constant 2 : index
        %c-1 = constant -1 : index
        %18 = muli %arg0, %c-1 : index
        %c1_1 = constant 1 : index
        %19 = addi %18, %c1_1 : index
        %20 = cmpi "slt", %c2_0, %19 : index
        %21 = select %20, %c2_0, %19 : index
        // %25 = min(2, 5 - %arg1)
        %c2_2 = constant 2 : index
        %c-1_3 = constant -1 : index
        %22 = muli %arg1, %c-1_3 : index
        %c5_4 = constant 5 : index
        %23 = addi %22, %c5_4 : index
        %24 = cmpi "slt", %c2_2, %23 : index
        %25 = select %24, %c2_2, %23 : index
        // %29 = min(32, 2 - %arg2)
        %c32_5 = constant 32 : index
        %c-1_6 = constant -1 : index
        %26 = muli %arg2, %c-1_6 : index
        %c2_7 = constant 2 : index
        %27 = addi %26, %c2_7 : index
        %28 = cmpi "slt", %c32_5, %27 : index
        %29 = select %28, %c32_5, %27 : index
        // Source tile taken from the @arg0 buffer.
        %30 = subview %2[%arg0, %arg1, %arg2, 0] [%21, %25, %29, 2] [1, 1, 1, 1]  : memref<1x5x2x2xf32> to memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        // The same three minimums are recomputed as %34, %38, %42 for the destination tile.
        %c2_8 = constant 2 : index
        %c-1_9 = constant -1 : index
        %31 = muli %arg0, %c-1_9 : index
        %c1_10 = constant 1 : index
        %32 = addi %31, %c1_10 : index
        %33 = cmpi "slt", %c2_8, %32 : index
        %34 = select %33, %c2_8, %32 : index
        %c2_11 = constant 2 : index
        %c-1_12 = constant -1 : index
        %35 = muli %arg1, %c-1_12 : index
        %c5_13 = constant 5 : index
        %36 = addi %35, %c5_13 : index
        %37 = cmpi "slt", %c2_11, %36 : index
        %38 = select %37, %c2_11, %36 : index
        %c32_14 = constant 32 : index
        %c-1_15 = constant -1 : index
        %39 = muli %arg2, %c-1_15 : index
        %c2_16 = constant 2 : index
        %40 = addi %39, %c2_16 : index
        %41 = cmpi "slt", %c32_14, %40 : index
        %42 = select %41, %c32_14, %40 : index
        // Destination tile taken from the reshaped @ret0 buffer.
        %43 = subview %1[%arg0, %arg1, %arg2, 0] [%34, %38, %42, 2] [1, 1, 1, 1]  : memref<1x5x2x2xf32> to memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        // Source tile dimensions; used as the upper bounds of the four inner loops.
        %44 = dim %30, 0 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %45 = dim %30, 1 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %46 = dim %30, 2 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %47 = dim %30, 3 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        // Destination tile dimensions; %48..%51 have no uses in this function.
        %48 = dim %43, 0 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %49 = dim %43, 1 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %50 = dim %43, 2 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %51 = dim %43, 3 : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %c0_17 = constant 0 : index
        %c1_18 = constant 1 : index
        %c0_19 = constant 0 : index
        %c1_20 = constant 1 : index
        %c0_21 = constant 0 : index
        %c1_22 = constant 1 : index
        %c0_23 = constant 0 : index
        %c1_24 = constant 1 : index
        %52 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %53 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %54 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %55 = "gpu.block_dim"() {dimension = "y"} : () -> index
        %56 = "gpu.thread_id"() {dimension = "z"} : () -> index
        %57 = "gpu.block_dim"() {dimension = "z"} : () -> index
        // Per-thread loop starts/steps: thread_id.z / block_dim.z for %arg3,
        // thread_id.y / block_dim.y for %arg4, thread_id.x / block_dim.x for %arg5
        // (each multiplied by the constant 1, starts offset by 0).
        %58 = muli %c1_18, %56 : index
        %59 = addi %c0_17, %58 : index
        %60 = muli %c1_18, %57 : index
        %61 = muli %c1_20, %54 : index
        %62 = addi %c0_19, %61 : index
        %63 = muli %c1_20, %55 : index
        %64 = muli %c1_22, %52 : index
        %65 = addi %c0_21, %64 : index
        %66 = muli %c1_22, %53 : index
        scf.for %arg3 = %59 to %44 step %60 {
          scf.for %arg4 = %62 to %45 step %63 {
            scf.for %arg5 = %65 to %46 step %66 {
              // Innermost dimension: sequential loop from 0 to %47 with step 1.
              scf.for %arg6 = %c0_23 to %47 step %c1_24 {
                %67 = load %30[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
                store %67, %43[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
              }
            }
          }
        }
      }
    }
  }
  return
}

// *** IR Dump After Canonicalizer ***
// Dispatch region @main_ex_dispatch_1: element-wise copy from the @arg0
// interface buffer into the @ret0 interface buffer. The outer scf.for nest is
// distributed over workgroups (block_id / grid_dim), the inner nest over
// invocations (thread_id / block_dim).
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_1() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c2 = constant 2 : index
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    // @ret0 is declared as 1x5x4 and reshaped to 1x5x2x2 (the last dimension is
    // split 4 -> 2x2), giving it the same shape as @arg0.
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
    %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
    %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
    // Workgroup-level distribution: each outer loop starts at block_id * tile
    // and advances by grid_dim * tile, with tile sizes 2 (z), 2 (y), 32 (x).
    %3 = "gpu.block_id"() {dimension = "x"} : () -> index
    %4 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %5 = "gpu.block_id"() {dimension = "y"} : () -> index
    %6 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %7 = "gpu.block_id"() {dimension = "z"} : () -> index
    %8 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    %9 = muli %7, %c2 : index
    %10 = muli %8, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %6, %c2 : index
    %13 = muli %3, %c32 : index
    %14 = muli %4, %c32 : index
    scf.for %arg0 = %9 to %c1 step %10 {
      scf.for %arg1 = %11 to %c5 step %12 {
        scf.for %arg2 = %13 to %c2 step %14 {
          // Tile extents clamped to what is left of each dimension:
          //   %18 = min(2, 1 - %arg0), %22 = min(2, 5 - %arg1), %26 = min(32, 2 - %arg2)
          %15 = muli %arg0, %c-1 : index
          %16 = addi %15, %c1 : index
          %17 = cmpi "slt", %c2, %16 : index
          %18 = select %17, %c2, %16 : index
          %19 = muli %arg1, %c-1 : index
          %20 = addi %19, %c5 : index
          %21 = cmpi "slt", %c2, %20 : index
          %22 = select %21, %c2, %20 : index
          %23 = muli %arg2, %c-1 : index
          %24 = addi %23, %c2 : index
          %25 = cmpi "slt", %c32, %24 : index
          %26 = select %25, %c32, %24 : index
          // Source tile: view into @arg0 at the current tile offsets.
          %27 = subview %2[%arg0, %arg1, %arg2, 0] [%18, %22, %26, 2] [1, 1, 1, 1]  : memref<1x5x2x2xf32> to memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
          // The same three extents are computed a second time (%31, %35, %39)
          // for the destination tile; the ops mirror %15..%26 exactly.
          %28 = muli %arg0, %c-1 : index
          %29 = addi %28, %c1 : index
          %30 = cmpi "slt", %c2, %29 : index
          %31 = select %30, %c2, %29 : index
          %32 = muli %arg1, %c-1 : index
          %33 = addi %32, %c5 : index
          %34 = cmpi "slt", %c2, %33 : index
          %35 = select %34, %c2, %33 : index
          %36 = muli %arg2, %c-1 : index
          %37 = addi %36, %c2 : index
          %38 = cmpi "slt", %c32, %37 : index
          %39 = select %38, %c32, %37 : index
          // Destination tile: view into the reshaped @ret0 at the same offsets.
          %40 = subview %1[%arg0, %arg1, %arg2, 0] [%31, %35, %39, 2] [1, 1, 1, 1]  : memref<1x5x2x2xf32> to memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
          // Invocation-level distribution: each inner loop starts at thread_id
          // and steps by block_dim (z -> %arg3, y -> %arg4, x -> %arg5).
          %41 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %42 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %43 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %44 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %45 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %46 = "gpu.block_dim"() {dimension = "z"} : () -> index
          scf.for %arg3 = %45 to %18 step %46 {
            scf.for %arg4 = %43 to %22 step %44 {
              scf.for %arg5 = %41 to %26 step %42 {
                // The static last dimension (0..2, step 1) is not distributed.
                scf.for %arg6 = %c0 to %c2 step %c1 {
                  %47 = load %27[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
                  store %47, %40[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
                }
              }
            }
          }
        }
      }
    }
    return
  }
  // Binding table for the dispatch: both are storage buffers in set 0;
  // @arg0 is read-only, @ret0 is write/discard.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Dispatch region @main_ex_dispatch_1 (tiled element-wise copy @arg0 -> @ret0).
// In this dump the clamped tile extents are computed once (%18, %22, %26) and
// are shared by both the source and the destination subview.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_1() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c2 = constant 2 : index
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    // Output buffer (1x5x4) viewed as 1x5x2x2; input buffer is already 1x5x2x2.
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
    %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
    %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
    // Outer-loop lower bounds / steps: block_id * tile and grid_dim * tile,
    // with tile = 2 for z and y, 32 for x.
    %3 = "gpu.block_id"() {dimension = "x"} : () -> index
    %4 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %5 = "gpu.block_id"() {dimension = "y"} : () -> index
    %6 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %7 = "gpu.block_id"() {dimension = "z"} : () -> index
    %8 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    %9 = muli %7, %c2 : index
    %10 = muli %8, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %6, %c2 : index
    %13 = muli %3, %c32 : index
    %14 = muli %4, %c32 : index
    scf.for %arg0 = %9 to %c1 step %10 {
      scf.for %arg1 = %11 to %c5 step %12 {
        scf.for %arg2 = %13 to %c2 step %14 {
          // %18 = min(2, 1 - %arg0)
          %15 = muli %arg0, %c-1 : index
          %16 = addi %15, %c1 : index
          %17 = cmpi "slt", %c2, %16 : index
          %18 = select %17, %c2, %16 : index
          // %22 = min(2, 5 - %arg1)
          %19 = muli %arg1, %c-1 : index
          %20 = addi %19, %c5 : index
          %21 = cmpi "slt", %c2, %20 : index
          %22 = select %21, %c2, %20 : index
          // %26 = min(32, 2 - %arg2)
          %23 = muli %arg2, %c-1 : index
          %24 = addi %23, %c2 : index
          %25 = cmpi "slt", %c32, %24 : index
          %26 = select %25, %c32, %24 : index
          // Source (%27, from @arg0) and destination (%28, from reshaped @ret0)
          // tiles use identical offsets, sizes and unit strides.
          %27 = subview %2[%arg0, %arg1, %arg2, 0] [%18, %22, %26, 2] [1, 1, 1, 1]  : memref<1x5x2x2xf32> to memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
          %28 = subview %1[%arg0, %arg1, %arg2, 0] [%18, %22, %26, 2] [1, 1, 1, 1]  : memref<1x5x2x2xf32> to memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
          // Inner loops: start at thread_id, step by block_dim, bounded by the
          // clamped tile extents.
          %29 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %30 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %31 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %32 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %33 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %34 = "gpu.block_dim"() {dimension = "z"} : () -> index
          scf.for %arg3 = %33 to %18 step %34 {
            scf.for %arg4 = %31 to %22 step %32 {
              scf.for %arg5 = %29 to %26 step %30 {
                // Last dimension is iterated in full (0..2) by every invocation.
                scf.for %arg6 = %c0 to %c2 step %c1 {
                  %35 = load %27[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
                  store %35, %28[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
                }
              }
            }
          }
        }
      }
    }
    return
  }
  // Interface bindings: @arg0 (binding 0, read) and @ret0 (binding 1, write/discard).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ResolveShapeOpsPass ***
// Function-level dump of @main_ex_dispatch_1 (no enclosing module is printed
// here). The body is a tiled element-wise copy from @arg0 into the reshaped
// @ret0; all memref shapes other than the subview results are static.
func @main_ex_dispatch_1() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c5 = constant 5 : index
  %c32 = constant 32 : index
  %c-1 = constant -1 : index
  %c2 = constant 2 : index
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  // %1: @ret0 (1x5x4) reshaped to 1x5x2x2; %2: @arg0 (1x5x2x2).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
  %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
  %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
  // Workgroup distribution: lower bound = block_id * tile, step = grid_dim * tile
  // (tile 2 for z/y, 32 for x).
  %3 = "gpu.block_id"() {dimension = "x"} : () -> index
  %4 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %5 = "gpu.block_id"() {dimension = "y"} : () -> index
  %6 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %7 = "gpu.block_id"() {dimension = "z"} : () -> index
  %8 = "gpu.grid_dim"() {dimension = "z"} : () -> index
  %9 = muli %7, %c2 : index
  %10 = muli %8, %c2 : index
  %11 = muli %5, %c2 : index
  %12 = muli %6, %c2 : index
  %13 = muli %3, %c32 : index
  %14 = muli %4, %c32 : index
  scf.for %arg0 = %9 to %c1 step %10 {
    scf.for %arg1 = %11 to %c5 step %12 {
      scf.for %arg2 = %13 to %c2 step %14 {
        // Clamped tile extents:
        //   %18 = min(2, 1 - %arg0), %22 = min(2, 5 - %arg1), %26 = min(32, 2 - %arg2)
        %15 = muli %arg0, %c-1 : index
        %16 = addi %15, %c1 : index
        %17 = cmpi "slt", %c2, %16 : index
        %18 = select %17, %c2, %16 : index
        %19 = muli %arg1, %c-1 : index
        %20 = addi %19, %c5 : index
        %21 = cmpi "slt", %c2, %20 : index
        %22 = select %21, %c2, %20 : index
        %23 = muli %arg2, %c-1 : index
        %24 = addi %23, %c2 : index
        %25 = cmpi "slt", %c32, %24 : index
        %26 = select %25, %c32, %24 : index
        // %27: source tile of @arg0; %28: destination tile of reshaped @ret0.
        %27 = subview %2[%arg0, %arg1, %arg2, 0] [%18, %22, %26, 2] [1, 1, 1, 1]  : memref<1x5x2x2xf32> to memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        %28 = subview %1[%arg0, %arg1, %arg2, 0] [%18, %22, %26, 2] [1, 1, 1, 1]  : memref<1x5x2x2xf32> to memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
        // Invocation distribution: lower bound = thread_id, step = block_dim.
        %29 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %30 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %31 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %32 = "gpu.block_dim"() {dimension = "y"} : () -> index
        %33 = "gpu.thread_id"() {dimension = "z"} : () -> index
        %34 = "gpu.block_dim"() {dimension = "z"} : () -> index
        scf.for %arg3 = %33 to %18 step %34 {
          scf.for %arg4 = %31 to %22 step %32 {
            scf.for %arg5 = %29 to %26 step %30 {
              // Sequential loop over the static last dimension (size 2).
              scf.for %arg6 = %c0 to %c2 step %c1 {
                %35 = load %27[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
                store %35, %28[%arg3, %arg4, %arg5, %arg6] : memref<?x?x?x2xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 20 + s0 + d1 * 4 + d2 * 2 + d3)>>
              }
            }
          }
        }
      }
    }
  }
  return
}

// *** IR Dump After LegalizeStandardForSPIRV ***
// Dispatch region @main_ex_dispatch_1 (tiled element-wise copy @arg0 -> @ret0).
// No subview ops appear in this dump: the load and store address the base
// memrefs %2 and %1 directly, using (tile offset + intra-tile index) per dim.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_1() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c2 = constant 2 : index
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    // %1: @ret0 (1x5x4) reshaped to 1x5x2x2; %2: @arg0 (1x5x2x2).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
    %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
    %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
    // Workgroup distribution: lower bound = block_id * tile, step = grid_dim * tile.
    %3 = "gpu.block_id"() {dimension = "x"} : () -> index
    %4 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %5 = "gpu.block_id"() {dimension = "y"} : () -> index
    %6 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %7 = "gpu.block_id"() {dimension = "z"} : () -> index
    %8 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    %9 = muli %7, %c2 : index
    %10 = muli %8, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %6, %c2 : index
    %13 = muli %3, %c32 : index
    %14 = muli %4, %c32 : index
    scf.for %arg0 = %9 to %c1 step %10 {
      scf.for %arg1 = %11 to %c5 step %12 {
        scf.for %arg2 = %13 to %c2 step %14 {
          // Clamped tile extents, used only as inner-loop upper bounds here:
          //   %18 = min(2, 1 - %arg0), %22 = min(2, 5 - %arg1), %26 = min(32, 2 - %arg2)
          %15 = muli %arg0, %c-1 : index
          %16 = addi %15, %c1 : index
          %17 = cmpi "slt", %c2, %16 : index
          %18 = select %17, %c2, %16 : index
          %19 = muli %arg1, %c-1 : index
          %20 = addi %19, %c5 : index
          %21 = cmpi "slt", %c2, %20 : index
          %22 = select %21, %c2, %20 : index
          %23 = muli %arg2, %c-1 : index
          %24 = addi %23, %c2 : index
          %25 = cmpi "slt", %c32, %24 : index
          %26 = select %25, %c32, %24 : index
          // Invocation distribution: lower bound = thread_id, step = block_dim.
          %27 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %28 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %29 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %30 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %31 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %32 = "gpu.block_dim"() {dimension = "z"} : () -> index
          scf.for %arg3 = %31 to %18 step %32 {
            scf.for %arg4 = %29 to %22 step %30 {
              scf.for %arg5 = %27 to %26 step %28 {
                scf.for %arg6 = %c0 to %c2 step %c1 {
                  // Absolute indices for the load: tile offset + intra-tile index.
                  %33 = addi %arg0, %arg3 : index
                  %34 = addi %arg1, %arg4 : index
                  %35 = addi %arg2, %arg5 : index
                  %36 = load %2[%33, %34, %35, %arg6] : memref<1x5x2x2xf32>
                  // The same three sums are recomputed for the store.
                  %37 = addi %arg0, %arg3 : index
                  %38 = addi %arg1, %arg4 : index
                  %39 = addi %arg2, %arg5 : index
                  store %36, %1[%37, %38, %39, %arg6] : memref<1x5x2x2xf32>
                }
              }
            }
          }
        }
      }
    }
    return
  }
  // Interface bindings: @arg0 (binding 0, read) and @ret0 (binding 1, write/discard).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After Canonicalizer ***
// Dispatch region @main_ex_dispatch_1 (tiled element-wise copy @arg0 -> @ret0),
// with loads/stores addressing the full 1x5x2x2 memrefs directly. The index
// sums for the load (%33..%35) and for the store (%37..%39) are still separate
// ops in this dump.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_1() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c2 = constant 2 : index
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    // Destination: @ret0 (1x5x4) seen through a 1x5x2x2 reshape. Source: @arg0.
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
    %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
    %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
    // Per-dimension (start, step) pairs for the outer loops:
    //   z: (%9, %10) = (block_id, grid_dim) * 2
    //   y: (%11, %12) = (block_id, grid_dim) * 2
    //   x: (%13, %14) = (block_id, grid_dim) * 32
    %3 = "gpu.block_id"() {dimension = "x"} : () -> index
    %4 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %5 = "gpu.block_id"() {dimension = "y"} : () -> index
    %6 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %7 = "gpu.block_id"() {dimension = "z"} : () -> index
    %8 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    %9 = muli %7, %c2 : index
    %10 = muli %8, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %6, %c2 : index
    %13 = muli %3, %c32 : index
    %14 = muli %4, %c32 : index
    scf.for %arg0 = %9 to %c1 step %10 {
      scf.for %arg1 = %11 to %c5 step %12 {
        scf.for %arg2 = %13 to %c2 step %14 {
          // Inner-loop upper bounds (remaining extent, capped at the tile size):
          //   %18 = min(2, 1 - %arg0), %22 = min(2, 5 - %arg1), %26 = min(32, 2 - %arg2)
          %15 = muli %arg0, %c-1 : index
          %16 = addi %15, %c1 : index
          %17 = cmpi "slt", %c2, %16 : index
          %18 = select %17, %c2, %16 : index
          %19 = muli %arg1, %c-1 : index
          %20 = addi %19, %c5 : index
          %21 = cmpi "slt", %c2, %20 : index
          %22 = select %21, %c2, %20 : index
          %23 = muli %arg2, %c-1 : index
          %24 = addi %23, %c2 : index
          %25 = cmpi "slt", %c32, %24 : index
          %26 = select %25, %c32, %24 : index
          // Inner loops start at thread_id and step by block_dim.
          %27 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %28 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %29 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %30 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %31 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %32 = "gpu.block_dim"() {dimension = "z"} : () -> index
          scf.for %arg3 = %31 to %18 step %32 {
            scf.for %arg4 = %29 to %22 step %30 {
              scf.for %arg5 = %27 to %26 step %28 {
                scf.for %arg6 = %c0 to %c2 step %c1 {
                  // Load address: (outer iv + inner iv) per distributed dim.
                  %33 = addi %arg0, %arg3 : index
                  %34 = addi %arg1, %arg4 : index
                  %35 = addi %arg2, %arg5 : index
                  %36 = load %2[%33, %34, %35, %arg6] : memref<1x5x2x2xf32>
                  // Store address: same sums, computed again.
                  %37 = addi %arg0, %arg3 : index
                  %38 = addi %arg1, %arg4 : index
                  %39 = addi %arg2, %arg5 : index
                  store %36, %1[%37, %38, %39, %arg6] : memref<1x5x2x2xf32>
                }
              }
            }
          }
        }
      }
    }
    return
  }
  // Interface bindings: @arg0 (binding 0, read) and @ret0 (binding 1, write/discard).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Dispatch region @main_ex_dispatch_1 (tiled element-wise copy @arg0 -> @ret0).
// In this dump a single set of index sums (%33, %34, %35) feeds both the load
// and the store.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_1() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c2 = constant 2 : index
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    // %1 is the store target (@ret0 reshaped 1x5x4 -> 1x5x2x2); %2 is the load source (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x4xf32>
    %1 = linalg.reshape %0 [affine_map<(d0, d1, d2, d3) -> (d0)>, affine_map<(d0, d1, d2, d3) -> (d1)>, affine_map<(d0, d1, d2, d3) -> (d2, d3)>] : memref<1x5x4xf32> into memref<1x5x2x2xf32>
    %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x2x2xf32>
    // Outer loops: start = block_id * tile, step = grid_dim * tile (tile 2/2/32 for z/y/x).
    %3 = "gpu.block_id"() {dimension = "x"} : () -> index
    %4 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %5 = "gpu.block_id"() {dimension = "y"} : () -> index
    %6 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %7 = "gpu.block_id"() {dimension = "z"} : () -> index
    %8 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    %9 = muli %7, %c2 : index
    %10 = muli %8, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %6, %c2 : index
    %13 = muli %3, %c32 : index
    %14 = muli %4, %c32 : index
    scf.for %arg0 = %9 to %c1 step %10 {
      scf.for %arg1 = %11 to %c5 step %12 {
        scf.for %arg2 = %13 to %c2 step %14 {
          // Upper bounds for the inner loops:
          //   %18 = min(2, 1 - %arg0), %22 = min(2, 5 - %arg1), %26 = min(32, 2 - %arg2)
          %15 = muli %arg0, %c-1 : index
          %16 = addi %15, %c1 : index
          %17 = cmpi "slt", %c2, %16 : index
          %18 = select %17, %c2, %16 : index
          %19 = muli %arg1, %c-1 : index
          %20 = addi %19, %c5 : index
          %21 = cmpi "slt", %c2, %20 : index
          %22 = select %21, %c2, %20 : index
          %23 = muli %arg2, %c-1 : index
          %24 = addi %23, %c2 : index
          %25 = cmpi "slt", %c32, %24 : index
          %26 = select %25, %c32, %24 : index
          // Inner loops: start = thread_id, step = block_dim.
          %27 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %28 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %29 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %30 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %31 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %32 = "gpu.block_dim"() {dimension = "z"} : () -> index
          scf.for %arg3 = %31 to %18 step %32 {
            scf.for %arg4 = %29 to %22 step %30 {
              scf.for %arg5 = %27 to %26 step %28 {
                scf.for %arg6 = %c0 to %c2 step %c1 {
                  // Shared element coordinates: outer iv + inner iv per dimension.
                  %33 = addi %arg0, %arg3 : index
                  %34 = addi %arg1, %arg4 : index
                  %35 = addi %arg2, %arg5 : index
                  %36 = load %2[%33, %34, %35, %arg6] : memref<1x5x2x2xf32>
                  store %36, %1[%33, %34, %35, %arg6] : memref<1x5x2x2xf32>
                }
              }
            }
          }
        }
      }
    }
    return
  }
  // Interface bindings: @arg0 (binding 0, read) and @ret0 (binding 1, write/discard).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertToSPIRVPass ***
// SPIR-V dialect form of @main_ex_dispatch_1: seven nested spv.loop regions
// that load one f32 from the binding-0 buffer and store it to the same linear
// index of the binding-1 buffer. All index arithmetic is i32.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  spv.module Logical GLSL450 {
    // Built-in inputs used for loop distribution.
    spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
    spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
    spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
    // Two storage buffers, each a struct wrapping a flat array of 20 f32
    // (20 = 1*5*2*2). The bind(0, 0) / bind(0, 1) numbers match set=0,
    // binding=0 (@arg0) and binding=1 (@ret0) in the hal.interface below.
    spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    spv.func @main_ex_dispatch_1() "None" attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
      %0 = spv.constant 5 : i32
      %1 = spv.constant 32 : i32
      %2 = spv.constant -1 : i32
      %3 = spv.constant 2 : i32
      %4 = spv.constant 0 : i32
      %5 = spv.constant 1 : i32
      // %6: binding-1 buffer (store destination); %7: binding-0 buffer (load source).
      %6 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
      %7 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
      // Each WorkgroupId / NumWorkgroups component is fetched with its own
      // address_of + Load + CompositeExtract sequence (components 0, 1, 2).
      %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %9 = spv.Load "Input" %8 : vector<3xi32>
      %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
      %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %12 = spv.Load "Input" %11 : vector<3xi32>
      %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
      %14 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %15 = spv.Load "Input" %14 : vector<3xi32>
      %16 = spv.CompositeExtract %15[1 : i32] : vector<3xi32>
      %17 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %18 = spv.Load "Input" %17 : vector<3xi32>
      %19 = spv.CompositeExtract %18[1 : i32] : vector<3xi32>
      %20 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %21 = spv.Load "Input" %20 : vector<3xi32>
      %22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
      %23 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %24 = spv.Load "Input" %23 : vector<3xi32>
      %25 = spv.CompositeExtract %24[2 : i32] : vector<3xi32>
      // Outer-loop (start, step) pairs: component 2 scaled by 2 (%26, %27),
      // component 1 scaled by 2 (%28, %29), component 0 scaled by 32 (%30, %31).
      %26 = spv.IMul %22, %3 : i32
      %27 = spv.IMul %25, %3 : i32
      %28 = spv.IMul %16, %3 : i32
      %29 = spv.IMul %19, %3 : i32
      %30 = spv.IMul %10, %1 : i32
      %31 = spv.IMul %13, %1 : i32
      // Every spv.loop below has the same shape: ^bb1 tests iv < bound,
      // ^bb2 holds the body followed by the increment and back-edge to ^bb1,
      // ^bb3 is the merge block.
      // Loop 1: iv %32 from %26 while < 1, step %27.
      spv.loop {
        spv.Branch ^bb1(%26 : i32)
      ^bb1(%32: i32):  // 2 preds: ^bb0, ^bb2
        %33 = spv.SLessThan %32, %5 : i32
        spv.BranchConditional %33, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Loop 2: iv %35 from %28 while < 5, step %29.
        spv.loop {
          spv.Branch ^bb1(%28 : i32)
        ^bb1(%35: i32):  // 2 preds: ^bb0, ^bb2
          %36 = spv.SLessThan %35, %0 : i32
          spv.BranchConditional %36, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Loop 3: iv %38 from %30 while < 2, step %31.
          spv.loop {
            spv.Branch ^bb1(%30 : i32)
          ^bb1(%38: i32):  // 2 preds: ^bb0, ^bb2
            %39 = spv.SLessThan %38, %3 : i32
            spv.BranchConditional %39, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner-loop upper bounds:
            //   %43 = min(2, 1 - %32), %47 = min(2, 5 - %35), %51 = min(32, 2 - %38)
            %40 = spv.IMul %32, %2 : i32
            %41 = spv.IAdd %40, %5 : i32
            %42 = spv.SLessThan %3, %41 : i32
            %43 = spv.Select %42, %3, %41 : i1, i32
            %44 = spv.IMul %35, %2 : i32
            %45 = spv.IAdd %44, %0 : i32
            %46 = spv.SLessThan %3, %45 : i32
            %47 = spv.Select %46, %3, %45 : i1, i32
            %48 = spv.IMul %38, %2 : i32
            %49 = spv.IAdd %48, %3 : i32
            %50 = spv.SLessThan %1, %49 : i32
            %51 = spv.Select %50, %1, %49 : i1, i32
            // Inner-loop starts are LocalInvocationId components (%54, %58, %62);
            // inner-loop steps are the constants %55 = 32, %59 = 2, %63 = 2,
            // the same values as local_size = [32, 2, 2] on the function.
            %52 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %53 = spv.Load "Input" %52 : vector<3xi32>
            %54 = spv.CompositeExtract %53[0 : i32] : vector<3xi32>
            %55 = spv.constant 32 : i32
            %56 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %57 = spv.Load "Input" %56 : vector<3xi32>
            %58 = spv.CompositeExtract %57[1 : i32] : vector<3xi32>
            %59 = spv.constant 2 : i32
            %60 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %61 = spv.Load "Input" %60 : vector<3xi32>
            %62 = spv.CompositeExtract %61[2 : i32] : vector<3xi32>
            %63 = spv.constant 2 : i32
            // Loop 4: iv %65 from %62 while < %43, step %63.
            spv.loop {
              spv.Branch ^bb1(%62 : i32)
            ^bb1(%65: i32):  // 2 preds: ^bb0, ^bb2
              %66 = spv.SLessThan %65, %43 : i32
              spv.BranchConditional %66, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Loop 5: iv %68 from %58 while < %47, step %59.
              spv.loop {
                spv.Branch ^bb1(%58 : i32)
              ^bb1(%68: i32):  // 2 preds: ^bb0, ^bb2
                %69 = spv.SLessThan %68, %47 : i32
                spv.BranchConditional %69, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Loop 6: iv %71 from %54 while < %51, step %55.
                spv.loop {
                  spv.Branch ^bb1(%54 : i32)
                ^bb1(%71: i32):  // 2 preds: ^bb0, ^bb2
                  %72 = spv.SLessThan %71, %51 : i32
                  spv.BranchConditional %72, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  // Loop 7: iv %74 from 0 while < 2, step 1.
                  spv.loop {
                    spv.Branch ^bb1(%4 : i32)
                  ^bb1(%74: i32):  // 2 preds: ^bb0, ^bb2
                    %75 = spv.SLessThan %74, %3 : i32
                    spv.BranchConditional %75, ^bb2, ^bb3
                  ^bb2:  // pred: ^bb1
                    // Element coordinates: outer iv + inner iv per dimension.
                    %76 = spv.IAdd %32, %65 : i32
                    %77 = spv.IAdd %35, %68 : i32
                    %78 = spv.IAdd %38, %71 : i32
                    // Load address: linear index %92 = 0 + 20*%76 + 4*%77 + 2*%78 + 1*%74
                    // into member 0 (%79) of the binding-0 struct.
                    %79 = spv.constant 0 : i32
                    %80 = spv.constant 0 : i32
                    %81 = spv.constant 20 : i32
                    %82 = spv.IMul %81, %76 : i32
                    %83 = spv.IAdd %80, %82 : i32
                    %84 = spv.constant 4 : i32
                    %85 = spv.IMul %84, %77 : i32
                    %86 = spv.IAdd %83, %85 : i32
                    %87 = spv.constant 2 : i32
                    %88 = spv.IMul %87, %78 : i32
                    %89 = spv.IAdd %86, %88 : i32
                    %90 = spv.constant 1 : i32
                    %91 = spv.IMul %90, %74 : i32
                    %92 = spv.IAdd %89, %91 : i32
                    %93 = spv.AccessChain %7[%79, %92] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
                    %94 = spv.Load "StorageBuffer" %93 : f32
                    // Store address: the same linear index is recomputed as %108
                    // and applied to the binding-1 struct.
                    %95 = spv.constant 0 : i32
                    %96 = spv.constant 0 : i32
                    %97 = spv.constant 20 : i32
                    %98 = spv.IMul %97, %76 : i32
                    %99 = spv.IAdd %96, %98 : i32
                    %100 = spv.constant 4 : i32
                    %101 = spv.IMul %100, %77 : i32
                    %102 = spv.IAdd %99, %101 : i32
                    %103 = spv.constant 2 : i32
                    %104 = spv.IMul %103, %78 : i32
                    %105 = spv.IAdd %102, %104 : i32
                    %106 = spv.constant 1 : i32
                    %107 = spv.IMul %106, %74 : i32
                    %108 = spv.IAdd %105, %107 : i32
                    %109 = spv.AccessChain %6[%95, %108] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
                    spv.Store "StorageBuffer" %109, %94 : f32
                    %110 = spv.IAdd %74, %5 : i32
                    spv.Branch ^bb1(%110 : i32)
                  ^bb3:  // pred: ^bb1
                    spv._merge
                  }
                  %73 = spv.IAdd %71, %55 : i32
                  spv.Branch ^bb1(%73 : i32)
                ^bb3:  // pred: ^bb1
                  spv._merge
                }
                %70 = spv.IAdd %68, %59 : i32
                spv.Branch ^bb1(%70 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %67 = spv.IAdd %65, %63 : i32
              spv.Branch ^bb1(%67 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %64 = spv.IAdd %38, %31 : i32
            spv.Branch ^bb1(%64 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %37 = spv.IAdd %35, %29 : i32
          spv.Branch ^bb1(%37 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %34 = spv.IAdd %32, %27 : i32
        spv.Branch ^bb1(%34 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      spv.Return
    }
  }
  // Interface bindings: @arg0 (binding 0, read) and @ret0 (binding 1, write/discard).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After SPIRVLowerABIAttributes ***
// SPIR-V module holding one GLCompute entry point, @main_ex_dispatch_1.
// Each innermost iteration loads one f32 from the buffer at binding (0, 0) and
// stores it to the buffer at binding (0, 1) at the same linear index.
spv.module Logical GLSL450 {
  // Built-in inputs read by the kernel.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Two storage buffers, each a struct wrapping an array of 20 f32 (stride 4).
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_1() "None" {
    %0 = spv.constant 5 : i32
    %1 = spv.constant 32 : i32
    %2 = spv.constant -1 : i32
    %3 = spv.constant 2 : i32
    %4 = spv.constant 0 : i32
    %5 = spv.constant 1 : i32
    // %6 is the store destination (binding 1); %7 is the load source (binding 0).
    %6 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    %7 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    // Extract WorkgroupId / NumWorkgroups components 0, 1 and 2. The address-of
    // and load are repeated per component at this stage.
    %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %9 = spv.Load "Input" %8 : vector<3xi32>
    %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
    %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %12 = spv.Load "Input" %11 : vector<3xi32>
    %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
    %14 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %15 = spv.Load "Input" %14 : vector<3xi32>
    %16 = spv.CompositeExtract %15[1 : i32] : vector<3xi32>
    %17 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %18 = spv.Load "Input" %17 : vector<3xi32>
    %19 = spv.CompositeExtract %18[1 : i32] : vector<3xi32>
    %20 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %21 = spv.Load "Input" %20 : vector<3xi32>
    %22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
    %23 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %24 = spv.Load "Input" %23 : vector<3xi32>
    %25 = spv.CompositeExtract %24[2 : i32] : vector<3xi32>
    // Start offsets (WorkgroupId * k) and steps (NumWorkgroups * k) for the three
    // outer loops, with k = 2 for components 2 and 1 and k = 32 for component 0.
    %26 = spv.IMul %22, %3 : i32
    %27 = spv.IMul %25, %3 : i32
    %28 = spv.IMul %16, %3 : i32
    %29 = spv.IMul %19, %3 : i32
    %30 = spv.IMul %10, %1 : i32
    %31 = spv.IMul %13, %1 : i32
    // Outer loop 1: %32 runs from %26 while %32 < 1, stepping by %27.
    spv.loop {
      spv.Branch ^bb1(%26 : i32)
    ^bb1(%32: i32):  // 2 preds: ^bb0, ^bb2
      %33 = spv.SLessThan %32, %5 : i32
      spv.BranchConditional %33, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Outer loop 2: %35 runs from %28 while %35 < 5, stepping by %29.
      spv.loop {
        spv.Branch ^bb1(%28 : i32)
      ^bb1(%35: i32):  // 2 preds: ^bb0, ^bb2
        %36 = spv.SLessThan %35, %0 : i32
        spv.BranchConditional %36, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Outer loop 3: %38 runs from %30 while %38 < 2, stepping by %31.
        spv.loop {
          spv.Branch ^bb1(%30 : i32)
        ^bb1(%38: i32):  // 2 preds: ^bb0, ^bb2
          %39 = spv.SLessThan %38, %3 : i32
          spv.BranchConditional %39, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Inner-loop upper bounds, each a signed min built from SLessThan + Select:
          //   %43 = min(2, 1 - %32), %47 = min(2, 5 - %35), %51 = min(32, 2 - %38).
          %40 = spv.IMul %32, %2 : i32
          %41 = spv.IAdd %40, %5 : i32
          %42 = spv.SLessThan %3, %41 : i32
          %43 = spv.Select %42, %3, %41 : i1, i32
          %44 = spv.IMul %35, %2 : i32
          %45 = spv.IAdd %44, %0 : i32
          %46 = spv.SLessThan %3, %45 : i32
          %47 = spv.Select %46, %3, %45 : i1, i32
          %48 = spv.IMul %38, %2 : i32
          %49 = spv.IAdd %48, %3 : i32
          %50 = spv.SLessThan %1, %49 : i32
          %51 = spv.Select %50, %1, %49 : i1, i32
          // LocalInvocationId components give the inner-loop start values; the
          // constants 32, 2, 2 (%55, %59, %63) are the inner-loop steps and equal
          // the LocalSize declared at the bottom of the module.
          %52 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %53 = spv.Load "Input" %52 : vector<3xi32>
          %54 = spv.CompositeExtract %53[0 : i32] : vector<3xi32>
          %55 = spv.constant 32 : i32
          %56 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %57 = spv.Load "Input" %56 : vector<3xi32>
          %58 = spv.CompositeExtract %57[1 : i32] : vector<3xi32>
          %59 = spv.constant 2 : i32
          %60 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %61 = spv.Load "Input" %60 : vector<3xi32>
          %62 = spv.CompositeExtract %61[2 : i32] : vector<3xi32>
          %63 = spv.constant 2 : i32
          // Inner loop 1: %65 runs from LocalInvocationId[2] while %65 < %43, step 2.
          spv.loop {
            spv.Branch ^bb1(%62 : i32)
          ^bb1(%65: i32):  // 2 preds: ^bb0, ^bb2
            %66 = spv.SLessThan %65, %43 : i32
            spv.BranchConditional %66, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner loop 2: %68 runs from LocalInvocationId[1] while %68 < %47, step 2.
            spv.loop {
              spv.Branch ^bb1(%58 : i32)
            ^bb1(%68: i32):  // 2 preds: ^bb0, ^bb2
              %69 = spv.SLessThan %68, %47 : i32
              spv.BranchConditional %69, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop 3: %71 runs from LocalInvocationId[0] while %71 < %51, step 32.
              spv.loop {
                spv.Branch ^bb1(%54 : i32)
              ^bb1(%71: i32):  // 2 preds: ^bb0, ^bb2
                %72 = spv.SLessThan %71, %51 : i32
                spv.BranchConditional %72, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Innermost loop: %74 runs over 0 and 1 (while %74 < 2, step 1).
                spv.loop {
                  spv.Branch ^bb1(%4 : i32)
                ^bb1(%74: i32):  // 2 preds: ^bb0, ^bb2
                  %75 = spv.SLessThan %74, %3 : i32
                  spv.BranchConditional %75, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  // Combine outer and inner induction variables per dimension.
                  %76 = spv.IAdd %32, %65 : i32
                  %77 = spv.IAdd %35, %68 : i32
                  %78 = spv.IAdd %38, %71 : i32
                  // Source index %92 = 0 + 20*%76 + 4*%77 + 2*%78 + 1*%74.
                  %79 = spv.constant 0 : i32
                  %80 = spv.constant 0 : i32
                  %81 = spv.constant 20 : i32
                  %82 = spv.IMul %81, %76 : i32
                  %83 = spv.IAdd %80, %82 : i32
                  %84 = spv.constant 4 : i32
                  %85 = spv.IMul %84, %77 : i32
                  %86 = spv.IAdd %83, %85 : i32
                  %87 = spv.constant 2 : i32
                  %88 = spv.IMul %87, %78 : i32
                  %89 = spv.IAdd %86, %88 : i32
                  %90 = spv.constant 1 : i32
                  %91 = spv.IMul %90, %74 : i32
                  %92 = spv.IAdd %89, %91 : i32
                  %93 = spv.AccessChain %7[%79, %92] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
                  %94 = spv.Load "StorageBuffer" %93 : f32
                  // Destination index %108 is recomputed with the same formula as %92.
                  %95 = spv.constant 0 : i32
                  %96 = spv.constant 0 : i32
                  %97 = spv.constant 20 : i32
                  %98 = spv.IMul %97, %76 : i32
                  %99 = spv.IAdd %96, %98 : i32
                  %100 = spv.constant 4 : i32
                  %101 = spv.IMul %100, %77 : i32
                  %102 = spv.IAdd %99, %101 : i32
                  %103 = spv.constant 2 : i32
                  %104 = spv.IMul %103, %78 : i32
                  %105 = spv.IAdd %102, %104 : i32
                  %106 = spv.constant 1 : i32
                  %107 = spv.IMul %106, %74 : i32
                  %108 = spv.IAdd %105, %107 : i32
                  %109 = spv.AccessChain %6[%95, %108] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
                  spv.Store "StorageBuffer" %109, %94 : f32
                  %110 = spv.IAdd %74, %5 : i32
                  spv.Branch ^bb1(%110 : i32)
                ^bb3:  // pred: ^bb1
                  spv._merge
                }
                %73 = spv.IAdd %71, %55 : i32
                spv.Branch ^bb1(%73 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %70 = spv.IAdd %68, %59 : i32
              spv.Branch ^bb1(%70 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %67 = spv.IAdd %65, %63 : i32
            spv.Branch ^bb1(%67 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %64 = spv.IAdd %38, %31 : i32
          spv.Branch ^bb1(%64 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %37 = spv.IAdd %35, %29 : i32
        spv.Branch ^bb1(%37 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %34 = spv.IAdd %32, %27 : i32
      spv.Branch ^bb1(%34 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // Entry point declaration with its interface variables, and a fixed local
  // workgroup size of 32 x 2 x 2.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_1, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_1 "LocalSize", 32, 2, 2
}

// *** IR Dump After Canonicalizer ***
// SPIR-V module holding one GLCompute entry point, @main_ex_dispatch_1.
// Each innermost iteration loads one f32 from the buffer at binding (0, 0) and
// stores it to the buffer at binding (0, 1) at the same linear index.
spv.module Logical GLSL450 {
  // Built-in inputs read by the kernel.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Two storage buffers, each a struct wrapping an array of 20 f32 (stride 4).
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_1() "None" {
    // All integer constants used by the function are defined once here.
    %0 = spv.constant 5 : i32
    %1 = spv.constant -1 : i32
    %2 = spv.constant 1 : i32
    %3 = spv.constant 32 : i32
    %4 = spv.constant 0 : i32
    %5 = spv.constant 20 : i32
    %6 = spv.constant 4 : i32
    %7 = spv.constant 2 : i32
    // %8 is the store destination (binding 1); %9 is the load source (binding 0).
    %8 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    %9 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    // Extract WorkgroupId / NumWorkgroups components 0, 1 and 2. The address-of
    // and load are still repeated per component in this dump.
    %10 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %11 = spv.Load "Input" %10 : vector<3xi32>
    %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
    %13 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %14 = spv.Load "Input" %13 : vector<3xi32>
    %15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
    %16 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %17 = spv.Load "Input" %16 : vector<3xi32>
    %18 = spv.CompositeExtract %17[1 : i32] : vector<3xi32>
    %19 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %20 = spv.Load "Input" %19 : vector<3xi32>
    %21 = spv.CompositeExtract %20[1 : i32] : vector<3xi32>
    %22 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %23 = spv.Load "Input" %22 : vector<3xi32>
    %24 = spv.CompositeExtract %23[2 : i32] : vector<3xi32>
    %25 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %26 = spv.Load "Input" %25 : vector<3xi32>
    %27 = spv.CompositeExtract %26[2 : i32] : vector<3xi32>
    // Start offsets (WorkgroupId * k) and steps (NumWorkgroups * k) for the three
    // outer loops, with k = 2 for components 2 and 1 and k = 32 for component 0.
    %28 = spv.IMul %24, %7 : i32
    %29 = spv.IMul %27, %7 : i32
    %30 = spv.IMul %18, %7 : i32
    %31 = spv.IMul %21, %7 : i32
    %32 = spv.IMul %12, %3 : i32
    %33 = spv.IMul %15, %3 : i32
    // Outer loop 1: %34 runs from %28 while %34 < 1, stepping by %29.
    spv.loop {
      spv.Branch ^bb1(%28 : i32)
    ^bb1(%34: i32):  // 2 preds: ^bb0, ^bb2
      %35 = spv.SLessThan %34, %2 : i32
      spv.BranchConditional %35, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Outer loop 2: %37 runs from %30 while %37 < 5, stepping by %31.
      spv.loop {
        spv.Branch ^bb1(%30 : i32)
      ^bb1(%37: i32):  // 2 preds: ^bb0, ^bb2
        %38 = spv.SLessThan %37, %0 : i32
        spv.BranchConditional %38, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Outer loop 3: %40 runs from %32 while %40 < 2, stepping by %33.
        spv.loop {
          spv.Branch ^bb1(%32 : i32)
        ^bb1(%40: i32):  // 2 preds: ^bb0, ^bb2
          %41 = spv.SLessThan %40, %7 : i32
          spv.BranchConditional %41, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Inner-loop upper bounds, each a signed min built from SLessThan + Select:
          //   %45 = min(2, 1 - %34), %49 = min(2, 5 - %37), %53 = min(32, 2 - %40).
          %42 = spv.IMul %34, %1 : i32
          %43 = spv.IAdd %42, %2 : i32
          %44 = spv.SLessThan %7, %43 : i32
          %45 = spv.Select %44, %7, %43 : i1, i32
          %46 = spv.IMul %37, %1 : i32
          %47 = spv.IAdd %46, %0 : i32
          %48 = spv.SLessThan %7, %47 : i32
          %49 = spv.Select %48, %7, %47 : i1, i32
          %50 = spv.IMul %40, %1 : i32
          %51 = spv.IAdd %50, %7 : i32
          %52 = spv.SLessThan %3, %51 : i32
          %53 = spv.Select %52, %3, %51 : i1, i32
          // LocalInvocationId components give the inner-loop start values.
          %54 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %55 = spv.Load "Input" %54 : vector<3xi32>
          %56 = spv.CompositeExtract %55[0 : i32] : vector<3xi32>
          %57 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %58 = spv.Load "Input" %57 : vector<3xi32>
          %59 = spv.CompositeExtract %58[1 : i32] : vector<3xi32>
          %60 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %61 = spv.Load "Input" %60 : vector<3xi32>
          %62 = spv.CompositeExtract %61[2 : i32] : vector<3xi32>
          // Inner loop 1: %64 runs from LocalInvocationId[2] while %64 < %45, step 2.
          spv.loop {
            spv.Branch ^bb1(%62 : i32)
          ^bb1(%64: i32):  // 2 preds: ^bb0, ^bb2
            %65 = spv.SLessThan %64, %45 : i32
            spv.BranchConditional %65, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner loop 2: %67 runs from LocalInvocationId[1] while %67 < %49, step 2.
            spv.loop {
              spv.Branch ^bb1(%59 : i32)
            ^bb1(%67: i32):  // 2 preds: ^bb0, ^bb2
              %68 = spv.SLessThan %67, %49 : i32
              spv.BranchConditional %68, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop 3: %70 runs from LocalInvocationId[0] while %70 < %53, step 32.
              spv.loop {
                spv.Branch ^bb1(%56 : i32)
              ^bb1(%70: i32):  // 2 preds: ^bb0, ^bb2
                %71 = spv.SLessThan %70, %53 : i32
                spv.BranchConditional %71, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Innermost loop: %73 runs over 0 and 1 (while %73 < 2, step 1).
                spv.loop {
                  spv.Branch ^bb1(%4 : i32)
                ^bb1(%73: i32):  // 2 preds: ^bb0, ^bb2
                  %74 = spv.SLessThan %73, %7 : i32
                  spv.BranchConditional %74, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  // Combine outer and inner induction variables per dimension.
                  %75 = spv.IAdd %34, %64 : i32
                  %76 = spv.IAdd %37, %67 : i32
                  %77 = spv.IAdd %40, %70 : i32
                  // Source index %83 = 20*%75 + 4*%76 + 2*%77 + %73.
                  %78 = spv.IMul %75, %5 : i32
                  %79 = spv.IMul %76, %6 : i32
                  %80 = spv.IAdd %78, %79 : i32
                  %81 = spv.IMul %77, %7 : i32
                  %82 = spv.IAdd %80, %81 : i32
                  %83 = spv.IAdd %82, %73 : i32
                  %84 = spv.AccessChain %9[%4, %83] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
                  %85 = spv.Load "StorageBuffer" %84 : f32
                  // Destination index %91 is recomputed with the same formula as %83.
                  %86 = spv.IMul %75, %5 : i32
                  %87 = spv.IMul %76, %6 : i32
                  %88 = spv.IAdd %86, %87 : i32
                  %89 = spv.IMul %77, %7 : i32
                  %90 = spv.IAdd %88, %89 : i32
                  %91 = spv.IAdd %90, %73 : i32
                  %92 = spv.AccessChain %8[%4, %91] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
                  spv.Store "StorageBuffer" %92, %85 : f32
                  %93 = spv.IAdd %73, %2 : i32
                  spv.Branch ^bb1(%93 : i32)
                ^bb3:  // pred: ^bb1
                  spv._merge
                }
                %72 = spv.IAdd %70, %3 : i32
                spv.Branch ^bb1(%72 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %69 = spv.IAdd %67, %7 : i32
              spv.Branch ^bb1(%69 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %66 = spv.IAdd %64, %7 : i32
            spv.Branch ^bb1(%66 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %63 = spv.IAdd %40, %33 : i32
          spv.Branch ^bb1(%63 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %39 = spv.IAdd %37, %31 : i32
        spv.Branch ^bb1(%39 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %36 = spv.IAdd %34, %29 : i32
      spv.Branch ^bb1(%36 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // Entry point declaration with its interface variables, and a fixed local
  // workgroup size of 32 x 2 x 2.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_1, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_1 "LocalSize", 32, 2, 2
}

// *** IR Dump After CSE ***
// SPIR-V module holding one GLCompute entry point, @main_ex_dispatch_1.
// Each innermost iteration loads one f32 from the buffer at binding (0, 0) and
// stores it to the buffer at binding (0, 1) at the same linear index.
spv.module Logical GLSL450 {
  // Built-in inputs read by the kernel.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Two storage buffers, each a struct wrapping an array of 20 f32 (stride 4).
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_1() "None" {
    // All integer constants used by the function are defined once here.
    %0 = spv.constant 5 : i32
    %1 = spv.constant -1 : i32
    %2 = spv.constant 1 : i32
    %3 = spv.constant 32 : i32
    %4 = spv.constant 0 : i32
    %5 = spv.constant 20 : i32
    %6 = spv.constant 4 : i32
    %7 = spv.constant 2 : i32
    // %8 is the store destination (binding 1); %9 is the load source (binding 0).
    %8 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    %9 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    // A single address-of per builtin (%10, %13) feeds every load below; the
    // loads themselves are still issued once per extracted component.
    %10 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %11 = spv.Load "Input" %10 : vector<3xi32>
    %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
    %13 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %14 = spv.Load "Input" %13 : vector<3xi32>
    %15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
    %16 = spv.Load "Input" %10 : vector<3xi32>
    %17 = spv.CompositeExtract %16[1 : i32] : vector<3xi32>
    %18 = spv.Load "Input" %13 : vector<3xi32>
    %19 = spv.CompositeExtract %18[1 : i32] : vector<3xi32>
    %20 = spv.Load "Input" %10 : vector<3xi32>
    %21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
    %22 = spv.Load "Input" %13 : vector<3xi32>
    %23 = spv.CompositeExtract %22[2 : i32] : vector<3xi32>
    // Start offsets (WorkgroupId * k) and steps (NumWorkgroups * k) for the three
    // outer loops, with k = 2 for components 2 and 1 and k = 32 for component 0.
    %24 = spv.IMul %21, %7 : i32
    %25 = spv.IMul %23, %7 : i32
    %26 = spv.IMul %17, %7 : i32
    %27 = spv.IMul %19, %7 : i32
    %28 = spv.IMul %12, %3 : i32
    %29 = spv.IMul %15, %3 : i32
    // Outer loop 1: %30 runs from %24 while %30 < 1, stepping by %25.
    spv.loop {
      spv.Branch ^bb1(%24 : i32)
    ^bb1(%30: i32):  // 2 preds: ^bb0, ^bb2
      %31 = spv.SLessThan %30, %2 : i32
      spv.BranchConditional %31, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Outer loop 2: %33 runs from %26 while %33 < 5, stepping by %27.
      spv.loop {
        spv.Branch ^bb1(%26 : i32)
      ^bb1(%33: i32):  // 2 preds: ^bb0, ^bb2
        %34 = spv.SLessThan %33, %0 : i32
        spv.BranchConditional %34, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Outer loop 3: %36 runs from %28 while %36 < 2, stepping by %29.
        spv.loop {
          spv.Branch ^bb1(%28 : i32)
        ^bb1(%36: i32):  // 2 preds: ^bb0, ^bb2
          %37 = spv.SLessThan %36, %7 : i32
          spv.BranchConditional %37, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Inner-loop upper bounds, each a signed min built from SLessThan + Select:
          //   %41 = min(2, 1 - %30), %45 = min(2, 5 - %33), %49 = min(32, 2 - %36).
          %38 = spv.IMul %30, %1 : i32
          %39 = spv.IAdd %38, %2 : i32
          %40 = spv.SLessThan %7, %39 : i32
          %41 = spv.Select %40, %7, %39 : i1, i32
          %42 = spv.IMul %33, %1 : i32
          %43 = spv.IAdd %42, %0 : i32
          %44 = spv.SLessThan %7, %43 : i32
          %45 = spv.Select %44, %7, %43 : i1, i32
          %46 = spv.IMul %36, %1 : i32
          %47 = spv.IAdd %46, %7 : i32
          %48 = spv.SLessThan %3, %47 : i32
          %49 = spv.Select %48, %3, %47 : i1, i32
          // LocalInvocationId components give the inner-loop start values.
          %50 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %51 = spv.Load "Input" %50 : vector<3xi32>
          %52 = spv.CompositeExtract %51[0 : i32] : vector<3xi32>
          %53 = spv.Load "Input" %50 : vector<3xi32>
          %54 = spv.CompositeExtract %53[1 : i32] : vector<3xi32>
          %55 = spv.Load "Input" %50 : vector<3xi32>
          %56 = spv.CompositeExtract %55[2 : i32] : vector<3xi32>
          // Inner loop 1: %58 runs from LocalInvocationId[2] while %58 < %41, step 2.
          spv.loop {
            spv.Branch ^bb1(%56 : i32)
          ^bb1(%58: i32):  // 2 preds: ^bb0, ^bb2
            %59 = spv.SLessThan %58, %41 : i32
            spv.BranchConditional %59, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner loop 2: %61 runs from LocalInvocationId[1] while %61 < %45, step 2.
            spv.loop {
              spv.Branch ^bb1(%54 : i32)
            ^bb1(%61: i32):  // 2 preds: ^bb0, ^bb2
              %62 = spv.SLessThan %61, %45 : i32
              spv.BranchConditional %62, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop 3: %64 runs from LocalInvocationId[0] while %64 < %49, step 32.
              spv.loop {
                spv.Branch ^bb1(%52 : i32)
              ^bb1(%64: i32):  // 2 preds: ^bb0, ^bb2
                %65 = spv.SLessThan %64, %49 : i32
                spv.BranchConditional %65, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Innermost loop: %67 runs over 0 and 1 (while %67 < 2, step 1).
                spv.loop {
                  spv.Branch ^bb1(%4 : i32)
                ^bb1(%67: i32):  // 2 preds: ^bb0, ^bb2
                  %68 = spv.SLessThan %67, %7 : i32
                  spv.BranchConditional %68, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  // Combine outer and inner induction variables per dimension.
                  %69 = spv.IAdd %30, %58 : i32
                  %70 = spv.IAdd %33, %61 : i32
                  %71 = spv.IAdd %36, %64 : i32
                  // Linear index %77 = 20*%69 + 4*%70 + 2*%71 + %67, shared by the
                  // load and the store below.
                  %72 = spv.IMul %69, %5 : i32
                  %73 = spv.IMul %70, %6 : i32
                  %74 = spv.IAdd %72, %73 : i32
                  %75 = spv.IMul %71, %7 : i32
                  %76 = spv.IAdd %74, %75 : i32
                  %77 = spv.IAdd %76, %67 : i32
                  %78 = spv.AccessChain %9[%4, %77] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
                  %79 = spv.Load "StorageBuffer" %78 : f32
                  %80 = spv.AccessChain %8[%4, %77] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
                  spv.Store "StorageBuffer" %80, %79 : f32
                  %81 = spv.IAdd %67, %2 : i32
                  spv.Branch ^bb1(%81 : i32)
                ^bb3:  // pred: ^bb1
                  spv._merge
                }
                %66 = spv.IAdd %64, %3 : i32
                spv.Branch ^bb1(%66 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %63 = spv.IAdd %61, %7 : i32
              spv.Branch ^bb1(%63 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %60 = spv.IAdd %58, %7 : i32
            spv.Branch ^bb1(%60 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %57 = spv.IAdd %36, %29 : i32
          spv.Branch ^bb1(%57 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %35 = spv.IAdd %33, %27 : i32
        spv.Branch ^bb1(%35 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %32 = spv.IAdd %30, %25 : i32
      spv.Branch ^bb1(%32 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // Entry point declaration with its interface variables, and a fixed local
  // workgroup size of 32 x 2 x 2.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_1, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_1 "LocalSize", 32, 2, 2
}

// *** IR Dump After SPIRVUpdateVCE ***
// SPIR-V module holding one GLCompute entry point, @main_ex_dispatch_1.
// The module header carries a `requires` clause listing version v1.0, the
// Shader capability and the SPV_KHR_storage_buffer_storage_class extension.
// Each innermost iteration loads one f32 from the buffer at binding (0, 0) and
// stores it to the buffer at binding (0, 1) at the same linear index.
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
  // Built-in inputs read by the kernel.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Two storage buffers, each a struct wrapping an array of 20 f32 (stride 4).
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_1() "None" {
    // All integer constants used by the function are defined once here.
    %0 = spv.constant 5 : i32
    %1 = spv.constant -1 : i32
    %2 = spv.constant 1 : i32
    %3 = spv.constant 32 : i32
    %4 = spv.constant 0 : i32
    %5 = spv.constant 20 : i32
    %6 = spv.constant 4 : i32
    %7 = spv.constant 2 : i32
    // %8 is the store destination (binding 1); %9 is the load source (binding 0).
    %8 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    %9 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    // A single address-of per builtin (%10, %13) feeds every load below; the
    // loads themselves are issued once per extracted component.
    %10 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %11 = spv.Load "Input" %10 : vector<3xi32>
    %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
    %13 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %14 = spv.Load "Input" %13 : vector<3xi32>
    %15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
    %16 = spv.Load "Input" %10 : vector<3xi32>
    %17 = spv.CompositeExtract %16[1 : i32] : vector<3xi32>
    %18 = spv.Load "Input" %13 : vector<3xi32>
    %19 = spv.CompositeExtract %18[1 : i32] : vector<3xi32>
    %20 = spv.Load "Input" %10 : vector<3xi32>
    %21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
    %22 = spv.Load "Input" %13 : vector<3xi32>
    %23 = spv.CompositeExtract %22[2 : i32] : vector<3xi32>
    // Start offsets (WorkgroupId * k) and steps (NumWorkgroups * k) for the three
    // outer loops, with k = 2 for components 2 and 1 and k = 32 for component 0.
    %24 = spv.IMul %21, %7 : i32
    %25 = spv.IMul %23, %7 : i32
    %26 = spv.IMul %17, %7 : i32
    %27 = spv.IMul %19, %7 : i32
    %28 = spv.IMul %12, %3 : i32
    %29 = spv.IMul %15, %3 : i32
    // Outer loop 1: %30 runs from %24 while %30 < 1, stepping by %25.
    spv.loop {
      spv.Branch ^bb1(%24 : i32)
    ^bb1(%30: i32):  // 2 preds: ^bb0, ^bb2
      %31 = spv.SLessThan %30, %2 : i32
      spv.BranchConditional %31, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Outer loop 2: %33 runs from %26 while %33 < 5, stepping by %27.
      spv.loop {
        spv.Branch ^bb1(%26 : i32)
      ^bb1(%33: i32):  // 2 preds: ^bb0, ^bb2
        %34 = spv.SLessThan %33, %0 : i32
        spv.BranchConditional %34, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Outer loop 3: %36 runs from %28 while %36 < 2, stepping by %29.
        spv.loop {
          spv.Branch ^bb1(%28 : i32)
        ^bb1(%36: i32):  // 2 preds: ^bb0, ^bb2
          %37 = spv.SLessThan %36, %7 : i32
          spv.BranchConditional %37, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Inner-loop upper bounds, each a signed min built from SLessThan + Select:
          //   %41 = min(2, 1 - %30), %45 = min(2, 5 - %33), %49 = min(32, 2 - %36).
          %38 = spv.IMul %30, %1 : i32
          %39 = spv.IAdd %38, %2 : i32
          %40 = spv.SLessThan %7, %39 : i32
          %41 = spv.Select %40, %7, %39 : i1, i32
          %42 = spv.IMul %33, %1 : i32
          %43 = spv.IAdd %42, %0 : i32
          %44 = spv.SLessThan %7, %43 : i32
          %45 = spv.Select %44, %7, %43 : i1, i32
          %46 = spv.IMul %36, %1 : i32
          %47 = spv.IAdd %46, %7 : i32
          %48 = spv.SLessThan %3, %47 : i32
          %49 = spv.Select %48, %3, %47 : i1, i32
          // LocalInvocationId components give the inner-loop start values.
          %50 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %51 = spv.Load "Input" %50 : vector<3xi32>
          %52 = spv.CompositeExtract %51[0 : i32] : vector<3xi32>
          %53 = spv.Load "Input" %50 : vector<3xi32>
          %54 = spv.CompositeExtract %53[1 : i32] : vector<3xi32>
          %55 = spv.Load "Input" %50 : vector<3xi32>
          %56 = spv.CompositeExtract %55[2 : i32] : vector<3xi32>
          // Inner loop 1: %58 runs from LocalInvocationId[2] while %58 < %41, step 2.
          spv.loop {
            spv.Branch ^bb1(%56 : i32)
          ^bb1(%58: i32):  // 2 preds: ^bb0, ^bb2
            %59 = spv.SLessThan %58, %41 : i32
            spv.BranchConditional %59, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner loop 2: %61 runs from LocalInvocationId[1] while %61 < %45, step 2.
            spv.loop {
              spv.Branch ^bb1(%54 : i32)
            ^bb1(%61: i32):  // 2 preds: ^bb0, ^bb2
              %62 = spv.SLessThan %61, %45 : i32
              spv.BranchConditional %62, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop 3: %64 runs from LocalInvocationId[0] while %64 < %49, step 32.
              spv.loop {
                spv.Branch ^bb1(%52 : i32)
              ^bb1(%64: i32):  // 2 preds: ^bb0, ^bb2
                %65 = spv.SLessThan %64, %49 : i32
                spv.BranchConditional %65, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Innermost loop: %67 runs over 0 and 1 (while %67 < 2, step 1).
                spv.loop {
                  spv.Branch ^bb1(%4 : i32)
                ^bb1(%67: i32):  // 2 preds: ^bb0, ^bb2
                  %68 = spv.SLessThan %67, %7 : i32
                  spv.BranchConditional %68, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  // Combine outer and inner induction variables per dimension.
                  %69 = spv.IAdd %30, %58 : i32
                  %70 = spv.IAdd %33, %61 : i32
                  %71 = spv.IAdd %36, %64 : i32
                  // Linear index %77 = 20*%69 + 4*%70 + 2*%71 + %67, shared by the
                  // load and the store below.
                  %72 = spv.IMul %69, %5 : i32
                  %73 = spv.IMul %70, %6 : i32
                  %74 = spv.IAdd %72, %73 : i32
                  %75 = spv.IMul %71, %7 : i32
                  %76 = spv.IAdd %74, %75 : i32
                  %77 = spv.IAdd %76, %67 : i32
                  %78 = spv.AccessChain %9[%4, %77] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
                  %79 = spv.Load "StorageBuffer" %78 : f32
                  %80 = spv.AccessChain %8[%4, %77] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
                  spv.Store "StorageBuffer" %80, %79 : f32
                  %81 = spv.IAdd %67, %2 : i32
                  spv.Branch ^bb1(%81 : i32)
                ^bb3:  // pred: ^bb1
                  spv._merge
                }
                %66 = spv.IAdd %64, %3 : i32
                spv.Branch ^bb1(%66 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %63 = spv.IAdd %61, %7 : i32
              spv.Branch ^bb1(%63 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %60 = spv.IAdd %58, %7 : i32
            spv.Branch ^bb1(%60 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %57 = spv.IAdd %36, %29 : i32
          spv.Branch ^bb1(%57 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %35 = spv.IAdd %33, %27 : i32
        spv.Branch ^bb1(%35 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %32 = spv.IAdd %30, %25 : i32
      spv.Branch ^bb1(%32 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // Entry point declaration with its interface variables, and a fixed local
  // workgroup size of 32 x 2 x 2.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_1, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_1 "LocalSize", 32, 2, 2
}

// *** IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass ***
// Dispatch 1 after translation for the "vulkan*" target. Per the entry point
// signature it maps tensor<1x5x2x2xf32> to tensor<1x5x4xf32>; the SPIR-V function
// below loads one f32 from binding 0 and stores it to binding 1 at the same
// linear index on every innermost iteration.
hal.executable @main_ex_dispatch_1 attributes {sym_visibility = "private"} {
  hal.interface @legacy_io {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
  hal.executable.entry_point @main_ex_dispatch_1 attributes {interface = @legacy_io, ordinal = 0 : i32, signature = (tensor<1x5x2x2xf32>) -> tensor<1x5x4xf32>}
  hal.executable.target "vulkan*" {
    module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
      spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
        spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
        spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
        spv.func @main_ex_dispatch_1() "None" {
          %0 = spv.constant 5 : i32
          %1 = spv.constant -1 : i32
          %2 = spv.constant 1 : i32
          %3 = spv.constant 32 : i32
          %4 = spv.constant 0 : i32
          %5 = spv.constant 20 : i32
          %6 = spv.constant 4 : i32
          %7 = spv.constant 2 : i32
          // %8 is the binding-1 (store target) buffer, %9 the binding-0 (load source) buffer.
          %8 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
          %9 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
          %10 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
          %11 = spv.Load "Input" %10 : vector<3xi32>
          %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
          %13 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
          %14 = spv.Load "Input" %13 : vector<3xi32>
          %15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
          %16 = spv.Load "Input" %10 : vector<3xi32>
          %17 = spv.CompositeExtract %16[1 : i32] : vector<3xi32>
          %18 = spv.Load "Input" %13 : vector<3xi32>
          %19 = spv.CompositeExtract %18[1 : i32] : vector<3xi32>
          %20 = spv.Load "Input" %10 : vector<3xi32>
          %21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
          %22 = spv.Load "Input" %13 : vector<3xi32>
          %23 = spv.CompositeExtract %22[2 : i32] : vector<3xi32>
          // Outer-loop start offsets (%24, %26, %28) and strides (%25, %27, %29):
          // WorkgroupId / NumWorkgroups scaled by 2 (z), 2 (y) and 32 (x).
          %24 = spv.IMul %21, %7 : i32
          %25 = spv.IMul %23, %7 : i32
          %26 = spv.IMul %17, %7 : i32
          %27 = spv.IMul %19, %7 : i32
          %28 = spv.IMul %12, %3 : i32
          %29 = spv.IMul %15, %3 : i32
          // Three outer loops: %30 runs while < 1, %33 while < 5, %36 while < 2.
          spv.loop {
            spv.Branch ^bb1(%24 : i32)
          ^bb1(%30: i32):  // 2 preds: ^bb0, ^bb2
            %31 = spv.SLessThan %30, %2 : i32
            spv.BranchConditional %31, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            spv.loop {
              spv.Branch ^bb1(%26 : i32)
            ^bb1(%33: i32):  // 2 preds: ^bb0, ^bb2
              %34 = spv.SLessThan %33, %0 : i32
              spv.BranchConditional %34, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              spv.loop {
                spv.Branch ^bb1(%28 : i32)
              ^bb1(%36: i32):  // 2 preds: ^bb0, ^bb2
                %37 = spv.SLessThan %36, %7 : i32
                spv.BranchConditional %37, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Clamped inner extents: %41 = min(2, 1 - %30), %45 = min(2, 5 - %33),
                // %49 = min(32, 2 - %36).
                %38 = spv.IMul %30, %1 : i32
                %39 = spv.IAdd %38, %2 : i32
                %40 = spv.SLessThan %7, %39 : i32
                %41 = spv.Select %40, %7, %39 : i1, i32
                %42 = spv.IMul %33, %1 : i32
                %43 = spv.IAdd %42, %0 : i32
                %44 = spv.SLessThan %7, %43 : i32
                %45 = spv.Select %44, %7, %43 : i1, i32
                %46 = spv.IMul %36, %1 : i32
                %47 = spv.IAdd %46, %7 : i32
                %48 = spv.SLessThan %3, %47 : i32
                %49 = spv.Select %48, %3, %47 : i1, i32
                %50 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
                %51 = spv.Load "Input" %50 : vector<3xi32>
                %52 = spv.CompositeExtract %51[0 : i32] : vector<3xi32>
                %53 = spv.Load "Input" %50 : vector<3xi32>
                %54 = spv.CompositeExtract %53[1 : i32] : vector<3xi32>
                %55 = spv.Load "Input" %50 : vector<3xi32>
                %56 = spv.CompositeExtract %55[2 : i32] : vector<3xi32>
                // Three inner loops start at LocalInvocationId z, y, x and advance by
                // 2, 2 and 32 respectively, bounded by the clamped extents above.
                spv.loop {
                  spv.Branch ^bb1(%56 : i32)
                ^bb1(%58: i32):  // 2 preds: ^bb0, ^bb2
                  %59 = spv.SLessThan %58, %41 : i32
                  spv.BranchConditional %59, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  spv.loop {
                    spv.Branch ^bb1(%54 : i32)
                  ^bb1(%61: i32):  // 2 preds: ^bb0, ^bb2
                    %62 = spv.SLessThan %61, %45 : i32
                    spv.BranchConditional %62, ^bb2, ^bb3
                  ^bb2:  // pred: ^bb1
                    spv.loop {
                      spv.Branch ^bb1(%52 : i32)
                    ^bb1(%64: i32):  // 2 preds: ^bb0, ^bb2
                      %65 = spv.SLessThan %64, %49 : i32
                      spv.BranchConditional %65, ^bb2, ^bb3
                    ^bb2:  // pred: ^bb1
                      // Innermost loop: %67 counts from 0 while < 2 in steps of 1.
                      spv.loop {
                        spv.Branch ^bb1(%4 : i32)
                      ^bb1(%67: i32):  // 2 preds: ^bb0, ^bb2
                        %68 = spv.SLessThan %67, %7 : i32
                        spv.BranchConditional %68, ^bb2, ^bb3
                      ^bb2:  // pred: ^bb1
                        // Linear index %77 = (%30 + %58) * 20 + (%33 + %61) * 4
                        //                    + (%36 + %64) * 2 + %67, used for both
                        // the load from %9 and the store to %8.
                        %69 = spv.IAdd %30, %58 : i32
                        %70 = spv.IAdd %33, %61 : i32
                        %71 = spv.IAdd %36, %64 : i32
                        %72 = spv.IMul %69, %5 : i32
                        %73 = spv.IMul %70, %6 : i32
                        %74 = spv.IAdd %72, %73 : i32
                        %75 = spv.IMul %71, %7 : i32
                        %76 = spv.IAdd %74, %75 : i32
                        %77 = spv.IAdd %76, %67 : i32
                        %78 = spv.AccessChain %9[%4, %77] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
                        %79 = spv.Load "StorageBuffer" %78 : f32
                        %80 = spv.AccessChain %8[%4, %77] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
                        spv.Store "StorageBuffer" %80, %79 : f32
                        %81 = spv.IAdd %67, %2 : i32
                        spv.Branch ^bb1(%81 : i32)
                      ^bb3:  // pred: ^bb1
                        spv._merge
                      }
                      %66 = spv.IAdd %64, %3 : i32
                      spv.Branch ^bb1(%66 : i32)
                    ^bb3:  // pred: ^bb1
                      spv._merge
                    }
                    %63 = spv.IAdd %61, %7 : i32
                    spv.Branch ^bb1(%63 : i32)
                  ^bb3:  // pred: ^bb1
                    spv._merge
                  }
                  %60 = spv.IAdd %58, %7 : i32
                  spv.Branch ^bb1(%60 : i32)
                ^bb3:  // pred: ^bb1
                  spv._merge
                }
                %57 = spv.IAdd %36, %29 : i32
                spv.Branch ^bb1(%57 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %35 = spv.IAdd %33, %27 : i32
              spv.Branch ^bb1(%35 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %32 = spv.IAdd %30, %25 : i32
            spv.Branch ^bb1(%32 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          spv.Return
        }
        spv.EntryPoint "GLCompute" @main_ex_dispatch_1, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
        spv.ExecutionMode @main_ex_dispatch_1 "LocalSize", 32, 2, 2
      }
      hal.interface @legacy_io attributes {sym_visibility = "private"} {
        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
        hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
      }
    }
  }
}

// *** IR Dump After Inliner ***
// Dispatch 2 as a standalone module: loads the 1x5x4 input from @arg0, pads the
// last dimension with 60 trailing elements of %cst (0.0) to get 1x5x64, and stores
// the result to @ret0.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_2() {
    %c0 = constant 0 : index
    %cst = constant dense<0.000000e+00> : tensor<f32>
    %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x4xf32>
    // edge_padding_high = [0, 0, 60]: only the innermost dimension grows (4 -> 64).
    %1 = "xla_hlo.pad"(%0, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
    hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x64xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::TieDynamicShapesPass ***
// Pads the 1x5x4 input with 60 trailing zeros along the last dimension and stores
// the 1x5x64 result; the function contains no shape-related ops at this point.
func @main_ex_dispatch_2() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x4xf32>
  %1 = "xla_hlo.pad"(%0, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x64xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::MaterializeShapeCalculationsPass ***
// Load -> xla_hlo.pad (4 -> 64 on the last dimension, pad value %cst = 0.0) -> store.
// All types are static; no shape calculation ops appear in this dump.
func @main_ex_dispatch_2() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x4xf32>
  %1 = "xla_hlo.pad"(%0, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x64xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::HoistShapeCalculations ***
// Same three-op body as before hoisting: load the 1x5x4 tensor, pad it to 1x5x64
// with %cst (0.0) on the high side of the last dimension, store to @ret0.
func @main_ex_dispatch_2() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x4xf32>
  %1 = "xla_hlo.pad"(%0, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x64xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::DecomposeHLOClampPass ***
// The function holds no clamp op, only the pad: 1x5x4 -> 1x5x64 with 60 trailing
// elements of %cst (0.0) per row.
func @main_ex_dispatch_2() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x4xf32>
  %1 = "xla_hlo.pad"(%0, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x64xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnTensorsPass ***
// The xla_hlo.pad op is still present after the HLO-to-Linalg-on-tensors pass,
// i.e. it was not rewritten into a linalg op at the tensor level in this dump.
func @main_ex_dispatch_2() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x4xf32>
  %1 = "xla_hlo.pad"(%0, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x64xf32>
  return
}

// *** IR Dump After LinalgFusionOfTensorOps ***
// Module view of dispatch 2: the body is still the single xla_hlo.pad between a
// tensor load from @arg0 and a tensor store to @ret0; there are no linalg ops
// in the function.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_2() {
    %c0 = constant 0 : index
    %cst = constant dense<0.000000e+00> : tensor<f32>
    %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x4xf32>
    %1 = "xla_hlo.pad"(%0, %cst) {edge_padding_high = dense<[0, 0, 60]> : tensor<3xi64>, edge_padding_low = dense<0> : tensor<3xi64>, interior_padding = dense<0> : tensor<3xi64>} : (tensor<1x5x4xf32>, tensor<f32>) -> tensor<1x5x64xf32>
    hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<1x5x64xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnBuffersPass ***
// Buffer-level form of the pad: %0 is the @ret0 buffer (1x5x64) and %1 the @arg0
// buffer (1x5x4). The pad has become a linalg.indexed_generic tagged "no-tile"
// that uses identity indexing maps for both operands and whose region simply
// yields the input element %arg3.
// NOTE(review): neither %cst nor %cst_0 (the 0.0 pad value) is used by any op in
// this function, so nothing visible here writes the pad value into the 60
// trailing columns of %0 - confirm whether a fill is expected to come from
// somewhere else.
func @main_ex_dispatch_2() {
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
  %cst_0 = constant 0.000000e+00 : f32
  linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "no-tile"} %1, %0 {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: f32, %arg4: f32):  // no predecessors
    // The three index constants below are defined but never used in the region.
    %c0_1 = constant 0 : index
    %c0_2 = constant 0 : index
    %c0_3 = constant 0 : index
    linalg.yield %arg3 : f32
  }: memref<1x5x4xf32>, memref<1x5x64xf32>
  return
}

// *** IR Dump After Canonicalizer ***
// Canonicalized form: the function now holds only the two interface-buffer
// placeholders and the indexed_generic, whose region yields the input element
// %arg3. No constant carrying the 0.0 pad value remains in the function.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_2() {
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
    linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "no-tile"} %1, %0 {
    ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: f32, %arg4: f32):  // no predecessors
      linalg.yield %arg3 : f32
    }: memref<1x5x4xf32>, memref<1x5x64xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Dispatch 2 module after CSE: an indexed_generic with identity maps reads %1
// (@arg0, 1x5x4) and writes %0 (@ret0, 1x5x64), yielding the input element as is.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_2() {
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
    linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "no-tile"} %1, %0 {
    ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: f32, %arg4: f32):  // no predecessors
      linalg.yield %arg3 : f32
    }: memref<1x5x4xf32>, memref<1x5x64xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::LinalgTileAndFusePass ***
// The function now carries spv.entry_point_abi with local_size = [32, 2, 2]. The
// indexed_generic keeps its "no-tile" marker and appears here as a single
// un-tiled op.
func @main_ex_dispatch_2() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
  linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "no-tile"} %1, %0 {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: f32, %arg4: f32):  // no predecessors
    linalg.yield %arg3 : f32
  }: memref<1x5x4xf32>, memref<1x5x64xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::SplitDispatchFunctionPass ***
// The module still contains a single function, @main_ex_dispatch_2, with
// local_size = [32, 2, 2] and one indexed_generic copying from %1 into %0.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_2() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
    linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "no-tile"} %1, %0 {
    ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: f32, %arg4: f32):  // no predecessors
      linalg.yield %arg3 : f32
    }: memref<1x5x4xf32>, memref<1x5x64xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::LinalgTileAndFusePass ***
// Second LinalgTileAndFusePass dump for this function: local_size stays
// [32, 2, 2] and the "no-tile" indexed_generic is still a single op.
func @main_ex_dispatch_2() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
  linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "no-tile"} %1, %0 {
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: f32, %arg4: f32):  // no predecessors
    linalg.yield %arg3 : f32
  }: memref<1x5x4xf32>, memref<1x5x64xf32>
  return
}

// *** IR Dump After Canonicalizer ***
// Input to the GPU conversion: one "no-tile" indexed_generic with identity maps
// over %1 (1x5x4, read) and %0 (1x5x64, written), yielding the input element.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_2() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
    linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "no-tile"} %1, %0 {
    ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: f32, %arg4: f32):  // no predecessors
      linalg.yield %arg3 : f32
    }: memref<1x5x4xf32>, memref<1x5x64xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertToGPUPass ***
// The indexed_generic has been expanded into three nested scf.for loops. Each
// induction variable starts at block_id * block_dim + thread_id and advances by
// block_dim * grid_dim, with z driving dim 0, y dim 1 and x dim 2.
// NOTE(review): the loop upper bounds %8/%9/%10 come from the dims of %1 (the
// 1x5x4 input), so the only store in the body targets indices of %0 inside that
// 1x5x4 range - confirm what initializes the remaining columns of the 1x5x64
// output.
func @main_ex_dispatch_2() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
  %2 = dim %1, 0 : memref<1x5x4xf32>
  %3 = dim %1, 1 : memref<1x5x4xf32>
  %4 = dim %1, 2 : memref<1x5x4xf32>
  // The dims of the output buffer (%5, %6, %7) are computed but not used below.
  %5 = dim %0, 0 : memref<1x5x64xf32>
  %6 = dim %0, 1 : memref<1x5x64xf32>
  %7 = dim %0, 2 : memref<1x5x64xf32>
  %8 = affine.apply affine_map<()[s0] -> (s0)>()[%2]
  %9 = affine.apply affine_map<()[s0] -> (s0)>()[%3]
  %10 = affine.apply affine_map<()[s0] -> (s0)>()[%4]
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %c0_0 = constant 0 : index
  %c1_1 = constant 1 : index
  %c0_2 = constant 0 : index
  %c1_3 = constant 1 : index
  // x: %16 = global thread index, %17 = total thread count along x.
  %11 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %12 = "gpu.block_id"() {dimension = "x"} : () -> index
  %13 = "gpu.block_dim"() {dimension = "x"} : () -> index
  %14 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %15 = muli %12, %13 : index
  %16 = addi %15, %14 : index
  %17 = muli %13, %11 : index
  // y: %23 = global thread index, %24 = total thread count along y.
  %18 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %19 = "gpu.block_id"() {dimension = "y"} : () -> index
  %20 = "gpu.block_dim"() {dimension = "y"} : () -> index
  %21 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %22 = muli %19, %20 : index
  %23 = addi %22, %21 : index
  %24 = muli %20, %18 : index
  // z: %30 = global thread index, %31 = total thread count along z.
  %25 = "gpu.grid_dim"() {dimension = "z"} : () -> index
  %26 = "gpu.block_id"() {dimension = "z"} : () -> index
  %27 = "gpu.block_dim"() {dimension = "z"} : () -> index
  %28 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %29 = muli %26, %27 : index
  %30 = addi %29, %28 : index
  %31 = muli %27, %25 : index
  // Loop lower bounds (0 + 1 * index) and steps (1 * count) for z, y and x.
  %32 = muli %c1, %30 : index
  %33 = addi %c0, %32 : index
  %34 = muli %c1, %31 : index
  %35 = muli %c1_1, %23 : index
  %36 = addi %c0_0, %35 : index
  %37 = muli %c1_1, %24 : index
  %38 = muli %c1_3, %16 : index
  %39 = addi %c0_2, %38 : index
  %40 = muli %c1_3, %17 : index
  scf.for %arg0 = %33 to %8 step %34 {
    scf.for %arg1 = %36 to %9 step %37 {
      scf.for %arg2 = %39 to %10 step %40 {
        %41 = affine.apply affine_map<(d0) -> (d0)>(%arg0)
        %42 = affine.apply affine_map<(d0) -> (d0)>(%arg1)
        %43 = affine.apply affine_map<(d0) -> (d0)>(%arg2)
        %44 = load %1[%41, %42, %43] : memref<1x5x4xf32>
        %45 = affine.apply affine_map<(d0) -> (d0)>(%arg0)
        %46 = affine.apply affine_map<(d0) -> (d0)>(%arg1)
        %47 = affine.apply affine_map<(d0) -> (d0)>(%arg2)
        // %48 (the current value in the output buffer) is loaded but never used.
        %48 = load %0[%45, %46, %47] : memref<1x5x64xf32>
        %49 = affine.apply affine_map<(d0) -> (d0)>(%arg0)
        %50 = affine.apply affine_map<(d0) -> (d0)>(%arg1)
        %51 = affine.apply affine_map<(d0) -> (d0)>(%arg2)
        store %44, %0[%49, %50, %51] : memref<1x5x64xf32>
      }
    }
  }
  return
}

// *** IR Dump After ConvertAffineToStandard ***
// No affine.apply ops remain: the loop upper bounds are the dims of %1 (%2, %3, %4)
// directly, and the loads/stores index with the induction variables themselves.
// Each loop starts at block_id * block_dim + thread_id (times 1, plus 0) and
// steps by block_dim * grid_dim for z, y and x.
func @main_ex_dispatch_2() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
  %2 = dim %1, 0 : memref<1x5x4xf32>
  %3 = dim %1, 1 : memref<1x5x4xf32>
  %4 = dim %1, 2 : memref<1x5x4xf32>
  // %5, %6 and %7 (dims of the output buffer) have no users in this function.
  %5 = dim %0, 0 : memref<1x5x64xf32>
  %6 = dim %0, 1 : memref<1x5x64xf32>
  %7 = dim %0, 2 : memref<1x5x64xf32>
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %c0_0 = constant 0 : index
  %c1_1 = constant 1 : index
  %c0_2 = constant 0 : index
  %c1_3 = constant 1 : index
  %8 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %9 = "gpu.block_id"() {dimension = "x"} : () -> index
  %10 = "gpu.block_dim"() {dimension = "x"} : () -> index
  %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %12 = muli %9, %10 : index
  %13 = addi %12, %11 : index
  %14 = muli %10, %8 : index
  %15 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %16 = "gpu.block_id"() {dimension = "y"} : () -> index
  %17 = "gpu.block_dim"() {dimension = "y"} : () -> index
  %18 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %19 = muli %16, %17 : index
  %20 = addi %19, %18 : index
  %21 = muli %17, %15 : index
  %22 = "gpu.grid_dim"() {dimension = "z"} : () -> index
  %23 = "gpu.block_id"() {dimension = "z"} : () -> index
  %24 = "gpu.block_dim"() {dimension = "z"} : () -> index
  %25 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %26 = muli %23, %24 : index
  %27 = addi %26, %25 : index
  %28 = muli %24, %22 : index
  %29 = muli %c1, %27 : index
  %30 = addi %c0, %29 : index
  %31 = muli %c1, %28 : index
  %32 = muli %c1_1, %20 : index
  %33 = addi %c0_0, %32 : index
  %34 = muli %c1_1, %21 : index
  %35 = muli %c1_3, %13 : index
  %36 = addi %c0_2, %35 : index
  %37 = muli %c1_3, %14 : index
  scf.for %arg0 = %30 to %2 step %31 {
    scf.for %arg1 = %33 to %3 step %34 {
      scf.for %arg2 = %36 to %4 step %37 {
        %38 = load %1[%arg0, %arg1, %arg2] : memref<1x5x4xf32>
        // %39 is loaded from the output buffer but never used.
        %39 = load %0[%arg0, %arg1, %arg2] : memref<1x5x64xf32>
        store %38, %0[%arg0, %arg1, %arg2] : memref<1x5x64xf32>
      }
    }
  }
  return
}

// *** IR Dump After Canonicalizer ***
// Canonicalized loop nest: the upper bounds are now the constants 1, 5 and 4 and
// the body is a single load from %1 followed by a store to %0 at the same indices.
// NOTE(review): with bounds 1x5x4, only that sub-range of the 1x5x64 buffer %0 is
// stored to in this function - confirm where the padded columns get their value.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_2() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c1 = constant 1 : index
    %c5 = constant 5 : index
    %c4 = constant 4 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
    %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %3 = "gpu.block_id"() {dimension = "x"} : () -> index
    %4 = "gpu.block_dim"() {dimension = "x"} : () -> index
    %5 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %6 = muli %3, %4 : index
    %7 = addi %6, %5 : index
    %8 = muli %4, %2 : index
    %9 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %10 = "gpu.block_id"() {dimension = "y"} : () -> index
    %11 = "gpu.block_dim"() {dimension = "y"} : () -> index
    %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %13 = muli %10, %11 : index
    %14 = addi %13, %12 : index
    %15 = muli %11, %9 : index
    %16 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    %17 = "gpu.block_id"() {dimension = "z"} : () -> index
    %18 = "gpu.block_dim"() {dimension = "z"} : () -> index
    %19 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %20 = muli %17, %18 : index
    %21 = addi %20, %19 : index
    %22 = muli %18, %16 : index
    // z -> dim 0 (bound 1), y -> dim 1 (bound 5), x -> dim 2 (bound 4).
    scf.for %arg0 = %21 to %c1 step %22 {
      scf.for %arg1 = %14 to %c5 step %15 {
        scf.for %arg2 = %7 to %c4 step %8 {
          %23 = load %1[%arg0, %arg1, %arg2] : memref<1x5x4xf32>
          store %23, %0[%arg0, %arg1, %arg2] : memref<1x5x64xf32>
        }
      }
    }
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Dispatch 2 after CSE: a three-level scf.for nest bounded by the constants 1, 5
// and 4 that copies %1[i, j, k] into %0[i, j, k]. Each loop starts at
// block_id * block_dim + thread_id and steps by block_dim * grid_dim.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_2() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c1 = constant 1 : index
    %c5 = constant 5 : index
    %c4 = constant 4 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
    %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %3 = "gpu.block_id"() {dimension = "x"} : () -> index
    %4 = "gpu.block_dim"() {dimension = "x"} : () -> index
    %5 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %6 = muli %3, %4 : index
    %7 = addi %6, %5 : index
    %8 = muli %4, %2 : index
    %9 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %10 = "gpu.block_id"() {dimension = "y"} : () -> index
    %11 = "gpu.block_dim"() {dimension = "y"} : () -> index
    %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %13 = muli %10, %11 : index
    %14 = addi %13, %12 : index
    %15 = muli %11, %9 : index
    %16 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    %17 = "gpu.block_id"() {dimension = "z"} : () -> index
    %18 = "gpu.block_dim"() {dimension = "z"} : () -> index
    %19 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %20 = muli %17, %18 : index
    %21 = addi %20, %19 : index
    %22 = muli %18, %16 : index
    scf.for %arg0 = %21 to %c1 step %22 {
      scf.for %arg1 = %14 to %c5 step %15 {
        scf.for %arg2 = %7 to %c4 step %8 {
          %23 = load %1[%arg0, %arg1, %arg2] : memref<1x5x4xf32>
          store %23, %0[%arg0, %arg1, %arg2] : memref<1x5x64xf32>
        }
      }
    }
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ResolveShapeOpsPass ***
// Dispatch 2 entry point as printed at function scope. It copies the 1x5x4
// input buffer (@legacy_io::@arg0) element by element into the 1x5x64 output
// buffer (@legacy_io::@ret0) using identical indices on both sides, so only
// last-dimension positions 0..3 of the output are stored to.
func @main_ex_dispatch_2() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  // Upper bounds of the three loops (extents of the input memref).
  %c1 = constant 1 : index
  %c5 = constant 5 : index
  %c4 = constant 4 : index
  // Output (%0) and input (%1) buffers.
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
  // Per-dimension loop start and step, built from the GPU id/size ops:
  //   start = block_id * block_dim + thread_id
  //   step  = block_dim * grid_dim
  // x: start %7, step %8.
  %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %3 = "gpu.block_id"() {dimension = "x"} : () -> index
  %4 = "gpu.block_dim"() {dimension = "x"} : () -> index
  %5 = "gpu.thread_id"() {dimension = "x"} : () -> index
  %6 = muli %3, %4 : index
  %7 = addi %6, %5 : index
  %8 = muli %4, %2 : index
  // y: start %14, step %15.
  %9 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %10 = "gpu.block_id"() {dimension = "y"} : () -> index
  %11 = "gpu.block_dim"() {dimension = "y"} : () -> index
  %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
  %13 = muli %10, %11 : index
  %14 = addi %13, %12 : index
  %15 = muli %11, %9 : index
  // z: start %21, step %22.
  %16 = "gpu.grid_dim"() {dimension = "z"} : () -> index
  %17 = "gpu.block_id"() {dimension = "z"} : () -> index
  %18 = "gpu.block_dim"() {dimension = "z"} : () -> index
  %19 = "gpu.thread_id"() {dimension = "z"} : () -> index
  %20 = muli %17, %18 : index
  %21 = addi %20, %19 : index
  %22 = muli %18, %16 : index
  // z indexes dimension 0, y indexes dimension 1, x indexes dimension 2.
  scf.for %arg0 = %21 to %c1 step %22 {
    scf.for %arg1 = %14 to %c5 step %15 {
      scf.for %arg2 = %7 to %c4 step %8 {
        // Copy one f32 from input to output at the same indices.
        %23 = load %1[%arg0, %arg1, %arg2] : memref<1x5x4xf32>
        store %23, %0[%arg0, %arg1, %arg2] : memref<1x5x64xf32>
      }
    }
  }
  return
}

// *** IR Dump After LegalizeStandardForSPIRV ***
// Module for dispatch 2. The target environment attribute requests SPIR-V
// v1.3 with the Shader capability and the
// SPV_KHR_storage_buffer_storage_class extension, and declares limits of 128
// workgroup invocations and a maximum workgroup size of [128, 128, 64].
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // Entry point with local_size [32, 2, 2]; 32 * 2 * 2 = 128, which equals
  // the declared max_compute_workgroup_invocations. The body copies the
  // 1x5x4 input into the 1x5x64 output at identical indices.
  func @main_ex_dispatch_2() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    // Loop upper bounds (extents of the input memref).
    %c1 = constant 1 : index
    %c5 = constant 5 : index
    %c4 = constant 4 : index
    // Output (%0) and input (%1) buffers.
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
    // x: start %7 = block_id * block_dim + thread_id, step %8 = block_dim * grid_dim.
    %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %3 = "gpu.block_id"() {dimension = "x"} : () -> index
    %4 = "gpu.block_dim"() {dimension = "x"} : () -> index
    %5 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %6 = muli %3, %4 : index
    %7 = addi %6, %5 : index
    %8 = muli %4, %2 : index
    // y: start %14, step %15.
    %9 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %10 = "gpu.block_id"() {dimension = "y"} : () -> index
    %11 = "gpu.block_dim"() {dimension = "y"} : () -> index
    %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %13 = muli %10, %11 : index
    %14 = addi %13, %12 : index
    %15 = muli %11, %9 : index
    // z: start %21, step %22.
    %16 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    %17 = "gpu.block_id"() {dimension = "z"} : () -> index
    %18 = "gpu.block_dim"() {dimension = "z"} : () -> index
    %19 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %20 = muli %17, %18 : index
    %21 = addi %20, %19 : index
    %22 = muli %18, %16 : index
    // z -> dimension 0, y -> dimension 1, x -> dimension 2.
    scf.for %arg0 = %21 to %c1 step %22 {
      scf.for %arg1 = %14 to %c5 step %15 {
        scf.for %arg2 = %7 to %c4 step %8 {
          // Copy one element; only output columns 0..3 are reached.
          %23 = load %1[%arg0, %arg1, %arg2] : memref<1x5x4xf32>
          store %23, %0[%arg0, %arg1, %arg2] : memref<1x5x64xf32>
        }
      }
    }
    return
  }
  // Binding table: @arg0 is read at (set 0, binding 0); @ret0 is written
  // with the Discard flag at (set 0, binding 1).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After Canonicalizer ***
// Dispatch 2 module: one entry-point function plus the interface that names
// its two storage buffers. Target environment: SPIR-V v1.3, Shader
// capability, SPV_KHR_storage_buffer_storage_class extension.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // Copies memref<1x5x4xf32> (@arg0) into memref<1x5x64xf32> (@ret0) at the
  // same indices, with the iteration space distributed over the x/y/z ids.
  func @main_ex_dispatch_2() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    // Bounds for the outer (1), middle (5) and inner (4) loops.
    %c1 = constant 1 : index
    %c5 = constant 5 : index
    %c4 = constant 4 : index
    // Destination buffer, then source buffer.
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
    // x ids: %7 is the first inner-loop index, %8 the inner-loop step.
    %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %3 = "gpu.block_id"() {dimension = "x"} : () -> index
    %4 = "gpu.block_dim"() {dimension = "x"} : () -> index
    %5 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %6 = muli %3, %4 : index
    %7 = addi %6, %5 : index
    %8 = muli %4, %2 : index
    // y ids: %14 is the first middle-loop index, %15 the middle-loop step.
    %9 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %10 = "gpu.block_id"() {dimension = "y"} : () -> index
    %11 = "gpu.block_dim"() {dimension = "y"} : () -> index
    %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %13 = muli %10, %11 : index
    %14 = addi %13, %12 : index
    %15 = muli %11, %9 : index
    // z ids: %21 is the first outer-loop index, %22 the outer-loop step.
    %16 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    %17 = "gpu.block_id"() {dimension = "z"} : () -> index
    %18 = "gpu.block_dim"() {dimension = "z"} : () -> index
    %19 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %20 = muli %17, %18 : index
    %21 = addi %20, %19 : index
    %22 = muli %18, %16 : index
    // An invocation whose start index is already >= the bound executes no
    // iterations of that loop.
    scf.for %arg0 = %21 to %c1 step %22 {
      scf.for %arg1 = %14 to %c5 step %15 {
        scf.for %arg2 = %7 to %c4 step %8 {
          // Same [%arg0, %arg1, %arg2] on the load and on the store.
          %23 = load %1[%arg0, %arg1, %arg2] : memref<1x5x4xf32>
          store %23, %0[%arg0, %arg1, %arg2] : memref<1x5x64xf32>
        }
      }
    }
    return
  }
  // Storage-buffer bindings for the function's input (@arg0, read) and
  // output (@ret0, write/discard), both in set 0.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Dispatch 2 module still expressed with std/scf/gpu ops. It holds the
// entry-point function and the @legacy_io binding table, and carries the
// SPIR-V target environment (v1.3, Shader,
// SPV_KHR_storage_buffer_storage_class) as a module attribute.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // Element-wise copy from the 1x5x4 input to the 1x5x64 output; indices
  // are not remapped, so output positions 4..63 of the last dimension are
  // never stored to by this function.
  func @main_ex_dispatch_2() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    // Loop bounds.
    %c1 = constant 1 : index
    %c5 = constant 5 : index
    %c4 = constant 4 : index
    // %0: output (@ret0); %1: input (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<1x5x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x4xf32>
    // x: %7 = %3 * %4 + %5 (start), %8 = %4 * %2 (step).
    %2 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %3 = "gpu.block_id"() {dimension = "x"} : () -> index
    %4 = "gpu.block_dim"() {dimension = "x"} : () -> index
    %5 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %6 = muli %3, %4 : index
    %7 = addi %6, %5 : index
    %8 = muli %4, %2 : index
    // y: %14 = %10 * %11 + %12 (start), %15 = %11 * %9 (step).
    %9 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %10 = "gpu.block_id"() {dimension = "y"} : () -> index
    %11 = "gpu.block_dim"() {dimension = "y"} : () -> index
    %12 = "gpu.thread_id"() {dimension = "y"} : () -> index
    %13 = muli %10, %11 : index
    %14 = addi %13, %12 : index
    %15 = muli %11, %9 : index
    // z: %21 = %17 * %18 + %19 (start), %22 = %18 * %16 (step).
    %16 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    %17 = "gpu.block_id"() {dimension = "z"} : () -> index
    %18 = "gpu.block_dim"() {dimension = "z"} : () -> index
    %19 = "gpu.thread_id"() {dimension = "z"} : () -> index
    %20 = muli %17, %18 : index
    %21 = addi %20, %19 : index
    %22 = muli %18, %16 : index
    // Outer loop over dimension 0 (z ids), middle over dimension 1 (y ids),
    // inner over dimension 2 (x ids).
    scf.for %arg0 = %21 to %c1 step %22 {
      scf.for %arg1 = %14 to %c5 step %15 {
        scf.for %arg2 = %7 to %c4 step %8 {
          // Load one f32 from the input and store it to the output.
          %23 = load %1[%arg0, %arg1, %arg2] : memref<1x5x4xf32>
          store %23, %0[%arg0, %arg1, %arg2] : memref<1x5x64xf32>
        }
      }
    }
    return
  }
  // @arg0: set 0 / binding 0, read. @ret0: set 0 / binding 1, write with
  // the Discard flag.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertToSPIRVPass ***
// Dispatch 2 after conversion to the spv dialect: the outer module now wraps
// a spv.module (holding the function and its global variables) next to the
// unchanged @legacy_io binding table.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  spv.module Logical GLSL450 {
    // Built-in inputs read by the function: invocation id within the
    // workgroup, workgroup id, and number of workgroups (each a 3 x i32 vector).
    spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
    spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
    spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
    // Storage buffers as flat f32 arrays wrapped in a struct:
    // bind(0, 0) has 20 elements (1 * 5 * 4), bind(0, 1) has 320 (1 * 5 * 64).
    spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    // Copies elements from the 20-element buffer to the 320-element buffer
    // using three nested spv.loop ops with explicitly linearized indices.
    spv.func @main_ex_dispatch_2() "None" attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
      // Loop upper bounds: %0 (outer), %1 (middle), %2 (inner).
      %0 = spv.constant 1 : i32
      %1 = spv.constant 5 : i32
      %2 = spv.constant 4 : i32
      // %3 points at the 320-element buffer (store target), %4 at the
      // 20-element buffer (load source).
      %3 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
      %4 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
      // Component 0 (x): %16 = WorkgroupId.x * 32 + LocalInvocationId.x is the
      // inner-loop start, %17 = 32 * NumWorkgroups.x the inner-loop step. The
      // constant 32 equals local_size[0].
      %5 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %6 = spv.Load "Input" %5 : vector<3xi32>
      %7 = spv.CompositeExtract %6[0 : i32] : vector<3xi32>
      %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %9 = spv.Load "Input" %8 : vector<3xi32>
      %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
      %11 = spv.constant 32 : i32
      %12 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %13 = spv.Load "Input" %12 : vector<3xi32>
      %14 = spv.CompositeExtract %13[0 : i32] : vector<3xi32>
      %15 = spv.IMul %10, %11 : i32
      %16 = spv.IAdd %15, %14 : i32
      %17 = spv.IMul %11, %7 : i32
      // Component 1 (y): start %29, step %30; the constant 2 equals local_size[1].
      %18 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %19 = spv.Load "Input" %18 : vector<3xi32>
      %20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
      %21 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %22 = spv.Load "Input" %21 : vector<3xi32>
      %23 = spv.CompositeExtract %22[1 : i32] : vector<3xi32>
      %24 = spv.constant 2 : i32
      %25 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %26 = spv.Load "Input" %25 : vector<3xi32>
      %27 = spv.CompositeExtract %26[1 : i32] : vector<3xi32>
      %28 = spv.IMul %23, %24 : i32
      %29 = spv.IAdd %28, %27 : i32
      %30 = spv.IMul %24, %20 : i32
      // Component 2 (z): start %42, step %43; the constant 2 equals local_size[2].
      %31 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %32 = spv.Load "Input" %31 : vector<3xi32>
      %33 = spv.CompositeExtract %32[2 : i32] : vector<3xi32>
      %34 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %35 = spv.Load "Input" %34 : vector<3xi32>
      %36 = spv.CompositeExtract %35[2 : i32] : vector<3xi32>
      %37 = spv.constant 2 : i32
      %38 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %39 = spv.Load "Input" %38 : vector<3xi32>
      %40 = spv.CompositeExtract %39[2 : i32] : vector<3xi32>
      %41 = spv.IMul %36, %37 : i32
      %42 = spv.IAdd %41, %40 : i32
      %43 = spv.IMul %37, %33 : i32
      // Outer loop: induction variable %44 starts at %42, runs while
      // %44 < 1 (signed compare) and advances by %43.
      spv.loop {
        spv.Branch ^bb1(%42 : i32)
      ^bb1(%44: i32):  // 2 preds: ^bb0, ^bb2
        %45 = spv.SLessThan %44, %0 : i32
        spv.BranchConditional %45, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Middle loop: %47 starts at %29, runs while %47 < 5, advances by %30.
        spv.loop {
          spv.Branch ^bb1(%29 : i32)
        ^bb1(%47: i32):  // 2 preds: ^bb0, ^bb2
          %48 = spv.SLessThan %47, %1 : i32
          spv.BranchConditional %48, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Inner loop: %50 starts at %16, runs while %50 < 4, advances by %17.
          spv.loop {
            spv.Branch ^bb1(%16 : i32)
          ^bb1(%50: i32):  // 2 preds: ^bb0, ^bb2
            %51 = spv.SLessThan %50, %2 : i32
            spv.BranchConditional %51, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Source element index: %62 = 0 + 20 * %44 + 4 * %47 + 1 * %50.
            // %52 (0) selects struct member 0 in the access chain.
            %52 = spv.constant 0 : i32
            %53 = spv.constant 0 : i32
            %54 = spv.constant 20 : i32
            %55 = spv.IMul %54, %44 : i32
            %56 = spv.IAdd %53, %55 : i32
            %57 = spv.constant 4 : i32
            %58 = spv.IMul %57, %47 : i32
            %59 = spv.IAdd %56, %58 : i32
            %60 = spv.constant 1 : i32
            %61 = spv.IMul %60, %50 : i32
            %62 = spv.IAdd %59, %61 : i32
            %63 = spv.AccessChain %4[%52, %62] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
            %64 = spv.Load "StorageBuffer" %63 : f32
            // Destination element index: %75 = 0 + 320 * %44 + 64 * %47 + 1 * %50.
            %65 = spv.constant 0 : i32
            %66 = spv.constant 0 : i32
            %67 = spv.constant 320 : i32
            %68 = spv.IMul %67, %44 : i32
            %69 = spv.IAdd %66, %68 : i32
            %70 = spv.constant 64 : i32
            %71 = spv.IMul %70, %47 : i32
            %72 = spv.IAdd %69, %71 : i32
            %73 = spv.constant 1 : i32
            %74 = spv.IMul %73, %50 : i32
            %75 = spv.IAdd %72, %74 : i32
            %76 = spv.AccessChain %3[%65, %75] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
            spv.Store "StorageBuffer" %76, %64 : f32
            // Advance the inner induction variable and branch back to the header.
            %77 = spv.IAdd %50, %17 : i32
            spv.Branch ^bb1(%77 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          // Advance the middle induction variable.
          %49 = spv.IAdd %47, %30 : i32
          spv.Branch ^bb1(%49 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        // Advance the outer induction variable.
        %46 = spv.IAdd %44, %43 : i32
        spv.Branch ^bb1(%46 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      spv.Return
    }
  }
  // Binding table: @arg0 (read) at set 0 / binding 0 and @ret0
  // (write/discard) at set 0 / binding 1, the same (set, binding) pairs as
  // the two bind(...) global variables above.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After SPIRVLowerABIAttributes ***
// SPIR-V module for dispatch 2. The function carries no entry-point
// attribute here; the entry point and workgroup size are declared by the
// spv.EntryPoint / spv.ExecutionMode ops at the end of the module.
spv.module Logical GLSL450 {
  // Built-in input vectors used to derive each invocation's loop start/step.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  // Flat f32 storage buffers: 20 elements at bind(0, 0), 320 at bind(0, 1).
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
  // Loads from the 20-element buffer and stores into the 320-element buffer
  // inside a three-deep loop nest.
  spv.func @main_ex_dispatch_2() "None" {
    // Loop bounds for the outer, middle and inner loops.
    %0 = spv.constant 1 : i32
    %1 = spv.constant 5 : i32
    %2 = spv.constant 4 : i32
    // %3: store target (320 elements). %4: load source (20 elements).
    %3 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    %4 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    // x component: start %16 = WorkgroupId.x * 32 + LocalInvocationId.x,
    // step %17 = 32 * NumWorkgroups.x.
    %5 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %6 = spv.Load "Input" %5 : vector<3xi32>
    %7 = spv.CompositeExtract %6[0 : i32] : vector<3xi32>
    %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %9 = spv.Load "Input" %8 : vector<3xi32>
    %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
    %11 = spv.constant 32 : i32
    %12 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %13 = spv.Load "Input" %12 : vector<3xi32>
    %14 = spv.CompositeExtract %13[0 : i32] : vector<3xi32>
    %15 = spv.IMul %10, %11 : i32
    %16 = spv.IAdd %15, %14 : i32
    %17 = spv.IMul %11, %7 : i32
    // y component: start %29 = WorkgroupId.y * 2 + LocalInvocationId.y,
    // step %30 = 2 * NumWorkgroups.y.
    %18 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %19 = spv.Load "Input" %18 : vector<3xi32>
    %20 = spv.CompositeExtract %19[1 : i32] : vector<3xi32>
    %21 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %22 = spv.Load "Input" %21 : vector<3xi32>
    %23 = spv.CompositeExtract %22[1 : i32] : vector<3xi32>
    %24 = spv.constant 2 : i32
    %25 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %26 = spv.Load "Input" %25 : vector<3xi32>
    %27 = spv.CompositeExtract %26[1 : i32] : vector<3xi32>
    %28 = spv.IMul %23, %24 : i32
    %29 = spv.IAdd %28, %27 : i32
    %30 = spv.IMul %24, %20 : i32
    // z component: start %42 = WorkgroupId.z * 2 + LocalInvocationId.z,
    // step %43 = 2 * NumWorkgroups.z.
    %31 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %32 = spv.Load "Input" %31 : vector<3xi32>
    %33 = spv.CompositeExtract %32[2 : i32] : vector<3xi32>
    %34 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %35 = spv.Load "Input" %34 : vector<3xi32>
    %36 = spv.CompositeExtract %35[2 : i32] : vector<3xi32>
    %37 = spv.constant 2 : i32
    %38 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %39 = spv.Load "Input" %38 : vector<3xi32>
    %40 = spv.CompositeExtract %39[2 : i32] : vector<3xi32>
    %41 = spv.IMul %36, %37 : i32
    %42 = spv.IAdd %41, %40 : i32
    %43 = spv.IMul %37, %33 : i32
    // Outer loop over %44: from %42 while %44 < 1, step %43.
    spv.loop {
      spv.Branch ^bb1(%42 : i32)
    ^bb1(%44: i32):  // 2 preds: ^bb0, ^bb2
      %45 = spv.SLessThan %44, %0 : i32
      spv.BranchConditional %45, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Middle loop over %47: from %29 while %47 < 5, step %30.
      spv.loop {
        spv.Branch ^bb1(%29 : i32)
      ^bb1(%47: i32):  // 2 preds: ^bb0, ^bb2
        %48 = spv.SLessThan %47, %1 : i32
        spv.BranchConditional %48, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Inner loop over %50: from %16 while %50 < 4, step %17.
        spv.loop {
          spv.Branch ^bb1(%16 : i32)
        ^bb1(%50: i32):  // 2 preds: ^bb0, ^bb2
          %51 = spv.SLessThan %50, %2 : i32
          spv.BranchConditional %51, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Load address: element 0 + 20 * %44 + 4 * %47 + 1 * %50 of struct
          // member 0 in the 20-element buffer.
          %52 = spv.constant 0 : i32
          %53 = spv.constant 0 : i32
          %54 = spv.constant 20 : i32
          %55 = spv.IMul %54, %44 : i32
          %56 = spv.IAdd %53, %55 : i32
          %57 = spv.constant 4 : i32
          %58 = spv.IMul %57, %47 : i32
          %59 = spv.IAdd %56, %58 : i32
          %60 = spv.constant 1 : i32
          %61 = spv.IMul %60, %50 : i32
          %62 = spv.IAdd %59, %61 : i32
          %63 = spv.AccessChain %4[%52, %62] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
          %64 = spv.Load "StorageBuffer" %63 : f32
          // Store address: element 0 + 320 * %44 + 64 * %47 + 1 * %50 of
          // struct member 0 in the 320-element buffer.
          %65 = spv.constant 0 : i32
          %66 = spv.constant 0 : i32
          %67 = spv.constant 320 : i32
          %68 = spv.IMul %67, %44 : i32
          %69 = spv.IAdd %66, %68 : i32
          %70 = spv.constant 64 : i32
          %71 = spv.IMul %70, %47 : i32
          %72 = spv.IAdd %69, %71 : i32
          %73 = spv.constant 1 : i32
          %74 = spv.IMul %73, %50 : i32
          %75 = spv.IAdd %72, %74 : i32
          %76 = spv.AccessChain %3[%65, %75] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
          spv.Store "StorageBuffer" %76, %64 : f32
          // Next inner iteration.
          %77 = spv.IAdd %50, %17 : i32
          spv.Branch ^bb1(%77 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        // Next middle iteration.
        %49 = spv.IAdd %47, %30 : i32
        spv.Branch ^bb1(%49 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      // Next outer iteration.
      %46 = spv.IAdd %44, %43 : i32
      spv.Branch ^bb1(%46 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // GLCompute entry point; its interface lists the three built-in variables
  // the function reads. LocalSize 32 x 2 x 2 matches the constants 32, 2, 2
  // used in the start/step arithmetic above.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_2, @__builtin_var_NumWorkgroups__, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_2 "LocalSize", 32, 2, 2
}

// *** IR Dump After Canonicalizer ***
// SPIR-V module for dispatch 2 in canonicalized form: all scalar constants
// are defined once at the top of the function and the index arithmetic in
// the innermost loop contains no multiply-by-1 or add-0 operations.
spv.module Logical GLSL450 {
  // Built-in input vectors.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  // Flat f32 storage buffers: 20 elements at bind(0, 0), 320 at bind(0, 1).
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_2() "None" {
    // Constant roles, as established by their uses below:
    //   %0 = 1, %1 = 5  outer / middle loop bounds
    //   %2 = 32         multiplier for the x start and step
    //   %3 = 2          multiplier for both the y and z start and step
    //   %4 = 20, %5 = 4 source strides (%5 is also the inner loop bound)
    //   %6 = 0          struct member index in both access chains
    //   %7 = 320, %8 = 64  destination strides
    %0 = spv.constant 1 : i32
    %1 = spv.constant 5 : i32
    %2 = spv.constant 32 : i32
    %3 = spv.constant 2 : i32
    %4 = spv.constant 20 : i32
    %5 = spv.constant 4 : i32
    %6 = spv.constant 0 : i32
    %7 = spv.constant 320 : i32
    %8 = spv.constant 64 : i32
    // %9: store target (320 elements). %10: load source (20 elements).
    %9 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    %10 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    // x component: start %21 = WorkgroupId.x * 32 + LocalInvocationId.x,
    // step %22 = NumWorkgroups.x * 32.
    %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %12 = spv.Load "Input" %11 : vector<3xi32>
    %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
    %14 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %15 = spv.Load "Input" %14 : vector<3xi32>
    %16 = spv.CompositeExtract %15[0 : i32] : vector<3xi32>
    %17 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %18 = spv.Load "Input" %17 : vector<3xi32>
    %19 = spv.CompositeExtract %18[0 : i32] : vector<3xi32>
    %20 = spv.IMul %16, %2 : i32
    %21 = spv.IAdd %20, %19 : i32
    %22 = spv.IMul %13, %2 : i32
    // y component: start %33, step %34.
    %23 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %24 = spv.Load "Input" %23 : vector<3xi32>
    %25 = spv.CompositeExtract %24[1 : i32] : vector<3xi32>
    %26 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %27 = spv.Load "Input" %26 : vector<3xi32>
    %28 = spv.CompositeExtract %27[1 : i32] : vector<3xi32>
    %29 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %30 = spv.Load "Input" %29 : vector<3xi32>
    %31 = spv.CompositeExtract %30[1 : i32] : vector<3xi32>
    %32 = spv.IMul %28, %3 : i32
    %33 = spv.IAdd %32, %31 : i32
    %34 = spv.IMul %25, %3 : i32
    // z component: start %45, step %46.
    %35 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %36 = spv.Load "Input" %35 : vector<3xi32>
    %37 = spv.CompositeExtract %36[2 : i32] : vector<3xi32>
    %38 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %39 = spv.Load "Input" %38 : vector<3xi32>
    %40 = spv.CompositeExtract %39[2 : i32] : vector<3xi32>
    %41 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %42 = spv.Load "Input" %41 : vector<3xi32>
    %43 = spv.CompositeExtract %42[2 : i32] : vector<3xi32>
    %44 = spv.IMul %40, %3 : i32
    %45 = spv.IAdd %44, %43 : i32
    %46 = spv.IMul %37, %3 : i32
    // Outer loop over %47: from %45 while %47 < 1, step %46.
    spv.loop {
      spv.Branch ^bb1(%45 : i32)
    ^bb1(%47: i32):  // 2 preds: ^bb0, ^bb2
      %48 = spv.SLessThan %47, %0 : i32
      spv.BranchConditional %48, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Middle loop over %50: from %33 while %50 < 5, step %34.
      spv.loop {
        spv.Branch ^bb1(%33 : i32)
      ^bb1(%50: i32):  // 2 preds: ^bb0, ^bb2
        %51 = spv.SLessThan %50, %1 : i32
        spv.BranchConditional %51, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Inner loop over %53: from %21 while %53 < 4, step %22.
        spv.loop {
          spv.Branch ^bb1(%21 : i32)
        ^bb1(%53: i32):  // 2 preds: ^bb0, ^bb2
          %54 = spv.SLessThan %53, %5 : i32
          spv.BranchConditional %54, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Source index %58 = %47 * 20 + %50 * 4 + %53.
          %55 = spv.IMul %47, %4 : i32
          %56 = spv.IMul %50, %5 : i32
          %57 = spv.IAdd %55, %56 : i32
          %58 = spv.IAdd %57, %53 : i32
          %59 = spv.AccessChain %10[%6, %58] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
          %60 = spv.Load "StorageBuffer" %59 : f32
          // Destination index %64 = %47 * 320 + %50 * 64 + %53.
          %61 = spv.IMul %47, %7 : i32
          %62 = spv.IMul %50, %8 : i32
          %63 = spv.IAdd %61, %62 : i32
          %64 = spv.IAdd %63, %53 : i32
          %65 = spv.AccessChain %9[%6, %64] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
          spv.Store "StorageBuffer" %65, %60 : f32
          // Next inner iteration.
          %66 = spv.IAdd %53, %22 : i32
          spv.Branch ^bb1(%66 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        // Next middle iteration.
        %52 = spv.IAdd %50, %34 : i32
        spv.Branch ^bb1(%52 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      // Next outer iteration.
      %49 = spv.IAdd %47, %46 : i32
      spv.Branch ^bb1(%49 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // GLCompute entry point with its built-in interface variables, and the
  // 32 x 2 x 2 workgroup size.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_2, @__builtin_var_NumWorkgroups__, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_2 "LocalSize", 32, 2, 2
}

// *** IR Dump After CSE ***
// SPIR-V module for dispatch 2 with common subexpressions merged: each
// built-in variable's address is taken once (%11, %14, %17) and reused by
// the per-component loads, while the spv.Load ops themselves remain one per
// component.
spv.module Logical GLSL450 {
  // Built-in input vectors.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  // Flat f32 storage buffers: 20 elements at bind(0, 0), 320 at bind(0, 1).
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_2() "None" {
    // Constants: loop bounds (%0 = 1, %1 = 5, %5 = 4), start/step multipliers
    // (%2 = 32 for x, %3 = 2 for y and z), source strides (%4 = 20, %5 = 4),
    // destination strides (%7 = 320, %8 = 64) and member index %6 = 0.
    %0 = spv.constant 1 : i32
    %1 = spv.constant 5 : i32
    %2 = spv.constant 32 : i32
    %3 = spv.constant 2 : i32
    %4 = spv.constant 20 : i32
    %5 = spv.constant 4 : i32
    %6 = spv.constant 0 : i32
    %7 = spv.constant 320 : i32
    %8 = spv.constant 64 : i32
    // %9: store target (320 elements). %10: load source (20 elements).
    %9 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    %10 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    // x component: start %21 = WorkgroupId.x * 32 + LocalInvocationId.x,
    // step %22 = NumWorkgroups.x * 32.
    %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %12 = spv.Load "Input" %11 : vector<3xi32>
    %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
    %14 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %15 = spv.Load "Input" %14 : vector<3xi32>
    %16 = spv.CompositeExtract %15[0 : i32] : vector<3xi32>
    %17 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %18 = spv.Load "Input" %17 : vector<3xi32>
    %19 = spv.CompositeExtract %18[0 : i32] : vector<3xi32>
    %20 = spv.IMul %16, %2 : i32
    %21 = spv.IAdd %20, %19 : i32
    %22 = spv.IMul %13, %2 : i32
    // y component: start %30, step %31 (reloads through %11, %14, %17).
    %23 = spv.Load "Input" %11 : vector<3xi32>
    %24 = spv.CompositeExtract %23[1 : i32] : vector<3xi32>
    %25 = spv.Load "Input" %14 : vector<3xi32>
    %26 = spv.CompositeExtract %25[1 : i32] : vector<3xi32>
    %27 = spv.Load "Input" %17 : vector<3xi32>
    %28 = spv.CompositeExtract %27[1 : i32] : vector<3xi32>
    %29 = spv.IMul %26, %3 : i32
    %30 = spv.IAdd %29, %28 : i32
    %31 = spv.IMul %24, %3 : i32
    // z component: start %39, step %40.
    %32 = spv.Load "Input" %11 : vector<3xi32>
    %33 = spv.CompositeExtract %32[2 : i32] : vector<3xi32>
    %34 = spv.Load "Input" %14 : vector<3xi32>
    %35 = spv.CompositeExtract %34[2 : i32] : vector<3xi32>
    %36 = spv.Load "Input" %17 : vector<3xi32>
    %37 = spv.CompositeExtract %36[2 : i32] : vector<3xi32>
    %38 = spv.IMul %35, %3 : i32
    %39 = spv.IAdd %38, %37 : i32
    %40 = spv.IMul %33, %3 : i32
    // Outer loop over %41: from %39 while %41 < 1, step %40.
    spv.loop {
      spv.Branch ^bb1(%39 : i32)
    ^bb1(%41: i32):  // 2 preds: ^bb0, ^bb2
      %42 = spv.SLessThan %41, %0 : i32
      spv.BranchConditional %42, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Middle loop over %44: from %30 while %44 < 5, step %31.
      spv.loop {
        spv.Branch ^bb1(%30 : i32)
      ^bb1(%44: i32):  // 2 preds: ^bb0, ^bb2
        %45 = spv.SLessThan %44, %1 : i32
        spv.BranchConditional %45, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Inner loop over %47: from %21 while %47 < 4, step %22.
        spv.loop {
          spv.Branch ^bb1(%21 : i32)
        ^bb1(%47: i32):  // 2 preds: ^bb0, ^bb2
          %48 = spv.SLessThan %47, %5 : i32
          spv.BranchConditional %48, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Source index %52 = %41 * 20 + %44 * 4 + %47.
          %49 = spv.IMul %41, %4 : i32
          %50 = spv.IMul %44, %5 : i32
          %51 = spv.IAdd %49, %50 : i32
          %52 = spv.IAdd %51, %47 : i32
          %53 = spv.AccessChain %10[%6, %52] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
          %54 = spv.Load "StorageBuffer" %53 : f32
          // Destination index %58 = %41 * 320 + %44 * 64 + %47.
          %55 = spv.IMul %41, %7 : i32
          %56 = spv.IMul %44, %8 : i32
          %57 = spv.IAdd %55, %56 : i32
          %58 = spv.IAdd %57, %47 : i32
          %59 = spv.AccessChain %9[%6, %58] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
          spv.Store "StorageBuffer" %59, %54 : f32
          // Next inner iteration.
          %60 = spv.IAdd %47, %22 : i32
          spv.Branch ^bb1(%60 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        // Next middle iteration.
        %46 = spv.IAdd %44, %31 : i32
        spv.Branch ^bb1(%46 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      // Next outer iteration.
      %43 = spv.IAdd %41, %40 : i32
      spv.Branch ^bb1(%43 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // GLCompute entry point with its built-in interface variables, and the
  // 32 x 2 x 2 workgroup size.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_2, @__builtin_var_NumWorkgroups__, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_2 "LocalSize", 32, 2, 2
}

// *** IR Dump After SPIRVUpdateVCE ***
// SPIR-V module for @main_ex_dispatch_2. It declares three built-in input
// variables and two storage-buffer resources: a 20 x f32 array at bind(0, 0)
// and a 320 x f32 array at bind(0, 1).
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
  // Copies f32 elements from the 20-element buffer into the 320-element buffer
  // with three nested loops; each loop's start and step are derived from the
  // WorkgroupId / LocalInvocationId / NumWorkgroups built-ins.
  spv.func @main_ex_dispatch_2() "None" {
    // %0, %1, %5: loop upper bounds (1, 5, 4). %5 is also the multiplier of %44
    // in the source index. %2, %3: multipliers applied to the built-in IDs.
    // %4, %7, %8: index multipliers. %6: member index used by the AccessChains.
    %0 = spv.constant 1 : i32
    %1 = spv.constant 5 : i32
    %2 = spv.constant 32 : i32
    %3 = spv.constant 2 : i32
    %4 = spv.constant 20 : i32
    %5 = spv.constant 4 : i32
    %6 = spv.constant 0 : i32
    %7 = spv.constant 320 : i32
    %8 = spv.constant 64 : i32
    // %9 points at the 320-element (store) buffer, %10 at the 20-element (load) buffer.
    %9 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    %10 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
    // Component 0: %21 = WorkgroupId[0] * 32 + LocalInvocationId[0] (inner loop start),
    //              %22 = NumWorkgroups[0] * 32 (inner loop step).
    %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %12 = spv.Load "Input" %11 : vector<3xi32>
    %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
    %14 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %15 = spv.Load "Input" %14 : vector<3xi32>
    %16 = spv.CompositeExtract %15[0 : i32] : vector<3xi32>
    %17 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
    %18 = spv.Load "Input" %17 : vector<3xi32>
    %19 = spv.CompositeExtract %18[0 : i32] : vector<3xi32>
    %20 = spv.IMul %16, %2 : i32
    %21 = spv.IAdd %20, %19 : i32
    %22 = spv.IMul %13, %2 : i32
    // Component 1: %30 = WorkgroupId[1] * 2 + LocalInvocationId[1] (middle loop start),
    //              %31 = NumWorkgroups[1] * 2 (middle loop step).
    %23 = spv.Load "Input" %11 : vector<3xi32>
    %24 = spv.CompositeExtract %23[1 : i32] : vector<3xi32>
    %25 = spv.Load "Input" %14 : vector<3xi32>
    %26 = spv.CompositeExtract %25[1 : i32] : vector<3xi32>
    %27 = spv.Load "Input" %17 : vector<3xi32>
    %28 = spv.CompositeExtract %27[1 : i32] : vector<3xi32>
    %29 = spv.IMul %26, %3 : i32
    %30 = spv.IAdd %29, %28 : i32
    %31 = spv.IMul %24, %3 : i32
    // Component 2: %39 = WorkgroupId[2] * 2 + LocalInvocationId[2] (outer loop start),
    //              %40 = NumWorkgroups[2] * 2 (outer loop step).
    %32 = spv.Load "Input" %11 : vector<3xi32>
    %33 = spv.CompositeExtract %32[2 : i32] : vector<3xi32>
    %34 = spv.Load "Input" %14 : vector<3xi32>
    %35 = spv.CompositeExtract %34[2 : i32] : vector<3xi32>
    %36 = spv.Load "Input" %17 : vector<3xi32>
    %37 = spv.CompositeExtract %36[2 : i32] : vector<3xi32>
    %38 = spv.IMul %35, %3 : i32
    %39 = spv.IAdd %38, %37 : i32
    %40 = spv.IMul %33, %3 : i32
    // Outer loop: %41 runs from %39 while %41 < 1, advancing by %40.
    spv.loop {
      spv.Branch ^bb1(%39 : i32)
    ^bb1(%41: i32):  // 2 preds: ^bb0, ^bb2
      %42 = spv.SLessThan %41, %0 : i32
      spv.BranchConditional %42, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Middle loop: %44 runs from %30 while %44 < 5, advancing by %31.
      spv.loop {
        spv.Branch ^bb1(%30 : i32)
      ^bb1(%44: i32):  // 2 preds: ^bb0, ^bb2
        %45 = spv.SLessThan %44, %1 : i32
        spv.BranchConditional %45, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Inner loop: %47 runs from %21 while %47 < 4, advancing by %22.
        spv.loop {
          spv.Branch ^bb1(%21 : i32)
        ^bb1(%47: i32):  // 2 preds: ^bb0, ^bb2
          %48 = spv.SLessThan %47, %5 : i32
          spv.BranchConditional %48, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Source index: %41 * 20 + %44 * 4 + %47 into the 20-element buffer.
          %49 = spv.IMul %41, %4 : i32
          %50 = spv.IMul %44, %5 : i32
          %51 = spv.IAdd %49, %50 : i32
          %52 = spv.IAdd %51, %47 : i32
          %53 = spv.AccessChain %10[%6, %52] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
          %54 = spv.Load "StorageBuffer" %53 : f32
          // Destination index: %41 * 320 + %44 * 64 + %47 into the 320-element buffer.
          %55 = spv.IMul %41, %7 : i32
          %56 = spv.IMul %44, %8 : i32
          %57 = spv.IAdd %55, %56 : i32
          %58 = spv.IAdd %57, %47 : i32
          %59 = spv.AccessChain %9[%6, %58] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
          spv.Store "StorageBuffer" %59, %54 : f32
          %60 = spv.IAdd %47, %22 : i32
          spv.Branch ^bb1(%60 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %46 = spv.IAdd %44, %31 : i32
        spv.Branch ^bb1(%46 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %43 = spv.IAdd %41, %40 : i32
      spv.Branch ^bb1(%43 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // GLCompute entry point; LocalSize is 32 x 2 x 2, the same values used as
  // the ID multipliers %2 and %3 above.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_2, @__builtin_var_NumWorkgroups__, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_2 "LocalSize", 32, 2, 2
}

// *** IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass ***
// HAL executable wrapping the SPIR-V module for @main_ex_dispatch_2. The entry
// point signature is (tensor<1x5x4xf32>) -> tensor<1x5x64xf32> and the only
// target listed is "vulkan*".
hal.executable @main_ex_dispatch_2 attributes {sym_visibility = "private"} {
  // @arg0: read-only storage buffer at (set 0, binding 0);
  // @ret0: write/discard storage buffer at (set 0, binding 1).
  hal.interface @legacy_io {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
  hal.executable.entry_point @main_ex_dispatch_2 attributes {interface = @legacy_io, ordinal = 0 : i32, signature = (tensor<1x5x4xf32>) -> tensor<1x5x64xf32>}
  hal.executable.target "vulkan*" {
    // The target environment advertises SPIR-V v1.3, while the nested
    // spv.module below carries a `requires` clause of v1.0.
    module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
      spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
        spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
        spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
        // Copies f32 elements from the 20-element buffer (bind(0, 0)) into the
        // 320-element buffer (bind(0, 1)) with three nested loops.
        spv.func @main_ex_dispatch_2() "None" {
          // %0, %1, %5: loop upper bounds (1, 5, 4); %2, %3: ID multipliers;
          // %4, %5, %7, %8: index multipliers; %6: AccessChain member index.
          %0 = spv.constant 1 : i32
          %1 = spv.constant 5 : i32
          %2 = spv.constant 32 : i32
          %3 = spv.constant 2 : i32
          %4 = spv.constant 20 : i32
          %5 = spv.constant 4 : i32
          %6 = spv.constant 0 : i32
          %7 = spv.constant 320 : i32
          %8 = spv.constant 64 : i32
          %9 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
          %10 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
          // Component 0: start %21 = WorkgroupId[0] * 32 + LocalInvocationId[0],
          //              step  %22 = NumWorkgroups[0] * 32.
          %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
          %12 = spv.Load "Input" %11 : vector<3xi32>
          %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
          %14 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
          %15 = spv.Load "Input" %14 : vector<3xi32>
          %16 = spv.CompositeExtract %15[0 : i32] : vector<3xi32>
          %17 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %18 = spv.Load "Input" %17 : vector<3xi32>
          %19 = spv.CompositeExtract %18[0 : i32] : vector<3xi32>
          %20 = spv.IMul %16, %2 : i32
          %21 = spv.IAdd %20, %19 : i32
          %22 = spv.IMul %13, %2 : i32
          // Component 1: start %30 = WorkgroupId[1] * 2 + LocalInvocationId[1],
          //              step  %31 = NumWorkgroups[1] * 2.
          %23 = spv.Load "Input" %11 : vector<3xi32>
          %24 = spv.CompositeExtract %23[1 : i32] : vector<3xi32>
          %25 = spv.Load "Input" %14 : vector<3xi32>
          %26 = spv.CompositeExtract %25[1 : i32] : vector<3xi32>
          %27 = spv.Load "Input" %17 : vector<3xi32>
          %28 = spv.CompositeExtract %27[1 : i32] : vector<3xi32>
          %29 = spv.IMul %26, %3 : i32
          %30 = spv.IAdd %29, %28 : i32
          %31 = spv.IMul %24, %3 : i32
          // Component 2: start %39 = WorkgroupId[2] * 2 + LocalInvocationId[2],
          //              step  %40 = NumWorkgroups[2] * 2.
          %32 = spv.Load "Input" %11 : vector<3xi32>
          %33 = spv.CompositeExtract %32[2 : i32] : vector<3xi32>
          %34 = spv.Load "Input" %14 : vector<3xi32>
          %35 = spv.CompositeExtract %34[2 : i32] : vector<3xi32>
          %36 = spv.Load "Input" %17 : vector<3xi32>
          %37 = spv.CompositeExtract %36[2 : i32] : vector<3xi32>
          %38 = spv.IMul %35, %3 : i32
          %39 = spv.IAdd %38, %37 : i32
          %40 = spv.IMul %33, %3 : i32
          // Outer loop: %41 from %39 while %41 < 1, step %40.
          spv.loop {
            spv.Branch ^bb1(%39 : i32)
          ^bb1(%41: i32):  // 2 preds: ^bb0, ^bb2
            %42 = spv.SLessThan %41, %0 : i32
            spv.BranchConditional %42, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Middle loop: %44 from %30 while %44 < 5, step %31.
            spv.loop {
              spv.Branch ^bb1(%30 : i32)
            ^bb1(%44: i32):  // 2 preds: ^bb0, ^bb2
              %45 = spv.SLessThan %44, %1 : i32
              spv.BranchConditional %45, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop: %47 from %21 while %47 < 4, step %22.
              spv.loop {
                spv.Branch ^bb1(%21 : i32)
              ^bb1(%47: i32):  // 2 preds: ^bb0, ^bb2
                %48 = spv.SLessThan %47, %5 : i32
                spv.BranchConditional %48, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Load from index %41 * 20 + %44 * 4 + %47 of the 20-element buffer.
                %49 = spv.IMul %41, %4 : i32
                %50 = spv.IMul %44, %5 : i32
                %51 = spv.IAdd %49, %50 : i32
                %52 = spv.IAdd %51, %47 : i32
                %53 = spv.AccessChain %10[%6, %52] : !spv.ptr<!spv.struct<!spv.array<20 x f32, stride=4> [0]>, StorageBuffer>
                %54 = spv.Load "StorageBuffer" %53 : f32
                // Store to index %41 * 320 + %44 * 64 + %47 of the 320-element buffer.
                %55 = spv.IMul %41, %7 : i32
                %56 = spv.IMul %44, %8 : i32
                %57 = spv.IAdd %55, %56 : i32
                %58 = spv.IAdd %57, %47 : i32
                %59 = spv.AccessChain %9[%6, %58] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
                spv.Store "StorageBuffer" %59, %54 : f32
                %60 = spv.IAdd %47, %22 : i32
                spv.Branch ^bb1(%60 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %46 = spv.IAdd %44, %31 : i32
              spv.Branch ^bb1(%46 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %43 = spv.IAdd %41, %40 : i32
            spv.Branch ^bb1(%43 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          spv.Return
        }
        spv.EntryPoint "GLCompute" @main_ex_dispatch_2, @__builtin_var_NumWorkgroups__, @__builtin_var_WorkgroupId__, @__builtin_var_LocalInvocationId__
        spv.ExecutionMode @main_ex_dispatch_2 "LocalSize", 32, 2, 2
      }
      // A second @legacy_io interface with the same two bindings as the one
      // declared at the top of the executable.
      hal.interface @legacy_io attributes {sym_visibility = "private"} {
        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
        hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
      }
    }
  }
}

// *** IR Dump After Inliner ***
// Dispatch module for @main_ex_dispatch_3 together with its I/O interface.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // Loads a 1x5x64 tensor, swaps its first two dimensions, and stores the
  // 5x1x64 result.
  func @main_ex_dispatch_3() {
    %c0 = constant 0 : index
    // Read the input from binding @arg0 at offset 0.
    %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x64xf32>
    // permutation = [1, 0, 2]: tensor<1x5x64xf32> -> tensor<5x1x64xf32>.
    %1 = "xla_hlo.transpose"(%0) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
    // Write the result to binding @ret0 at offset 0.
    hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5x1x64xf32>
    return
  }
  // @arg0: read-only storage buffer at (set 0, binding 0);
  // @ret0: write/discard storage buffer at (set 0, binding 1).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::TieDynamicShapesPass ***
// Loads tensor<1x5x64xf32> from @arg0, transposes it with permutation
// [1, 0, 2], and stores the tensor<5x1x64xf32> result to @ret0. All shapes in
// this function are static.
func @main_ex_dispatch_3() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x64xf32>
  %1 = "xla_hlo.transpose"(%0) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5x1x64xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::MaterializeShapeCalculationsPass ***
// Same load / transpose / store sequence as before this pass: the input is
// read from @arg0, permuted with [1, 0, 2], and written to @ret0. No
// shape-calculation ops appear in the function.
func @main_ex_dispatch_3() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x64xf32>
  %1 = "xla_hlo.transpose"(%0) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5x1x64xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::HoistShapeCalculations ***
// Load from @arg0 (1x5x64), transpose with permutation [1, 0, 2], store to
// @ret0 (5x1x64). The op sequence is the same four ops as in the dump that
// precedes this one.
func @main_ex_dispatch_3() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x64xf32>
  %1 = "xla_hlo.transpose"(%0) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5x1x64xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::DecomposeHLOClampPass ***
// Load from @arg0 (1x5x64), transpose with permutation [1, 0, 2], store to
// @ret0 (5x1x64). The function contains no clamp op, and its body matches the
// dump that precedes this one.
func @main_ex_dispatch_3() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x64xf32>
  %1 = "xla_hlo.transpose"(%0) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5x1x64xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnTensorsPass ***
// The transpose is now expressed as a linalg.generic on tensors.
func @main_ex_dispatch_3() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x64xf32>
  // Three parallel iterators. The input is indexed with (d1, d0, d2) and the
  // result with (d0, d1, d2), so result[d0, d1, d2] = input[d1, d0, d2].
  %1 = linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} %0 {
  ^bb0(%arg0: f32):  // no predecessors
    // The region forwards the input element unchanged.
    linalg.yield %arg0 : f32
  }: tensor<1x5x64xf32> -> tensor<5x1x64xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5x1x64xf32>
  return
}

// *** IR Dump After LinalgFusionOfTensorOps ***
// Dispatch module for @main_ex_dispatch_3; the function holds a single
// linalg.generic between the interface load and store.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_3() {
    %c0 = constant 0 : index
    %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5x64xf32>
    // result[d0, d1, d2] = input[d1, d0, d2]; all three iterators are parallel.
    %1 = linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} %0 {
    ^bb0(%arg0: f32):  // no predecessors
      linalg.yield %arg0 : f32
    }: tensor<1x5x64xf32> -> tensor<5x1x64xf32>
    hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5x1x64xf32>
    return
  }
  // @arg0: read-only storage buffer at (set 0, binding 0);
  // @ret0: write/discard storage buffer at (set 0, binding 1).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnBuffersPass ***
// Buffer form of the dispatch: the interface load/store ops are replaced by
// iree.placeholder memrefs and the linalg.generic takes both as operands.
func @main_ex_dispatch_3() {
  // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
  // %c0 has no remaining uses in this function.
  %c0 = constant 0 : index
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
  // %1 is indexed with (d1, d0, d2) and %0 with (d0, d1, d2).
  linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} %1, %0 {
  ^bb0(%arg0: f32, %arg1: f32):  // no predecessors
    // Yields the input element; the %arg1 block argument is not used.
    linalg.yield %arg0 : f32
  }: memref<1x5x64xf32>, memref<5x1x64xf32>
  return
}

// *** IR Dump After Canonicalizer ***
// Dispatch module for @main_ex_dispatch_3 in buffer form; the function holds
// only the two placeholders and the linalg.generic (no index constant).
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_3() {
    // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
    // %1 is indexed with (d1, d0, d2) and %0 with (d0, d1, d2).
    linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} %1, %0 {
    ^bb0(%arg0: f32, %arg1: f32):  // no predecessors
      linalg.yield %arg0 : f32
    }: memref<1x5x64xf32>, memref<5x1x64xf32>
    return
  }
  // @arg0: read-only storage buffer at (set 0, binding 0);
  // @ret0: write/discard storage buffer at (set 0, binding 1).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Dispatch module for @main_ex_dispatch_3 in buffer form: two placeholders
// feeding one linalg.generic, plus the I/O interface.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_3() {
    // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
    // Copies %1[d1, d0, d2] into %0[d0, d1, d2] over three parallel iterators.
    linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} %1, %0 {
    ^bb0(%arg0: f32, %arg1: f32):  // no predecessors
      linalg.yield %arg0 : f32
    }: memref<1x5x64xf32>, memref<5x1x64xf32>
    return
  }
  // @arg0: read-only storage buffer at (set 0, binding 0);
  // @ret0: write/discard storage buffer at (set 0, binding 1).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::LinalgTileAndFusePass ***
// Tiled form of the transpose. The function now carries an entry-point ABI
// with local_size = [32, 2, 2], and the linalg.generic operates on subviews
// inside an scf.parallel.
func @main_ex_dispatch_3() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c0 = constant 0 : index
  %c5 = constant 5 : index
  %c2 = constant 2 : index
  %c32 = constant 32 : index
  %c64 = constant 64 : index
  %c1 = constant 1 : index
  // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
  // Iterates (%arg0, %arg1, %arg2) over [0, 5) x [0, 1) x [0, 64) with steps (2, 2, 32).
  scf.parallel (%arg0, %arg1, %arg2) = (%c0, %c0, %c0) to (%c5, %c1, %c64) step (%c2, %c2, %c32) {
    // Input tile sizes: min(2, 1 - %arg1), min(2, 5 - %arg0), min(32, 64 - %arg2).
    // The d0 operand of each map is not referenced by its result expressions.
    %2 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg1)
    %3 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg0)
    %4 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c64, %arg2)
    // Input tile: offsets are [%arg1, %arg0, %arg2], i.e. the first two
    // induction variables are swapped relative to the output tile below.
    %5 = subview %1[%arg1, %arg0, %arg2] [%2, %3, %4] [%c1, %c1, %c1]  : memref<1x5x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
    // Output tile sizes: min(2, 5 - %arg0), min(2, 1 - %arg1), min(32, 64 - %arg2).
    %6 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg0)
    %7 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg1)
    %8 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c64, %arg2)
    %9 = subview %0[%arg0, %arg1, %arg2] [%6, %7, %8] [%c1, %c1, %c1]  : memref<5x1x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
    // Per-tile transpose copy, tagged with the "workitem" transform marker.
    linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "workitem"} %5, %9 {
    ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
      linalg.yield %arg3 : f32
    }: memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>, memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
    scf.yield
  }
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::SplitDispatchFunctionPass ***
// Dispatch module for @main_ex_dispatch_3 holding the tiled transpose (one
// function with a single scf.parallel) and the I/O interface.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_3() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c0 = constant 0 : index
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c32 = constant 32 : index
    %c64 = constant 64 : index
    %c1 = constant 1 : index
    // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
    // Iterates (%arg0, %arg1, %arg2) over [0, 5) x [0, 1) x [0, 64) with steps (2, 2, 32).
    scf.parallel (%arg0, %arg1, %arg2) = (%c0, %c0, %c0) to (%c5, %c1, %c64) step (%c2, %c2, %c32) {
      // Input tile sizes: min(2, 1 - %arg1), min(2, 5 - %arg0), min(32, 64 - %arg2).
      %2 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg1)
      %3 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg0)
      %4 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c64, %arg2)
      // Input tile at offsets [%arg1, %arg0, %arg2].
      %5 = subview %1[%arg1, %arg0, %arg2] [%2, %3, %4] [%c1, %c1, %c1]  : memref<1x5x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
      // Output tile sizes: min(2, 5 - %arg0), min(2, 1 - %arg1), min(32, 64 - %arg2).
      %6 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg0)
      %7 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg1)
      %8 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c64, %arg2)
      // Output tile at offsets [%arg0, %arg1, %arg2].
      %9 = subview %0[%arg0, %arg1, %arg2] [%6, %7, %8] [%c1, %c1, %c1]  : memref<5x1x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
      // Per-tile transpose copy: %9[d0, d1, d2] = %5[d1, d0, d2].
      linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "workitem"} %5, %9 {
      ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
        linalg.yield %arg3 : f32
      }: memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>, memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
      scf.yield
    }
    return
  }
  // @arg0: read-only storage buffer at (set 0, binding 0);
  // @ret0: write/discard storage buffer at (set 0, binding 1).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::LinalgTileAndFusePass ***
// Tiled transpose with local_size = [32, 2, 2]: one scf.parallel whose body
// takes an input and an output subview and copies between them.
func @main_ex_dispatch_3() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c0 = constant 0 : index
  %c5 = constant 5 : index
  %c2 = constant 2 : index
  %c32 = constant 32 : index
  %c64 = constant 64 : index
  %c1 = constant 1 : index
  // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
  // Upper bounds (5, 1, 64), steps (2, 2, 32), all lower bounds 0.
  scf.parallel (%arg0, %arg1, %arg2) = (%c0, %c0, %c0) to (%c5, %c1, %c64) step (%c2, %c2, %c32) {
    // Each affine.min computes min(step, bound - iv) for one dimension.
    %2 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg1)
    %3 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg0)
    %4 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c64, %arg2)
    // Input tile at offsets [%arg1, %arg0, %arg2] with sizes [%2, %3, %4].
    %5 = subview %1[%arg1, %arg0, %arg2] [%2, %3, %4] [%c1, %c1, %c1]  : memref<1x5x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
    %6 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg0)
    %7 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg1)
    %8 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c64, %arg2)
    // Output tile at offsets [%arg0, %arg1, %arg2] with sizes [%6, %7, %8].
    %9 = subview %0[%arg0, %arg1, %arg2] [%6, %7, %8] [%c1, %c1, %c1]  : memref<5x1x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
    // %9[d0, d1, d2] = %5[d1, d0, d2], tagged with the "workitem" marker.
    linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "workitem"} %5, %9 {
    ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
      linalg.yield %arg3 : f32
    }: memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>, memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
    scf.yield
  }
  return
}

// *** IR Dump After Canonicalizer ***
// Dispatch module for @main_ex_dispatch_3 with the tiled transpose in
// simplified form: single-operand affine.min maps, literal subview strides,
// and constant stride multipliers in the subview layout maps.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_3() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c0 = constant 0 : index
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c32 = constant 32 : index
    %c64 = constant 64 : index
    %c1 = constant 1 : index
    // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
    // Upper bounds (5, 1, 64), steps (2, 2, 32), all lower bounds 0.
    scf.parallel (%arg0, %arg1, %arg2) = (%c0, %c0, %c0) to (%c5, %c1, %c64) step (%c2, %c2, %c32) {
      // Input tile sizes: min(2, 1 - %arg1), min(2, 5 - %arg0), min(32, 64 - %arg2).
      %2 = affine.min affine_map<(d0) -> (2, -d0 + 1)>(%arg1)
      %3 = affine.min affine_map<(d0) -> (2, -d0 + 5)>(%arg0)
      %4 = affine.min affine_map<(d0) -> (32, -d0 + 64)>(%arg2)
      // Input tile layout: d0 * 320 + s0 + d1 * 64 + d2.
      %5 = subview %1[%arg1, %arg0, %arg2] [%2, %3, %4] [1, 1, 1]  : memref<1x5x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
      // Output tile sizes: min(2, 5 - %arg0), min(2, 1 - %arg1), min(32, 64 - %arg2).
      %6 = affine.min affine_map<(d0) -> (2, -d0 + 5)>(%arg0)
      %7 = affine.min affine_map<(d0) -> (2, -d0 + 1)>(%arg1)
      %8 = affine.min affine_map<(d0) -> (32, -d0 + 64)>(%arg2)
      // Output tile layout: d0 * 64 + s0 + d1 * 64 + d2.
      %9 = subview %0[%arg0, %arg1, %arg2] [%6, %7, %8] [1, 1, 1]  : memref<5x1x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
      // %9[d0, d1, d2] = %5[d1, d0, d2], tagged with the "workitem" marker.
      linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "workitem"} %5, %9 {
      ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
        linalg.yield %arg3 : f32
      }: memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>, memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
      scf.yield
    }
    return
  }
  // @arg0: read-only storage buffer at (set 0, binding 0);
  // @ret0: write/discard storage buffer at (set 0, binding 1).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertToGPUPass ***
// GPU-mapped form of the tiled transpose. An outer scf.for nest is driven by
// gpu.block_id / gpu.grid_dim and an inner scf.for nest by gpu.thread_id /
// gpu.block_dim; the innermost body does the element copy with load/store.
func @main_ex_dispatch_3() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c0 = constant 0 : index
  %c5 = constant 5 : index
  %c2 = constant 2 : index
  %c32 = constant 32 : index
  %c64 = constant 64 : index
  %c1 = constant 1 : index
  // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
  %2 = "gpu.block_id"() {dimension = "x"} : () -> index
  %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %4 = "gpu.block_id"() {dimension = "y"} : () -> index
  %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %6 = "gpu.block_id"() {dimension = "z"} : () -> index
  %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
  // Outer nest bounds: start = 0 + tile_step * block_id, step = tile_step * grid_dim.
  //   %arg0: z, tile step 2  (start %9,  step %10)
  //   %arg1: y, tile step 2  (start %12, step %13)
  //   %arg2: x, tile step 32 (start %15, step %16)
  %8 = muli %c2, %6 : index
  %9 = addi %c0, %8 : index
  %10 = muli %c2, %7 : index
  %11 = muli %c2, %4 : index
  %12 = addi %c0, %11 : index
  %13 = muli %c2, %5 : index
  %14 = muli %c32, %2 : index
  %15 = addi %c0, %14 : index
  %16 = muli %c32, %3 : index
  // Upper bounds of the outer nest are 5, 1 and 64.
  scf.for %arg0 = %9 to %c5 step %10 {
    scf.for %arg1 = %12 to %c1 step %13 {
      scf.for %arg2 = %15 to %c64 step %16 {
        // Input tile sizes: min(2, 1 - %arg1), min(2, 5 - %arg0), min(32, 64 - %arg2).
        %17 = affine.min affine_map<(d0) -> (2, -d0 + 1)>(%arg1)
        %18 = affine.min affine_map<(d0) -> (2, -d0 + 5)>(%arg0)
        %19 = affine.min affine_map<(d0) -> (32, -d0 + 64)>(%arg2)
        // %20: input tile at offsets [%arg1, %arg0, %arg2].
        %20 = subview %1[%arg1, %arg0, %arg2] [%17, %18, %19] [1, 1, 1]  : memref<1x5x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
        %21 = affine.min affine_map<(d0) -> (2, -d0 + 5)>(%arg0)
        %22 = affine.min affine_map<(d0) -> (2, -d0 + 1)>(%arg1)
        %23 = affine.min affine_map<(d0) -> (32, -d0 + 64)>(%arg2)
        // %24: output tile at offsets [%arg0, %arg1, %arg2].
        %24 = subview %0[%arg0, %arg1, %arg2] [%21, %22, %23] [1, 1, 1]  : memref<5x1x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
        // Dimensions of both tiles. Only the input-tile dims (%25-%27) feed
        // the loop bounds below; %28-%30 have no uses in this function.
        %25 = dim %20, 0 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
        %26 = dim %20, 1 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
        %27 = dim %20, 2 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
        %28 = dim %24, 0 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
        %29 = dim %24, 1 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
        %30 = dim %24, 2 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
        // Inner nest upper bounds: %31 = dim 1 of %20, %32 = dim 0 of %20,
        // %33 = dim 2 of %20 (the first two input dims are swapped).
        %31 = affine.apply affine_map<(d0) -> (d0)>(%26)
        %32 = affine.apply affine_map<(d0) -> (d0)>(%25)
        %33 = affine.apply affine_map<(d0) -> (d0)>(%27)
        %c0_0 = constant 0 : index
        %c1_1 = constant 1 : index
        %c0_2 = constant 0 : index
        %c1_3 = constant 1 : index
        %c0_4 = constant 0 : index
        %c1_5 = constant 1 : index
        %34 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %35 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %36 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %37 = "gpu.block_dim"() {dimension = "y"} : () -> index
        %38 = "gpu.thread_id"() {dimension = "z"} : () -> index
        %39 = "gpu.block_dim"() {dimension = "z"} : () -> index
        // Inner nest bounds: start = 0 + 1 * thread_id, step = 1 * block_dim.
        //   %arg3: z (start %41, step %42)
        //   %arg4: y (start %44, step %45)
        //   %arg5: x (start %47, step %48)
        %40 = muli %c1_1, %38 : index
        %41 = addi %c0_0, %40 : index
        %42 = muli %c1_1, %39 : index
        %43 = muli %c1_3, %36 : index
        %44 = addi %c0_2, %43 : index
        %45 = muli %c1_3, %37 : index
        %46 = muli %c1_5, %34 : index
        %47 = addi %c0_4, %46 : index
        %48 = muli %c1_5, %35 : index
        scf.for %arg3 = %41 to %31 step %42 {
          scf.for %arg4 = %44 to %32 step %45 {
            scf.for %arg5 = %47 to %33 step %48 {
              // Read the input tile at [%arg4, %arg3, %arg5] (swapped indices).
              %49 = affine.apply affine_map<(d0) -> (d0)>(%arg4)
              %50 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
              %51 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
              %52 = load %20[%49, %50, %51] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
              // Load of the current output element; %56 has no uses in this function.
              %53 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
              %54 = affine.apply affine_map<(d0) -> (d0)>(%arg4)
              %55 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
              %56 = load %24[%53, %54, %55] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
              // Write the input element to the output tile at [%arg3, %arg4, %arg5].
              %57 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
              %58 = affine.apply affine_map<(d0) -> (d0)>(%arg4)
              %59 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
              store %52, %24[%57, %58, %59] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
            }
          }
        }
      }
    }
  }
  return
}

// *** IR Dump After ConvertAffineToStandard ***
// Dispatch function that copies the 1x5x64 input buffer (@arg0) into the
// 5x1x64 output buffer (@ret0): each element is loaded at [j, i, k] and stored
// at [i, j, k]. The outer three scf.for loops step over tiles using
// gpu.block_id / gpu.grid_dim; the inner three step over the elements of one
// tile using gpu.thread_id / gpu.block_dim.
func @main_ex_dispatch_3() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c0 = constant 0 : index
  %c5 = constant 5 : index
  %c2 = constant 2 : index
  %c32 = constant 32 : index
  %c64 = constant 64 : index
  %c1 = constant 1 : index
  // %0 is the output (@ret0), %1 is the input (@arg0).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
  %2 = "gpu.block_id"() {dimension = "x"} : () -> index
  %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %4 = "gpu.block_id"() {dimension = "y"} : () -> index
  %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %6 = "gpu.block_id"() {dimension = "z"} : () -> index
  %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
  // Per-dimension loop start (0 + tile * block_id) and step (tile * grid_dim);
  // the tile factors are 2 for z, 2 for y and 32 for x.
  %8 = muli %c2, %6 : index
  %9 = addi %c0, %8 : index
  %10 = muli %c2, %7 : index
  %11 = muli %c2, %4 : index
  %12 = addi %c0, %11 : index
  %13 = muli %c2, %5 : index
  %14 = muli %c32, %2 : index
  %15 = addi %c0, %14 : index
  %16 = muli %c32, %3 : index
  // Tile loops: %arg0 runs to 5, %arg1 to 1, %arg2 to 64.
  scf.for %arg0 = %9 to %c5 step %10 {
    scf.for %arg1 = %12 to %c1 step %13 {
      scf.for %arg2 = %15 to %c64 step %16 {
        // Sizes of the input subview: %20 = min(2, 1 - %arg1),
        // %24 = min(2, 5 - %arg0), %28 = min(32, 64 - %arg2). Each is built as
        // (-1 * iv + bound) followed by a cmpi "slt" / select pair.
        %c2_0 = constant 2 : index
        %c-1 = constant -1 : index
        %17 = muli %arg1, %c-1 : index
        %c1_1 = constant 1 : index
        %18 = addi %17, %c1_1 : index
        %19 = cmpi "slt", %c2_0, %18 : index
        %20 = select %19, %c2_0, %18 : index
        %c2_2 = constant 2 : index
        %c-1_3 = constant -1 : index
        %21 = muli %arg0, %c-1_3 : index
        %c5_4 = constant 5 : index
        %22 = addi %21, %c5_4 : index
        %23 = cmpi "slt", %c2_2, %22 : index
        %24 = select %23, %c2_2, %22 : index
        %c32_5 = constant 32 : index
        %c-1_6 = constant -1 : index
        %25 = muli %arg2, %c-1_6 : index
        %c64_7 = constant 64 : index
        %26 = addi %25, %c64_7 : index
        %27 = cmpi "slt", %c32_5, %26 : index
        %28 = select %27, %c32_5, %26 : index
        // Input tile: offsets are [%arg1, %arg0, %arg2], i.e. the first two
        // induction variables are swapped relative to the output tile below.
        %29 = subview %1[%arg1, %arg0, %arg2] [%20, %24, %28] [1, 1, 1]  : memref<1x5x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
        // Sizes of the output subview; the same three min expressions are
        // recomputed here with fresh constants: %33 = min(2, 5 - %arg0),
        // %37 = min(2, 1 - %arg1), %41 = min(32, 64 - %arg2).
        %c2_8 = constant 2 : index
        %c-1_9 = constant -1 : index
        %30 = muli %arg0, %c-1_9 : index
        %c5_10 = constant 5 : index
        %31 = addi %30, %c5_10 : index
        %32 = cmpi "slt", %c2_8, %31 : index
        %33 = select %32, %c2_8, %31 : index
        %c2_11 = constant 2 : index
        %c-1_12 = constant -1 : index
        %34 = muli %arg1, %c-1_12 : index
        %c1_13 = constant 1 : index
        %35 = addi %34, %c1_13 : index
        %36 = cmpi "slt", %c2_11, %35 : index
        %37 = select %36, %c2_11, %35 : index
        %c32_14 = constant 32 : index
        %c-1_15 = constant -1 : index
        %38 = muli %arg2, %c-1_15 : index
        %c64_16 = constant 64 : index
        %39 = addi %38, %c64_16 : index
        %40 = cmpi "slt", %c32_14, %39 : index
        %41 = select %40, %c32_14, %39 : index
        %42 = subview %0[%arg0, %arg1, %arg2] [%33, %37, %41] [1, 1, 1]  : memref<5x1x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
        // Only the input-tile dims (%43, %44, %45) are used as loop bounds
        // below; the output-tile dims (%46, %47, %48) have no uses in this
        // function.
        %43 = dim %29, 0 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
        %44 = dim %29, 1 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
        %45 = dim %29, 2 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
        %46 = dim %42, 0 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
        %47 = dim %42, 1 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
        %48 = dim %42, 2 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
        %c0_17 = constant 0 : index
        %c1_18 = constant 1 : index
        %c0_19 = constant 0 : index
        %c1_20 = constant 1 : index
        %c0_21 = constant 0 : index
        %c1_22 = constant 1 : index
        %49 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %50 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %51 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %52 = "gpu.block_dim"() {dimension = "y"} : () -> index
        %53 = "gpu.thread_id"() {dimension = "z"} : () -> index
        %54 = "gpu.block_dim"() {dimension = "z"} : () -> index
        // Per-dimension element-loop start (0 + 1 * thread_id) and step
        // (1 * block_dim).
        %55 = muli %c1_18, %53 : index
        %56 = addi %c0_17, %55 : index
        %57 = muli %c1_18, %54 : index
        %58 = muli %c1_20, %51 : index
        %59 = addi %c0_19, %58 : index
        %60 = muli %c1_20, %52 : index
        %61 = muli %c1_22, %49 : index
        %62 = addi %c0_21, %61 : index
        %63 = muli %c1_22, %50 : index
        // Element loops: %arg3 uses the z thread id and input dim 1, %arg4 the
        // y thread id and input dim 0, %arg5 the x thread id and input dim 2.
        scf.for %arg3 = %56 to %44 step %57 {
          scf.for %arg4 = %59 to %43 step %60 {
            scf.for %arg5 = %62 to %45 step %63 {
              %64 = load %29[%arg4, %arg3, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
              // %65 reads the destination element but has no uses; only %64 is
              // stored.
              %65 = load %42[%arg3, %arg4, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
              store %64, %42[%arg3, %arg4, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
            }
          }
        }
      }
    }
  }
  return
}

// *** IR Dump After Canonicalizer ***
// Module wrapping the dispatch function and its HAL interface. The target
// environment names SPIR-V v1.3 with the Shader capability and the
// SPV_KHR_storage_buffer_storage_class extension, and limits workgroups to 128
// invocations with a maximum size of [128, 128, 64].
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // local_size [32, 2, 2] multiplies out to 128 invocations.
  func @main_ex_dispatch_3() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    // Each constant value is now defined once at function scope.
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c1 = constant 1 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c64 = constant 64 : index
    // %0 is the output (@ret0), %1 is the input (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = "gpu.block_id"() {dimension = "y"} : () -> index
    %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %6 = "gpu.block_id"() {dimension = "z"} : () -> index
    %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    // Loop starts are block_id * tile and steps are grid_dim * tile, with tile
    // factors 2 (z), 2 (y) and 32 (x); the "+ 0" additions are gone.
    %8 = muli %6, %c2 : index
    %9 = muli %7, %c2 : index
    %10 = muli %4, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %2, %c32 : index
    %13 = muli %3, %c32 : index
    scf.for %arg0 = %8 to %c5 step %9 {
      scf.for %arg1 = %10 to %c1 step %11 {
        scf.for %arg2 = %12 to %c64 step %13 {
          // Input subview sizes: %17 = min(2, 1 - %arg1),
          // %21 = min(2, 5 - %arg0), %25 = min(32, 64 - %arg2).
          %14 = muli %arg1, %c-1 : index
          %15 = addi %14, %c1 : index
          %16 = cmpi "slt", %c2, %15 : index
          %17 = select %16, %c2, %15 : index
          %18 = muli %arg0, %c-1 : index
          %19 = addi %18, %c5 : index
          %20 = cmpi "slt", %c2, %19 : index
          %21 = select %20, %c2, %19 : index
          %22 = muli %arg2, %c-1 : index
          %23 = addi %22, %c64 : index
          %24 = cmpi "slt", %c32, %23 : index
          %25 = select %24, %c32, %23 : index
          %26 = subview %1[%arg1, %arg0, %arg2] [%17, %21, %25] [1, 1, 1]  : memref<1x5x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
          // Output subview sizes: %30, %34 and %38 repeat the expressions of
          // %21, %17 and %25 respectively.
          %27 = muli %arg0, %c-1 : index
          %28 = addi %27, %c5 : index
          %29 = cmpi "slt", %c2, %28 : index
          %30 = select %29, %c2, %28 : index
          %31 = muli %arg1, %c-1 : index
          %32 = addi %31, %c1 : index
          %33 = cmpi "slt", %c2, %32 : index
          %34 = select %33, %c2, %32 : index
          %35 = muli %arg2, %c-1 : index
          %36 = addi %35, %c64 : index
          %37 = cmpi "slt", %c32, %36 : index
          %38 = select %37, %c32, %36 : index
          %39 = subview %0[%arg0, %arg1, %arg2] [%30, %34, %38] [1, 1, 1]  : memref<5x1x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
          %40 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %41 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %42 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %43 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %44 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %45 = "gpu.block_dim"() {dimension = "z"} : () -> index
          // Element loops start at thread_id, step by block_dim, and are
          // bounded directly by the input subview sizes (%21, %17, %25).
          scf.for %arg3 = %44 to %21 step %45 {
            scf.for %arg4 = %42 to %17 step %43 {
              scf.for %arg5 = %40 to %25 step %41 {
                // Load at [%arg4, %arg3, %arg5], store at [%arg3, %arg4, %arg5]:
                // the first two indices are swapped between input and output.
                %46 = load %26[%arg4, %arg3, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
                store %46, %39[%arg3, %arg4, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
              }
            }
          }
        }
      }
    }
    return
  }
  // Buffer bindings: @arg0 is read-only at (set 0, binding 0); @ret0 is
  // write/discard at (set 0, binding 1).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Module wrapping the dispatch function and its HAL interface. In this dump
// the three tile-size expressions are computed once and shared by both
// subviews.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_3() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c1 = constant 1 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c64 = constant 64 : index
    // %0 is the output (@ret0), %1 is the input (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = "gpu.block_id"() {dimension = "y"} : () -> index
    %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %6 = "gpu.block_id"() {dimension = "z"} : () -> index
    %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    // Tile-loop start (block_id * tile) and step (grid_dim * tile) per
    // dimension: z and y use 2, x uses 32.
    %8 = muli %6, %c2 : index
    %9 = muli %7, %c2 : index
    %10 = muli %4, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %2, %c32 : index
    %13 = muli %3, %c32 : index
    scf.for %arg0 = %8 to %c5 step %9 {
      scf.for %arg1 = %10 to %c1 step %11 {
        scf.for %arg2 = %12 to %c64 step %13 {
          // %17 = min(2, 1 - %arg1), %21 = min(2, 5 - %arg0),
          // %25 = min(32, 64 - %arg2).
          %14 = muli %arg1, %c-1 : index
          %15 = addi %14, %c1 : index
          %16 = cmpi "slt", %c2, %15 : index
          %17 = select %16, %c2, %15 : index
          %18 = muli %arg0, %c-1 : index
          %19 = addi %18, %c5 : index
          %20 = cmpi "slt", %c2, %19 : index
          %21 = select %20, %c2, %19 : index
          %22 = muli %arg2, %c-1 : index
          %23 = addi %22, %c64 : index
          %24 = cmpi "slt", %c32, %23 : index
          %25 = select %24, %c32, %23 : index
          // Input tile is sized [%17, %21, %25]; the output tile reuses the
          // same values in the order [%21, %17, %25].
          %26 = subview %1[%arg1, %arg0, %arg2] [%17, %21, %25] [1, 1, 1]  : memref<1x5x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
          %27 = subview %0[%arg0, %arg1, %arg2] [%21, %17, %25] [1, 1, 1]  : memref<5x1x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
          %28 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %29 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %30 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %31 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %32 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %33 = "gpu.block_dim"() {dimension = "z"} : () -> index
          // Element loops: start at thread_id, step by block_dim, bounded by
          // the tile sizes.
          scf.for %arg3 = %32 to %21 step %33 {
            scf.for %arg4 = %30 to %17 step %31 {
              scf.for %arg5 = %28 to %25 step %29 {
                // First two indices are swapped between the load and the store.
                %34 = load %26[%arg4, %arg3, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
                store %34, %27[%arg3, %arg4, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
              }
            }
          }
        }
      }
    }
    return
  }
  // Buffer bindings: @arg0 is read-only at (set 0, binding 0); @ret0 is
  // write/discard at (set 0, binding 1).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ResolveShapeOpsPass ***
// Function-level dump of the dispatch. The ops printed here are the same as in
// the function of the CSE dump that precedes it: a tiled copy from the 1x5x64
// input into the 5x1x64 output with the first two indices swapped.
func @main_ex_dispatch_3() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c5 = constant 5 : index
  %c2 = constant 2 : index
  %c1 = constant 1 : index
  %c32 = constant 32 : index
  %c-1 = constant -1 : index
  %c64 = constant 64 : index
  // %0 is the output (@ret0), %1 is the input (@arg0).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
  %2 = "gpu.block_id"() {dimension = "x"} : () -> index
  %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %4 = "gpu.block_id"() {dimension = "y"} : () -> index
  %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %6 = "gpu.block_id"() {dimension = "z"} : () -> index
  %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
  // Tile-loop start (block_id * tile) and step (grid_dim * tile) per
  // dimension: z and y use 2, x uses 32.
  %8 = muli %6, %c2 : index
  %9 = muli %7, %c2 : index
  %10 = muli %4, %c2 : index
  %11 = muli %5, %c2 : index
  %12 = muli %2, %c32 : index
  %13 = muli %3, %c32 : index
  scf.for %arg0 = %8 to %c5 step %9 {
    scf.for %arg1 = %10 to %c1 step %11 {
      scf.for %arg2 = %12 to %c64 step %13 {
        // %17 = min(2, 1 - %arg1), %21 = min(2, 5 - %arg0),
        // %25 = min(32, 64 - %arg2).
        %14 = muli %arg1, %c-1 : index
        %15 = addi %14, %c1 : index
        %16 = cmpi "slt", %c2, %15 : index
        %17 = select %16, %c2, %15 : index
        %18 = muli %arg0, %c-1 : index
        %19 = addi %18, %c5 : index
        %20 = cmpi "slt", %c2, %19 : index
        %21 = select %20, %c2, %19 : index
        %22 = muli %arg2, %c-1 : index
        %23 = addi %22, %c64 : index
        %24 = cmpi "slt", %c32, %23 : index
        %25 = select %24, %c32, %23 : index
        // Input tile sized [%17, %21, %25]; output tile sized [%21, %17, %25].
        %26 = subview %1[%arg1, %arg0, %arg2] [%17, %21, %25] [1, 1, 1]  : memref<1x5x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
        %27 = subview %0[%arg0, %arg1, %arg2] [%21, %17, %25] [1, 1, 1]  : memref<5x1x64xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
        %28 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %29 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %30 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %31 = "gpu.block_dim"() {dimension = "y"} : () -> index
        %32 = "gpu.thread_id"() {dimension = "z"} : () -> index
        %33 = "gpu.block_dim"() {dimension = "z"} : () -> index
        // Element loops: start at thread_id, step by block_dim, bounded by the
        // tile sizes.
        scf.for %arg3 = %32 to %21 step %33 {
          scf.for %arg4 = %30 to %17 step %31 {
            scf.for %arg5 = %28 to %25 step %29 {
              // First two indices are swapped between the load and the store.
              %34 = load %26[%arg4, %arg3, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 320 + s0 + d1 * 64 + d2)>>
              store %34, %27[%arg3, %arg4, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 64 + s0 + d1 * 64 + d2)>>
            }
          }
        }
      }
    }
  }
  return
}

// *** IR Dump After LegalizeStandardForSPIRV ***
// Module wrapping the dispatch function and its HAL interface. In this dump
// there are no subview ops: the load and store address the statically shaped
// buffers %1 and %0 directly, using (tile offset + element index) per
// dimension.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_3() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c1 = constant 1 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c64 = constant 64 : index
    // %0 is the output (@ret0), %1 is the input (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = "gpu.block_id"() {dimension = "y"} : () -> index
    %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %6 = "gpu.block_id"() {dimension = "z"} : () -> index
    %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    // Tile-loop start (block_id * tile) and step (grid_dim * tile) per
    // dimension: z and y use 2, x uses 32.
    %8 = muli %6, %c2 : index
    %9 = muli %7, %c2 : index
    %10 = muli %4, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %2, %c32 : index
    %13 = muli %3, %c32 : index
    scf.for %arg0 = %8 to %c5 step %9 {
      scf.for %arg1 = %10 to %c1 step %11 {
        scf.for %arg2 = %12 to %c64 step %13 {
          // Element-loop upper bounds: %17 = min(2, 1 - %arg1),
          // %21 = min(2, 5 - %arg0), %25 = min(32, 64 - %arg2).
          %14 = muli %arg1, %c-1 : index
          %15 = addi %14, %c1 : index
          %16 = cmpi "slt", %c2, %15 : index
          %17 = select %16, %c2, %15 : index
          %18 = muli %arg0, %c-1 : index
          %19 = addi %18, %c5 : index
          %20 = cmpi "slt", %c2, %19 : index
          %21 = select %20, %c2, %19 : index
          %22 = muli %arg2, %c-1 : index
          %23 = addi %22, %c64 : index
          %24 = cmpi "slt", %c32, %23 : index
          %25 = select %24, %c32, %23 : index
          %26 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %27 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %28 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %29 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %30 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %31 = "gpu.block_dim"() {dimension = "z"} : () -> index
          scf.for %arg3 = %30 to %21 step %31 {
            scf.for %arg4 = %28 to %17 step %29 {
              scf.for %arg5 = %26 to %25 step %27 {
                // Input indices: [%arg1 + %arg4, %arg0 + %arg3, %arg2 + %arg5].
                %32 = addi %arg1, %arg4 : index
                %33 = addi %arg0, %arg3 : index
                %34 = addi %arg2, %arg5 : index
                %35 = load %1[%32, %33, %34] : memref<1x5x64xf32>
                // Output indices: [%arg0 + %arg3, %arg1 + %arg4, %arg2 + %arg5].
                // These recompute the same three sums as %33, %32 and %34.
                %36 = addi %arg0, %arg3 : index
                %37 = addi %arg1, %arg4 : index
                %38 = addi %arg2, %arg5 : index
                store %35, %0[%36, %37, %38] : memref<5x1x64xf32>
              }
            }
          }
        }
      }
    }
    return
  }
  // Buffer bindings: @arg0 is read-only at (set 0, binding 0); @ret0 is
  // write/discard at (set 0, binding 1).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After Canonicalizer ***
// Module wrapping the dispatch function and its HAL interface. The printed IR
// matches the LegalizeStandardForSPIRV dump that precedes it; the repeated
// index additions before the store are still present.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_3() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c1 = constant 1 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c64 = constant 64 : index
    // %0 is the output (@ret0), %1 is the input (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = "gpu.block_id"() {dimension = "y"} : () -> index
    %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %6 = "gpu.block_id"() {dimension = "z"} : () -> index
    %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    // Tile-loop start (block_id * tile) and step (grid_dim * tile) per
    // dimension: z and y use 2, x uses 32.
    %8 = muli %6, %c2 : index
    %9 = muli %7, %c2 : index
    %10 = muli %4, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %2, %c32 : index
    %13 = muli %3, %c32 : index
    scf.for %arg0 = %8 to %c5 step %9 {
      scf.for %arg1 = %10 to %c1 step %11 {
        scf.for %arg2 = %12 to %c64 step %13 {
          // Element-loop upper bounds: %17 = min(2, 1 - %arg1),
          // %21 = min(2, 5 - %arg0), %25 = min(32, 64 - %arg2).
          %14 = muli %arg1, %c-1 : index
          %15 = addi %14, %c1 : index
          %16 = cmpi "slt", %c2, %15 : index
          %17 = select %16, %c2, %15 : index
          %18 = muli %arg0, %c-1 : index
          %19 = addi %18, %c5 : index
          %20 = cmpi "slt", %c2, %19 : index
          %21 = select %20, %c2, %19 : index
          %22 = muli %arg2, %c-1 : index
          %23 = addi %22, %c64 : index
          %24 = cmpi "slt", %c32, %23 : index
          %25 = select %24, %c32, %23 : index
          %26 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %27 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %28 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %29 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %30 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %31 = "gpu.block_dim"() {dimension = "z"} : () -> index
          scf.for %arg3 = %30 to %21 step %31 {
            scf.for %arg4 = %28 to %17 step %29 {
              scf.for %arg5 = %26 to %25 step %27 {
                // Input indices: [%arg1 + %arg4, %arg0 + %arg3, %arg2 + %arg5].
                %32 = addi %arg1, %arg4 : index
                %33 = addi %arg0, %arg3 : index
                %34 = addi %arg2, %arg5 : index
                %35 = load %1[%32, %33, %34] : memref<1x5x64xf32>
                // Output indices: the same three sums, recomputed, with the
                // first two in swapped order.
                %36 = addi %arg0, %arg3 : index
                %37 = addi %arg1, %arg4 : index
                %38 = addi %arg2, %arg5 : index
                store %35, %0[%36, %37, %38] : memref<5x1x64xf32>
              }
            }
          }
        }
      }
    }
    return
  }
  // Buffer bindings: @arg0 is read-only at (set 0, binding 0); @ret0 is
  // write/discard at (set 0, binding 1).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Module wrapping the dispatch function and its HAL interface. In this dump
// the three index sums are computed once and shared by the load and the store.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_3() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c1 = constant 1 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c64 = constant 64 : index
    // %0 is the output (@ret0), %1 is the input (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x64xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5x64xf32>
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = "gpu.block_id"() {dimension = "y"} : () -> index
    %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %6 = "gpu.block_id"() {dimension = "z"} : () -> index
    %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    // Tile-loop start (block_id * tile) and step (grid_dim * tile) per
    // dimension: z and y use 2, x uses 32.
    %8 = muli %6, %c2 : index
    %9 = muli %7, %c2 : index
    %10 = muli %4, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %2, %c32 : index
    %13 = muli %3, %c32 : index
    scf.for %arg0 = %8 to %c5 step %9 {
      scf.for %arg1 = %10 to %c1 step %11 {
        scf.for %arg2 = %12 to %c64 step %13 {
          // Element-loop upper bounds: %17 = min(2, 1 - %arg1),
          // %21 = min(2, 5 - %arg0), %25 = min(32, 64 - %arg2).
          %14 = muli %arg1, %c-1 : index
          %15 = addi %14, %c1 : index
          %16 = cmpi "slt", %c2, %15 : index
          %17 = select %16, %c2, %15 : index
          %18 = muli %arg0, %c-1 : index
          %19 = addi %18, %c5 : index
          %20 = cmpi "slt", %c2, %19 : index
          %21 = select %20, %c2, %19 : index
          %22 = muli %arg2, %c-1 : index
          %23 = addi %22, %c64 : index
          %24 = cmpi "slt", %c32, %23 : index
          %25 = select %24, %c32, %23 : index
          %26 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %27 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %28 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %29 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %30 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %31 = "gpu.block_dim"() {dimension = "z"} : () -> index
          scf.for %arg3 = %30 to %21 step %31 {
            scf.for %arg4 = %28 to %17 step %29 {
              scf.for %arg5 = %26 to %25 step %27 {
                // %32 = %arg1 + %arg4, %33 = %arg0 + %arg3, %34 = %arg2 + %arg5.
                %32 = addi %arg1, %arg4 : index
                %33 = addi %arg0, %arg3 : index
                %34 = addi %arg2, %arg5 : index
                // Load at [%32, %33, %34]; store at [%33, %32, %34].
                %35 = load %1[%32, %33, %34] : memref<1x5x64xf32>
                store %35, %0[%33, %32, %34] : memref<5x1x64xf32>
              }
            }
          }
        }
      }
    }
    return
  }
  // Buffer bindings: @arg0 is read-only at (set 0, binding 0); @ret0 is
  // write/discard at (set 0, binding 1).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertToSPIRVPass ***
// Module holding the SPIR-V form of the dispatch plus its HAL interface.
// Index values are i32 here. Both buffers appear as a struct wrapping a
// 320-element f32 array (stride 4) in the StorageBuffer storage class. Each
// loop is an spv.loop region in which ^bb1 is the header that carries the
// induction variable and tests it, ^bb2 is the body (ending with the
// increment and back-branch), and ^bb3 holds spv._merge.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  spv.module Logical GLSL450 {
    // Built-in inputs read by the function body.
    spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
    spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
    spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
    // Resource variables at (set 0, binding 0) and (set 0, binding 1); these
    // are the same set/binding pairs as @arg0 and @ret0 in the hal.interface
    // below.
    spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    spv.func @main_ex_dispatch_3() "None" attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
      %0 = spv.constant 5 : i32
      %1 = spv.constant 2 : i32
      %2 = spv.constant 1 : i32
      %3 = spv.constant 32 : i32
      %4 = spv.constant -1 : i32
      %5 = spv.constant 64 : i32
      // %6 points at binding 1 (the store target), %7 at binding 0 (the load
      // source).
      %6 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
      %7 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
      // WorkgroupId and NumWorkgroups are loaded once per component:
      // x -> %10 / %13, y -> %16 / %19, z -> %22 / %25.
      %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %9 = spv.Load "Input" %8 : vector<3xi32>
      %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
      %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %12 = spv.Load "Input" %11 : vector<3xi32>
      %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
      %14 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %15 = spv.Load "Input" %14 : vector<3xi32>
      %16 = spv.CompositeExtract %15[1 : i32] : vector<3xi32>
      %17 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %18 = spv.Load "Input" %17 : vector<3xi32>
      %19 = spv.CompositeExtract %18[1 : i32] : vector<3xi32>
      %20 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %21 = spv.Load "Input" %20 : vector<3xi32>
      %22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
      %23 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %24 = spv.Load "Input" %23 : vector<3xi32>
      %25 = spv.CompositeExtract %24[2 : i32] : vector<3xi32>
      // Tile-loop starts (%26, %28, %30) and steps (%27, %29, %31):
      // z and y are scaled by 2, x by 32.
      %26 = spv.IMul %22, %1 : i32
      %27 = spv.IMul %25, %1 : i32
      %28 = spv.IMul %16, %1 : i32
      %29 = spv.IMul %19, %1 : i32
      %30 = spv.IMul %10, %3 : i32
      %31 = spv.IMul %13, %3 : i32
      // Tile loop 1: %32 starts at %26, runs while %32 < 5, steps by %27.
      spv.loop {
        spv.Branch ^bb1(%26 : i32)
      ^bb1(%32: i32):  // 2 preds: ^bb0, ^bb2
        %33 = spv.SLessThan %32, %0 : i32
        spv.BranchConditional %33, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Tile loop 2: %35 starts at %28, runs while %35 < 1, steps by %29.
        spv.loop {
          spv.Branch ^bb1(%28 : i32)
        ^bb1(%35: i32):  // 2 preds: ^bb0, ^bb2
          %36 = spv.SLessThan %35, %2 : i32
          spv.BranchConditional %36, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Tile loop 3: %38 starts at %30, runs while %38 < 64, steps by %31.
          spv.loop {
            spv.Branch ^bb1(%30 : i32)
          ^bb1(%38: i32):  // 2 preds: ^bb0, ^bb2
            %39 = spv.SLessThan %38, %5 : i32
            spv.BranchConditional %39, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Element-loop upper bounds: %43 = min(2, 1 - %35),
            // %47 = min(2, 5 - %32), %51 = min(32, 64 - %38).
            %40 = spv.IMul %35, %4 : i32
            %41 = spv.IAdd %40, %2 : i32
            %42 = spv.SLessThan %1, %41 : i32
            %43 = spv.Select %42, %1, %41 : i1, i32
            %44 = spv.IMul %32, %4 : i32
            %45 = spv.IAdd %44, %0 : i32
            %46 = spv.SLessThan %1, %45 : i32
            %47 = spv.Select %46, %1, %45 : i1, i32
            %48 = spv.IMul %38, %4 : i32
            %49 = spv.IAdd %48, %5 : i32
            %50 = spv.SLessThan %3, %49 : i32
            %51 = spv.Select %50, %3, %49 : i1, i32
            // Element-loop starts come from LocalInvocationId (x -> %54,
            // y -> %58, z -> %62). The steps are the constants 32, 2 and 2
            // (%55, %59, %63), which equal the local_size entries on the
            // function.
            %52 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %53 = spv.Load "Input" %52 : vector<3xi32>
            %54 = spv.CompositeExtract %53[0 : i32] : vector<3xi32>
            %55 = spv.constant 32 : i32
            %56 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %57 = spv.Load "Input" %56 : vector<3xi32>
            %58 = spv.CompositeExtract %57[1 : i32] : vector<3xi32>
            %59 = spv.constant 2 : i32
            %60 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %61 = spv.Load "Input" %60 : vector<3xi32>
            %62 = spv.CompositeExtract %61[2 : i32] : vector<3xi32>
            %63 = spv.constant 2 : i32
            // Element loop 1: %65 starts at %62, runs while %65 < %47, steps by 2.
            spv.loop {
              spv.Branch ^bb1(%62 : i32)
            ^bb1(%65: i32):  // 2 preds: ^bb0, ^bb2
              %66 = spv.SLessThan %65, %47 : i32
              spv.BranchConditional %66, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Element loop 2: %68 starts at %58, runs while %68 < %43, steps by 2.
              spv.loop {
                spv.Branch ^bb1(%58 : i32)
              ^bb1(%68: i32):  // 2 preds: ^bb0, ^bb2
                %69 = spv.SLessThan %68, %43 : i32
                spv.BranchConditional %69, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Element loop 3: %71 starts at %54, runs while %71 < %51,
                // steps by 32.
                spv.loop {
                  spv.Branch ^bb1(%54 : i32)
                ^bb1(%71: i32):  // 2 preds: ^bb0, ^bb2
                  %72 = spv.SLessThan %71, %51 : i32
                  spv.BranchConditional %72, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  // Tile offset + element index per dimension.
                  %73 = spv.IAdd %35, %68 : i32
                  %74 = spv.IAdd %32, %65 : i32
                  %75 = spv.IAdd %38, %71 : i32
                  // Load address: 0 + 320 * %73 + 64 * %74 + 1 * %75, indexing
                  // the array inside struct member 0 of binding 0.
                  %76 = spv.constant 0 : i32
                  %77 = spv.constant 0 : i32
                  %78 = spv.constant 320 : i32
                  %79 = spv.IMul %78, %73 : i32
                  %80 = spv.IAdd %77, %79 : i32
                  %81 = spv.constant 64 : i32
                  %82 = spv.IMul %81, %74 : i32
                  %83 = spv.IAdd %80, %82 : i32
                  %84 = spv.constant 1 : i32
                  %85 = spv.IMul %84, %75 : i32
                  %86 = spv.IAdd %83, %85 : i32
                  %87 = spv.AccessChain %7[%76, %86] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
                  %88 = spv.Load "StorageBuffer" %87 : f32
                  // Store address: 0 + 64 * %74 + 64 * %73 + 1 * %75, indexing
                  // the array inside struct member 0 of binding 1.
                  %89 = spv.constant 0 : i32
                  %90 = spv.constant 0 : i32
                  %91 = spv.constant 64 : i32
                  %92 = spv.IMul %91, %74 : i32
                  %93 = spv.IAdd %90, %92 : i32
                  %94 = spv.constant 64 : i32
                  %95 = spv.IMul %94, %73 : i32
                  %96 = spv.IAdd %93, %95 : i32
                  %97 = spv.constant 1 : i32
                  %98 = spv.IMul %97, %75 : i32
                  %99 = spv.IAdd %96, %98 : i32
                  %100 = spv.AccessChain %6[%89, %99] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
                  spv.Store "StorageBuffer" %100, %88 : f32
                  %101 = spv.IAdd %71, %55 : i32
                  spv.Branch ^bb1(%101 : i32)
                ^bb3:  // pred: ^bb1
                  spv._merge
                }
                %70 = spv.IAdd %68, %59 : i32
                spv.Branch ^bb1(%70 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %67 = spv.IAdd %65, %63 : i32
              spv.Branch ^bb1(%67 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %64 = spv.IAdd %38, %31 : i32
            spv.Branch ^bb1(%64 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %37 = spv.IAdd %35, %29 : i32
          spv.Branch ^bb1(%37 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %34 = spv.IAdd %32, %27 : i32
        spv.Branch ^bb1(%34 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      spv.Return
    }
  }
  // Buffer bindings: @arg0 is read-only at (set 0, binding 0); @ret0 is
  // write/discard at (set 0, binding 1).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After SPIRVLowerABIAttributes ***
// SPIR-V module holding the single compute entry point @main_ex_dispatch_3.
// The function copies f32 elements from the buffer at bind(0, 0) to the buffer
// at bind(0, 1) with a different linear index on each side (see the innermost
// loop body). NOTE(review): presumably this is the kernel for the
// hal.executable entry point of the same name, whose signature is shown
// elsewhere in this log as (tensor<1x5x64xf32>) -> tensor<5x1x64xf32> - confirm.
spv.module Logical GLSL450 {
  // Built-in input variables read by the entry point.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Two StorageBuffer resources, each a struct wrapping a 320-element f32
  // array with a 4-byte stride. The function loads from bind(0, 0) and stores
  // to bind(0, 1).
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_3() "None" {
    %0 = spv.constant 5 : i32
    %1 = spv.constant 2 : i32
    %2 = spv.constant 1 : i32
    %3 = spv.constant 32 : i32
    %4 = spv.constant -1 : i32
    %5 = spv.constant 64 : i32
    // %6 is the destination buffer (bind(0, 1)); %7 is the source buffer (bind(0, 0)).
    %6 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    %7 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    // Components 0, 1 and 2 of WorkgroupId (%10, %16, %22) and of
    // NumWorkgroups (%13, %19, %25); each built-in is re-loaded per component.
    %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %9 = spv.Load "Input" %8 : vector<3xi32>
    %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
    %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %12 = spv.Load "Input" %11 : vector<3xi32>
    %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
    %14 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %15 = spv.Load "Input" %14 : vector<3xi32>
    %16 = spv.CompositeExtract %15[1 : i32] : vector<3xi32>
    %17 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %18 = spv.Load "Input" %17 : vector<3xi32>
    %19 = spv.CompositeExtract %18[1 : i32] : vector<3xi32>
    %20 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %21 = spv.Load "Input" %20 : vector<3xi32>
    %22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
    %23 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %24 = spv.Load "Input" %23 : vector<3xi32>
    %25 = spv.CompositeExtract %24[2 : i32] : vector<3xi32>
    // Start / step pairs for the three outer loops:
    //   %26 / %27 = WorkgroupId[2] * 2  / NumWorkgroups[2] * 2
    //   %28 / %29 = WorkgroupId[1] * 2  / NumWorkgroups[1] * 2
    //   %30 / %31 = WorkgroupId[0] * 32 / NumWorkgroups[0] * 32
    %26 = spv.IMul %22, %1 : i32
    %27 = spv.IMul %25, %1 : i32
    %28 = spv.IMul %16, %1 : i32
    %29 = spv.IMul %19, %1 : i32
    %30 = spv.IMul %10, %3 : i32
    %31 = spv.IMul %13, %3 : i32
    // Outer loop 1: %32 runs from %26 while %32 < 5, stepping by %27.
    spv.loop {
      spv.Branch ^bb1(%26 : i32)
    ^bb1(%32: i32):  // 2 preds: ^bb0, ^bb2
      %33 = spv.SLessThan %32, %0 : i32
      spv.BranchConditional %33, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Outer loop 2: %35 runs from %28 while %35 < 1, stepping by %29.
      spv.loop {
        spv.Branch ^bb1(%28 : i32)
      ^bb1(%35: i32):  // 2 preds: ^bb0, ^bb2
        %36 = spv.SLessThan %35, %2 : i32
        spv.BranchConditional %36, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Outer loop 3: %38 runs from %30 while %38 < 64, stepping by %31.
        spv.loop {
          spv.Branch ^bb1(%30 : i32)
        ^bb1(%38: i32):  // 2 preds: ^bb0, ^bb2
          %39 = spv.SLessThan %38, %5 : i32
          spv.BranchConditional %39, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Upper bounds for the inner loops, each a signed minimum built from
          // SLessThan + Select:
          //   %43 = min(2, 1 - %35), %47 = min(2, 5 - %32), %51 = min(32, 64 - %38)
          %40 = spv.IMul %35, %4 : i32
          %41 = spv.IAdd %40, %2 : i32
          %42 = spv.SLessThan %1, %41 : i32
          %43 = spv.Select %42, %1, %41 : i1, i32
          %44 = spv.IMul %32, %4 : i32
          %45 = spv.IAdd %44, %0 : i32
          %46 = spv.SLessThan %1, %45 : i32
          %47 = spv.Select %46, %1, %45 : i1, i32
          %48 = spv.IMul %38, %4 : i32
          %49 = spv.IAdd %48, %5 : i32
          %50 = spv.SLessThan %3, %49 : i32
          %51 = spv.Select %50, %3, %49 : i1, i32
          // Inner loop start values are LocalInvocationId components 0, 1, 2
          // (%54, %58, %62); the steps %55 = 32, %59 = 2, %63 = 2 equal the
          // LocalSize values declared by spv.ExecutionMode at the end of the module.
          %52 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %53 = spv.Load "Input" %52 : vector<3xi32>
          %54 = spv.CompositeExtract %53[0 : i32] : vector<3xi32>
          %55 = spv.constant 32 : i32
          %56 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %57 = spv.Load "Input" %56 : vector<3xi32>
          %58 = spv.CompositeExtract %57[1 : i32] : vector<3xi32>
          %59 = spv.constant 2 : i32
          %60 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %61 = spv.Load "Input" %60 : vector<3xi32>
          %62 = spv.CompositeExtract %61[2 : i32] : vector<3xi32>
          %63 = spv.constant 2 : i32
          // Inner loop 1: %65 runs from %62 while %65 < %47, stepping by %63.
          spv.loop {
            spv.Branch ^bb1(%62 : i32)
          ^bb1(%65: i32):  // 2 preds: ^bb0, ^bb2
            %66 = spv.SLessThan %65, %47 : i32
            spv.BranchConditional %66, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner loop 2: %68 runs from %58 while %68 < %43, stepping by %59.
            spv.loop {
              spv.Branch ^bb1(%58 : i32)
            ^bb1(%68: i32):  // 2 preds: ^bb0, ^bb2
              %69 = spv.SLessThan %68, %43 : i32
              spv.BranchConditional %69, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop 3: %71 runs from %54 while %71 < %51, stepping by %55.
              spv.loop {
                spv.Branch ^bb1(%54 : i32)
              ^bb1(%71: i32):  // 2 preds: ^bb0, ^bb2
                %72 = spv.SLessThan %71, %51 : i32
                spv.BranchConditional %72, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Combined indices: outer induction value + inner induction value.
                %73 = spv.IAdd %35, %68 : i32
                %74 = spv.IAdd %32, %65 : i32
                %75 = spv.IAdd %38, %71 : i32
                // Source element index: %86 = 0 + 320 * %73 + 64 * %74 + 1 * %75.
                %76 = spv.constant 0 : i32
                %77 = spv.constant 0 : i32
                %78 = spv.constant 320 : i32
                %79 = spv.IMul %78, %73 : i32
                %80 = spv.IAdd %77, %79 : i32
                %81 = spv.constant 64 : i32
                %82 = spv.IMul %81, %74 : i32
                %83 = spv.IAdd %80, %82 : i32
                %84 = spv.constant 1 : i32
                %85 = spv.IMul %84, %75 : i32
                %86 = spv.IAdd %83, %85 : i32
                %87 = spv.AccessChain %7[%76, %86] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
                %88 = spv.Load "StorageBuffer" %87 : f32
                // Destination element index: %99 = 0 + 64 * %74 + 64 * %73 + 1 * %75.
                %89 = spv.constant 0 : i32
                %90 = spv.constant 0 : i32
                %91 = spv.constant 64 : i32
                %92 = spv.IMul %91, %74 : i32
                %93 = spv.IAdd %90, %92 : i32
                %94 = spv.constant 64 : i32
                %95 = spv.IMul %94, %73 : i32
                %96 = spv.IAdd %93, %95 : i32
                %97 = spv.constant 1 : i32
                %98 = spv.IMul %97, %75 : i32
                %99 = spv.IAdd %96, %98 : i32
                %100 = spv.AccessChain %6[%89, %99] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
                spv.Store "StorageBuffer" %100, %88 : f32
                %101 = spv.IAdd %71, %55 : i32
                spv.Branch ^bb1(%101 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %70 = spv.IAdd %68, %59 : i32
              spv.Branch ^bb1(%70 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %67 = spv.IAdd %65, %63 : i32
            spv.Branch ^bb1(%67 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %64 = spv.IAdd %38, %31 : i32
          spv.Branch ^bb1(%64 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %37 = spv.IAdd %35, %29 : i32
        spv.Branch ^bb1(%37 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %34 = spv.IAdd %32, %27 : i32
      spv.Branch ^bb1(%34 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // Entry-point declaration: GLCompute execution model, listing the three
  // built-in interface variables, with a local size of 32 x 2 x 2.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_3, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_3 "LocalSize", 32, 2, 2
}

// *** IR Dump After Canonicalizer ***
// SPIR-V module for @main_ex_dispatch_3 as printed after the Canonicalizer
// pass. All integer constants are defined once at the top of the function, and
// the index arithmetic in the innermost loop body has no multiply-by-one or
// add-zero operations. The function copies f32 elements from the buffer at
// bind(0, 0) to the buffer at bind(0, 1).
spv.module Logical GLSL450 {
  // Built-in input variables read by the entry point.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Two StorageBuffer resources, each a struct wrapping a 320-element f32
  // array with a 4-byte stride.
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_3() "None" {
    // Shared constants: loop bounds (5, 1, 64), tile sizes / steps (2, 32),
    // the -1 used for negation, the 320 and 64 index multipliers, and the 0
    // struct-member index used by the access chains.
    %0 = spv.constant 5 : i32
    %1 = spv.constant 1 : i32
    %2 = spv.constant -1 : i32
    %3 = spv.constant 32 : i32
    %4 = spv.constant 2 : i32
    %5 = spv.constant 320 : i32
    %6 = spv.constant 0 : i32
    %7 = spv.constant 64 : i32
    // %8 is the destination buffer (bind(0, 1)); %9 is the source buffer (bind(0, 0)).
    %8 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    %9 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    // Components 0, 1 and 2 of WorkgroupId (%12, %18, %24) and of
    // NumWorkgroups (%15, %21, %27).
    %10 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %11 = spv.Load "Input" %10 : vector<3xi32>
    %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
    %13 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %14 = spv.Load "Input" %13 : vector<3xi32>
    %15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
    %16 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %17 = spv.Load "Input" %16 : vector<3xi32>
    %18 = spv.CompositeExtract %17[1 : i32] : vector<3xi32>
    %19 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %20 = spv.Load "Input" %19 : vector<3xi32>
    %21 = spv.CompositeExtract %20[1 : i32] : vector<3xi32>
    %22 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %23 = spv.Load "Input" %22 : vector<3xi32>
    %24 = spv.CompositeExtract %23[2 : i32] : vector<3xi32>
    %25 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %26 = spv.Load "Input" %25 : vector<3xi32>
    %27 = spv.CompositeExtract %26[2 : i32] : vector<3xi32>
    // Start / step pairs for the three outer loops:
    //   %28 / %29 = WorkgroupId[2] * 2  / NumWorkgroups[2] * 2
    //   %30 / %31 = WorkgroupId[1] * 2  / NumWorkgroups[1] * 2
    //   %32 / %33 = WorkgroupId[0] * 32 / NumWorkgroups[0] * 32
    %28 = spv.IMul %24, %4 : i32
    %29 = spv.IMul %27, %4 : i32
    %30 = spv.IMul %18, %4 : i32
    %31 = spv.IMul %21, %4 : i32
    %32 = spv.IMul %12, %3 : i32
    %33 = spv.IMul %15, %3 : i32
    // Outer loop 1: %34 runs from %28 while %34 < 5, stepping by %29.
    spv.loop {
      spv.Branch ^bb1(%28 : i32)
    ^bb1(%34: i32):  // 2 preds: ^bb0, ^bb2
      %35 = spv.SLessThan %34, %0 : i32
      spv.BranchConditional %35, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Outer loop 2: %37 runs from %30 while %37 < 1, stepping by %31.
      spv.loop {
        spv.Branch ^bb1(%30 : i32)
      ^bb1(%37: i32):  // 2 preds: ^bb0, ^bb2
        %38 = spv.SLessThan %37, %1 : i32
        spv.BranchConditional %38, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Outer loop 3: %40 runs from %32 while %40 < 64, stepping by %33.
        spv.loop {
          spv.Branch ^bb1(%32 : i32)
        ^bb1(%40: i32):  // 2 preds: ^bb0, ^bb2
          %41 = spv.SLessThan %40, %7 : i32
          spv.BranchConditional %41, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Upper bounds for the inner loops (signed minimum via SLessThan + Select):
          //   %45 = min(2, 1 - %37), %49 = min(2, 5 - %34), %53 = min(32, 64 - %40)
          %42 = spv.IMul %37, %2 : i32
          %43 = spv.IAdd %42, %1 : i32
          %44 = spv.SLessThan %4, %43 : i32
          %45 = spv.Select %44, %4, %43 : i1, i32
          %46 = spv.IMul %34, %2 : i32
          %47 = spv.IAdd %46, %0 : i32
          %48 = spv.SLessThan %4, %47 : i32
          %49 = spv.Select %48, %4, %47 : i1, i32
          %50 = spv.IMul %40, %2 : i32
          %51 = spv.IAdd %50, %7 : i32
          %52 = spv.SLessThan %3, %51 : i32
          %53 = spv.Select %52, %3, %51 : i1, i32
          // Inner loop start values: LocalInvocationId components 0, 1, 2
          // (%56, %59, %62).
          %54 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %55 = spv.Load "Input" %54 : vector<3xi32>
          %56 = spv.CompositeExtract %55[0 : i32] : vector<3xi32>
          %57 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %58 = spv.Load "Input" %57 : vector<3xi32>
          %59 = spv.CompositeExtract %58[1 : i32] : vector<3xi32>
          %60 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %61 = spv.Load "Input" %60 : vector<3xi32>
          %62 = spv.CompositeExtract %61[2 : i32] : vector<3xi32>
          // Inner loop 1: %64 runs from %62 while %64 < %49, stepping by 2.
          spv.loop {
            spv.Branch ^bb1(%62 : i32)
          ^bb1(%64: i32):  // 2 preds: ^bb0, ^bb2
            %65 = spv.SLessThan %64, %49 : i32
            spv.BranchConditional %65, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner loop 2: %67 runs from %59 while %67 < %45, stepping by 2.
            spv.loop {
              spv.Branch ^bb1(%59 : i32)
            ^bb1(%67: i32):  // 2 preds: ^bb0, ^bb2
              %68 = spv.SLessThan %67, %45 : i32
              spv.BranchConditional %68, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop 3: %70 runs from %56 while %70 < %53, stepping by 32.
              spv.loop {
                spv.Branch ^bb1(%56 : i32)
              ^bb1(%70: i32):  // 2 preds: ^bb0, ^bb2
                %71 = spv.SLessThan %70, %53 : i32
                spv.BranchConditional %71, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Combined indices: outer induction value + inner induction value.
                %72 = spv.IAdd %37, %67 : i32
                %73 = spv.IAdd %34, %64 : i32
                %74 = spv.IAdd %40, %70 : i32
                // Source element index: %78 = %72 * 320 + %73 * 64 + %74.
                %75 = spv.IMul %72, %5 : i32
                %76 = spv.IMul %73, %7 : i32
                %77 = spv.IAdd %75, %76 : i32
                %78 = spv.IAdd %77, %74 : i32
                %79 = spv.AccessChain %9[%6, %78] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
                %80 = spv.Load "StorageBuffer" %79 : f32
                // Destination element index: %84 = %73 * 64 + %72 * 64 + %74.
                %81 = spv.IMul %73, %7 : i32
                %82 = spv.IMul %72, %7 : i32
                %83 = spv.IAdd %81, %82 : i32
                %84 = spv.IAdd %83, %74 : i32
                %85 = spv.AccessChain %8[%6, %84] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
                spv.Store "StorageBuffer" %85, %80 : f32
                %86 = spv.IAdd %70, %3 : i32
                spv.Branch ^bb1(%86 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %69 = spv.IAdd %67, %4 : i32
              spv.Branch ^bb1(%69 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %66 = spv.IAdd %64, %4 : i32
            spv.Branch ^bb1(%66 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %63 = spv.IAdd %40, %33 : i32
          spv.Branch ^bb1(%63 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %39 = spv.IAdd %37, %31 : i32
        spv.Branch ^bb1(%39 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %36 = spv.IAdd %34, %29 : i32
      spv.Branch ^bb1(%36 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // Entry-point declaration: GLCompute execution model with a local size of
  // 32 x 2 x 2, which equals the inner loop steps used above.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_3, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_3 "LocalSize", 32, 2, 2
}

// *** IR Dump After CSE ***
// SPIR-V module for @main_ex_dispatch_3 as printed after the CSE pass. Each
// built-in variable's address is taken once per region (%10, %13, %50) and
// reused by every load of that built-in; the loads themselves are still
// repeated per component. The product %70 is shared by the source and
// destination index computations. The function copies f32 elements from the
// buffer at bind(0, 0) to the buffer at bind(0, 1).
spv.module Logical GLSL450 {
  // Built-in input variables read by the entry point.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Two StorageBuffer resources, each a struct wrapping a 320-element f32
  // array with a 4-byte stride.
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_3() "None" {
    %0 = spv.constant 5 : i32
    %1 = spv.constant 1 : i32
    %2 = spv.constant -1 : i32
    %3 = spv.constant 32 : i32
    %4 = spv.constant 2 : i32
    %5 = spv.constant 320 : i32
    %6 = spv.constant 0 : i32
    %7 = spv.constant 64 : i32
    // %8 is the destination buffer (bind(0, 1)); %9 is the source buffer (bind(0, 0)).
    %8 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    %9 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    // Components 0, 1 and 2 of WorkgroupId (%12, %17, %21) and of
    // NumWorkgroups (%15, %19, %23), all loaded through %10 and %13.
    %10 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %11 = spv.Load "Input" %10 : vector<3xi32>
    %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
    %13 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %14 = spv.Load "Input" %13 : vector<3xi32>
    %15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
    %16 = spv.Load "Input" %10 : vector<3xi32>
    %17 = spv.CompositeExtract %16[1 : i32] : vector<3xi32>
    %18 = spv.Load "Input" %13 : vector<3xi32>
    %19 = spv.CompositeExtract %18[1 : i32] : vector<3xi32>
    %20 = spv.Load "Input" %10 : vector<3xi32>
    %21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
    %22 = spv.Load "Input" %13 : vector<3xi32>
    %23 = spv.CompositeExtract %22[2 : i32] : vector<3xi32>
    // Start / step pairs for the three outer loops:
    //   %24 / %25 = WorkgroupId[2] * 2  / NumWorkgroups[2] * 2
    //   %26 / %27 = WorkgroupId[1] * 2  / NumWorkgroups[1] * 2
    //   %28 / %29 = WorkgroupId[0] * 32 / NumWorkgroups[0] * 32
    %24 = spv.IMul %21, %4 : i32
    %25 = spv.IMul %23, %4 : i32
    %26 = spv.IMul %17, %4 : i32
    %27 = spv.IMul %19, %4 : i32
    %28 = spv.IMul %12, %3 : i32
    %29 = spv.IMul %15, %3 : i32
    // Outer loop 1: %30 runs from %24 while %30 < 5, stepping by %25.
    spv.loop {
      spv.Branch ^bb1(%24 : i32)
    ^bb1(%30: i32):  // 2 preds: ^bb0, ^bb2
      %31 = spv.SLessThan %30, %0 : i32
      spv.BranchConditional %31, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Outer loop 2: %33 runs from %26 while %33 < 1, stepping by %27.
      spv.loop {
        spv.Branch ^bb1(%26 : i32)
      ^bb1(%33: i32):  // 2 preds: ^bb0, ^bb2
        %34 = spv.SLessThan %33, %1 : i32
        spv.BranchConditional %34, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Outer loop 3: %36 runs from %28 while %36 < 64, stepping by %29.
        spv.loop {
          spv.Branch ^bb1(%28 : i32)
        ^bb1(%36: i32):  // 2 preds: ^bb0, ^bb2
          %37 = spv.SLessThan %36, %7 : i32
          spv.BranchConditional %37, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Upper bounds for the inner loops (signed minimum via SLessThan + Select):
          //   %41 = min(2, 1 - %33), %45 = min(2, 5 - %30), %49 = min(32, 64 - %36)
          %38 = spv.IMul %33, %2 : i32
          %39 = spv.IAdd %38, %1 : i32
          %40 = spv.SLessThan %4, %39 : i32
          %41 = spv.Select %40, %4, %39 : i1, i32
          %42 = spv.IMul %30, %2 : i32
          %43 = spv.IAdd %42, %0 : i32
          %44 = spv.SLessThan %4, %43 : i32
          %45 = spv.Select %44, %4, %43 : i1, i32
          %46 = spv.IMul %36, %2 : i32
          %47 = spv.IAdd %46, %7 : i32
          %48 = spv.SLessThan %3, %47 : i32
          %49 = spv.Select %48, %3, %47 : i1, i32
          // Inner loop start values: LocalInvocationId components 0, 1, 2
          // (%52, %54, %56), all loaded through %50.
          %50 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %51 = spv.Load "Input" %50 : vector<3xi32>
          %52 = spv.CompositeExtract %51[0 : i32] : vector<3xi32>
          %53 = spv.Load "Input" %50 : vector<3xi32>
          %54 = spv.CompositeExtract %53[1 : i32] : vector<3xi32>
          %55 = spv.Load "Input" %50 : vector<3xi32>
          %56 = spv.CompositeExtract %55[2 : i32] : vector<3xi32>
          // Inner loop 1: %58 runs from %56 while %58 < %45, stepping by 2.
          spv.loop {
            spv.Branch ^bb1(%56 : i32)
          ^bb1(%58: i32):  // 2 preds: ^bb0, ^bb2
            %59 = spv.SLessThan %58, %45 : i32
            spv.BranchConditional %59, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner loop 2: %61 runs from %54 while %61 < %41, stepping by 2.
            spv.loop {
              spv.Branch ^bb1(%54 : i32)
            ^bb1(%61: i32):  // 2 preds: ^bb0, ^bb2
              %62 = spv.SLessThan %61, %41 : i32
              spv.BranchConditional %62, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop 3: %64 runs from %52 while %64 < %49, stepping by 32.
              spv.loop {
                spv.Branch ^bb1(%52 : i32)
              ^bb1(%64: i32):  // 2 preds: ^bb0, ^bb2
                %65 = spv.SLessThan %64, %49 : i32
                spv.BranchConditional %65, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Combined indices: outer induction value + inner induction value.
                %66 = spv.IAdd %33, %61 : i32
                %67 = spv.IAdd %30, %58 : i32
                %68 = spv.IAdd %36, %64 : i32
                // Source element index: %72 = %66 * 320 + %67 * 64 + %68.
                %69 = spv.IMul %66, %5 : i32
                %70 = spv.IMul %67, %7 : i32
                %71 = spv.IAdd %69, %70 : i32
                %72 = spv.IAdd %71, %68 : i32
                %73 = spv.AccessChain %9[%6, %72] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
                %74 = spv.Load "StorageBuffer" %73 : f32
                // Destination element index: %77 = %67 * 64 (reusing %70) + %66 * 64 + %68.
                %75 = spv.IMul %66, %7 : i32
                %76 = spv.IAdd %70, %75 : i32
                %77 = spv.IAdd %76, %68 : i32
                %78 = spv.AccessChain %8[%6, %77] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
                spv.Store "StorageBuffer" %78, %74 : f32
                %79 = spv.IAdd %64, %3 : i32
                spv.Branch ^bb1(%79 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %63 = spv.IAdd %61, %4 : i32
              spv.Branch ^bb1(%63 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %60 = spv.IAdd %58, %4 : i32
            spv.Branch ^bb1(%60 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %57 = spv.IAdd %36, %29 : i32
          spv.Branch ^bb1(%57 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %35 = spv.IAdd %33, %27 : i32
        spv.Branch ^bb1(%35 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %32 = spv.IAdd %30, %25 : i32
      spv.Branch ^bb1(%32 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // Entry-point declaration: GLCompute execution model with a local size of
  // 32 x 2 x 2, which equals the inner loop steps used above.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_3, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_3 "LocalSize", 32, 2, 2
}

// *** IR Dump After SPIRVUpdateVCE ***
// SPIR-V module for @main_ex_dispatch_3 as printed after the SPIRVUpdateVCE
// pass. The module header carries a `requires #spv.vce<...>` clause naming
// version v1.0, the Shader capability and the
// SPV_KHR_storage_buffer_storage_class extension. The function copies f32
// elements from the buffer at bind(0, 0) to the buffer at bind(0, 1).
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
  // Built-in input variables read by the entry point.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Two StorageBuffer resources, each a struct wrapping a 320-element f32
  // array with a 4-byte stride.
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_3() "None" {
    %0 = spv.constant 5 : i32
    %1 = spv.constant 1 : i32
    %2 = spv.constant -1 : i32
    %3 = spv.constant 32 : i32
    %4 = spv.constant 2 : i32
    %5 = spv.constant 320 : i32
    %6 = spv.constant 0 : i32
    %7 = spv.constant 64 : i32
    // %8 is the destination buffer (bind(0, 1)); %9 is the source buffer (bind(0, 0)).
    %8 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    %9 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
    // Components 0, 1 and 2 of WorkgroupId (%12, %17, %21) and of
    // NumWorkgroups (%15, %19, %23).
    %10 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %11 = spv.Load "Input" %10 : vector<3xi32>
    %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
    %13 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %14 = spv.Load "Input" %13 : vector<3xi32>
    %15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
    %16 = spv.Load "Input" %10 : vector<3xi32>
    %17 = spv.CompositeExtract %16[1 : i32] : vector<3xi32>
    %18 = spv.Load "Input" %13 : vector<3xi32>
    %19 = spv.CompositeExtract %18[1 : i32] : vector<3xi32>
    %20 = spv.Load "Input" %10 : vector<3xi32>
    %21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
    %22 = spv.Load "Input" %13 : vector<3xi32>
    %23 = spv.CompositeExtract %22[2 : i32] : vector<3xi32>
    // Start / step pairs for the three outer loops:
    //   %24 / %25 = WorkgroupId[2] * 2  / NumWorkgroups[2] * 2
    //   %26 / %27 = WorkgroupId[1] * 2  / NumWorkgroups[1] * 2
    //   %28 / %29 = WorkgroupId[0] * 32 / NumWorkgroups[0] * 32
    %24 = spv.IMul %21, %4 : i32
    %25 = spv.IMul %23, %4 : i32
    %26 = spv.IMul %17, %4 : i32
    %27 = spv.IMul %19, %4 : i32
    %28 = spv.IMul %12, %3 : i32
    %29 = spv.IMul %15, %3 : i32
    // Outer loop 1: %30 runs from %24 while %30 < 5, stepping by %25.
    spv.loop {
      spv.Branch ^bb1(%24 : i32)
    ^bb1(%30: i32):  // 2 preds: ^bb0, ^bb2
      %31 = spv.SLessThan %30, %0 : i32
      spv.BranchConditional %31, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Outer loop 2: %33 runs from %26 while %33 < 1, stepping by %27.
      spv.loop {
        spv.Branch ^bb1(%26 : i32)
      ^bb1(%33: i32):  // 2 preds: ^bb0, ^bb2
        %34 = spv.SLessThan %33, %1 : i32
        spv.BranchConditional %34, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Outer loop 3: %36 runs from %28 while %36 < 64, stepping by %29.
        spv.loop {
          spv.Branch ^bb1(%28 : i32)
        ^bb1(%36: i32):  // 2 preds: ^bb0, ^bb2
          %37 = spv.SLessThan %36, %7 : i32
          spv.BranchConditional %37, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Upper bounds for the inner loops (signed minimum via SLessThan + Select):
          //   %41 = min(2, 1 - %33), %45 = min(2, 5 - %30), %49 = min(32, 64 - %36)
          %38 = spv.IMul %33, %2 : i32
          %39 = spv.IAdd %38, %1 : i32
          %40 = spv.SLessThan %4, %39 : i32
          %41 = spv.Select %40, %4, %39 : i1, i32
          %42 = spv.IMul %30, %2 : i32
          %43 = spv.IAdd %42, %0 : i32
          %44 = spv.SLessThan %4, %43 : i32
          %45 = spv.Select %44, %4, %43 : i1, i32
          %46 = spv.IMul %36, %2 : i32
          %47 = spv.IAdd %46, %7 : i32
          %48 = spv.SLessThan %3, %47 : i32
          %49 = spv.Select %48, %3, %47 : i1, i32
          // Inner loop start values: LocalInvocationId components 0, 1, 2
          // (%52, %54, %56).
          %50 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %51 = spv.Load "Input" %50 : vector<3xi32>
          %52 = spv.CompositeExtract %51[0 : i32] : vector<3xi32>
          %53 = spv.Load "Input" %50 : vector<3xi32>
          %54 = spv.CompositeExtract %53[1 : i32] : vector<3xi32>
          %55 = spv.Load "Input" %50 : vector<3xi32>
          %56 = spv.CompositeExtract %55[2 : i32] : vector<3xi32>
          // Inner loop 1: %58 runs from %56 while %58 < %45, stepping by 2.
          spv.loop {
            spv.Branch ^bb1(%56 : i32)
          ^bb1(%58: i32):  // 2 preds: ^bb0, ^bb2
            %59 = spv.SLessThan %58, %45 : i32
            spv.BranchConditional %59, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner loop 2: %61 runs from %54 while %61 < %41, stepping by 2.
            spv.loop {
              spv.Branch ^bb1(%54 : i32)
            ^bb1(%61: i32):  // 2 preds: ^bb0, ^bb2
              %62 = spv.SLessThan %61, %41 : i32
              spv.BranchConditional %62, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop 3: %64 runs from %52 while %64 < %49, stepping by 32.
              spv.loop {
                spv.Branch ^bb1(%52 : i32)
              ^bb1(%64: i32):  // 2 preds: ^bb0, ^bb2
                %65 = spv.SLessThan %64, %49 : i32
                spv.BranchConditional %65, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Combined indices: outer induction value + inner induction value.
                %66 = spv.IAdd %33, %61 : i32
                %67 = spv.IAdd %30, %58 : i32
                %68 = spv.IAdd %36, %64 : i32
                // Source element index: %72 = %66 * 320 + %67 * 64 + %68.
                %69 = spv.IMul %66, %5 : i32
                %70 = spv.IMul %67, %7 : i32
                %71 = spv.IAdd %69, %70 : i32
                %72 = spv.IAdd %71, %68 : i32
                %73 = spv.AccessChain %9[%6, %72] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
                %74 = spv.Load "StorageBuffer" %73 : f32
                // Destination element index: %77 = %67 * 64 (reusing %70) + %66 * 64 + %68.
                %75 = spv.IMul %66, %7 : i32
                %76 = spv.IAdd %70, %75 : i32
                %77 = spv.IAdd %76, %68 : i32
                %78 = spv.AccessChain %8[%6, %77] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
                spv.Store "StorageBuffer" %78, %74 : f32
                %79 = spv.IAdd %64, %3 : i32
                spv.Branch ^bb1(%79 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %63 = spv.IAdd %61, %4 : i32
              spv.Branch ^bb1(%63 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %60 = spv.IAdd %58, %4 : i32
            spv.Branch ^bb1(%60 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %57 = spv.IAdd %36, %29 : i32
          spv.Branch ^bb1(%57 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %35 = spv.IAdd %33, %27 : i32
        spv.Branch ^bb1(%35 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %32 = spv.IAdd %30, %25 : i32
      spv.Branch ^bb1(%32 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // Entry-point declaration: GLCompute execution model with a local size of
  // 32 x 2 x 2, which equals the inner loop steps used above.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_3, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_3 "LocalSize", 32, 2, 2
}

// *** IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass ***
// HAL executable for dispatch 3 after translation: the SPIR-V module is nested
// under the "vulkan*" target. The entry point signature below is
// (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>; both buffers are declared as
// 320-element f32 arrays in the SPIR-V module.
hal.executable @main_ex_dispatch_3 attributes {sym_visibility = "private"} {
  hal.interface @legacy_io {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
  hal.executable.entry_point @main_ex_dispatch_3 attributes {interface = @legacy_io, ordinal = 0 : i32, signature = (tensor<1x5x64xf32>) -> tensor<5x1x64xf32>}
  hal.executable.target "vulkan*" {
    module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
      spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
        spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
        // Descriptor set 0: binding 0 is the buffer that is loaded from,
        // binding 1 is the buffer that is stored to (see the innermost loop).
        spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
        spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
        spv.func @main_ex_dispatch_3() "None" {
          %0 = spv.constant 5 : i32
          %1 = spv.constant 1 : i32
          %2 = spv.constant -1 : i32
          %3 = spv.constant 32 : i32
          %4 = spv.constant 2 : i32
          %5 = spv.constant 320 : i32
          %6 = spv.constant 0 : i32
          %7 = spv.constant 64 : i32
          // %8 -> binding 1 (store target), %9 -> binding 0 (load source).
          %8 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
          %9 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
          // %12/%17/%21 = WorkgroupId x/y/z; %15/%19/%23 = NumWorkgroups x/y/z.
          %10 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
          %11 = spv.Load "Input" %10 : vector<3xi32>
          %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
          %13 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
          %14 = spv.Load "Input" %13 : vector<3xi32>
          %15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
          %16 = spv.Load "Input" %10 : vector<3xi32>
          %17 = spv.CompositeExtract %16[1 : i32] : vector<3xi32>
          %18 = spv.Load "Input" %13 : vector<3xi32>
          %19 = spv.CompositeExtract %18[1 : i32] : vector<3xi32>
          %20 = spv.Load "Input" %10 : vector<3xi32>
          %21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
          %22 = spv.Load "Input" %13 : vector<3xi32>
          %23 = spv.CompositeExtract %22[2 : i32] : vector<3xi32>
          // Start offsets and steps of the three outer loops:
          //   %24 = WorkgroupId.z * 2,  %25 = NumWorkgroups.z * 2
          //   %26 = WorkgroupId.y * 2,  %27 = NumWorkgroups.y * 2
          //   %28 = WorkgroupId.x * 32, %29 = NumWorkgroups.x * 32
          %24 = spv.IMul %21, %4 : i32
          %25 = spv.IMul %23, %4 : i32
          %26 = spv.IMul %17, %4 : i32
          %27 = spv.IMul %19, %4 : i32
          %28 = spv.IMul %12, %3 : i32
          %29 = spv.IMul %15, %3 : i32
          // Outer loop 1: %30 runs from %24 while %30 < 5, step %25.
          spv.loop {
            spv.Branch ^bb1(%24 : i32)
          ^bb1(%30: i32):  // 2 preds: ^bb0, ^bb2
            %31 = spv.SLessThan %30, %0 : i32
            spv.BranchConditional %31, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Outer loop 2: %33 runs from %26 while %33 < 1, step %27.
            spv.loop {
              spv.Branch ^bb1(%26 : i32)
            ^bb1(%33: i32):  // 2 preds: ^bb0, ^bb2
              %34 = spv.SLessThan %33, %1 : i32
              spv.BranchConditional %34, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Outer loop 3: %36 runs from %28 while %36 < 64, step %29.
              spv.loop {
                spv.Branch ^bb1(%28 : i32)
              ^bb1(%36: i32):  // 2 preds: ^bb0, ^bb2
                %37 = spv.SLessThan %36, %7 : i32
                spv.BranchConditional %37, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Clamped extents for the inner loops:
                //   %41 = min(2, 1 - %33), %45 = min(2, 5 - %30), %49 = min(32, 64 - %36)
                %38 = spv.IMul %33, %2 : i32
                %39 = spv.IAdd %38, %1 : i32
                %40 = spv.SLessThan %4, %39 : i32
                %41 = spv.Select %40, %4, %39 : i1, i32
                %42 = spv.IMul %30, %2 : i32
                %43 = spv.IAdd %42, %0 : i32
                %44 = spv.SLessThan %4, %43 : i32
                %45 = spv.Select %44, %4, %43 : i1, i32
                %46 = spv.IMul %36, %2 : i32
                %47 = spv.IAdd %46, %7 : i32
                %48 = spv.SLessThan %3, %47 : i32
                %49 = spv.Select %48, %3, %47 : i1, i32
                // %52/%54/%56 = LocalInvocationId x/y/z, used as the start
                // values of the three inner loops.
                %50 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
                %51 = spv.Load "Input" %50 : vector<3xi32>
                %52 = spv.CompositeExtract %51[0 : i32] : vector<3xi32>
                %53 = spv.Load "Input" %50 : vector<3xi32>
                %54 = spv.CompositeExtract %53[1 : i32] : vector<3xi32>
                %55 = spv.Load "Input" %50 : vector<3xi32>
                %56 = spv.CompositeExtract %55[2 : i32] : vector<3xi32>
                // Inner loop 1: %58 runs from LocalInvocationId.z while %58 < %45, step 2.
                spv.loop {
                  spv.Branch ^bb1(%56 : i32)
                ^bb1(%58: i32):  // 2 preds: ^bb0, ^bb2
                  %59 = spv.SLessThan %58, %45 : i32
                  spv.BranchConditional %59, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  // Inner loop 2: %61 runs from LocalInvocationId.y while %61 < %41, step 2.
                  spv.loop {
                    spv.Branch ^bb1(%54 : i32)
                  ^bb1(%61: i32):  // 2 preds: ^bb0, ^bb2
                    %62 = spv.SLessThan %61, %41 : i32
                    spv.BranchConditional %62, ^bb2, ^bb3
                  ^bb2:  // pred: ^bb1
                    // Inner loop 3: %64 runs from LocalInvocationId.x while %64 < %49, step 32.
                    spv.loop {
                      spv.Branch ^bb1(%52 : i32)
                    ^bb1(%64: i32):  // 2 preds: ^bb0, ^bb2
                      %65 = spv.SLessThan %64, %49 : i32
                      spv.BranchConditional %65, ^bb2, ^bb3
                    ^bb2:  // pred: ^bb1
                      // Combined indices: %66 = %33 + %61, %67 = %30 + %58, %68 = %36 + %64.
                      %66 = spv.IAdd %33, %61 : i32
                      %67 = spv.IAdd %30, %58 : i32
                      %68 = spv.IAdd %36, %64 : i32
                      // Load element %66 * 320 + %67 * 64 + %68 from binding 0.
                      %69 = spv.IMul %66, %5 : i32
                      %70 = spv.IMul %67, %7 : i32
                      %71 = spv.IAdd %69, %70 : i32
                      %72 = spv.IAdd %71, %68 : i32
                      %73 = spv.AccessChain %9[%6, %72] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
                      %74 = spv.Load "StorageBuffer" %73 : f32
                      // Store it to element %67 * 64 + %66 * 64 + %68 of binding 1.
                      %75 = spv.IMul %66, %7 : i32
                      %76 = spv.IAdd %70, %75 : i32
                      %77 = spv.IAdd %76, %68 : i32
                      %78 = spv.AccessChain %8[%6, %77] : !spv.ptr<!spv.struct<!spv.array<320 x f32, stride=4> [0]>, StorageBuffer>
                      spv.Store "StorageBuffer" %78, %74 : f32
                      %79 = spv.IAdd %64, %3 : i32
                      spv.Branch ^bb1(%79 : i32)
                    ^bb3:  // pred: ^bb1
                      spv._merge
                    }
                    %63 = spv.IAdd %61, %4 : i32
                    spv.Branch ^bb1(%63 : i32)
                  ^bb3:  // pred: ^bb1
                    spv._merge
                  }
                  %60 = spv.IAdd %58, %4 : i32
                  spv.Branch ^bb1(%60 : i32)
                ^bb3:  // pred: ^bb1
                  spv._merge
                }
                %57 = spv.IAdd %36, %29 : i32
                spv.Branch ^bb1(%57 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %35 = spv.IAdd %33, %27 : i32
              spv.Branch ^bb1(%35 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %32 = spv.IAdd %30, %25 : i32
            spv.Branch ^bb1(%32 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          spv.Return
        }
        // Compute entry point with a fixed workgroup size of 32 x 2 x 2.
        spv.EntryPoint "GLCompute" @main_ex_dispatch_3, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
        spv.ExecutionMode @main_ex_dispatch_3 "LocalSize", 32, 2, 2
      }
      hal.interface @legacy_io attributes {sym_visibility = "private"} {
        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
        hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
      }
    }
  }
}

// *** IR Dump After Inliner ***
// Module for dispatch 4 after inlining. The target environment attribute
// requests SPIR-V v1.3 with the Shader capability and caps a workgroup at
// 128 invocations.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // Loads a tensor<1x5xf32> from @arg0, transposes it to 5x1, reshapes it to
  // 5x1x1 and stores the result to @ret0.
  func @main_ex_dispatch_4() {
    %c0 = constant 0 : index
    %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5xf32>
    %1 = "xla_hlo.transpose"(%0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
    %2 = "xla_hlo.reshape"(%1) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
    hal.interface.store.tensor %2, @legacy_io::@ret0, offset = %c0 : tensor<5x1x1xf32>
    return
  }
  // One read-only input buffer and one write-only output buffer, both in set 0.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::TieDynamicShapesPass ***
// All shapes here are static; the function body is the same
// load / transpose / reshape / store sequence as in the preceding dump.
func @main_ex_dispatch_4() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5xf32>
  %1 = "xla_hlo.transpose"(%0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
  %2 = "xla_hlo.reshape"(%1) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
  hal.interface.store.tensor %2, @legacy_io::@ret0, offset = %c0 : tensor<5x1x1xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::MaterializeShapeCalculationsPass ***
// No shape-calculation ops appear in this dump: the function is still the
// plain load -> transpose (1x5 to 5x1) -> reshape (to 5x1x1) -> store chain.
func @main_ex_dispatch_4() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5xf32>
  %1 = "xla_hlo.transpose"(%0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
  %2 = "xla_hlo.reshape"(%1) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
  hal.interface.store.tensor %2, @legacy_io::@ret0, offset = %c0 : tensor<5x1x1xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::HoistShapeCalculations ***
// Function body is unchanged in this dump: a single transpose followed by a
// rank-expanding reshape between the interface load and store.
func @main_ex_dispatch_4() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5xf32>
  %1 = "xla_hlo.transpose"(%0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
  %2 = "xla_hlo.reshape"(%1) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
  hal.interface.store.tensor %2, @legacy_io::@ret0, offset = %c0 : tensor<5x1x1xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::DecomposeHLOClampPass ***
// The function contains no clamp op, and the body shown here is still the
// xla_hlo.transpose + xla_hlo.reshape pair on the loaded 1x5 tensor.
func @main_ex_dispatch_4() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5xf32>
  %1 = "xla_hlo.transpose"(%0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x5xf32>) -> tensor<5x1xf32>
  %2 = "xla_hlo.reshape"(%1) : (tensor<5x1xf32>) -> tensor<5x1x1xf32>
  hal.interface.store.tensor %2, @legacy_io::@ret0, offset = %c0 : tensor<5x1x1xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnTensorsPass ***
// The two HLO ops are now expressed as two linalg.generic ops on tensors,
// each with a body that simply yields its input element.
func @main_ex_dispatch_4() {
  %c0 = constant 0 : index
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5xf32>
  // Transpose: the input is read through (d0, d1) -> (d1, d0).
  %1 = linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} %0 {
  ^bb0(%arg0: f32):  // no predecessors
    linalg.yield %arg0 : f32
  }: tensor<1x5xf32> -> tensor<5x1xf32>
  // Reshape 5x1 -> 5x1x1: the input is read through (d0, d1, d2) -> (d0, d1).
  %2 = linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} %1 {
  ^bb0(%arg0: f32):  // no predecessors
    linalg.yield %arg0 : f32
  }: tensor<5x1xf32> -> tensor<5x1x1xf32>
  hal.interface.store.tensor %2, @legacy_io::@ret0, offset = %c0 : tensor<5x1x1xf32>
  return
}

// *** IR Dump After LinalgFusionOfTensorOps ***
// Module for dispatch 4 after tensor-op fusion: the function now holds a
// single linalg.generic instead of a transpose op followed by a reshape op.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_4() {
    %c0 = constant 0 : index
    %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<1x5xf32>
    // The 1x5 input is read through (d0, d1, d2) -> (d1, d0) and the 5x1x1
    // result is written through the identity map.
    %1 = linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} %0 {
    ^bb0(%arg0: f32):  // no predecessors
      linalg.yield %arg0 : f32
    }: tensor<1x5xf32> -> tensor<5x1x1xf32>
    hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5x1x1xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnBuffersPass ***
// Buffer form of the dispatch: the interface load/store ops are replaced by
// iree.placeholder memrefs and the linalg.generic takes the output buffer as
// its second operand.
func @main_ex_dispatch_4() {
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
  // %c0 has no remaining uses in this function.
  %c0 = constant 0 : index
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
  linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} %1, %0 {
  // %arg0 is the input element and is yielded as-is; %arg1 (the output
  // element) is not read.
  ^bb0(%arg0: f32, %arg1: f32):  // no predecessors
    linalg.yield %arg0 : f32
  }: memref<1x5xf32>, memref<5x1x1xf32>
  return
}

// *** IR Dump After Canonicalizer ***
// Canonicalized buffer form of dispatch 4: the function holds only the two
// interface-buffer placeholders and one linalg.generic; no index constant
// remains in the body.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_4() {
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
    // Copies input[d1, d0] to output[d0, d1, d2].
    linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} %1, %0 {
    ^bb0(%arg0: f32, %arg1: f32):  // no predecessors
      linalg.yield %arg0 : f32
    }: memref<1x5xf32>, memref<5x1x1xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Dispatch 4 after CSE: the function body is two placeholders plus a single
// element-copying linalg.generic from the 1x5 input to the 5x1x1 output.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_4() {
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
    linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} %1, %0 {
    ^bb0(%arg0: f32, %arg1: f32):  // no predecessors
      linalg.yield %arg0 : f32
    }: memref<1x5xf32>, memref<5x1x1xf32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::LinalgTileAndFusePass ***
// Tiled form of dispatch 4. The function now carries a local_size of
// [32, 2, 2], and the copy is wrapped in an scf.parallel over the (5, 1, 1)
// iteration space with steps (2, 2, 32).
func @main_ex_dispatch_4() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c0 = constant 0 : index
  %c5 = constant 5 : index
  %c2 = constant 2 : index
  %c32 = constant 32 : index
  %c1 = constant 1 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
  scf.parallel (%arg0, %arg1, %arg2) = (%c0, %c0, %c0) to (%c5, %c1, %c1) step (%c2, %c2, %c32) {
    // Input tile sizes: min(2, 1 - %arg1) and min(2, 5 - %arg0).
    %2 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg1)
    %3 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg0)
    %4 = subview %1[%arg1, %arg0] [%2, %3] [%c1, %c1]  : memref<1x5xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>
    // Output tile sizes: min(2, 5 - %arg0), min(2, 1 - %arg1), min(32, 1 - %arg2).
    %5 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg0)
    %6 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg1)
    %7 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c1, %arg2)
    %8 = subview %0[%arg0, %arg1, %arg2] [%5, %6, %7] [%c1, %c1, %c1]  : memref<5x1x1xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
    // Per-tile copy, tagged with the "workitem" transform marker.
    linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "workitem"} %4, %8 {
    ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
      linalg.yield %arg3 : f32
    }: memref<?x?xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>, memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
    scf.yield
  }
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::SplitDispatchFunctionPass ***
// Dispatch 4 module after the split-dispatch pass. It still contains a single
// function, @main_ex_dispatch_4, holding one tiled scf.parallel loop nest.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_4() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c0 = constant 0 : index
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c32 = constant 32 : index
    %c1 = constant 1 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
    // Iterates (5, 1, 1) in steps of (2, 2, 32); each iteration copies one
    // clamped tile from a subview of the input to a subview of the output.
    scf.parallel (%arg0, %arg1, %arg2) = (%c0, %c0, %c0) to (%c5, %c1, %c1) step (%c2, %c2, %c32) {
      %2 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg1)
      %3 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg0)
      %4 = subview %1[%arg1, %arg0] [%2, %3] [%c1, %c1]  : memref<1x5xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>
      %5 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg0)
      %6 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg1)
      %7 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c1, %arg2)
      %8 = subview %0[%arg0, %arg1, %arg2] [%5, %6, %7] [%c1, %c1, %c1]  : memref<5x1x1xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
      linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "workitem"} %4, %8 {
      ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
        linalg.yield %arg3 : f32
      }: memref<?x?xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>, memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
      scf.yield
    }
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::LinalgTileAndFusePass ***
// Second dump of the tile-and-fuse pass for this function. The body is the
// same tiled scf.parallel nest: steps (2, 2, 32) over bounds (5, 1, 1), with
// affine.min-clamped subviews feeding a "workitem"-tagged linalg.generic.
func @main_ex_dispatch_4() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c0 = constant 0 : index
  %c5 = constant 5 : index
  %c2 = constant 2 : index
  %c32 = constant 32 : index
  %c1 = constant 1 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
  scf.parallel (%arg0, %arg1, %arg2) = (%c0, %c0, %c0) to (%c5, %c1, %c1) step (%c2, %c2, %c32) {
    %2 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg1)
    %3 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg0)
    %4 = subview %1[%arg1, %arg0] [%2, %3] [%c1, %c1]  : memref<1x5xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>
    %5 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c5, %arg0)
    %6 = affine.min affine_map<(d0, d1, d2) -> (2, d1 - d2)>(%c2, %c1, %arg1)
    %7 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c1, %arg2)
    %8 = subview %0[%arg0, %arg1, %arg2] [%5, %6, %7] [%c1, %c1, %c1]  : memref<5x1x1xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
    linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "workitem"} %4, %8 {
    ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
      linalg.yield %arg3 : f32
    }: memref<?x?xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>, memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
    scf.yield
  }
  return
}

// *** IR Dump After Canonicalizer ***
// Canonicalized tiled form of dispatch 4. The affine.min maps now embed their
// constant operands, and the subview strides and layout maps are static.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_4() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c0 = constant 0 : index
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c32 = constant 32 : index
    %c1 = constant 1 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
    scf.parallel (%arg0, %arg1, %arg2) = (%c0, %c0, %c0) to (%c5, %c1, %c1) step (%c2, %c2, %c32) {
      // Input tile: min(2, 1 - %arg1) by min(2, 5 - %arg0), row stride 5.
      %2 = affine.min affine_map<(d0) -> (2, -d0 + 1)>(%arg1)
      %3 = affine.min affine_map<(d0) -> (2, -d0 + 5)>(%arg0)
      %4 = subview %1[%arg1, %arg0] [%2, %3] [1, 1]  : memref<1x5xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
      // Output tile: min(2, 5 - %arg0) by min(2, 1 - %arg1) by min(32, 1 - %arg2).
      %5 = affine.min affine_map<(d0) -> (2, -d0 + 5)>(%arg0)
      %6 = affine.min affine_map<(d0) -> (2, -d0 + 1)>(%arg1)
      %7 = affine.min affine_map<(d0) -> (32, -d0 + 1)>(%arg2)
      %8 = subview %0[%arg0, %arg1, %arg2] [%5, %6, %7] [1, 1, 1]  : memref<5x1x1xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
      linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} {__internal_linalg_transform__ = "workitem"} %4, %8 {
      ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
        linalg.yield %arg3 : f32
      }: memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>, memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
      scf.yield
    }
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertToGPUPass ***
// GPU-distributed form of dispatch 4. The scf.parallel is replaced by three
// scf.for loops driven by gpu.block_id / gpu.grid_dim, and the tile-level
// linalg.generic is replaced by three scf.for loops driven by
// gpu.thread_id / gpu.block_dim around a scalar load and store.
func @main_ex_dispatch_4() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c0 = constant 0 : index
  %c5 = constant 5 : index
  %c2 = constant 2 : index
  %c32 = constant 32 : index
  %c1 = constant 1 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
  %2 = "gpu.block_id"() {dimension = "x"} : () -> index
  %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %4 = "gpu.block_id"() {dimension = "y"} : () -> index
  %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %6 = "gpu.block_id"() {dimension = "z"} : () -> index
  %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
  // Outer loop lower bounds and steps:
  //   %9  = 2 * block_id.z,  %10 = 2 * grid_dim.z
  //   %12 = 2 * block_id.y,  %13 = 2 * grid_dim.y
  //   %15 = 32 * block_id.x, %16 = 32 * grid_dim.x
  %8 = muli %c2, %6 : index
  %9 = addi %c0, %8 : index
  %10 = muli %c2, %7 : index
  %11 = muli %c2, %4 : index
  %12 = addi %c0, %11 : index
  %13 = muli %c2, %5 : index
  %14 = muli %c32, %2 : index
  %15 = addi %c0, %14 : index
  %16 = muli %c32, %3 : index
  scf.for %arg0 = %9 to %c5 step %10 {
    scf.for %arg1 = %12 to %c1 step %13 {
      scf.for %arg2 = %15 to %c1 step %16 {
        // %19 is the input tile view, %23 the output tile view.
        %17 = affine.min affine_map<(d0) -> (2, -d0 + 1)>(%arg1)
        %18 = affine.min affine_map<(d0) -> (2, -d0 + 5)>(%arg0)
        %19 = subview %1[%arg1, %arg0] [%17, %18] [1, 1]  : memref<1x5xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
        %20 = affine.min affine_map<(d0) -> (2, -d0 + 5)>(%arg0)
        %21 = affine.min affine_map<(d0) -> (2, -d0 + 1)>(%arg1)
        %22 = affine.min affine_map<(d0) -> (32, -d0 + 1)>(%arg2)
        %23 = subview %0[%arg0, %arg1, %arg2] [%20, %21, %22] [1, 1, 1]  : memref<5x1x1xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
        %24 = dim %19, 0 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
        %25 = dim %19, 1 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
        %26 = dim %23, 0 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
        %27 = dim %23, 1 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
        %28 = dim %23, 2 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
        // Inner loop upper bounds: %29 = dim 1 of the input view,
        // %30 = dim 0 of the input view, %31 = dim 2 of the output view.
        // %26 and %27 are not used by the loops below.
        %29 = affine.apply affine_map<(d0) -> (d0)>(%25)
        %30 = affine.apply affine_map<(d0) -> (d0)>(%24)
        %31 = affine.apply affine_map<(d0) -> (d0)>(%28)
        %c0_0 = constant 0 : index
        %c1_1 = constant 1 : index
        %c0_2 = constant 0 : index
        %c1_3 = constant 1 : index
        %c0_4 = constant 0 : index
        %c1_5 = constant 1 : index
        %32 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %33 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %34 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %35 = "gpu.block_dim"() {dimension = "y"} : () -> index
        %36 = "gpu.thread_id"() {dimension = "z"} : () -> index
        %37 = "gpu.block_dim"() {dimension = "z"} : () -> index
        // Inner loop lower bounds are thread_id z/y/x (%39, %42, %45) and the
        // steps are block_dim z/y/x (%40, %43, %46), each scaled by 1.
        %38 = muli %c1_1, %36 : index
        %39 = addi %c0_0, %38 : index
        %40 = muli %c1_1, %37 : index
        %41 = muli %c1_3, %34 : index
        %42 = addi %c0_2, %41 : index
        %43 = muli %c1_3, %35 : index
        %44 = muli %c1_5, %32 : index
        %45 = addi %c0_4, %44 : index
        %46 = muli %c1_5, %33 : index
        scf.for %arg3 = %39 to %29 step %40 {
          scf.for %arg4 = %42 to %30 step %43 {
            scf.for %arg5 = %45 to %31 step %46 {
              // Read input[%arg4, %arg3] from the input view.
              %47 = affine.apply affine_map<(d0) -> (d0)>(%arg4)
              %48 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
              %49 = load %19[%47, %48] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
              %50 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
              %51 = affine.apply affine_map<(d0) -> (d0)>(%arg4)
              %52 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
              // %53 reads the current output element; it is not used by the store below.
              %53 = load %23[%50, %51, %52] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
              %54 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
              %55 = affine.apply affine_map<(d0) -> (d0)>(%arg4)
              %56 = affine.apply affine_map<(d0) -> (d0)>(%arg5)
              // Write the input element to output[%arg3, %arg4, %arg5].
              store %49, %23[%54, %55, %56] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
            }
          }
        }
      }
    }
  }
  return
}

// *** IR Dump After ConvertAffineToStandard ***
// Dispatch function as printed after ConvertAffineToStandard.
// The innermost body loads element [%arg4, %arg3] from a view of the @arg0
// buffer (memref<1x5xf32>) and stores it at [%arg3, %arg4, %arg5] in a view of
// the @ret0 buffer (memref<5x1x1xf32>).
// The outer scf.for nest is driven by gpu.block_id / gpu.grid_dim, the inner
// nest by gpu.thread_id / gpu.block_dim.
func @main_ex_dispatch_4() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c0 = constant 0 : index
  %c5 = constant 5 : index
  %c2 = constant 2 : index
  %c32 = constant 32 : index
  %c1 = constant 1 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
  %2 = "gpu.block_id"() {dimension = "x"} : () -> index
  %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %4 = "gpu.block_id"() {dimension = "y"} : () -> index
  %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %6 = "gpu.block_id"() {dimension = "z"} : () -> index
  %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
  // Outer loop lower bounds (0 + c * block_id) and steps (c * grid_dim), with
  // c = 2 for z, 2 for y and 32 for x.
  %8 = muli %c2, %6 : index
  %9 = addi %c0, %8 : index
  %10 = muli %c2, %7 : index
  %11 = muli %c2, %4 : index
  %12 = addi %c0, %11 : index
  %13 = muli %c2, %5 : index
  %14 = muli %c32, %2 : index
  %15 = addi %c0, %14 : index
  %16 = muli %c32, %3 : index
  scf.for %arg0 = %9 to %c5 step %10 {
    scf.for %arg1 = %12 to %c1 step %13 {
      scf.for %arg2 = %15 to %c1 step %16 {
        // Each muli(-1) / addi / cmpi "slt" / select group below computes a
        // minimum: %20 = min(2, 1 - %arg1), %24 = min(2, 5 - %arg0).
        %c2_0 = constant 2 : index
        %c-1 = constant -1 : index
        %17 = muli %arg1, %c-1 : index
        %c1_1 = constant 1 : index
        %18 = addi %17, %c1_1 : index
        %19 = cmpi "slt", %c2_0, %18 : index
        %20 = select %19, %c2_0, %18 : index
        %c2_2 = constant 2 : index
        %c-1_3 = constant -1 : index
        %21 = muli %arg0, %c-1_3 : index
        %c5_4 = constant 5 : index
        %22 = addi %21, %c5_4 : index
        %23 = cmpi "slt", %c2_2, %22 : index
        %24 = select %23, %c2_2, %22 : index
        // View of the input at offset [%arg1, %arg0] with sizes [%20, %24].
        %25 = subview %1[%arg1, %arg0] [%20, %24] [1, 1]  : memref<1x5xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
        // Sizes for the output view: %29 = min(2, 5 - %arg0),
        // %33 = min(2, 1 - %arg1), %37 = min(32, 1 - %arg2).
        // %29 and %33 repeat the computations of %24 and %20 above.
        %c2_5 = constant 2 : index
        %c-1_6 = constant -1 : index
        %26 = muli %arg0, %c-1_6 : index
        %c5_7 = constant 5 : index
        %27 = addi %26, %c5_7 : index
        %28 = cmpi "slt", %c2_5, %27 : index
        %29 = select %28, %c2_5, %27 : index
        %c2_8 = constant 2 : index
        %c-1_9 = constant -1 : index
        %30 = muli %arg1, %c-1_9 : index
        %c1_10 = constant 1 : index
        %31 = addi %30, %c1_10 : index
        %32 = cmpi "slt", %c2_8, %31 : index
        %33 = select %32, %c2_8, %31 : index
        %c32_11 = constant 32 : index
        %c-1_12 = constant -1 : index
        %34 = muli %arg2, %c-1_12 : index
        %c1_13 = constant 1 : index
        %35 = addi %34, %c1_13 : index
        %36 = cmpi "slt", %c32_11, %35 : index
        %37 = select %36, %c32_11, %35 : index
        // View of the output at offset [%arg0, %arg1, %arg2] with sizes [%29, %33, %37].
        %38 = subview %0[%arg0, %arg1, %arg2] [%29, %33, %37] [1, 1, 1]  : memref<5x1x1xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
        // Of the dim results below, only %39, %40 and %43 are used (as the
        // upper bounds of the inner loops); %41 and %42 have no users here.
        %39 = dim %25, 0 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
        %40 = dim %25, 1 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
        %41 = dim %38, 0 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
        %42 = dim %38, 1 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
        %43 = dim %38, 2 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
        %c0_14 = constant 0 : index
        %c1_15 = constant 1 : index
        %c0_16 = constant 0 : index
        %c1_17 = constant 1 : index
        %c0_18 = constant 0 : index
        %c1_19 = constant 1 : index
        %44 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %45 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %46 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %47 = "gpu.block_dim"() {dimension = "y"} : () -> index
        %48 = "gpu.thread_id"() {dimension = "z"} : () -> index
        %49 = "gpu.block_dim"() {dimension = "z"} : () -> index
        // Inner loop lower bounds (0 + 1 * thread_id) and steps (1 * block_dim)
        // for z, y and x.
        %50 = muli %c1_15, %48 : index
        %51 = addi %c0_14, %50 : index
        %52 = muli %c1_15, %49 : index
        %53 = muli %c1_17, %46 : index
        %54 = addi %c0_16, %53 : index
        %55 = muli %c1_17, %47 : index
        %56 = muli %c1_19, %44 : index
        %57 = addi %c0_18, %56 : index
        %58 = muli %c1_19, %45 : index
        scf.for %arg3 = %51 to %40 step %52 {
          scf.for %arg4 = %54 to %39 step %55 {
            scf.for %arg5 = %57 to %43 step %58 {
              %59 = load %25[%arg4, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
              // %60 (a load of the current output element) has no users in
              // this function.
              %60 = load %38[%arg3, %arg4, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
              store %59, %38[%arg3, %arg4, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
            }
          }
        }
      }
    }
  }
  return
}

// *** IR Dump After Canonicalizer ***
// Module as printed after the Canonicalizer pass.
// Compared with the ConvertAffineToStandard dump earlier in this log:
//   * constants appear once, at the top of the function; %c0 is gone;
//   * loop bounds and steps are plain muli results or gpu.* values (the adds of
//     0 and multiplies by 1 are gone);
//   * the inner loop upper bounds are the select results used as subview sizes
//     (%21, %17, %34) rather than `dim` ops;
//   * the load from the output view whose result had no users is gone.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_4() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c1 = constant 1 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = "gpu.block_id"() {dimension = "y"} : () -> index
    %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %6 = "gpu.block_id"() {dimension = "z"} : () -> index
    %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    // Outer loop lower bounds / steps: block_id and grid_dim scaled by 2 (z),
    // 2 (y) and 32 (x).
    %8 = muli %6, %c2 : index
    %9 = muli %7, %c2 : index
    %10 = muli %4, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %2, %c32 : index
    %13 = muli %3, %c32 : index
    scf.for %arg0 = %8 to %c5 step %9 {
      scf.for %arg1 = %10 to %c1 step %11 {
        scf.for %arg2 = %12 to %c1 step %13 {
          // %17 = min(2, 1 - %arg1), %21 = min(2, 5 - %arg0): input view sizes.
          %14 = muli %arg1, %c-1 : index
          %15 = addi %14, %c1 : index
          %16 = cmpi "slt", %c2, %15 : index
          %17 = select %16, %c2, %15 : index
          %18 = muli %arg0, %c-1 : index
          %19 = addi %18, %c5 : index
          %20 = cmpi "slt", %c2, %19 : index
          %21 = select %20, %c2, %19 : index
          %22 = subview %1[%arg1, %arg0] [%17, %21] [1, 1]  : memref<1x5xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
          // %26 and %30 recompute the same expressions as %21 and %17;
          // %34 = min(32, 1 - %arg2).
          %23 = muli %arg0, %c-1 : index
          %24 = addi %23, %c5 : index
          %25 = cmpi "slt", %c2, %24 : index
          %26 = select %25, %c2, %24 : index
          %27 = muli %arg1, %c-1 : index
          %28 = addi %27, %c1 : index
          %29 = cmpi "slt", %c2, %28 : index
          %30 = select %29, %c2, %28 : index
          %31 = muli %arg2, %c-1 : index
          %32 = addi %31, %c1 : index
          %33 = cmpi "slt", %c32, %32 : index
          %34 = select %33, %c32, %32 : index
          %35 = subview %0[%arg0, %arg1, %arg2] [%26, %30, %34] [1, 1, 1]  : memref<5x1x1xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
          %36 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %37 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %38 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %39 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %40 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %41 = "gpu.block_dim"() {dimension = "z"} : () -> index
          // Inner loops start at thread_id and step by block_dim (z, y, x).
          scf.for %arg3 = %40 to %21 step %41 {
            scf.for %arg4 = %38 to %17 step %39 {
              scf.for %arg5 = %36 to %34 step %37 {
                // Input indexed [%arg4, %arg3]; output indexed [%arg3, %arg4, %arg5].
                %42 = load %22[%arg4, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
                store %42, %35[%arg3, %arg4, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
              }
            }
          }
        }
      }
    }
    return
  }
  // Buffer bindings referenced by the iree.placeholder ops above.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Module as printed after CSE.
// Compared with the Canonicalizer dump just before it in this log, the
// repeated min(2, 5 - %arg0) and min(2, 1 - %arg1) computations are gone: the
// output subview now takes %21 and %17 (the input view sizes) directly.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_4() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c1 = constant 1 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = "gpu.block_id"() {dimension = "y"} : () -> index
    %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %6 = "gpu.block_id"() {dimension = "z"} : () -> index
    %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    // Outer loop lower bounds / steps: block_id and grid_dim scaled by 2 (z),
    // 2 (y) and 32 (x).
    %8 = muli %6, %c2 : index
    %9 = muli %7, %c2 : index
    %10 = muli %4, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %2, %c32 : index
    %13 = muli %3, %c32 : index
    scf.for %arg0 = %8 to %c5 step %9 {
      scf.for %arg1 = %10 to %c1 step %11 {
        scf.for %arg2 = %12 to %c1 step %13 {
          // %17 = min(2, 1 - %arg1)
          %14 = muli %arg1, %c-1 : index
          %15 = addi %14, %c1 : index
          %16 = cmpi "slt", %c2, %15 : index
          %17 = select %16, %c2, %15 : index
          // %21 = min(2, 5 - %arg0)
          %18 = muli %arg0, %c-1 : index
          %19 = addi %18, %c5 : index
          %20 = cmpi "slt", %c2, %19 : index
          %21 = select %20, %c2, %19 : index
          %22 = subview %1[%arg1, %arg0] [%17, %21] [1, 1]  : memref<1x5xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
          // %26 = min(32, 1 - %arg2)
          %23 = muli %arg2, %c-1 : index
          %24 = addi %23, %c1 : index
          %25 = cmpi "slt", %c32, %24 : index
          %26 = select %25, %c32, %24 : index
          %27 = subview %0[%arg0, %arg1, %arg2] [%21, %17, %26] [1, 1, 1]  : memref<5x1x1xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
          %28 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %29 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %30 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %31 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %32 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %33 = "gpu.block_dim"() {dimension = "z"} : () -> index
          // Inner loops start at thread_id and step by block_dim (z, y, x);
          // their upper bounds are the subview sizes %21, %17 and %26.
          scf.for %arg3 = %32 to %21 step %33 {
            scf.for %arg4 = %30 to %17 step %31 {
              scf.for %arg5 = %28 to %26 step %29 {
                %34 = load %22[%arg4, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
                store %34, %27[%arg3, %arg4, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
              }
            }
          }
        }
      }
    }
    return
  }
  // Buffer bindings referenced by the iree.placeholder ops above.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ResolveShapeOpsPass ***
// Dispatch function as printed after ResolveShapeOpsPass (only the function is
// printed here, without the enclosing module).
// The operations appear identical to those in the CSE dump just before it in
// this log.
func @main_ex_dispatch_4() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
  %c5 = constant 5 : index
  %c2 = constant 2 : index
  %c32 = constant 32 : index
  %c-1 = constant -1 : index
  %c1 = constant 1 : index
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
  %2 = "gpu.block_id"() {dimension = "x"} : () -> index
  %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %4 = "gpu.block_id"() {dimension = "y"} : () -> index
  %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
  %6 = "gpu.block_id"() {dimension = "z"} : () -> index
  %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
  // Outer loop lower bounds / steps: block_id and grid_dim scaled by 2 (z),
  // 2 (y) and 32 (x).
  %8 = muli %6, %c2 : index
  %9 = muli %7, %c2 : index
  %10 = muli %4, %c2 : index
  %11 = muli %5, %c2 : index
  %12 = muli %2, %c32 : index
  %13 = muli %3, %c32 : index
  scf.for %arg0 = %8 to %c5 step %9 {
    scf.for %arg1 = %10 to %c1 step %11 {
      scf.for %arg2 = %12 to %c1 step %13 {
        // %17 = min(2, 1 - %arg1)
        %14 = muli %arg1, %c-1 : index
        %15 = addi %14, %c1 : index
        %16 = cmpi "slt", %c2, %15 : index
        %17 = select %16, %c2, %15 : index
        // %21 = min(2, 5 - %arg0)
        %18 = muli %arg0, %c-1 : index
        %19 = addi %18, %c5 : index
        %20 = cmpi "slt", %c2, %19 : index
        %21 = select %20, %c2, %19 : index
        %22 = subview %1[%arg1, %arg0] [%17, %21] [1, 1]  : memref<1x5xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
        // %26 = min(32, 1 - %arg2)
        %23 = muli %arg2, %c-1 : index
        %24 = addi %23, %c1 : index
        %25 = cmpi "slt", %c32, %24 : index
        %26 = select %25, %c32, %24 : index
        %27 = subview %0[%arg0, %arg1, %arg2] [%21, %17, %26] [1, 1, 1]  : memref<5x1x1xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
        %28 = "gpu.thread_id"() {dimension = "x"} : () -> index
        %29 = "gpu.block_dim"() {dimension = "x"} : () -> index
        %30 = "gpu.thread_id"() {dimension = "y"} : () -> index
        %31 = "gpu.block_dim"() {dimension = "y"} : () -> index
        %32 = "gpu.thread_id"() {dimension = "z"} : () -> index
        %33 = "gpu.block_dim"() {dimension = "z"} : () -> index
        // Inner loops start at thread_id and step by block_dim (z, y, x).
        scf.for %arg3 = %32 to %21 step %33 {
          scf.for %arg4 = %30 to %17 step %31 {
            scf.for %arg5 = %28 to %26 step %29 {
              %34 = load %22[%arg4, %arg3] : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 5 + s0 + d1)>>
              store %34, %27[%arg3, %arg4, %arg5] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
            }
          }
        }
      }
    }
  }
  return
}

// *** IR Dump After LegalizeStandardForSPIRV ***
// Module as printed after LegalizeStandardForSPIRV.
// Compared with the preceding dump in this log, the two subview ops are gone:
// the load and store now address the placeholder buffers %1 and %0 directly,
// with each index formed as (outer loop induction variable) + (inner loop
// induction variable).
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_4() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c1 = constant 1 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = "gpu.block_id"() {dimension = "y"} : () -> index
    %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %6 = "gpu.block_id"() {dimension = "z"} : () -> index
    %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    // Outer loop lower bounds / steps: block_id and grid_dim scaled by 2 (z),
    // 2 (y) and 32 (x).
    %8 = muli %6, %c2 : index
    %9 = muli %7, %c2 : index
    %10 = muli %4, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %2, %c32 : index
    %13 = muli %3, %c32 : index
    scf.for %arg0 = %8 to %c5 step %9 {
      scf.for %arg1 = %10 to %c1 step %11 {
        scf.for %arg2 = %12 to %c1 step %13 {
          // Inner loop upper bounds: %17 = min(2, 1 - %arg1),
          // %21 = min(2, 5 - %arg0), %25 = min(32, 1 - %arg2).
          %14 = muli %arg1, %c-1 : index
          %15 = addi %14, %c1 : index
          %16 = cmpi "slt", %c2, %15 : index
          %17 = select %16, %c2, %15 : index
          %18 = muli %arg0, %c-1 : index
          %19 = addi %18, %c5 : index
          %20 = cmpi "slt", %c2, %19 : index
          %21 = select %20, %c2, %19 : index
          %22 = muli %arg2, %c-1 : index
          %23 = addi %22, %c1 : index
          %24 = cmpi "slt", %c32, %23 : index
          %25 = select %24, %c32, %23 : index
          %26 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %27 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %28 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %29 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %30 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %31 = "gpu.block_dim"() {dimension = "z"} : () -> index
          scf.for %arg3 = %30 to %21 step %31 {
            scf.for %arg4 = %28 to %17 step %29 {
              scf.for %arg5 = %26 to %25 step %27 {
                // Load indices into the 1x5 input: [%arg1 + %arg4, %arg0 + %arg3].
                %32 = addi %arg1, %arg4 : index
                %33 = addi %arg0, %arg3 : index
                %34 = load %1[%32, %33] : memref<1x5xf32>
                // Store indices into the 5x1x1 output; %35 and %36 repeat the
                // sums already computed as %33 and %32.
                %35 = addi %arg0, %arg3 : index
                %36 = addi %arg1, %arg4 : index
                %37 = addi %arg2, %arg5 : index
                store %34, %0[%35, %36, %37] : memref<5x1x1xf32>
              }
            }
          }
        }
      }
    }
    return
  }
  // Buffer bindings referenced by the iree.placeholder ops above.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After Canonicalizer ***
// Module as printed after the second Canonicalizer run.
// The text appears identical to the LegalizeStandardForSPIRV dump just before
// it in this log; the duplicated index sums (%35, %36) are still present here.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_4() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c1 = constant 1 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = "gpu.block_id"() {dimension = "y"} : () -> index
    %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %6 = "gpu.block_id"() {dimension = "z"} : () -> index
    %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    // Outer loop lower bounds / steps: block_id and grid_dim scaled by 2 (z),
    // 2 (y) and 32 (x).
    %8 = muli %6, %c2 : index
    %9 = muli %7, %c2 : index
    %10 = muli %4, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %2, %c32 : index
    %13 = muli %3, %c32 : index
    scf.for %arg0 = %8 to %c5 step %9 {
      scf.for %arg1 = %10 to %c1 step %11 {
        scf.for %arg2 = %12 to %c1 step %13 {
          // Inner loop upper bounds: %17 = min(2, 1 - %arg1),
          // %21 = min(2, 5 - %arg0), %25 = min(32, 1 - %arg2).
          %14 = muli %arg1, %c-1 : index
          %15 = addi %14, %c1 : index
          %16 = cmpi "slt", %c2, %15 : index
          %17 = select %16, %c2, %15 : index
          %18 = muli %arg0, %c-1 : index
          %19 = addi %18, %c5 : index
          %20 = cmpi "slt", %c2, %19 : index
          %21 = select %20, %c2, %19 : index
          %22 = muli %arg2, %c-1 : index
          %23 = addi %22, %c1 : index
          %24 = cmpi "slt", %c32, %23 : index
          %25 = select %24, %c32, %23 : index
          %26 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %27 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %28 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %29 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %30 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %31 = "gpu.block_dim"() {dimension = "z"} : () -> index
          scf.for %arg3 = %30 to %21 step %31 {
            scf.for %arg4 = %28 to %17 step %29 {
              scf.for %arg5 = %26 to %25 step %27 {
                // Load from input[%arg1 + %arg4, %arg0 + %arg3].
                %32 = addi %arg1, %arg4 : index
                %33 = addi %arg0, %arg3 : index
                %34 = load %1[%32, %33] : memref<1x5xf32>
                // Store to output[%arg0 + %arg3, %arg1 + %arg4, %arg2 + %arg5].
                %35 = addi %arg0, %arg3 : index
                %36 = addi %arg1, %arg4 : index
                %37 = addi %arg2, %arg5 : index
                store %34, %0[%35, %36, %37] : memref<5x1x1xf32>
              }
            }
          }
        }
      }
    }
    return
  }
  // Buffer bindings referenced by the iree.placeholder ops above.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Module as printed after the second CSE run.
// Compared with the Canonicalizer dump just before it in this log, the
// repeated index sums are gone: the store reuses %33 and %32 (computed for the
// load) and only adds %35 = %arg2 + %arg5.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_4() attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
    %c5 = constant 5 : index
    %c2 = constant 2 : index
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c1 = constant 1 : index
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5x1x1xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<1x5xf32>
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = "gpu.block_id"() {dimension = "y"} : () -> index
    %5 = "gpu.grid_dim"() {dimension = "y"} : () -> index
    %6 = "gpu.block_id"() {dimension = "z"} : () -> index
    %7 = "gpu.grid_dim"() {dimension = "z"} : () -> index
    // Outer loop lower bounds / steps: block_id and grid_dim scaled by 2 (z),
    // 2 (y) and 32 (x).
    %8 = muli %6, %c2 : index
    %9 = muli %7, %c2 : index
    %10 = muli %4, %c2 : index
    %11 = muli %5, %c2 : index
    %12 = muli %2, %c32 : index
    %13 = muli %3, %c32 : index
    scf.for %arg0 = %8 to %c5 step %9 {
      scf.for %arg1 = %10 to %c1 step %11 {
        scf.for %arg2 = %12 to %c1 step %13 {
          // Inner loop upper bounds: %17 = min(2, 1 - %arg1),
          // %21 = min(2, 5 - %arg0), %25 = min(32, 1 - %arg2).
          %14 = muli %arg1, %c-1 : index
          %15 = addi %14, %c1 : index
          %16 = cmpi "slt", %c2, %15 : index
          %17 = select %16, %c2, %15 : index
          %18 = muli %arg0, %c-1 : index
          %19 = addi %18, %c5 : index
          %20 = cmpi "slt", %c2, %19 : index
          %21 = select %20, %c2, %19 : index
          %22 = muli %arg2, %c-1 : index
          %23 = addi %22, %c1 : index
          %24 = cmpi "slt", %c32, %23 : index
          %25 = select %24, %c32, %23 : index
          %26 = "gpu.thread_id"() {dimension = "x"} : () -> index
          %27 = "gpu.block_dim"() {dimension = "x"} : () -> index
          %28 = "gpu.thread_id"() {dimension = "y"} : () -> index
          %29 = "gpu.block_dim"() {dimension = "y"} : () -> index
          %30 = "gpu.thread_id"() {dimension = "z"} : () -> index
          %31 = "gpu.block_dim"() {dimension = "z"} : () -> index
          // Inner loops start at thread_id and step by block_dim (z, y, x).
          scf.for %arg3 = %30 to %21 step %31 {
            scf.for %arg4 = %28 to %17 step %29 {
              scf.for %arg5 = %26 to %25 step %27 {
                // %32 = %arg1 + %arg4 and %33 = %arg0 + %arg3 are shared by
                // the load (as [%32, %33]) and the store (as [%33, %32, %35]).
                %32 = addi %arg1, %arg4 : index
                %33 = addi %arg0, %arg3 : index
                %34 = load %1[%32, %33] : memref<1x5xf32>
                %35 = addi %arg2, %arg5 : index
                store %34, %0[%33, %32, %35] : memref<5x1x1xf32>
              }
            }
          }
        }
      }
    }
    return
  }
  // Buffer bindings referenced by the iree.placeholder ops above.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertToSPIRVPass ***
// Module as printed after ConvertToSPIRVPass: the dispatch function is now a
// spv.func inside a spv.module.
// Relative to the CSE dump just before it in this log:
//   * index values are i32;
//   * both interface buffers are StorageBuffer globals of type
//     struct<array<5 x f32, stride=4>>, at bind(0, 0) and bind(0, 1);
//   * gpu.block_id / gpu.grid_dim / gpu.thread_id are reads of the WorkgroupId /
//     NumWorkgroups / LocalInvocationId built-in variables;
//   * the gpu.block_dim values are the constants 32, 2 and 2 (the same numbers
//     as local_size in the entry-point ABI attribute);
//   * every scf.for is a spv.loop with a header block (^bb1: compare and
//     branch), a body block (^bb2) and a merge block (^bb3).
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  spv.module Logical GLSL450 {
    spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
    spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
    spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
    // bind(0, 0) matches @arg0 and bind(0, 1) matches @ret0 in the
    // hal.interface at the end of this module.
    spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    spv.func @main_ex_dispatch_4() "None" attributes {spv.entry_point_abi = {local_size = dense<[32, 2, 2]> : vector<3xi32>}} {
      %0 = spv.constant 5 : i32
      %1 = spv.constant 2 : i32
      %2 = spv.constant 32 : i32
      %3 = spv.constant -1 : i32
      %4 = spv.constant 1 : i32
      // %5: buffer at binding 1 (stored to below); %6: buffer at binding 0
      // (loaded from below).
      %5 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
      %6 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
      // %9/%12, %15/%18, %21/%24: components 0, 1 and 2 of WorkgroupId and
      // NumWorkgroups respectively.
      %7 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %8 = spv.Load "Input" %7 : vector<3xi32>
      %9 = spv.CompositeExtract %8[0 : i32] : vector<3xi32>
      %10 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %11 = spv.Load "Input" %10 : vector<3xi32>
      %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
      %13 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %14 = spv.Load "Input" %13 : vector<3xi32>
      %15 = spv.CompositeExtract %14[1 : i32] : vector<3xi32>
      %16 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %17 = spv.Load "Input" %16 : vector<3xi32>
      %18 = spv.CompositeExtract %17[1 : i32] : vector<3xi32>
      %19 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %20 = spv.Load "Input" %19 : vector<3xi32>
      %21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
      %22 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %23 = spv.Load "Input" %22 : vector<3xi32>
      %24 = spv.CompositeExtract %23[2 : i32] : vector<3xi32>
      // Outer loop start / step values: component 2 and component 1 scaled
      // by 2, component 0 scaled by 32.
      %25 = spv.IMul %21, %1 : i32
      %26 = spv.IMul %24, %1 : i32
      %27 = spv.IMul %15, %1 : i32
      %28 = spv.IMul %18, %1 : i32
      %29 = spv.IMul %9, %2 : i32
      %30 = spv.IMul %12, %2 : i32
      // Outer loop 1: %31 runs from %25 while %31 < 5, stepping by %26.
      spv.loop {
        spv.Branch ^bb1(%25 : i32)
      ^bb1(%31: i32):  // 2 preds: ^bb0, ^bb2
        %32 = spv.SLessThan %31, %0 : i32
        spv.BranchConditional %32, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Outer loop 2: %34 runs from %27 while %34 < 1, stepping by %28.
        spv.loop {
          spv.Branch ^bb1(%27 : i32)
        ^bb1(%34: i32):  // 2 preds: ^bb0, ^bb2
          %35 = spv.SLessThan %34, %4 : i32
          spv.BranchConditional %35, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Outer loop 3: %37 runs from %29 while %37 < 1, stepping by %30.
          spv.loop {
            spv.Branch ^bb1(%29 : i32)
          ^bb1(%37: i32):  // 2 preds: ^bb0, ^bb2
            %38 = spv.SLessThan %37, %4 : i32
            spv.BranchConditional %38, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner loop upper bounds: %42 = min(2, 1 - %34),
            // %46 = min(2, 5 - %31), %50 = min(32, 1 - %37).
            %39 = spv.IMul %34, %3 : i32
            %40 = spv.IAdd %39, %4 : i32
            %41 = spv.SLessThan %1, %40 : i32
            %42 = spv.Select %41, %1, %40 : i1, i32
            %43 = spv.IMul %31, %3 : i32
            %44 = spv.IAdd %43, %0 : i32
            %45 = spv.SLessThan %1, %44 : i32
            %46 = spv.Select %45, %1, %44 : i1, i32
            %47 = spv.IMul %37, %3 : i32
            %48 = spv.IAdd %47, %4 : i32
            %49 = spv.SLessThan %2, %48 : i32
            %50 = spv.Select %49, %2, %48 : i1, i32
            // %53, %57, %61: components 0, 1, 2 of LocalInvocationId (inner
            // loop start values). %54, %58, %62: constant inner loop steps.
            %51 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %52 = spv.Load "Input" %51 : vector<3xi32>
            %53 = spv.CompositeExtract %52[0 : i32] : vector<3xi32>
            %54 = spv.constant 32 : i32
            %55 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %56 = spv.Load "Input" %55 : vector<3xi32>
            %57 = spv.CompositeExtract %56[1 : i32] : vector<3xi32>
            %58 = spv.constant 2 : i32
            %59 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %60 = spv.Load "Input" %59 : vector<3xi32>
            %61 = spv.CompositeExtract %60[2 : i32] : vector<3xi32>
            %62 = spv.constant 2 : i32
            // Inner loop 1: %64 from %61 while %64 < %46, stepping by %62.
            spv.loop {
              spv.Branch ^bb1(%61 : i32)
            ^bb1(%64: i32):  // 2 preds: ^bb0, ^bb2
              %65 = spv.SLessThan %64, %46 : i32
              spv.BranchConditional %65, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop 2: %67 from %57 while %67 < %42, stepping by %58.
              spv.loop {
                spv.Branch ^bb1(%57 : i32)
              ^bb1(%67: i32):  // 2 preds: ^bb0, ^bb2
                %68 = spv.SLessThan %67, %42 : i32
                spv.BranchConditional %68, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Inner loop 3: %70 from %53 while %70 < %50, stepping by %54.
                spv.loop {
                  spv.Branch ^bb1(%53 : i32)
                ^bb1(%70: i32):  // 2 preds: ^bb0, ^bb2
                  %71 = spv.SLessThan %70, %50 : i32
                  spv.BranchConditional %71, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  %72 = spv.IAdd %34, %67 : i32
                  %73 = spv.IAdd %31, %64 : i32
                  // Linearized load offset: %81 = 0 + 5 * %72 + 1 * %73.
                  %74 = spv.constant 0 : i32
                  %75 = spv.constant 0 : i32
                  %76 = spv.constant 5 : i32
                  %77 = spv.IMul %76, %72 : i32
                  %78 = spv.IAdd %75, %77 : i32
                  %79 = spv.constant 1 : i32
                  %80 = spv.IMul %79, %73 : i32
                  %81 = spv.IAdd %78, %80 : i32
                  %82 = spv.AccessChain %6[%74, %81] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                  %83 = spv.Load "StorageBuffer" %82 : f32
                  %84 = spv.IAdd %37, %70 : i32
                  // Linearized store offset: %95 = 0 + 1 * %73 + 1 * %72 + 1 * %84.
                  %85 = spv.constant 0 : i32
                  %86 = spv.constant 0 : i32
                  %87 = spv.constant 1 : i32
                  %88 = spv.IMul %87, %73 : i32
                  %89 = spv.IAdd %86, %88 : i32
                  %90 = spv.constant 1 : i32
                  %91 = spv.IMul %90, %72 : i32
                  %92 = spv.IAdd %89, %91 : i32
                  %93 = spv.constant 1 : i32
                  %94 = spv.IMul %93, %84 : i32
                  %95 = spv.IAdd %92, %94 : i32
                  %96 = spv.AccessChain %5[%85, %95] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                  spv.Store "StorageBuffer" %96, %83 : f32
                  %97 = spv.IAdd %70, %54 : i32
                  spv.Branch ^bb1(%97 : i32)
                ^bb3:  // pred: ^bb1
                  spv._merge
                }
                %69 = spv.IAdd %67, %58 : i32
                spv.Branch ^bb1(%69 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %66 = spv.IAdd %64, %62 : i32
              spv.Branch ^bb1(%66 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %63 = spv.IAdd %37, %30 : i32
            spv.Branch ^bb1(%63 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %36 = spv.IAdd %34, %28 : i32
          spv.Branch ^bb1(%36 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %33 = spv.IAdd %31, %26 : i32
        spv.Branch ^bb1(%33 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      spv.Return
    }
  }
  // Interface declaration: @arg0 is read at binding 0, @ret0 is written at binding 1.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After SPIRVLowerABIAttributes ***
// SPIR-V module holding the single compute entry point @main_ex_dispatch_4.
// The function reads one f32 at a time from the buffer bound at (set 0,
// binding 0) and writes it to the buffer bound at (set 0, binding 1), using
// six nested loops: three outer loops seeded from WorkgroupId/NumWorkgroups
// and three inner loops seeded from LocalInvocationId.
spv.module Logical GLSL450 {
  // Built-in inputs, each a 3-component i32 vector.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Two storage buffers, each a struct wrapping a 5-element f32 array with a
  // 4-byte stride. Binding 0 is only loaded from below; binding 1 is only
  // stored to.
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_4() "None" {
    // %0 (5) and %4 (1) serve as loop upper bounds, %1 (2) and %2 (32) as
    // per-dimension multipliers, and %3 (-1) is used to negate induction
    // variables. NOTE(review): 5/1/1 presumably mirror the tensor extents;
    // this dump alone does not show the tensor type.
    %0 = spv.constant 5 : i32
    %1 = spv.constant 2 : i32
    %2 = spv.constant 32 : i32
    %3 = spv.constant -1 : i32
    %4 = spv.constant 1 : i32
    // %5 is the store destination (binding 1); %6 is the load source (binding 0).
    %5 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    %6 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    // Components 0, 1 and 2 of WorkgroupId and NumWorkgroups. Each component is
    // fetched through its own address_of + Load pair at this stage.
    %7 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %8 = spv.Load "Input" %7 : vector<3xi32>
    %9 = spv.CompositeExtract %8[0 : i32] : vector<3xi32>
    %10 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %11 = spv.Load "Input" %10 : vector<3xi32>
    %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
    %13 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %14 = spv.Load "Input" %13 : vector<3xi32>
    %15 = spv.CompositeExtract %14[1 : i32] : vector<3xi32>
    %16 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %17 = spv.Load "Input" %16 : vector<3xi32>
    %18 = spv.CompositeExtract %17[1 : i32] : vector<3xi32>
    %19 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %20 = spv.Load "Input" %19 : vector<3xi32>
    %21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
    %22 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %23 = spv.Load "Input" %22 : vector<3xi32>
    %24 = spv.CompositeExtract %23[2 : i32] : vector<3xi32>
    // Outer-loop start/step pairs: start = WorkgroupId[i] * k and
    // step = NumWorkgroups[i] * k, with k = 2 for components 2 and 1 and
    // k = 32 for component 0.
    %25 = spv.IMul %21, %1 : i32
    %26 = spv.IMul %24, %1 : i32
    %27 = spv.IMul %15, %1 : i32
    %28 = spv.IMul %18, %1 : i32
    %29 = spv.IMul %9, %2 : i32
    %30 = spv.IMul %12, %2 : i32
    // Outer loop 1: %31 runs from %25 while %31 < 5, advancing by %26.
    spv.loop {
      spv.Branch ^bb1(%25 : i32)
    ^bb1(%31: i32):  // 2 preds: ^bb0, ^bb2
      %32 = spv.SLessThan %31, %0 : i32
      spv.BranchConditional %32, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Outer loop 2: %34 runs from %27 while %34 < 1, advancing by %28.
      spv.loop {
        spv.Branch ^bb1(%27 : i32)
      ^bb1(%34: i32):  // 2 preds: ^bb0, ^bb2
        %35 = spv.SLessThan %34, %4 : i32
        spv.BranchConditional %35, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Outer loop 3: %37 runs from %29 while %37 < 1, advancing by %30.
        spv.loop {
          spv.Branch ^bb1(%29 : i32)
        ^bb1(%37: i32):  // 2 preds: ^bb0, ^bb2
          %38 = spv.SLessThan %37, %4 : i32
          spv.BranchConditional %38, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Inner-loop upper bounds, each a signed min of the multiplier and
          // the distance from the outer induction variable to its bound:
          //   %42 = min(2, 1 - %34), %46 = min(2, 5 - %31), %50 = min(32, 1 - %37).
          %39 = spv.IMul %34, %3 : i32
          %40 = spv.IAdd %39, %4 : i32
          %41 = spv.SLessThan %1, %40 : i32
          %42 = spv.Select %41, %1, %40 : i1, i32
          %43 = spv.IMul %31, %3 : i32
          %44 = spv.IAdd %43, %0 : i32
          %45 = spv.SLessThan %1, %44 : i32
          %46 = spv.Select %45, %1, %44 : i1, i32
          %47 = spv.IMul %37, %3 : i32
          %48 = spv.IAdd %47, %4 : i32
          %49 = spv.SLessThan %2, %48 : i32
          %50 = spv.Select %49, %2, %48 : i1, i32
          // Inner-loop starts are the LocalInvocationId components; the
          // constants emitted next to them (32, 2, 2) are the inner-loop
          // steps and equal the LocalSize declared at the end of the module.
          %51 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %52 = spv.Load "Input" %51 : vector<3xi32>
          %53 = spv.CompositeExtract %52[0 : i32] : vector<3xi32>
          %54 = spv.constant 32 : i32
          %55 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %56 = spv.Load "Input" %55 : vector<3xi32>
          %57 = spv.CompositeExtract %56[1 : i32] : vector<3xi32>
          %58 = spv.constant 2 : i32
          %59 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %60 = spv.Load "Input" %59 : vector<3xi32>
          %61 = spv.CompositeExtract %60[2 : i32] : vector<3xi32>
          %62 = spv.constant 2 : i32
          // Inner loop 1: %64 from LocalInvocationId[2] while %64 < %46, step 2.
          spv.loop {
            spv.Branch ^bb1(%61 : i32)
          ^bb1(%64: i32):  // 2 preds: ^bb0, ^bb2
            %65 = spv.SLessThan %64, %46 : i32
            spv.BranchConditional %65, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner loop 2: %67 from LocalInvocationId[1] while %67 < %42, step 2.
            spv.loop {
              spv.Branch ^bb1(%57 : i32)
            ^bb1(%67: i32):  // 2 preds: ^bb0, ^bb2
              %68 = spv.SLessThan %67, %42 : i32
              spv.BranchConditional %68, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop 3: %70 from LocalInvocationId[0] while %70 < %50, step 32.
              spv.loop {
                spv.Branch ^bb1(%53 : i32)
              ^bb1(%70: i32):  // 2 preds: ^bb0, ^bb2
                %71 = spv.SLessThan %70, %50 : i32
                spv.BranchConditional %71, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Combined indices: %72 = %34 + %67 and %73 = %31 + %64.
                %72 = spv.IAdd %34, %67 : i32
                %73 = spv.IAdd %31, %64 : i32
                // Source element offset: 0 + 5 * %72 + 1 * %73 (still in
                // unfolded form at this stage).
                %74 = spv.constant 0 : i32
                %75 = spv.constant 0 : i32
                %76 = spv.constant 5 : i32
                %77 = spv.IMul %76, %72 : i32
                %78 = spv.IAdd %75, %77 : i32
                %79 = spv.constant 1 : i32
                %80 = spv.IMul %79, %73 : i32
                %81 = spv.IAdd %78, %80 : i32
                %82 = spv.AccessChain %6[%74, %81] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                %83 = spv.Load "StorageBuffer" %82 : f32
                // Destination element offset: 0 + 1 * %73 + 1 * %72 + 1 * %84,
                // where %84 = %37 + %70.
                %84 = spv.IAdd %37, %70 : i32
                %85 = spv.constant 0 : i32
                %86 = spv.constant 0 : i32
                %87 = spv.constant 1 : i32
                %88 = spv.IMul %87, %73 : i32
                %89 = spv.IAdd %86, %88 : i32
                %90 = spv.constant 1 : i32
                %91 = spv.IMul %90, %72 : i32
                %92 = spv.IAdd %89, %91 : i32
                %93 = spv.constant 1 : i32
                %94 = spv.IMul %93, %84 : i32
                %95 = spv.IAdd %92, %94 : i32
                %96 = spv.AccessChain %5[%85, %95] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                spv.Store "StorageBuffer" %96, %83 : f32
                %97 = spv.IAdd %70, %54 : i32
                spv.Branch ^bb1(%97 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %69 = spv.IAdd %67, %58 : i32
              spv.Branch ^bb1(%69 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %66 = spv.IAdd %64, %62 : i32
            spv.Branch ^bb1(%66 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %63 = spv.IAdd %37, %30 : i32
          spv.Branch ^bb1(%63 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %36 = spv.IAdd %34, %28 : i32
        spv.Branch ^bb1(%36 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %33 = spv.IAdd %31, %26 : i32
      spv.Branch ^bb1(%33 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // Entry-point declaration: GLCompute execution model, listing the three
  // built-in variables as its interface, with a 32x2x2 local size.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_4, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_4 "LocalSize", 32, 2, 2
}

// *** IR Dump After Canonicalizer ***
// SPIR-V module for @main_ex_dispatch_4 as printed after canonicalization.
// All constants are defined once at the top of the function and the
// load/store offsets are short arithmetic chains. The function copies f32
// elements from the binding-0 buffer to the binding-1 buffer through three
// WorkgroupId-seeded outer loops and three LocalInvocationId-seeded inner
// loops.
spv.module Logical GLSL450 {
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Storage buffers: binding 0 is the load source, binding 1 the store target.
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_4() "None" {
    // Shared constants: %0 negates induction variables, %1 and %4 are loop
    // upper bounds, %2 and %3 are multipliers/steps, %5 is the struct member
    // index used by both AccessChain ops.
    %0 = spv.constant -1 : i32
    %1 = spv.constant 1 : i32
    %2 = spv.constant 32 : i32
    %3 = spv.constant 2 : i32
    %4 = spv.constant 5 : i32
    %5 = spv.constant 0 : i32
    // %6 addresses the store target (binding 1); %7 the load source (binding 0).
    %6 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    %7 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    // Components 0, 1, 2 of WorkgroupId and NumWorkgroups; the address_of and
    // Load ops are still repeated per component here.
    %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %9 = spv.Load "Input" %8 : vector<3xi32>
    %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
    %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %12 = spv.Load "Input" %11 : vector<3xi32>
    %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
    %14 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %15 = spv.Load "Input" %14 : vector<3xi32>
    %16 = spv.CompositeExtract %15[1 : i32] : vector<3xi32>
    %17 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %18 = spv.Load "Input" %17 : vector<3xi32>
    %19 = spv.CompositeExtract %18[1 : i32] : vector<3xi32>
    %20 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %21 = spv.Load "Input" %20 : vector<3xi32>
    %22 = spv.CompositeExtract %21[2 : i32] : vector<3xi32>
    %23 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %24 = spv.Load "Input" %23 : vector<3xi32>
    %25 = spv.CompositeExtract %24[2 : i32] : vector<3xi32>
    // Outer-loop start (WorkgroupId[i] * k) and step (NumWorkgroups[i] * k):
    // k = 2 for components 2 and 1, k = 32 for component 0.
    %26 = spv.IMul %22, %3 : i32
    %27 = spv.IMul %25, %3 : i32
    %28 = spv.IMul %16, %3 : i32
    %29 = spv.IMul %19, %3 : i32
    %30 = spv.IMul %10, %2 : i32
    %31 = spv.IMul %13, %2 : i32
    // Outer loop 1: %32 from %26 while %32 < 5, step %27.
    spv.loop {
      spv.Branch ^bb1(%26 : i32)
    ^bb1(%32: i32):  // 2 preds: ^bb0, ^bb2
      %33 = spv.SLessThan %32, %4 : i32
      spv.BranchConditional %33, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Outer loop 2: %35 from %28 while %35 < 1, step %29.
      spv.loop {
        spv.Branch ^bb1(%28 : i32)
      ^bb1(%35: i32):  // 2 preds: ^bb0, ^bb2
        %36 = spv.SLessThan %35, %1 : i32
        spv.BranchConditional %36, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Outer loop 3: %38 from %30 while %38 < 1, step %31.
        spv.loop {
          spv.Branch ^bb1(%30 : i32)
        ^bb1(%38: i32):  // 2 preds: ^bb0, ^bb2
          %39 = spv.SLessThan %38, %1 : i32
          spv.BranchConditional %39, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Inner-loop upper bounds (signed min via SLessThan + Select):
          //   %43 = min(2, 1 - %35), %47 = min(2, 5 - %32), %51 = min(32, 1 - %38).
          %40 = spv.IMul %35, %0 : i32
          %41 = spv.IAdd %40, %1 : i32
          %42 = spv.SLessThan %3, %41 : i32
          %43 = spv.Select %42, %3, %41 : i1, i32
          %44 = spv.IMul %32, %0 : i32
          %45 = spv.IAdd %44, %4 : i32
          %46 = spv.SLessThan %3, %45 : i32
          %47 = spv.Select %46, %3, %45 : i1, i32
          %48 = spv.IMul %38, %0 : i32
          %49 = spv.IAdd %48, %1 : i32
          %50 = spv.SLessThan %2, %49 : i32
          %51 = spv.Select %50, %2, %49 : i1, i32
          // Inner-loop starts: LocalInvocationId components 0, 1 and 2.
          %52 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %53 = spv.Load "Input" %52 : vector<3xi32>
          %54 = spv.CompositeExtract %53[0 : i32] : vector<3xi32>
          %55 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %56 = spv.Load "Input" %55 : vector<3xi32>
          %57 = spv.CompositeExtract %56[1 : i32] : vector<3xi32>
          %58 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %59 = spv.Load "Input" %58 : vector<3xi32>
          %60 = spv.CompositeExtract %59[2 : i32] : vector<3xi32>
          // Inner loop 1: %62 from LocalInvocationId[2] while %62 < %47, step 2.
          spv.loop {
            spv.Branch ^bb1(%60 : i32)
          ^bb1(%62: i32):  // 2 preds: ^bb0, ^bb2
            %63 = spv.SLessThan %62, %47 : i32
            spv.BranchConditional %63, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner loop 2: %65 from LocalInvocationId[1] while %65 < %43, step 2.
            spv.loop {
              spv.Branch ^bb1(%57 : i32)
            ^bb1(%65: i32):  // 2 preds: ^bb0, ^bb2
              %66 = spv.SLessThan %65, %43 : i32
              spv.BranchConditional %66, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop 3: %68 from LocalInvocationId[0] while %68 < %51, step 32.
              spv.loop {
                spv.Branch ^bb1(%54 : i32)
              ^bb1(%68: i32):  // 2 preds: ^bb0, ^bb2
                %69 = spv.SLessThan %68, %51 : i32
                spv.BranchConditional %69, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Load from binding 0 at element offset %70 * 5 + %71, where
                // %70 = %35 + %65 and %71 = %32 + %62.
                %70 = spv.IAdd %35, %65 : i32
                %71 = spv.IAdd %32, %62 : i32
                %72 = spv.IMul %70, %4 : i32
                %73 = spv.IAdd %72, %71 : i32
                %74 = spv.AccessChain %7[%5, %73] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                %75 = spv.Load "StorageBuffer" %74 : f32
                // Store to binding 1 at element offset %71 + %70 + %76, where
                // %76 = %38 + %68.
                %76 = spv.IAdd %38, %68 : i32
                %77 = spv.IAdd %71, %70 : i32
                %78 = spv.IAdd %77, %76 : i32
                %79 = spv.AccessChain %6[%5, %78] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                spv.Store "StorageBuffer" %79, %75 : f32
                %80 = spv.IAdd %68, %2 : i32
                spv.Branch ^bb1(%80 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %67 = spv.IAdd %65, %3 : i32
              spv.Branch ^bb1(%67 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %64 = spv.IAdd %62, %3 : i32
            spv.Branch ^bb1(%64 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %61 = spv.IAdd %38, %31 : i32
          spv.Branch ^bb1(%61 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %37 = spv.IAdd %35, %29 : i32
        spv.Branch ^bb1(%37 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %34 = spv.IAdd %32, %27 : i32
      spv.Branch ^bb1(%34 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // GLCompute entry point with a 32x2x2 local size; the inner-loop steps
  // above (32, 2, 2) use the same values.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_4, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_4 "LocalSize", 32, 2, 2
}

// *** IR Dump After CSE ***
// SPIR-V module for @main_ex_dispatch_4 as printed after CSE. Each built-in
// variable's address is taken once and reused by several Load ops; the Load
// ops themselves are still emitted once per extracted component. The
// function copies f32 elements from the binding-0 buffer to the binding-1
// buffer.
spv.module Logical GLSL450 {
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Storage buffers: binding 0 is the load source, binding 1 the store target.
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_4() "None" {
    // Shared constants: %0 negates induction variables, %1 and %4 are loop
    // upper bounds, %2 and %3 are multipliers/steps, %5 is the struct member
    // index used by both AccessChain ops.
    %0 = spv.constant -1 : i32
    %1 = spv.constant 1 : i32
    %2 = spv.constant 32 : i32
    %3 = spv.constant 2 : i32
    %4 = spv.constant 5 : i32
    %5 = spv.constant 0 : i32
    // %6 addresses the store target (binding 1); %7 the load source (binding 0).
    %6 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    %7 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    // %8 (WorkgroupId) and %11 (NumWorkgroups) are each loaded three times to
    // extract components 0, 1 and 2.
    %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %9 = spv.Load "Input" %8 : vector<3xi32>
    %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
    %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %12 = spv.Load "Input" %11 : vector<3xi32>
    %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
    %14 = spv.Load "Input" %8 : vector<3xi32>
    %15 = spv.CompositeExtract %14[1 : i32] : vector<3xi32>
    %16 = spv.Load "Input" %11 : vector<3xi32>
    %17 = spv.CompositeExtract %16[1 : i32] : vector<3xi32>
    %18 = spv.Load "Input" %8 : vector<3xi32>
    %19 = spv.CompositeExtract %18[2 : i32] : vector<3xi32>
    %20 = spv.Load "Input" %11 : vector<3xi32>
    %21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
    // Outer-loop start (WorkgroupId[i] * k) and step (NumWorkgroups[i] * k):
    // k = 2 for components 2 and 1, k = 32 for component 0.
    %22 = spv.IMul %19, %3 : i32
    %23 = spv.IMul %21, %3 : i32
    %24 = spv.IMul %15, %3 : i32
    %25 = spv.IMul %17, %3 : i32
    %26 = spv.IMul %10, %2 : i32
    %27 = spv.IMul %13, %2 : i32
    // Outer loop 1: %28 from %22 while %28 < 5, step %23.
    spv.loop {
      spv.Branch ^bb1(%22 : i32)
    ^bb1(%28: i32):  // 2 preds: ^bb0, ^bb2
      %29 = spv.SLessThan %28, %4 : i32
      spv.BranchConditional %29, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Outer loop 2: %31 from %24 while %31 < 1, step %25.
      spv.loop {
        spv.Branch ^bb1(%24 : i32)
      ^bb1(%31: i32):  // 2 preds: ^bb0, ^bb2
        %32 = spv.SLessThan %31, %1 : i32
        spv.BranchConditional %32, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Outer loop 3: %34 from %26 while %34 < 1, step %27.
        spv.loop {
          spv.Branch ^bb1(%26 : i32)
        ^bb1(%34: i32):  // 2 preds: ^bb0, ^bb2
          %35 = spv.SLessThan %34, %1 : i32
          spv.BranchConditional %35, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Inner-loop upper bounds (signed min via SLessThan + Select):
          //   %39 = min(2, 1 - %31), %43 = min(2, 5 - %28), %47 = min(32, 1 - %34).
          %36 = spv.IMul %31, %0 : i32
          %37 = spv.IAdd %36, %1 : i32
          %38 = spv.SLessThan %3, %37 : i32
          %39 = spv.Select %38, %3, %37 : i1, i32
          %40 = spv.IMul %28, %0 : i32
          %41 = spv.IAdd %40, %4 : i32
          %42 = spv.SLessThan %3, %41 : i32
          %43 = spv.Select %42, %3, %41 : i1, i32
          %44 = spv.IMul %34, %0 : i32
          %45 = spv.IAdd %44, %1 : i32
          %46 = spv.SLessThan %2, %45 : i32
          %47 = spv.Select %46, %2, %45 : i1, i32
          // Inner-loop starts: LocalInvocationId components 0, 1 and 2, all
          // loaded through the single address %48.
          %48 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %49 = spv.Load "Input" %48 : vector<3xi32>
          %50 = spv.CompositeExtract %49[0 : i32] : vector<3xi32>
          %51 = spv.Load "Input" %48 : vector<3xi32>
          %52 = spv.CompositeExtract %51[1 : i32] : vector<3xi32>
          %53 = spv.Load "Input" %48 : vector<3xi32>
          %54 = spv.CompositeExtract %53[2 : i32] : vector<3xi32>
          // Inner loop 1: %56 from LocalInvocationId[2] while %56 < %43, step 2.
          spv.loop {
            spv.Branch ^bb1(%54 : i32)
          ^bb1(%56: i32):  // 2 preds: ^bb0, ^bb2
            %57 = spv.SLessThan %56, %43 : i32
            spv.BranchConditional %57, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner loop 2: %59 from LocalInvocationId[1] while %59 < %39, step 2.
            spv.loop {
              spv.Branch ^bb1(%52 : i32)
            ^bb1(%59: i32):  // 2 preds: ^bb0, ^bb2
              %60 = spv.SLessThan %59, %39 : i32
              spv.BranchConditional %60, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop 3: %62 from LocalInvocationId[0] while %62 < %47, step 32.
              spv.loop {
                spv.Branch ^bb1(%50 : i32)
              ^bb1(%62: i32):  // 2 preds: ^bb0, ^bb2
                %63 = spv.SLessThan %62, %47 : i32
                spv.BranchConditional %63, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Load from binding 0 at element offset %64 * 5 + %65, where
                // %64 = %31 + %59 and %65 = %28 + %56.
                %64 = spv.IAdd %31, %59 : i32
                %65 = spv.IAdd %28, %56 : i32
                %66 = spv.IMul %64, %4 : i32
                %67 = spv.IAdd %66, %65 : i32
                %68 = spv.AccessChain %7[%5, %67] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                %69 = spv.Load "StorageBuffer" %68 : f32
                // Store to binding 1 at element offset %65 + %64 + %70, where
                // %70 = %34 + %62.
                %70 = spv.IAdd %34, %62 : i32
                %71 = spv.IAdd %65, %64 : i32
                %72 = spv.IAdd %71, %70 : i32
                %73 = spv.AccessChain %6[%5, %72] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                spv.Store "StorageBuffer" %73, %69 : f32
                %74 = spv.IAdd %62, %2 : i32
                spv.Branch ^bb1(%74 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %61 = spv.IAdd %59, %3 : i32
              spv.Branch ^bb1(%61 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %58 = spv.IAdd %56, %3 : i32
            spv.Branch ^bb1(%58 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %55 = spv.IAdd %34, %27 : i32
          spv.Branch ^bb1(%55 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %33 = spv.IAdd %31, %25 : i32
        spv.Branch ^bb1(%33 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %30 = spv.IAdd %28, %23 : i32
      spv.Branch ^bb1(%30 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // GLCompute entry point with a 32x2x2 local size; the inner-loop steps
  // above (32, 2, 2) use the same values.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_4, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_4 "LocalSize", 32, 2, 2
}

// *** IR Dump After SPIRVUpdateVCE ***
// SPIR-V module for @main_ex_dispatch_4 as printed after SPIRVUpdateVCE. The
// module header carries a `requires #spv.vce<...>` clause naming version
// v1.0, the Shader capability and the SPV_KHR_storage_buffer_storage_class
// extension. The function copies f32 elements from the binding-0 buffer to
// the binding-1 buffer.
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Storage buffers: binding 0 is the load source, binding 1 the store target.
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_4() "None" {
    // Shared constants: %0 negates induction variables, %1 and %4 are loop
    // upper bounds, %2 and %3 are multipliers/steps, %5 is the struct member
    // index used by both AccessChain ops.
    %0 = spv.constant -1 : i32
    %1 = spv.constant 1 : i32
    %2 = spv.constant 32 : i32
    %3 = spv.constant 2 : i32
    %4 = spv.constant 5 : i32
    %5 = spv.constant 0 : i32
    // %6 addresses the store target (binding 1); %7 the load source (binding 0).
    %6 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    %7 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    // %8 (WorkgroupId) and %11 (NumWorkgroups) are each loaded three times to
    // extract components 0, 1 and 2.
    %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %9 = spv.Load "Input" %8 : vector<3xi32>
    %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
    %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %12 = spv.Load "Input" %11 : vector<3xi32>
    %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
    %14 = spv.Load "Input" %8 : vector<3xi32>
    %15 = spv.CompositeExtract %14[1 : i32] : vector<3xi32>
    %16 = spv.Load "Input" %11 : vector<3xi32>
    %17 = spv.CompositeExtract %16[1 : i32] : vector<3xi32>
    %18 = spv.Load "Input" %8 : vector<3xi32>
    %19 = spv.CompositeExtract %18[2 : i32] : vector<3xi32>
    %20 = spv.Load "Input" %11 : vector<3xi32>
    %21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
    // Outer-loop start (WorkgroupId[i] * k) and step (NumWorkgroups[i] * k):
    // k = 2 for components 2 and 1, k = 32 for component 0.
    %22 = spv.IMul %19, %3 : i32
    %23 = spv.IMul %21, %3 : i32
    %24 = spv.IMul %15, %3 : i32
    %25 = spv.IMul %17, %3 : i32
    %26 = spv.IMul %10, %2 : i32
    %27 = spv.IMul %13, %2 : i32
    // Outer loop 1: %28 from %22 while %28 < 5, step %23.
    spv.loop {
      spv.Branch ^bb1(%22 : i32)
    ^bb1(%28: i32):  // 2 preds: ^bb0, ^bb2
      %29 = spv.SLessThan %28, %4 : i32
      spv.BranchConditional %29, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Outer loop 2: %31 from %24 while %31 < 1, step %25.
      spv.loop {
        spv.Branch ^bb1(%24 : i32)
      ^bb1(%31: i32):  // 2 preds: ^bb0, ^bb2
        %32 = spv.SLessThan %31, %1 : i32
        spv.BranchConditional %32, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Outer loop 3: %34 from %26 while %34 < 1, step %27.
        spv.loop {
          spv.Branch ^bb1(%26 : i32)
        ^bb1(%34: i32):  // 2 preds: ^bb0, ^bb2
          %35 = spv.SLessThan %34, %1 : i32
          spv.BranchConditional %35, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Inner-loop upper bounds (signed min via SLessThan + Select):
          //   %39 = min(2, 1 - %31), %43 = min(2, 5 - %28), %47 = min(32, 1 - %34).
          %36 = spv.IMul %31, %0 : i32
          %37 = spv.IAdd %36, %1 : i32
          %38 = spv.SLessThan %3, %37 : i32
          %39 = spv.Select %38, %3, %37 : i1, i32
          %40 = spv.IMul %28, %0 : i32
          %41 = spv.IAdd %40, %4 : i32
          %42 = spv.SLessThan %3, %41 : i32
          %43 = spv.Select %42, %3, %41 : i1, i32
          %44 = spv.IMul %34, %0 : i32
          %45 = spv.IAdd %44, %1 : i32
          %46 = spv.SLessThan %2, %45 : i32
          %47 = spv.Select %46, %2, %45 : i1, i32
          // Inner-loop starts: LocalInvocationId components 0, 1 and 2, all
          // loaded through the single address %48.
          %48 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
          %49 = spv.Load "Input" %48 : vector<3xi32>
          %50 = spv.CompositeExtract %49[0 : i32] : vector<3xi32>
          %51 = spv.Load "Input" %48 : vector<3xi32>
          %52 = spv.CompositeExtract %51[1 : i32] : vector<3xi32>
          %53 = spv.Load "Input" %48 : vector<3xi32>
          %54 = spv.CompositeExtract %53[2 : i32] : vector<3xi32>
          // Inner loop 1: %56 from LocalInvocationId[2] while %56 < %43, step 2.
          spv.loop {
            spv.Branch ^bb1(%54 : i32)
          ^bb1(%56: i32):  // 2 preds: ^bb0, ^bb2
            %57 = spv.SLessThan %56, %43 : i32
            spv.BranchConditional %57, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Inner loop 2: %59 from LocalInvocationId[1] while %59 < %39, step 2.
            spv.loop {
              spv.Branch ^bb1(%52 : i32)
            ^bb1(%59: i32):  // 2 preds: ^bb0, ^bb2
              %60 = spv.SLessThan %59, %39 : i32
              spv.BranchConditional %60, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Inner loop 3: %62 from LocalInvocationId[0] while %62 < %47, step 32.
              spv.loop {
                spv.Branch ^bb1(%50 : i32)
              ^bb1(%62: i32):  // 2 preds: ^bb0, ^bb2
                %63 = spv.SLessThan %62, %47 : i32
                spv.BranchConditional %63, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Load from binding 0 at element offset %64 * 5 + %65, where
                // %64 = %31 + %59 and %65 = %28 + %56.
                %64 = spv.IAdd %31, %59 : i32
                %65 = spv.IAdd %28, %56 : i32
                %66 = spv.IMul %64, %4 : i32
                %67 = spv.IAdd %66, %65 : i32
                %68 = spv.AccessChain %7[%5, %67] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                %69 = spv.Load "StorageBuffer" %68 : f32
                // Store to binding 1 at element offset %65 + %64 + %70, where
                // %70 = %34 + %62.
                %70 = spv.IAdd %34, %62 : i32
                %71 = spv.IAdd %65, %64 : i32
                %72 = spv.IAdd %71, %70 : i32
                %73 = spv.AccessChain %6[%5, %72] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                spv.Store "StorageBuffer" %73, %69 : f32
                %74 = spv.IAdd %62, %2 : i32
                spv.Branch ^bb1(%74 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %61 = spv.IAdd %59, %3 : i32
              spv.Branch ^bb1(%61 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %58 = spv.IAdd %56, %3 : i32
            spv.Branch ^bb1(%58 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %55 = spv.IAdd %34, %27 : i32
          spv.Branch ^bb1(%55 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %33 = spv.IAdd %31, %25 : i32
        spv.Branch ^bb1(%33 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %30 = spv.IAdd %28, %23 : i32
      spv.Branch ^bb1(%30 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // GLCompute entry point with a 32x2x2 local size; the inner-loop steps
  // above (32, 2, 2) use the same values.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_4, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_4 "LocalSize", 32, 2, 2
}

// *** IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass ***
// HAL executable wrapping dispatch region @main_ex_dispatch_4. It declares
// the buffer interface (@legacy_io), one entry point whose signature maps
// tensor<1x5xf32> to tensor<5x1x1xf32>, and a "vulkan*" target that contains
// the SPIR-V module implementing the entry point.
hal.executable @main_ex_dispatch_4 attributes {sym_visibility = "private"} {
  // Interface: @arg0 is a read-only storage buffer at (set 0, binding 0);
  // @ret0 is a write/discard storage buffer at (set 0, binding 1).
  hal.interface @legacy_io {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
  hal.executable.entry_point @main_ex_dispatch_4 attributes {interface = @legacy_io, ordinal = 0 : i32, signature = (tensor<1x5xf32>) -> tensor<5x1x1xf32>}
  hal.executable.target "vulkan*" {
    // The target environment advertises SPIR-V v1.3 with limits of 128
    // workgroup invocations and a [128, 128, 64] maximum workgroup size; the
    // nested spv.module itself only requires v1.0.
    module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
      spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
        spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
        // Resource variables use the same (set, binding) pairs as @arg0 and
        // @ret0 in the interface; both are typed as 5-element f32 arrays.
        spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
        spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
        spv.func @main_ex_dispatch_4() "None" {
          // Shared constants: %0 negates induction variables, %1 and %4 are
          // loop upper bounds, %2 and %3 are multipliers/steps, %5 is the
          // struct member index used by both AccessChain ops.
          %0 = spv.constant -1 : i32
          %1 = spv.constant 1 : i32
          %2 = spv.constant 32 : i32
          %3 = spv.constant 2 : i32
          %4 = spv.constant 5 : i32
          %5 = spv.constant 0 : i32
          // %6 addresses the store target (binding 1); %7 the load source
          // (binding 0).
          %6 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
          %7 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
          // Components 0, 1, 2 of WorkgroupId (%8) and NumWorkgroups (%11).
          %8 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
          %9 = spv.Load "Input" %8 : vector<3xi32>
          %10 = spv.CompositeExtract %9[0 : i32] : vector<3xi32>
          %11 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
          %12 = spv.Load "Input" %11 : vector<3xi32>
          %13 = spv.CompositeExtract %12[0 : i32] : vector<3xi32>
          %14 = spv.Load "Input" %8 : vector<3xi32>
          %15 = spv.CompositeExtract %14[1 : i32] : vector<3xi32>
          %16 = spv.Load "Input" %11 : vector<3xi32>
          %17 = spv.CompositeExtract %16[1 : i32] : vector<3xi32>
          %18 = spv.Load "Input" %8 : vector<3xi32>
          %19 = spv.CompositeExtract %18[2 : i32] : vector<3xi32>
          %20 = spv.Load "Input" %11 : vector<3xi32>
          %21 = spv.CompositeExtract %20[2 : i32] : vector<3xi32>
          // Outer-loop start (WorkgroupId[i] * k) and step
          // (NumWorkgroups[i] * k): k = 2 for components 2 and 1, k = 32 for
          // component 0.
          %22 = spv.IMul %19, %3 : i32
          %23 = spv.IMul %21, %3 : i32
          %24 = spv.IMul %15, %3 : i32
          %25 = spv.IMul %17, %3 : i32
          %26 = spv.IMul %10, %2 : i32
          %27 = spv.IMul %13, %2 : i32
          // Outer loop 1: %28 from %22 while %28 < 5, step %23.
          spv.loop {
            spv.Branch ^bb1(%22 : i32)
          ^bb1(%28: i32):  // 2 preds: ^bb0, ^bb2
            %29 = spv.SLessThan %28, %4 : i32
            spv.BranchConditional %29, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Outer loop 2: %31 from %24 while %31 < 1, step %25.
            spv.loop {
              spv.Branch ^bb1(%24 : i32)
            ^bb1(%31: i32):  // 2 preds: ^bb0, ^bb2
              %32 = spv.SLessThan %31, %1 : i32
              spv.BranchConditional %32, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Outer loop 3: %34 from %26 while %34 < 1, step %27.
              spv.loop {
                spv.Branch ^bb1(%26 : i32)
              ^bb1(%34: i32):  // 2 preds: ^bb0, ^bb2
                %35 = spv.SLessThan %34, %1 : i32
                spv.BranchConditional %35, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                // Inner-loop upper bounds (signed min via SLessThan + Select):
                //   %39 = min(2, 1 - %31), %43 = min(2, 5 - %28),
                //   %47 = min(32, 1 - %34).
                %36 = spv.IMul %31, %0 : i32
                %37 = spv.IAdd %36, %1 : i32
                %38 = spv.SLessThan %3, %37 : i32
                %39 = spv.Select %38, %3, %37 : i1, i32
                %40 = spv.IMul %28, %0 : i32
                %41 = spv.IAdd %40, %4 : i32
                %42 = spv.SLessThan %3, %41 : i32
                %43 = spv.Select %42, %3, %41 : i1, i32
                %44 = spv.IMul %34, %0 : i32
                %45 = spv.IAdd %44, %1 : i32
                %46 = spv.SLessThan %2, %45 : i32
                %47 = spv.Select %46, %2, %45 : i1, i32
                // Inner-loop starts: LocalInvocationId components 0, 1, 2.
                %48 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
                %49 = spv.Load "Input" %48 : vector<3xi32>
                %50 = spv.CompositeExtract %49[0 : i32] : vector<3xi32>
                %51 = spv.Load "Input" %48 : vector<3xi32>
                %52 = spv.CompositeExtract %51[1 : i32] : vector<3xi32>
                %53 = spv.Load "Input" %48 : vector<3xi32>
                %54 = spv.CompositeExtract %53[2 : i32] : vector<3xi32>
                // Inner loop 1: %56 from LocalInvocationId[2] while
                // %56 < %43, step 2.
                spv.loop {
                  spv.Branch ^bb1(%54 : i32)
                ^bb1(%56: i32):  // 2 preds: ^bb0, ^bb2
                  %57 = spv.SLessThan %56, %43 : i32
                  spv.BranchConditional %57, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  // Inner loop 2: %59 from LocalInvocationId[1] while
                  // %59 < %39, step 2.
                  spv.loop {
                    spv.Branch ^bb1(%52 : i32)
                  ^bb1(%59: i32):  // 2 preds: ^bb0, ^bb2
                    %60 = spv.SLessThan %59, %39 : i32
                    spv.BranchConditional %60, ^bb2, ^bb3
                  ^bb2:  // pred: ^bb1
                    // Inner loop 3: %62 from LocalInvocationId[0] while
                    // %62 < %47, step 32.
                    spv.loop {
                      spv.Branch ^bb1(%50 : i32)
                    ^bb1(%62: i32):  // 2 preds: ^bb0, ^bb2
                      %63 = spv.SLessThan %62, %47 : i32
                      spv.BranchConditional %63, ^bb2, ^bb3
                    ^bb2:  // pred: ^bb1
                      // Load from binding 0 at element offset %64 * 5 + %65,
                      // where %64 = %31 + %59 and %65 = %28 + %56.
                      %64 = spv.IAdd %31, %59 : i32
                      %65 = spv.IAdd %28, %56 : i32
                      %66 = spv.IMul %64, %4 : i32
                      %67 = spv.IAdd %66, %65 : i32
                      %68 = spv.AccessChain %7[%5, %67] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                      %69 = spv.Load "StorageBuffer" %68 : f32
                      // Store to binding 1 at element offset
                      // %65 + %64 + %70, where %70 = %34 + %62.
                      %70 = spv.IAdd %34, %62 : i32
                      %71 = spv.IAdd %65, %64 : i32
                      %72 = spv.IAdd %71, %70 : i32
                      %73 = spv.AccessChain %6[%5, %72] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                      spv.Store "StorageBuffer" %73, %69 : f32
                      %74 = spv.IAdd %62, %2 : i32
                      spv.Branch ^bb1(%74 : i32)
                    ^bb3:  // pred: ^bb1
                      spv._merge
                    }
                    %61 = spv.IAdd %59, %3 : i32
                    spv.Branch ^bb1(%61 : i32)
                  ^bb3:  // pred: ^bb1
                    spv._merge
                  }
                  %58 = spv.IAdd %56, %3 : i32
                  spv.Branch ^bb1(%58 : i32)
                ^bb3:  // pred: ^bb1
                  spv._merge
                }
                %55 = spv.IAdd %34, %27 : i32
                spv.Branch ^bb1(%55 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %33 = spv.IAdd %31, %25 : i32
              spv.Branch ^bb1(%33 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %30 = spv.IAdd %28, %23 : i32
            spv.Branch ^bb1(%30 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          spv.Return
        }
        // GLCompute entry point with a 32x2x2 local size (128 invocations,
        // equal to the max_compute_workgroup_invocations limit above).
        spv.EntryPoint "GLCompute" @main_ex_dispatch_4, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
        spv.ExecutionMode @main_ex_dispatch_4 "LocalSize", 32, 2, 2
      }
      // Copy of the interface kept inside the target module, with the same
      // two bindings as the executable-level @legacy_io.
      hal.interface @legacy_io attributes {sym_visibility = "private"} {
        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
        hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
      }
    }
  }
}

// *** IR Dump After Inliner ***
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // Dispatch 5: minimum-reduction of a 5x1x1 f32 tensor over dimensions 1 and 2,
  // producing a 5-element f32 tensor.
  func @main_ex_dispatch_5() {
    %c0 = constant 0 : index
    // 0x7F800000 is the IEEE-754 single-precision bit pattern for +infinity; it is
    // the init value handed to the reduction below.
    %cst = constant dense<0x7F800000> : tensor<f32>
    // Input operand is read from binding @arg0 at offset 0.
    %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5x1x1xf32>
    %1 = "xla_hlo.reduce"(%0, %cst) ( {
    ^bb0(%arg0: tensor<f32>, %arg1: tensor<f32>):  // no predecessors
      // Reduction combiner: element-wise minimum of the two scalar operands.
      %2 = xla_hlo.minimum %arg0, %arg1 : tensor<f32>
      "xla_hlo.return"(%2) : (tensor<f32>) -> ()
    }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
    // Result is written to binding @ret0 at offset 0.
    hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5xf32>
    return
  }
  // Buffer interface of the dispatch: one read-only input and one write-only output,
  // both storage buffers in descriptor set 0.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::TieDynamicShapesPass ***
// All shapes in this function are static (5x1x1 and 5); the body shown here is the
// same as in the preceding Inliner dump.
func @main_ex_dispatch_5() {
  %c0 = constant 0 : index
  // +infinity (f32 bit pattern 0x7F800000): init value of the min-reduction.
  %cst = constant dense<0x7F800000> : tensor<f32>
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5x1x1xf32>
  %1 = "xla_hlo.reduce"(%0, %cst) ( {
  ^bb0(%arg0: tensor<f32>, %arg1: tensor<f32>):  // no predecessors
    %2 = xla_hlo.minimum %arg0, %arg1 : tensor<f32>
    "xla_hlo.return"(%2) : (tensor<f32>) -> ()
  }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::MaterializeShapeCalculationsPass ***
// No shape-calculation ops appear in this dump; the function body is the same
// load / min-reduce over dims [1, 2] / store sequence as in the previous dump.
func @main_ex_dispatch_5() {
  %c0 = constant 0 : index
  // +infinity (f32 bit pattern 0x7F800000): init value of the min-reduction.
  %cst = constant dense<0x7F800000> : tensor<f32>
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5x1x1xf32>
  %1 = "xla_hlo.reduce"(%0, %cst) ( {
  ^bb0(%arg0: tensor<f32>, %arg1: tensor<f32>):  // no predecessors
    %2 = xla_hlo.minimum %arg0, %arg1 : tensor<f32>
    "xla_hlo.return"(%2) : (tensor<f32>) -> ()
  }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::HoistShapeCalculations ***
// Function body is the same as in the previous dump: a min-reduction of the
// 5x1x1 input over dims [1, 2] into a 5-element output.
func @main_ex_dispatch_5() {
  %c0 = constant 0 : index
  // +infinity (f32 bit pattern 0x7F800000): init value of the min-reduction.
  %cst = constant dense<0x7F800000> : tensor<f32>
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5x1x1xf32>
  %1 = "xla_hlo.reduce"(%0, %cst) ( {
  ^bb0(%arg0: tensor<f32>, %arg1: tensor<f32>):  // no predecessors
    %2 = xla_hlo.minimum %arg0, %arg1 : tensor<f32>
    "xla_hlo.return"(%2) : (tensor<f32>) -> ()
  }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::DecomposeHLOClampPass ***
// This function contains no clamp op, and the body is the same as in the
// previous dump.
func @main_ex_dispatch_5() {
  %c0 = constant 0 : index
  // +infinity (f32 bit pattern 0x7F800000): init value of the min-reduction.
  %cst = constant dense<0x7F800000> : tensor<f32>
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5x1x1xf32>
  %1 = "xla_hlo.reduce"(%0, %cst) ( {
  ^bb0(%arg0: tensor<f32>, %arg1: tensor<f32>):  // no predecessors
    %2 = xla_hlo.minimum %arg0, %arg1 : tensor<f32>
    "xla_hlo.return"(%2) : (tensor<f32>) -> ()
  }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5xf32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnTensorsPass ***
// The xla_hlo.reduce is still present in this dump, i.e. it was not rewritten
// into a Linalg op on tensors at this point in the pipeline.
func @main_ex_dispatch_5() {
  %c0 = constant 0 : index
  // +infinity (f32 bit pattern 0x7F800000): init value of the min-reduction.
  %cst = constant dense<0x7F800000> : tensor<f32>
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5x1x1xf32>
  %1 = "xla_hlo.reduce"(%0, %cst) ( {
  ^bb0(%arg0: tensor<f32>, %arg1: tensor<f32>):  // no predecessors
    %2 = xla_hlo.minimum %arg0, %arg1 : tensor<f32>
    "xla_hlo.return"(%2) : (tensor<f32>) -> ()
  }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
  hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5xf32>
  return
}

// *** IR Dump After LinalgFusionOfTensorOps ***
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // The function still consists of a single xla_hlo.reduce between the interface
  // load and store; there are no Linalg tensor ops in it to fuse.
  func @main_ex_dispatch_5() {
    %c0 = constant 0 : index
    // +infinity (f32 bit pattern 0x7F800000): init value of the min-reduction.
    %cst = constant dense<0x7F800000> : tensor<f32>
    %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5x1x1xf32>
    %1 = "xla_hlo.reduce"(%0, %cst) ( {
    ^bb0(%arg0: tensor<f32>, %arg1: tensor<f32>):  // no predecessors
      %2 = xla_hlo.minimum %arg0, %arg1 : tensor<f32>
      "xla_hlo.return"(%2) : (tensor<f32>) -> ()
    }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<5x1x1xf32>, tensor<f32>) -> tensor<5xf32>
    hal.interface.store.tensor %1, @legacy_io::@ret0, offset = %c0 : tensor<5xf32>
    return
  }
  // One read-only input buffer and one write-only output buffer.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnBuffersPass ***
// The tensor-level reduce has been replaced by buffer-level Linalg ops: a fill of
// the output buffer followed by an indexed_generic that performs the min-reduction.
func @main_ex_dispatch_5() {
  // %0 is the output buffer (binding @ret0), %1 the input buffer (binding @arg0).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
  %c0 = constant 0 : index
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
  // +infinity (f32 bit pattern 0x7F800000): the reduction's init value.
  %cst = constant 0x7F800000 : f32
  %cst_0 = constant 0.000000e+00 : f32
  // The output is pre-filled with 0.0, not with the init value %cst. The body below
  // substitutes %cst for the stored output value on the first reduction step.
  linalg.fill(%0, %cst_0) : memref<5xf32>, f32
  // d0 is parallel and indexes the output; d1 and d2 are the reduced dimensions.
  linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} %1, %0 {
  // Block arguments: three iteration indices, then the input element (%arg3) and
  // the current output element (%arg4).
  ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: f32, %arg4: f32):  // no predecessors
    %c0_1 = constant 0 : index
    %true = constant true
    // %5 is true only when both reduction indices are 0, i.e. on the first
    // reduction step for a given output element.
    %2 = cmpi "eq", %arg1, %c0_1 : index
    %3 = and %true, %2 : i1
    %4 = cmpi "eq", %arg2, %c0_1 : index
    %5 = and %3, %4 : i1
    // On the first step the accumulator is the init value; afterwards it is the
    // value currently held in the output buffer.
    %6 = select %5, %cst, %arg4 : f32
    // Minimum via ordered less-than compare plus select.
    %7 = cmpf "olt", %arg3, %6 : f32
    %8 = select %7, %arg3, %6 : f32
    linalg.yield %8 : f32
  }: memref<5x1x1xf32>, memref<5xf32>
  return
}

// *** IR Dump After Canonicalizer ***
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // Compared with the previous dump, the constants that lived inside the
  // indexed_generic body now sit at function scope and the body reuses %c0.
  func @main_ex_dispatch_5() {
    // +infinity (f32 bit pattern 0x7F800000): the reduction's init value.
    %cst = constant 0x7F800000 : f32
    %cst_0 = constant 0.000000e+00 : f32
    %c0 = constant 0 : index
    %true = constant true
    // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
    // Output is pre-filled with 0.0; the reduction body replaces that value with
    // %cst on its first step.
    linalg.fill(%0, %cst_0) : memref<5xf32>, f32
    linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} %1, %0 {
    ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: f32, %arg4: f32):  // no predecessors
      // %5: both reduction indices are 0 (first reduction step).
      %2 = cmpi "eq", %arg1, %c0 : index
      %3 = and %2, %true : i1
      %4 = cmpi "eq", %arg2, %c0 : index
      %5 = and %3, %4 : i1
      // Accumulator is the init value on the first step, else the stored output.
      %6 = select %5, %cst, %arg4 : f32
      // min(input element, accumulator).
      %7 = cmpf "olt", %arg3, %6 : f32
      %8 = select %7, %arg3, %6 : f32
      linalg.yield %8 : f32
    }: memref<5x1x1xf32>, memref<5xf32>
    return
  }
  // One read-only input buffer and one write-only output buffer.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // The function is the same as in the preceding Canonicalizer dump: output fill
  // followed by an indexed_generic min-reduction over d1 and d2.
  func @main_ex_dispatch_5() {
    // +infinity (f32 bit pattern 0x7F800000): the reduction's init value.
    %cst = constant 0x7F800000 : f32
    %cst_0 = constant 0.000000e+00 : f32
    %c0 = constant 0 : index
    %true = constant true
    // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
    linalg.fill(%0, %cst_0) : memref<5xf32>, f32
    linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} %1, %0 {
    ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: f32, %arg4: f32):  // no predecessors
      // %5: both reduction indices are 0 (first reduction step).
      %2 = cmpi "eq", %arg1, %c0 : index
      %3 = and %2, %true : i1
      %4 = cmpi "eq", %arg2, %c0 : index
      %5 = and %3, %4 : i1
      // Accumulator is the init value on the first step, else the stored output.
      %6 = select %5, %cst, %arg4 : f32
      // min(input element, accumulator).
      %7 = cmpf "olt", %arg3, %6 : f32
      %8 = select %7, %arg3, %6 : f32
      linalg.yield %8 : f32
    }: memref<5x1x1xf32>, memref<5xf32>
    return
  }
  // One read-only input buffer and one write-only output buffer.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::LinalgTileAndFusePass ***
// Both Linalg ops are now tiled along the parallel dimension with tile size 32 and
// wrapped in scf.parallel loops; the function is annotated with a 32x1x1 local size.
func @main_ex_dispatch_5() attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
  // +infinity (f32 bit pattern 0x7F800000): the reduction's init value.
  %cst = constant 0x7F800000 : f32
  %cst_0 = constant 0.000000e+00 : f32
  %true = constant true
  %c0 = constant 0 : index
  %c32 = constant 32 : index
  %c5 = constant 5 : index
  %c1 = constant 1 : index
  // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
  // Tiled fill. With bounds 0..5 and step 32 the loop has a single iteration
  // (%arg0 = 0).
  scf.parallel (%arg0) = (%c0) to (%c5) step (%c32) {
    // Tile extent: min(32, 5 - %arg0), so the last (here: only) tile is clamped to
    // the remaining elements.
    %2 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c5, %arg0)
    %3 = subview %0[%arg0] [%2] [%c1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>>
    linalg.fill(%3, %cst_0) {__internal_linalg_transform__ = "workitem"} : memref<?xf32, affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>>, f32
    scf.yield
  }
  // Tiled reduction over the same 0..5 step 32 range.
  scf.parallel (%arg0) = (%c0) to (%c5) step (%c32) {
    // %3: input tile of extent min(32, 5 - %arg0) x 1 x 1 starting at [%arg0, 0, 0].
    %2 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c5, %arg0)
    %3 = subview %1[%arg0, %c0, %c0] [%2, %c1, %c1] [%c1, %c1, %c1]  : memref<5x1x1xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
    // %5: matching output tile; its extent is computed by the same affine.min.
    %4 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c5, %arg0)
    %5 = subview %0[%arg0] [%4] [%c1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>>
    linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} {__internal_linalg_transform__ = "workitem"} %3, %5 {
    ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: f32, %arg5: f32):  // no predecessors
      // %9: both reduction indices are 0 (first reduction step).
      %6 = cmpi "eq", %arg2, %c0 : index
      %7 = and %6, %true : i1
      %8 = cmpi "eq", %arg3, %c0 : index
      %9 = and %7, %8 : i1
      // Accumulator is the init value on the first step, else the stored output.
      %10 = select %9, %cst, %arg5 : f32
      // min(input element, accumulator).
      %11 = cmpf "olt", %arg4, %10 : f32
      %12 = select %11, %arg4, %10 : f32
      linalg.yield %12 : f32
    }: memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>, memref<?xf32, affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>>
    scf.yield
  }
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::SplitDispatchFunctionPass ***
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // The module still holds a single function containing both scf.parallel loops
  // (fill and reduction); the dispatch was not split into separate entry points.
  func @main_ex_dispatch_5() attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
    // +infinity (f32 bit pattern 0x7F800000): the reduction's init value.
    %cst = constant 0x7F800000 : f32
    %cst_0 = constant 0.000000e+00 : f32
    %true = constant true
    %c0 = constant 0 : index
    %c32 = constant 32 : index
    %c5 = constant 5 : index
    %c1 = constant 1 : index
    // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
    // Tiled fill of the output with 0.0; bounds 0..5 step 32 give one iteration.
    scf.parallel (%arg0) = (%c0) to (%c5) step (%c32) {
      // Tile extent: min(32, 5 - %arg0).
      %2 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c5, %arg0)
      %3 = subview %0[%arg0] [%2] [%c1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>>
      linalg.fill(%3, %cst_0) {__internal_linalg_transform__ = "workitem"} : memref<?xf32, affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>>, f32
      scf.yield
    }
    // Tiled min-reduction over the same range.
    scf.parallel (%arg0) = (%c0) to (%c5) step (%c32) {
      // %3: input tile; %5: matching output tile.
      %2 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c5, %arg0)
      %3 = subview %1[%arg0, %c0, %c0] [%2, %c1, %c1] [%c1, %c1, %c1]  : memref<5x1x1xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
      %4 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c5, %arg0)
      %5 = subview %0[%arg0] [%4] [%c1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>>
      linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} {__internal_linalg_transform__ = "workitem"} %3, %5 {
      ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: f32, %arg5: f32):  // no predecessors
        // %9: both reduction indices are 0 (first reduction step).
        %6 = cmpi "eq", %arg2, %c0 : index
        %7 = and %6, %true : i1
        %8 = cmpi "eq", %arg3, %c0 : index
        %9 = and %7, %8 : i1
        // Accumulator is the init value on the first step, else the stored output.
        %10 = select %9, %cst, %arg5 : f32
        // min(input element, accumulator).
        %11 = cmpf "olt", %arg4, %10 : f32
        %12 = select %11, %arg4, %10 : f32
        linalg.yield %12 : f32
      }: memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>, memref<?xf32, affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>>
      scf.yield
    }
    return
  }
  // One read-only input buffer and one write-only output buffer.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::LinalgTileAndFusePass ***
// Second LinalgTileAndFusePass dump for this function; the IR matches the first
// one (ops already carry the "workitem" marker and are tiled by 32).
func @main_ex_dispatch_5() attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
  // +infinity (f32 bit pattern 0x7F800000): the reduction's init value.
  %cst = constant 0x7F800000 : f32
  %cst_0 = constant 0.000000e+00 : f32
  %true = constant true
  %c0 = constant 0 : index
  %c32 = constant 32 : index
  %c5 = constant 5 : index
  %c1 = constant 1 : index
  // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
  // Tiled fill of the output with 0.0; bounds 0..5 step 32 give one iteration.
  scf.parallel (%arg0) = (%c0) to (%c5) step (%c32) {
    // Tile extent: min(32, 5 - %arg0).
    %2 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c5, %arg0)
    %3 = subview %0[%arg0] [%2] [%c1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>>
    linalg.fill(%3, %cst_0) {__internal_linalg_transform__ = "workitem"} : memref<?xf32, affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>>, f32
    scf.yield
  }
  // Tiled min-reduction over the same range.
  scf.parallel (%arg0) = (%c0) to (%c5) step (%c32) {
    // %3: input tile; %5: matching output tile.
    %2 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c5, %arg0)
    %3 = subview %1[%arg0, %c0, %c0] [%2, %c1, %c1] [%c1, %c1, %c1]  : memref<5x1x1xf32> to memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>
    %4 = affine.min affine_map<(d0, d1, d2) -> (32, d1 - d2)>(%c32, %c5, %arg0)
    %5 = subview %0[%arg0] [%4] [%c1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>>
    linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} {__internal_linalg_transform__ = "workitem"} %3, %5 {
    ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: f32, %arg5: f32):  // no predecessors
      // %9: both reduction indices are 0 (first reduction step).
      %6 = cmpi "eq", %arg2, %c0 : index
      %7 = and %6, %true : i1
      %8 = cmpi "eq", %arg3, %c0 : index
      %9 = and %7, %8 : i1
      // Accumulator is the init value on the first step, else the stored output.
      %10 = select %9, %cst, %arg5 : f32
      // min(input element, accumulator).
      %11 = cmpf "olt", %arg4, %10 : f32
      %12 = select %11, %arg4, %10 : f32
      linalg.yield %12 : f32
    }: memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>, memref<?xf32, affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>>
    scf.yield
  }
  return
}

// *** IR Dump After Canonicalizer ***
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // Relative to the previous dump: the affine.min maps take only %arg0 (the
  // constants 32 and 5 are folded into the map), subview sizes/strides that were
  // constant are now static, and the %c1 constant is gone.
  func @main_ex_dispatch_5() attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
    // +infinity (f32 bit pattern 0x7F800000): the reduction's init value.
    %cst = constant 0x7F800000 : f32
    %cst_0 = constant 0.000000e+00 : f32
    %true = constant true
    %c0 = constant 0 : index
    %c32 = constant 32 : index
    %c5 = constant 5 : index
    // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
    // Tiled fill of the output with 0.0; bounds 0..5 step 32 give one iteration.
    scf.parallel (%arg0) = (%c0) to (%c5) step (%c32) {
      // Tile extent: min(32, 5 - %arg0).
      %2 = affine.min affine_map<(d0) -> (32, -d0 + 5)>(%arg0)
      %3 = subview %0[%arg0] [%2] [1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      linalg.fill(%3, %cst_0) {__internal_linalg_transform__ = "workitem"} : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>, f32
      scf.yield
    }
    // Tiled min-reduction over the same range.
    scf.parallel (%arg0) = (%c0) to (%c5) step (%c32) {
      // %3: input tile, now typed ?x1x1 because the two reduced extents are static.
      %2 = affine.min affine_map<(d0) -> (32, -d0 + 5)>(%arg0)
      %3 = subview %1[%arg0, 0, 0] [%2, 1, 1] [1, 1, 1]  : memref<5x1x1xf32> to memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
      // %5: matching output tile.
      %4 = affine.min affine_map<(d0) -> (32, -d0 + 5)>(%arg0)
      %5 = subview %0[%arg0] [%4] [1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      linalg.indexed_generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} {__internal_linalg_transform__ = "workitem"} %3, %5 {
      ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: f32, %arg5: f32):  // no predecessors
        // %9: both reduction indices are 0 (first reduction step).
        %6 = cmpi "eq", %arg2, %c0 : index
        %7 = and %6, %true : i1
        %8 = cmpi "eq", %arg3, %c0 : index
        %9 = and %7, %8 : i1
        // Accumulator is the init value on the first step, else the stored output.
        %10 = select %9, %cst, %arg5 : f32
        // min(input element, accumulator).
        %11 = cmpf "olt", %arg4, %10 : f32
        %12 = select %11, %arg4, %10 : f32
        linalg.yield %12 : f32
      }: memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>, memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      scf.yield
    }
    return
  }
  // One read-only input buffer and one write-only output buffer.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertToGPUPass ***
// The scf.parallel loops and Linalg ops are gone. Each outer tile loop is now an
// scf.for strided by block id / grid size, and each Linalg op is an scf.for nest
// whose outermost loop is strided by thread id / block size along x.
func @main_ex_dispatch_5() attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
  // +infinity (f32 bit pattern 0x7F800000): the reduction's init value.
  %cst = constant 0x7F800000 : f32
  %cst_0 = constant 0.000000e+00 : f32
  %true = constant true
  %c0 = constant 0 : index
  %c32 = constant 32 : index
  %c5 = constant 5 : index
  // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
  // ---- Fill phase ----
  // Tile loop: starts at 32 * block_id.x and advances by 32 * grid_dim.x. With an
  // upper bound of 5, only a block whose start offset is below 5 (block 0) enters.
  %2 = "gpu.block_id"() {dimension = "x"} : () -> index
  %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %4 = muli %c32, %2 : index
  %5 = addi %c0, %4 : index
  %6 = muli %c32, %3 : index
  scf.for %arg0 = %5 to %c5 step %6 {
    // Tile extent: min(32, 5 - %arg0).
    %12 = affine.min affine_map<(d0) -> (32, -d0 + 5)>(%arg0)
    %13 = subview %0[%arg0] [%12] [1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
    %14 = dim %13, 0 : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
    %15 = affine.apply affine_map<(d0) -> (d0)>(%14)
    %c0_1 = constant 0 : index
    %c1 = constant 1 : index
    // Element loop: starts at thread_id.x and advances by block_dim.x, bounded by
    // the tile extent; threads whose id is not below the extent run no iteration.
    %16 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %17 = "gpu.block_dim"() {dimension = "x"} : () -> index
    %18 = muli %c1, %16 : index
    %19 = addi %c0_1, %18 : index
    %20 = muli %c1, %17 : index
    scf.for %arg1 = %19 to %15 step %20 {
      // Write 0.0 into the output tile.
      store %cst_0, %13[%arg1] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
    }
  }
  // ---- Reduction phase ----
  // Same block-strided tile loop as above, recomputed from fresh gpu.* ops.
  %7 = "gpu.block_id"() {dimension = "x"} : () -> index
  %8 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %9 = muli %c32, %7 : index
  %10 = addi %c0, %9 : index
  %11 = muli %c32, %8 : index
  scf.for %arg0 = %10 to %c5 step %11 {
    // %13: input tile; %15: matching output tile.
    %12 = affine.min affine_map<(d0) -> (32, -d0 + 5)>(%arg0)
    %13 = subview %1[%arg0, 0, 0] [%12, 1, 1] [1, 1, 1]  : memref<5x1x1xf32> to memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
    %14 = affine.min affine_map<(d0) -> (32, -d0 + 5)>(%arg0)
    %15 = subview %0[%arg0] [%14] [1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
    // Loop bounds are taken from the input tile's three extents.
    %16 = dim %13, 0 : memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
    %17 = dim %13, 1 : memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
    %18 = dim %13, 2 : memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
    %19 = dim %15, 0 : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
    %20 = affine.apply affine_map<(d0) -> (d0)>(%16)
    %21 = affine.apply affine_map<()[s0] -> (s0)>()[%17]
    %22 = affine.apply affine_map<()[s0] -> (s0)>()[%18]
    %c0_1 = constant 0 : index
    %c1 = constant 1 : index
    %c0_2 = constant 0 : index
    %c1_3 = constant 1 : index
    %c0_4 = constant 0 : index
    %c1_5 = constant 1 : index
    // Only the parallel dimension is strided by thread id; the two reduction
    // dimensions below are plain sequential loops from 0 with step 1.
    %23 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %24 = "gpu.block_dim"() {dimension = "x"} : () -> index
    %25 = muli %c1, %23 : index
    %26 = addi %c0_1, %25 : index
    %27 = muli %c1, %24 : index
    scf.for %arg1 = %26 to %20 step %27 {
      scf.for %arg2 = %c0_2 to %21 step %c1_3 {
        scf.for %arg3 = %c0_4 to %22 step %c1_5 {
          %28 = affine.apply affine_map<(d0) -> (d0)>(%arg1)
          %29 = affine.apply affine_map<(d0) -> (d0)>(%arg2)
          %30 = affine.apply affine_map<(d0) -> (d0)>(%arg3)
          // %31: input element; %33: value currently stored in the output element.
          %31 = load %13[%28, %29, %30] : memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
          %32 = affine.apply affine_map<(d0) -> (d0)>(%arg1)
          %33 = load %15[%32] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
          %34 = affine.apply affine_map<(d0) -> (d0)>(%arg1)
          // %38: both reduction indices are 0 (first reduction step).
          %35 = cmpi "eq", %arg2, %c0 : index
          %36 = and %35, %true : i1
          %37 = cmpi "eq", %arg3, %c0 : index
          %38 = and %36, %37 : i1
          // Accumulator is the init value on the first step, else the stored output.
          %39 = select %38, %cst, %33 : f32
          // min(input element, accumulator), written back to the output element.
          %40 = cmpf "olt", %31, %39 : f32
          %41 = select %40, %31, %39 : f32
          store %41, %15[%34] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
        }
      }
    }
  }
  return
}

// *** IR Dump After ConvertAffineToStandard ***
// The affine ops are expanded into standard arithmetic: each affine.min becomes
// muli/addi/cmpi/select, and the identity affine.apply ops are gone.
func @main_ex_dispatch_5() attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
  // +infinity (f32 bit pattern 0x7F800000): the reduction's init value.
  %cst = constant 0x7F800000 : f32
  %cst_0 = constant 0.000000e+00 : f32
  %true = constant true
  %c0 = constant 0 : index
  %c32 = constant 32 : index
  %c5 = constant 5 : index
  // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
  // ---- Fill phase ----
  // Tile loop: start 32 * block_id.x, step 32 * grid_dim.x, upper bound 5.
  %2 = "gpu.block_id"() {dimension = "x"} : () -> index
  %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %4 = muli %c32, %2 : index
  %5 = addi %c0, %4 : index
  %6 = muli %c32, %3 : index
  scf.for %arg0 = %5 to %c5 step %6 {
    // %15 = min(32, 5 - %arg0): %13 computes (-1 * %arg0) + 5, then a signed
    // compare selects the smaller of 32 and %13.
    %c32_1 = constant 32 : index
    %c-1 = constant -1 : index
    %12 = muli %arg0, %c-1 : index
    %c5_2 = constant 5 : index
    %13 = addi %12, %c5_2 : index
    %14 = cmpi "slt", %c32_1, %13 : index
    %15 = select %14, %c32_1, %13 : index
    %16 = subview %0[%arg0] [%15] [1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
    %17 = dim %16, 0 : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
    %c0_3 = constant 0 : index
    %c1 = constant 1 : index
    // Element loop: start thread_id.x, step block_dim.x, bounded by the tile extent.
    %18 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %19 = "gpu.block_dim"() {dimension = "x"} : () -> index
    %20 = muli %c1, %18 : index
    %21 = addi %c0_3, %20 : index
    %22 = muli %c1, %19 : index
    scf.for %arg1 = %21 to %17 step %22 {
      // Write 0.0 into the output tile.
      store %cst_0, %16[%arg1] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
    }
  }
  // ---- Reduction phase ----
  %7 = "gpu.block_id"() {dimension = "x"} : () -> index
  %8 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %9 = muli %c32, %7 : index
  %10 = addi %c0, %9 : index
  %11 = muli %c32, %8 : index
  scf.for %arg0 = %10 to %c5 step %11 {
    // %15 = min(32, 5 - %arg0): extent of the input tile %16.
    %c32_1 = constant 32 : index
    %c-1 = constant -1 : index
    %12 = muli %arg0, %c-1 : index
    %c5_2 = constant 5 : index
    %13 = addi %12, %c5_2 : index
    %14 = cmpi "slt", %c32_1, %13 : index
    %15 = select %14, %c32_1, %13 : index
    %16 = subview %1[%arg0, 0, 0] [%15, 1, 1] [1, 1, 1]  : memref<5x1x1xf32> to memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
    // %20 = min(32, 5 - %arg0) again: extent of the output tile %21.
    %c32_3 = constant 32 : index
    %c-1_4 = constant -1 : index
    %17 = muli %arg0, %c-1_4 : index
    %c5_5 = constant 5 : index
    %18 = addi %17, %c5_5 : index
    %19 = cmpi "slt", %c32_3, %18 : index
    %20 = select %19, %c32_3, %18 : index
    %21 = subview %0[%arg0] [%20] [1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
    // Loop bounds are taken directly from the input tile's extents.
    %22 = dim %16, 0 : memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
    %23 = dim %16, 1 : memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
    %24 = dim %16, 2 : memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
    %25 = dim %21, 0 : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
    %c0_6 = constant 0 : index
    %c1 = constant 1 : index
    %c0_7 = constant 0 : index
    %c1_8 = constant 1 : index
    %c0_9 = constant 0 : index
    %c1_10 = constant 1 : index
    // Parallel dimension is strided by thread id; reduction dimensions are
    // sequential loops from 0 with step 1.
    %26 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %27 = "gpu.block_dim"() {dimension = "x"} : () -> index
    %28 = muli %c1, %26 : index
    %29 = addi %c0_6, %28 : index
    %30 = muli %c1, %27 : index
    scf.for %arg1 = %29 to %22 step %30 {
      scf.for %arg2 = %c0_7 to %23 step %c1_8 {
        scf.for %arg3 = %c0_9 to %24 step %c1_10 {
          // %31: input element; %32: value currently stored in the output element.
          %31 = load %16[%arg1, %arg2, %arg3] : memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
          %32 = load %21[%arg1] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
          // %36: both reduction indices are 0 (first reduction step).
          %33 = cmpi "eq", %arg2, %c0 : index
          %34 = and %33, %true : i1
          %35 = cmpi "eq", %arg3, %c0 : index
          %36 = and %34, %35 : i1
          // Accumulator is the init value on the first step, else the stored output.
          %37 = select %36, %cst, %32 : f32
          // min(input element, accumulator), written back to the output element.
          %38 = cmpf "olt", %31, %37 : f32
          %39 = select %38, %31, %37 : f32
          store %39, %21[%arg1] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
        }
      }
    }
  }
  return
}

// *** IR Dump After Canonicalizer ***
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // Relative to the previous dump: duplicate constants are merged at function
  // scope, multiply-by-1 / add-0 are folded away, dim ops are replaced by the
  // subview size operands, and the two reduction loops have constant bounds 0..1.
  func @main_ex_dispatch_5() attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
    // +infinity (f32 bit pattern 0x7F800000): the reduction's init value.
    %cst = constant 0x7F800000 : f32
    %cst_0 = constant 0.000000e+00 : f32
    %true = constant true
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c5 = constant 5 : index
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    // %0 is the output buffer (@ret0), %1 the input buffer (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
    // ---- Fill phase ----
    // Tile loop: start 32 * block_id.x, step 32 * grid_dim.x, upper bound 5.
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = muli %2, %c32 : index
    %5 = muli %3, %c32 : index
    scf.for %arg0 = %4 to %c5 step %5 {
      // %13 = min(32, 5 - %arg0): tile extent.
      %10 = muli %arg0, %c-1 : index
      %11 = addi %10, %c5 : index
      %12 = cmpi "slt", %c32, %11 : index
      %13 = select %12, %c32, %11 : index
      %14 = subview %0[%arg0] [%13] [1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      // Element loop: start thread_id.x, step block_dim.x, bounded by the tile extent.
      %15 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %16 = "gpu.block_dim"() {dimension = "x"} : () -> index
      scf.for %arg1 = %15 to %13 step %16 {
        // Write 0.0 into the output tile.
        store %cst_0, %14[%arg1] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      }
    }
    // ---- Reduction phase ----
    // The block id / grid size computation is still duplicated here.
    %6 = "gpu.block_id"() {dimension = "x"} : () -> index
    %7 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %8 = muli %6, %c32 : index
    %9 = muli %7, %c32 : index
    scf.for %arg0 = %8 to %c5 step %9 {
      // %13 = min(32, 5 - %arg0): extent of the input tile %14.
      %10 = muli %arg0, %c-1 : index
      %11 = addi %10, %c5 : index
      %12 = cmpi "slt", %c32, %11 : index
      %13 = select %12, %c32, %11 : index
      %14 = subview %1[%arg0, 0, 0] [%13, 1, 1] [1, 1, 1]  : memref<5x1x1xf32> to memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
      // %18 = min(32, 5 - %arg0) recomputed: extent of the output tile %19.
      %15 = muli %arg0, %c-1 : index
      %16 = addi %15, %c5 : index
      %17 = cmpi "slt", %c32, %16 : index
      %18 = select %17, %c32, %16 : index
      %19 = subview %0[%arg0] [%18] [1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      %20 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %21 = "gpu.block_dim"() {dimension = "x"} : () -> index
      // Parallel dimension strided by thread id; each reduction loop runs exactly
      // once (0 to 1 step 1).
      scf.for %arg1 = %20 to %13 step %21 {
        scf.for %arg2 = %c0 to %c1 step %c1 {
          scf.for %arg3 = %c0 to %c1 step %c1 {
            // %22: input element; %23: value currently stored in the output element.
            %22 = load %14[%arg1, %arg2, %arg3] : memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
            %23 = load %19[%arg1] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
            // %27: both reduction indices are 0 (first reduction step).
            %24 = cmpi "eq", %arg2, %c0 : index
            %25 = and %24, %true : i1
            %26 = cmpi "eq", %arg3, %c0 : index
            %27 = and %25, %26 : i1
            // Accumulator is the init value on the first step, else the stored output.
            %28 = select %27, %cst, %23 : f32
            // min(input element, accumulator), written back to the output element.
            %29 = cmpf "olt", %22, %28 : f32
            %30 = select %29, %22, %28 : f32
            store %30, %19[%arg1] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
          }
        }
      }
    }
    return
  }
  // One read-only input buffer and one write-only output buffer.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Dispatch module for @main_ex_dispatch_5. The module attribute records the
// SPIR-V target environment: version 1.3, the Shader capability, the
// SPV_KHR_storage_buffer_storage_class extension and the workgroup limits.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // Entry point; the ABI attribute requests a 32x1x1 workgroup (local_size).
  func @main_ex_dispatch_5() attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
    // 0x7F800000 is the f32 bit pattern of +infinity; it is the starting value
    // of the "keep the smaller one" selection in the second loop nest.
    %cst = constant 0x7F800000 : f32
    %cst_0 = constant 0.000000e+00 : f32
    %true = constant true
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c5 = constant 5 : index
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    // Buffer placeholders: %0 is the 5-element result bound to @ret0, %1 is the
    // 5x1x1 input bound to @arg0.
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
    // Outer-loop bounds shared by both nests: start at block_id.x * 32 and
    // advance by grid_dim.x * 32.
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = muli %2, %c32 : index
    %5 = muli %3, %c32 : index
    // First loop nest: stores 0.0 (%cst_0) into the result buffer, tile by tile.
    scf.for %arg0 = %4 to %c5 step %5 {
      // %9 = min(32, 5 - %arg0): the number of elements in this tile.
      %6 = muli %arg0, %c-1 : index
      %7 = addi %6, %c5 : index
      %8 = cmpi "slt", %c32, %7 : index
      %9 = select %8, %c32, %7 : index
      // View of %9 result elements starting at offset %arg0.
      %10 = subview %0[%arg0] [%9] [1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      // Inner loop runs from thread_id.x up to the tile size in steps of block_dim.x.
      %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %12 = "gpu.block_dim"() {dimension = "x"} : () -> index
      scf.for %arg1 = %11 to %9 step %12 {
        store %cst_0, %10[%arg1] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      }
    }
    // Second loop nest: same tiling, combining each input element into the
    // matching result element with an ordered "less than" select.
    scf.for %arg0 = %4 to %c5 step %5 {
      %6 = muli %arg0, %c-1 : index
      %7 = addi %6, %c5 : index
      %8 = cmpi "slt", %c32, %7 : index
      %9 = select %8, %c32, %7 : index
      // %10 views the input tile, %11 the result tile, both starting at %arg0.
      %10 = subview %1[%arg0, 0, 0] [%9, 1, 1] [1, 1, 1]  : memref<5x1x1xf32> to memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
      %11 = subview %0[%arg0] [%9] [1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
      %12 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %13 = "gpu.block_dim"() {dimension = "x"} : () -> index
      scf.for %arg1 = %12 to %9 step %13 {
        // %arg2 and %arg3 both run from 0 to 1 with step 1, i.e. only value 0.
        scf.for %arg2 = %c0 to %c1 step %c1 {
          scf.for %arg3 = %c0 to %c1 step %c1 {
            // %14 is the input element, %15 the value currently in the result.
            %14 = load %10[%arg1, %arg2, %arg3] : memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
            %15 = load %11[%arg1] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
            // %19 is true when %arg2 == 0 and %arg3 == 0; in that case +inf
            // (%cst) is used instead of the loaded %15.
            %16 = cmpi "eq", %arg2, %c0 : index
            %17 = and %16, %true : i1
            %18 = cmpi "eq", %arg3, %c0 : index
            %19 = and %17, %18 : i1
            %20 = select %19, %cst, %15 : f32
            // %22 = %14 if %14 < %20 (ordered compare), otherwise %20.
            %21 = cmpf "olt", %14, %20 : f32
            %22 = select %21, %14, %20 : f32
            store %22, %11[%arg1] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
          }
        }
      }
    }
    return
  }
  // I/O interface: @arg0 is binding 0 (Read), @ret0 is binding 1 (Write|Discard).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ResolveShapeOpsPass ***
// Entry point printed on its own (no enclosing module in this dump). The ABI
// attribute requests a 32x1x1 workgroup.
func @main_ex_dispatch_5() attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
  // %cst is +infinity as an f32 bit pattern; %cst_0 is the fill value 0.0.
  %cst = constant 0x7F800000 : f32
  %cst_0 = constant 0.000000e+00 : f32
  %true = constant true
  %c32 = constant 32 : index
  %c-1 = constant -1 : index
  %c5 = constant 5 : index
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  // %0: 5-element result buffer (@ret0). %1: 5x1x1 input buffer (@arg0).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
  // %4 = block_id.x * 32 is the outer-loop start, %5 = grid_dim.x * 32 its step.
  %2 = "gpu.block_id"() {dimension = "x"} : () -> index
  %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
  %4 = muli %2, %c32 : index
  %5 = muli %3, %c32 : index
  // Loop nest 1: fill the result buffer with 0.0.
  scf.for %arg0 = %4 to %c5 step %5 {
    // Tile size %9 = min(32, 5 - %arg0).
    %6 = muli %arg0, %c-1 : index
    %7 = addi %6, %c5 : index
    %8 = cmpi "slt", %c32, %7 : index
    %9 = select %8, %c32, %7 : index
    %10 = subview %0[%arg0] [%9] [1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
    // Threads stride over the tile: start thread_id.x, step block_dim.x.
    %11 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %12 = "gpu.block_dim"() {dimension = "x"} : () -> index
    scf.for %arg1 = %11 to %9 step %12 {
      store %cst_0, %10[%arg1] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
    }
  }
  // Loop nest 2: per element, keep the smaller of the input value and the
  // running value, where the running value starts at +infinity.
  scf.for %arg0 = %4 to %c5 step %5 {
    %6 = muli %arg0, %c-1 : index
    %7 = addi %6, %c5 : index
    %8 = cmpi "slt", %c32, %7 : index
    %9 = select %8, %c32, %7 : index
    // Tile views of the input (%10) and the result (%11) at offset %arg0.
    %10 = subview %1[%arg0, 0, 0] [%9, 1, 1] [1, 1, 1]  : memref<5x1x1xf32> to memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
    %11 = subview %0[%arg0] [%9] [1]  : memref<5xf32> to memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
    %12 = "gpu.thread_id"() {dimension = "x"} : () -> index
    %13 = "gpu.block_dim"() {dimension = "x"} : () -> index
    scf.for %arg1 = %12 to %9 step %13 {
      // The two inner loops each have bounds 0 to 1 step 1 (a single iteration).
      scf.for %arg2 = %c0 to %c1 step %c1 {
        scf.for %arg3 = %c0 to %c1 step %c1 {
          %14 = load %10[%arg1, %arg2, %arg3] : memref<?x1x1xf32, affine_map<(d0, d1, d2)[s0] -> (d0 + s0 + d1 + d2)>>
          %15 = load %11[%arg1] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
          // On the iteration with %arg2 == 0 and %arg3 == 0 the loaded result
          // %15 is replaced by +inf (%cst).
          %16 = cmpi "eq", %arg2, %c0 : index
          %17 = and %16, %true : i1
          %18 = cmpi "eq", %arg3, %c0 : index
          %19 = and %17, %18 : i1
          %20 = select %19, %cst, %15 : f32
          // Ordered less-than, then select the smaller operand and write it back.
          %21 = cmpf "olt", %14, %20 : f32
          %22 = select %21, %14, %20 : f32
          store %22, %11[%arg1] : memref<?xf32, affine_map<(d0)[s0] -> (d0 + s0)>>
        }
      }
    }
  }
  return
}

// *** IR Dump After LegalizeStandardForSPIRV ***
// Dispatch module targeting SPIR-V 1.3 with the Shader capability and the
// SPV_KHR_storage_buffer_storage_class extension. In this dump every load and
// store indexes the placeholder buffers directly (no subview ops are present).
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // Entry point with a requested workgroup size of 32x1x1.
  func @main_ex_dispatch_5() attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
    // +infinity (f32 bit pattern) and the 0.0 fill value.
    %cst = constant 0x7F800000 : f32
    %cst_0 = constant 0.000000e+00 : f32
    %true = constant true
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c5 = constant 5 : index
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    // %0 is the result buffer (@ret0), %1 the input buffer (@arg0).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
    // Outer loops start at block_id.x * 32 and step by grid_dim.x * 32.
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = muli %2, %c32 : index
    %5 = muli %3, %c32 : index
    // Loop nest 1: zero-fill of the result buffer.
    scf.for %arg0 = %4 to %c5 step %5 {
      // %9 = min(32, 5 - %arg0) bounds the inner loop.
      %6 = muli %arg0, %c-1 : index
      %7 = addi %6, %c5 : index
      %8 = cmpi "slt", %c32, %7 : index
      %9 = select %8, %c32, %7 : index
      %10 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %11 = "gpu.block_dim"() {dimension = "x"} : () -> index
      scf.for %arg1 = %10 to %9 step %11 {
        // Absolute element index = tile offset + position inside the tile.
        %12 = addi %arg0, %arg1 : index
        store %cst_0, %0[%12] : memref<5xf32>
      }
    }
    // Loop nest 2: element-wise "keep the smaller value" update of the result.
    scf.for %arg0 = %4 to %c5 step %5 {
      %6 = muli %arg0, %c-1 : index
      %7 = addi %6, %c5 : index
      %8 = cmpi "slt", %c32, %7 : index
      %9 = select %8, %c32, %7 : index
      %10 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %11 = "gpu.block_dim"() {dimension = "x"} : () -> index
      scf.for %arg1 = %10 to %9 step %11 {
        // Both inner loops iterate only over the value 0 (0 to 1, step 1).
        scf.for %arg2 = %c0 to %c1 step %c1 {
          scf.for %arg3 = %c0 to %c1 step %c1 {
            // %12, %14 and %23 all compute the same index %arg0 + %arg1.
            %12 = addi %arg0, %arg1 : index
            %13 = load %1[%12, %arg2, %arg3] : memref<5x1x1xf32>
            %14 = addi %arg0, %arg1 : index
            %15 = load %0[%14] : memref<5xf32>
            // When %arg2 == 0 and %arg3 == 0, use +inf instead of the loaded %15.
            %16 = cmpi "eq", %arg2, %c0 : index
            %17 = and %16, %true : i1
            %18 = cmpi "eq", %arg3, %c0 : index
            %19 = and %17, %18 : i1
            %20 = select %19, %cst, %15 : f32
            // %22 = %13 if %13 < %20 (ordered), else %20.
            %21 = cmpf "olt", %13, %20 : f32
            %22 = select %21, %13, %20 : f32
            %23 = addi %arg0, %arg1 : index
            store %22, %0[%23] : memref<5xf32>
          }
        }
      }
    }
    return
  }
  // I/O interface: @arg0 is binding 0 (Read), @ret0 is binding 1 (Write|Discard).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After Canonicalizer ***
// Dispatch module for @main_ex_dispatch_5 with its SPIR-V target environment
// (v1.3, Shader, SPV_KHR_storage_buffer_storage_class, workgroup limits).
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // Entry point; local_size asks for 32 invocations along x.
  func @main_ex_dispatch_5() attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
    // %cst: f32 +infinity. %cst_0: 0.0 used by the fill loop.
    %cst = constant 0x7F800000 : f32
    %cst_0 = constant 0.000000e+00 : f32
    %true = constant true
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c5 = constant 5 : index
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    // Result (@ret0, 5 elements) and input (@arg0, 5x1x1) buffers.
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
    // %4 (block_id.x * 32) and %5 (grid_dim.x * 32) are the outer-loop start and step.
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = muli %2, %c32 : index
    %5 = muli %3, %c32 : index
    // Fill loop: result[%arg0 + %arg1] = 0.0.
    scf.for %arg0 = %4 to %c5 step %5 {
      // Inner upper bound %9 = min(32, 5 - %arg0).
      %6 = muli %arg0, %c-1 : index
      %7 = addi %6, %c5 : index
      %8 = cmpi "slt", %c32, %7 : index
      %9 = select %8, %c32, %7 : index
      %10 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %11 = "gpu.block_dim"() {dimension = "x"} : () -> index
      scf.for %arg1 = %10 to %9 step %11 {
        %12 = addi %arg0, %arg1 : index
        store %cst_0, %0[%12] : memref<5xf32>
      }
    }
    // Update loop: result[i] = smaller of input[i, 0, 0] and a running value
    // that starts at +infinity, with i = %arg0 + %arg1.
    scf.for %arg0 = %4 to %c5 step %5 {
      %6 = muli %arg0, %c-1 : index
      %7 = addi %6, %c5 : index
      %8 = cmpi "slt", %c32, %7 : index
      %9 = select %8, %c32, %7 : index
      %10 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %11 = "gpu.block_dim"() {dimension = "x"} : () -> index
      scf.for %arg1 = %10 to %9 step %11 {
        // Single-iteration loops: %arg2 and %arg3 only take the value 0.
        scf.for %arg2 = %c0 to %c1 step %c1 {
          scf.for %arg3 = %c0 to %c1 step %c1 {
            // The index %arg0 + %arg1 is computed three times (%12, %14, %23).
            %12 = addi %arg0, %arg1 : index
            %13 = load %1[%12, %arg2, %arg3] : memref<5x1x1xf32>
            %14 = addi %arg0, %arg1 : index
            %15 = load %0[%14] : memref<5xf32>
            // %19: both inner indices are 0, so +inf replaces the loaded %15.
            %16 = cmpi "eq", %arg2, %c0 : index
            %17 = and %16, %true : i1
            %18 = cmpi "eq", %arg3, %c0 : index
            %19 = and %17, %18 : i1
            %20 = select %19, %cst, %15 : f32
            // Ordered less-than compare followed by a select of the smaller value.
            %21 = cmpf "olt", %13, %20 : f32
            %22 = select %21, %13, %20 : f32
            %23 = addi %arg0, %arg1 : index
            store %22, %0[%23] : memref<5xf32>
          }
        }
      }
    }
    return
  }
  // I/O interface: @arg0 is binding 0 (Read), @ret0 is binding 1 (Write|Discard).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Dispatch module for @main_ex_dispatch_5 (SPIR-V 1.3 target environment).
// Here the element index %arg0 + %arg1 is computed once per iteration (%12)
// and shared by the loads and the store.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // Entry point with a requested 32x1x1 workgroup.
  func @main_ex_dispatch_5() attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
    // f32 +infinity (0x7F800000) and the 0.0 fill value.
    %cst = constant 0x7F800000 : f32
    %cst_0 = constant 0.000000e+00 : f32
    %true = constant true
    %c32 = constant 32 : index
    %c-1 = constant -1 : index
    %c5 = constant 5 : index
    %c0 = constant 0 : index
    %c1 = constant 1 : index
    // %0: result buffer bound to @ret0. %1: input buffer bound to @arg0.
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xf32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5x1x1xf32>
    // Outer loops: start block_id.x * 32, step grid_dim.x * 32, upper bound 5.
    %2 = "gpu.block_id"() {dimension = "x"} : () -> index
    %3 = "gpu.grid_dim"() {dimension = "x"} : () -> index
    %4 = muli %2, %c32 : index
    %5 = muli %3, %c32 : index
    // Loop nest 1: store 0.0 to each addressed result element.
    scf.for %arg0 = %4 to %c5 step %5 {
      // %9 = min(32, 5 - %arg0).
      %6 = muli %arg0, %c-1 : index
      %7 = addi %6, %c5 : index
      %8 = cmpi "slt", %c32, %7 : index
      %9 = select %8, %c32, %7 : index
      %10 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %11 = "gpu.block_dim"() {dimension = "x"} : () -> index
      scf.for %arg1 = %10 to %9 step %11 {
        %12 = addi %arg0, %arg1 : index
        store %cst_0, %0[%12] : memref<5xf32>
      }
    }
    // Loop nest 2: write the smaller of the input element and the running
    // value (initially +infinity) to the result element.
    scf.for %arg0 = %4 to %c5 step %5 {
      %6 = muli %arg0, %c-1 : index
      %7 = addi %6, %c5 : index
      %8 = cmpi "slt", %c32, %7 : index
      %9 = select %8, %c32, %7 : index
      %10 = "gpu.thread_id"() {dimension = "x"} : () -> index
      %11 = "gpu.block_dim"() {dimension = "x"} : () -> index
      scf.for %arg1 = %10 to %9 step %11 {
        // %arg2 and %arg3 each iterate once (bounds 0 to 1, step 1).
        scf.for %arg2 = %c0 to %c1 step %c1 {
          scf.for %arg3 = %c0 to %c1 step %c1 {
            // %12 indexes both the input (first dimension) and the result.
            %12 = addi %arg0, %arg1 : index
            %13 = load %1[%12, %arg2, %arg3] : memref<5x1x1xf32>
            %14 = load %0[%12] : memref<5xf32>
            // %18 is true when both inner indices are 0; then %19 is +inf
            // rather than the value loaded from the result buffer.
            %15 = cmpi "eq", %arg2, %c0 : index
            %16 = and %15, %true : i1
            %17 = cmpi "eq", %arg3, %c0 : index
            %18 = and %16, %17 : i1
            %19 = select %18, %cst, %14 : f32
            // %21 = %13 if %13 < %19 (ordered), else %19.
            %20 = cmpf "olt", %13, %19 : f32
            %21 = select %20, %13, %19 : f32
            store %21, %0[%12] : memref<5xf32>
          }
        }
      }
    }
    return
  }
  // I/O interface: @arg0 is binding 0 (Read), @ret0 is binding 1 (Write|Discard).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertToSPIRVPass ***
// Outer module holding the SPIR-V dialect form of @main_ex_dispatch_5 plus the
// hal.interface that names the two buffer bindings.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  // SPIR-V module using the Logical addressing model and GLSL450 memory model.
  spv.module Logical GLSL450 {
    // Built-in inputs: invocation id within the workgroup, number of
    // workgroups, and workgroup id.
    spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
    spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
    spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
    // Storage buffers at (set 0, binding 0) and (set 0, binding 1). Both are
    // structs wrapping a 5-element f32 array with a 4-byte stride. Per the
    // hal.interface below, binding 0 is @arg0 and binding 1 is @ret0.
    spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    spv.func @main_ex_dispatch_5() "None" attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
      // %0 is the f32 bit pattern of +infinity; %1 is the 0.0 fill value.
      %0 = spv.constant 0x7F800000 : f32
      %1 = spv.constant 0.000000e+00 : f32
      %2 = spv.constant true
      %3 = spv.constant 32 : i32
      %4 = spv.constant -1 : i32
      %5 = spv.constant 5 : i32
      %6 = spv.constant 0 : i32
      %7 = spv.constant 1 : i32
      // %8 points at the binding-1 buffer (stored to below), %9 at binding 0
      // (only loaded from).
      %8 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
      %9 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
      // %16 = WorkgroupId.x * 32 (outer-loop start), %17 = NumWorkgroups.x * 32
      // (outer-loop step).
      %10 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
      %11 = spv.Load "Input" %10 : vector<3xi32>
      %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
      %13 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
      %14 = spv.Load "Input" %13 : vector<3xi32>
      %15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
      %16 = spv.IMul %12, %3 : i32
      %17 = spv.IMul %15, %3 : i32
      // First structured loop: stores 0.0 into the binding-1 buffer.
      // ^bb1 is the header (its block argument is the induction value), ^bb2
      // the body and ^bb3 the merge block.
      spv.loop {
        spv.Branch ^bb1(%16 : i32)
      ^bb1(%18: i32):  // 2 preds: ^bb0, ^bb2
        %19 = spv.SLessThan %18, %5 : i32
        spv.BranchConditional %19, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // %23 = min(32, 5 - %18): upper bound of the inner loop.
        %20 = spv.IMul %18, %4 : i32
        %21 = spv.IAdd %20, %5 : i32
        %22 = spv.SLessThan %3, %21 : i32
        %23 = spv.Select %22, %3, %21 : i1, i32
        // Inner loop starts at LocalInvocationId.x (%26) and steps by 32 (%27).
        %24 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
        %25 = spv.Load "Input" %24 : vector<3xi32>
        %26 = spv.CompositeExtract %25[0 : i32] : vector<3xi32>
        %27 = spv.constant 32 : i32
        spv.loop {
          spv.Branch ^bb1(%26 : i32)
        ^bb1(%29: i32):  // 2 preds: ^bb0, ^bb2
          %30 = spv.SLessThan %29, %23 : i32
          spv.BranchConditional %30, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Element index %36 = 0 + 1 * (%18 + %29); %32 selects struct member 0.
          %31 = spv.IAdd %18, %29 : i32
          %32 = spv.constant 0 : i32
          %33 = spv.constant 0 : i32
          %34 = spv.constant 1 : i32
          %35 = spv.IMul %34, %31 : i32
          %36 = spv.IAdd %33, %35 : i32
          %37 = spv.AccessChain %8[%32, %36] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
          spv.Store "StorageBuffer" %37, %1 : f32
          %38 = spv.IAdd %29, %27 : i32
          spv.Branch ^bb1(%38 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        // Advance the outer induction value by the step and loop back.
        %28 = spv.IAdd %18, %17 : i32
        spv.Branch ^bb1(%28 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      // Second structured loop: for each element, store the smaller of the
      // binding-0 value and a running value that starts at +infinity.
      spv.loop {
        spv.Branch ^bb1(%16 : i32)
      ^bb1(%18: i32):  // 2 preds: ^bb0, ^bb2
        %19 = spv.SLessThan %18, %5 : i32
        spv.BranchConditional %19, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Same inner bound as above: %23 = min(32, 5 - %18).
        %20 = spv.IMul %18, %4 : i32
        %21 = spv.IAdd %20, %5 : i32
        %22 = spv.SLessThan %3, %21 : i32
        %23 = spv.Select %22, %3, %21 : i1, i32
        %24 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
        %25 = spv.Load "Input" %24 : vector<3xi32>
        %26 = spv.CompositeExtract %25[0 : i32] : vector<3xi32>
        %27 = spv.constant 32 : i32
        spv.loop {
          spv.Branch ^bb1(%26 : i32)
        ^bb1(%29: i32):  // 2 preds: ^bb0, ^bb2
          %30 = spv.SLessThan %29, %23 : i32
          spv.BranchConditional %30, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          // Two nested loops over %32 and %35, each from 0 while < 1, step 1.
          spv.loop {
            spv.Branch ^bb1(%6 : i32)
          ^bb1(%32: i32):  // 2 preds: ^bb0, ^bb2
            %33 = spv.SLessThan %32, %7 : i32
            spv.BranchConditional %33, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            spv.loop {
              spv.Branch ^bb1(%6 : i32)
            ^bb1(%35: i32):  // 2 preds: ^bb0, ^bb2
              %36 = spv.SLessThan %35, %7 : i32
              spv.BranchConditional %36, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              // Input index %48 = 0 + 1 * (%18 + %29) + 1 * %32 + 1 * %35.
              %37 = spv.IAdd %18, %29 : i32
              %38 = spv.constant 0 : i32
              %39 = spv.constant 0 : i32
              %40 = spv.constant 1 : i32
              %41 = spv.IMul %40, %37 : i32
              %42 = spv.IAdd %39, %41 : i32
              %43 = spv.constant 1 : i32
              %44 = spv.IMul %43, %32 : i32
              %45 = spv.IAdd %42, %44 : i32
              %46 = spv.constant 1 : i32
              %47 = spv.IMul %46, %35 : i32
              %48 = spv.IAdd %45, %47 : i32
              %49 = spv.AccessChain %9[%38, %48] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
              %50 = spv.Load "StorageBuffer" %49 : f32
              // Current result value at index %55 = 0 + 1 * (%18 + %29).
              %51 = spv.constant 0 : i32
              %52 = spv.constant 0 : i32
              %53 = spv.constant 1 : i32
              %54 = spv.IMul %53, %37 : i32
              %55 = spv.IAdd %52, %54 : i32
              %56 = spv.AccessChain %8[%51, %55] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
              %57 = spv.Load "StorageBuffer" %56 : f32
              // %61 is true when %32 == 0 and %35 == 0; then +inf (%0) is used
              // in place of the loaded %57.
              %58 = spv.IEqual %32, %6 : i32
              %59 = spv.LogicalAnd %58, %2 : i1
              %60 = spv.IEqual %35, %6 : i32
              %61 = spv.LogicalAnd %59, %60 : i1
              %62 = spv.Select %61, %0, %57 : i1, f32
              // %64 = %50 if %50 < %62 (ordered compare), else %62.
              %63 = spv.FOrdLessThan %50, %62 : f32
              %64 = spv.Select %63, %50, %62 : i1, f32
              // Store back to the same result index (%69 equals %55).
              %65 = spv.constant 0 : i32
              %66 = spv.constant 0 : i32
              %67 = spv.constant 1 : i32
              %68 = spv.IMul %67, %37 : i32
              %69 = spv.IAdd %66, %68 : i32
              %70 = spv.AccessChain %8[%65, %69] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
              spv.Store "StorageBuffer" %70, %64 : f32
              %71 = spv.IAdd %35, %7 : i32
              spv.Branch ^bb1(%71 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %34 = spv.IAdd %32, %7 : i32
            spv.Branch ^bb1(%34 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %31 = spv.IAdd %29, %27 : i32
          spv.Branch ^bb1(%31 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %28 = spv.IAdd %18, %17 : i32
        spv.Branch ^bb1(%28 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      spv.Return
    }
  }
  // I/O interface: @arg0 is binding 0 (Read), @ret0 is binding 1 (Write|Discard).
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After SPIRVLowerABIAttributes ***
// Stand-alone SPIR-V module (Logical addressing, GLSL450 memory model). The
// entry point and its workgroup size are declared by the spv.EntryPoint and
// spv.ExecutionMode ops at the end of the module.
spv.module Logical GLSL450 {
  // Built-in input variables read by the function.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Storage buffers at (set 0, binding 0) and (set 0, binding 1); each is a
  // struct wrapping a 5-element f32 array with a 4-byte stride.
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_5() "None" {
    // %0: f32 +infinity bit pattern. %1: 0.0 fill value.
    %0 = spv.constant 0x7F800000 : f32
    %1 = spv.constant 0.000000e+00 : f32
    %2 = spv.constant true
    %3 = spv.constant 32 : i32
    %4 = spv.constant -1 : i32
    %5 = spv.constant 5 : i32
    %6 = spv.constant 0 : i32
    %7 = spv.constant 1 : i32
    // %8: binding-1 buffer (the one written). %9: binding-0 buffer (only read).
    %8 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    %9 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    // Outer-loop start %16 = WorkgroupId.x * 32, step %17 = NumWorkgroups.x * 32.
    %10 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %11 = spv.Load "Input" %10 : vector<3xi32>
    %12 = spv.CompositeExtract %11[0 : i32] : vector<3xi32>
    %13 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %14 = spv.Load "Input" %13 : vector<3xi32>
    %15 = spv.CompositeExtract %14[0 : i32] : vector<3xi32>
    %16 = spv.IMul %12, %3 : i32
    %17 = spv.IMul %15, %3 : i32
    // Loop 1: zero-fill of the binding-1 buffer. In each spv.loop, ^bb1 is the
    // header carrying the induction value, ^bb2 the body, ^bb3 the merge block.
    spv.loop {
      spv.Branch ^bb1(%16 : i32)
    ^bb1(%18: i32):  // 2 preds: ^bb0, ^bb2
      %19 = spv.SLessThan %18, %5 : i32
      spv.BranchConditional %19, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Inner-loop bound %23 = min(32, 5 - %18).
      %20 = spv.IMul %18, %4 : i32
      %21 = spv.IAdd %20, %5 : i32
      %22 = spv.SLessThan %3, %21 : i32
      %23 = spv.Select %22, %3, %21 : i1, i32
      // Inner loop: start LocalInvocationId.x (%26), step 32 (%27).
      %24 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %25 = spv.Load "Input" %24 : vector<3xi32>
      %26 = spv.CompositeExtract %25[0 : i32] : vector<3xi32>
      %27 = spv.constant 32 : i32
      spv.loop {
        spv.Branch ^bb1(%26 : i32)
      ^bb1(%29: i32):  // 2 preds: ^bb0, ^bb2
        %30 = spv.SLessThan %29, %23 : i32
        spv.BranchConditional %30, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Store 0.0 at element %36 = 0 + 1 * (%18 + %29) of struct member 0.
        %31 = spv.IAdd %18, %29 : i32
        %32 = spv.constant 0 : i32
        %33 = spv.constant 0 : i32
        %34 = spv.constant 1 : i32
        %35 = spv.IMul %34, %31 : i32
        %36 = spv.IAdd %33, %35 : i32
        %37 = spv.AccessChain %8[%32, %36] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
        spv.Store "StorageBuffer" %37, %1 : f32
        %38 = spv.IAdd %29, %27 : i32
        spv.Branch ^bb1(%38 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %28 = spv.IAdd %18, %17 : i32
      spv.Branch ^bb1(%28 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    // Loop 2: per element, store the smaller of the binding-0 value and a
    // running value that starts at +infinity.
    spv.loop {
      spv.Branch ^bb1(%16 : i32)
    ^bb1(%18: i32):  // 2 preds: ^bb0, ^bb2
      %19 = spv.SLessThan %18, %5 : i32
      spv.BranchConditional %19, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      %20 = spv.IMul %18, %4 : i32
      %21 = spv.IAdd %20, %5 : i32
      %22 = spv.SLessThan %3, %21 : i32
      %23 = spv.Select %22, %3, %21 : i1, i32
      %24 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %25 = spv.Load "Input" %24 : vector<3xi32>
      %26 = spv.CompositeExtract %25[0 : i32] : vector<3xi32>
      %27 = spv.constant 32 : i32
      spv.loop {
        spv.Branch ^bb1(%26 : i32)
      ^bb1(%29: i32):  // 2 preds: ^bb0, ^bb2
        %30 = spv.SLessThan %29, %23 : i32
        spv.BranchConditional %30, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Nested loops over %32 and %35; each starts at 0, runs while < 1 and
        // increments by 1.
        spv.loop {
          spv.Branch ^bb1(%6 : i32)
        ^bb1(%32: i32):  // 2 preds: ^bb0, ^bb2
          %33 = spv.SLessThan %32, %7 : i32
          spv.BranchConditional %33, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          spv.loop {
            spv.Branch ^bb1(%6 : i32)
          ^bb1(%35: i32):  // 2 preds: ^bb0, ^bb2
            %36 = spv.SLessThan %35, %7 : i32
            spv.BranchConditional %36, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // Load the input at index %48 = 0 + 1 * (%18 + %29) + 1 * %32 + 1 * %35.
            %37 = spv.IAdd %18, %29 : i32
            %38 = spv.constant 0 : i32
            %39 = spv.constant 0 : i32
            %40 = spv.constant 1 : i32
            %41 = spv.IMul %40, %37 : i32
            %42 = spv.IAdd %39, %41 : i32
            %43 = spv.constant 1 : i32
            %44 = spv.IMul %43, %32 : i32
            %45 = spv.IAdd %42, %44 : i32
            %46 = spv.constant 1 : i32
            %47 = spv.IMul %46, %35 : i32
            %48 = spv.IAdd %45, %47 : i32
            %49 = spv.AccessChain %9[%38, %48] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
            %50 = spv.Load "StorageBuffer" %49 : f32
            // Load the current result at index %55 = 0 + 1 * (%18 + %29).
            %51 = spv.constant 0 : i32
            %52 = spv.constant 0 : i32
            %53 = spv.constant 1 : i32
            %54 = spv.IMul %53, %37 : i32
            %55 = spv.IAdd %52, %54 : i32
            %56 = spv.AccessChain %8[%51, %55] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
            %57 = spv.Load "StorageBuffer" %56 : f32
            // With %32 == 0 and %35 == 0 (%61 true), +inf (%0) replaces %57.
            %58 = spv.IEqual %32, %6 : i32
            %59 = spv.LogicalAnd %58, %2 : i1
            %60 = spv.IEqual %35, %6 : i32
            %61 = spv.LogicalAnd %59, %60 : i1
            %62 = spv.Select %61, %0, %57 : i1, f32
            // Keep %50 if %50 < %62 (ordered compare), otherwise keep %62.
            %63 = spv.FOrdLessThan %50, %62 : f32
            %64 = spv.Select %63, %50, %62 : i1, f32
            // Write back to the same result index (%69 equals %55).
            %65 = spv.constant 0 : i32
            %66 = spv.constant 0 : i32
            %67 = spv.constant 1 : i32
            %68 = spv.IMul %67, %37 : i32
            %69 = spv.IAdd %66, %68 : i32
            %70 = spv.AccessChain %8[%65, %69] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
            spv.Store "StorageBuffer" %70, %64 : f32
            %71 = spv.IAdd %35, %7 : i32
            spv.Branch ^bb1(%71 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %34 = spv.IAdd %32, %7 : i32
          spv.Branch ^bb1(%34 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %31 = spv.IAdd %29, %27 : i32
        spv.Branch ^bb1(%31 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %28 = spv.IAdd %18, %17 : i32
      spv.Branch ^bb1(%28 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // GLCompute entry point with its three built-in interface variables, and a
  // LocalSize execution mode of 32 x 1 x 1.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_5, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_5 "LocalSize", 32, 1, 1
}

// *** IR Dump After Canonicalizer ***
// Stand-alone SPIR-V module (Logical addressing, GLSL450 memory model) for the
// GLCompute entry point @main_ex_dispatch_5. In this dump the access chains
// take the element index directly, with no multiply/add by constant 1 or 0.
spv.module Logical GLSL450 {
  // Built-in inputs used to derive loop starts and steps.
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  // Two storage buffers in set 0 (bindings 0 and 1), each a struct wrapping a
  // 5-element f32 array with a 4-byte stride.
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_5() "None" {
    // %0: f32 +infinity bit pattern. %1: 0.0 fill value.
    %0 = spv.constant 0x7F800000 : f32
    %1 = spv.constant 0.000000e+00 : f32
    %2 = spv.constant -1 : i32
    %3 = spv.constant 5 : i32
    %4 = spv.constant 1 : i32
    %5 = spv.constant 32 : i32
    %6 = spv.constant 0 : i32
    // %7: binding-1 buffer (stored to by both loops). %8: binding-0 buffer
    // (only loaded from).
    %7 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    %8 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    // Outer-loop start %15 = WorkgroupId.x * 32, step %16 = NumWorkgroups.x * 32.
    %9 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %10 = spv.Load "Input" %9 : vector<3xi32>
    %11 = spv.CompositeExtract %10[0 : i32] : vector<3xi32>
    %12 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %13 = spv.Load "Input" %12 : vector<3xi32>
    %14 = spv.CompositeExtract %13[0 : i32] : vector<3xi32>
    %15 = spv.IMul %11, %5 : i32
    %16 = spv.IMul %14, %5 : i32
    // Loop 1: store 0.0 into the binding-1 buffer. ^bb1 is the loop header
    // (block argument = induction value), ^bb2 the body, ^bb3 the merge block.
    spv.loop {
      spv.Branch ^bb1(%15 : i32)
    ^bb1(%17: i32):  // 2 preds: ^bb0, ^bb2
      %18 = spv.SLessThan %17, %3 : i32
      spv.BranchConditional %18, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // Inner-loop bound %22 = min(32, 5 - %17).
      %19 = spv.IMul %17, %2 : i32
      %20 = spv.IAdd %19, %3 : i32
      %21 = spv.SLessThan %5, %20 : i32
      %22 = spv.Select %21, %5, %20 : i1, i32
      // Inner loop starts at LocalInvocationId.x (%25) and steps by 32 (%5).
      %23 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %24 = spv.Load "Input" %23 : vector<3xi32>
      %25 = spv.CompositeExtract %24[0 : i32] : vector<3xi32>
      spv.loop {
        spv.Branch ^bb1(%25 : i32)
      ^bb1(%27: i32):  // 2 preds: ^bb0, ^bb2
        %28 = spv.SLessThan %27, %22 : i32
        spv.BranchConditional %28, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Element %17 + %27 of struct member 0 receives 0.0.
        %29 = spv.IAdd %17, %27 : i32
        %30 = spv.AccessChain %7[%6, %29] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
        spv.Store "StorageBuffer" %30, %1 : f32
        %31 = spv.IAdd %27, %5 : i32
        spv.Branch ^bb1(%31 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %26 = spv.IAdd %17, %16 : i32
      spv.Branch ^bb1(%26 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    // Loop 2: per element, store the smaller of the binding-0 value and a
    // running value that starts at +infinity.
    spv.loop {
      spv.Branch ^bb1(%15 : i32)
    ^bb1(%17: i32):  // 2 preds: ^bb0, ^bb2
      %18 = spv.SLessThan %17, %3 : i32
      spv.BranchConditional %18, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      %19 = spv.IMul %17, %2 : i32
      %20 = spv.IAdd %19, %3 : i32
      %21 = spv.SLessThan %5, %20 : i32
      %22 = spv.Select %21, %5, %20 : i1, i32
      %23 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %24 = spv.Load "Input" %23 : vector<3xi32>
      %25 = spv.CompositeExtract %24[0 : i32] : vector<3xi32>
      spv.loop {
        spv.Branch ^bb1(%25 : i32)
      ^bb1(%27: i32):  // 2 preds: ^bb0, ^bb2
        %28 = spv.SLessThan %27, %22 : i32
        spv.BranchConditional %28, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        // Nested loops over %30 and %33; each starts at 0, runs while < 1 and
        // increments by 1.
        spv.loop {
          spv.Branch ^bb1(%6 : i32)
        ^bb1(%30: i32):  // 2 preds: ^bb0, ^bb2
          %31 = spv.SLessThan %30, %4 : i32
          spv.BranchConditional %31, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          spv.loop {
            spv.Branch ^bb1(%6 : i32)
          ^bb1(%33: i32):  // 2 preds: ^bb0, ^bb2
            %34 = spv.SLessThan %33, %4 : i32
            spv.BranchConditional %34, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // %35 = %17 + %27 indexes the result; %37 = %35 + %30 + %33 indexes
            // the input.
            %35 = spv.IAdd %17, %27 : i32
            %36 = spv.IAdd %35, %30 : i32
            %37 = spv.IAdd %36, %33 : i32
            %38 = spv.AccessChain %8[%6, %37] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
            %39 = spv.Load "StorageBuffer" %38 : f32
            %40 = spv.AccessChain %7[%6, %35] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
            %41 = spv.Load "StorageBuffer" %40 : f32
            // %44 is true when %30 == 0 and %33 == 0; then +inf (%0) is used
            // in place of the loaded %41.
            %42 = spv.IEqual %30, %6 : i32
            %43 = spv.IEqual %33, %6 : i32
            %44 = spv.LogicalAnd %42, %43 : i1
            %45 = spv.Select %44, %0, %41 : i1, f32
            // %47 = %39 if %39 < %45 (ordered compare), else %45.
            %46 = spv.FOrdLessThan %39, %45 : f32
            %47 = spv.Select %46, %39, %45 : i1, f32
            // %48 is the same access chain as %40 (same base and indices).
            %48 = spv.AccessChain %7[%6, %35] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
            spv.Store "StorageBuffer" %48, %47 : f32
            %49 = spv.IAdd %33, %4 : i32
            spv.Branch ^bb1(%49 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %32 = spv.IAdd %30, %4 : i32
          spv.Branch ^bb1(%32 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %29 = spv.IAdd %27, %5 : i32
        spv.Branch ^bb1(%29 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %26 = spv.IAdd %17, %16 : i32
      spv.Branch ^bb1(%26 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // GLCompute entry point with its three built-in interface variables, and a
  // LocalSize execution mode of 32 x 1 x 1.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_5, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_5 "LocalSize", 32, 1, 1
}

// *** IR Dump After CSE ***
// SPIR-V module for the @main_ex_dispatch_5 compute kernel.
// Two StorageBuffer resources are declared, each a struct wrapping a
// 5-element f32 array: binding (0, 0) is only loaded from and binding (0, 1)
// is both loaded from and stored to in the function below.
spv.module Logical GLSL450 {
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_5() "None" {
    // 0x7F800000 is the IEEE-754 single-precision bit pattern of +infinity.
    %0 = spv.constant 0x7F800000 : f32
    %1 = spv.constant 0.000000e+00 : f32
    %2 = spv.constant -1 : i32
    %3 = spv.constant 5 : i32
    %4 = spv.constant 1 : i32
    %5 = spv.constant 32 : i32
    %6 = spv.constant 0 : i32
    %7 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    %8 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    %9 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %10 = spv.Load "Input" %9 : vector<3xi32>
    %11 = spv.CompositeExtract %10[0 : i32] : vector<3xi32>
    %12 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %13 = spv.Load "Input" %12 : vector<3xi32>
    %14 = spv.CompositeExtract %13[0 : i32] : vector<3xi32>
    // %15 = WorkgroupId.x * 32 is the outer-loop start; %16 = NumWorkgroups.x * 32 is its step.
    %15 = spv.IMul %11, %5 : i32
    %16 = spv.IMul %14, %5 : i32
    // First loop nest: stores 0.0 into binding (0, 1) at index %17 + %27.
    // Outer loop: %17 runs from %15 while %17 < 5, advancing by %16.
    spv.loop {
      spv.Branch ^bb1(%15 : i32)
    ^bb1(%17: i32):  // 2 preds: ^bb0, ^bb2
      %18 = spv.SLessThan %17, %3 : i32
      spv.BranchConditional %18, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // %20 = 5 - %17; %22 = (32 < %20) ? 32 : %20, the inner-loop upper bound.
      %19 = spv.IMul %17, %2 : i32
      %20 = spv.IAdd %19, %3 : i32
      %21 = spv.SLessThan %5, %20 : i32
      %22 = spv.Select %21, %5, %20 : i1, i32
      %23 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %24 = spv.Load "Input" %23 : vector<3xi32>
      %25 = spv.CompositeExtract %24[0 : i32] : vector<3xi32>
      // Inner loop: %27 runs from LocalInvocationId.x while %27 < %22, advancing by 32.
      spv.loop {
        spv.Branch ^bb1(%25 : i32)
      ^bb1(%27: i32):  // 2 preds: ^bb0, ^bb2
        %28 = spv.SLessThan %27, %22 : i32
        spv.BranchConditional %28, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        %29 = spv.IAdd %17, %27 : i32
        %30 = spv.AccessChain %7[%6, %29] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
        spv.Store "StorageBuffer" %30, %1 : f32
        %31 = spv.IAdd %27, %5 : i32
        spv.Branch ^bb1(%31 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %26 = spv.IAdd %17, %16 : i32
      spv.Branch ^bb1(%26 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    // Second loop nest: same outer/inner bounds as above, plus two innermost
    // loops (%30 and %33) that each run from 0 while < 1 with step 1.
    spv.loop {
      spv.Branch ^bb1(%15 : i32)
    ^bb1(%17: i32):  // 2 preds: ^bb0, ^bb2
      %18 = spv.SLessThan %17, %3 : i32
      spv.BranchConditional %18, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      %19 = spv.IMul %17, %2 : i32
      %20 = spv.IAdd %19, %3 : i32
      %21 = spv.SLessThan %5, %20 : i32
      %22 = spv.Select %21, %5, %20 : i1, i32
      %23 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %24 = spv.Load "Input" %23 : vector<3xi32>
      %25 = spv.CompositeExtract %24[0 : i32] : vector<3xi32>
      spv.loop {
        spv.Branch ^bb1(%25 : i32)
      ^bb1(%27: i32):  // 2 preds: ^bb0, ^bb2
        %28 = spv.SLessThan %27, %22 : i32
        spv.BranchConditional %28, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        spv.loop {
          spv.Branch ^bb1(%6 : i32)
        ^bb1(%30: i32):  // 2 preds: ^bb0, ^bb2
          %31 = spv.SLessThan %30, %4 : i32
          spv.BranchConditional %31, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          spv.loop {
            spv.Branch ^bb1(%6 : i32)
          ^bb1(%33: i32):  // 2 preds: ^bb0, ^bb2
            %34 = spv.SLessThan %33, %4 : i32
            spv.BranchConditional %34, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // %35 = %17 + %27 indexes binding (0, 1); %37 = %35 + %30 + %33 indexes binding (0, 0).
            %35 = spv.IAdd %17, %27 : i32
            %36 = spv.IAdd %35, %30 : i32
            %37 = spv.IAdd %36, %33 : i32
            %38 = spv.AccessChain %8[%6, %37] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
            %39 = spv.Load "StorageBuffer" %38 : f32
            %40 = spv.AccessChain %7[%6, %35] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
            %41 = spv.Load "StorageBuffer" %40 : f32
            // When both innermost indices are 0, +infinity (%0) replaces the value loaded from binding (0, 1).
            %42 = spv.IEqual %30, %6 : i32
            %43 = spv.IEqual %33, %6 : i32
            %44 = spv.LogicalAnd %42, %43 : i1
            %45 = spv.Select %44, %0, %41 : i1, f32
            // %47 = (%39 < %45) ? %39 : %45, stored back through the same access chain %40.
            %46 = spv.FOrdLessThan %39, %45 : f32
            %47 = spv.Select %46, %39, %45 : i1, f32
            spv.Store "StorageBuffer" %40, %47 : f32
            %48 = spv.IAdd %33, %4 : i32
            spv.Branch ^bb1(%48 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %32 = spv.IAdd %30, %4 : i32
          spv.Branch ^bb1(%32 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %29 = spv.IAdd %27, %5 : i32
        spv.Branch ^bb1(%29 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %26 = spv.IAdd %17, %16 : i32
      spv.Branch ^bb1(%26 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // GLCompute entry point with a 32x1x1 local workgroup size.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_5, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_5 "LocalSize", 32, 1, 1
}

// *** IR Dump After SPIRVUpdateVCE ***
// SPIR-V module for the @main_ex_dispatch_5 compute kernel. In this dump the
// module header carries a `requires #spv.vce<...>` clause naming version v1.0,
// the Shader capability and the SPV_KHR_storage_buffer_storage_class extension.
// Two StorageBuffer resources are declared, each a struct wrapping a
// 5-element f32 array: binding (0, 0) is only loaded from and binding (0, 1)
// is both loaded from and stored to in the function below.
spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
  spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
  spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
  spv.func @main_ex_dispatch_5() "None" {
    // 0x7F800000 is the IEEE-754 single-precision bit pattern of +infinity.
    %0 = spv.constant 0x7F800000 : f32
    %1 = spv.constant 0.000000e+00 : f32
    %2 = spv.constant -1 : i32
    %3 = spv.constant 5 : i32
    %4 = spv.constant 1 : i32
    %5 = spv.constant 32 : i32
    %6 = spv.constant 0 : i32
    %7 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    %8 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
    %9 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
    %10 = spv.Load "Input" %9 : vector<3xi32>
    %11 = spv.CompositeExtract %10[0 : i32] : vector<3xi32>
    %12 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
    %13 = spv.Load "Input" %12 : vector<3xi32>
    %14 = spv.CompositeExtract %13[0 : i32] : vector<3xi32>
    // %15 = WorkgroupId.x * 32 is the outer-loop start; %16 = NumWorkgroups.x * 32 is its step.
    %15 = spv.IMul %11, %5 : i32
    %16 = spv.IMul %14, %5 : i32
    // First loop nest: stores 0.0 into binding (0, 1) at index %17 + %27.
    // Outer loop: %17 runs from %15 while %17 < 5, advancing by %16.
    spv.loop {
      spv.Branch ^bb1(%15 : i32)
    ^bb1(%17: i32):  // 2 preds: ^bb0, ^bb2
      %18 = spv.SLessThan %17, %3 : i32
      spv.BranchConditional %18, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      // %20 = 5 - %17; %22 = (32 < %20) ? 32 : %20, the inner-loop upper bound.
      %19 = spv.IMul %17, %2 : i32
      %20 = spv.IAdd %19, %3 : i32
      %21 = spv.SLessThan %5, %20 : i32
      %22 = spv.Select %21, %5, %20 : i1, i32
      %23 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %24 = spv.Load "Input" %23 : vector<3xi32>
      %25 = spv.CompositeExtract %24[0 : i32] : vector<3xi32>
      // Inner loop: %27 runs from LocalInvocationId.x while %27 < %22, advancing by 32.
      spv.loop {
        spv.Branch ^bb1(%25 : i32)
      ^bb1(%27: i32):  // 2 preds: ^bb0, ^bb2
        %28 = spv.SLessThan %27, %22 : i32
        spv.BranchConditional %28, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        %29 = spv.IAdd %17, %27 : i32
        %30 = spv.AccessChain %7[%6, %29] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
        spv.Store "StorageBuffer" %30, %1 : f32
        %31 = spv.IAdd %27, %5 : i32
        spv.Branch ^bb1(%31 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %26 = spv.IAdd %17, %16 : i32
      spv.Branch ^bb1(%26 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    // Second loop nest: same outer/inner bounds as above, plus two innermost
    // loops (%30 and %33) that each run from 0 while < 1 with step 1.
    spv.loop {
      spv.Branch ^bb1(%15 : i32)
    ^bb1(%17: i32):  // 2 preds: ^bb0, ^bb2
      %18 = spv.SLessThan %17, %3 : i32
      spv.BranchConditional %18, ^bb2, ^bb3
    ^bb2:  // pred: ^bb1
      %19 = spv.IMul %17, %2 : i32
      %20 = spv.IAdd %19, %3 : i32
      %21 = spv.SLessThan %5, %20 : i32
      %22 = spv.Select %21, %5, %20 : i1, i32
      %23 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
      %24 = spv.Load "Input" %23 : vector<3xi32>
      %25 = spv.CompositeExtract %24[0 : i32] : vector<3xi32>
      spv.loop {
        spv.Branch ^bb1(%25 : i32)
      ^bb1(%27: i32):  // 2 preds: ^bb0, ^bb2
        %28 = spv.SLessThan %27, %22 : i32
        spv.BranchConditional %28, ^bb2, ^bb3
      ^bb2:  // pred: ^bb1
        spv.loop {
          spv.Branch ^bb1(%6 : i32)
        ^bb1(%30: i32):  // 2 preds: ^bb0, ^bb2
          %31 = spv.SLessThan %30, %4 : i32
          spv.BranchConditional %31, ^bb2, ^bb3
        ^bb2:  // pred: ^bb1
          spv.loop {
            spv.Branch ^bb1(%6 : i32)
          ^bb1(%33: i32):  // 2 preds: ^bb0, ^bb2
            %34 = spv.SLessThan %33, %4 : i32
            spv.BranchConditional %34, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // %35 = %17 + %27 indexes binding (0, 1); %37 = %35 + %30 + %33 indexes binding (0, 0).
            %35 = spv.IAdd %17, %27 : i32
            %36 = spv.IAdd %35, %30 : i32
            %37 = spv.IAdd %36, %33 : i32
            %38 = spv.AccessChain %8[%6, %37] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
            %39 = spv.Load "StorageBuffer" %38 : f32
            %40 = spv.AccessChain %7[%6, %35] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
            %41 = spv.Load "StorageBuffer" %40 : f32
            // When both innermost indices are 0, +infinity (%0) replaces the value loaded from binding (0, 1).
            %42 = spv.IEqual %30, %6 : i32
            %43 = spv.IEqual %33, %6 : i32
            %44 = spv.LogicalAnd %42, %43 : i1
            %45 = spv.Select %44, %0, %41 : i1, f32
            // %47 = (%39 < %45) ? %39 : %45, stored back through the same access chain %40.
            %46 = spv.FOrdLessThan %39, %45 : f32
            %47 = spv.Select %46, %39, %45 : i1, f32
            spv.Store "StorageBuffer" %40, %47 : f32
            %48 = spv.IAdd %33, %4 : i32
            spv.Branch ^bb1(%48 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          %32 = spv.IAdd %30, %4 : i32
          spv.Branch ^bb1(%32 : i32)
        ^bb3:  // pred: ^bb1
          spv._merge
        }
        %29 = spv.IAdd %27, %5 : i32
        spv.Branch ^bb1(%29 : i32)
      ^bb3:  // pred: ^bb1
        spv._merge
      }
      %26 = spv.IAdd %17, %16 : i32
      spv.Branch ^bb1(%26 : i32)
    ^bb3:  // pred: ^bb1
      spv._merge
    }
    spv.Return
  }
  // GLCompute entry point with a 32x1x1 local workgroup size.
  spv.EntryPoint "GLCompute" @main_ex_dispatch_5, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
  spv.ExecutionMode @main_ex_dispatch_5 "LocalSize", 32, 1, 1
}

// *** IR Dump After mlir::iree_compiler::IREE::HAL::TranslateExecutablesPass ***
// HAL executable wrapping the @main_ex_dispatch_5 kernel.
// The @legacy_io interface declares one read-only StorageBuffer (@arg0,
// binding 0) and one write/discard StorageBuffer (@ret0, binding 1). The
// entry point signature is (tensor<5x1x1xf32>) -> tensor<5xf32>, and the
// translated SPIR-V module is nested under the "vulkan*" target.
hal.executable @main_ex_dispatch_5 attributes {sym_visibility = "private"} {
  hal.interface @legacy_io {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  }
  hal.executable.entry_point @main_ex_dispatch_5 attributes {interface = @legacy_io, ordinal = 0 : i32, signature = (tensor<5x1x1xf32>) -> tensor<5xf32>}
  hal.executable.target "vulkan*" {
    // Target environment attribute: SPIR-V v1.3, Shader capability, at most 128
    // invocations per workgroup and a maximum workgroup size of [128, 128, 64].
    module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
      spv.module Logical GLSL450 requires #spv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]> {
        spv.globalVariable @__builtin_var_LocalInvocationId__ built_in("LocalInvocationId") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__builtin_var_NumWorkgroups__ built_in("NumWorkgroups") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spv.ptr<vector<3xi32>, Input>
        spv.globalVariable @__resource_var_0_0__ bind(0, 0) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
        spv.globalVariable @__resource_var_0_1__ bind(0, 1) : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
        spv.func @main_ex_dispatch_5() "None" {
          // 0x7F800000 is the IEEE-754 single-precision bit pattern of +infinity.
          %0 = spv.constant 0x7F800000 : f32
          %1 = spv.constant 0.000000e+00 : f32
          %2 = spv.constant -1 : i32
          %3 = spv.constant 5 : i32
          %4 = spv.constant 1 : i32
          %5 = spv.constant 32 : i32
          %6 = spv.constant 0 : i32
          %7 = spv._address_of @__resource_var_0_1__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
          %8 = spv._address_of @__resource_var_0_0__ : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
          %9 = spv._address_of @__builtin_var_WorkgroupId__ : !spv.ptr<vector<3xi32>, Input>
          %10 = spv.Load "Input" %9 : vector<3xi32>
          %11 = spv.CompositeExtract %10[0 : i32] : vector<3xi32>
          %12 = spv._address_of @__builtin_var_NumWorkgroups__ : !spv.ptr<vector<3xi32>, Input>
          %13 = spv.Load "Input" %12 : vector<3xi32>
          %14 = spv.CompositeExtract %13[0 : i32] : vector<3xi32>
          // %15 = WorkgroupId.x * 32 is the outer-loop start; %16 = NumWorkgroups.x * 32 is its step.
          %15 = spv.IMul %11, %5 : i32
          %16 = spv.IMul %14, %5 : i32
          // First loop nest: stores 0.0 into binding (0, 1) at index %17 + %27.
          // Outer loop: %17 runs from %15 while %17 < 5, advancing by %16.
          spv.loop {
            spv.Branch ^bb1(%15 : i32)
          ^bb1(%17: i32):  // 2 preds: ^bb0, ^bb2
            %18 = spv.SLessThan %17, %3 : i32
            spv.BranchConditional %18, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            // %20 = 5 - %17; %22 = (32 < %20) ? 32 : %20, the inner-loop upper bound.
            %19 = spv.IMul %17, %2 : i32
            %20 = spv.IAdd %19, %3 : i32
            %21 = spv.SLessThan %5, %20 : i32
            %22 = spv.Select %21, %5, %20 : i1, i32
            %23 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %24 = spv.Load "Input" %23 : vector<3xi32>
            %25 = spv.CompositeExtract %24[0 : i32] : vector<3xi32>
            // Inner loop: %27 runs from LocalInvocationId.x while %27 < %22, advancing by 32.
            spv.loop {
              spv.Branch ^bb1(%25 : i32)
            ^bb1(%27: i32):  // 2 preds: ^bb0, ^bb2
              %28 = spv.SLessThan %27, %22 : i32
              spv.BranchConditional %28, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              %29 = spv.IAdd %17, %27 : i32
              %30 = spv.AccessChain %7[%6, %29] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
              spv.Store "StorageBuffer" %30, %1 : f32
              %31 = spv.IAdd %27, %5 : i32
              spv.Branch ^bb1(%31 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %26 = spv.IAdd %17, %16 : i32
            spv.Branch ^bb1(%26 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          // Second loop nest: same outer/inner bounds as above, plus two innermost
          // loops (%30 and %33) that each run from 0 while < 1 with step 1.
          spv.loop {
            spv.Branch ^bb1(%15 : i32)
          ^bb1(%17: i32):  // 2 preds: ^bb0, ^bb2
            %18 = spv.SLessThan %17, %3 : i32
            spv.BranchConditional %18, ^bb2, ^bb3
          ^bb2:  // pred: ^bb1
            %19 = spv.IMul %17, %2 : i32
            %20 = spv.IAdd %19, %3 : i32
            %21 = spv.SLessThan %5, %20 : i32
            %22 = spv.Select %21, %5, %20 : i1, i32
            %23 = spv._address_of @__builtin_var_LocalInvocationId__ : !spv.ptr<vector<3xi32>, Input>
            %24 = spv.Load "Input" %23 : vector<3xi32>
            %25 = spv.CompositeExtract %24[0 : i32] : vector<3xi32>
            spv.loop {
              spv.Branch ^bb1(%25 : i32)
            ^bb1(%27: i32):  // 2 preds: ^bb0, ^bb2
              %28 = spv.SLessThan %27, %22 : i32
              spv.BranchConditional %28, ^bb2, ^bb3
            ^bb2:  // pred: ^bb1
              spv.loop {
                spv.Branch ^bb1(%6 : i32)
              ^bb1(%30: i32):  // 2 preds: ^bb0, ^bb2
                %31 = spv.SLessThan %30, %4 : i32
                spv.BranchConditional %31, ^bb2, ^bb3
              ^bb2:  // pred: ^bb1
                spv.loop {
                  spv.Branch ^bb1(%6 : i32)
                ^bb1(%33: i32):  // 2 preds: ^bb0, ^bb2
                  %34 = spv.SLessThan %33, %4 : i32
                  spv.BranchConditional %34, ^bb2, ^bb3
                ^bb2:  // pred: ^bb1
                  // %35 = %17 + %27 indexes binding (0, 1); %37 = %35 + %30 + %33 indexes binding (0, 0).
                  %35 = spv.IAdd %17, %27 : i32
                  %36 = spv.IAdd %35, %30 : i32
                  %37 = spv.IAdd %36, %33 : i32
                  %38 = spv.AccessChain %8[%6, %37] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                  %39 = spv.Load "StorageBuffer" %38 : f32
                  %40 = spv.AccessChain %7[%6, %35] : !spv.ptr<!spv.struct<!spv.array<5 x f32, stride=4> [0]>, StorageBuffer>
                  %41 = spv.Load "StorageBuffer" %40 : f32
                  // When both innermost indices are 0, +infinity (%0) replaces the value loaded from binding (0, 1).
                  %42 = spv.IEqual %30, %6 : i32
                  %43 = spv.IEqual %33, %6 : i32
                  %44 = spv.LogicalAnd %42, %43 : i1
                  %45 = spv.Select %44, %0, %41 : i1, f32
                  // %47 = (%39 < %45) ? %39 : %45, stored back through the same access chain %40.
                  %46 = spv.FOrdLessThan %39, %45 : f32
                  %47 = spv.Select %46, %39, %45 : i1, f32
                  spv.Store "StorageBuffer" %40, %47 : f32
                  %48 = spv.IAdd %33, %4 : i32
                  spv.Branch ^bb1(%48 : i32)
                ^bb3:  // pred: ^bb1
                  spv._merge
                }
                %32 = spv.IAdd %30, %4 : i32
                spv.Branch ^bb1(%32 : i32)
              ^bb3:  // pred: ^bb1
                spv._merge
              }
              %29 = spv.IAdd %27, %5 : i32
              spv.Branch ^bb1(%29 : i32)
            ^bb3:  // pred: ^bb1
              spv._merge
            }
            %26 = spv.IAdd %17, %16 : i32
            spv.Branch ^bb1(%26 : i32)
          ^bb3:  // pred: ^bb1
            spv._merge
          }
          spv.Return
        }
        // GLCompute entry point with a 32x1x1 local workgroup size.
        spv.EntryPoint "GLCompute" @main_ex_dispatch_5, @__builtin_var_WorkgroupId__, @__builtin_var_NumWorkgroups__, @__builtin_var_LocalInvocationId__
        spv.ExecutionMode @main_ex_dispatch_5 "LocalSize", 32, 1, 1
      }
      hal.interface @legacy_io attributes {sym_visibility = "private"} {
        hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
        hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
      }
    }
  }
}

// *** IR Dump After Inliner ***
// Module holding the @main_ex_dispatch_6 dispatch function and its @legacy_io
// interface (two read-only inputs, two write/discard outputs). The module
// attribute records the SPIR-V target environment (v1.3, Shader capability).
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_6() {
    %c0 = constant 0 : index
    %cst = constant dense<0.000000e+00> : tensor<f32>
    // Inputs: @arg0 is a 5-element f32 tensor, @arg1 a 5-element i32 tensor.
    %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5xf32>
    %1 = hal.interface.load.tensor @legacy_io::@arg1, offset = %c0 : tensor<5xi32>
    // First result (%5): (arg0 == 0.0) converted to i32, multiplied elementwise by arg1.
    %2 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<5xf32>
    %3 = "xla_hlo.compare"(%0, %2) {comparison_direction = "EQ"} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
    %4 = "xla_hlo.convert"(%3) : (tensor<5xi1>) -> tensor<5xi32>
    %5 = xla_hlo.multiply %4, %1 : tensor<5xi32>
    // Second result (%10): the same computation applied to arg0 reversed along dimension 0.
    %6 = "xla_hlo.reverse"(%0) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
    %7 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<5xf32>
    %8 = "xla_hlo.compare"(%6, %7) {comparison_direction = "EQ"} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
    %9 = "xla_hlo.convert"(%8) : (tensor<5xi1>) -> tensor<5xi32>
    %10 = xla_hlo.multiply %9, %1 : tensor<5xi32>
    hal.interface.store.tensor %5, @legacy_io::@ret0, offset = %c0 : tensor<5xi32>
    hal.interface.store.tensor %10, @legacy_io::@ret1, offset = %c0 : tensor<5xi32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    hal.interface.binding @ret1, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::TieDynamicShapesPass ***
// Dispatch function @main_ex_dispatch_6, still expressed in xla_hlo ops.
// It reads two interface tensors and writes two i32 results.
func @main_ex_dispatch_6() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  // Inputs: @arg0 is a 5-element f32 tensor, @arg1 a 5-element i32 tensor.
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5xf32>
  %1 = hal.interface.load.tensor @legacy_io::@arg1, offset = %c0 : tensor<5xi32>
  // First result (%5): (arg0 == 0.0) converted to i32, multiplied elementwise by arg1.
  %2 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<5xf32>
  %3 = "xla_hlo.compare"(%0, %2) {comparison_direction = "EQ"} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
  %4 = "xla_hlo.convert"(%3) : (tensor<5xi1>) -> tensor<5xi32>
  %5 = xla_hlo.multiply %4, %1 : tensor<5xi32>
  // Second result (%10): the same computation applied to arg0 reversed along dimension 0.
  %6 = "xla_hlo.reverse"(%0) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
  %7 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<5xf32>
  %8 = "xla_hlo.compare"(%6, %7) {comparison_direction = "EQ"} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
  %9 = "xla_hlo.convert"(%8) : (tensor<5xi1>) -> tensor<5xi32>
  %10 = xla_hlo.multiply %9, %1 : tensor<5xi32>
  hal.interface.store.tensor %5, @legacy_io::@ret0, offset = %c0 : tensor<5xi32>
  hal.interface.store.tensor %10, @legacy_io::@ret1, offset = %c0 : tensor<5xi32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::MaterializeShapeCalculationsPass ***
// Dispatch function @main_ex_dispatch_6, still expressed in xla_hlo ops.
// It reads two interface tensors and writes two i32 results.
func @main_ex_dispatch_6() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  // Inputs: @arg0 is a 5-element f32 tensor, @arg1 a 5-element i32 tensor.
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5xf32>
  %1 = hal.interface.load.tensor @legacy_io::@arg1, offset = %c0 : tensor<5xi32>
  // First result (%5): (arg0 == 0.0) converted to i32, multiplied elementwise by arg1.
  %2 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<5xf32>
  %3 = "xla_hlo.compare"(%0, %2) {comparison_direction = "EQ"} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
  %4 = "xla_hlo.convert"(%3) : (tensor<5xi1>) -> tensor<5xi32>
  %5 = xla_hlo.multiply %4, %1 : tensor<5xi32>
  // Second result (%10): the same computation applied to arg0 reversed along dimension 0.
  %6 = "xla_hlo.reverse"(%0) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
  %7 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<5xf32>
  %8 = "xla_hlo.compare"(%6, %7) {comparison_direction = "EQ"} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
  %9 = "xla_hlo.convert"(%8) : (tensor<5xi1>) -> tensor<5xi32>
  %10 = xla_hlo.multiply %9, %1 : tensor<5xi32>
  hal.interface.store.tensor %5, @legacy_io::@ret0, offset = %c0 : tensor<5xi32>
  hal.interface.store.tensor %10, @legacy_io::@ret1, offset = %c0 : tensor<5xi32>
  return
}

// *** IR Dump After mlir::iree_compiler::Shape::`anonymous-namespace'::HoistShapeCalculations ***
// Dispatch function @main_ex_dispatch_6, still expressed in xla_hlo ops.
// It reads two interface tensors and writes two i32 results.
func @main_ex_dispatch_6() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  // Inputs: @arg0 is a 5-element f32 tensor, @arg1 a 5-element i32 tensor.
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5xf32>
  %1 = hal.interface.load.tensor @legacy_io::@arg1, offset = %c0 : tensor<5xi32>
  // First result (%5): (arg0 == 0.0) converted to i32, multiplied elementwise by arg1.
  %2 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<5xf32>
  %3 = "xla_hlo.compare"(%0, %2) {comparison_direction = "EQ"} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
  %4 = "xla_hlo.convert"(%3) : (tensor<5xi1>) -> tensor<5xi32>
  %5 = xla_hlo.multiply %4, %1 : tensor<5xi32>
  // Second result (%10): the same computation applied to arg0 reversed along dimension 0.
  %6 = "xla_hlo.reverse"(%0) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
  %7 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<5xf32>
  %8 = "xla_hlo.compare"(%6, %7) {comparison_direction = "EQ"} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
  %9 = "xla_hlo.convert"(%8) : (tensor<5xi1>) -> tensor<5xi32>
  %10 = xla_hlo.multiply %9, %1 : tensor<5xi32>
  hal.interface.store.tensor %5, @legacy_io::@ret0, offset = %c0 : tensor<5xi32>
  hal.interface.store.tensor %10, @legacy_io::@ret1, offset = %c0 : tensor<5xi32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::DecomposeHLOClampPass ***
// Dispatch function @main_ex_dispatch_6, still expressed in xla_hlo ops.
// It reads two interface tensors and writes two i32 results.
func @main_ex_dispatch_6() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  // Inputs: @arg0 is a 5-element f32 tensor, @arg1 a 5-element i32 tensor.
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5xf32>
  %1 = hal.interface.load.tensor @legacy_io::@arg1, offset = %c0 : tensor<5xi32>
  // First result (%5): (arg0 == 0.0) converted to i32, multiplied elementwise by arg1.
  %2 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<5xf32>
  %3 = "xla_hlo.compare"(%0, %2) {comparison_direction = "EQ"} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
  %4 = "xla_hlo.convert"(%3) : (tensor<5xi1>) -> tensor<5xi32>
  %5 = xla_hlo.multiply %4, %1 : tensor<5xi32>
  // Second result (%10): the same computation applied to arg0 reversed along dimension 0.
  %6 = "xla_hlo.reverse"(%0) {dimensions = dense<0> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<5xf32>
  %7 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<f32>) -> tensor<5xf32>
  %8 = "xla_hlo.compare"(%6, %7) {comparison_direction = "EQ"} : (tensor<5xf32>, tensor<5xf32>) -> tensor<5xi1>
  %9 = "xla_hlo.convert"(%8) : (tensor<5xi1>) -> tensor<5xi32>
  %10 = xla_hlo.multiply %9, %1 : tensor<5xi32>
  hal.interface.store.tensor %5, @legacy_io::@ret0, offset = %c0 : tensor<5xi32>
  hal.interface.store.tensor %10, @legacy_io::@ret1, offset = %c0 : tensor<5xi32>
  return
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnTensorsPass ***
// Dispatch function @main_ex_dispatch_6 with the computation expressed as
// linalg.generic ops on tensors. Every generic has a single "parallel"
// iterator over d0.
func @main_ex_dispatch_6() {
  %c0 = constant 0 : index
  %cst = constant dense<0.000000e+00> : tensor<f32>
  %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5xf32>
  %1 = hal.interface.load.tensor @legacy_io::@arg1, offset = %c0 : tensor<5xi32>
  // Broadcast of the scalar %cst: the input map (d0) -> () reads the same element for every d0.
  %2 = linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %cst {
  ^bb0(%arg0: f32):  // no predecessors
    linalg.yield %arg0 : f32
  }: tensor<f32> -> tensor<5xf32>
  // Elementwise ordered-equal comparison of %0 against the broadcast value.
  %3 = linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %0, %2 {
  ^bb0(%arg0: f32, %arg1: f32):  // no predecessors
    %11 = cmpf "oeq", %arg0, %arg1 : f32
    linalg.yield %11 : i1
  }: tensor<5xf32>, tensor<5xf32> -> tensor<5xi1>
  // Zero-extend each i1 comparison result to i32.
  %4 = linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %3 {
  ^bb0(%arg0: i1):  // no predecessors
    %11 = zexti %arg0 : i1 to i32
    linalg.yield %11 : i32
  }: tensor<5xi1> -> tensor<5xi32>
  // Elementwise integer multiply with %1; %5 is stored to @ret0 below.
  %5 = linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %4, %1 {
  ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
    %11 = muli %arg0, %arg1 : i32
    linalg.yield %11 : i32
  }: tensor<5xi32>, tensor<5xi32> -> tensor<5xi32>
  // Copy of %0 read through the map (d0) -> (-d0 + 4), i.e. output element d0 comes from input element 4 - d0.
  %6 = linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (-d0 + 4)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %0 {
  ^bb0(%arg0: f32):  // no predecessors
    linalg.yield %arg0 : f32
  }: tensor<5xf32> -> tensor<5xf32>
  // Second broadcast of %cst, followed by the same compare / zero-extend / multiply chain applied to %6.
  %7 = linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %cst {
  ^bb0(%arg0: f32):  // no predecessors
    linalg.yield %arg0 : f32
  }: tensor<f32> -> tensor<5xf32>
  %8 = linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %6, %7 {
  ^bb0(%arg0: f32, %arg1: f32):  // no predecessors
    %11 = cmpf "oeq", %arg0, %arg1 : f32
    linalg.yield %11 : i1
  }: tensor<5xf32>, tensor<5xf32> -> tensor<5xi1>
  %9 = linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %8 {
  ^bb0(%arg0: i1):  // no predecessors
    %11 = zexti %arg0 : i1 to i32
    linalg.yield %11 : i32
  }: tensor<5xi1> -> tensor<5xi32>
  %10 = linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %9, %1 {
  ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
    %11 = muli %arg0, %arg1 : i32
    linalg.yield %11 : i32
  }: tensor<5xi32>, tensor<5xi32> -> tensor<5xi32>
  hal.interface.store.tensor %5, @legacy_io::@ret0, offset = %c0 : tensor<5xi32>
  hal.interface.store.tensor %10, @legacy_io::@ret1, offset = %c0 : tensor<5xi32>
  return
}

// *** IR Dump After LinalgFusionOfTensorOps ***
// Module holding @main_ex_dispatch_6 and its @legacy_io interface. In this
// dump the function body consists of two linalg.generic ops, each containing
// the compare / zero-extend / multiply sequence in a single region.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_6() {
    %c0 = constant 0 : index
    // %cst is a scalar f32 here and is referenced directly inside the generic regions.
    %cst = constant 0.000000e+00 : f32
    %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0 : tensor<5xf32>
    %1 = hal.interface.load.tensor @legacy_io::@arg1, offset = %c0 : tensor<5xi32>
    // %2[d0] = zext(%0[d0] == 0.0) * %1[d0]; both inputs use the identity map.
    %2 = linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %0, %1 {
    ^bb0(%arg0: f32, %arg1: i32):  // no predecessors
      %4 = cmpf "oeq", %arg0, %cst : f32
      %5 = zexti %4 : i1 to i32
      %6 = muli %5, %arg1 : i32
      linalg.yield %6 : i32
    }: tensor<5xf32>, tensor<5xi32> -> tensor<5xi32>
    // %3[d0] = zext(%0[4 - d0] == 0.0) * %1[d0]; only %0 is read through the map (d0) -> (-d0 + 4).
    %3 = linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (-d0 + 4)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %0, %1 {
    ^bb0(%arg0: f32, %arg1: i32):  // no predecessors
      %4 = cmpf "oeq", %arg0, %cst : f32
      %5 = zexti %4 : i1 to i32
      %6 = muli %5, %arg1 : i32
      linalg.yield %6 : i32
    }: tensor<5xf32>, tensor<5xi32> -> tensor<5xi32>
    hal.interface.store.tensor %2, @legacy_io::@ret0, offset = %c0 : tensor<5xi32>
    hal.interface.store.tensor %3, @legacy_io::@ret1, offset = %c0 : tensor<5xi32>
    return
  }
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    hal.interface.binding @ret1, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After mlir::iree_compiler::`anonymous-namespace'::ConvertHLOToLinalgOnBuffersPass ***
// Buffer (memref) form of @main_ex_dispatch_6: operands and results are
// iree.placeholder memrefs tied to the @legacy_io bindings, and each
// linalg.generic takes its output buffer as a third operand instead of
// returning a tensor.
func @main_ex_dispatch_6() {
  // Output buffers (@ret0, @ret1).
  %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xi32>
  %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret1} : memref<5xi32>
  // %c0 has no remaining uses in this function.
  %c0 = constant 0 : index
  %cst = constant 0.000000e+00 : f32
  // Input buffers (@arg0, @arg1).
  %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5xf32>
  %3 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg1} : memref<5xi32>
  // Identity maps: writes zext(%2[i] == 0.0) * %3[i] into %0[i].
  // %arg2 is the output element's block argument; the body does not read it.
  linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %2, %3, %0 {
  ^bb0(%arg0: f32, %arg1: i32, %arg2: i32):  // no predecessors
    %4 = cmpf "oeq", %arg0, %cst : f32
    %5 = zexti %4 : i1 to i32
    %6 = muli %5, %arg1 : i32
    linalg.yield %6 : i32
  }: memref<5xf32>, memref<5xi32>, memref<5xi32>
  // First map is (d0) -> (-d0 + 4), so %2 is read at index 4 - i:
  // writes zext(%2[4 - i] == 0.0) * %3[i] into %1[i].
  // NOTE(review): the negative coefficient on d0 is presumably what trips the
  // "nonpositive multiplying coefficient" assertion at the end of this log -- confirm.
  linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (-d0 + 4)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %2, %3, %1 {
  ^bb0(%arg0: f32, %arg1: i32, %arg2: i32):  // no predecessors
    %4 = cmpf "oeq", %arg0, %cst : f32
    %5 = zexti %4 : i1 to i32
    %6 = muli %5, %arg1 : i32
    linalg.yield %6 : i32
  }: memref<5xf32>, memref<5xi32>, memref<5xi32>
  return
}

// *** IR Dump After Canonicalizer ***
// Buffer-form module for @main_ex_dispatch_6: two element-wise linalg.generic
// ops on memref placeholders, plus the @legacy_io binding table. No index
// constant is present in this function; only the f32 zero constant remains.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_6() {
    %cst = constant 0.000000e+00 : f32
    // %0/%1 are the output buffers (@ret0/@ret1); %2/%3 are the inputs (@arg0/@arg1).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xi32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret1} : memref<5xi32>
    %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5xf32>
    %3 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg1} : memref<5xi32>
    // Identity maps: writes zext(%2[i] == 0.0) * %3[i] into %0[i].
    linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %2, %3, %0 {
    ^bb0(%arg0: f32, %arg1: i32, %arg2: i32):  // no predecessors
      %4 = cmpf "oeq", %arg0, %cst : f32
      %5 = zexti %4 : i1 to i32
      %6 = muli %5, %arg1 : i32
      linalg.yield %6 : i32
    }: memref<5xf32>, memref<5xi32>, memref<5xi32>
    // First map is (d0) -> (-d0 + 4), so %2 is read at index 4 - i:
    // writes zext(%2[4 - i] == 0.0) * %3[i] into %1[i].
    // NOTE(review): the negative coefficient on d0 is presumably what trips the
    // "nonpositive multiplying coefficient" assertion at the end of this log -- confirm.
    linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (-d0 + 4)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %2, %3, %1 {
    ^bb0(%arg0: f32, %arg1: i32, %arg2: i32):  // no predecessors
      %4 = cmpf "oeq", %arg0, %cst : f32
      %5 = zexti %4 : i1 to i32
      %6 = muli %5, %arg1 : i32
      linalg.yield %6 : i32
    }: memref<5xf32>, memref<5xi32>, memref<5xi32>
    return
  }
  // Binding table: two "Read" inputs (bindings 0-1) and two "Write|Discard"
  // outputs (bindings 2-3), all in set 0.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    hal.interface.binding @ret1, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
  }
}

// *** IR Dump After CSE ***
// Buffer-form module for @main_ex_dispatch_6 as dumped after CSE. The IR text
// is the same as in the Canonicalizer dump immediately before it in this log.
// This is the last IR dump before the assertion failure message.
module attributes {spv.target_env = #spv.target_env<#spv.vce<v1.3, [Shader], [SPV_KHR_storage_buffer_storage_class]>, {max_compute_workgroup_invocations = 128 : i32, max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>} {
  func @main_ex_dispatch_6() {
    %cst = constant 0.000000e+00 : f32
    // %0/%1 are the output buffers (@ret0/@ret1); %2/%3 are the inputs (@arg0/@arg1).
    %0 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret0} : memref<5xi32>
    %1 = iree.placeholder for "interface buffer" {binding = @legacy_io::@ret1} : memref<5xi32>
    %2 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg0} : memref<5xf32>
    %3 = iree.placeholder for "interface buffer" {binding = @legacy_io::@arg1} : memref<5xi32>
    // Identity maps: writes zext(%2[i] == 0.0) * %3[i] into %0[i].
    linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %2, %3, %0 {
    ^bb0(%arg0: f32, %arg1: i32, %arg2: i32):  // no predecessors
      %4 = cmpf "oeq", %arg0, %cst : f32
      %5 = zexti %4 : i1 to i32
      %6 = muli %5, %arg1 : i32
      linalg.yield %6 : i32
    }: memref<5xf32>, memref<5xi32>, memref<5xi32>
    // First map is (d0) -> (-d0 + 4), so %2 is read at index 4 - i:
    // writes zext(%2[4 - i] == 0.0) * %3[i] into %1[i].
    // NOTE(review): "-d0" is d0 multiplied by a negative constant, which is
    // presumably the case rejected by the "nonpositive multiplying coefficient"
    // assertion (Tiling.cpp line 107) printed right after this dump -- confirm.
    linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [affine_map<(d0) -> (-d0 + 4)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} %2, %3, %1 {
    ^bb0(%arg0: f32, %arg1: i32, %arg2: i32):  // no predecessors
      %4 = cmpf "oeq", %arg0, %cst : f32
      %5 = zexti %4 : i1 to i32
      %6 = muli %5, %arg1 : i32
      linalg.yield %6 : i32
    }: memref<5xf32>, memref<5xi32>, memref<5xi32>
    return
  }
  // Binding table: two "Read" inputs (bindings 0-1) and two "Write|Discard"
  // outputs (bindings 2-3), all in set 0.
  hal.interface @legacy_io attributes {sym_visibility = "private"} {
    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
    hal.interface.binding @ret1, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
  }
}

Assertion failed: expr.getRHS().cast<AffineConstantExpr>().getValue() > 0 && "nonpositive multiplying coefficient", file D:\Dev\iree\third_party\llvm-project\mlir\lib\Dialect\Linalg\Transforms\Tiling.cpp, line 107