IREE Runtime - `UNAVAILABLE; VK_ERROR_INITIALIZATION_FAILED` error
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_nv = []>>}>], legacy_sync}>]} {
  hal.executable private @forward_dispatch_28 {
    hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>, api=Vulkan, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_nv = []>>}> {
      hal.executable.export public @forward_dispatch_28_matmul_4096x512x512 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        func.func @forward_dispatch_28_matmul_4096x512x512() {
          %c8489024 = arith.constant 8489024 : index
          %c578560 = arith.constant 578560 : index
          %c43008 = arith.constant 43008 : index
          %c12683328 = arith.constant 12683328 : index
          %cst = arith.constant 0.000000e+00 : f16
          %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c8489024) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x512xf16>>
          %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c578560) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x512xf16>>
          %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c43008) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512xf16>>
          %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c12683328) : !flow.dispatch.tensor<writeonly:tensor<512x4096xf16>>
          %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4096, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x512xf16>> -> tensor<4096x512xf16>
          %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x512xf16>> -> tensor<512x512xf16>
          %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [512], strides = [1] : !flow.dispatch.tensor<readonly:tensor<512xf16>> -> tensor<512xf16>
          %7 = tensor.empty() : tensor<512x4096xf16>
          %8 = tensor.empty() : tensor<4096x512xf16>
          %9 = linalg.fill ins(%cst : f16) outs(%8 : tensor<4096x512xf16>) -> tensor<4096x512xf16>
          %10 = linalg.matmul ins(%4, %5 : tensor<4096x512xf16>, tensor<512x512xf16>) outs(%9 : tensor<4096x512xf16>) -> tensor<4096x512xf16>
          %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1, d0)>], iterator_types = ["parallel", "parallel"]} ins(%10, %6 : tensor<4096x512xf16>, tensor<512xf16>) outs(%7 : tensor<512x4096xf16>) {
          ^bb0(%in: f16, %in_0: f16, %out: f16):
            %12 = arith.addf %in, %in_0 : f16
            linalg.yield %12 : f16
          } -> tensor<512x4096xf16>
          flow.dispatch.tensor.store %11, %3, offsets = [0, 0], sizes = [512, 4096], strides = [1, 1] : tensor<512x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<512x4096xf16>>
          return
        }
      }
    }
  }
  util.global private mutable @forward_dispatch_28_vulkan_spirv_fb_forward_dispatch_28_matmul_4096x512x512_4096x512_buffer : !hal.buffer
  util.initializer {
    %c1441153280 = arith.constant 1441153280 : index
    %device = hal.ex.shared_device : !hal.device
    %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
    %buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c1441153280}
    util.global.store %buffer, @forward_dispatch_28_vulkan_spirv_fb_forward_dispatch_28_matmul_4096x512x512_4096x512_buffer : !hal.buffer
    util.initializer.return
  }
  func.func @forward_dispatch_28_vulkan_spirv_fb_forward_dispatch_28_matmul_4096x512x512_4096x512(%arg0: i32) attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
    %c-1_i32 = arith.constant -1 : i32
    %c-1_i64 = arith.constant -1 : i64
    %c512 = arith.constant 512 : index
    %c4096 = arith.constant 4096 : index
    %c770064384 = arith.constant 770064384 : index
    %c2 = arith.constant 2 : index
    %c98975488 = arith.constant 98975488 : index
    %c671088896 = arith.constant 671088896 : index
    %c1 = arith.constant 1 : index
    %c671088768 = arith.constant 671088768 : index
    %c0 = arith.constant 0 : index
    %0 = arith.index_cast %arg0 : i32 to index
    %device = hal.ex.shared_device : !hal.device
    %cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Dispatch) : !hal.command_buffer
    %pipeline_layout = hal.pipeline_layout.lookup device(%device : !hal.device) layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) : !hal.pipeline_layout
    %forward_dispatch_28_vulkan_spirv_fb_forward_dispatch_28_matmul_4096x512x512_4096x512_buffer = util.global.load @forward_dispatch_28_vulkan_spirv_fb_forward_dispatch_28_matmul_4096x512x512_4096x512_buffer : !hal.buffer
    hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
      %c0 = (%forward_dispatch_28_vulkan_spirv_fb_forward_dispatch_28_matmul_4096x512x512_4096x512_buffer : !hal.buffer)[%c0, %c671088768],
      %c1 = (%forward_dispatch_28_vulkan_spirv_fb_forward_dispatch_28_matmul_4096x512x512_4096x512_buffer : !hal.buffer)[%c671088896, %c98975488],
      %c2 = (%forward_dispatch_28_vulkan_spirv_fb_forward_dispatch_28_matmul_4096x512x512_4096x512_buffer : !hal.buffer)[%c770064384, %c671088768]
    ])
    %workgroup_x, %workgroup_y, %workgroup_z = hal.executable.calculate_workgroups device(%device : !hal.device) target(@forward_dispatch_28::@vulkan_spirv_fb::@forward_dispatch_28_matmul_4096x512x512) workload([%c4096, %c512]) : index, index, index
    scf.for %arg1 = %c0 to %0 step %c1 {
      hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@forward_dispatch_28::@vulkan_spirv_fb::@forward_dispatch_28_matmul_4096x512x512) workgroups([%workgroup_x, %workgroup_y, %workgroup_z])
      hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
    }
    hal.command_buffer.finalize<%cmd : !hal.command_buffer>
    %1 = util.null : !hal.fence
    %fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
    hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%1) signal(%fence) commands([%cmd])
    %status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
    util.status.check_ok %status, "failed to wait on timepoint"
    return
  }
}
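
For context, a minimal sketch of how a dispatch benchmark module like the one above might be compiled and loaded on the Vulkan backend through the IREE Python bindings (the iree-compiler / iree-runtime pip packages). The file name below is hypothetical, and exact argument spellings vary across IREE releases; creating the Vulkan HAL driver/device is typically the point where an `UNAVAILABLE; VK_ERROR_INITIALIZATION_FAILED` status surfaces (for example when no usable Vulkan ICD or GPU is visible to the process).

import iree.compiler as ireec
import iree.runtime as ireert

# Compile the dispatch-benchmark IR above (saved to a hypothetical file name)
# for the Vulkan/SPIR-V backend; returns the .vmfb module as bytes.
vmfb = ireec.compile_file(
    "forward_dispatch_28_benchmark.mlir",
    target_backends=["vulkan-spirv"],
)

# Creating the Vulkan driver/device is where VK_ERROR_INITIALIZATION_FAILED
# can be reported if the Vulkan loader or a compatible GPU is unavailable.
config = ireert.Config("vulkan")
ctx = ireert.SystemContext(config=config)

# The same .vmfb can also be run from the command line with the
# iree-benchmark-module tool pointed at the Vulkan device.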