Time for each load+add: 0.291310 ns for START=0
576460752303423488.000000
Time for each load+add: 0.290989 ns for START=1
inf
Time for each load+add: 0.291260 ns for START=2
nan
Time for each load+add: 0.292583 ns for START=3
nan
Time for each load+add: 0.290434 ns for START=4
nan
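
The source of this micro-benchmark is not part of the gist, only its output. For orientation, here is a minimal C sketch of a timing loop with the same output format; the names, buffer size, repetition count, and the meaning of START are all assumptions, and it will not reproduce the inf/nan sums above, which depend on how the data is initialized:

#include <stdio.h>
#include <time.h>

#define N    (1 << 20)
#define REPS 1000

static float buf[N];

int main(void) {
  for (int start = 0; start < 5; ++start) {
    /* How the buffer depends on START is not recoverable from the log. */
    for (int i = 0; i < N; ++i)
      buf[i] = (float)(i + start);

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    double acc = 0.0;
    for (int r = 0; r < REPS; ++r)
      for (int i = 0; i < N; ++i)
        acc += buf[i];                 /* the timed load+add */
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double ns = (t1.tv_sec - t0.tv_sec) * 1e9
              + (double)(t1.tv_nsec - t0.tv_nsec);
    printf("Time for each load+add: %f ns for START=%d\n",
           ns / ((double)N * REPS), start);
    printf("%f\n", acc);               /* printing the sum keeps the loop live */
  }
  return 0;
}

At ~0.29 ns per load+add the loop is retiring roughly one load+add per cycle on a multi-GHz core, which is why the timing barely moves across START values even as the sums saturate to inf and nan.
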
muralivi@gogeta:~$ /usr/local/google/home/muralivi/projects/iree-build/tools/iree-compile --output-format=vm-bytecode --mlir-print-op-on-diagnostic=false --iree-hal-target-backends=vmvx /usr/local/google/home/muralivi/projects/iree/tests/e2e/tensor_ops/pack.mlir -o check_vmvx_local-task_pack.mlir_module.vmfb --iree-hal-executable-object-search-path=\"/usr/local/google/home/muralivi/projects/iree-build\" --iree-llvm-embedded-linker-path=\"/usr/local/google/home/muralivi/projects/iree-build/llvm-project/bin/lld\" --iree-llvm-wasm-linker-path=\"/usr/local/google/home/muralivi/projects/iree-build/llvm-project/bin/lld\" >& ~/crap2
Aborted
muralivi@gogeta:~$ vim ~/crap2
muralivi@gogeta:~$ /usr/local/google/home/muralivi/projects/iree-build/tools/iree-compile --output-format=vm-bytecode --mlir-print-op-on-diagnostic=false --iree-hal-target-backends=vmvx /usr/local/google/home/muralivi/projects/iree/tests/e2e/tensor_ops/pack.mlir -o check_vmvx_local-task_pack.mlir_module.vmfb --iree-hal-executable-object-search-path
// -----// IR Dump Before TosaToSCF (tosa-to-scf) //----- //
func.func @tensor_float() {
  %0 = util.unfoldable_constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32>
  %1 = util.unfoldable_constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00]]> : tensor<1x3xf32>
  %2 = util.unfoldable_constant dense<1.000000e+00> : tensor<1xf32>
  %3 = "tosa.fully_connected"(%0, %1, %2) : (tensor<2x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<2x1xf32>
  check.expect_eq_const(%3, dense<[[5.100000e+01], [1.230000e+02]]> : tensor<2x1xf32>) : tensor<2x1xf32>
  return
}
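
For reference, tosa.fully_connected computes out[i][j] = bias[j] + sum_k input[i][k] * weight[j][k], with the weight laid out [out_channels, in_channels]. A few lines of C confirm the constants the check expects (51 = 1*7 + 2*8 + 3*9 + 1 and 123 = 4*7 + 5*8 + 6*9 + 1):

#include <stdio.h>

int main(void) {
  const float in[2][3] = {{1, 2, 3}, {4, 5, 6}};
  const float w[1][3]  = {{7, 8, 9}};
  const float bias[1]  = {1};
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 1; ++j) {
      float acc = bias[j];
      for (int k = 0; k < 3; ++k)
        acc += in[i][k] * w[j][k];    /* dot product against weight row j */
      printf("out[%d][%d] = %g\n", i, j, acc);  /* prints 51 and 123 */
    }
  return 0;
}
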
General Purpose Registers:
x0 = 0x0000600003e00038
x1 = 0x000000016feb2d00
x2 = 0x000000016feb2cc0
x3 = 0x000000016feb2cc0
x4 = 0x0000000000000000
x5 = 0x000000016feb2f10
x6 = 0x00000001c1c1d07d
x7 = 0x000000000d84bc77
x8 = 0x000000010019c3e0
-> 0x10019c3e0: stp x29, x30, [sp, #-0x10]!  ; prologue: push frame pointer (x29) and link register (x30)
0x10019c3e4: mov x29, sp                     ; establish the new frame pointer
0x10019c3e8: ldp x8, x9, [x1, #0x18]         ; load a pair of pointers from x1+0x18
0x10019c3ec: mov x13, xzr                    ; x13 = 0
0x10019c3f0: mov w17, #0x1                   ; w17 = 1
0x10019c3f4: mov w12, #0xd                   ; w12 = 13
0x10019c3f8: mov w14, #0x34                  ; w14 = 52 (13 * 4)
0x10019c3fc: ldr w10, [x8]                   ; 32-bit load through x8
0x10019c400: ldp x11, x8, [x9]               ; load two more 64-bit values from x9
0x10019c404: add x9, x11, x10                ; x9 = x11 + x10 (base + offset)
module attributes {hal.device.targets = [#hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "arm64-unknown-unknown-eabi-elf"}>]}>]} {
  hal.executable private @encode_dispatch_285 {
    hal.executable.variant public @embedded_elf_arm_64, target = <"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "arm64-unknown-unknown-eabi-elf"}> {
      hal.executable.export public @encode_dispatch_285_generic_8x13 ordinal(0) layout(#hal.pipeline.layout<push_constants = 1, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %x, %y, %z = flow.dispatch.workgroup_count_from_dag_roo
module attributes {hal.device.targets = [#hal.device.target<"llvm-cpu", {executable_targets = [#hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "arm64-unknown-unknown-eabi-elf"}>]}>]} {
  hal.executable private @encode_dispatch_285 {
    hal.executable.variant public @embedded_elf_arm_64, target = <"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "arm64-unknown-unknown-eabi-elf"}> {
      hal.executable.export public @encode_dispatch_285_generic_768x50272 ordinal(0) layout(#hal.pipeline.layout<push_constants = 1, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
        %x, %y, %z = flow.dispatch.workgroup_count_from_da
// -----// IR Dump After LegalizeControlFlowPass (mhlo-legalize-control-flow) //----- //
func.func @lbeta__2x2x2x2x2x2x2__f32__uniform(%arg0: !iree_input.buffer_view) -> !iree_input.buffer_view attributes {iree.abi = "{\22a\22:[[\22ndarray\22,\22f32\22,7,null,null,null,null,null,null,null]],\22r\22:[[\22ndarray\22,\22f32\22,6,null,null,null,null,null,null]],\22v\22:1}"} {
  %0 = mhlo.constant dense<-0.000000e+00> : tensor<f32>
  %1 = iree_input.cast.buffer_view_to_tensor %arg0 : !iree_input.buffer_view -> tensor<?x?x?x?x?x?x?xf32>
  %2 = chlo.lgamma %1 : tensor<?x?x?x?x?x?x?xf32> -> tensor<?x?x?x?x?x?x?xf32>
  %3 = mhlo.reduce(%2 init: %0) applies mhlo.add across dimensions = [6] : (tensor<?x?x?x?x?x?x?xf32>, tensor<f32>) -> tensor<?x?x?x?x?x?xf32>
  %4 = mhlo.reduce(%1 init: %0) applies mhlo.add across dimensions = [6] : (tensor<?x?x?x?x?x?x?xf32>, tensor<f32>) -> tensor<?x?x?x?x?x?xf32>
  %5 = chlo.lgamma %4 : tensor<?x?x?x?x?x?xf32> -> tensor<?x?x?x?x?x?xf32>
  %6 = chlo.broadcast_subtract %3, %5 : (tensor
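
The dump above is the standard multivariate log-beta identity applied along the trailing dimension: %3 sums lgamma over each element, %5 takes lgamma of the element sum, and the truncated %6 subtracts the two:

  lbeta(x_1, ..., x_n) = sum_i lgamma(x_i) - lgamma(sum_i x_i)
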
muralivi@gogeta:~/projects/iree$ cmake --build ../iree-build
[0/2] Re-checking globbed directories...
[347/1202] Building CXX object third_party/llvm-project/llvm/tools/torch-mlir-dialects/lib/Dialect/TMTensor/IR/CMakeFiles/obj.TorchMLIRTMTensorDialect.dir/TMTensorInterfaces.cpp.o
FAILED: third_party/llvm-project/llvm/tools/torch-mlir-dialects/lib/Dialect/TMTensor/IR/CMakeFiles/obj.TorchMLIRTMTensorDialect.dir/TMTensorInterfaces.cpp.o
/usr/bin/clang++ -DGTEST_HAS_RTTI=0 -D_DEBUG -D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -I/usr/local/google/home/muralivi/projects/iree-build/third_party/llvm-project/llvm/tools/torch-mlir-dialects/lib/Dialect/TMTensor/IR -I/usr/local/google/home/muralivi/projects/iree/third_party/torch-mlir-dialects/lib/Dialect/TMTensor/IR -I/usr/local/google/home/muralivi/projects/iree-build/third_party/llvm-project/llvm/include -I/usr/local/google/home/muralivi/projects/iree/third_party/llvm-project/llvm/include -I/usr/local/google/home/muralivi/proj