vivekkhandelwal1/log_error_bincount_trunci.mlir

## log_error_bincount_trunci.mlir
// -----// IR Dump After VerifyInvariantsBeforeBackendLowering //----- //
// Input to the backend lowering: module "BincountModule" whose @forward takes a
// dynamically sized 1-D si64 value tensor and returns another one produced by a
// single torch.aten.bincount call.
module attributes {torch.debug_module_name = "BincountModule"} {
  func @forward(%arg0: !torch.vtensor<[?],si64>) -> !torch.vtensor<[?],si64> {
    // The two trailing bincount operands are a `none` and the integer constant 0
    // (these correspond to `weights` and `minlength` in torch.bincount's
    // Python signature).
    %none = torch.constant.none
    %int0 = torch.constant.int 0
    %0 = torch.aten.bincount %arg0, %none, %int0 : !torch.vtensor<[?],si64>, !torch.none, !torch.int -> !torch.vtensor<[?],si64>
    return %0 : !torch.vtensor<[?],si64>
  }
}


// -----// IR Dump After ConvertTorchToLinalg //----- //
// Function-scope dump: torch.aten.bincount is still present here, i.e. this
// dump shows no lowering of the op yet.
func @forward(%arg0: !torch.vtensor<[?],si64>) -> !torch.vtensor<[?],si64> {
  %none = torch.constant.none
  %int0 = torch.constant.int 0
  %0 = torch.aten.bincount %arg0, %none, %int0 : !torch.vtensor<[?],si64>, !torch.none, !torch.int -> !torch.vtensor<[?],si64>
  return %0 : !torch.vtensor<[?],si64>
}

// Module snapshot printed without its own "IR Dump After" banner. The lowered
// ops (%0-%16) sit next to the original torch.aten.bincount (%17), which still
// feeds the return; the scatter result %16 has no user in this snapshot.
module attributes {torch.debug_module_name = "BincountModule"} {
  func @forward(%arg0: !torch.vtensor<[?],si64>) -> !torch.vtensor<[?],si64> {
    // Bridge the torch-typed input and the integer constant to builtin types.
    %0 = builtin.unrealized_conversion_cast %arg0 : !torch.vtensor<[?],si64> to tensor<?xi64>
    %none = torch.constant.none
    %int0 = torch.constant.int 0
    %1 = builtin.unrealized_conversion_cast %int0 : !torch.int to i64
    %c1 = arith.constant 1 : index
    %c0_i64 = arith.constant 0 : i64
    // One-element accumulator (size operand is %c1), initialised to 0.
    %2 = linalg.init_tensor [%c1] : tensor<?xi64>
    %3 = linalg.fill(%c0_i64, %2) : i64, tensor<?xi64> -> tensor<?xi64>
    // Reduction over the input (d0) into the accumulator (d1): keeps the larger
    // of the two values under an unsigned comparison (ugt), i.e. the maximum.
    %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>], iterator_types = ["reduction", "parallel"]} ins(%0 : tensor<?xi64>) outs(%3 : tensor<?xi64>) {
    ^bb0(%arg1: i64, %arg2: i64):
      %18 = arith.cmpi ugt, %arg1, %arg2 : i64
      %19 = arith.select %18, %arg1, %arg2 : i64
      linalg.yield %19 : i64
    } -> tensor<?xi64>
    // Read the maximum back out of element 0 of the accumulator.
    %5 = arith.index_cast %c0_i64 : i64 to index
    %6 = tensor.extract %4[%5] : tensor<?xi64>
    %c0 = arith.constant 0 : index
    %7 = tensor.dim %0, %c0 : tensor<?xi64>
    %c1_i32 = arith.constant 1 : i32
    // Scatter updates: one i32 `1` per input element.
    %8 = linalg.init_tensor [%7] : tensor<?xi32>
    %9 = linalg.fill(%c1_i32, %8) : i32, tensor<?xi32> -> tensor<?xi32>
    // Scatter indices: the input reshaped to ?x1 and narrowed from i64 to i32.
    %10 = tensor.expand_shape %0 [[0, 1]] : tensor<?xi64> into tensor<?x1xi64>
    // This tensor-level arith.trunci is the op the diagnostic at the end of
    // the log reports as "failed to legalize".
    %11 = arith.trunci %10 : tensor<?x1xi64> to tensor<?x1xi32>
    // Result length = unsigned max(max input value, %1).
    // NOTE(review): torch.bincount documents a result of size max(input) + 1;
    // no +1 is visible here - confirm.
    %12 = arith.maxui %6, %1 : i64
    %13 = arith.index_cast %12 : i64 to index
    %14 = linalg.init_tensor [%13] : tensor<?xi32>
    // NOTE(review): the scatter destination is filled with %c1_i32 (1), not 0,
    // and the scatter region below adds into it - confirm the counts are not
    // meant to start from zero.
    %15 = linalg.fill(%c1_i32, %14) : i32, tensor<?xi32> -> tensor<?xi32>
    %16 = tm_tensor.scatter unique_indices(false) ins(%9, %11 : tensor<?xi32>, tensor<?x1xi32>) outs(%15 : tensor<?xi32>) {
    ^bb0(%arg1: i32, %arg2: i32):
      %18 = arith.addi %arg1, %arg2 : i32
      tm_tensor.yield %18 : i32
    } -> tensor<?xi32>
    // Original op, still present and still the value that is returned.
    %17 = torch.aten.bincount %arg0, %none, %int0 : !torch.vtensor<[?],si64>, !torch.none, !torch.int -> !torch.vtensor<[?],si64>
    return %17 : !torch.vtensor<[?],si64>
  }
}
// -----// IR Dump After ConvertLowerAlgorithmicOps //----- //
// In this dump torch.aten.bincount is gone: the function returns the scatter
// result widened back to i64 (%17) and wrapped as a torch value tensor (%19).
// The type bridges are now torch_c.to_builtin_tensor / torch_c.to_i64.
func @forward(%arg0: !torch.vtensor<[?],si64>) -> !torch.vtensor<[?],si64> {
  %0 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[?],si64> -> tensor<?xi64>
  %none = torch.constant.none
  %int0 = torch.constant.int 0
  %1 = torch_c.to_i64 %int0
  %c1 = arith.constant 1 : index
  %c0_i64 = arith.constant 0 : i64
  %2 = linalg.init_tensor [%c1] : tensor<?xi64>
  %3 = linalg.fill(%c0_i64, %2) : i64, tensor<?xi64> -> tensor<?xi64>
  // Unsigned-max reduction of the input into the one-element accumulator.
  %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>], iterator_types = ["reduction", "parallel"]} ins(%0 : tensor<?xi64>) outs(%3 : tensor<?xi64>) {
  ^bb0(%arg1: i64, %arg2: i64):
    %20 = arith.cmpi ugt, %arg1, %arg2 : i64
    %21 = arith.select %20, %arg1, %arg2 : i64
    linalg.yield %21 : i64
  } -> tensor<?xi64>
  %5 = arith.index_cast %c0_i64 : i64 to index
  %6 = tensor.extract %4[%5] : tensor<?xi64>
  %c0 = arith.constant 0 : index
  %7 = tensor.dim %0, %c0 : tensor<?xi64>
  %c1_i32 = arith.constant 1 : i32
  %8 = linalg.init_tensor [%7] : tensor<?xi32>
  %9 = linalg.fill(%c1_i32, %8) : i32, tensor<?xi32> -> tensor<?xi32>
  %10 = tensor.expand_shape %0 [[0, 1]] : tensor<?xi64> into tensor<?x1xi64>
  // Tensor-level truncation of the indices (i64 -> i32).
  %11 = arith.trunci %10 : tensor<?x1xi64> to tensor<?x1xi32>
  %12 = arith.maxui %6, %1 : i64
  %13 = arith.index_cast %12 : i64 to index
  %14 = linalg.init_tensor [%13] : tensor<?xi32>
  %15 = linalg.fill(%c1_i32, %14) : i32, tensor<?xi32> -> tensor<?xi32>
  %16 = tm_tensor.scatter unique_indices(false) ins(%9, %11 : tensor<?xi32>, tensor<?x1xi32>) outs(%15 : tensor<?xi32>) {
  ^bb0(%arg1: i32, %arg2: i32):
    %20 = arith.addi %arg1, %arg2 : i32
    tm_tensor.yield %20 : i32
  } -> tensor<?xi32>
  // Tensor-level zero-extension of the i32 result to i64.
  %17 = arith.extui %16 : tensor<?xi32> to tensor<?xi64>
  // Source and result types of this cast are identical.
  %18 = tensor.cast %17 : tensor<?xi64> to tensor<?xi64>
  %19 = torch_c.from_builtin_tensor %18 : tensor<?xi64> -> !torch.vtensor<[?],si64>
  return %19 : !torch.vtensor<[?],si64>
}

// -----// IR Dump After ConvertTorchToStd //----- //
// Visible change in this dump: torch.constant.int 0 has been replaced by an
// arith.constant routed through a torch_c.from_i64 / torch_c.to_i64 pair
// (%1, %2), leaving two i64 zero constants (%c0_i64 and %c0_i64_0).
func @forward(%arg0: !torch.vtensor<[?],si64>) -> !torch.vtensor<[?],si64> {
  %0 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[?],si64> -> tensor<?xi64>
  %none = torch.constant.none
  %c0_i64 = arith.constant 0 : i64
  %1 = torch_c.from_i64 %c0_i64
  %2 = torch_c.to_i64 %1
  %c1 = arith.constant 1 : index
  %c0_i64_0 = arith.constant 0 : i64
  %3 = linalg.init_tensor [%c1] : tensor<?xi64>
  %4 = linalg.fill(%c0_i64_0, %3) : i64, tensor<?xi64> -> tensor<?xi64>
  // Unsigned-max reduction of the input into the one-element accumulator.
  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>], iterator_types = ["reduction", "parallel"]} ins(%0 : tensor<?xi64>) outs(%4 : tensor<?xi64>) {
  ^bb0(%arg1: i64, %arg2: i64):
    %21 = arith.cmpi ugt, %arg1, %arg2 : i64
    %22 = arith.select %21, %arg1, %arg2 : i64
    linalg.yield %22 : i64
  } -> tensor<?xi64>
  %6 = arith.index_cast %c0_i64_0 : i64 to index
  %7 = tensor.extract %5[%6] : tensor<?xi64>
  %c0 = arith.constant 0 : index
  %8 = tensor.dim %0, %c0 : tensor<?xi64>
  %c1_i32 = arith.constant 1 : i32
  %9 = linalg.init_tensor [%8] : tensor<?xi32>
  %10 = linalg.fill(%c1_i32, %9) : i32, tensor<?xi32> -> tensor<?xi32>
  %11 = tensor.expand_shape %0 [[0, 1]] : tensor<?xi64> into tensor<?x1xi64>
  // Tensor-level truncation of the indices (i64 -> i32).
  %12 = arith.trunci %11 : tensor<?x1xi64> to tensor<?x1xi32>
  %13 = arith.maxui %7, %2 : i64
  %14 = arith.index_cast %13 : i64 to index
  %15 = linalg.init_tensor [%14] : tensor<?xi32>
  %16 = linalg.fill(%c1_i32, %15) : i32, tensor<?xi32> -> tensor<?xi32>
  %17 = tm_tensor.scatter unique_indices(false) ins(%10, %12 : tensor<?xi32>, tensor<?x1xi32>) outs(%16 : tensor<?xi32>) {
  ^bb0(%arg1: i32, %arg2: i32):
    %21 = arith.addi %arg1, %arg2 : i32
    tm_tensor.yield %21 : i32
  } -> tensor<?xi32>
  %18 = arith.extui %17 : tensor<?xi32> to tensor<?xi64>
  // Source and result types of this cast are identical.
  %19 = tensor.cast %18 : tensor<?xi64> to tensor<?xi64>
  %20 = torch_c.from_builtin_tensor %19 : tensor<?xi64> -> !torch.vtensor<[?],si64>
  return %20 : !torch.vtensor<[?],si64>
}

// -----// IR Dump After ConvertTorchToSCF //----- //
// Visible change in this dump: the same-type tensor.cast on the result is no
// longer present; the arith.extui result (%18) feeds
// torch_c.from_builtin_tensor directly.
func @forward(%arg0: !torch.vtensor<[?],si64>) -> !torch.vtensor<[?],si64> {
  %0 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[?],si64> -> tensor<?xi64>
  %none = torch.constant.none
  %c0_i64 = arith.constant 0 : i64
  %1 = torch_c.from_i64 %c0_i64
  %2 = torch_c.to_i64 %1
  %c1 = arith.constant 1 : index
  %c0_i64_0 = arith.constant 0 : i64
  %3 = linalg.init_tensor [%c1] : tensor<?xi64>
  %4 = linalg.fill(%c0_i64_0, %3) : i64, tensor<?xi64> -> tensor<?xi64>
  // Unsigned-max reduction of the input into the one-element accumulator.
  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>], iterator_types = ["reduction", "parallel"]} ins(%0 : tensor<?xi64>) outs(%4 : tensor<?xi64>) {
  ^bb0(%arg1: i64, %arg2: i64):
    %20 = arith.cmpi ugt, %arg1, %arg2 : i64
    %21 = arith.select %20, %arg1, %arg2 : i64
    linalg.yield %21 : i64
  } -> tensor<?xi64>
  %6 = arith.index_cast %c0_i64_0 : i64 to index
  %7 = tensor.extract %5[%6] : tensor<?xi64>
  %c0 = arith.constant 0 : index
  %8 = tensor.dim %0, %c0 : tensor<?xi64>
  %c1_i32 = arith.constant 1 : i32
  %9 = linalg.init_tensor [%8] : tensor<?xi32>
  %10 = linalg.fill(%c1_i32, %9) : i32, tensor<?xi32> -> tensor<?xi32>
  %11 = tensor.expand_shape %0 [[0, 1]] : tensor<?xi64> into tensor<?x1xi64>
  // Tensor-level truncation of the indices (i64 -> i32).
  %12 = arith.trunci %11 : tensor<?x1xi64> to tensor<?x1xi32>
  %13 = arith.maxui %7, %2 : i64
  %14 = arith.index_cast %13 : i64 to index
  %15 = linalg.init_tensor [%14] : tensor<?xi32>
  %16 = linalg.fill(%c1_i32, %15) : i32, tensor<?xi32> -> tensor<?xi32>
  %17 = tm_tensor.scatter unique_indices(false) ins(%10, %12 : tensor<?xi32>, tensor<?x1xi32>) outs(%16 : tensor<?xi32>) {
  ^bb0(%arg1: i32, %arg2: i32):
    %20 = arith.addi %arg1, %arg2 : i32
    tm_tensor.yield %20 : i32
  } -> tensor<?xi32>
  %18 = arith.extui %17 : tensor<?xi32> to tensor<?xi64>
  %19 = torch_c.from_builtin_tensor %18 : tensor<?xi64> -> !torch.vtensor<[?],si64>
  return %19 : !torch.vtensor<[?],si64>
}

// -----// IR Dump After ExpandOps //----- //
// This dump appears textually unchanged from the preceding ConvertTorchToSCF
// dump; in particular arith.maxui, arith.trunci and arith.extui are all still
// present in the same form.
func @forward(%arg0: !torch.vtensor<[?],si64>) -> !torch.vtensor<[?],si64> {
  %0 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[?],si64> -> tensor<?xi64>
  %none = torch.constant.none
  %c0_i64 = arith.constant 0 : i64
  %1 = torch_c.from_i64 %c0_i64
  %2 = torch_c.to_i64 %1
  %c1 = arith.constant 1 : index
  %c0_i64_0 = arith.constant 0 : i64
  %3 = linalg.init_tensor [%c1] : tensor<?xi64>
  %4 = linalg.fill(%c0_i64_0, %3) : i64, tensor<?xi64> -> tensor<?xi64>
  // Unsigned-max reduction of the input into the one-element accumulator.
  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>], iterator_types = ["reduction", "parallel"]} ins(%0 : tensor<?xi64>) outs(%4 : tensor<?xi64>) {
  ^bb0(%arg1: i64, %arg2: i64):
    %20 = arith.cmpi ugt, %arg1, %arg2 : i64
    %21 = arith.select %20, %arg1, %arg2 : i64
    linalg.yield %21 : i64
  } -> tensor<?xi64>
  %6 = arith.index_cast %c0_i64_0 : i64 to index
  %7 = tensor.extract %5[%6] : tensor<?xi64>
  %c0 = arith.constant 0 : index
  %8 = tensor.dim %0, %c0 : tensor<?xi64>
  %c1_i32 = arith.constant 1 : i32
  %9 = linalg.init_tensor [%8] : tensor<?xi32>
  %10 = linalg.fill(%c1_i32, %9) : i32, tensor<?xi32> -> tensor<?xi32>
  %11 = tensor.expand_shape %0 [[0, 1]] : tensor<?xi64> into tensor<?x1xi64>
  // Tensor-level truncation of the indices (i64 -> i32).
  %12 = arith.trunci %11 : tensor<?x1xi64> to tensor<?x1xi32>
  %13 = arith.maxui %7, %2 : i64
  %14 = arith.index_cast %13 : i64 to index
  %15 = linalg.init_tensor [%14] : tensor<?xi32>
  %16 = linalg.fill(%c1_i32, %15) : i32, tensor<?xi32> -> tensor<?xi32>
  %17 = tm_tensor.scatter unique_indices(false) ins(%10, %12 : tensor<?xi32>, tensor<?x1xi32>) outs(%16 : tensor<?xi32>) {
  ^bb0(%arg1: i32, %arg2: i32):
    %20 = arith.addi %arg1, %arg2 : i32
    tm_tensor.yield %20 : i32
  } -> tensor<?xi32>
  %18 = arith.extui %17 : tensor<?xi32> to tensor<?xi64>
  %19 = torch_c.from_builtin_tensor %18 : tensor<?xi64> -> !torch.vtensor<[?],si64>
  return %19 : !torch.vtensor<[?],si64>
}

// -----// IR Dump After Canonicalizer //----- //
// Visible changes in this dump: the constants are grouped at the top of the
// function and the second i64 zero, the index 1 and %none are gone; the
// accumulator is now statically shaped (tensor<1xi64>); tensor.extract indexes
// with the constant %c0 directly.
func @forward(%arg0: !torch.vtensor<[?],si64>) -> !torch.vtensor<[?],si64> {
  %c1_i32 = arith.constant 1 : i32
  %c0 = arith.constant 0 : index
  %c0_i64 = arith.constant 0 : i64
  %0 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[?],si64> -> tensor<?xi64>
  %1 = torch_c.from_i64 %c0_i64
  %2 = torch_c.to_i64 %1
  %3 = linalg.init_tensor [1] : tensor<1xi64>
  %4 = linalg.fill(%c0_i64, %3) : i64, tensor<1xi64> -> tensor<1xi64>
  // Unsigned-max reduction of the input into the one-element accumulator.
  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>], iterator_types = ["reduction", "parallel"]} ins(%0 : tensor<?xi64>) outs(%4 : tensor<1xi64>) {
  ^bb0(%arg1: i64, %arg2: i64):
    %19 = arith.cmpi ugt, %arg1, %arg2 : i64
    %20 = arith.select %19, %arg1, %arg2 : i64
    linalg.yield %20 : i64
  } -> tensor<1xi64>
  %6 = tensor.extract %5[%c0] : tensor<1xi64>
  %7 = tensor.dim %0, %c0 : tensor<?xi64>
  %8 = linalg.init_tensor [%7] : tensor<?xi32>
  %9 = linalg.fill(%c1_i32, %8) : i32, tensor<?xi32> -> tensor<?xi32>
  %10 = tensor.expand_shape %0 [[0, 1]] : tensor<?xi64> into tensor<?x1xi64>
  // Tensor-level truncation of the indices (i64 -> i32).
  %11 = arith.trunci %10 : tensor<?x1xi64> to tensor<?x1xi32>
  %12 = arith.maxui %6, %2 : i64
  %13 = arith.index_cast %12 : i64 to index
  %14 = linalg.init_tensor [%13] : tensor<?xi32>
  %15 = linalg.fill(%c1_i32, %14) : i32, tensor<?xi32> -> tensor<?xi32>
  %16 = tm_tensor.scatter unique_indices(false) ins(%9, %11 : tensor<?xi32>, tensor<?x1xi32>) outs(%15 : tensor<?xi32>) {
  ^bb0(%arg1: i32, %arg2: i32):
    %19 = arith.addi %arg1, %arg2 : i32
    tm_tensor.yield %19 : i32
  } -> tensor<?xi32>
  %17 = arith.extui %16 : tensor<?xi32> to tensor<?xi64>
  %18 = torch_c.from_builtin_tensor %17 : tensor<?xi64> -> !torch.vtensor<[?],si64>
  return %18 : !torch.vtensor<[?],si64>
}

// -----// IR Dump After ResolveShapedTypeResultDims //----- //
// This dump appears textually unchanged from the preceding Canonicalizer dump.
func @forward(%arg0: !torch.vtensor<[?],si64>) -> !torch.vtensor<[?],si64> {
  %c1_i32 = arith.constant 1 : i32
  %c0 = arith.constant 0 : index
  %c0_i64 = arith.constant 0 : i64
  %0 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[?],si64> -> tensor<?xi64>
  %1 = torch_c.from_i64 %c0_i64
  %2 = torch_c.to_i64 %1
  %3 = linalg.init_tensor [1] : tensor<1xi64>
  %4 = linalg.fill(%c0_i64, %3) : i64, tensor<1xi64> -> tensor<1xi64>
  // Unsigned-max reduction of the input into the one-element accumulator.
  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>], iterator_types = ["reduction", "parallel"]} ins(%0 : tensor<?xi64>) outs(%4 : tensor<1xi64>) {
  ^bb0(%arg1: i64, %arg2: i64):
    %19 = arith.cmpi ugt, %arg1, %arg2 : i64
    %20 = arith.select %19, %arg1, %arg2 : i64
    linalg.yield %20 : i64
  } -> tensor<1xi64>
  %6 = tensor.extract %5[%c0] : tensor<1xi64>
  %7 = tensor.dim %0, %c0 : tensor<?xi64>
  %8 = linalg.init_tensor [%7] : tensor<?xi32>
  %9 = linalg.fill(%c1_i32, %8) : i32, tensor<?xi32> -> tensor<?xi32>
  %10 = tensor.expand_shape %0 [[0, 1]] : tensor<?xi64> into tensor<?x1xi64>
  // Tensor-level truncation of the indices (i64 -> i32).
  %11 = arith.trunci %10 : tensor<?x1xi64> to tensor<?x1xi32>
  %12 = arith.maxui %6, %2 : i64
  %13 = arith.index_cast %12 : i64 to index
  %14 = linalg.init_tensor [%13] : tensor<?xi32>
  %15 = linalg.fill(%c1_i32, %14) : i32, tensor<?xi32> -> tensor<?xi32>
  %16 = tm_tensor.scatter unique_indices(false) ins(%9, %11 : tensor<?xi32>, tensor<?x1xi32>) outs(%15 : tensor<?xi32>) {
  ^bb0(%arg1: i32, %arg2: i32):
    %19 = arith.addi %arg1, %arg2 : i32
    tm_tensor.yield %19 : i32
  } -> tensor<?xi32>
  %17 = arith.extui %16 : tensor<?xi32> to tensor<?xi64>
  %18 = torch_c.from_builtin_tensor %17 : tensor<?xi64> -> !torch.vtensor<[?],si64>
  return %18 : !torch.vtensor<[?],si64>
}

// -----// IR Dump After CSE //----- //
// This dump appears textually unchanged from the two dumps before it
// (Canonicalizer, ResolveShapedTypeResultDims).
func @forward(%arg0: !torch.vtensor<[?],si64>) -> !torch.vtensor<[?],si64> {
  %c1_i32 = arith.constant 1 : i32
  %c0 = arith.constant 0 : index
  %c0_i64 = arith.constant 0 : i64
  %0 = torch_c.to_builtin_tensor %arg0 : !torch.vtensor<[?],si64> -> tensor<?xi64>
  %1 = torch_c.from_i64 %c0_i64
  %2 = torch_c.to_i64 %1
  %3 = linalg.init_tensor [1] : tensor<1xi64>
  %4 = linalg.fill(%c0_i64, %3) : i64, tensor<1xi64> -> tensor<1xi64>
  // Unsigned-max reduction of the input into the one-element accumulator.
  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>], iterator_types = ["reduction", "parallel"]} ins(%0 : tensor<?xi64>) outs(%4 : tensor<1xi64>) {
  ^bb0(%arg1: i64, %arg2: i64):
    %19 = arith.cmpi ugt, %arg1, %arg2 : i64
    %20 = arith.select %19, %arg1, %arg2 : i64
    linalg.yield %20 : i64
  } -> tensor<1xi64>
  %6 = tensor.extract %5[%c0] : tensor<1xi64>
  %7 = tensor.dim %0, %c0 : tensor<?xi64>
  %8 = linalg.init_tensor [%7] : tensor<?xi32>
  %9 = linalg.fill(%c1_i32, %8) : i32, tensor<?xi32> -> tensor<?xi32>
  %10 = tensor.expand_shape %0 [[0, 1]] : tensor<?xi64> into tensor<?x1xi64>
  // Tensor-level truncation of the indices (i64 -> i32).
  %11 = arith.trunci %10 : tensor<?x1xi64> to tensor<?x1xi32>
  %12 = arith.maxui %6, %2 : i64
  %13 = arith.index_cast %12 : i64 to index
  %14 = linalg.init_tensor [%13] : tensor<?xi32>
  %15 = linalg.fill(%c1_i32, %14) : i32, tensor<?xi32> -> tensor<?xi32>
  %16 = tm_tensor.scatter unique_indices(false) ins(%9, %11 : tensor<?xi32>, tensor<?x1xi32>) outs(%15 : tensor<?xi32>) {
  ^bb0(%arg1: i32, %arg2: i32):
    %19 = arith.addi %arg1, %arg2 : i32
    tm_tensor.yield %19 : i32
  } -> tensor<?xi32>
  %17 = arith.extui %16 : tensor<?xi32> to tensor<?xi64>
  %18 = torch_c.from_builtin_tensor %17 : tensor<?xi64> -> !torch.vtensor<[?],si64>
  return %18 : !torch.vtensor<[?],si64>
}

// -----// IR Dump After FuncBackendTypeConversion //----- //
// Module-scope dump. The two indexing maps of the reduction are printed as the
// aliases #map0 / #map1, and @forward's signature now uses builtin tensor
// types; torch_c casts at the entry (%0, %1) and before the return (%19, %20)
// bridge to and from the torch value-tensor type used by the body.
#map0 = affine_map<(d0, d1) -> (d0)>
#map1 = affine_map<(d0, d1) -> (d1)>
module attributes {torch.debug_module_name = "BincountModule"} {
  func @forward(%arg0: tensor<?xi64>) -> tensor<?xi64> {
    %0 = torch_c.from_builtin_tensor %arg0 : tensor<?xi64> -> !torch.vtensor<[?],si64>
    %c1_i32 = arith.constant 1 : i32
    %c0 = arith.constant 0 : index
    %c0_i64 = arith.constant 0 : i64
    %1 = torch_c.to_builtin_tensor %0 : !torch.vtensor<[?],si64> -> tensor<?xi64>
    %2 = torch_c.from_i64 %c0_i64
    %3 = torch_c.to_i64 %2
    %4 = linalg.init_tensor [1] : tensor<1xi64>
    %5 = linalg.fill(%c0_i64, %4) : i64, tensor<1xi64> -> tensor<1xi64>
    // Unsigned-max reduction of the input into the one-element accumulator.
    %6 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["reduction", "parallel"]} ins(%1 : tensor<?xi64>) outs(%5 : tensor<1xi64>) {
    ^bb0(%arg1: i64, %arg2: i64):
      %21 = arith.cmpi ugt, %arg1, %arg2 : i64
      %22 = arith.select %21, %arg1, %arg2 : i64
      linalg.yield %22 : i64
    } -> tensor<1xi64>
    %7 = tensor.extract %6[%c0] : tensor<1xi64>
    %8 = tensor.dim %1, %c0 : tensor<?xi64>
    %9 = linalg.init_tensor [%8] : tensor<?xi32>
    %10 = linalg.fill(%c1_i32, %9) : i32, tensor<?xi32> -> tensor<?xi32>
    %11 = tensor.expand_shape %1 [[0, 1]] : tensor<?xi64> into tensor<?x1xi64>
    // Tensor-level truncation of the indices (i64 -> i32).
    %12 = arith.trunci %11 : tensor<?x1xi64> to tensor<?x1xi32>
    %13 = arith.maxui %7, %3 : i64
    %14 = arith.index_cast %13 : i64 to index
    %15 = linalg.init_tensor [%14] : tensor<?xi32>
    %16 = linalg.fill(%c1_i32, %15) : i32, tensor<?xi32> -> tensor<?xi32>
    %17 = tm_tensor.scatter unique_indices(false) ins(%10, %12 : tensor<?xi32>, tensor<?x1xi32>) outs(%16 : tensor<?xi32>) {
    ^bb0(%arg1: i32, %arg2: i32):
      %21 = arith.addi %arg1, %arg2 : i32
      tm_tensor.yield %21 : i32
    } -> tensor<?xi32>
    %18 = arith.extui %17 : tensor<?xi32> to tensor<?xi64>
    %19 = torch_c.from_builtin_tensor %18 : tensor<?xi64> -> !torch.vtensor<[?],si64>
    %20 = torch_c.to_builtin_tensor %19 : !torch.vtensor<[?],si64> -> tensor<?xi64>
    return %20 : tensor<?xi64>
  }
}


// -----// IR Dump After FinalizingBackendTypeConversion //----- //
// Last dump before the diagnostics. No torch_c ops remain: the body reads
// %arg0 directly and arith.maxui takes the constant %c0_i64 as its second
// operand. What is left is linalg / tensor / tm_tensor ops plus arith ops,
// two of which (%8 trunci, %14 extui) act on whole tensors.
func @forward(%arg0: tensor<?xi64>) -> tensor<?xi64> {
  %c1_i32 = arith.constant 1 : i32
  %c0 = arith.constant 0 : index
  %c0_i64 = arith.constant 0 : i64
  %0 = linalg.init_tensor [1] : tensor<1xi64>
  %1 = linalg.fill(%c0_i64, %0) : i64, tensor<1xi64> -> tensor<1xi64>
  // Unsigned-max reduction of the input into the one-element accumulator.
  %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>], iterator_types = ["reduction", "parallel"]} ins(%arg0 : tensor<?xi64>) outs(%1 : tensor<1xi64>) {
  ^bb0(%arg1: i64, %arg2: i64):
    %15 = arith.cmpi ugt, %arg1, %arg2 : i64
    %16 = arith.select %15, %arg1, %arg2 : i64
    linalg.yield %16 : i64
  } -> tensor<1xi64>
  %3 = tensor.extract %2[%c0] : tensor<1xi64>
  %4 = tensor.dim %arg0, %c0 : tensor<?xi64>
  %5 = linalg.init_tensor [%4] : tensor<?xi32>
  %6 = linalg.fill(%c1_i32, %5) : i32, tensor<?xi32> -> tensor<?xi32>
  %7 = tensor.expand_shape %arg0 [[0, 1]] : tensor<?xi64> into tensor<?x1xi64>
  // Tensor-level arith.trunci: the operation the diagnostic that follows this
  // dump reports as "failed to legalize".
  // NOTE(review): a likely fix in the lowering is to emit the narrowing as an
  // elementwise linalg.generic with a scalar arith.trunci in its body -
  // confirm against the backend contract.
  %8 = arith.trunci %7 : tensor<?x1xi64> to tensor<?x1xi32>
  // NOTE(review): result length is max(input) with no +1, and the scatter
  // destination below is filled with 1 rather than 0 - confirm both against
  // torch.bincount's documented semantics.
  %9 = arith.maxui %3, %c0_i64 : i64
  %10 = arith.index_cast %9 : i64 to index
  %11 = linalg.init_tensor [%10] : tensor<?xi32>
  %12 = linalg.fill(%c1_i32, %11) : i32, tensor<?xi32> -> tensor<?xi32>
  %13 = tm_tensor.scatter unique_indices(false) ins(%6, %8 : tensor<?xi32>, tensor<?x1xi32>) outs(%12 : tensor<?xi32>) {
  ^bb0(%arg1: i32, %arg2: i32):
    %15 = arith.addi %arg1, %arg2 : i32
    tm_tensor.yield %15 : i32
  } -> tensor<?xi32>
  // NOTE(review): this arith.extui is also applied to a whole tensor; only
  // arith.trunci is named in the diagnostic - confirm whether this op would be
  // rejected as well once trunci is fixed.
  %14 = arith.extui %13 : tensor<?xi32> to tensor<?xi64>
  return %14 : tensor<?xi64>
}

/home/vivek/work/02_07/vivekkhandelwal1-torch-mlir/e2e_testing/torchscript/basic.py:1323:15: error: failed to legalize operation 'arith.trunci'
        return torch.bincount(x)
              ^
/home/vivek/work/02_07/vivekkhandelwal1-torch-mlir/e2e_testing/torchscript/basic.py:1323:15: note: see current operation: %11 = "arith.trunci"(%10) : (tensor<?x1xi64>) -> tensor<?x1xi32>
<unknown>:0: error: Module does not conform to the linalg-on-tensors backend contract. See dialect conversion legality information above.
// -----// IR Dump After VerifyLinalgOnTensorsBackendContract Failed //----- //
// Generic-form (fully quoted op names, explicit attributes and regions) print
// of the module at the point the backend-contract verification failed. The
// ops match the FinalizingBackendTypeConversion dump one-for-one, with SSA
// values renumbered because the constants are now %0-%2.
#map0 = affine_map<(d0, d1) -> (d0)>
#map1 = affine_map<(d0, d1) -> (d1)>
"builtin.module"() ({
  "builtin.func"() ({
  ^bb0(%arg0: tensor<?xi64>):
    %0 = "arith.constant"() {value = 1 : i32} : () -> i32
    %1 = "arith.constant"() {value = 0 : index} : () -> index
    %2 = "arith.constant"() {value = 0 : i64} : () -> i64
    %3 = "linalg.init_tensor"() {static_sizes = [1]} : () -> tensor<1xi64>
    %4 = "linalg.fill"(%2, %3) ({
    ^bb0(%arg1: i64, %arg2: i64):
      "linalg.yield"(%arg1) : (i64) -> ()
    }) : (i64, tensor<1xi64>) -> tensor<1xi64>
    // Unsigned-max reduction; predicate = 8 is the generic-form spelling of
    // the `ugt` comparison shown in the pretty-printed dumps.
    %5 = "linalg.generic"(%arg0, %4) ({
    ^bb0(%arg1: i64, %arg2: i64):
      %18 = "arith.cmpi"(%arg1, %arg2) {predicate = 8 : i64} : (i64, i64) -> i1
      %19 = "arith.select"(%18, %arg1, %arg2) : (i1, i64, i64) -> i64
      "linalg.yield"(%19) : (i64) -> ()
    }) {indexing_maps = [#map0, #map1], iterator_types = ["reduction", "parallel"], operand_segment_sizes = dense<1> : vector<2xi32>} : (tensor<?xi64>, tensor<1xi64>) -> tensor<1xi64>
    %6 = "tensor.extract"(%5, %1) : (tensor<1xi64>, index) -> i64
    %7 = "tensor.dim"(%arg0, %1) : (tensor<?xi64>, index) -> index
    %8 = "linalg.init_tensor"(%7) {static_sizes = [-1]} : (index) -> tensor<?xi32>
    %9 = "linalg.fill"(%0, %8) ({
    ^bb0(%arg1: i32, %arg2: i32):
      "linalg.yield"(%arg1) : (i32) -> ()
    }) : (i32, tensor<?xi32>) -> tensor<?xi32>
    %10 = "tensor.expand_shape"(%arg0) {reassociation = [[0, 1]]} : (tensor<?xi64>) -> tensor<?x1xi64>
    // This is the exact operation quoted in the "see current operation" note
    // of the diagnostic: a tensor-level arith.trunci from i64 to i32.
    %11 = "arith.trunci"(%10) : (tensor<?x1xi64>) -> tensor<?x1xi32>
    %12 = "arith.maxui"(%6, %2) : (i64, i64) -> i64
    %13 = "arith.index_cast"(%12) : (i64) -> index
    %14 = "linalg.init_tensor"(%13) {static_sizes = [-1]} : (index) -> tensor<?xi32>
    %15 = "linalg.fill"(%0, %14) ({
    ^bb0(%arg1: i32, %arg2: i32):
      "linalg.yield"(%arg1) : (i32) -> ()
    }) : (i32, tensor<?xi32>) -> tensor<?xi32>
    // Scatter with operand segments [2, 1]: two inputs (%9 updates, %11
    // indices) and one output (%15); the region adds its two block arguments.
    %16 = "tm_tensor.scatter"(%9, %11, %15) ({
    ^bb0(%arg1: i32, %arg2: i32):
      %18 = "arith.addi"(%arg1, %arg2) : (i32, i32) -> i32
      "tm_tensor.yield"(%18) : (i32) -> ()
    }) {operand_segment_sizes = dense<[2, 1]> : vector<2xi32>, unique_indices = false} : (tensor<?xi32>, tensor<?x1xi32>, tensor<?xi32>) -> tensor<?xi32>
    %17 = "arith.extui"(%16) : (tensor<?xi32>) -> tensor<?xi64>
    "std.return"(%17) : (tensor<?xi64>) -> ()
  }) {sym_name = "forward", type = (tensor<?xi64>) -> tensor<?xi64>} : () -> ()
}) {torch.debug_module_name = "BincountModule"} : () -> ()