Skip to content

Instantly share code, notes, and snippets.

@davidberard98
Last active November 30, 2023 01:45
Show Gist options
  • Save davidberard98/5927855a91e4818a32908493dcdb7e38 to your computer and use it in GitHub Desktop.
Save davidberard98/5927855a91e4818a32908493dcdb7e38 to your computer and use it in GitHub Desktop.
// -----// IR Dump Before Inliner (inline) ('builtin.module' operation) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)
#loc64 = loc(unknown)
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)
#loc87 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)
module {
// Public kernel entry point @triton__0d1d23de.
//
// Arguments (as declared in the signature):
//   %arg0: !tt.ptr<f64, 1>, tt.divisibility = 16 -- base pointer for the two masked loads.
//   %arg1: !tt.ptr<i64, 1>, tt.divisibility = 16 -- base pointer for the final store.
//   %arg2: i32 -- not referenced by any op in this body.
//   %arg3: i32, tt.divisibility = 16 -- not referenced by any op in this body.
//
// Summary of the visible ops: build a 1x32 index row, perform two masked f64
// loads from %arg0, compute (sum of selected values) / (count of selected
// lanes), fill lanes outside the 1x32 bounds mask with -inf, reduce
// (value, index) pairs along axis 1 through @max_with_index..., and store the
// resulting index (sign-extended to i64) at %arg1 + 0.
// NOTE(review): this looks like a small averaging window followed by an
// argmax -- inferred from the op sequence only; confirm against the
// generating Python kernel.
//
// Several scalar arith.constant results (e.g. %c1_i32, %c32_i32) have no uses
// in this function; the dense tensor constants are the ones consumed.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
%c32_i32 = arith.constant 32 : i32 loc(#loc2)
// 1x1 index derived from the program id: %5 = program_id(x) * 1 + [0];
// %6 = (%5 < 1). %6 has no uses later in this function.
%0 = tt.get_program_id x : i32 loc(#loc3)
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4)
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4)
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5)
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6)
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7)
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7)
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8)
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8)
// 1x32 index row %8 = [0, 32) and its bounds mask %9 = (%8 < 32).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9)
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10)
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11)
// Split the index: %10 = %8 rem 2, %11 = %8 div 2 (both signed).
%c2_i32 = arith.constant 2 : i32 loc(#loc12)
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12)
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12)
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13)
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13)
// %12 compares the constant 0 against the constant 1 (1x1 i64 tensors).
%c0_i64 = arith.constant 0 : i64 loc(#loc14)
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14)
%c1_i64 = arith.constant 1 : i64 loc(#loc15)
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15)
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16)
// First lane mask: %14 = (%10 * 3) / 2, %17 = (%10 * 3) / 2 + 2,
// %20 = broadcast(%12) & (%14 < %17).
%c3_i32 = arith.constant 3 : i32 loc(#loc17)
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17)
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17)
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18)
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18)
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18)
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19)
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19)
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19)
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20)
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20)
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20)
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21)
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21)
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21)
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22)
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23)
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23)
// First load: element offset %24 = %11 * 3 + (%10 * 3) / 2 from %arg0,
// masked by %9 & %20 with 0.0 as the masked-off value; %30 then forces 0.0
// on lanes where %20 is false.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24)
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24)
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24)
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25)
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25)
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25)
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26)
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26)
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26)
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27)
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28)
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28)
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29)
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30)
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30)
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30)
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30)
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31)
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31)
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
// Second lane mask: %33 = (%10 * 3) / 2 + 1, %36 = broadcast(%12) & (%33 < %17).
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33)
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33)
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33)
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34)
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34)
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34)
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35)
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35)
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35)
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36)
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37)
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37)
// Second load: element offset %41 = %11 * 3 + 1 + (%10 * 3) / 2 from %arg0,
// masked by %9 & %36 with 0.0 as the masked-off value; %47 then forces 0.0
// on lanes where %36 is false.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38)
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38)
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38)
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39)
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39)
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39)
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40)
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40)
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40)
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41)
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41)
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41)
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42)
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43)
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43)
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44)
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45)
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45)
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45)
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45)
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46)
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46)
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47)
// %48 = sum of the two selected values; %51 = 1.0 per true mask (%20, %36)
// summed, i.e. the number of contributing lanes; %52 = %48 / %51.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48)
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49)
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49)
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50)
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50)
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51)
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51)
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51)
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52)
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52)
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52)
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53)
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54)
// 0xFF800000 is the f32 bit pattern of -infinity; lanes where the bounds
// mask %9 is false are replaced by -inf before the reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55)
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55)
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55)
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55)
// Reduce (value, index) pairs through the helper; only the index result
// %55#1 is consumed below.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56)
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57)
// Store the index, sign-extended to i64, at %arg1 + 0.
%c0_i32 = arith.constant 0 : i32 loc(#loc58)
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58)
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
// Reduces a (value, index) tensor pair along axis 1.
//   %arg0: tensor<1x32xf64> -- values.
//   %arg1: tensor<1x32xi32> -- indices paired with the values.
// Returns (tensor<1xf64>, tensor<1xi32>): the reduced value and its index.
// The combine region receives two (f64, i32) pairs and delegates the choice
// between them to @maximum_with_index__fp64_i32_fp64_i32__.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} {
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)):
// (%arg2, %arg3) and (%arg4, %arg5) are the two (value, index) pairs to combine.
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63)
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65)
} loc(#loc62)
// Picks one of two (value, index) pairs.
//   (%arg0, %arg1): first value (f64) and its index (i32).
//   (%arg2, %arg3): second value (f64) and its index (i32).
// Returns the first pair when %6 is true, otherwise the second pair.
// %6 is true when the first value is "greater" (ordered >, or first is NaN
// and second is not) or when the values are "equal" (ordered ==, or both
// NaN) and the first index is smaller than the second.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} {
// Ordered comparisons: both are false if either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67)
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68)
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69)
// When %2 is true, extend the comparisons with NaN handling.
%3:2 = scf.if %2 -> (i1, i1) {
// "x une x" is true only for NaN: %9 = first is NaN, %10 = second is NaN.
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71)
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72)
%true = arith.constant true loc(#loc73)
// %11 = not %10 (xor with true).
%11 = arith.xori %10, %true : i1 loc(#loc73)
// %13 = greater, or first is NaN while second is not.
%12 = arith.andi %9, %11 : i1 loc(#loc74)
%13 = arith.ori %0, %12 : i1 loc(#loc75)
// %15 = equal, or both are NaN.
%14 = arith.andi %9, %10 : i1 loc(#loc76)
%15 = arith.ori %1, %14 : i1 loc(#loc77)
scf.yield %13, %15 : i1, i1 loc(#loc77)
} else {
scf.yield %0, %1 : i1, i1 loc(#loc64)
} loc(#loc70)
// Tie-break on index: prefer the first pair when its index is smaller.
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78)
%5 = arith.andi %3#1, %4 : i1 loc(#loc79)
%6 = arith.ori %3#0, %5 : i1 loc(#loc80)
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81)
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82)
tt.return %7, %8 : f64, i32 loc(#loc83)
} loc(#loc66)
// Returns the constant `true` for its f64 argument.
// The result %0 of the @promote_to_tensor__fp64__ call has no uses here.
// NOTE(review): presumably the constant reflects specialization of a dtype
// check for fp64 -- inferred from the function name; confirm in the source.
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} {
%0 = tt.call @promote_to_tensor__fp64__(%arg0) : (f64) -> tensor<1xf64> loc(#loc85)
%true = arith.constant true loc(#loc86)
tt.return %true : i1 loc(#loc86)
} loc(#loc84)
// Turns a scalar f64 into a tensor<1xf64>.
//   %arg0: f64 scalar.
// Returns splat(%arg0) + uitofp(zeros), where zeros is the tensor<1xi1>
// produced by the @"zeros..." helper call below.
tt.func private @promote_to_tensor__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)) -> tensor<1xf64> attributes {noinline = false} {
%0 = tt.call @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc88)
%1 = tt.splat %arg0 : (f64) -> tensor<1xf64> loc(#loc89)
// Convert the i1 tensor to f64 so it can be added to the splatted value.
%2 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf64> loc(#loc89)
%3 = arith.addf %1, %2 : tensor<1xf64> loc(#loc89)
tt.return %3 : tensor<1xf64> loc(#loc90)
} loc(#loc87)
// Takes no arguments and returns a tensor<1xi1> whose single element is false.
// The scalar constant %false has no uses; only the dense constant is returned.
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} {
%false = arith.constant false loc(#loc92)
%cst = arith.constant dense<false> : tensor<1xi1> loc(#loc92)
tt.return %cst : tensor<1xi1> loc(#loc93)
} loc(#loc91)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38)
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11)
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19)
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7)
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11)
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:29)
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11)
#loc88 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:30)
#loc89 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:15)
#loc90 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:11)
#loc91 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0)
#loc92 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:31)
#loc93 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:11)
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)
#loc64 = loc(unknown)
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)
#loc87 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)
module {
// Public kernel entry point @triton__0d1d23de.
//
// Arguments (as declared in the signature):
//   %arg0: !tt.ptr<f64, 1>, tt.divisibility = 16 -- base pointer for the two masked loads.
//   %arg1: !tt.ptr<i64, 1>, tt.divisibility = 16 -- base pointer for the final store.
//   %arg2: i32 -- not referenced by any op in this body.
//   %arg3: i32, tt.divisibility = 16 -- not referenced by any op in this body.
//
// Summary of the visible ops: build a 1x32 index row, perform two masked f64
// loads from %arg0, compute (sum of selected values) / (count of selected
// lanes), fill lanes outside the 1x32 bounds mask with -inf, reduce
// (value, index) pairs along axis 1 through @max_with_index..., and store the
// resulting index (sign-extended to i64) at %arg1 + 0.
// NOTE(review): this looks like a small averaging window followed by an
// argmax -- inferred from the op sequence only; confirm against the
// generating Python kernel.
//
// Several scalar arith.constant results (e.g. %c1_i32, %c32_i32) have no uses
// in this function; the dense tensor constants are the ones consumed.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
%c32_i32 = arith.constant 32 : i32 loc(#loc2)
// 1x1 index derived from the program id: %5 = program_id(x) * 1 + [0];
// %6 = (%5 < 1). %6 has no uses later in this function.
%0 = tt.get_program_id x : i32 loc(#loc3)
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4)
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4)
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5)
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6)
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7)
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7)
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8)
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8)
// 1x32 index row %8 = [0, 32) and its bounds mask %9 = (%8 < 32).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9)
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10)
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11)
// Split the index: %10 = %8 rem 2, %11 = %8 div 2 (both signed).
%c2_i32 = arith.constant 2 : i32 loc(#loc12)
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12)
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12)
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13)
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13)
// %12 compares the constant 0 against the constant 1 (1x1 i64 tensors).
%c0_i64 = arith.constant 0 : i64 loc(#loc14)
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14)
%c1_i64 = arith.constant 1 : i64 loc(#loc15)
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15)
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16)
// First lane mask: %14 = (%10 * 3) / 2, %17 = (%10 * 3) / 2 + 2,
// %20 = broadcast(%12) & (%14 < %17).
%c3_i32 = arith.constant 3 : i32 loc(#loc17)
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17)
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17)
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18)
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18)
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18)
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19)
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19)
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19)
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20)
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20)
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20)
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21)
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21)
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21)
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22)
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23)
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23)
// First load: element offset %24 = %11 * 3 + (%10 * 3) / 2 from %arg0,
// masked by %9 & %20 with 0.0 as the masked-off value; %30 then forces 0.0
// on lanes where %20 is false.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24)
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24)
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24)
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25)
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25)
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25)
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26)
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26)
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26)
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27)
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28)
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28)
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29)
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30)
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30)
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30)
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30)
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31)
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31)
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
// Second lane mask: %33 = (%10 * 3) / 2 + 1, %36 = broadcast(%12) & (%33 < %17).
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33)
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33)
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33)
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34)
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34)
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34)
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35)
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35)
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35)
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36)
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37)
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37)
// Second load: element offset %41 = %11 * 3 + 1 + (%10 * 3) / 2 from %arg0,
// masked by %9 & %36 with 0.0 as the masked-off value; %47 then forces 0.0
// on lanes where %36 is false.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38)
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38)
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38)
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39)
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39)
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39)
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40)
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40)
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40)
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41)
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41)
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41)
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42)
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43)
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43)
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44)
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45)
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45)
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45)
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45)
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46)
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46)
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47)
// %48 = sum of the two selected values; %51 = 1.0 per true mask (%20, %36)
// summed, i.e. the number of contributing lanes; %52 = %48 / %51.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48)
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49)
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49)
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50)
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50)
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51)
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51)
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51)
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52)
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52)
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52)
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53)
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54)
// 0xFF800000 is the f32 bit pattern of -infinity; lanes where the bounds
// mask %9 is false are replaced by -inf before the reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55)
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55)
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55)
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55)
// Reduce (value, index) pairs through the helper; only the index result
// %55#1 is consumed below.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56)
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57)
// Store the index, sign-extended to i64, at %arg1 + 0.
%c0_i32 = arith.constant 0 : i32 loc(#loc58)
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58)
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
// Reduces a (value, index) tensor pair along axis 1.
//   %arg0: tensor<1x32xf64> -- values.
//   %arg1: tensor<1x32xi32> -- indices paired with the values.
// Returns (tensor<1xf64>, tensor<1xi32>): the reduced value and its index.
// The combine region receives two (f64, i32) pairs and delegates the choice
// between them to @maximum_with_index__fp64_i32_fp64_i32__.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} {
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)):
// (%arg2, %arg3) and (%arg4, %arg5) are the two (value, index) pairs to combine.
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63)
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65)
} loc(#loc62)
// Picks one of two (value, index) pairs.
//   (%arg0, %arg1): first value (f64) and its index (i32).
//   (%arg2, %arg3): second value (f64) and its index (i32).
// Returns the first pair when %6 is true, otherwise the second pair.
// %6 is true when the first value is "greater" (ordered >, or first is NaN
// and second is not) or when the values are "equal" (ordered ==, or both
// NaN) and the first index is smaller than the second.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} {
// Ordered comparisons: both are false if either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67)
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68)
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69)
// When %2 is true, extend the comparisons with NaN handling.
%3:2 = scf.if %2 -> (i1, i1) {
// "x une x" is true only for NaN: %9 = first is NaN, %10 = second is NaN.
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71)
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72)
%true = arith.constant true loc(#loc73)
// %11 = not %10 (xor with true).
%11 = arith.xori %10, %true : i1 loc(#loc73)
// %13 = greater, or first is NaN while second is not.
%12 = arith.andi %9, %11 : i1 loc(#loc74)
%13 = arith.ori %0, %12 : i1 loc(#loc75)
// %15 = equal, or both are NaN.
%14 = arith.andi %9, %10 : i1 loc(#loc76)
%15 = arith.ori %1, %14 : i1 loc(#loc77)
scf.yield %13, %15 : i1, i1 loc(#loc77)
} else {
scf.yield %0, %1 : i1, i1 loc(#loc64)
} loc(#loc70)
// Tie-break on index: prefer the first pair when its index is smaller.
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78)
%5 = arith.andi %3#1, %4 : i1 loc(#loc79)
%6 = arith.ori %3#0, %5 : i1 loc(#loc80)
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81)
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82)
tt.return %7, %8 : f64, i32 loc(#loc83)
} loc(#loc66)
// Returns the constant `true` for its f64 argument.
// The result %0 of the @promote_to_tensor__fp64__ call has no uses here.
// NOTE(review): presumably the constant reflects specialization of a dtype
// check for fp64 -- inferred from the function name; confirm in the source.
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} {
%0 = tt.call @promote_to_tensor__fp64__(%arg0) : (f64) -> tensor<1xf64> loc(#loc85)
%true = arith.constant true loc(#loc86)
tt.return %true : i1 loc(#loc86)
} loc(#loc84)
// Turns a scalar f64 into a tensor<1xf64>.
//   %arg0: f64 scalar.
// Returns splat(%arg0) + uitofp(zeros), where zeros is the tensor<1xi1>
// produced by the @"zeros..." helper call below.
tt.func private @promote_to_tensor__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)) -> tensor<1xf64> attributes {noinline = false} {
%0 = tt.call @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc88)
%1 = tt.splat %arg0 : (f64) -> tensor<1xf64> loc(#loc89)
// Convert the i1 tensor to f64 so it can be added to the splatted value.
%2 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf64> loc(#loc89)
%3 = arith.addf %1, %2 : tensor<1xf64> loc(#loc89)
tt.return %3 : tensor<1xf64> loc(#loc90)
} loc(#loc87)
// Takes no arguments and returns a tensor<1xi1> whose single element is false.
// The scalar constant %false has no uses; only the dense constant is returned.
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} {
%false = arith.constant false loc(#loc92)
%cst = arith.constant dense<false> : tensor<1xi1> loc(#loc92)
tt.return %cst : tensor<1xi1> loc(#loc93)
} loc(#loc91)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38)
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11)
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19)
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7)
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11)
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:29)
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11)
#loc88 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:30)
#loc89 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:15)
#loc90 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:11)
#loc91 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0)
#loc92 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:31)
#loc93 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:11)
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @promote_to_tensor__fp64__) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)
#loc64 = loc(unknown)
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)
#loc87 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)
module {
// Public kernel entry point.
//   %arg0: pointer to f64 data that is read through two masked loads.
//   %arg1: pointer to i64; a single value is stored at offset 0.
//   %arg2, %arg3: i32 arguments that no op in this body references.
// Outline: build a 1x32 index row, load two f64 values per lane, combine them into
// a masked average, reduce with @max_with_index... along axis 1, then store the
// resulting index (sign-extended to i64) through %arg1.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
%c32_i32 = arith.constant 32 : i32 loc(#loc2)
// 1x1 index: program id along x (times 1) plus a [0, 1) range, compared against 1.
// The comparison result %6 is not referenced by any later op in this function.
%0 = tt.get_program_id x : i32 loc(#loc3)
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4)
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4)
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5)
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6)
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7)
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7)
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8)
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8)
// 1x32 index row %8 = [0, 32) and its bounds mask %9 = (%8 < 32).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9)
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10)
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11)
// Split each index into %10 = index mod 2 and %11 = index div 2.
%c2_i32 = arith.constant 2 : i32 loc(#loc12)
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12)
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12)
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13)
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13)
// %12 compares the constants 0 and 1 (0 < 1) as 1x1 i64 tensors.
%c0_i64 = arith.constant 0 : i64 loc(#loc14)
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14)
%c1_i64 = arith.constant 1 : i64 loc(#loc15)
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15)
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16)
// %14 = (%10 * 3) / 2 and %17 = (%10 * 3) / 2 + 2; %18 = (%14 < %17).
%c3_i32 = arith.constant 3 : i32 loc(#loc17)
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17)
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17)
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18)
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18)
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18)
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19)
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19)
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19)
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20)
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20)
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20)
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21)
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21)
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21)
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22)
// %20: validity mask for the first loaded element (%12 broadcast to 1x32, AND %18).
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23)
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23)
// First load: element offset %24 = %11 * 3 + (%10 * 3) / 2 from %arg0.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24)
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24)
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24)
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25)
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25)
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25)
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26)
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26)
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26)
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27)
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28)
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28)
// Load mask %27 = %9 AND %20; the third load operand %28 is 0.0 extended from f32 to f64.
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29)
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30)
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30)
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30)
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30)
// %30: the loaded value where %20 holds, 0.0 elsewhere.
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31)
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31)
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
// %36: validity mask for the second loaded element: ((%10 * 3) / 2 + 1 < %17), AND broadcast %12.
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33)
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33)
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33)
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34)
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34)
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34)
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35)
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35)
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35)
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36)
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37)
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37)
// Second load: element offset %41 = %11 * 3 + 1 + (%10 * 3) / 2 from %arg0.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38)
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38)
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38)
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39)
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39)
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39)
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40)
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40)
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40)
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41)
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41)
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41)
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42)
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43)
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43)
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44)
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45)
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45)
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45)
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45)
// %47: the second loaded value where %36 holds, 0.0 elsewhere.
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46)
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46)
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47)
// %48: sum of the two masked values.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48)
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49)
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49)
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50)
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50)
// %51: count of valid contributions (1.0 per mask that holds); %52 = sum / count.
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51)
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51)
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51)
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52)
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52)
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52)
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53)
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54)
// 0xFF800000 is the f32 bit pattern of -infinity; lanes where %9 is false get -inf before the reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55)
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55)
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55)
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55)
// Reduce (value, index) pairs; only the index result %55#1 is used below.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56)
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57)
// Store the index, sign-extended to i64, at %arg1 + 0.
%c0_i32 = arith.constant 0 : i32 loc(#loc58)
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58)
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
// Reduces a (values, indices) pair of 1x32 tensors along axis 1 and returns the
// reduced value and index as two one-element tensors.
//   %arg0: tensor<1x32xf64> values.
//   %arg1: tensor<1x32xi32> indices paired with the values.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} {
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({
// Combine region: the four block arguments are passed to the callee as
// (value, index, value, index), and the callee's results are returned as the combined pair.
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)):
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63)
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65)
} loc(#loc62)
// Picks one of two (value, index) pairs.
//   (%arg0, %arg1): first value and its index.
//   (%arg2, %arg3): second value and its index.
// Returns the first pair when its value is greater, or when the values count as
// equal and its index is smaller; otherwise returns the second pair.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} {
// Ordered comparisons: both are false if either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67)
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68)
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69)
// %3#0 = "first is greater", %3#1 = "values are equal", both adjusted for NaN when %2 is true.
%3:2 = scf.if %2 -> (i1, i1) {
// `une x, x` is true only when x is NaN.
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71)
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72)
%true = arith.constant true loc(#loc73)
// XOR with true negates %10: %11 = "second value is not NaN".
%11 = arith.xori %10, %true : i1 loc(#loc73)
// A NaN first value counts as greater than a non-NaN second value.
%12 = arith.andi %9, %11 : i1 loc(#loc74)
%13 = arith.ori %0, %12 : i1 loc(#loc75)
// Two NaN values count as equal.
%14 = arith.andi %9, %10 : i1 loc(#loc76)
%15 = arith.ori %1, %14 : i1 loc(#loc77)
scf.yield %13, %15 : i1, i1 loc(#loc77)
} else {
scf.yield %0, %1 : i1, i1 loc(#loc64)
} loc(#loc70)
// Tie-break on equal values: the first pair wins if its index is smaller (signed compare).
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78)
%5 = arith.andi %3#1, %4 : i1 loc(#loc79)
%6 = arith.ori %3#0, %5 : i1 loc(#loc80)
// The same predicate selects both the value and the index.
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81)
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82)
tt.return %7, %8 : f64, i32 loc(#loc83)
} loc(#loc66)
// Specialization for an f64 argument: unconditionally returns the i1 constant `true`.
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} {
// %0 is not referenced by any op below; only the constant is returned.
%0 = tt.call @promote_to_tensor__fp64__(%arg0) : (f64) -> tensor<1xf64> loc(#loc85)
%true = arith.constant true loc(#loc86)
tt.return %true : i1 loc(#loc86)
} loc(#loc84)
// Wraps the scalar f64 %arg0 into a tensor<1xf64> by adding it to a
// one-element tensor obtained from the zeros helper.
tt.func private @promote_to_tensor__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)) -> tensor<1xf64> attributes {noinline = false} {
// One-element i1 tensor from the zeros helper (its body in this module returns dense<false>).
%0 = tt.call @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() : () -> tensor<1xi1> loc(#loc88)
// Broadcast the scalar to tensor<1xf64> and convert the i1 tensor to f64 so the two can be added.
%1 = tt.splat %arg0 : (f64) -> tensor<1xf64> loc(#loc89)
%2 = arith.uitofp %0 : tensor<1xi1> to tensor<1xf64> loc(#loc89)
%3 = arith.addf %1, %2 : tensor<1xf64> loc(#loc89)
tt.return %3 : tensor<1xf64> loc(#loc90)
} loc(#loc87)
// Takes no arguments and returns a one-element i1 tensor whose element is `false`.
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} {
%cst = arith.constant dense<false> : tensor<1xi1> loc(#loc92)
tt.return %cst : tensor<1xi1> loc(#loc93)
} loc(#loc91)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38)
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11)
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19)
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7)
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11)
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:29)
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11)
#loc88 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:30)
#loc89 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:15)
#loc90 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:11)
#loc91 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0)
#loc92 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:31)
#loc93 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:11)
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @promote_to_tensor__fp64__) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)
#loc64 = loc(unknown)
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)
#loc87 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)
module {
// Kernel entry point. Per the ops below: builds a 1x32 index row, performs two
// masked f64 loads from %arg0, divides their sum by the number of active
// masks, reduces the row with @max_with_index..., and stores the winning index
// (sign-extended to i64) at offset 0 of %arg1. %arg2 and %arg3 are not
// referenced in the body.
// NOTE(review): the index arithmetic resembles an adaptive-average-pool window
// followed by argmax -- confirm against the generating Python kernel.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
%c32_i32 = arith.constant 32 : i32 loc(#loc2)
// Row offset: program id times 1, plus a length-1 range. %6 is the "< 1" mask
// on that offset; it is not used again in this function.
%0 = tt.get_program_id x : i32 loc(#loc3)
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4)
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4)
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5)
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6)
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7)
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7)
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8)
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8)
// Column indices 0..31 as a 1x32 tensor (%8) and the "< 32" bounds mask (%9).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9)
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10)
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11)
// %10 = index rem 2, %11 = index div 2 (both signed).
%c2_i32 = arith.constant 2 : i32 loc(#loc12)
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12)
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12)
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13)
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13)
// Constant comparison 0 < 1 on 1x1 i64 tensors; %12 is broadcast into both
// load masks below.
%c0_i64 = arith.constant 0 : i64 loc(#loc14)
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14)
%c1_i64 = arith.constant 1 : i64 loc(#loc15)
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15)
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16)
// First element mask: %20 = %12 && ((%10*3)/2 < (%10*3)/2 + 2).
%c3_i32 = arith.constant 3 : i32 loc(#loc17)
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17)
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17)
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18)
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18)
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18)
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19)
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19)
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19)
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20)
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20)
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20)
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21)
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21)
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21)
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22)
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23)
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23)
// First load: address %arg0 + (%11*3 + (%10*3)/2), masked by %9 && %20, with
// 0.0 (an f32 constant extended to f64) as the third tt.load operand.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24)
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24)
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24)
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25)
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25)
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25)
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26)
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26)
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26)
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27)
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28)
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28)
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29)
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30)
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30)
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30)
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30)
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31)
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31)
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
// Second element mask: %36 = %12 && ((%10*3)/2 + 1 < %17).
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33)
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33)
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33)
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34)
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34)
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34)
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35)
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35)
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35)
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36)
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37)
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37)
// Second load: address %arg0 + (%11*3 + 1 + (%10*3)/2), masked by %9 && %36.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38)
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38)
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38)
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39)
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39)
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39)
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40)
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40)
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40)
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41)
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41)
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41)
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42)
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43)
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43)
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44)
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45)
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45)
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45)
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45)
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46)
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46)
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47)
// Sum of the two mask-zeroed loads.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48)
// Divisor %51: 1.0 for each active mask (%20, %36), summed; %52 = sum / count.
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49)
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49)
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50)
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50)
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51)
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51)
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51)
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52)
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52)
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52)
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53)
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54)
// 0xFF800000 is the f32 bit pattern for -infinity; lanes where %9 is false are
// replaced with it before the reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55)
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55)
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55)
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55)
// Reduce (values %54, indices %8); only the index result %55#1 is used below.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56)
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57)
// Store the index, sign-extended to i64, at %arg1 + 0.
%c0_i32 = arith.constant 0 : i32 loc(#loc58)
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58)
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
// Reduces a 1x32 f64 value tensor (%arg0) together with its paired 1x32 i32
// index tensor (%arg1) along axis 1, returning a 1-element value tensor and a
// 1-element index tensor.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} {
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({
// Combiner region: forwards two (value, index) pairs to
// @maximum_with_index... and yields the pair it returns.
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)):
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63)
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65)
} loc(#loc62)
// Pairwise combiner over (value, index) pairs: (%arg0, %arg1) vs (%arg2, %arg3).
// Returns the first pair when %6 is true, otherwise the second pair, where
// %6 = greater || (equal && %arg1 < %arg3) -- i.e. ties go to the smaller index.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} {
// Ordered comparisons: both are false whenever either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67)
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68)
// @is_floating__fp64__ returns a constant true in this module, but the call
// and the scf.if on its result are still present at this stage.
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69)
%3:2 = scf.if %2 -> (i1, i1) {
// "une x, x" is true only when x is NaN.
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71)
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72)
%true = arith.constant true loc(#loc73)
// %11 = !(%arg2 is NaN), computed as xor with true.
%11 = arith.xori %10, %true : i1 loc(#loc73)
// greater := (%arg0 > %arg2) || (%arg0 is NaN && %arg2 is not NaN)
%12 = arith.andi %9, %11 : i1 loc(#loc74)
%13 = arith.ori %0, %12 : i1 loc(#loc75)
// equal := (%arg0 == %arg2) || (both are NaN)
%14 = arith.andi %9, %10 : i1 loc(#loc76)
%15 = arith.ori %1, %14 : i1 loc(#loc77)
scf.yield %13, %15 : i1, i1 loc(#loc77)
} else {
// Else branch: yield the plain ordered comparisons unchanged.
scf.yield %0, %1 : i1, i1 loc(#loc64)
} loc(#loc70)
// Tie-break on index, then select the winning value and index with the same
// predicate %6.
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78)
%5 = arith.andi %3#1, %4 : i1 loc(#loc79)
%6 = arith.ori %3#0, %5 : i1 loc(#loc80)
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81)
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82)
tt.return %7, %8 : f64, i32 loc(#loc83)
} loc(#loc66)
// Returns the constant true. The call to @promote_to_tensor__fp64__ is still
// present at this stage, but its result %0 is never used.
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} {
%0 = tt.call @promote_to_tensor__fp64__(%arg0) : (f64) -> tensor<1xf64> loc(#loc85)
%true = arith.constant true loc(#loc86)
tt.return %true : i1 loc(#loc86)
} loc(#loc84)
// Splats the scalar %arg0 into a tensor<1xf64> and adds to it a 1-element i1
// "false" constant converted to f64 with uitofp (i.e. adds 0.0).
// This is the function the pass named in the dump header is about to run on,
// so the i1 constant and the uitofp are not yet folded here.
tt.func private @promote_to_tensor__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)) -> tensor<1xf64> attributes {noinline = false} {
%cst = arith.constant dense<false> : tensor<1xi1> loc(#loc93)
%0 = tt.splat %arg0 : (f64) -> tensor<1xf64> loc(#loc90)
%1 = arith.uitofp %cst : tensor<1xi1> to tensor<1xf64> loc(#loc90)
%2 = arith.addf %0, %1 : tensor<1xf64> loc(#loc90)
tt.return %2 : tensor<1xf64> loc(#loc91)
} loc(#loc87)
// Body-less private declaration: takes no operands and returns tensor<1xi1>.
// No tt.call to this symbol appears in this module.
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc92)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38)
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11)
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19)
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7)
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11)
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:29)
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11)
#loc88 = loc("/home/dberard/local/triton/python/triton/language/standard.py":93:31)
#loc89 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:30)
#loc90 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:15)
#loc91 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:11)
#loc92 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0)
#loc93 = loc(callsite(#loc88 at #loc89))
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @is_floating__fp64__) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)
#loc64 = loc(unknown)
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)
#loc87 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)
module {
// Kernel entry point. Per the ops below: builds a 1x32 index row, performs two
// masked f64 loads from %arg0, divides their sum by the number of active
// masks, reduces the row with @max_with_index..., and stores the winning index
// (sign-extended to i64) at offset 0 of %arg1. %arg2 and %arg3 are not
// referenced in the body.
// NOTE(review): the index arithmetic resembles an adaptive-average-pool window
// followed by argmax -- confirm against the generating Python kernel.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
%c32_i32 = arith.constant 32 : i32 loc(#loc2)
// Row offset: program id times 1, plus a length-1 range. %6 is the "< 1" mask
// on that offset; it is not used again in this function.
%0 = tt.get_program_id x : i32 loc(#loc3)
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4)
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4)
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5)
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6)
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7)
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7)
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8)
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8)
// Column indices 0..31 as a 1x32 tensor (%8) and the "< 32" bounds mask (%9).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9)
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10)
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11)
// %10 = index rem 2, %11 = index div 2 (both signed).
%c2_i32 = arith.constant 2 : i32 loc(#loc12)
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12)
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12)
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13)
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13)
// Constant comparison 0 < 1 on 1x1 i64 tensors; %12 is broadcast into both
// load masks below.
%c0_i64 = arith.constant 0 : i64 loc(#loc14)
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14)
%c1_i64 = arith.constant 1 : i64 loc(#loc15)
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15)
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16)
// First element mask: %20 = %12 && ((%10*3)/2 < (%10*3)/2 + 2).
%c3_i32 = arith.constant 3 : i32 loc(#loc17)
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17)
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17)
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18)
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18)
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18)
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19)
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19)
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19)
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20)
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20)
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20)
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21)
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21)
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21)
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22)
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23)
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23)
// First load: address %arg0 + (%11*3 + (%10*3)/2), masked by %9 && %20, with
// 0.0 (an f32 constant extended to f64) as the third tt.load operand.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24)
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24)
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24)
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25)
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25)
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25)
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26)
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26)
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26)
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27)
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28)
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28)
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29)
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30)
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30)
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30)
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30)
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31)
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31)
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
// Second element mask: %36 = %12 && ((%10*3)/2 + 1 < %17).
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33)
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33)
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33)
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34)
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34)
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34)
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35)
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35)
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35)
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36)
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37)
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37)
// Second load: address %arg0 + (%11*3 + 1 + (%10*3)/2), masked by %9 && %36.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38)
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38)
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38)
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39)
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39)
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39)
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40)
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40)
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40)
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41)
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41)
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41)
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42)
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43)
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43)
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44)
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45)
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45)
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45)
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45)
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46)
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46)
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47)
// Sum of the two mask-zeroed loads.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48)
// Divisor %51: 1.0 for each active mask (%20, %36), summed; %52 = sum / count.
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49)
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49)
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50)
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50)
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51)
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51)
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51)
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52)
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52)
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52)
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53)
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54)
// 0xFF800000 is the f32 bit pattern for -infinity; lanes where %9 is false are
// replaced with it before the reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55)
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55)
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55)
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55)
// Reduce (values %54, indices %8); only the index result %55#1 is used below.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56)
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57)
// Store the index, sign-extended to i64, at %arg1 + 0.
%c0_i32 = arith.constant 0 : i32 loc(#loc58)
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58)
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
// Reduces a 1x32 f64 value tensor (%arg0) together with its paired 1x32 i32
// index tensor (%arg1) along axis 1, returning a 1-element value tensor and a
// 1-element index tensor.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} {
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({
// Combiner region: forwards two (value, index) pairs to
// @maximum_with_index... and yields the pair it returns.
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)):
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63)
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65)
} loc(#loc62)
// Pairwise combiner over (value, index) pairs: (%arg0, %arg1) vs (%arg2, %arg3).
// Returns the first pair when %6 is true, otherwise the second pair, where
// %6 = greater || (equal && %arg1 < %arg3) -- i.e. ties go to the smaller index.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} {
// Ordered comparisons: both are false whenever either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67)
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68)
// @is_floating__fp64__ returns a constant true in this module, but the call
// and the scf.if on its result are still present at this stage.
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69)
%3:2 = scf.if %2 -> (i1, i1) {
// "une x, x" is true only when x is NaN.
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71)
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72)
%true = arith.constant true loc(#loc73)
// %11 = !(%arg2 is NaN), computed as xor with true.
%11 = arith.xori %10, %true : i1 loc(#loc73)
// greater := (%arg0 > %arg2) || (%arg0 is NaN && %arg2 is not NaN)
%12 = arith.andi %9, %11 : i1 loc(#loc74)
%13 = arith.ori %0, %12 : i1 loc(#loc75)
// equal := (%arg0 == %arg2) || (both are NaN)
%14 = arith.andi %9, %10 : i1 loc(#loc76)
%15 = arith.ori %1, %14 : i1 loc(#loc77)
scf.yield %13, %15 : i1, i1 loc(#loc77)
} else {
// Else branch: yield the plain ordered comparisons unchanged.
scf.yield %0, %1 : i1, i1 loc(#loc64)
} loc(#loc70)
// Tie-break on index, then select the winning value and index with the same
// predicate %6.
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78)
%5 = arith.andi %3#1, %4 : i1 loc(#loc79)
%6 = arith.ori %3#0, %5 : i1 loc(#loc80)
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81)
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82)
tt.return %7, %8 : f64, i32 loc(#loc83)
} loc(#loc66)
// Returns the constant true. The call to @promote_to_tensor__fp64__ is still
// present at this stage, but its result %0 is never used.
// This is the function the pass named in the dump header is about to run on.
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} {
%0 = tt.call @promote_to_tensor__fp64__(%arg0) : (f64) -> tensor<1xf64> loc(#loc85)
%true = arith.constant true loc(#loc86)
tt.return %true : i1 loc(#loc86)
} loc(#loc84)
// Splats the scalar %arg0 into a tensor<1xf64> and adds a dense 0.0 constant.
// Compared with the preceding dump in this file, the i1 "false" constant and
// its uitofp conversion no longer appear; %cst is now an f64 0.0 directly.
tt.func private @promote_to_tensor__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)) -> tensor<1xf64> attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<1xf64> loc(#loc88)
%0 = tt.splat %arg0 : (f64) -> tensor<1xf64> loc(#loc88)
%1 = arith.addf %0, %cst : tensor<1xf64> loc(#loc88)
tt.return %1 : tensor<1xf64> loc(#loc89)
} loc(#loc87)
// Body-less private declaration: takes no operands and returns tensor<1xi1>.
// No tt.call to this symbol appears in this module.
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc90)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38)
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11)
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19)
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7)
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11)
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:29)
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11)
#loc88 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:15)
#loc89 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:11)
#loc90 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0)
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @is_floating__fp64__) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)
#loc64 = loc(unknown)
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)
module {
// Public kernel entry point.
//   %arg0: pointer to f64 input data (read via two masked tt.load ops).
//   %arg1: pointer to i64 output (one value written via tt.store).
//   %arg2, %arg3: i32 arguments that are not referenced in this function body.
// Outline: build a 1x32 index row, do two masked loads from %arg0, average them
// by the number of active masks, replace out-of-range lanes with -inf, reduce
// with @max_with_index... along axis 1, and store the resulting index as i64.
// Several scalar constants (e.g. %c1_i32, %c32_i32) have no uses here; this dump
// is taken before canonicalization.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
%c32_i32 = arith.constant 32 : i32 loc(#loc2)
// 1x1 index: program_id(x) * 1 + [0], and its "< 1" mask %6.
// %6 is not referenced again in this function.
%0 = tt.get_program_id x : i32 loc(#loc3)
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4)
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4)
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5)
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6)
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7)
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7)
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8)
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8)
// 1x32 index row %8 = [0, 32) and its "< 32" mask %9.
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9)
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10)
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11)
// Split each index: %10 = index rem 2, %11 = index div 2.
%c2_i32 = arith.constant 2 : i32 loc(#loc12)
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12)
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12)
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13)
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13)
// %12 compares the constants 0 < 1 on a 1x1 i64 tensor.
%c0_i64 = arith.constant 0 : i64 loc(#loc14)
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14)
%c1_i64 = arith.constant 1 : i64 loc(#loc15)
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15)
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16)
// First mask %20: broadcast(%12) AND ((%10*3)/2 < (%10*3)/2 + 2).
// %17 (the "+ 2" bound) is reused by the second mask further down.
%c3_i32 = arith.constant 3 : i32 loc(#loc17)
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17)
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17)
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18)
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18)
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18)
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19)
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19)
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19)
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20)
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20)
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20)
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21)
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21)
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21)
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22)
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23)
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23)
// First load: element offset %24 = %11*3 + (%10*3)/2 from %arg0,
// masked by (%9 AND %20) with 0.0 as the fallback value.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24)
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24)
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24)
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25)
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25)
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25)
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26)
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26)
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26)
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27)
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28)
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28)
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29)
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30)
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30)
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30)
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30)
// %30 = loaded value where %20 holds, else 0.0.
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31)
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31)
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
// Second mask %36: broadcast(%12) AND ((%10*3)/2 + 1 < %17).
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33)
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33)
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33)
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34)
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34)
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34)
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35)
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35)
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35)
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36)
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37)
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37)
// Second load: element offset %41 = (%11*3 + 1) + (%10*3)/2 from %arg0,
// masked by (%9 AND %36) with 0.0 as the fallback value.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38)
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38)
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38)
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39)
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39)
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39)
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40)
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40)
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40)
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41)
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41)
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41)
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42)
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43)
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43)
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44)
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45)
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45)
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45)
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45)
// %47 = loaded value where %36 holds, else 0.0.
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46)
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46)
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47)
// Sum of the two masked values (%48) divided by the number of active masks
// (%51 = (%36 ? 1.0 : 0.0) + (%20 ? 1.0 : 0.0)).
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48)
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49)
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49)
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50)
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50)
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51)
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51)
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51)
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52)
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52)
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52)
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53)
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54)
// 0xFF800000 is the f32 bit pattern of -infinity; lanes where %9 is false
// are filled with it before the reduction.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55)
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55)
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55)
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55)
// Reduce (value, index) pairs; only the index result %55#1 is used below.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56)
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57)
// Store the index, sign-extended to i64, at %arg1 + 0.
%c0_i32 = arith.constant 0 : i32 loc(#loc58)
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58)
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
// Reduces a (values, indices) pair of 1x32 tensors along axis 1 and returns the
// reduced value tensor and index tensor (each tensor<1x...>).
// The combine region delegates to @maximum_with_index__fp64_i32_fp64_i32__,
// passing (%arg2, %arg3) as the first pair and (%arg4, %arg5) as the second.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} {
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)):
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63)
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65)
} loc(#loc62)
// Combines two (value, index) pairs, (%arg0, %arg1) and (%arg2, %arg3), and
// returns one of them as (f64, i32).
// The first pair wins when its value is "greater", or when the values are
// "equal" and its index is smaller (signed compare); otherwise the second pair
// is returned. NaN is folded into "greater"/"equal" inside the scf.if below.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} {
// Ordered comparisons: both yield false if either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67)
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68)
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69)
// Results: %3#0 = "first value is greater", %3#1 = "values are equal".
%3:2 = scf.if %2 -> (i1, i1) {
// Self-comparison with `une` is a NaN test: %9 for %arg0, %10 for %arg2.
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71)
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72)
%true = arith.constant true loc(#loc73)
// %11 = NOT %10 (xor with true).
%11 = arith.xori %10, %true : i1 loc(#loc73)
// A NaN first value counts as greater than a non-NaN second value.
%12 = arith.andi %9, %11 : i1 loc(#loc74)
%13 = arith.ori %0, %12 : i1 loc(#loc75)
// Two NaN values count as equal.
%14 = arith.andi %9, %10 : i1 loc(#loc76)
%15 = arith.ori %1, %14 : i1 loc(#loc77)
scf.yield %13, %15 : i1, i1 loc(#loc77)
} else {
// Non-floating path: forward the plain comparison results.
scf.yield %0, %1 : i1, i1 loc(#loc64)
} loc(#loc70)
// On equality the smaller index decides.
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78)
%5 = arith.andi %3#1, %4 : i1 loc(#loc79)
// %6 selects the first pair when true, the second pair when false.
%6 = arith.ori %3#0, %5 : i1 loc(#loc80)
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81)
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82)
tt.return %7, %8 : f64, i32 loc(#loc83)
} loc(#loc66)
// f64 specialization of is_floating: always returns the constant `true`.
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} {
%true = arith.constant true loc(#loc85)
// The splat/addf sequence below carries a callsite location (#loc90);
// its result %1 is not used by the return.
%cst = arith.constant dense<0.000000e+00> : tensor<1xf64> loc(#loc90)
%0 = tt.splat %arg0 : (f64) -> tensor<1xf64> loc(#loc90)
%1 = arith.addf %0, %cst : tensor<1xf64> loc(#loc90)
tt.return %true : i1 loc(#loc85)
} loc(#loc84)
// Declaration only (no body at this point in the dump): f64 -> tensor<1xf64>.
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc88)
// Declaration only (no body in this dump): takes no arguments and returns tensor<1xi1>.
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc89)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38)
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11)
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19)
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7)
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11)
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11)
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":8:15)
#loc87 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:29)
#loc88 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)
#loc89 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0)
#loc90 = loc(callsite(#loc86 at #loc87))
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @maximum_with_index__fp64_i32_fp64_i32__) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)
#loc64 = loc(unknown)
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)
module {
// Public kernel entry point.
//   %arg0 : pointer to f64, read through two masked tt.load ops.
//   %arg1 : pointer to i64, receives the single stored result.
//   %arg2, %arg3 : i32 scalars; neither is referenced in this body.
// For each of 32 lanes the body loads two f64 values, combines them into
// (sum / number of enabled loads), then reduces across the lanes with
// @max_with_index... and stores the winning lane index, sign-extended to i64.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
%c32_i32 = arith.constant 32 : i32 loc(#loc2)
// Row index: program_id(x) * 1 + [0], shaped 1x1. The bound check %6 is
// computed but has no later use in this function.
%0 = tt.get_program_id x : i32 loc(#loc3)
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4)
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4)
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5)
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6)
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7)
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7)
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8)
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8)
// Lane index %8 = [0, 32) shaped 1x32, with in-range mask %9 = (%8 < 32).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9)
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10)
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11)
// Split the lane index: %10 = lane % 2, %11 = lane / 2.
%c2_i32 = arith.constant 2 : i32 loc(#loc12)
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12)
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12)
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13)
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13)
// %12 compares the constants 0 and 1 (0 < 1), producing a 1x1 i1 tensor.
%c0_i64 = arith.constant 0 : i64 loc(#loc14)
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14)
%c1_i64 = arith.constant 1 : i64 loc(#loc15)
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15)
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16)
// First-load predicate: %14 = (%10 * 3) / 2, %17 = (%10 * 3) / 2 + 2,
// %20 = broadcast(%12) & (%14 < %17).
%c3_i32 = arith.constant 3 : i32 loc(#loc17)
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17)
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17)
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18)
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18)
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18)
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19)
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19)
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19)
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20)
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20)
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20)
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21)
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21)
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21)
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22)
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23)
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23)
// First load: element offset %24 = %11 * 3 + (%10 * 3) / 2 from %arg0,
// masked by %9 & %20 with 0.0 supplied as the "other" operand; %30 then
// selects 0.0 wherever %20 is false.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24)
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24)
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24)
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25)
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25)
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25)
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26)
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26)
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26)
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27)
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28)
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28)
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29)
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30)
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30)
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30)
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30)
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31)
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31)
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
// Second-load predicate: %33 = (%10 * 3) / 2 + 1,
// %36 = broadcast(%12) & (%33 < %17).
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33)
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33)
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33)
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34)
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34)
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34)
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35)
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35)
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35)
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36)
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37)
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37)
// Second load: element offset %41 = %11 * 3 + 1 + (%10 * 3) / 2 from %arg0,
// masked by %9 & %36; %47 selects 0.0 wherever %36 is false.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38)
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38)
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38)
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39)
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39)
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39)
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40)
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40)
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40)
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41)
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41)
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41)
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42)
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43)
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43)
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44)
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45)
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45)
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45)
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45)
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46)
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46)
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47)
// %48 = sum of the two selected values. %51 = count of enabled predicates
// (each contributes 1.0 or 0.0). %52 = %48 / %51.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48)
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49)
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49)
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50)
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50)
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51)
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51)
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51)
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52)
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52)
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52)
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53)
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54)
// 0xFF800000 is the f32 bit pattern of -infinity; lanes where %9 is false
// get that value (extended to f64) as their reduction input.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55)
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55)
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55)
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55)
// Reduce (value, lane index) pairs; only the index result %55#1 is used.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56)
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57)
// Store the index, sign-extended to i64, at %arg1 + 0. The store carries
// no mask operand.
%c0_i32 = arith.constant 0 : i32 loc(#loc58)
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58)
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
// Reduces a 1x32 f64 value tensor (%arg0) together with a 1x32 i32 index
// tensor (%arg1) along axis 1, returning one (value, index) pair as
// tensor<1xf64> and tensor<1xi32>.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} {
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({
// Combine region: (%arg2, %arg3) and (%arg4, %arg5) are two (value, index)
// pairs; the pairwise winner is chosen by @maximum_with_index__fp64_i32_fp64_i32__.
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)):
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63)
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65)
} loc(#loc62)
// Pairwise combiner over (value, index) pairs: a = (%arg0, %arg1) and
// b = (%arg2, %arg3). Returns a when a's value is greater, or when the
// values compare equal and a's index is smaller; otherwise returns b.
// On the floating-point path a NaN value counts as greater than any non-NaN
// value, and two NaNs count as equal.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} {
// Ordered comparisons: both are false if either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc67)
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc68)
%2 = tt.call @is_floating__fp64__(%arg0) : (f64) -> i1 loc(#loc69)
// %3#0 = "a is greater" flag, %3#1 = "a equals b" flag.
%3:2 = scf.if %2 -> (i1, i1) {
// `x une x` is true only when x is NaN.
%9 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc71)
%10 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc72)
%true = arith.constant true loc(#loc73)
// %11 = b is not NaN; %12 = a is NaN and b is not.
%11 = arith.xori %10, %true : i1 loc(#loc73)
%12 = arith.andi %9, %11 : i1 loc(#loc74)
%13 = arith.ori %0, %12 : i1 loc(#loc75)
// %14 = both are NaN, which is folded into the equality flag.
%14 = arith.andi %9, %10 : i1 loc(#loc76)
%15 = arith.ori %1, %14 : i1 loc(#loc77)
scf.yield %13, %15 : i1, i1 loc(#loc77)
} else {
scf.yield %0, %1 : i1, i1 loc(#loc64)
} loc(#loc70)
// Tie-break on index: when the values are equal, prefer the smaller index.
%4 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc78)
%5 = arith.andi %3#1, %4 : i1 loc(#loc79)
%6 = arith.ori %3#0, %5 : i1 loc(#loc80)
// The same predicate selects value and index, so they stay paired.
%7 = arith.select %6, %arg0, %arg2 : f64 loc(#loc81)
%8 = arith.select %6, %arg1, %arg3 : i32 loc(#loc82)
tt.return %7, %8 : f64, i32 loc(#loc83)
} loc(#loc66)
// f64 specialization: ignores %arg0 and returns the constant `true`.
tt.func private @is_floating__fp64__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)) -> i1 attributes {noinline = false} {
%true = arith.constant true loc(#loc85)
tt.return %true : i1 loc(#loc85)
} loc(#loc84)
// Body-less private declarations (signature only). No tt.call to either
// symbol appears in this module dump.
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc86)
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc87)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38)
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11)
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19)
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7)
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11)
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11)
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)
#loc87 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0)
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @maximum_with_index__fp64_i32_fp64_i32__) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)
#loc64 = loc(unknown)
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)
module {
// Public kernel entry point.
//   %arg0 : pointer to f64, read through two masked tt.load ops.
//   %arg1 : pointer to i64, receives the single stored result.
//   %arg2, %arg3 : i32 scalars; neither is referenced in this body.
// For each of 32 lanes the body loads two f64 values, combines them into
// (sum / number of enabled loads), then reduces across the lanes with
// @max_with_index... and stores the winning lane index, sign-extended to i64.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
%c32_i32 = arith.constant 32 : i32 loc(#loc2)
// Row index: program_id(x) * 1 + [0], shaped 1x1. The bound check %6 is
// computed but has no later use in this function.
%0 = tt.get_program_id x : i32 loc(#loc3)
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4)
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4)
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5)
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6)
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7)
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7)
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8)
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8)
// Lane index %8 = [0, 32) shaped 1x32, with in-range mask %9 = (%8 < 32).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9)
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10)
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11)
// Split the lane index: %10 = lane % 2, %11 = lane / 2.
%c2_i32 = arith.constant 2 : i32 loc(#loc12)
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12)
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12)
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13)
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13)
// %12 compares the constants 0 and 1 (0 < 1), producing a 1x1 i1 tensor.
%c0_i64 = arith.constant 0 : i64 loc(#loc14)
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14)
%c1_i64 = arith.constant 1 : i64 loc(#loc15)
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15)
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16)
// First-load predicate: %14 = (%10 * 3) / 2, %17 = (%10 * 3) / 2 + 2,
// %20 = broadcast(%12) & (%14 < %17).
%c3_i32 = arith.constant 3 : i32 loc(#loc17)
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17)
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17)
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18)
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18)
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18)
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19)
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19)
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19)
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20)
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20)
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20)
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21)
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21)
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21)
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22)
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23)
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23)
// First load: element offset %24 = %11 * 3 + (%10 * 3) / 2 from %arg0,
// masked by %9 & %20 with 0.0 supplied as the "other" operand; %30 then
// selects 0.0 wherever %20 is false.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24)
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24)
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24)
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25)
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25)
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25)
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26)
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26)
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26)
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27)
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28)
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28)
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29)
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30)
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30)
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30)
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30)
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31)
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31)
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
// Second-load predicate: %33 = (%10 * 3) / 2 + 1,
// %36 = broadcast(%12) & (%33 < %17).
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33)
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33)
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33)
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34)
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34)
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34)
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35)
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35)
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35)
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36)
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37)
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37)
// Second load: element offset %41 = %11 * 3 + 1 + (%10 * 3) / 2 from %arg0,
// masked by %9 & %36; %47 selects 0.0 wherever %36 is false.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38)
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38)
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38)
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39)
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39)
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39)
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40)
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40)
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40)
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41)
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41)
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41)
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42)
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43)
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43)
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44)
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45)
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45)
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45)
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45)
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46)
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46)
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47)
// %48 = sum of the two selected values. %51 = count of enabled predicates
// (each contributes 1.0 or 0.0). %52 = %48 / %51.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48)
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49)
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49)
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50)
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50)
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51)
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51)
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51)
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52)
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52)
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52)
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53)
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54)
// 0xFF800000 is the f32 bit pattern of -infinity; lanes where %9 is false
// get that value (extended to f64) as their reduction input.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55)
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55)
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55)
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55)
// Reduce (value, lane index) pairs; only the index result %55#1 is used.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56)
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57)
// Store the index, sign-extended to i64, at %arg1 + 0. The store carries
// no mask operand.
%c0_i32 = arith.constant 0 : i32 loc(#loc58)
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58)
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
// Reduces a 1x32 f64 value tensor (%arg0) together with a 1x32 i32 index
// tensor (%arg1) along axis 1, returning one (value, index) pair as
// tensor<1xf64> and tensor<1xi32>.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} {
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({
// Combine region: (%arg2, %arg3) and (%arg4, %arg5) are two (value, index)
// pairs; the pairwise winner is chosen by @maximum_with_index__fp64_i32_fp64_i32__.
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)):
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63)
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65)
} loc(#loc62)
// Pairwise combiner over (value, index) pairs: a = (%arg0, %arg1) and
// b = (%arg2, %arg3). Returns a when a's value is greater, or when the
// values compare equal and a's index is smaller; otherwise returns b.
// A NaN value counts as greater than any non-NaN value, and two NaNs count
// as equal.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} {
// %true is the xori operand used to negate %9 inside the scf.if below.
%true = arith.constant true loc(#loc67)
// Ordered comparisons: both are false if either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc68)
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc69)
// The scf.if condition is this constant true, so only the then-region can
// execute; the else-region is dead at this point in the pipeline.
%true_0 = arith.constant true loc(#loc88)
// %2#0 = "a is greater" flag, %2#1 = "a equals b" flag.
%2:2 = scf.if %true_0 -> (i1, i1) {
// `x une x` is true only when x is NaN.
%8 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc73)
%9 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc74)
// %10 = b is not NaN; %11 = a is NaN and b is not.
%10 = arith.xori %9, %true : i1 loc(#loc67)
%11 = arith.andi %8, %10 : i1 loc(#loc75)
%12 = arith.ori %0, %11 : i1 loc(#loc76)
// %13 = both are NaN, which is folded into the equality flag.
%13 = arith.andi %8, %9 : i1 loc(#loc77)
%14 = arith.ori %1, %13 : i1 loc(#loc78)
scf.yield %12, %14 : i1, i1 loc(#loc78)
} else {
scf.yield %0, %1 : i1, i1 loc(#loc64)
} loc(#loc72)
// Tie-break on index: when the values are equal, prefer the smaller index.
%3 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc79)
%4 = arith.andi %2#1, %3 : i1 loc(#loc80)
%5 = arith.ori %2#0, %4 : i1 loc(#loc81)
// The same predicate selects value and index, so they stay paired.
%6 = arith.select %5, %arg0, %arg2 : f64 loc(#loc82)
%7 = arith.select %5, %arg1, %arg3 : i32 loc(#loc83)
tt.return %6, %7 : f64, i32 loc(#loc84)
} loc(#loc66)
// Body-less private declarations (signature only). No tt.call to any of
// these three symbols appears in this module dump.
tt.func private @is_floating__fp64__(f64) -> i1 attributes {noinline = false} loc(#loc85)
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc86)
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc87)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38)
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11)
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":13:11)
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:19)
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":72:7)
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc84 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11)
#loc85 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)
#loc86 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)
#loc87 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0)
#loc88 = loc(callsite(#loc70 at #loc71))
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)
#loc64 = loc(unknown)
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)
module {
// Public kernel entry point.
//   %arg0 : pointer to f64 input data (read through two masked loads)
//   %arg1 : pointer to i64 output (a single 1x1 element is stored at offset 0)
//   %arg2, %arg3 : i32 scalars; not referenced in the body of this function
// For each of 32 lanes it loads up to two f64 inputs, averages the ones whose
// mask is set, then reduces the 32 averages with @max_with_index and stores the
// winning lane index as i64.
// This dump is taken before canonicalization, so many constants and the
// `lane % 2 * 3` products are still materialized repeatedly.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
%c32_i32 = arith.constant 32 : i32 loc(#loc2)
// Program id along x, scaled by 1, plus a size-1 range: a 1x1 index tensor (%5).
%0 = tt.get_program_id x : i32 loc(#loc3)
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4)
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4)
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5)
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6)
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7)
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7)
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8)
// %6 = (%5 < 1); this mask is not referenced again in this function.
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8)
// %8 = lane indices 0..31 shaped 1x32; %9 = (lane < 32).
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9)
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10)
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11)
// Split each lane index: %10 = lane % 2, %11 = lane / 2 (signed).
%c2_i32 = arith.constant 2 : i32 loc(#loc12)
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12)
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12)
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13)
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13)
// %12 compares two constant tensors (0 < 1), so its value is fixed.
%c0_i64 = arith.constant 0 : i64 loc(#loc14)
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14)
%c1_i64 = arith.constant 1 : i64 loc(#loc15)
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15)
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16)
// %14 = (lane % 2) * 3 / 2.
%c3_i32 = arith.constant 3 : i32 loc(#loc17)
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17)
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17)
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18)
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18)
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18)
// %17 = (lane % 2) * 3 / 2 + 2; used as the upper bound for both element masks.
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19)
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19)
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19)
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20)
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20)
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20)
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21)
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21)
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21)
// %20 = validity mask of the first element: %12 (broadcast) AND (%14 < %17).
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22)
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23)
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23)
// First element offset: %24 = (lane / 2) * 3 + (lane % 2) * 3 / 2.
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24)
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24)
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24)
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25)
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25)
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25)
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26)
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26)
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26)
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27)
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28)
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28)
// Masked load of the first element (mask = %9 AND %20) with 0.0 as the fill value.
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29)
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30)
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30)
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30)
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30)
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31)
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31)
// %30 = first element where %20 holds, else 0.0.
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
// Second element: %33 = (lane % 2) * 3 / 2 + 1 is compared against the same bound %17.
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33)
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33)
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33)
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34)
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34)
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34)
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35)
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35)
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35)
// %36 = validity mask of the second element.
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36)
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37)
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37)
// Second element offset: %41 = (lane / 2) * 3 + 1 + (lane % 2) * 3 / 2.
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38)
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38)
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38)
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39)
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39)
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39)
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40)
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40)
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40)
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41)
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41)
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41)
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42)
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43)
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43)
// Masked load of the second element (mask = %9 AND %36) with 0.0 as the fill value.
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44)
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45)
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45)
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45)
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45)
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46)
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46)
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47)
// %48 = sum of the two (masked) elements.
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48)
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49)
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49)
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50)
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50)
// %51 = number of valid elements per lane (1.0 for each mask that is set).
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51)
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51)
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51)
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52)
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52)
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52)
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53)
// %52 = sum / count, i.e. the mean of the valid elements.
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54)
// 0xFF800000 is the f32 bit pattern of -infinity; lanes outside %9 get -inf
// so they cannot win the max reduction against any finite value.
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55)
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55)
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55)
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55)
// Reduce (value, lane index) pairs; only the index result %55#1 is used below.
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56)
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57)
%c0_i32 = arith.constant 0 : i32 loc(#loc58)
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58)
// Output pointer is %arg1 + 0; the i32 index is sign-extended to i64 before the store.
// The store carries no mask operand.
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
// Joint reduction of a value tensor and an index tensor along axis 1.
//   %arg0 : tensor<1x32xf64> values
//   %arg1 : tensor<1x32xi32> indices paired element-wise with the values
// Returns (tensor<1xf64>, tensor<1xi32>): the reduced value and its paired index.
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} {
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({
// Combiner region: (%arg2, %arg3) and (%arg4, %arg5) are two (value, index) pairs.
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)):
// The pairwise decision is delegated to @maximum_with_index (still a call in this dump).
%1:2 = tt.call @maximum_with_index__fp64_i32_fp64_i32__(%arg2, %arg3, %arg4, %arg5) : (f64, i32, f64, i32) -> (f64, i32) loc(#loc63)
tt.reduce.return %1#0, %1#1 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc65)
} loc(#loc62)
// Pairwise combiner over (value, index) pairs, in straight-line form (no scf.if).
//   %arg0 / %arg1 : value (f64) and index (i32) of the first candidate
//   %arg2 / %arg3 : value (f64) and index (i32) of the second candidate
// The first candidate wins when its value is greater, when it is NaN and the
// other is not, or when the values tie (equal, or both NaN) and its index is
// smaller. Otherwise the second candidate is returned.
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(%arg0: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg1: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg2: f64 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0), %arg3: i32 loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)) -> (f64, i32) attributes {noinline = false} {
// Constant `true`; used below as the XOR operand that negates an i1.
%true = arith.constant true loc(#loc67)
// Ordered comparisons: both are false whenever either operand is NaN.
%0 = arith.cmpf ogt, %arg0, %arg2 : f64 loc(#loc68)
%1 = arith.cmpf oeq, %arg0, %arg2 : f64 loc(#loc69)
// `une x, x` is true only when x is NaN: %2 = first is NaN, %3 = second is NaN.
%2 = arith.cmpf une, %arg0, %arg0 : f64 loc(#loc70)
%3 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc71)
// %4 = second value is not NaN.
%4 = arith.xori %3, %true : i1 loc(#loc67)
// %6 = greater, or first is NaN while second is not.
%5 = arith.andi %2, %4 : i1 loc(#loc72)
%6 = arith.ori %0, %5 : i1 loc(#loc73)
// %8 = equal, or both NaN.
%7 = arith.andi %2, %3 : i1 loc(#loc74)
%8 = arith.ori %1, %7 : i1 loc(#loc75)
// Tie-break: on an "equal" result, prefer the smaller (signed) index.
%9 = arith.cmpi slt, %arg1, %arg3 : i32 loc(#loc76)
%10 = arith.andi %8, %9 : i1 loc(#loc77)
// %11 = first candidate wins.
%11 = arith.ori %6, %10 : i1 loc(#loc78)
// Value and index are selected with the same predicate so they stay paired.
%12 = arith.select %11, %arg0, %arg2 : f64 loc(#loc79)
%13 = arith.select %11, %arg1, %arg3 : i32 loc(#loc80)
tt.return %12, %13 : f64, i32 loc(#loc81)
} loc(#loc66)
// Body-less private declarations: only the signatures are present in this dump.
// None of the tt.call ops in this module's function bodies reference them.
// f64 -> i1
tt.func private @is_floating__fp64__(f64) -> i1 attributes {noinline = false} loc(#loc82)
// f64 -> tensor<1xf64>
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc83)
// () -> tensor<1xi1>
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc84)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38)
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11)
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:11)
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)
#loc83 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)
#loc84 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0)
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)
#loc64 = loc(unknown)
module {
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
%c32_i32 = arith.constant 32 : i32 loc(#loc2)
%0 = tt.get_program_id x : i32 loc(#loc3)
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4)
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4)
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5)
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6)
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7)
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7)
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8)
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8)
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9)
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10)
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11)
%c2_i32 = arith.constant 2 : i32 loc(#loc12)
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12)
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12)
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13)
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13)
%c0_i64 = arith.constant 0 : i64 loc(#loc14)
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14)
%c1_i64 = arith.constant 1 : i64 loc(#loc15)
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15)
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16)
%c3_i32 = arith.constant 3 : i32 loc(#loc17)
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17)
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17)
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18)
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18)
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18)
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19)
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19)
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19)
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20)
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20)
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20)
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21)
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21)
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21)
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22)
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23)
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23)
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24)
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24)
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24)
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25)
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25)
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25)
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26)
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26)
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26)
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27)
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28)
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28)
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29)
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30)
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30)
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30)
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30)
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31)
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31)
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33)
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33)
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33)
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34)
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34)
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34)
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35)
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35)
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35)
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36)
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37)
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37)
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38)
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38)
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38)
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39)
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39)
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39)
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40)
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40)
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40)
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41)
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41)
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41)
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42)
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43)
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43)
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44)
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45)
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45)
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45)
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45)
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46)
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46)
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47)
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48)
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49)
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49)
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50)
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50)
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51)
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51)
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51)
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52)
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52)
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52)
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53)
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54)
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55)
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55)
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55)
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55)
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56)
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57)
%c0_i32 = arith.constant 0 : i32 loc(#loc58)
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58)
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} {
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)):
%true = arith.constant true loc(#loc84)
%1 = arith.cmpf ogt, %arg2, %arg4 : f64 loc(#loc85)
%2 = arith.cmpf oeq, %arg2, %arg4 : f64 loc(#loc86)
%3 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc87)
%4 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc88)
%5 = arith.xori %4, %true : i1 loc(#loc84)
%6 = arith.andi %3, %5 : i1 loc(#loc89)
%7 = arith.ori %1, %6 : i1 loc(#loc90)
%8 = arith.andi %3, %4 : i1 loc(#loc91)
%9 = arith.ori %2, %8 : i1 loc(#loc92)
%10 = arith.cmpi slt, %arg3, %arg5 : i32 loc(#loc93)
%11 = arith.andi %9, %10 : i1 loc(#loc94)
%12 = arith.ori %7, %11 : i1 loc(#loc95)
%13 = arith.select %12, %arg2, %arg4 : f64 loc(#loc96)
%14 = arith.select %12, %arg3, %arg5 : i32 loc(#loc97)
tt.reduce.return %13, %14 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc79)
} loc(#loc62)
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(f64, i32, f64, i32) -> (f64, i32) attributes {noinline = false} loc(#loc80)
tt.func private @is_floating__fp64__(f64) -> i1 attributes {noinline = false} loc(#loc81)
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc82)
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc83)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38)
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11)
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)
#loc83 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0)
#loc84 = loc(callsite(#loc65 at #loc63))
#loc85 = loc(callsite(#loc66 at #loc63))
#loc86 = loc(callsite(#loc67 at #loc63))
#loc87 = loc(callsite(#loc68 at #loc63))
#loc88 = loc(callsite(#loc69 at #loc63))
#loc89 = loc(callsite(#loc70 at #loc63))
#loc90 = loc(callsite(#loc71 at #loc63))
#loc91 = loc(callsite(#loc72 at #loc63))
#loc92 = loc(callsite(#loc73 at #loc63))
#loc93 = loc(callsite(#loc74 at #loc63))
#loc94 = loc(callsite(#loc75 at #loc63))
#loc95 = loc(callsite(#loc76 at #loc63))
#loc96 = loc(callsite(#loc77 at #loc63))
#loc97 = loc(callsite(#loc78 at #loc63))
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @triton__0d1d23de) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)
#loc65 = loc(unknown)
module {
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%c1_i32 = arith.constant 1 : i32 loc(#loc1)
%c32_i32 = arith.constant 32 : i32 loc(#loc2)
%0 = tt.get_program_id x : i32 loc(#loc3)
%c1_i32_0 = arith.constant 1 : i32 loc(#loc4)
%1 = arith.muli %0, %c1_i32_0 : i32 loc(#loc4)
%2 = tt.make_range {end = 1 : i32, start = 0 : i32} : tensor<1xi32> loc(#loc5)
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc6)
%4 = tt.splat %1 : (i32) -> tensor<1x1xi32> loc(#loc7)
%5 = arith.addi %4, %3 : tensor<1x1xi32> loc(#loc7)
%cst = arith.constant dense<1> : tensor<1x1xi32> loc(#loc8)
%6 = arith.cmpi slt, %5, %cst : tensor<1x1xi32> loc(#loc8)
%7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc9)
%8 = tt.expand_dims %7 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc10)
%cst_1 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
%9 = arith.cmpi slt, %8, %cst_1 : tensor<1x32xi32> loc(#loc11)
%c2_i32 = arith.constant 2 : i32 loc(#loc12)
%cst_2 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc12)
%10 = arith.remsi %8, %cst_2 : tensor<1x32xi32> loc(#loc12)
%c2_i32_3 = arith.constant 2 : i32 loc(#loc13)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc13)
%11 = arith.divsi %8, %cst_4 : tensor<1x32xi32> loc(#loc13)
%c0_i64 = arith.constant 0 : i64 loc(#loc14)
%cst_5 = arith.constant dense<0> : tensor<1x1xi64> loc(#loc14)
%c1_i64 = arith.constant 1 : i64 loc(#loc15)
%cst_6 = arith.constant dense<1> : tensor<1x1xi64> loc(#loc15)
%12 = arith.cmpi slt, %cst_5, %cst_6 : tensor<1x1xi64> loc(#loc16)
%c3_i32 = arith.constant 3 : i32 loc(#loc17)
%cst_7 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc17)
%13 = arith.muli %10, %cst_7 : tensor<1x32xi32> loc(#loc17)
%c2_i32_8 = arith.constant 2 : i32 loc(#loc18)
%cst_9 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc18)
%14 = arith.divsi %13, %cst_9 : tensor<1x32xi32> loc(#loc18)
%c3_i32_10 = arith.constant 3 : i32 loc(#loc19)
%cst_11 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc19)
%15 = arith.muli %10, %cst_11 : tensor<1x32xi32> loc(#loc19)
%c2_i32_12 = arith.constant 2 : i32 loc(#loc20)
%cst_13 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc20)
%16 = arith.divsi %15, %cst_13 : tensor<1x32xi32> loc(#loc20)
%c2_i32_14 = arith.constant 2 : i32 loc(#loc21)
%cst_15 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc21)
%17 = arith.addi %16, %cst_15 : tensor<1x32xi32> loc(#loc21)
%18 = arith.cmpi slt, %14, %17 : tensor<1x32xi32> loc(#loc22)
%19 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc23)
%20 = arith.andi %19, %18 : tensor<1x32xi1> loc(#loc23)
%c3_i32_16 = arith.constant 3 : i32 loc(#loc24)
%cst_17 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc24)
%21 = arith.muli %11, %cst_17 : tensor<1x32xi32> loc(#loc24)
%c3_i32_18 = arith.constant 3 : i32 loc(#loc25)
%cst_19 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc25)
%22 = arith.muli %10, %cst_19 : tensor<1x32xi32> loc(#loc25)
%c2_i32_20 = arith.constant 2 : i32 loc(#loc26)
%cst_21 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc26)
%23 = arith.divsi %22, %cst_21 : tensor<1x32xi32> loc(#loc26)
%24 = arith.addi %21, %23 : tensor<1x32xi32> loc(#loc27)
%25 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc28)
%26 = tt.addptr %25, %24 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc28)
%27 = arith.andi %9, %20 : tensor<1x32xi1> loc(#loc29)
%cst_22 = arith.constant 0.000000e+00 : f32 loc(#loc30)
%cst_23 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc30)
%28 = arith.extf %cst_23 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc30)
%29 = tt.load %26, %27, %28 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc30)
%cst_24 = arith.constant 0.000000e+00 : f64 loc(#loc31)
%cst_25 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc31)
%30 = arith.select %20, %29, %cst_25 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
%c3_i32_26 = arith.constant 3 : i32 loc(#loc33)
%cst_27 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc33)
%31 = arith.muli %10, %cst_27 : tensor<1x32xi32> loc(#loc33)
%c2_i32_28 = arith.constant 2 : i32 loc(#loc34)
%cst_29 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc34)
%32 = arith.divsi %31, %cst_29 : tensor<1x32xi32> loc(#loc34)
%c1_i32_30 = arith.constant 1 : i32 loc(#loc35)
%cst_31 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc35)
%33 = arith.addi %32, %cst_31 : tensor<1x32xi32> loc(#loc35)
%34 = arith.cmpi slt, %33, %17 : tensor<1x32xi32> loc(#loc36)
%35 = tt.broadcast %12 : (tensor<1x1xi1>) -> tensor<1x32xi1> loc(#loc37)
%36 = arith.andi %35, %34 : tensor<1x32xi1> loc(#loc37)
%c3_i32_32 = arith.constant 3 : i32 loc(#loc38)
%cst_33 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc38)
%37 = arith.muli %11, %cst_33 : tensor<1x32xi32> loc(#loc38)
%c1_i32_34 = arith.constant 1 : i32 loc(#loc39)
%cst_35 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc39)
%38 = arith.addi %37, %cst_35 : tensor<1x32xi32> loc(#loc39)
%c3_i32_36 = arith.constant 3 : i32 loc(#loc40)
%cst_37 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc40)
%39 = arith.muli %10, %cst_37 : tensor<1x32xi32> loc(#loc40)
%c2_i32_38 = arith.constant 2 : i32 loc(#loc41)
%cst_39 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc41)
%40 = arith.divsi %39, %cst_39 : tensor<1x32xi32> loc(#loc41)
%41 = arith.addi %38, %40 : tensor<1x32xi32> loc(#loc42)
%42 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc43)
%43 = tt.addptr %42, %41 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc43)
%44 = arith.andi %9, %36 : tensor<1x32xi1> loc(#loc44)
%cst_40 = arith.constant 0.000000e+00 : f32 loc(#loc45)
%cst_41 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc45)
%45 = arith.extf %cst_41 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc45)
%46 = tt.load %43, %44, %45 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc45)
%cst_42 = arith.constant 0.000000e+00 : f64 loc(#loc46)
%cst_43 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc46)
%47 = arith.select %36, %46, %cst_43 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc47)
%48 = arith.addf %47, %30 : tensor<1x32xf64> loc(#loc48)
%cst_44 = arith.constant 1.000000e+00 : f64 loc(#loc49)
%cst_45 = arith.constant dense<1.000000e+00> : tensor<1x1xf64> loc(#loc49)
%cst_46 = arith.constant 0.000000e+00 : f64 loc(#loc50)
%cst_47 = arith.constant dense<0.000000e+00> : tensor<1x1xf64> loc(#loc50)
%cst_48 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc51)
%cst_49 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc51)
%49 = arith.select %20, %cst_48, %cst_49 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc51)
%cst_50 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc52)
%cst_51 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc52)
%50 = arith.select %36, %cst_50, %cst_51 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc52)
%51 = arith.addf %50, %49 : tensor<1x32xf64> loc(#loc53)
%52 = arith.divf %48, %51 : tensor<1x32xf64> loc(#loc54)
%cst_52 = arith.constant 0xFF800000 : f32 loc(#loc55)
%cst_53 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc55)
%53 = arith.extf %cst_53 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc55)
%54 = arith.select %9, %52, %53 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc55)
%55:2 = tt.call @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%54, %8) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc56)
%56 = tt.expand_dims %55#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc57)
%c0_i32 = arith.constant 0 : i32 loc(#loc58)
%cst_54 = arith.constant dense<0> : tensor<1x1xi32> loc(#loc58)
%57 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%58 = tt.addptr %57, %cst_54 : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%59 = arith.extsi %56 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %58, %59 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(%arg0: tensor<1x32xf64> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0), %arg1: tensor<1x32xi32> loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} {
%true = arith.constant true loc(#loc84)
%0:2 = "tt.reduce"(%arg0, %arg1) <{axis = 1 : i32}> ({
^bb0(%arg2: f64 loc(unknown), %arg3: i32 loc(unknown), %arg4: f64 loc(unknown), %arg5: i32 loc(unknown)):
%1 = arith.cmpf ogt, %arg2, %arg4 : f64 loc(#loc85)
%2 = arith.cmpf oeq, %arg2, %arg4 : f64 loc(#loc86)
%3 = arith.cmpf une, %arg2, %arg2 : f64 loc(#loc87)
%4 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc88)
%5 = arith.xori %4, %true : i1 loc(#loc84)
%6 = arith.andi %3, %5 : i1 loc(#loc89)
%7 = arith.ori %1, %6 : i1 loc(#loc90)
%8 = arith.andi %3, %4 : i1 loc(#loc91)
%9 = arith.ori %2, %8 : i1 loc(#loc92)
%10 = arith.cmpi slt, %arg3, %arg5 : i32 loc(#loc93)
%11 = arith.andi %9, %10 : i1 loc(#loc94)
%12 = arith.ori %7, %11 : i1 loc(#loc95)
%13 = arith.select %12, %arg2, %arg4 : f64 loc(#loc96)
%14 = arith.select %12, %arg3, %arg5 : i32 loc(#loc97)
tt.reduce.return %13, %14 : f64, i32 loc(#loc64)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc64)
tt.return %0#0, %0#1 : tensor<1xf64>, tensor<1xi32> loc(#loc79)
} loc(#loc62)
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(f64, i32, f64, i32) -> (f64, i32) attributes {noinline = false} loc(#loc80)
tt.func private @is_floating__fp64__(f64) -> i1 attributes {noinline = false} loc(#loc81)
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc82)
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc83)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":19:13)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":20:13)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:28)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":22:33)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:36)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:44)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":23:23)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":24:21)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":29:30)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":30:30)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":31:18)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":35:18)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":41:19)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc44 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc45 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc46 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":43:38)
#loc47 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc48 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":46:33)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":47:38)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc52 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc53 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc54 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc55 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc56 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc57 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc64 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc66 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc67 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc68 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc69 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc70 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc71 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc72 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc73 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc74 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc75 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc76 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc77 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc78 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc79 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:11)
#loc80 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)
#loc81 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)
#loc82 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)
#loc83 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0)
#loc84 = loc(callsite(#loc63 at #loc64))
#loc85 = loc(callsite(#loc66 at #loc64))
#loc86 = loc(callsite(#loc67 at #loc64))
#loc87 = loc(callsite(#loc68 at #loc64))
#loc88 = loc(callsite(#loc69 at #loc64))
#loc89 = loc(callsite(#loc70 at #loc64))
#loc90 = loc(callsite(#loc71 at #loc64))
#loc91 = loc(callsite(#loc72 at #loc64))
#loc92 = loc(callsite(#loc73 at #loc64))
#loc93 = loc(callsite(#loc74 at #loc64))
#loc94 = loc(callsite(#loc75 at #loc64))
#loc95 = loc(callsite(#loc76 at #loc64))
#loc96 = loc(callsite(#loc77 at #loc64))
#loc97 = loc(callsite(#loc78 at #loc64))
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @triton__0d1d23de) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc44 = loc(unknown)
module {
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc1)
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc2)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc3)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc4)
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc5)
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc6)
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc7)
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc8)
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc9)
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc10)
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc11)
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc9)
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc8)
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc12)
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc7)
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc13)
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc14)
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc15)
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc16)
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc17)
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc18)
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc19)
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc20)
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc21)
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc22)
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc22)
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc23)
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc6)
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc6)
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc24)
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc25)
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc26)
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc4)
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc27)
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc28)
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc29)
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc30)
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc31)
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc32)
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc33)
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc33)
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc34)
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc35)
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc35)
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc36)
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc37)
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc3)
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc38)
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc39)
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc40)
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc2)
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc2)
%true = arith.constant true loc(#loc82)
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc83)
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc84)
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc85)
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc86)
%52 = arith.xori %51, %true : i1 loc(#loc82)
%53 = arith.andi %50, %52 : i1 loc(#loc87)
%54 = arith.ori %48, %53 : i1 loc(#loc88)
%55 = arith.andi %50, %51 : i1 loc(#loc89)
%56 = arith.ori %49, %55 : i1 loc(#loc90)
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc91)
%58 = arith.andi %56, %57 : i1 loc(#loc92)
%59 = arith.ori %54, %58 : i1 loc(#loc93)
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc94)
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc95)
tt.reduce.return %60, %61 : f64, i32 loc(#loc68)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc68)
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58)
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
tt.func private @max_with_index__fp64S1_32S_i32S1_32S__2cconstexpr_1_(tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) attributes {noinline = false} loc(#loc62)
tt.func private @maximum_with_index__fp64_i32_fp64_i32__(f64, i32, f64, i32) -> (f64, i32) attributes {noinline = false} loc(#loc63)
tt.func private @is_floating__fp64__(f64) -> i1 attributes {noinline = false} loc(#loc64)
tt.func private @promote_to_tensor__fp64__(f64) -> tensor<1xf64> attributes {noinline = false} loc(#loc65)
tt.func private @"zeros____0cconstexpr_(constexpr_1_,)__1cconstexpr_int1_"() -> tensor<1xi1> attributes {noinline = false} loc(#loc66)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc62 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":90:0)
#loc63 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":69:0)
#loc64 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":12:0)
#loc65 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":6:0)
#loc66 = loc("/home/dberard/local/triton/python/triton/language/standard.py":84:0)
#loc67 = loc(callsite(#loc41 at #loc42))
#loc68 = loc(callsite(#loc42 at #loc43))
#loc69 = loc(callsite(#loc45 at #loc42))
#loc70 = loc(callsite(#loc46 at #loc42))
#loc71 = loc(callsite(#loc47 at #loc42))
#loc72 = loc(callsite(#loc48 at #loc42))
#loc73 = loc(callsite(#loc49 at #loc42))
#loc74 = loc(callsite(#loc50 at #loc42))
#loc75 = loc(callsite(#loc51 at #loc42))
#loc76 = loc(callsite(#loc52 at #loc42))
#loc77 = loc(callsite(#loc53 at #loc42))
#loc78 = loc(callsite(#loc54 at #loc42))
#loc79 = loc(callsite(#loc55 at #loc42))
#loc80 = loc(callsite(#loc56 at #loc42))
#loc81 = loc(callsite(#loc57 at #loc42))
#loc82 = loc(callsite(#loc67 at #loc43))
#loc83 = loc(callsite(#loc69 at #loc43))
#loc84 = loc(callsite(#loc70 at #loc43))
#loc85 = loc(callsite(#loc71 at #loc43))
#loc86 = loc(callsite(#loc72 at #loc43))
#loc87 = loc(callsite(#loc73 at #loc43))
#loc88 = loc(callsite(#loc74 at #loc43))
#loc89 = loc(callsite(#loc75 at #loc43))
#loc90 = loc(callsite(#loc76 at #loc43))
#loc91 = loc(callsite(#loc77 at #loc43))
#loc92 = loc(callsite(#loc78 at #loc43))
#loc93 = loc(callsite(#loc79 at #loc43))
#loc94 = loc(callsite(#loc80 at #loc43))
#loc95 = loc(callsite(#loc81 at #loc43))
// -----// IR Dump Before TritonRewriteTensorPointer (triton-rewrite-tensor-pointer) ('builtin.module' operation) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc44 = loc(unknown)
module {
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%true = arith.constant true loc(#loc77)
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc4)
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc5)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc6)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc7)
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc8)
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc9)
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc10)
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc11)
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc12)
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc13)
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc14)
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc12)
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc11)
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc15)
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc10)
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc16)
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc17)
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc18)
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc19)
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc20)
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc21)
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc22)
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc23)
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc24)
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc25)
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc25)
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc26)
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc9)
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc9)
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27)
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc28)
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc29)
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc7)
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc30)
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc31)
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc32)
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc33)
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc34)
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc35)
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36)
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36)
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc37)
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc38)
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38)
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39)
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc40)
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc6)
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41)
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc42)
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc43)
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc5)
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc5)
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc78)
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc79)
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc80)
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc81)
%52 = arith.xori %51, %true : i1 loc(#loc77)
%53 = arith.andi %50, %52 : i1 loc(#loc82)
%54 = arith.ori %48, %53 : i1 loc(#loc83)
%55 = arith.andi %50, %51 : i1 loc(#loc84)
%56 = arith.ori %49, %55 : i1 loc(#loc85)
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc86)
%58 = arith.andi %56, %57 : i1 loc(#loc87)
%59 = arith.ori %54, %58 : i1 loc(#loc88)
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc89)
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc90)
tt.reduce.return %60, %61 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58)
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc2 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc62 = loc(callsite(#loc1 at #loc2))
#loc63 = loc(callsite(#loc2 at #loc3))
#loc64 = loc(callsite(#loc45 at #loc2))
#loc65 = loc(callsite(#loc46 at #loc2))
#loc66 = loc(callsite(#loc47 at #loc2))
#loc67 = loc(callsite(#loc48 at #loc2))
#loc68 = loc(callsite(#loc49 at #loc2))
#loc69 = loc(callsite(#loc50 at #loc2))
#loc70 = loc(callsite(#loc51 at #loc2))
#loc71 = loc(callsite(#loc52 at #loc2))
#loc72 = loc(callsite(#loc53 at #loc2))
#loc73 = loc(callsite(#loc54 at #loc2))
#loc74 = loc(callsite(#loc55 at #loc2))
#loc75 = loc(callsite(#loc56 at #loc2))
#loc76 = loc(callsite(#loc57 at #loc2))
#loc77 = loc(callsite(#loc62 at #loc3))
#loc78 = loc(callsite(#loc64 at #loc3))
#loc79 = loc(callsite(#loc65 at #loc3))
#loc80 = loc(callsite(#loc66 at #loc3))
#loc81 = loc(callsite(#loc67 at #loc3))
#loc82 = loc(callsite(#loc68 at #loc3))
#loc83 = loc(callsite(#loc69 at #loc3))
#loc84 = loc(callsite(#loc70 at #loc3))
#loc85 = loc(callsite(#loc71 at #loc3))
#loc86 = loc(callsite(#loc72 at #loc3))
#loc87 = loc(callsite(#loc73 at #loc3))
#loc88 = loc(callsite(#loc74 at #loc3))
#loc89 = loc(callsite(#loc75 at #loc3))
#loc90 = loc(callsite(#loc76 at #loc3))
// -----// IR Dump Before Inliner (inline) ('builtin.module' operation) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc44 = loc(unknown)
// Module holding a single Triton kernel, @triton__0d1d23de.
// What the function body below does:
//   * builds a 1x32 lane-index tensor (0..31) and, per lane, averages up to
//     two f64 values loaded from %arg0;
//   * reduces the 32 averages along axis 1 together with their lane indices,
//     keeping the index of the largest value (a NaN wins over a non-NaN,
//     ties go to the smaller index);
//   * sign-extends that index to i64 and stores it at %arg1 + 0.
// %arg2 and %arg3 are not referenced anywhere in the body.
module {
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
// ---- Constants ----
// %true is only used inside the reduction body, to negate an i1 via xor.
%true = arith.constant true loc(#loc77)
// Zero offset added to the output pointer before the final store.
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc4)
// 0xFF800000 is the f32 bit pattern of -infinity; used as the fill value for
// lanes excluded by the range mask before the reduction.
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc5)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc6)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc7)
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc8)
// f32 zero; extended to f64 below and passed as the "other" operand of the
// masked loads.
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc9)
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc10)
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc11)
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc12)
// ---- Lane indices ----
// %1 holds the lane index 0..31 in a 1x32 tensor.
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc13)
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc14)
// Range mask: lane index < 32.
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc12)
// Split the lane index: %3 = index mod 2, %4 = index div 2.
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc11)
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc15)
// ---- Window bounds ----
// %6 = (%3 * 3) / 2 is the window start; %9 = (%3 * 3) / 2 + 2 is the
// exclusive window end. The same (%3 * 3) / 2 expression is recomputed
// several times below (%7/%8, %12/%13, %21/%22, %27/%28) rather than reused.
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc10)
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc16)
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc17)
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc18)
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc19)
// %10: the first window element (start) lies before the window end.
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc20)
// ---- First load: element at offset %4 * 3 + (%3 * 3) / 2 from %arg0 ----
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc21)
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc22)
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc23)
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc24)
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc25)
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc25)
// Load mask = range mask AND window mask; masked-off lanes receive %18 (0.0).
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc26)
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc9)
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc9)
// Contribute 0.0 where the window mask %10 is false.
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27)
// ---- Second load: element at offset %4 * 3 + 1 + (%3 * 3) / 2 ----
// %23 = start + 1; %24 checks that it is still before the window end %9.
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc28)
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc29)
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc7)
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc30)
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc31)
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc32)
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc33)
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc34)
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc35)
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36)
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36)
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc37)
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc38)
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38)
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39)
// ---- Average ----
// %36 = sum of the two (masked) values; %39 = number of values that were
// inside the window (1.0 per true mask); %40 = sum / count.
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc40)
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc6)
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41)
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc42)
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc43)
// Lanes outside the range mask are replaced by -inf before the reduction.
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc5)
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc5)
// ---- Reduction over axis 1 of (value, lane index) pairs ----
// Combine rule for pairs (%arg4, %arg5) and (%arg6, %arg7): keep the first
// pair when its value is greater, or when it is NaN and the other is not, or
// when the values are equal / both NaN and its index is smaller; otherwise
// keep the second pair.
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
// %48: first > second (ordered compare, false if either is NaN).
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc78)
// %49: first == second (ordered compare).
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc79)
// x != x (unordered-or-not-equal) is true only for NaN:
// %50 = first is NaN, %51 = second is NaN.
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc80)
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc81)
// %52 = second is not NaN (xor with true negates the i1).
%52 = arith.xori %51, %true : i1 loc(#loc77)
// %54 = first is strictly preferred: greater, or NaN against a non-NaN.
%53 = arith.andi %50, %52 : i1 loc(#loc82)
%54 = arith.ori %48, %53 : i1 loc(#loc83)
// %56 = values tie: equal, or both NaN.
%55 = arith.andi %50, %51 : i1 loc(#loc84)
%56 = arith.ori %49, %55 : i1 loc(#loc85)
// On a tie the pair with the smaller index wins.
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc86)
%58 = arith.andi %56, %57 : i1 loc(#loc87)
// %59 = keep the first pair; both value and index are selected with it.
%59 = arith.ori %54, %58 : i1 loc(#loc88)
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc89)
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc90)
tt.reduce.return %60, %61 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
// ---- Store ----
// Only the index result (%43#1) is used; the reduced value (%43#0) is not.
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58)
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
// Output address is %arg1 plus the constant zero offset %cst.
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
// Widen the i32 index to i64 to match the element type of %arg1.
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc2 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc62 = loc(callsite(#loc1 at #loc2))
#loc63 = loc(callsite(#loc2 at #loc3))
#loc64 = loc(callsite(#loc45 at #loc2))
#loc65 = loc(callsite(#loc46 at #loc2))
#loc66 = loc(callsite(#loc47 at #loc2))
#loc67 = loc(callsite(#loc48 at #loc2))
#loc68 = loc(callsite(#loc49 at #loc2))
#loc69 = loc(callsite(#loc50 at #loc2))
#loc70 = loc(callsite(#loc51 at #loc2))
#loc71 = loc(callsite(#loc52 at #loc2))
#loc72 = loc(callsite(#loc53 at #loc2))
#loc73 = loc(callsite(#loc54 at #loc2))
#loc74 = loc(callsite(#loc55 at #loc2))
#loc75 = loc(callsite(#loc56 at #loc2))
#loc76 = loc(callsite(#loc57 at #loc2))
#loc77 = loc(callsite(#loc62 at #loc3))
#loc78 = loc(callsite(#loc64 at #loc3))
#loc79 = loc(callsite(#loc65 at #loc3))
#loc80 = loc(callsite(#loc66 at #loc3))
#loc81 = loc(callsite(#loc67 at #loc3))
#loc82 = loc(callsite(#loc68 at #loc3))
#loc83 = loc(callsite(#loc69 at #loc3))
#loc84 = loc(callsite(#loc70 at #loc3))
#loc85 = loc(callsite(#loc71 at #loc3))
#loc86 = loc(callsite(#loc72 at #loc3))
#loc87 = loc(callsite(#loc73 at #loc3))
#loc88 = loc(callsite(#loc74 at #loc3))
#loc89 = loc(callsite(#loc75 at #loc3))
#loc90 = loc(callsite(#loc76 at #loc3))
// -----// IR Dump Before Canonicalizer (canonicalize) ('tt.func' operation: @triton__0d1d23de) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc44 = loc(unknown)
// Module holding a single Triton kernel, @triton__0d1d23de.
// What the function body below does:
//   * builds a 1x32 lane-index tensor (0..31) and, per lane, averages up to
//     two f64 values loaded from %arg0;
//   * reduces the 32 averages along axis 1 together with their lane indices,
//     keeping the index of the largest value (a NaN wins over a non-NaN,
//     ties go to the smaller index);
//   * sign-extends that index to i64 and stores it at %arg1 + 0.
// %arg2 and %arg3 are not referenced anywhere in the body.
module {
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
// ---- Constants ----
// %true is only used inside the reduction body, to negate an i1 via xor.
%true = arith.constant true loc(#loc77)
// Zero offset added to the output pointer before the final store.
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc4)
// 0xFF800000 is the f32 bit pattern of -infinity; used as the fill value for
// lanes excluded by the range mask before the reduction.
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc5)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc6)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc7)
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc8)
// f32 zero; extended to f64 below and passed as the "other" operand of the
// masked loads.
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc9)
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc10)
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc11)
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc12)
// ---- Lane indices ----
// %1 holds the lane index 0..31 in a 1x32 tensor.
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc13)
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc14)
// Range mask: lane index < 32.
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc12)
// Split the lane index: %3 = index mod 2, %4 = index div 2.
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc11)
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc15)
// ---- Window bounds ----
// %6 = (%3 * 3) / 2 is the window start; %9 = (%3 * 3) / 2 + 2 is the
// exclusive window end. The same (%3 * 3) / 2 expression is recomputed
// several times below (%7/%8, %12/%13, %21/%22, %27/%28) rather than reused.
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc10)
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc16)
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc17)
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc18)
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc19)
// %10: the first window element (start) lies before the window end.
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc20)
// ---- First load: element at offset %4 * 3 + (%3 * 3) / 2 from %arg0 ----
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc21)
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc22)
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc23)
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc24)
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc25)
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc25)
// Load mask = range mask AND window mask; masked-off lanes receive %18 (0.0).
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc26)
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc9)
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc9)
// Contribute 0.0 where the window mask %10 is false.
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27)
// ---- Second load: element at offset %4 * 3 + 1 + (%3 * 3) / 2 ----
// %23 = start + 1; %24 checks that it is still before the window end %9.
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc28)
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc29)
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc7)
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc30)
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc31)
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc32)
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc33)
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc34)
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc35)
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36)
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36)
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc37)
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc38)
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38)
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39)
// ---- Average ----
// %36 = sum of the two (masked) values; %39 = number of values that were
// inside the window (1.0 per true mask); %40 = sum / count.
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc40)
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc6)
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41)
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc42)
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc43)
// Lanes outside the range mask are replaced by -inf before the reduction.
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc5)
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc5)
// ---- Reduction over axis 1 of (value, lane index) pairs ----
// Combine rule for pairs (%arg4, %arg5) and (%arg6, %arg7): keep the first
// pair when its value is greater, or when it is NaN and the other is not, or
// when the values are equal / both NaN and its index is smaller; otherwise
// keep the second pair.
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
// %48: first > second (ordered compare, false if either is NaN).
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc78)
// %49: first == second (ordered compare).
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc79)
// x != x (unordered-or-not-equal) is true only for NaN:
// %50 = first is NaN, %51 = second is NaN.
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc80)
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc81)
// %52 = second is not NaN (xor with true negates the i1).
%52 = arith.xori %51, %true : i1 loc(#loc77)
// %54 = first is strictly preferred: greater, or NaN against a non-NaN.
%53 = arith.andi %50, %52 : i1 loc(#loc82)
%54 = arith.ori %48, %53 : i1 loc(#loc83)
// %56 = values tie: equal, or both NaN.
%55 = arith.andi %50, %51 : i1 loc(#loc84)
%56 = arith.ori %49, %55 : i1 loc(#loc85)
// On a tie the pair with the smaller index wins.
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc86)
%58 = arith.andi %56, %57 : i1 loc(#loc87)
// %59 = keep the first pair; both value and index are selected with it.
%59 = arith.ori %54, %58 : i1 loc(#loc88)
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc89)
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc90)
tt.reduce.return %60, %61 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
// ---- Store ----
// Only the index result (%43#1) is used; the reduced value (%43#0) is not.
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58)
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
// Output address is %arg1 plus the constant zero offset %cst.
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
// Widen the i32 index to i64 to match the element type of %arg1.
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc2 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc62 = loc(callsite(#loc1 at #loc2))
#loc63 = loc(callsite(#loc2 at #loc3))
#loc64 = loc(callsite(#loc45 at #loc2))
#loc65 = loc(callsite(#loc46 at #loc2))
#loc66 = loc(callsite(#loc47 at #loc2))
#loc67 = loc(callsite(#loc48 at #loc2))
#loc68 = loc(callsite(#loc49 at #loc2))
#loc69 = loc(callsite(#loc50 at #loc2))
#loc70 = loc(callsite(#loc51 at #loc2))
#loc71 = loc(callsite(#loc52 at #loc2))
#loc72 = loc(callsite(#loc53 at #loc2))
#loc73 = loc(callsite(#loc54 at #loc2))
#loc74 = loc(callsite(#loc55 at #loc2))
#loc75 = loc(callsite(#loc56 at #loc2))
#loc76 = loc(callsite(#loc57 at #loc2))
#loc77 = loc(callsite(#loc62 at #loc3))
#loc78 = loc(callsite(#loc64 at #loc3))
#loc79 = loc(callsite(#loc65 at #loc3))
#loc80 = loc(callsite(#loc66 at #loc3))
#loc81 = loc(callsite(#loc67 at #loc3))
#loc82 = loc(callsite(#loc68 at #loc3))
#loc83 = loc(callsite(#loc69 at #loc3))
#loc84 = loc(callsite(#loc70 at #loc3))
#loc85 = loc(callsite(#loc71 at #loc3))
#loc86 = loc(callsite(#loc72 at #loc3))
#loc87 = loc(callsite(#loc73 at #loc3))
#loc88 = loc(callsite(#loc74 at #loc3))
#loc89 = loc(callsite(#loc75 at #loc3))
#loc90 = loc(callsite(#loc76 at #loc3))
// -----// IR Dump Before TritonCombineOps (triton-combine) ('builtin.module' operation) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc44 = loc(unknown)
module {
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%true = arith.constant true loc(#loc77)
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc4)
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc5)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc6)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc7)
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc8)
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc9)
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc10)
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc11)
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc12)
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc13)
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc14)
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc12)
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc11)
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc15)
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc10)
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc16)
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc17)
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc18)
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc19)
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc20)
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc21)
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc22)
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc23)
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc24)
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc25)
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc25)
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc26)
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc9)
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc9)
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27)
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc28)
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc29)
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc7)
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc30)
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc31)
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc32)
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc33)
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc34)
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc35)
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36)
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36)
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc37)
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc38)
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38)
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39)
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc40)
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc6)
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41)
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc42)
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc43)
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc5)
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc5)
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc78)
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc79)
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc80)
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc81)
%52 = arith.xori %51, %true : i1 loc(#loc77)
%53 = arith.andi %50, %52 : i1 loc(#loc82)
%54 = arith.ori %48, %53 : i1 loc(#loc83)
%55 = arith.andi %50, %51 : i1 loc(#loc84)
%56 = arith.ori %49, %55 : i1 loc(#loc85)
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc86)
%58 = arith.andi %56, %57 : i1 loc(#loc87)
%59 = arith.ori %54, %58 : i1 loc(#loc88)
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc89)
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc90)
tt.reduce.return %60, %61 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58)
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc2 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc62 = loc(callsite(#loc1 at #loc2))
#loc63 = loc(callsite(#loc2 at #loc3))
#loc64 = loc(callsite(#loc45 at #loc2))
#loc65 = loc(callsite(#loc46 at #loc2))
#loc66 = loc(callsite(#loc47 at #loc2))
#loc67 = loc(callsite(#loc48 at #loc2))
#loc68 = loc(callsite(#loc49 at #loc2))
#loc69 = loc(callsite(#loc50 at #loc2))
#loc70 = loc(callsite(#loc51 at #loc2))
#loc71 = loc(callsite(#loc52 at #loc2))
#loc72 = loc(callsite(#loc53 at #loc2))
#loc73 = loc(callsite(#loc54 at #loc2))
#loc74 = loc(callsite(#loc55 at #loc2))
#loc75 = loc(callsite(#loc56 at #loc2))
#loc76 = loc(callsite(#loc57 at #loc2))
#loc77 = loc(callsite(#loc62 at #loc3))
#loc78 = loc(callsite(#loc64 at #loc3))
#loc79 = loc(callsite(#loc65 at #loc3))
#loc80 = loc(callsite(#loc66 at #loc3))
#loc81 = loc(callsite(#loc67 at #loc3))
#loc82 = loc(callsite(#loc68 at #loc3))
#loc83 = loc(callsite(#loc69 at #loc3))
#loc84 = loc(callsite(#loc70 at #loc3))
#loc85 = loc(callsite(#loc71 at #loc3))
#loc86 = loc(callsite(#loc72 at #loc3))
#loc87 = loc(callsite(#loc73 at #loc3))
#loc88 = loc(callsite(#loc74 at #loc3))
#loc89 = loc(callsite(#loc75 at #loc3))
#loc90 = loc(callsite(#loc76 at #loc3))
// -----// IR Dump Before Canonicalizer (canonicalize) ('builtin.module' operation) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc44 = loc(unknown)
module {
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%true = arith.constant true loc(#loc77)
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc4)
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc5)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc6)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc7)
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc8)
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc9)
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc10)
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc11)
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc12)
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc13)
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc14)
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc12)
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc11)
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc15)
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc10)
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc16)
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc17)
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc18)
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc19)
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc20)
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc21)
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc22)
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc23)
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc24)
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc25)
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc25)
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc26)
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc9)
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc9)
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27)
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc28)
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc29)
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc7)
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc30)
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc31)
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc32)
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc33)
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc34)
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc35)
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36)
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36)
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc37)
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc38)
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38)
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39)
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc40)
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc6)
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41)
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc42)
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc43)
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc5)
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc5)
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc78)
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc79)
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc80)
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc81)
%52 = arith.xori %51, %true : i1 loc(#loc77)
%53 = arith.andi %50, %52 : i1 loc(#loc82)
%54 = arith.ori %48, %53 : i1 loc(#loc83)
%55 = arith.andi %50, %51 : i1 loc(#loc84)
%56 = arith.ori %49, %55 : i1 loc(#loc85)
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc86)
%58 = arith.andi %56, %57 : i1 loc(#loc87)
%59 = arith.ori %54, %58 : i1 loc(#loc88)
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc89)
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc90)
tt.reduce.return %60, %61 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58)
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc2 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc62 = loc(callsite(#loc1 at #loc2))
#loc63 = loc(callsite(#loc2 at #loc3))
#loc64 = loc(callsite(#loc45 at #loc2))
#loc65 = loc(callsite(#loc46 at #loc2))
#loc66 = loc(callsite(#loc47 at #loc2))
#loc67 = loc(callsite(#loc48 at #loc2))
#loc68 = loc(callsite(#loc49 at #loc2))
#loc69 = loc(callsite(#loc50 at #loc2))
#loc70 = loc(callsite(#loc51 at #loc2))
#loc71 = loc(callsite(#loc52 at #loc2))
#loc72 = loc(callsite(#loc53 at #loc2))
#loc73 = loc(callsite(#loc54 at #loc2))
#loc74 = loc(callsite(#loc55 at #loc2))
#loc75 = loc(callsite(#loc56 at #loc2))
#loc76 = loc(callsite(#loc57 at #loc2))
#loc77 = loc(callsite(#loc62 at #loc3))
#loc78 = loc(callsite(#loc64 at #loc3))
#loc79 = loc(callsite(#loc65 at #loc3))
#loc80 = loc(callsite(#loc66 at #loc3))
#loc81 = loc(callsite(#loc67 at #loc3))
#loc82 = loc(callsite(#loc68 at #loc3))
#loc83 = loc(callsite(#loc69 at #loc3))
#loc84 = loc(callsite(#loc70 at #loc3))
#loc85 = loc(callsite(#loc71 at #loc3))
#loc86 = loc(callsite(#loc72 at #loc3))
#loc87 = loc(callsite(#loc73 at #loc3))
#loc88 = loc(callsite(#loc74 at #loc3))
#loc89 = loc(callsite(#loc75 at #loc3))
#loc90 = loc(callsite(#loc76 at #loc3))
// -----// IR Dump Before TritonReorderBroadcast (triton-reorder-broadcast) ('builtin.module' operation) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc44 = loc(unknown)
module {
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%true = arith.constant true loc(#loc77)
%cst = arith.constant dense<0> : tensor<1x1xi32> loc(#loc4)
%cst_0 = arith.constant dense<0xFF800000> : tensor<1x32xf32> loc(#loc5)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc6)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc7)
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc8)
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x32xf32> loc(#loc9)
%cst_5 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc10)
%cst_6 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc11)
%cst_7 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc12)
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc13)
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc14)
%2 = arith.cmpi slt, %1, %cst_7 : tensor<1x32xi32> loc(#loc12)
%3 = arith.remsi %1, %cst_6 : tensor<1x32xi32> loc(#loc11)
%4 = arith.divsi %1, %cst_6 : tensor<1x32xi32> loc(#loc15)
%5 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc10)
%6 = arith.divsi %5, %cst_6 : tensor<1x32xi32> loc(#loc16)
%7 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc17)
%8 = arith.divsi %7, %cst_6 : tensor<1x32xi32> loc(#loc18)
%9 = arith.addi %8, %cst_6 : tensor<1x32xi32> loc(#loc19)
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc20)
%11 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc21)
%12 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc22)
%13 = arith.divsi %12, %cst_6 : tensor<1x32xi32> loc(#loc23)
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc24)
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc25)
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc25)
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc26)
%18 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc9)
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc9)
%20 = arith.select %10, %19, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27)
%21 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc28)
%22 = arith.divsi %21, %cst_6 : tensor<1x32xi32> loc(#loc29)
%23 = arith.addi %22, %cst_2 : tensor<1x32xi32> loc(#loc7)
%24 = arith.cmpi slt, %23, %9 : tensor<1x32xi32> loc(#loc30)
%25 = arith.muli %4, %cst_5 : tensor<1x32xi32> loc(#loc31)
%26 = arith.addi %25, %cst_2 : tensor<1x32xi32> loc(#loc32)
%27 = arith.muli %3, %cst_5 : tensor<1x32xi32> loc(#loc33)
%28 = arith.divsi %27, %cst_6 : tensor<1x32xi32> loc(#loc34)
%29 = arith.addi %26, %28 : tensor<1x32xi32> loc(#loc35)
%30 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36)
%31 = tt.addptr %30, %29 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36)
%32 = arith.andi %2, %24 : tensor<1x32xi1> loc(#loc37)
%33 = arith.extf %cst_4 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc38)
%34 = tt.load %31, %32, %33 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38)
%35 = arith.select %24, %34, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39)
%36 = arith.addf %35, %20 : tensor<1x32xf64> loc(#loc40)
%37 = arith.select %10, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc6)
%38 = arith.select %24, %cst_1, %cst_3 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41)
%39 = arith.addf %38, %37 : tensor<1x32xf64> loc(#loc42)
%40 = arith.divf %36, %39 : tensor<1x32xf64> loc(#loc43)
%41 = arith.extf %cst_0 : tensor<1x32xf32> to tensor<1x32xf64> loc(#loc5)
%42 = arith.select %2, %40, %41 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc5)
%43:2 = "tt.reduce"(%42, %1) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
%48 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc78)
%49 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc79)
%50 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc80)
%51 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc81)
%52 = arith.xori %51, %true : i1 loc(#loc77)
%53 = arith.andi %50, %52 : i1 loc(#loc82)
%54 = arith.ori %48, %53 : i1 loc(#loc83)
%55 = arith.andi %50, %51 : i1 loc(#loc84)
%56 = arith.ori %49, %55 : i1 loc(#loc85)
%57 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc86)
%58 = arith.andi %56, %57 : i1 loc(#loc87)
%59 = arith.ori %54, %58 : i1 loc(#loc88)
%60 = arith.select %59, %arg4, %arg6 : f64 loc(#loc89)
%61 = arith.select %59, %arg5, %arg7 : i32 loc(#loc90)
tt.reduce.return %60, %61 : f64, i32 loc(#loc63)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc63)
%44 = tt.expand_dims %43#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58)
%45 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc59)
%46 = tt.addptr %45, %cst : tensor<1x1x!tt.ptr<i64, 1>>, tensor<1x1xi32> loc(#loc59)
%47 = arith.extsi %44 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc60)
tt.store %46, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc60)
tt.return loc(#loc61)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc2 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc4 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:49)
#loc5 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc61 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc62 = loc(callsite(#loc1 at #loc2))
#loc63 = loc(callsite(#loc2 at #loc3))
#loc64 = loc(callsite(#loc45 at #loc2))
#loc65 = loc(callsite(#loc46 at #loc2))
#loc66 = loc(callsite(#loc47 at #loc2))
#loc67 = loc(callsite(#loc48 at #loc2))
#loc68 = loc(callsite(#loc49 at #loc2))
#loc69 = loc(callsite(#loc50 at #loc2))
#loc70 = loc(callsite(#loc51 at #loc2))
#loc71 = loc(callsite(#loc52 at #loc2))
#loc72 = loc(callsite(#loc53 at #loc2))
#loc73 = loc(callsite(#loc54 at #loc2))
#loc74 = loc(callsite(#loc55 at #loc2))
#loc75 = loc(callsite(#loc56 at #loc2))
#loc76 = loc(callsite(#loc57 at #loc2))
#loc77 = loc(callsite(#loc62 at #loc3))
#loc78 = loc(callsite(#loc64 at #loc3))
#loc79 = loc(callsite(#loc65 at #loc3))
#loc80 = loc(callsite(#loc66 at #loc3))
#loc81 = loc(callsite(#loc67 at #loc3))
#loc82 = loc(callsite(#loc68 at #loc3))
#loc83 = loc(callsite(#loc69 at #loc3))
#loc84 = loc(callsite(#loc70 at #loc3))
#loc85 = loc(callsite(#loc71 at #loc3))
#loc86 = loc(callsite(#loc72 at #loc3))
#loc87 = loc(callsite(#loc73 at #loc3))
#loc88 = loc(callsite(#loc74 at #loc3))
#loc89 = loc(callsite(#loc75 at #loc3))
#loc90 = loc(callsite(#loc76 at #loc3))
// -----// IR Dump Before CSE (cse) ('builtin.module' operation) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc44 = loc(unknown)
module {
// Kernel @triton__0d1d23de, as dumped before the CSE pass.
// Summary of what the body below does:
//   1. builds lane indices 0..31 as a 1x32 tensor,
//   2. performs two masked f64 loads through %arg0 and divides their sum by
//      the number of contributing loads (a per-lane average),
//   3. reduces (value, lane index) pairs along axis 1, keeping the pair with
//      the larger value (NaN is preferred over non-NaN, ties go to the
//      smaller index),
//   4. stores the winning index, sign-extended to i64, through %arg1.
// %arg2 and %arg3 are declared but not referenced anywhere in this body.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
// ---- Constants ----
// %cst: 0.0, used both as the load fallback operand and as the "false" value of selects.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc1)
// %cst_0: 0xFFF0000000000000 is the IEEE-754 f64 bit pattern of -infinity;
// it fills lanes excluded by %2 before the reduction.
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64> loc(#loc2)
%c0_i32 = arith.constant 0 : i32 loc(#loc3)
// %true is only used inside the reduce region to negate an i1 via xori.
%true = arith.constant true loc(#loc76)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc7)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc8)
%cst_3 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc9)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc10)
%cst_5 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
// ---- Lane indices and bounds mask ----
// %1: lane indices from the range [0, 32), reshaped to 1x32.
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc12)
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc13)
// %2: index < 32 (holds for every lane of a 0..32 range).
%2 = arith.cmpi slt, %1, %cst_5 : tensor<1x32xi32> loc(#loc11)
// %3 = index % 2, %4 = index / 2 (signed).
%3 = arith.remsi %1, %cst_4 : tensor<1x32xi32> loc(#loc10)
%4 = arith.divsi %1, %cst_4 : tensor<1x32xi32> loc(#loc14)
// %6 = (%3 * 3) / 2. The same expression is recomputed several times below
// (%7/%8, %12/%13, %20/%21, %26/%27) in this pre-CSE form.
%5 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc9)
%6 = arith.divsi %5, %cst_4 : tensor<1x32xi32> loc(#loc15)
%7 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc16)
%8 = arith.divsi %7, %cst_4 : tensor<1x32xi32> loc(#loc17)
// %9 = (%3 * 3) / 2 + 2; %10 tests (%3 * 3) / 2 < %9.
%9 = arith.addi %8, %cst_4 : tensor<1x32xi32> loc(#loc18)
%10 = arith.cmpi slt, %6, %9 : tensor<1x32xi32> loc(#loc19)
// ---- First load: element offset %14 = (%4 * 3) + (%3 * 3) / 2 ----
%11 = arith.muli %4, %cst_3 : tensor<1x32xi32> loc(#loc20)
%12 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc21)
%13 = arith.divsi %12, %cst_4 : tensor<1x32xi32> loc(#loc22)
%14 = arith.addi %11, %13 : tensor<1x32xi32> loc(#loc23)
%15 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc24)
%16 = tt.addptr %15, %14 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc24)
// Load is masked by (%2 AND %10), with %cst (0.0) as the third (fallback) operand.
%17 = arith.andi %2, %10 : tensor<1x32xi1> loc(#loc25)
%18 = tt.load %16, %17, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc26)
// Keep the loaded value only where %10 holds; otherwise 0.0.
%19 = arith.select %10, %18, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc27)
// ---- Second load: same pattern, shifted by one element ----
%20 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc28)
%21 = arith.divsi %20, %cst_4 : tensor<1x32xi32> loc(#loc29)
// %23 tests (%3 * 3) / 2 + 1 < %9.
%22 = arith.addi %21, %cst_2 : tensor<1x32xi32> loc(#loc8)
%23 = arith.cmpi slt, %22, %9 : tensor<1x32xi32> loc(#loc30)
// Element offset %28 = (%4 * 3) + 1 + (%3 * 3) / 2.
%24 = arith.muli %4, %cst_3 : tensor<1x32xi32> loc(#loc31)
%25 = arith.addi %24, %cst_2 : tensor<1x32xi32> loc(#loc32)
%26 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc33)
%27 = arith.divsi %26, %cst_4 : tensor<1x32xi32> loc(#loc34)
%28 = arith.addi %25, %27 : tensor<1x32xi32> loc(#loc35)
%29 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc36)
%30 = tt.addptr %29, %28 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc36)
%31 = arith.andi %2, %23 : tensor<1x32xi1> loc(#loc37)
%32 = tt.load %30, %31, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc38)
%33 = arith.select %23, %32, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc39)
// ---- Per-lane average ----
// %34: sum of the two (possibly zeroed) loaded values.
%34 = arith.addf %33, %19 : tensor<1x32xf64> loc(#loc40)
// %35/%36: 1.0 where the corresponding condition holds, else 0.0; %37 is their sum (the divisor).
%35 = arith.select %10, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc7)
%36 = arith.select %23, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc41)
%37 = arith.addf %36, %35 : tensor<1x32xf64> loc(#loc42)
%38 = arith.divf %34, %37 : tensor<1x32xf64> loc(#loc43)
// Lanes outside the bounds mask %2 are replaced with -infinity before reducing.
%39 = arith.select %2, %38, %cst_0 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc2)
// ---- Reduction over axis 1 of (value, lane index) pairs ----
// Region arguments: (%arg4, %arg5) is one (value, index) pair and
// (%arg6, %arg7) is the other.
%40:2 = "tt.reduce"(%39, %1) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
// %45: a > b (ordered); %46: a == b (ordered).
%45 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc77)
%46 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc78)
// "une x, x" is true only when x is NaN: %47 = isnan(a), %48 = isnan(b).
%47 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc79)
%48 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc80)
// %49 = !isnan(b), computed as xor with true.
%49 = arith.xori %48, %true : i1 loc(#loc76)
// %51: a strictly wins = (a > b) OR (a is NaN and b is not).
%50 = arith.andi %47, %49 : i1 loc(#loc81)
%51 = arith.ori %45, %50 : i1 loc(#loc82)
// %53: tie = (a == b) OR (both are NaN).
%52 = arith.andi %47, %48 : i1 loc(#loc83)
%53 = arith.ori %46, %52 : i1 loc(#loc84)
// On a tie, the pair with the smaller (signed) index wins.
%54 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc85)
%55 = arith.andi %53, %54 : i1 loc(#loc86)
// %56: choose the first pair; otherwise the second pair is kept.
%56 = arith.ori %51, %55 : i1 loc(#loc87)
%57 = arith.select %56, %arg4, %arg6 : f64 loc(#loc88)
%58 = arith.select %56, %arg5, %arg7 : i32 loc(#loc89)
tt.reduce.return %57, %58 : f64, i32 loc(#loc62)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc62)
// ---- Store the winning index ----
// Only result #1 (the index) of the reduction is used; result #0 (the value) is unused.
%41 = tt.expand_dims %40#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc58)
// Destination pointer is %arg1 plus a constant offset of 0.
%42 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3)
%43 = tt.splat %42 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc3)
// Sign-extend the i32 index to i64 to match the pointee type of %arg1.
%44 = arith.extsi %41 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc59)
tt.store %43, %44 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc59)
tt.return loc(#loc60)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:19)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:26)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:59)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:66)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:20)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:27)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:54)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:64)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:71)
#loc35 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc36 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc37 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc38 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc39 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc40 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc41 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc42 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc43 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc49 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc50 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc51 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc52 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc53 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc54 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc55 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc56 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc57 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc58 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc59 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc60 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc61 = loc(callsite(#loc4 at #loc5))
#loc62 = loc(callsite(#loc5 at #loc6))
#loc63 = loc(callsite(#loc45 at #loc5))
#loc64 = loc(callsite(#loc46 at #loc5))
#loc65 = loc(callsite(#loc47 at #loc5))
#loc66 = loc(callsite(#loc48 at #loc5))
#loc67 = loc(callsite(#loc49 at #loc5))
#loc68 = loc(callsite(#loc50 at #loc5))
#loc69 = loc(callsite(#loc51 at #loc5))
#loc70 = loc(callsite(#loc52 at #loc5))
#loc71 = loc(callsite(#loc53 at #loc5))
#loc72 = loc(callsite(#loc54 at #loc5))
#loc73 = loc(callsite(#loc55 at #loc5))
#loc74 = loc(callsite(#loc56 at #loc5))
#loc75 = loc(callsite(#loc57 at #loc5))
#loc76 = loc(callsite(#loc61 at #loc6))
#loc77 = loc(callsite(#loc63 at #loc6))
#loc78 = loc(callsite(#loc64 at #loc6))
#loc79 = loc(callsite(#loc65 at #loc6))
#loc80 = loc(callsite(#loc66 at #loc6))
#loc81 = loc(callsite(#loc67 at #loc6))
#loc82 = loc(callsite(#loc68 at #loc6))
#loc83 = loc(callsite(#loc69 at #loc6))
#loc84 = loc(callsite(#loc70 at #loc6))
#loc85 = loc(callsite(#loc71 at #loc6))
#loc86 = loc(callsite(#loc72 at #loc6))
#loc87 = loc(callsite(#loc73 at #loc6))
#loc88 = loc(callsite(#loc74 at #loc6))
#loc89 = loc(callsite(#loc75 at #loc6))
// -----// IR Dump Before LoopInvariantCodeMotion (loop-invariant-code-motion) ('builtin.module' operation) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc35 = loc(unknown)
module {
// Kernel @triton__0d1d23de, as dumped before the LoopInvariantCodeMotion pass.
// Summary of what the body below does:
//   1. builds lane indices 0..31 as a 1x32 tensor,
//   2. performs two masked f64 loads through %arg0 (both from the single
//      splatted base pointer %11) and divides their sum by the number of
//      contributing loads (a per-lane average),
//   3. reduces (value, lane index) pairs along axis 1, keeping the pair with
//      the larger value (NaN is preferred over non-NaN, ties go to the
//      smaller index),
//   4. stores the winning index, sign-extended to i64, through %arg1.
// %arg2 and %arg3 are declared but not referenced anywhere in this body.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
// ---- Constants ----
// %cst: 0.0, used both as the load fallback operand and as the "false" value of selects.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc1)
// %cst_0: 0xFFF0000000000000 is the IEEE-754 f64 bit pattern of -infinity;
// it fills lanes excluded by %2 before the reduction.
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64> loc(#loc2)
%c0_i32 = arith.constant 0 : i32 loc(#loc3)
// %true is only used inside the reduce region to negate an i1 via xori.
%true = arith.constant true loc(#loc67)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc7)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc8)
%cst_3 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc9)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc10)
%cst_5 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
// ---- Lane indices and bounds mask ----
// %1: lane indices from the range [0, 32), reshaped to 1x32.
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc12)
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc13)
// %2: index < 32 (holds for every lane of a 0..32 range).
%2 = arith.cmpi slt, %1, %cst_5 : tensor<1x32xi32> loc(#loc11)
// %3 = index % 2, %4 = index / 2 (signed).
%3 = arith.remsi %1, %cst_4 : tensor<1x32xi32> loc(#loc10)
%4 = arith.divsi %1, %cst_4 : tensor<1x32xi32> loc(#loc14)
// %6 = (%3 * 3) / 2; reused by both loads below.
%5 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc9)
%6 = arith.divsi %5, %cst_4 : tensor<1x32xi32> loc(#loc15)
// %7 = %6 + 2; %8 tests %6 < %7, i.e. a value against itself plus two.
%7 = arith.addi %6, %cst_4 : tensor<1x32xi32> loc(#loc16)
%8 = arith.cmpi slt, %6, %7 : tensor<1x32xi32> loc(#loc17)
// ---- First load: element offset %10 = (%4 * 3) + %6 ----
%9 = arith.muli %4, %cst_3 : tensor<1x32xi32> loc(#loc18)
%10 = arith.addi %9, %6 : tensor<1x32xi32> loc(#loc19)
%11 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc20)
%12 = tt.addptr %11, %10 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc20)
// Load is masked by (%2 AND %8), with %cst (0.0) as the third (fallback) operand.
%13 = arith.andi %2, %8 : tensor<1x32xi1> loc(#loc21)
%14 = tt.load %12, %13, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc22)
// Keep the loaded value only where %8 holds; otherwise 0.0.
%15 = arith.select %8, %14, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc23)
// ---- Second load: same pattern, shifted by one element ----
// %17 tests %6 + 1 < %7.
%16 = arith.addi %6, %cst_2 : tensor<1x32xi32> loc(#loc8)
%17 = arith.cmpi slt, %16, %7 : tensor<1x32xi32> loc(#loc24)
// Element offset %19 = (%4 * 3) + 1 + %6.
%18 = arith.addi %9, %cst_2 : tensor<1x32xi32> loc(#loc25)
%19 = arith.addi %18, %6 : tensor<1x32xi32> loc(#loc26)
%20 = tt.addptr %11, %19 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc27)
%21 = arith.andi %2, %17 : tensor<1x32xi1> loc(#loc28)
%22 = tt.load %20, %21, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc29)
%23 = arith.select %17, %22, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc30)
// ---- Per-lane average ----
// %24: sum of the two (possibly zeroed) loaded values.
%24 = arith.addf %23, %15 : tensor<1x32xf64> loc(#loc31)
// %25/%26: 1.0 where the corresponding condition holds, else 0.0; %27 is their sum (the divisor).
%25 = arith.select %8, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc7)
%26 = arith.select %17, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
%27 = arith.addf %26, %25 : tensor<1x32xf64> loc(#loc33)
%28 = arith.divf %24, %27 : tensor<1x32xf64> loc(#loc34)
// Lanes outside the bounds mask %2 are replaced with -infinity before reducing.
%29 = arith.select %2, %28, %cst_0 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc2)
// ---- Reduction over axis 1 of (value, lane index) pairs ----
// Region arguments: (%arg4, %arg5) is one (value, index) pair and
// (%arg6, %arg7) is the other.
%30:2 = "tt.reduce"(%29, %1) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
// %35: a > b (ordered); %36: a == b (ordered).
%35 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68)
%36 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69)
// "une x, x" is true only when x is NaN: %37 = isnan(a), %38 = isnan(b).
%37 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70)
%38 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71)
// %39 = !isnan(b), computed as xor with true.
%39 = arith.xori %38, %true : i1 loc(#loc67)
// %41: a strictly wins = (a > b) OR (a is NaN and b is not).
%40 = arith.andi %37, %39 : i1 loc(#loc72)
%41 = arith.ori %35, %40 : i1 loc(#loc73)
// %43: tie = (a == b) OR (both are NaN).
%42 = arith.andi %37, %38 : i1 loc(#loc74)
%43 = arith.ori %36, %42 : i1 loc(#loc75)
// On a tie, the pair with the smaller (signed) index wins.
%44 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76)
%45 = arith.andi %43, %44 : i1 loc(#loc77)
// %46: choose the first pair; otherwise the second pair is kept.
%46 = arith.ori %41, %45 : i1 loc(#loc78)
%47 = arith.select %46, %arg4, %arg6 : f64 loc(#loc79)
%48 = arith.select %46, %arg5, %arg7 : i32 loc(#loc80)
tt.reduce.return %47, %48 : f64, i32 loc(#loc53)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc53)
// ---- Store the winning index ----
// Only result #1 (the index) of the reduction is used; result #0 (the value) is unused.
%31 = tt.expand_dims %30#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc49)
// Destination pointer is %arg1 plus a constant offset of 0.
%32 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3)
%33 = tt.splat %32 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc3)
// Sign-extend the i32 index to i64 to match the pointee type of %arg1.
%34 = arith.extsi %31 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc50)
tt.store %33, %34 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc50)
tt.return loc(#loc51)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc52 = loc(callsite(#loc4 at #loc5))
#loc53 = loc(callsite(#loc5 at #loc6))
#loc54 = loc(callsite(#loc36 at #loc5))
#loc55 = loc(callsite(#loc37 at #loc5))
#loc56 = loc(callsite(#loc38 at #loc5))
#loc57 = loc(callsite(#loc39 at #loc5))
#loc58 = loc(callsite(#loc40 at #loc5))
#loc59 = loc(callsite(#loc41 at #loc5))
#loc60 = loc(callsite(#loc42 at #loc5))
#loc61 = loc(callsite(#loc43 at #loc5))
#loc62 = loc(callsite(#loc44 at #loc5))
#loc63 = loc(callsite(#loc45 at #loc5))
#loc64 = loc(callsite(#loc46 at #loc5))
#loc65 = loc(callsite(#loc47 at #loc5))
#loc66 = loc(callsite(#loc48 at #loc5))
#loc67 = loc(callsite(#loc52 at #loc6))
#loc68 = loc(callsite(#loc54 at #loc6))
#loc69 = loc(callsite(#loc55 at #loc6))
#loc70 = loc(callsite(#loc56 at #loc6))
#loc71 = loc(callsite(#loc57 at #loc6))
#loc72 = loc(callsite(#loc58 at #loc6))
#loc73 = loc(callsite(#loc59 at #loc6))
#loc74 = loc(callsite(#loc60 at #loc6))
#loc75 = loc(callsite(#loc61 at #loc6))
#loc76 = loc(callsite(#loc62 at #loc6))
#loc77 = loc(callsite(#loc63 at #loc6))
#loc78 = loc(callsite(#loc64 at #loc6))
#loc79 = loc(callsite(#loc65 at #loc6))
#loc80 = loc(callsite(#loc66 at #loc6))
// -----// IR Dump Before SymbolDCE (symbol-dce) ('builtin.module' operation) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc35 = loc(unknown)
// Module snapshot printed ahead of the symbol-dce pass (see the dump banner above).
// It holds one public kernel, @triton__0d1d23de, which:
//   1. builds a 1x32 row of indices 0..31,
//   2. performs two masked f64 loads per index from %arg0 and averages the
//      values whose window predicate is true,
//   3. reduces (value, index) pairs along axis 1, keeping the largest value
//      (a NaN beats a non-NaN; ties go to the smaller index),
//   4. stores the winning index, sign-extended to i64, through %arg1.
// NOTE(review): the index arithmetic resembles a pooling window followed by an
// argmax -- confirm against the generating Python source.
module {
// %arg0: f64 input pointer, %arg1: i64 output pointer. %arg2 and %arg3 are
// declared but never referenced in the body below.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
// Splat constants. 0xFFF0000000000000 is the IEEE-754 bit pattern of f64 -infinity.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc1)
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64> loc(#loc2)
%c0_i32 = arith.constant 0 : i32 loc(#loc3)
%true = arith.constant true loc(#loc67)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc7)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc8)
%cst_3 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc9)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc10)
%cst_5 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
// %1 = indices 0..31 shaped 1x32; %2 = (index < 32) bounds mask.
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc12)
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc13)
%2 = arith.cmpi slt, %1, %cst_5 : tensor<1x32xi32> loc(#loc11)
// Split each index: %3 = index % 2, %4 = index / 2.
%3 = arith.remsi %1, %cst_4 : tensor<1x32xi32> loc(#loc10)
%4 = arith.divsi %1, %cst_4 : tensor<1x32xi32> loc(#loc14)
// Window bounds: %6 = (%3 * 3) / 2 is the start, %7 = %6 + 2 is the exclusive end.
%5 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc9)
%6 = arith.divsi %5, %cst_4 : tensor<1x32xi32> loc(#loc15)
%7 = arith.addi %6, %cst_4 : tensor<1x32xi32> loc(#loc16)
// First element: predicate %8 = (start < end), offset %10 = %4 * 3 + start.
%8 = arith.cmpi slt, %6, %7 : tensor<1x32xi32> loc(#loc17)
%9 = arith.muli %4, %cst_3 : tensor<1x32xi32> loc(#loc18)
%10 = arith.addi %9, %6 : tensor<1x32xi32> loc(#loc19)
%11 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc20)
%12 = tt.addptr %11, %10 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc20)
// Load is masked by bounds AND window predicate; masked-off lanes read 0.0.
%13 = arith.andi %2, %8 : tensor<1x32xi1> loc(#loc21)
%14 = tt.load %12, %13, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc22)
%15 = arith.select %8, %14, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc23)
// Second element: predicate %17 = (start + 1 < end), offset %19 = %4 * 3 + 1 + start.
%16 = arith.addi %6, %cst_2 : tensor<1x32xi32> loc(#loc8)
%17 = arith.cmpi slt, %16, %7 : tensor<1x32xi32> loc(#loc24)
%18 = arith.addi %9, %cst_2 : tensor<1x32xi32> loc(#loc25)
%19 = arith.addi %18, %6 : tensor<1x32xi32> loc(#loc26)
%20 = tt.addptr %11, %19 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc27)
%21 = arith.andi %2, %17 : tensor<1x32xi1> loc(#loc28)
%22 = tt.load %20, %21, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc29)
%23 = arith.select %17, %22, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc30)
// %24 = sum of the selected values; %27 = how many predicates were true
// (1.0 per true predicate); %28 = sum / count, i.e. the mean.
%24 = arith.addf %23, %15 : tensor<1x32xf64> loc(#loc31)
%25 = arith.select %8, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc7)
%26 = arith.select %17, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
%27 = arith.addf %26, %25 : tensor<1x32xf64> loc(#loc33)
%28 = arith.divf %24, %27 : tensor<1x32xf64> loc(#loc34)
// Lanes outside the bounds mask are replaced with -infinity before the reduction.
%29 = arith.select %2, %28, %cst_0 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc2)
// Two-operand reduction along axis 1 over (value, index) pairs.
// Combiner arguments: (%arg4, %arg5) is one (value, index) pair and
// (%arg6, %arg7) is the other.
%30:2 = "tt.reduce"(%29, %1) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
// %35: first value strictly greater; %36: values equal (both ordered compares).
%35 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68)
%36 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69)
// "x une x" is true only when x is NaN: %37 = isnan(first), %38 = isnan(second).
%37 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70)
%38 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71)
// %39 = NOT isnan(second) (xor with true).
%39 = arith.xori %38, %true : i1 loc(#loc67)
// %41: first wins outright -- it is greater, or it is NaN while the second is not.
%40 = arith.andi %37, %39 : i1 loc(#loc72)
%41 = arith.ori %35, %40 : i1 loc(#loc73)
// %43: the pair ties -- values equal, or both NaN.
%42 = arith.andi %37, %38 : i1 loc(#loc74)
%43 = arith.ori %36, %42 : i1 loc(#loc75)
// On a tie the pair with the smaller index wins.
%44 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76)
%45 = arith.andi %43, %44 : i1 loc(#loc77)
%46 = arith.ori %41, %45 : i1 loc(#loc78)
// Value and index are selected with the same predicate so they stay paired.
%47 = arith.select %46, %arg4, %arg6 : f64 loc(#loc79)
%48 = arith.select %46, %arg5, %arg7 : i32 loc(#loc80)
tt.reduce.return %47, %48 : f64, i32 loc(#loc53)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc53)
// Only the index result (%30#1) is used; the reduced value (%30#0) has no uses.
%31 = tt.expand_dims %30#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc49)
// Output address is %arg1 + 0; the i32 index is sign-extended to i64 and stored.
%32 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3)
%33 = tt.splat %32 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc3)
%34 = arith.extsi %31 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc50)
tt.store %33, %34 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc50)
tt.return loc(#loc51)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc52 = loc(callsite(#loc4 at #loc5))
#loc53 = loc(callsite(#loc5 at #loc6))
#loc54 = loc(callsite(#loc36 at #loc5))
#loc55 = loc(callsite(#loc37 at #loc5))
#loc56 = loc(callsite(#loc38 at #loc5))
#loc57 = loc(callsite(#loc39 at #loc5))
#loc58 = loc(callsite(#loc40 at #loc5))
#loc59 = loc(callsite(#loc41 at #loc5))
#loc60 = loc(callsite(#loc42 at #loc5))
#loc61 = loc(callsite(#loc43 at #loc5))
#loc62 = loc(callsite(#loc44 at #loc5))
#loc63 = loc(callsite(#loc45 at #loc5))
#loc64 = loc(callsite(#loc46 at #loc5))
#loc65 = loc(callsite(#loc47 at #loc5))
#loc66 = loc(callsite(#loc48 at #loc5))
#loc67 = loc(callsite(#loc52 at #loc6))
#loc68 = loc(callsite(#loc54 at #loc6))
#loc69 = loc(callsite(#loc55 at #loc6))
#loc70 = loc(callsite(#loc56 at #loc6))
#loc71 = loc(callsite(#loc57 at #loc6))
#loc72 = loc(callsite(#loc58 at #loc6))
#loc73 = loc(callsite(#loc59 at #loc6))
#loc74 = loc(callsite(#loc60 at #loc6))
#loc75 = loc(callsite(#loc61 at #loc6))
#loc76 = loc(callsite(#loc62 at #loc6))
#loc77 = loc(callsite(#loc63 at #loc6))
#loc78 = loc(callsite(#loc64 at #loc6))
#loc79 = loc(callsite(#loc65 at #loc6))
#loc80 = loc(callsite(#loc66 at #loc6))
// -----// IR Dump Before ConvertTritonToTritonGPU (convert-triton-to-tritongpu) ('builtin.module' operation) //----- //
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc35 = loc(unknown)
// Module snapshot printed ahead of the convert-triton-to-tritongpu pass (see
// the dump banner above). Tensor types here carry no layout encoding yet.
// The single public kernel, @triton__0d1d23de:
//   - averages up to two f64 values per lane of a 1x32 index row, read from %arg0,
//   - reduces (mean, index) pairs along axis 1, keeping the pair with the
//     largest value (NaN outranks non-NaN; equal values resolve to the lower index),
//   - writes the surviving index as an i64 through %arg1.
module {
// %arg0 is read as f64 data and %arg1 is written as i64; %arg2 / %arg3 have no
// uses inside the function body.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
// Constants: 0.0 (load fallback / "not selected"), -inf as raw f64 bits
// (0xFFF0000000000000), 1.0 (count increment), and the integers 1, 3, 2, 32.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64> loc(#loc1)
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64> loc(#loc2)
%c0_i32 = arith.constant 0 : i32 loc(#loc3)
%true = arith.constant true loc(#loc67)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64> loc(#loc7)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32> loc(#loc8)
%cst_3 = arith.constant dense<3> : tensor<1x32xi32> loc(#loc9)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32> loc(#loc10)
%cst_5 = arith.constant dense<32> : tensor<1x32xi32> loc(#loc11)
// Lane indices 0..31 as a 1x32 tensor, plus the (index < 32) validity mask.
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32> loc(#loc12)
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<32xi32>) -> tensor<1x32xi32> loc(#loc13)
%2 = arith.cmpi slt, %1, %cst_5 : tensor<1x32xi32> loc(#loc11)
// Decompose the index into remainder (%3) and quotient (%4) by 2.
%3 = arith.remsi %1, %cst_4 : tensor<1x32xi32> loc(#loc10)
%4 = arith.divsi %1, %cst_4 : tensor<1x32xi32> loc(#loc14)
// Window start %6 = (remainder * 3) / 2 and exclusive end %7 = start + 2.
%5 = arith.muli %3, %cst_3 : tensor<1x32xi32> loc(#loc9)
%6 = arith.divsi %5, %cst_4 : tensor<1x32xi32> loc(#loc15)
%7 = arith.addi %6, %cst_4 : tensor<1x32xi32> loc(#loc16)
// Element 0 of the window: in-window test %8 and element offset %10 = quotient * 3 + start.
%8 = arith.cmpi slt, %6, %7 : tensor<1x32xi32> loc(#loc17)
%9 = arith.muli %4, %cst_3 : tensor<1x32xi32> loc(#loc18)
%10 = arith.addi %9, %6 : tensor<1x32xi32> loc(#loc19)
%11 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>> loc(#loc20)
%12 = tt.addptr %11, %10 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc20)
// Masked load (validity AND in-window) with 0.0 as the fallback value, then
// zeroed again wherever the in-window test fails.
%13 = arith.andi %2, %8 : tensor<1x32xi1> loc(#loc21)
%14 = tt.load %12, %13, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc22)
%15 = arith.select %8, %14, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc23)
// Element 1 of the window: same pattern with position start + 1 and offset + 1.
%16 = arith.addi %6, %cst_2 : tensor<1x32xi32> loc(#loc8)
%17 = arith.cmpi slt, %16, %7 : tensor<1x32xi32> loc(#loc24)
%18 = arith.addi %9, %cst_2 : tensor<1x32xi32> loc(#loc25)
%19 = arith.addi %18, %6 : tensor<1x32xi32> loc(#loc26)
%20 = tt.addptr %11, %19 : tensor<1x32x!tt.ptr<f64, 1>>, tensor<1x32xi32> loc(#loc27)
%21 = arith.andi %2, %17 : tensor<1x32xi1> loc(#loc28)
%22 = tt.load %20, %21, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64> loc(#loc29)
%23 = arith.select %17, %22, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc30)
// Mean = (element0 + element1) / (number of in-window elements).
%24 = arith.addf %23, %15 : tensor<1x32xf64> loc(#loc31)
%25 = arith.select %8, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc7)
%26 = arith.select %17, %cst_1, %cst : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc32)
%27 = arith.addf %26, %25 : tensor<1x32xf64> loc(#loc33)
%28 = arith.divf %24, %27 : tensor<1x32xf64> loc(#loc34)
// Invalid lanes contribute -inf to the reduction input.
%29 = arith.select %2, %28, %cst_0 : tensor<1x32xi1>, tensor<1x32xf64> loc(#loc2)
// Reduce the (mean, lane index) pairs over axis 1. The region below is the
// pairwise combiner: (%arg4, %arg5) versus (%arg6, %arg7).
%30:2 = "tt.reduce"(%29, %1) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
// Ordered comparisons: both are false whenever either operand is NaN.
%35 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68)
%36 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69)
// Self-comparison with "une" detects NaN in each operand.
%37 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70)
%38 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71)
// Logical negation of %38.
%39 = arith.xori %38, %true : i1 loc(#loc67)
// Take the first pair if it is greater, or if only the first value is NaN.
%40 = arith.andi %37, %39 : i1 loc(#loc72)
%41 = arith.ori %35, %40 : i1 loc(#loc73)
// Tie condition: equal values, or NaN on both sides.
%42 = arith.andi %37, %38 : i1 loc(#loc74)
%43 = arith.ori %36, %42 : i1 loc(#loc75)
// Ties are broken in favour of the lower index.
%44 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76)
%45 = arith.andi %43, %44 : i1 loc(#loc77)
%46 = arith.ori %41, %45 : i1 loc(#loc78)
// %46 drives both selects, keeping value and index from the same pair.
%47 = arith.select %46, %arg4, %arg6 : f64 loc(#loc79)
%48 = arith.select %46, %arg5, %arg7 : i32 loc(#loc80)
tt.reduce.return %47, %48 : f64, i32 loc(#loc53)
}) : (tensor<1x32xf64>, tensor<1x32xi32>) -> (tensor<1xf64>, tensor<1xi32>) loc(#loc53)
// The reduced value %30#0 is unused; only the index %30#1 flows onward.
%31 = tt.expand_dims %30#1 {axis = 1 : i32} : (tensor<1xi32>) -> tensor<1x1xi32> loc(#loc49)
// Store target is %arg1 advanced by 0 elements; index is widened i32 -> i64 (signed).
%32 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3)
%33 = tt.splat %32 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>> loc(#loc3)
%34 = arith.extsi %31 : tensor<1x1xi32> to tensor<1x1xi64> loc(#loc50)
tt.store %33, %34 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64> loc(#loc50)
tt.return loc(#loc51)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc52 = loc(callsite(#loc4 at #loc5))
#loc53 = loc(callsite(#loc5 at #loc6))
#loc54 = loc(callsite(#loc36 at #loc5))
#loc55 = loc(callsite(#loc37 at #loc5))
#loc56 = loc(callsite(#loc38 at #loc5))
#loc57 = loc(callsite(#loc39 at #loc5))
#loc58 = loc(callsite(#loc40 at #loc5))
#loc59 = loc(callsite(#loc41 at #loc5))
#loc60 = loc(callsite(#loc42 at #loc5))
#loc61 = loc(callsite(#loc43 at #loc5))
#loc62 = loc(callsite(#loc44 at #loc5))
#loc63 = loc(callsite(#loc45 at #loc5))
#loc64 = loc(callsite(#loc46 at #loc5))
#loc65 = loc(callsite(#loc47 at #loc5))
#loc66 = loc(callsite(#loc48 at #loc5))
#loc67 = loc(callsite(#loc52 at #loc6))
#loc68 = loc(callsite(#loc54 at #loc6))
#loc69 = loc(callsite(#loc55 at #loc6))
#loc70 = loc(callsite(#loc56 at #loc6))
#loc71 = loc(callsite(#loc57 at #loc6))
#loc72 = loc(callsite(#loc58 at #loc6))
#loc73 = loc(callsite(#loc59 at #loc6))
#loc74 = loc(callsite(#loc60 at #loc6))
#loc75 = loc(callsite(#loc61 at #loc6))
#loc76 = loc(callsite(#loc62 at #loc6))
#loc77 = loc(callsite(#loc63 at #loc6))
#loc78 = loc(callsite(#loc64 at #loc6))
#loc79 = loc(callsite(#loc65 at #loc6))
#loc80 = loc(callsite(#loc66 at #loc6))
// -----// IR Dump Before TritonGPUCoalesce (tritongpu-coalesce) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc35 = loc(unknown)
// Module snapshot printed ahead of the tritongpu-coalesce pass (see the dump
// banner above). Every tensor type now carries a layout encoding (#blocked,
// #blocked1..#blocked4, or a #triton_gpu.slice of one of them), and
// triton_gpu.convert_layout ops appear where producer and consumer layouts differ.
// Module attributes record compute capability 80, 1 CTA, 2 warps and
// 32 threads per warp.
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
// Kernel: per-lane mean of up to two f64 loads from %arg0, then a
// (value, index) max-style reduction whose winning index is stored as i64
// through %arg1. %arg2 and %arg3 are unused in the body.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
// Splat constants, all in the #blocked layout. 0xFFF0000000000000 is f64 -infinity.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc1)
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64, #blocked> loc(#loc2)
%c0_i32 = arith.constant 0 : i32 loc(#loc3)
%true = arith.constant true loc(#loc67)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc7)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc8)
%cst_3 = arith.constant dense<3> : tensor<1x32xi32, #blocked> loc(#loc9)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32, #blocked> loc(#loc10)
%cst_5 = arith.constant dense<32> : tensor<1x32xi32, #blocked> loc(#loc11)
// Indices 0..31 are created in #blocked1, converted to a slice of #blocked2 so
// expand_dims can produce a #blocked2 tensor, then converted to #blocked,
// the layout used by the arithmetic below.
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #blocked1> loc(#loc12)
%1 = triton_gpu.convert_layout %0 : (tensor<32xi32, #blocked1>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc13)
%2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x32xi32, #blocked2> loc(#loc13)
%3 = triton_gpu.convert_layout %2 : (tensor<1x32xi32, #blocked2>) -> tensor<1x32xi32, #blocked> loc(#loc11)
// %4 = (index < 32) bounds mask; %5 = index % 2; %6 = index / 2.
%4 = arith.cmpi slt, %3, %cst_5 : tensor<1x32xi32, #blocked> loc(#loc11)
%5 = arith.remsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc10)
%6 = arith.divsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc14)
// Window start %8 = (%5 * 3) / 2, exclusive end %9 = %8 + 2.
%7 = arith.muli %5, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc9)
%8 = arith.divsi %7, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc15)
%9 = arith.addi %8, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc16)
// First element: predicate %10 = (start < end), offset %12 = %6 * 3 + start.
%10 = arith.cmpi slt, %8, %9 : tensor<1x32xi32, #blocked> loc(#loc17)
%11 = arith.muli %6, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc18)
%12 = arith.addi %11, %8 : tensor<1x32xi32, #blocked> loc(#loc19)
%13 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> loc(#loc20)
%14 = tt.addptr %13, %12 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc20)
// Masked load (bounds AND predicate), 0.0 for masked-off lanes.
%15 = arith.andi %4, %10 : tensor<1x32xi1, #blocked> loc(#loc21)
%16 = tt.load %14, %15, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked> loc(#loc22)
%17 = arith.select %10, %16, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc23)
// Second element: predicate %19 = (start + 1 < end), offset %21 = %6 * 3 + 1 + start.
%18 = arith.addi %8, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc8)
%19 = arith.cmpi slt, %18, %9 : tensor<1x32xi32, #blocked> loc(#loc24)
%20 = arith.addi %11, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc25)
%21 = arith.addi %20, %8 : tensor<1x32xi32, #blocked> loc(#loc26)
%22 = tt.addptr %13, %21 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc27)
%23 = arith.andi %4, %19 : tensor<1x32xi1, #blocked> loc(#loc28)
%24 = tt.load %22, %23, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked> loc(#loc29)
%25 = arith.select %19, %24, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc30)
// %26 = sum of selected values, %29 = count of true predicates, %30 = mean.
%26 = arith.addf %25, %17 : tensor<1x32xf64, #blocked> loc(#loc31)
%27 = arith.select %10, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc7)
%28 = arith.select %19, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc32)
%29 = arith.addf %28, %27 : tensor<1x32xf64, #blocked> loc(#loc33)
%30 = arith.divf %26, %29 : tensor<1x32xf64, #blocked> loc(#loc34)
// Out-of-bounds lanes become -infinity before reducing.
%31 = arith.select %4, %30, %cst_0 : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc2)
// Reduction over axis 1 of (value, index) pairs; results land in a slice
// (dim = 1) of #blocked. Combiner: (%arg4, %arg5) versus (%arg6, %arg7).
%32:2 = "tt.reduce"(%31, %3) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
// %40: first value strictly greater; %41: values equal (ordered compares).
%40 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68)
%41 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69)
// "x une x" holds only for NaN: %42 = isnan(first), %43 = isnan(second).
%42 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70)
%43 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71)
// %44 = NOT isnan(second).
%44 = arith.xori %43, %true : i1 loc(#loc67)
// %46: first wins outright (greater, or NaN against a non-NaN).
%45 = arith.andi %42, %44 : i1 loc(#loc72)
%46 = arith.ori %40, %45 : i1 loc(#loc73)
// %48: tie (equal values, or both NaN).
%47 = arith.andi %42, %43 : i1 loc(#loc74)
%48 = arith.ori %41, %47 : i1 loc(#loc75)
// Ties resolve to the smaller index.
%49 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76)
%50 = arith.andi %48, %49 : i1 loc(#loc77)
%51 = arith.ori %46, %50 : i1 loc(#loc78)
// One predicate selects both value and index so the pair stays consistent.
%52 = arith.select %51, %arg4, %arg6 : f64 loc(#loc79)
%53 = arith.select %51, %arg5, %arg7 : i32 loc(#loc80)
tt.reduce.return %52, %53 : f64, i32 loc(#loc53)
}) : (tensor<1x32xf64, #blocked>, tensor<1x32xi32, #blocked>) -> (tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) loc(#loc53)
// Only the index result %32#1 is used. It is moved through a chain of layouts:
// slice-of-#blocked -> #blocked1 -> slice-of-#blocked3, expanded to a 1x1
// #blocked3 tensor, then converted to #blocked4 for the store.
%33 = triton_gpu.convert_layout %32#1 : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi32, #blocked1> loc(#loc49)
%34 = triton_gpu.convert_layout %33 : (tensor<1xi32, #blocked1>) -> tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked3}>> loc(#loc49)
%35 = tt.expand_dims %34 {axis = 1 : i32} : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked3}>>) -> tensor<1x1xi32, #blocked3> loc(#loc49)
%36 = triton_gpu.convert_layout %35 : (tensor<1x1xi32, #blocked3>) -> tensor<1x1xi32, #blocked4> loc(#loc50)
// Output address is %arg1 + 0; the index is sign-extended to i64 and stored.
%37 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3)
%38 = tt.splat %37 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked4> loc(#loc3)
%39 = arith.extsi %36 : tensor<1x1xi32, #blocked4> to tensor<1x1xi64, #blocked4> loc(#loc50)
tt.store %38, %39 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked4> loc(#loc50)
tt.return loc(#loc51)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc52 = loc(callsite(#loc4 at #loc5))
#loc53 = loc(callsite(#loc5 at #loc6))
#loc54 = loc(callsite(#loc36 at #loc5))
#loc55 = loc(callsite(#loc37 at #loc5))
#loc56 = loc(callsite(#loc38 at #loc5))
#loc57 = loc(callsite(#loc39 at #loc5))
#loc58 = loc(callsite(#loc40 at #loc5))
#loc59 = loc(callsite(#loc41 at #loc5))
#loc60 = loc(callsite(#loc42 at #loc5))
#loc61 = loc(callsite(#loc43 at #loc5))
#loc62 = loc(callsite(#loc44 at #loc5))
#loc63 = loc(callsite(#loc45 at #loc5))
#loc64 = loc(callsite(#loc46 at #loc5))
#loc65 = loc(callsite(#loc47 at #loc5))
#loc66 = loc(callsite(#loc48 at #loc5))
#loc67 = loc(callsite(#loc52 at #loc6))
#loc68 = loc(callsite(#loc54 at #loc6))
#loc69 = loc(callsite(#loc55 at #loc6))
#loc70 = loc(callsite(#loc56 at #loc6))
#loc71 = loc(callsite(#loc57 at #loc6))
#loc72 = loc(callsite(#loc58 at #loc6))
#loc73 = loc(callsite(#loc59 at #loc6))
#loc74 = loc(callsite(#loc60 at #loc6))
#loc75 = loc(callsite(#loc61 at #loc6))
#loc76 = loc(callsite(#loc62 at #loc6))
#loc77 = loc(callsite(#loc63 at #loc6))
#loc78 = loc(callsite(#loc64 at #loc6))
#loc79 = loc(callsite(#loc65 at #loc6))
#loc80 = loc(callsite(#loc66 at #loc6))
// -----// IR Dump Before TritonGPUPlanCTAPass (triton-nvidia-gpu-plan-cta) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc35 = loc(unknown)
// Module as dumped before the TritonGPUPlanCTAPass: compute capability 80, 1 CTA,
// 2 warps per CTA, 32 threads per warp (all taken from the attributes on the next line).
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
// Kernel entry point. %arg0 is the f64 pointer that is read, %arg1 the i64 pointer that is written.
// %arg2 and %arg3 are not referenced anywhere in this body.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
// Constants. 0xFFF0000000000000 is the IEEE-754 binary64 bit pattern of -inf.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc1)
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64, #blocked> loc(#loc2)
%c0_i32 = arith.constant 0 : i32 loc(#loc3)
%true = arith.constant true loc(#loc67)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc7)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc8)
%cst_3 = arith.constant dense<3> : tensor<1x32xi32, #blocked> loc(#loc9)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32, #blocked> loc(#loc10)
%cst_5 = arith.constant dense<32> : tensor<1x32xi32, #blocked> loc(#loc11)
// %3 = the indices 0..31 as a 1x32 tensor in the #blocked layout.
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #blocked1> loc(#loc12)
%1 = triton_gpu.convert_layout %0 : (tensor<32xi32, #blocked1>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc13)
%2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x32xi32, #blocked2> loc(#loc13)
%3 = triton_gpu.convert_layout %2 : (tensor<1x32xi32, #blocked2>) -> tensor<1x32xi32, #blocked> loc(#loc11)
// %4 = range mask (index < 32); %5 = index mod 2; %6 = index div 2.
%4 = arith.cmpi slt, %3, %cst_5 : tensor<1x32xi32, #blocked> loc(#loc11)
%5 = arith.remsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc10)
%6 = arith.divsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc14)
// %8 = (index mod 2) * 3 / 2; %9 = %8 + 2; %10 = (%8 < %9).
%7 = arith.muli %5, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc9)
%8 = arith.divsi %7, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc15)
%9 = arith.addi %8, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc16)
%10 = arith.cmpi slt, %8, %9 : tensor<1x32xi32, #blocked> loc(#loc17)
// First load: address %arg0 + (index div 2) * 3 + %8, masked by %4 & %10, with the
// 0.0 splat passed as the third operand. Operands are converted to #blocked3 for
// the load and the result is converted back to #blocked.
%11 = arith.muli %6, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc18)
%12 = arith.addi %11, %8 : tensor<1x32xi32, #blocked> loc(#loc19)
%13 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> loc(#loc20)
%14 = tt.addptr %13, %12 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc20)
%15 = arith.andi %4, %10 : tensor<1x32xi1, #blocked> loc(#loc21)
%16 = triton_gpu.convert_layout %14 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc22)
%17 = triton_gpu.convert_layout %15 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc22)
%18 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc22)
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc22)
%20 = triton_gpu.convert_layout %19 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc22)
%21 = arith.select %10, %20, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc23)
// Second load: address %arg0 + (index div 2) * 3 + 1 + %8, masked by %4 & %23
// where %23 = (%8 + 1 < %9). Same layout round trip as the first load.
%22 = arith.addi %8, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc8)
%23 = arith.cmpi slt, %22, %9 : tensor<1x32xi32, #blocked> loc(#loc24)
%24 = arith.addi %11, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc25)
%25 = arith.addi %24, %8 : tensor<1x32xi32, #blocked> loc(#loc26)
%26 = tt.addptr %13, %25 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc27)
%27 = arith.andi %4, %23 : tensor<1x32xi1, #blocked> loc(#loc28)
%28 = triton_gpu.convert_layout %26 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc29)
%29 = triton_gpu.convert_layout %27 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc29)
%30 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc29)
%31 = tt.load %28, %29, %30 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc29)
%32 = triton_gpu.convert_layout %31 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc29)
%33 = arith.select %23, %32, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc30)
// %34 = sum of the two selected loads; %37 = 1.0 per enabled load (%10, %23) summed;
// %38 = %34 / %37.
%34 = arith.addf %33, %21 : tensor<1x32xf64, #blocked> loc(#loc31)
%35 = arith.select %10, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc7)
%36 = arith.select %23, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc32)
%37 = arith.addf %36, %35 : tensor<1x32xf64, #blocked> loc(#loc33)
%38 = arith.divf %34, %37 : tensor<1x32xf64, #blocked> loc(#loc34)
// Positions where the range mask %4 is false are replaced by -inf before reducing.
%39 = arith.select %4, %38, %cst_0 : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc2)
// Reduce (value, index) pairs along axis 1. In the combiner, (%arg4, %arg5) is one
// pair and (%arg6, %arg7) the other; the pair selected by %61 is returned.
%40:2 = "tt.reduce"(%39, %3) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
// %50: first value strictly greater; %51: values equal (both ordered comparisons).
%50 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68)
%51 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69)
// x != x (une) is true only for NaN: %52 / %53 flag NaN in the first / second value.
%52 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70)
%53 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71)
// %56: first value wins outright (greater, or it is NaN while the second is not).
%54 = arith.xori %53, %true : i1 loc(#loc67)
%55 = arith.andi %52, %54 : i1 loc(#loc72)
%56 = arith.ori %50, %55 : i1 loc(#loc73)
// %58: values tie (equal, or both NaN); a tie goes to the smaller index (%59).
%57 = arith.andi %52, %53 : i1 loc(#loc74)
%58 = arith.ori %51, %57 : i1 loc(#loc75)
%59 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76)
%60 = arith.andi %58, %59 : i1 loc(#loc77)
%61 = arith.ori %56, %60 : i1 loc(#loc78)
%62 = arith.select %61, %arg4, %arg6 : f64 loc(#loc79)
%63 = arith.select %61, %arg5, %arg7 : i32 loc(#loc80)
tt.reduce.return %62, %63 : f64, i32 loc(#loc53)
}) : (tensor<1x32xf64, #blocked>, tensor<1x32xi32, #blocked>) -> (tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) loc(#loc53)
// Only the index result %40#1 is used below: it is expanded to 1x1, sign-extended to
// i64 and stored through %arg1 + 0. The reduced value %40#0 has no further uses.
%41 = triton_gpu.convert_layout %40#1 : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi32, #blocked1> loc(#loc49)
%42 = triton_gpu.convert_layout %41 : (tensor<1xi32, #blocked1>) -> tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc49)
%43 = tt.expand_dims %42 {axis = 1 : i32} : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>>) -> tensor<1x1xi32, #blocked4> loc(#loc49)
%44 = triton_gpu.convert_layout %43 : (tensor<1x1xi32, #blocked4>) -> tensor<1x1xi32, #blocked5> loc(#loc50)
%45 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3)
%46 = tt.splat %45 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked5> loc(#loc3)
%47 = arith.extsi %44 : tensor<1x1xi32, #blocked5> to tensor<1x1xi64, #blocked5> loc(#loc50)
%48 = triton_gpu.convert_layout %46 : (tensor<1x1x!tt.ptr<i64, 1>, #blocked5>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked3> loc(#loc50)
%49 = triton_gpu.convert_layout %47 : (tensor<1x1xi64, #blocked5>) -> tensor<1x1xi64, #blocked3> loc(#loc50)
tt.store %48, %49 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked3> loc(#loc50)
tt.return loc(#loc51)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc52 = loc(callsite(#loc4 at #loc5))
#loc53 = loc(callsite(#loc5 at #loc6))
#loc54 = loc(callsite(#loc36 at #loc5))
#loc55 = loc(callsite(#loc37 at #loc5))
#loc56 = loc(callsite(#loc38 at #loc5))
#loc57 = loc(callsite(#loc39 at #loc5))
#loc58 = loc(callsite(#loc40 at #loc5))
#loc59 = loc(callsite(#loc41 at #loc5))
#loc60 = loc(callsite(#loc42 at #loc5))
#loc61 = loc(callsite(#loc43 at #loc5))
#loc62 = loc(callsite(#loc44 at #loc5))
#loc63 = loc(callsite(#loc45 at #loc5))
#loc64 = loc(callsite(#loc46 at #loc5))
#loc65 = loc(callsite(#loc47 at #loc5))
#loc66 = loc(callsite(#loc48 at #loc5))
#loc67 = loc(callsite(#loc52 at #loc6))
#loc68 = loc(callsite(#loc54 at #loc6))
#loc69 = loc(callsite(#loc55 at #loc6))
#loc70 = loc(callsite(#loc56 at #loc6))
#loc71 = loc(callsite(#loc57 at #loc6))
#loc72 = loc(callsite(#loc58 at #loc6))
#loc73 = loc(callsite(#loc59 at #loc6))
#loc74 = loc(callsite(#loc60 at #loc6))
#loc75 = loc(callsite(#loc61 at #loc6))
#loc76 = loc(callsite(#loc62 at #loc6))
#loc77 = loc(callsite(#loc63 at #loc6))
#loc78 = loc(callsite(#loc64 at #loc6))
#loc79 = loc(callsite(#loc65 at #loc6))
#loc80 = loc(callsite(#loc66 at #loc6))
// -----// IR Dump Before TritonGPURewriteTensorPointer (tritongpu-rewrite-tensor-pointer) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc35 = loc(unknown)
// Module as dumped before TritonGPURewriteTensorPointer: compute capability 80, 1 CTA,
// 2 warps per CTA, 32 threads per warp (all taken from the attributes on the next line).
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
// Kernel entry point. %arg0 is the f64 pointer that is read, %arg1 the i64 pointer that is written.
// %arg2 and %arg3 are not referenced anywhere in this body.
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
// Constants. 0xFFF0000000000000 is the IEEE-754 binary64 bit pattern of -inf.
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc1)
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64, #blocked> loc(#loc2)
%c0_i32 = arith.constant 0 : i32 loc(#loc3)
%true = arith.constant true loc(#loc67)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc7)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc8)
%cst_3 = arith.constant dense<3> : tensor<1x32xi32, #blocked> loc(#loc9)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32, #blocked> loc(#loc10)
%cst_5 = arith.constant dense<32> : tensor<1x32xi32, #blocked> loc(#loc11)
// %3 = the indices 0..31 as a 1x32 tensor in the #blocked layout.
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #blocked1> loc(#loc12)
%1 = triton_gpu.convert_layout %0 : (tensor<32xi32, #blocked1>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc13)
%2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x32xi32, #blocked2> loc(#loc13)
%3 = triton_gpu.convert_layout %2 : (tensor<1x32xi32, #blocked2>) -> tensor<1x32xi32, #blocked> loc(#loc11)
// %4 = range mask (index < 32); %5 = index mod 2; %6 = index div 2.
%4 = arith.cmpi slt, %3, %cst_5 : tensor<1x32xi32, #blocked> loc(#loc11)
%5 = arith.remsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc10)
%6 = arith.divsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc14)
// %8 = (index mod 2) * 3 / 2; %9 = %8 + 2; %10 = (%8 < %9).
%7 = arith.muli %5, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc9)
%8 = arith.divsi %7, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc15)
%9 = arith.addi %8, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc16)
%10 = arith.cmpi slt, %8, %9 : tensor<1x32xi32, #blocked> loc(#loc17)
// First load: address %arg0 + (index div 2) * 3 + %8, masked by %4 & %10, with the
// 0.0 splat passed as the third operand. Operands are converted to #blocked3 for
// the load and the result is converted back to #blocked.
%11 = arith.muli %6, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc18)
%12 = arith.addi %11, %8 : tensor<1x32xi32, #blocked> loc(#loc19)
%13 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> loc(#loc20)
%14 = tt.addptr %13, %12 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc20)
%15 = arith.andi %4, %10 : tensor<1x32xi1, #blocked> loc(#loc21)
%16 = triton_gpu.convert_layout %14 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc22)
%17 = triton_gpu.convert_layout %15 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc22)
%18 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc22)
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc22)
%20 = triton_gpu.convert_layout %19 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc22)
%21 = arith.select %10, %20, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc23)
// Second load: address %arg0 + (index div 2) * 3 + 1 + %8, masked by %4 & %23
// where %23 = (%8 + 1 < %9). Same layout round trip as the first load.
%22 = arith.addi %8, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc8)
%23 = arith.cmpi slt, %22, %9 : tensor<1x32xi32, #blocked> loc(#loc24)
%24 = arith.addi %11, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc25)
%25 = arith.addi %24, %8 : tensor<1x32xi32, #blocked> loc(#loc26)
%26 = tt.addptr %13, %25 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc27)
%27 = arith.andi %4, %23 : tensor<1x32xi1, #blocked> loc(#loc28)
%28 = triton_gpu.convert_layout %26 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc29)
%29 = triton_gpu.convert_layout %27 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc29)
%30 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc29)
%31 = tt.load %28, %29, %30 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc29)
%32 = triton_gpu.convert_layout %31 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc29)
%33 = arith.select %23, %32, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc30)
// %34 = sum of the two selected loads; %37 = 1.0 per enabled load (%10, %23) summed;
// %38 = %34 / %37.
%34 = arith.addf %33, %21 : tensor<1x32xf64, #blocked> loc(#loc31)
%35 = arith.select %10, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc7)
%36 = arith.select %23, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc32)
%37 = arith.addf %36, %35 : tensor<1x32xf64, #blocked> loc(#loc33)
%38 = arith.divf %34, %37 : tensor<1x32xf64, #blocked> loc(#loc34)
// Positions where the range mask %4 is false are replaced by -inf before reducing.
%39 = arith.select %4, %38, %cst_0 : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc2)
// Reduce (value, index) pairs along axis 1. In the combiner, (%arg4, %arg5) is one
// pair and (%arg6, %arg7) the other; the pair selected by %61 is returned.
%40:2 = "tt.reduce"(%39, %3) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
// %50: first value strictly greater; %51: values equal (both ordered comparisons).
%50 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68)
%51 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69)
// x != x (une) is true only for NaN: %52 / %53 flag NaN in the first / second value.
%52 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70)
%53 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71)
// %56: first value wins outright (greater, or it is NaN while the second is not).
%54 = arith.xori %53, %true : i1 loc(#loc67)
%55 = arith.andi %52, %54 : i1 loc(#loc72)
%56 = arith.ori %50, %55 : i1 loc(#loc73)
// %58: values tie (equal, or both NaN); a tie goes to the smaller index (%59).
%57 = arith.andi %52, %53 : i1 loc(#loc74)
%58 = arith.ori %51, %57 : i1 loc(#loc75)
%59 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76)
%60 = arith.andi %58, %59 : i1 loc(#loc77)
%61 = arith.ori %56, %60 : i1 loc(#loc78)
%62 = arith.select %61, %arg4, %arg6 : f64 loc(#loc79)
%63 = arith.select %61, %arg5, %arg7 : i32 loc(#loc80)
tt.reduce.return %62, %63 : f64, i32 loc(#loc53)
}) : (tensor<1x32xf64, #blocked>, tensor<1x32xi32, #blocked>) -> (tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) loc(#loc53)
// Only the index result %40#1 is used below: it is expanded to 1x1, sign-extended to
// i64 and stored through %arg1 + 0. The reduced value %40#0 has no further uses.
%41 = triton_gpu.convert_layout %40#1 : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi32, #blocked1> loc(#loc49)
%42 = triton_gpu.convert_layout %41 : (tensor<1xi32, #blocked1>) -> tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc49)
%43 = tt.expand_dims %42 {axis = 1 : i32} : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>>) -> tensor<1x1xi32, #blocked4> loc(#loc49)
%44 = triton_gpu.convert_layout %43 : (tensor<1x1xi32, #blocked4>) -> tensor<1x1xi32, #blocked5> loc(#loc50)
%45 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3)
%46 = tt.splat %45 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked5> loc(#loc3)
%47 = arith.extsi %44 : tensor<1x1xi32, #blocked5> to tensor<1x1xi64, #blocked5> loc(#loc50)
%48 = triton_gpu.convert_layout %46 : (tensor<1x1x!tt.ptr<i64, 1>, #blocked5>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked3> loc(#loc50)
%49 = triton_gpu.convert_layout %47 : (tensor<1x1xi64, #blocked5>) -> tensor<1x1xi64, #blocked3> loc(#loc50)
tt.store %48, %49 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked3> loc(#loc50)
tt.return loc(#loc51)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc52 = loc(callsite(#loc4 at #loc5))
#loc53 = loc(callsite(#loc5 at #loc6))
#loc54 = loc(callsite(#loc36 at #loc5))
#loc55 = loc(callsite(#loc37 at #loc5))
#loc56 = loc(callsite(#loc38 at #loc5))
#loc57 = loc(callsite(#loc39 at #loc5))
#loc58 = loc(callsite(#loc40 at #loc5))
#loc59 = loc(callsite(#loc41 at #loc5))
#loc60 = loc(callsite(#loc42 at #loc5))
#loc61 = loc(callsite(#loc43 at #loc5))
#loc62 = loc(callsite(#loc44 at #loc5))
#loc63 = loc(callsite(#loc45 at #loc5))
#loc64 = loc(callsite(#loc46 at #loc5))
#loc65 = loc(callsite(#loc47 at #loc5))
#loc66 = loc(callsite(#loc48 at #loc5))
#loc67 = loc(callsite(#loc52 at #loc6))
#loc68 = loc(callsite(#loc54 at #loc6))
#loc69 = loc(callsite(#loc55 at #loc6))
#loc70 = loc(callsite(#loc56 at #loc6))
#loc71 = loc(callsite(#loc57 at #loc6))
#loc72 = loc(callsite(#loc58 at #loc6))
#loc73 = loc(callsite(#loc59 at #loc6))
#loc74 = loc(callsite(#loc60 at #loc6))
#loc75 = loc(callsite(#loc61 at #loc6))
#loc76 = loc(callsite(#loc62 at #loc6))
#loc77 = loc(callsite(#loc63 at #loc6))
#loc78 = loc(callsite(#loc64 at #loc6))
#loc79 = loc(callsite(#loc65 at #loc6))
#loc80 = loc(callsite(#loc66 at #loc6))
// -----// IR Dump Before TritonGPUPlanCTAPass (triton-nvidia-gpu-plan-cta) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc35 = loc(unknown)
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc1)
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64, #blocked> loc(#loc2)
%c0_i32 = arith.constant 0 : i32 loc(#loc3)
%true = arith.constant true loc(#loc67)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc7)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc8)
%cst_3 = arith.constant dense<3> : tensor<1x32xi32, #blocked> loc(#loc9)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32, #blocked> loc(#loc10)
%cst_5 = arith.constant dense<32> : tensor<1x32xi32, #blocked> loc(#loc11)
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #blocked1> loc(#loc12)
%1 = triton_gpu.convert_layout %0 : (tensor<32xi32, #blocked1>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc13)
%2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x32xi32, #blocked2> loc(#loc13)
%3 = triton_gpu.convert_layout %2 : (tensor<1x32xi32, #blocked2>) -> tensor<1x32xi32, #blocked> loc(#loc11)
%4 = arith.cmpi slt, %3, %cst_5 : tensor<1x32xi32, #blocked> loc(#loc11)
%5 = arith.remsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc10)
%6 = arith.divsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc14)
%7 = arith.muli %5, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc9)
%8 = arith.divsi %7, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc15)
%9 = arith.addi %8, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc16)
%10 = arith.cmpi slt, %8, %9 : tensor<1x32xi32, #blocked> loc(#loc17)
%11 = arith.muli %6, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc18)
%12 = arith.addi %11, %8 : tensor<1x32xi32, #blocked> loc(#loc19)
%13 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> loc(#loc20)
%14 = tt.addptr %13, %12 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc20)
%15 = arith.andi %4, %10 : tensor<1x32xi1, #blocked> loc(#loc21)
%16 = triton_gpu.convert_layout %14 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc22)
%17 = triton_gpu.convert_layout %15 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc22)
%18 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc22)
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc22)
%20 = triton_gpu.convert_layout %19 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc22)
%21 = arith.select %10, %20, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc23)
%22 = arith.addi %8, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc8)
%23 = arith.cmpi slt, %22, %9 : tensor<1x32xi32, #blocked> loc(#loc24)
%24 = arith.addi %11, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc25)
%25 = arith.addi %24, %8 : tensor<1x32xi32, #blocked> loc(#loc26)
%26 = tt.addptr %13, %25 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc27)
%27 = arith.andi %4, %23 : tensor<1x32xi1, #blocked> loc(#loc28)
%28 = triton_gpu.convert_layout %26 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc29)
%29 = triton_gpu.convert_layout %27 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc29)
%30 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc29)
%31 = tt.load %28, %29, %30 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc29)
%32 = triton_gpu.convert_layout %31 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc29)
%33 = arith.select %23, %32, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc30)
%34 = arith.addf %33, %21 : tensor<1x32xf64, #blocked> loc(#loc31)
%35 = arith.select %10, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc7)
%36 = arith.select %23, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc32)
%37 = arith.addf %36, %35 : tensor<1x32xf64, #blocked> loc(#loc33)
%38 = arith.divf %34, %37 : tensor<1x32xf64, #blocked> loc(#loc34)
%39 = arith.select %4, %38, %cst_0 : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc2)
%40:2 = "tt.reduce"(%39, %3) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
%50 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68)
%51 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69)
%52 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70)
%53 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71)
%54 = arith.xori %53, %true : i1 loc(#loc67)
%55 = arith.andi %52, %54 : i1 loc(#loc72)
%56 = arith.ori %50, %55 : i1 loc(#loc73)
%57 = arith.andi %52, %53 : i1 loc(#loc74)
%58 = arith.ori %51, %57 : i1 loc(#loc75)
%59 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76)
%60 = arith.andi %58, %59 : i1 loc(#loc77)
%61 = arith.ori %56, %60 : i1 loc(#loc78)
%62 = arith.select %61, %arg4, %arg6 : f64 loc(#loc79)
%63 = arith.select %61, %arg5, %arg7 : i32 loc(#loc80)
tt.reduce.return %62, %63 : f64, i32 loc(#loc53)
}) : (tensor<1x32xf64, #blocked>, tensor<1x32xi32, #blocked>) -> (tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) loc(#loc53)
%41 = triton_gpu.convert_layout %40#1 : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi32, #blocked1> loc(#loc49)
%42 = triton_gpu.convert_layout %41 : (tensor<1xi32, #blocked1>) -> tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc49)
%43 = tt.expand_dims %42 {axis = 1 : i32} : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>>) -> tensor<1x1xi32, #blocked4> loc(#loc49)
%44 = triton_gpu.convert_layout %43 : (tensor<1x1xi32, #blocked4>) -> tensor<1x1xi32, #blocked5> loc(#loc50)
%45 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3)
%46 = tt.splat %45 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked5> loc(#loc3)
%47 = arith.extsi %44 : tensor<1x1xi32, #blocked5> to tensor<1x1xi64, #blocked5> loc(#loc50)
%48 = triton_gpu.convert_layout %46 : (tensor<1x1x!tt.ptr<i64, 1>, #blocked5>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked3> loc(#loc50)
%49 = triton_gpu.convert_layout %47 : (tensor<1x1xi64, #blocked5>) -> tensor<1x1xi64, #blocked3> loc(#loc50)
tt.store %48, %49 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked3> loc(#loc50)
tt.return loc(#loc51)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc52 = loc(callsite(#loc4 at #loc5))
#loc53 = loc(callsite(#loc5 at #loc6))
#loc54 = loc(callsite(#loc36 at #loc5))
#loc55 = loc(callsite(#loc37 at #loc5))
#loc56 = loc(callsite(#loc38 at #loc5))
#loc57 = loc(callsite(#loc39 at #loc5))
#loc58 = loc(callsite(#loc40 at #loc5))
#loc59 = loc(callsite(#loc41 at #loc5))
#loc60 = loc(callsite(#loc42 at #loc5))
#loc61 = loc(callsite(#loc43 at #loc5))
#loc62 = loc(callsite(#loc44 at #loc5))
#loc63 = loc(callsite(#loc45 at #loc5))
#loc64 = loc(callsite(#loc46 at #loc5))
#loc65 = loc(callsite(#loc47 at #loc5))
#loc66 = loc(callsite(#loc48 at #loc5))
#loc67 = loc(callsite(#loc52 at #loc6))
#loc68 = loc(callsite(#loc54 at #loc6))
#loc69 = loc(callsite(#loc55 at #loc6))
#loc70 = loc(callsite(#loc56 at #loc6))
#loc71 = loc(callsite(#loc57 at #loc6))
#loc72 = loc(callsite(#loc58 at #loc6))
#loc73 = loc(callsite(#loc59 at #loc6))
#loc74 = loc(callsite(#loc60 at #loc6))
#loc75 = loc(callsite(#loc61 at #loc6))
#loc76 = loc(callsite(#loc62 at #loc6))
#loc77 = loc(callsite(#loc63 at #loc6))
#loc78 = loc(callsite(#loc64 at #loc6))
#loc79 = loc(callsite(#loc65 at #loc6))
#loc80 = loc(callsite(#loc66 at #loc6))
// -----// IR Dump Before TritonGPURemoveLayoutConversions (tritongpu-remove-layout-conversions) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked4 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#loc = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)
#loc35 = loc(unknown)
module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
tt.func public @triton__0d1d23de(%arg0: !tt.ptr<f64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg2: i32 loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0), %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32} loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":18:0)) attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc1)
%cst_0 = arith.constant dense<0xFFF0000000000000> : tensor<1x32xf64, #blocked> loc(#loc2)
%c0_i32 = arith.constant 0 : i32 loc(#loc3)
%true = arith.constant true loc(#loc67)
%cst_1 = arith.constant dense<1.000000e+00> : tensor<1x32xf64, #blocked> loc(#loc7)
%cst_2 = arith.constant dense<1> : tensor<1x32xi32, #blocked> loc(#loc8)
%cst_3 = arith.constant dense<3> : tensor<1x32xi32, #blocked> loc(#loc9)
%cst_4 = arith.constant dense<2> : tensor<1x32xi32, #blocked> loc(#loc10)
%cst_5 = arith.constant dense<32> : tensor<1x32xi32, #blocked> loc(#loc11)
%0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #blocked1> loc(#loc12)
%1 = triton_gpu.convert_layout %0 : (tensor<32xi32, #blocked1>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> loc(#loc13)
%2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x32xi32, #blocked2> loc(#loc13)
%3 = triton_gpu.convert_layout %2 : (tensor<1x32xi32, #blocked2>) -> tensor<1x32xi32, #blocked> loc(#loc11)
%4 = arith.cmpi slt, %3, %cst_5 : tensor<1x32xi32, #blocked> loc(#loc11)
%5 = arith.remsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc10)
%6 = arith.divsi %3, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc14)
%7 = arith.muli %5, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc9)
%8 = arith.divsi %7, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc15)
%9 = arith.addi %8, %cst_4 : tensor<1x32xi32, #blocked> loc(#loc16)
%10 = arith.cmpi slt, %8, %9 : tensor<1x32xi32, #blocked> loc(#loc17)
%11 = arith.muli %6, %cst_3 : tensor<1x32xi32, #blocked> loc(#loc18)
%12 = arith.addi %11, %8 : tensor<1x32xi32, #blocked> loc(#loc19)
%13 = tt.splat %arg0 : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked> loc(#loc20)
%14 = tt.addptr %13, %12 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc20)
%15 = arith.andi %4, %10 : tensor<1x32xi1, #blocked> loc(#loc21)
%16 = triton_gpu.convert_layout %14 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc22)
%17 = triton_gpu.convert_layout %15 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc22)
%18 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc22)
%19 = tt.load %16, %17, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc22)
%20 = triton_gpu.convert_layout %19 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc22)
%21 = arith.select %10, %20, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc23)
%22 = arith.addi %8, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc8)
%23 = arith.cmpi slt, %22, %9 : tensor<1x32xi32, #blocked> loc(#loc24)
%24 = arith.addi %11, %cst_2 : tensor<1x32xi32, #blocked> loc(#loc25)
%25 = arith.addi %24, %8 : tensor<1x32xi32, #blocked> loc(#loc26)
%26 = tt.addptr %13, %25 : tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked> loc(#loc27)
%27 = arith.andi %4, %23 : tensor<1x32xi1, #blocked> loc(#loc28)
%28 = triton_gpu.convert_layout %26 : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked3> loc(#loc29)
%29 = triton_gpu.convert_layout %27 : (tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked3> loc(#loc29)
%30 = triton_gpu.convert_layout %cst : (tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked3> loc(#loc29)
%31 = tt.load %28, %29, %30 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x32xf64, #blocked3> loc(#loc29)
%32 = triton_gpu.convert_layout %31 : (tensor<1x32xf64, #blocked3>) -> tensor<1x32xf64, #blocked> loc(#loc29)
%33 = arith.select %23, %32, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc30)
%34 = arith.addf %33, %21 : tensor<1x32xf64, #blocked> loc(#loc31)
%35 = arith.select %10, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc7)
%36 = arith.select %23, %cst_1, %cst : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc32)
%37 = arith.addf %36, %35 : tensor<1x32xf64, #blocked> loc(#loc33)
%38 = arith.divf %34, %37 : tensor<1x32xf64, #blocked> loc(#loc34)
%39 = arith.select %4, %38, %cst_0 : tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked> loc(#loc2)
%40:2 = "tt.reduce"(%39, %3) <{axis = 1 : i32}> ({
^bb0(%arg4: f64 loc(unknown), %arg5: i32 loc(unknown), %arg6: f64 loc(unknown), %arg7: i32 loc(unknown)):
%50 = arith.cmpf ogt, %arg4, %arg6 : f64 loc(#loc68)
%51 = arith.cmpf oeq, %arg4, %arg6 : f64 loc(#loc69)
%52 = arith.cmpf une, %arg4, %arg4 : f64 loc(#loc70)
%53 = arith.cmpf une, %arg6, %arg6 : f64 loc(#loc71)
%54 = arith.xori %53, %true : i1 loc(#loc67)
%55 = arith.andi %52, %54 : i1 loc(#loc72)
%56 = arith.ori %50, %55 : i1 loc(#loc73)
%57 = arith.andi %52, %53 : i1 loc(#loc74)
%58 = arith.ori %51, %57 : i1 loc(#loc75)
%59 = arith.cmpi slt, %arg5, %arg7 : i32 loc(#loc76)
%60 = arith.andi %58, %59 : i1 loc(#loc77)
%61 = arith.ori %56, %60 : i1 loc(#loc78)
%62 = arith.select %61, %arg4, %arg6 : f64 loc(#loc79)
%63 = arith.select %61, %arg5, %arg7 : i32 loc(#loc80)
tt.reduce.return %62, %63 : f64, i32 loc(#loc53)
}) : (tensor<1x32xf64, #blocked>, tensor<1x32xi32, #blocked>) -> (tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) loc(#loc53)
%41 = triton_gpu.convert_layout %40#1 : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi32, #blocked1> loc(#loc49)
%42 = triton_gpu.convert_layout %41 : (tensor<1xi32, #blocked1>) -> tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>> loc(#loc49)
%43 = tt.expand_dims %42 {axis = 1 : i32} : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked4}>>) -> tensor<1x1xi32, #blocked4> loc(#loc49)
%44 = triton_gpu.convert_layout %43 : (tensor<1x1xi32, #blocked4>) -> tensor<1x1xi32, #blocked5> loc(#loc50)
%45 = tt.addptr %arg1, %c0_i32 : !tt.ptr<i64, 1>, i32 loc(#loc3)
%46 = tt.splat %45 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked5> loc(#loc3)
%47 = arith.extsi %44 : tensor<1x1xi32, #blocked5> to tensor<1x1xi64, #blocked5> loc(#loc50)
%48 = triton_gpu.convert_layout %46 : (tensor<1x1x!tt.ptr<i64, 1>, #blocked5>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked3> loc(#loc50)
%49 = triton_gpu.convert_layout %47 : (tensor<1x1xi64, #blocked5>) -> tensor<1x1xi64, #blocked3> loc(#loc50)
tt.store %48, %49 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked3> loc(#loc50)
tt.return loc(#loc51)
} loc(#loc)
} loc(#loc)
#loc1 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":37:36)
#loc2 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":53:35)
#loc3 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:25)
#loc4 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:32)
#loc5 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42)
#loc6 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)
#loc7 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":48:34)
#loc8 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":39:17)
#loc9 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:15)
#loc10 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":27:18)
#loc11 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":26:21)
#loc12 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:26)
#loc13 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":25:34)
#loc14 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":28:20)
#loc15 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":32:22)
#loc16 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":33:16)
#loc17 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":34:18)
#loc18 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:49)
#loc19 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:56)
#loc20 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:30)
#loc21 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:98)
#loc22 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":36:90)
#loc23 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":38:32)
#loc24 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":40:20)
#loc25 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:52)
#loc26 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:61)
#loc27 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:31)
#loc28 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:103)
#loc29 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":42:95)
#loc30 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":44:35)
#loc31 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":45:20)
#loc32 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":49:35)
#loc33 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":50:20)
#loc34 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":51:20)
#loc36 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":70:21)
#loc37 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":71:23)
#loc38 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":73:29)
#loc39 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":74:29)
#loc40 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:28)
#loc41 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":75:16)
#loc42 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:29)
#loc43 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":77:17)
#loc44 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:31)
#loc45 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:21)
#loc46 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":80:12)
#loc47 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:35)
#loc48 = loc("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":81:69)
#loc49 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":56:22)
#loc50 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:68)
#loc51 = loc("/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":57:4)
#loc52 = loc(callsite(#loc4 at #loc5))
#loc53 = loc(callsite(#loc5 at #loc6))
#loc54 = loc(callsite(#loc36 at #loc5))
#loc55 = loc(callsite(#loc37 at #loc5))
#loc56 = loc(callsite(#loc38 at #loc5))
#loc57 = loc(callsite(#loc39 at #loc5))
#loc58 = loc(callsite(#loc40 at #loc5))
#loc59 = loc(callsite(#loc41 at #loc5))
#loc60 = loc(callsite(#loc42 at #loc5))
#loc61 = loc(callsite(#loc43 at #loc5))
#loc62 = loc(callsite(#loc44 at #loc5))
#loc63 = loc(callsite(#loc45 at #loc5))
#loc64 = loc(callsite(#loc46 at #loc5))
#loc65 = loc(callsite(#loc47 at #loc5))
#loc66 = loc(callsite(#loc48 at #loc5))
#loc67 = loc(callsite(#loc52 at #loc6))
#loc68 = loc(callsite(#loc54 at #loc6))
#loc69 = loc(callsite(#loc55 at #loc6))
#loc70 = loc(callsite(#loc56 at #loc6))
#loc71 = loc(callsite(#loc57 at #loc6))
#loc72 = loc(callsite(#loc58 at #loc6))
#loc73 = loc(callsite(#loc59 at #loc6))
#loc74 = loc(callsite(#loc60 at #loc6))
#loc75 = loc(callsite(#loc61 at #loc6))
#loc76 = loc(callsite(#loc62 at #loc6))
#loc77 = loc(callsite(#loc63 at #loc6))
#loc78 = loc(callsite(#loc64 at #loc6))
#loc79 = loc(callsite(#loc65 at #loc6))
#loc80 = loc(callsite(#loc66 at #loc6))
loc(callsite("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42 at "/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)): error: 'tt.reduce' op inferred type(s) 'tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>}>>', 'tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>}>>' are incompatible with return type(s) of operation 'tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>}>>', 'tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>}>>'
loc(callsite("/home/dberard/local/pytorch/torch/_inductor/triton_helpers.py":91:42 at "/tmp/torchinductor_dberard/yl/cylyrnw2l3cnpot655rarv4ha622jmc4jnwrpsntqcnq4difzu25.py":55:63)): error: 'tt.reduce' op failed to infer returned types
// -----// IR Dump After TritonGPURemoveLayoutConversions Failed (tritongpu-remove-layout-conversions) ('builtin.module' operation) //----- //
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
"builtin.module"() ({
"tt.func"() <{arg_attrs = [{tt.divisibility = 16 : i32}, {tt.divisibility = 16 : i32}, {}, {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}], function_type = (!tt.ptr<f64, 1>, !tt.ptr<i64, 1>, i32, i32) -> (), sym_name = "triton__0d1d23de", sym_visibility = "public"}> ({
^bb0(%arg0: !tt.ptr<f64, 1>, %arg1: !tt.ptr<i64, 1>, %arg2: i32, %arg3: i32):
%0 = "arith.constant"() <{value = dense<0.000000e+00> : tensor<1x32xf64, #blocked>}> : () -> tensor<1x32xf64, #blocked>
%1 = "arith.constant"() <{value = dense<0xFFF0000000000000> : tensor<1x32xf64, #blocked>}> : () -> tensor<1x32xf64, #blocked>
%2 = "arith.constant"() <{value = dense<1.000000e+00> : tensor<1x32xf64, #blocked>}> : () -> tensor<1x32xf64, #blocked>
%3 = "arith.constant"() <{value = dense<1> : tensor<1x32xi32, #blocked>}> : () -> tensor<1x32xi32, #blocked>
%4 = "arith.constant"() <{value = dense<3> : tensor<1x32xi32, #blocked>}> : () -> tensor<1x32xi32, #blocked>
%5 = "arith.constant"() <{value = dense<2> : tensor<1x32xi32, #blocked>}> : () -> tensor<1x32xi32, #blocked>
%6 = "arith.constant"() <{value = dense<32> : tensor<1x32xi32, #blocked>}> : () -> tensor<1x32xi32, #blocked>
%7 = "arith.constant"() <{value = true}> : () -> i1
%8 = "arith.constant"() <{value = 0 : i32}> : () -> i32
%9 = "tt.make_range"() <{end = 32 : i32, start = 0 : i32}> : () -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
%10 = "tt.make_range"() <{end = 32 : i32, start = 0 : i32}> : () -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
%11 = "tt.make_range"() <{end = 32 : i32, start = 0 : i32}> : () -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
%12 = "tt.make_range"() <{end = 32 : i32, start = 0 : i32}> : () -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
%13 = "tt.make_range"() <{end = 32 : i32, start = 0 : i32}> : () -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
%14 = "tt.expand_dims"(%9) <{axis = 0 : i32}> : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x32xi32, #blocked>
%15 = "tt.expand_dims"(%10) <{axis = 0 : i32}> : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x32xi32, #blocked>
%16 = "tt.expand_dims"(%11) <{axis = 0 : i32}> : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x32xi32, #blocked>
%17 = "tt.expand_dims"(%12) <{axis = 0 : i32}> : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x32xi32, #blocked>
%18 = "tt.expand_dims"(%13) <{axis = 0 : i32}> : (tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x32xi32, #blocked>
%19 = "arith.cmpi"(%15, %6) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked>
%20 = "arith.cmpi"(%17, %6) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked>
%21 = "arith.cmpi"(%18, %6) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked>
%22 = "arith.remsi"(%14, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%23 = "arith.remsi"(%15, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%24 = "arith.remsi"(%16, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%25 = "arith.remsi"(%17, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%26 = "arith.remsi"(%18, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%27 = "arith.divsi"(%14, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%28 = "arith.divsi"(%16, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%29 = "arith.muli"(%22, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%30 = "arith.muli"(%23, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%31 = "arith.muli"(%24, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%32 = "arith.muli"(%25, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%33 = "arith.muli"(%26, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%34 = "arith.divsi"(%29, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%35 = "arith.divsi"(%30, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%36 = "arith.divsi"(%31, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%37 = "arith.divsi"(%32, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%38 = "arith.divsi"(%33, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%39 = "arith.addi"(%35, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%40 = "arith.addi"(%37, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%41 = "arith.addi"(%38, %5) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%42 = "arith.cmpi"(%35, %39) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked>
%43 = "arith.cmpi"(%38, %41) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked>
%44 = "arith.muli"(%27, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%45 = "arith.muli"(%28, %4) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%46 = "arith.addi"(%44, %34) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%47 = "tt.splat"(%arg0) : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked>
%48 = "tt.splat"(%arg0) : (!tt.ptr<f64, 1>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked>
%49 = "tt.addptr"(%47, %46) : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked>
%50 = "arith.andi"(%19, %42) : (tensor<1x32xi1, #blocked>, tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked>
%51 = "tt.load"(%49, %50, %0) <{cache = 1 : i32, evict = 1 : i32, isVolatile = false, operandSegmentSizes = array<i32: 1, 1, 1>}> : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked>
%52 = "arith.select"(%43, %51, %0) : (tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked>
%53 = "arith.addi"(%37, %3) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%54 = "arith.addi"(%38, %3) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%55 = "arith.cmpi"(%53, %40) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked>
%56 = "arith.cmpi"(%54, %41) <{predicate = 2 : i64}> : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi1, #blocked>
%57 = "arith.addi"(%45, %3) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%58 = "arith.addi"(%57, %36) : (tensor<1x32xi32, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32xi32, #blocked>
%59 = "tt.addptr"(%48, %58) : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi32, #blocked>) -> tensor<1x32x!tt.ptr<f64, 1>, #blocked>
%60 = "arith.andi"(%20, %55) : (tensor<1x32xi1, #blocked>, tensor<1x32xi1, #blocked>) -> tensor<1x32xi1, #blocked>
%61 = "tt.load"(%59, %60, %0) <{cache = 1 : i32, evict = 1 : i32, isVolatile = false, operandSegmentSizes = array<i32: 1, 1, 1>}> : (tensor<1x32x!tt.ptr<f64, 1>, #blocked>, tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked>
%62 = "arith.select"(%56, %61, %0) : (tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked>
%63 = "arith.addf"(%62, %52) <{fastmath = #arith.fastmath<none>}> : (tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked>
%64 = "arith.select"(%43, %2, %0) : (tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked>
%65 = "arith.select"(%56, %2, %0) : (tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked>
%66 = "arith.addf"(%65, %64) <{fastmath = #arith.fastmath<none>}> : (tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked>
%67 = "arith.divf"(%63, %66) <{fastmath = #arith.fastmath<none>}> : (tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked>
%68 = "arith.select"(%21, %67, %1) : (tensor<1x32xi1, #blocked>, tensor<1x32xf64, #blocked>, tensor<1x32xf64, #blocked>) -> tensor<1x32xf64, #blocked>
%69:2 = "tt.reduce"(%68, %18) <{axis = 1 : i32}> ({
^bb0(%arg4: f64, %arg5: i32, %arg6: f64, %arg7: i32):
%74 = "arith.cmpf"(%arg4, %arg6) <{predicate = 2 : i64}> : (f64, f64) -> i1
%75 = "arith.cmpf"(%arg4, %arg6) <{predicate = 1 : i64}> : (f64, f64) -> i1
%76 = "arith.cmpf"(%arg4, %arg4) <{predicate = 13 : i64}> : (f64, f64) -> i1
%77 = "arith.cmpf"(%arg6, %arg6) <{predicate = 13 : i64}> : (f64, f64) -> i1
%78 = "arith.xori"(%77, %7) : (i1, i1) -> i1
%79 = "arith.andi"(%76, %78) : (i1, i1) -> i1
%80 = "arith.ori"(%74, %79) : (i1, i1) -> i1
%81 = "arith.andi"(%76, %77) : (i1, i1) -> i1
%82 = "arith.ori"(%75, %81) : (i1, i1) -> i1
%83 = "arith.cmpi"(%arg5, %arg7) <{predicate = 2 : i64}> : (i32, i32) -> i1
%84 = "arith.andi"(%82, %83) : (i1, i1) -> i1
%85 = "arith.ori"(%80, %84) : (i1, i1) -> i1
%86 = "arith.select"(%85, %arg4, %arg6) : (i1, f64, f64) -> f64
%87 = "arith.select"(%85, %arg5, %arg7) : (i1, i32, i32) -> i32
"tt.reduce.return"(%86, %87) : (f64, i32) -> ()
}) : (tensor<1x32xf64, #blocked>, tensor<1x32xi32, #blocked>) -> (tensor<1xf64, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>, tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
%70 = "tt.expand_dims"(%69#1) <{axis = 1 : i32}> : (tensor<1xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xi32, #blocked>
%71 = "tt.addptr"(%arg1, %8) : (!tt.ptr<i64, 1>, i32) -> !tt.ptr<i64, 1>
%72 = "arith.extsi"(%70) : (tensor<1x1xi32, #blocked>) -> tensor<1x1xi64, #blocked>
%73 = "tt.splat"(%71) : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked>
"tt.store"(%73, %72) <{cache = 1 : i32, evict = 1 : i32}> : (tensor<1x1x!tt.ptr<i64, 1>, #blocked>, tensor<1x1xi64, #blocked>) -> ()
"tt.return"() : () -> ()
}) {noinline = false} : () -> ()
}) {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} : () -> ()
concurrent.futures.process._RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/process.py", line 246, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "/home/dberard/local/pytorch/torch/_inductor/codecache.py", line 2280, in _worker_compile
kernel.precompile(warm_cache_only_with_cc=cc)
File "/home/dberard/local/pytorch/torch/_inductor/triton_heuristics.py", line 188, in precompile
compiled_binary, launcher = self._precompile_config(
File "/home/dberard/local/pytorch/torch/_inductor/triton_heuristics.py", line 291, in _precompile_config
triton.compile(
File "/home/dberard/local/triton/python/triton/compiler/compiler.py", line 543, in compile
next_module = compile_kernel(module)
File "/home/dberard/local/triton/python/triton/compiler/compiler.py", line 437, in <lambda>
stages["ttgir"] = (lambda path: parse_mlir_module(path, context), lambda src: optimize_ttgir(
File "/home/dberard/local/triton/python/triton/compiler/compiler.py", line 151, in optimize_ttgir
pm.run(mod)
RuntimeError: PassManager::run failed
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/dberard/local/scripts/bintriton.py", line 93, in <module>
async_compile.wait(globals())
File "/home/dberard/local/pytorch/torch/_inductor/codecache.py", line 2465, in wait
scope[key] = result.result()
File "/home/dberard/local/pytorch/torch/_inductor/codecache.py", line 2308, in result
self.future.result()
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 458, in result
return self.__get_result()
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
RuntimeError: PassManager::run failed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment