Skip to content

Instantly share code, notes, and snippets.

@bjacob
Created July 29, 2024 19:34
Show Gist options
  • Save bjacob/ff3c0425d063c3938a750ff09cadd448 to your computer and use it in GitHub Desktop.
Save bjacob/ff3c0425d063c3938a750ff09cadd448 to your computer and use it in GitHub Desktop.
Debugging ConvertToNVVM regression

Repro with attached b.mlir:

tools/iree-opt --iree-convert-to-nvvm /tmp/b.mlir

Output attached below (out.mlir).

Problem: why are there unrealized_conversion_cast in the output?

That only arises when dropping the local revert of llvm/llvm-project#99890.

With the local revert, there are no unrealized_conversion_cast.

#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152>>}>
module attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
func.func @test_logsoftmax_axis_1_expanded$async_dispatch_2_generic_3x5x4_f32() attributes {translation_info = #iree_codegen.translation_info<None workgroup_size = [1, 64, 1] subgroup_size = 32>} {
%cst = arith.constant dense<127> : vector<1x4x1xi32>
%cst_0 = arith.constant dense<23> : vector<1x4x1xi32>
%cst_1 = arith.constant dense<1.270000e+02> : vector<1x4x1xf32>
%cst_2 = arith.constant dense<-1.270000e+02> : vector<1x4x1xf32>
%cst_3 = arith.constant dense<8.880000e+01> : vector<1x4x1xf32>
%cst_4 = arith.constant dense<-8.780000e+01> : vector<1x4x1xf32>
%cst_5 = arith.constant dense<0.166666657> : vector<1x4x1xf32>
%cst_6 = arith.constant dense<0.0416657962> : vector<1x4x1xf32>
%cst_7 = arith.constant dense<0.00833345205> : vector<1x4x1xf32>
%cst_8 = arith.constant dense<0.00139819994> : vector<1x4x1xf32>
%cst_9 = arith.constant dense<1.98756912E-4> : vector<1x4x1xf32>
%cst_10 = arith.constant dense<2.12194442E-4> : vector<1x4x1xf32>
%cst_11 = arith.constant dense<-0.693359375> : vector<1x4x1xf32>
%cst_12 = arith.constant dense<1.44269502> : vector<1x4x1xf32>
%cst_13 = arith.constant dense<1.000000e+00> : vector<1x4x1xf32>
%cst_14 = arith.constant dense<5.000000e-01> : vector<1x4x1xf32>
%cst_15 = arith.constant 0.000000e+00 : f32
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%cst_16 = arith.constant dense<0.000000e+00> : vector<1x4x1xf32>
%c5 = arith.constant 5 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c64) flags(ReadOnly) : memref<3x4x5xf32, strided<[20, 5, 1], offset: 16>, #gpu.address_space<global>>
memref.assume_alignment %0, 64 : memref<3x4x5xf32, strided<[20, 5, 1], offset: 16>, #gpu.address_space<global>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<3x5xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 64 : memref<3x5xf32, #gpu.address_space<global>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%2 = arith.muli %workgroup_id_y, %c64 : index
%thread_id_y = gpu.thread_id y
%3 = arith.subi %c5, %thread_id_y : index
%4 = arith.minsi %3, %c1 : index
%5 = arith.maxsi %4, %c0 : index
%6 = arith.addi %2, %thread_id_y : index
cf.br ^bb1(%c0 : index)
^bb1(%7: index): // 2 preds: ^bb0, ^bb2
%8 = arith.cmpi slt, %7, %5 : index
cf.cond_br %8, ^bb2, ^bb3(%c0 : index)
^bb2: // pred: ^bb1
%9 = arith.addi %6, %7 : index
memref.store %cst_15, %1[%workgroup_id_x, %9] : memref<3x5xf32, #gpu.address_space<global>>
%10 = arith.addi %7, %c1 : index
cf.br ^bb1(%10 : index)
^bb3(%11: index): // 2 preds: ^bb1, ^bb4
%12 = arith.cmpi slt, %11, %5 : index
cf.cond_br %12, ^bb4, ^bb5
^bb4: // pred: ^bb3
%13 = arith.addi %thread_id_y, %11 : index
%14 = arith.addi %13, %2 : index
%15 = memref.load %0[%workgroup_id_x, %c0, %14] : memref<3x4x5xf32, strided<[20, 5, 1], offset: 16>, #gpu.address_space<global>>
%16 = vector.broadcast %15 : f32 to vector<1xf32>
%17 = vector.insert %16, %cst_16 [0, 0] : vector<1xf32> into vector<1x4x1xf32>
%18 = memref.load %0[%workgroup_id_x, %c1, %14] : memref<3x4x5xf32, strided<[20, 5, 1], offset: 16>, #gpu.address_space<global>>
%19 = vector.broadcast %18 : f32 to vector<1xf32>
%20 = vector.insert %19, %17 [0, 1] : vector<1xf32> into vector<1x4x1xf32>
%21 = memref.load %0[%workgroup_id_x, %c2, %14] : memref<3x4x5xf32, strided<[20, 5, 1], offset: 16>, #gpu.address_space<global>>
%22 = vector.broadcast %21 : f32 to vector<1xf32>
%23 = vector.insert %22, %20 [0, 2] : vector<1xf32> into vector<1x4x1xf32>
%24 = memref.load %0[%workgroup_id_x, %c3, %14] : memref<3x4x5xf32, strided<[20, 5, 1], offset: 16>, #gpu.address_space<global>>
%25 = vector.broadcast %24 : f32 to vector<1xf32>
%26 = vector.insert %25, %23 [0, 3] : vector<1xf32> into vector<1x4x1xf32>
%27 = arith.addi %6, %11 : index
%28 = memref.load %1[%workgroup_id_x, %27] : memref<3x5xf32, #gpu.address_space<global>>
%29 = vector.broadcast %28 : f32 to vector<1xf32>
%30 = arith.cmpf uge, %26, %cst_4 : vector<1x4x1xf32>
%31 = arith.select %30, %26, %cst_4 : vector<1x4x1xi1>, vector<1x4x1xf32>
%32 = arith.cmpf ule, %31, %cst_3 : vector<1x4x1xf32>
%33 = arith.select %32, %31, %cst_3 : vector<1x4x1xi1>, vector<1x4x1xf32>
%34 = math.fma %33, %cst_12, %cst_14 : vector<1x4x1xf32>
%35 = math.floor %34 : vector<1x4x1xf32>
%36 = arith.cmpf uge, %35, %cst_2 : vector<1x4x1xf32>
%37 = arith.select %36, %35, %cst_2 : vector<1x4x1xi1>, vector<1x4x1xf32>
%38 = arith.cmpf ule, %37, %cst_1 : vector<1x4x1xf32>
%39 = arith.select %38, %37, %cst_1 : vector<1x4x1xi1>, vector<1x4x1xf32>
%40 = math.fma %cst_11, %39, %33 : vector<1x4x1xf32>
%41 = math.fma %cst_10, %39, %40 : vector<1x4x1xf32>
%42 = math.fma %41, %cst_9, %cst_8 : vector<1x4x1xf32>
%43 = math.fma %42, %41, %cst_7 : vector<1x4x1xf32>
%44 = math.fma %43, %41, %cst_6 : vector<1x4x1xf32>
%45 = math.fma %44, %41, %cst_5 : vector<1x4x1xf32>
%46 = math.fma %45, %41, %cst_14 : vector<1x4x1xf32>
%47 = arith.mulf %41, %41 : vector<1x4x1xf32>
%48 = math.fma %46, %47, %41 : vector<1x4x1xf32>
%49 = arith.addf %48, %cst_13 : vector<1x4x1xf32>
%50 = arith.fptosi %39 : vector<1x4x1xf32> to vector<1x4x1xi32>
%51 = arith.addi %50, %cst : vector<1x4x1xi32>
%52 = arith.shli %51, %cst_0 : vector<1x4x1xi32>
%53 = arith.bitcast %52 : vector<1x4x1xi32> to vector<1x4x1xf32>
%54 = arith.mulf %49, %53 : vector<1x4x1xf32>
%55 = vector.extract %54[0, 0] : vector<1xf32> from vector<1x4x1xf32>
%56 = arith.addf %55, %29 : vector<1xf32>
%57 = vector.extract %54[0, 1] : vector<1xf32> from vector<1x4x1xf32>
%58 = arith.addf %57, %56 : vector<1xf32>
%59 = vector.extract %54[0, 2] : vector<1xf32> from vector<1x4x1xf32>
%60 = arith.addf %59, %58 : vector<1xf32>
%61 = vector.extract %54[0, 3] : vector<1xf32> from vector<1x4x1xf32>
%62 = arith.addf %61, %60 : vector<1xf32>
%63 = vector.extract %62[0] : f32 from vector<1xf32>
memref.store %63, %1[%workgroup_id_x, %27] : memref<3x5xf32, #gpu.address_space<global>>
%64 = arith.addi %11, %c1 : index
cf.br ^bb3(%64 : index)
^bb5: // pred: ^bb3
gpu.barrier
return
}
}
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152>>}>
#translation = #iree_codegen.translation_info<None workgroup_size = [1, 64, 1] subgroup_size = 32>
module attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
llvm.func @__nv_floorf(f32) -> f32
llvm.func @test_logsoftmax_axis_1_expanded$async_dispatch_2_generic_3x5x4_f32(%arg0: !llvm.ptr<1> {llvm.align = 16 : i32, llvm.noalias, llvm.readonly}, %arg1: !llvm.ptr<1> {llvm.align = 16 : i32, llvm.noalias}) attributes {translation_info = #translation} {
%0 = llvm.mlir.constant(0 : i64) : i64
%1 = llvm.mlir.undef : !llvm.array<1 x array<4 x vector<1xf32>>>
%2 = llvm.mlir.constant(0 : i32) : i32
%3 = llvm.mlir.undef : vector<1xf32>
%4 = llvm.mlir.constant(63 : index) : i64
%5 = llvm.mlir.constant(20 : index) : i64
%6 = llvm.mlir.constant(64 : index) : i64
%7 = llvm.mlir.constant(1 : index) : i64
%8 = llvm.mlir.constant(0 : index) : i64
%9 = llvm.mlir.constant(5 : index) : i64
%10 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%11 = llvm.mlir.constant(2 : index) : i64
%12 = llvm.mlir.constant(3 : index) : i64
%13 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%14 = llvm.mlir.constant(dense<5.000000e-01> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%15 = llvm.mlir.constant(dense<1.000000e+00> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%16 = llvm.mlir.constant(dense<1.44269502> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%17 = llvm.mlir.constant(dense<-0.693359375> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%18 = llvm.mlir.constant(dense<2.12194442E-4> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%19 = llvm.mlir.constant(dense<1.98756912E-4> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%20 = llvm.mlir.constant(dense<0.00139819994> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%21 = llvm.mlir.constant(dense<0.00833345205> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%22 = llvm.mlir.constant(dense<0.0416657962> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%23 = llvm.mlir.constant(dense<127> : vector<1x4x1xi32>) : !llvm.array<1 x array<4 x vector<1xi32>>>
%24 = llvm.mlir.constant(dense<23> : vector<1x4x1xi32>) : !llvm.array<1 x array<4 x vector<1xi32>>>
%25 = llvm.mlir.constant(dense<1.270000e+02> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%26 = llvm.mlir.constant(dense<-1.270000e+02> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%27 = llvm.mlir.constant(dense<8.880000e+01> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%28 = llvm.mlir.constant(dense<-8.780000e+01> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%29 = llvm.mlir.constant(dense<0.166666657> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>>
%30 = builtin.unrealized_conversion_cast %29 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32>
%31 = builtin.unrealized_conversion_cast %22 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32>
%32 = builtin.unrealized_conversion_cast %21 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32>
%33 = builtin.unrealized_conversion_cast %20 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32>
%34 = builtin.unrealized_conversion_cast %19 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32>
%35 = builtin.unrealized_conversion_cast %18 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32>
%36 = builtin.unrealized_conversion_cast %17 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32>
%37 = builtin.unrealized_conversion_cast %16 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32>
%38 = builtin.unrealized_conversion_cast %14 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32>
%39 = llvm.getelementptr %arg0[16] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f32
%40 = llvm.ptrtoint %39 : !llvm.ptr<1> to i64
%41 = llvm.and %40, %4 : i64
%42 = llvm.icmp "eq" %41, %8 : i64
"llvm.intr.assume"(%42) : (i1) -> ()
%43 = llvm.ptrtoint %arg1 : !llvm.ptr<1> to i64
%44 = llvm.and %43, %4 : i64
%45 = llvm.icmp "eq" %44, %8 : i64
"llvm.intr.assume"(%45) : (i1) -> ()
%46 = nvvm.read.ptx.sreg.ctaid.x : i32
%47 = llvm.sext %46 : i32 to i64
%48 = nvvm.read.ptx.sreg.ctaid.y : i32
%49 = llvm.sext %48 : i32 to i64
%50 = llvm.mul %49, %6 : i64
%51 = nvvm.read.ptx.sreg.tid.y : i32
%52 = llvm.sext %51 : i32 to i64
%53 = llvm.sub %9, %52 : i64
%54 = llvm.intr.smin(%53, %7) : (i64, i64) -> i64
%55 = llvm.intr.smax(%54, %8) : (i64, i64) -> i64
%56 = llvm.add %50, %52 : i64
llvm.br ^bb1(%8 : i64)
^bb1(%57: i64): // 2 preds: ^bb0, ^bb2
%58 = llvm.icmp "slt" %57, %55 : i64
llvm.cond_br %58, ^bb2, ^bb3(%8 : i64)
^bb2: // pred: ^bb1
%59 = llvm.add %56, %57 : i64
%60 = llvm.mul %47, %9 : i64
%61 = llvm.add %60, %59 : i64
%62 = llvm.getelementptr %arg1[%61] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
llvm.store %13, %62 : f32, !llvm.ptr<1>
%63 = llvm.add %57, %7 : i64
llvm.br ^bb1(%63 : i64)
^bb3(%64: i64): // 2 preds: ^bb1, ^bb4
%65 = llvm.icmp "slt" %64, %55 : i64
llvm.cond_br %65, ^bb4, ^bb5
^bb4: // pred: ^bb3
%66 = llvm.add %52, %64 : i64
%67 = llvm.add %66, %50 : i64
%68 = llvm.getelementptr %arg0[16] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f32
%69 = llvm.mul %47, %5 : i64
%70 = llvm.mul %8, %9 : i64
%71 = llvm.add %69, %70 : i64
%72 = llvm.add %71, %67 : i64
%73 = llvm.getelementptr %68[%72] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
%74 = llvm.load %73 : !llvm.ptr<1> -> f32
%75 = llvm.insertelement %74, %3[%2 : i32] : vector<1xf32>
%76 = llvm.shufflevector %75, %3 [0] : vector<1xf32>
%77 = llvm.getelementptr %arg0[16] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f32
%78 = llvm.mul %47, %5 : i64
%79 = llvm.mul %7, %9 : i64
%80 = llvm.add %78, %79 : i64
%81 = llvm.add %80, %67 : i64
%82 = llvm.getelementptr %77[%81] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
%83 = llvm.load %82 : !llvm.ptr<1> -> f32
%84 = llvm.insertelement %83, %3[%2 : i32] : vector<1xf32>
%85 = llvm.shufflevector %84, %3 [0] : vector<1xf32>
%86 = llvm.getelementptr %arg0[16] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f32
%87 = llvm.mul %47, %5 : i64
%88 = llvm.mul %11, %9 : i64
%89 = llvm.add %87, %88 : i64
%90 = llvm.add %89, %67 : i64
%91 = llvm.getelementptr %86[%90] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
%92 = llvm.load %91 : !llvm.ptr<1> -> f32
%93 = llvm.insertelement %92, %3[%2 : i32] : vector<1xf32>
%94 = llvm.shufflevector %93, %3 [0] : vector<1xf32>
%95 = llvm.getelementptr %arg0[16] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f32
%96 = llvm.mul %47, %5 : i64
%97 = llvm.mul %12, %9 : i64
%98 = llvm.add %96, %97 : i64
%99 = llvm.add %98, %67 : i64
%100 = llvm.getelementptr %95[%99] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
%101 = llvm.load %100 : !llvm.ptr<1> -> f32
%102 = llvm.insertelement %101, %3[%2 : i32] : vector<1xf32>
%103 = llvm.shufflevector %102, %3 [0] : vector<1xf32>
%104 = llvm.add %56, %64 : i64
%105 = llvm.mul %47, %9 : i64
%106 = llvm.add %105, %104 : i64
%107 = llvm.getelementptr %arg1[%106] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
%108 = llvm.load %107 : !llvm.ptr<1> -> f32
%109 = llvm.insertelement %108, %3[%2 : i32] : vector<1xf32>
%110 = llvm.shufflevector %109, %3 [0] : vector<1xf32>
%111 = llvm.extractvalue %28[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%112 = llvm.fcmp "uge" %76, %111 : vector<1xf32>
%113 = llvm.extractvalue %28[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%114 = llvm.fcmp "uge" %85, %113 : vector<1xf32>
%115 = llvm.extractvalue %28[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%116 = llvm.fcmp "uge" %94, %115 : vector<1xf32>
%117 = llvm.extractvalue %28[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%118 = llvm.fcmp "uge" %103, %117 : vector<1xf32>
%119 = llvm.extractvalue %28[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%120 = llvm.select %112, %76, %119 : vector<1xi1>, vector<1xf32>
%121 = llvm.extractvalue %28[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%122 = llvm.select %114, %85, %121 : vector<1xi1>, vector<1xf32>
%123 = llvm.extractvalue %28[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%124 = llvm.select %116, %94, %123 : vector<1xi1>, vector<1xf32>
%125 = llvm.extractvalue %28[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%126 = llvm.select %118, %103, %125 : vector<1xi1>, vector<1xf32>
%127 = llvm.extractvalue %27[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%128 = llvm.fcmp "ule" %120, %127 : vector<1xf32>
%129 = llvm.extractvalue %27[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%130 = llvm.fcmp "ule" %122, %129 : vector<1xf32>
%131 = llvm.extractvalue %27[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%132 = llvm.fcmp "ule" %124, %131 : vector<1xf32>
%133 = llvm.extractvalue %27[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%134 = llvm.fcmp "ule" %126, %133 : vector<1xf32>
%135 = llvm.extractvalue %27[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%136 = llvm.select %128, %120, %135 : vector<1xi1>, vector<1xf32>
%137 = llvm.insertvalue %136, %1[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%138 = llvm.extractvalue %27[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%139 = llvm.select %130, %122, %138 : vector<1xi1>, vector<1xf32>
%140 = llvm.insertvalue %139, %137[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%141 = llvm.extractvalue %27[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%142 = llvm.select %132, %124, %141 : vector<1xi1>, vector<1xf32>
%143 = llvm.insertvalue %142, %140[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%144 = llvm.extractvalue %27[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%145 = llvm.select %134, %126, %144 : vector<1xi1>, vector<1xf32>
%146 = llvm.insertvalue %145, %143[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%147 = builtin.unrealized_conversion_cast %146 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32>
%148 = math.fma %147, %37, %38 : vector<1x4x1xf32>
%149 = builtin.unrealized_conversion_cast %148 : vector<1x4x1xf32> to !llvm.array<1 x array<4 x vector<1xf32>>>
%150 = llvm.extractvalue %149[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%151 = llvm.extractelement %150[%0 : i64] : vector<1xf32>
%152 = llvm.call @__nv_floorf(%151) : (f32) -> f32
%153 = llvm.extractvalue %10[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%154 = llvm.insertelement %152, %153[%0 : i64] : vector<1xf32>
%155 = llvm.extractvalue %149[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%156 = llvm.extractelement %155[%0 : i64] : vector<1xf32>
%157 = llvm.call @__nv_floorf(%156) : (f32) -> f32
%158 = llvm.extractvalue %10[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%159 = llvm.insertelement %157, %158[%0 : i64] : vector<1xf32>
%160 = llvm.extractvalue %149[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%161 = llvm.extractelement %160[%0 : i64] : vector<1xf32>
%162 = llvm.call @__nv_floorf(%161) : (f32) -> f32
%163 = llvm.extractvalue %10[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%164 = llvm.insertelement %162, %163[%0 : i64] : vector<1xf32>
%165 = llvm.extractvalue %149[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%166 = llvm.extractelement %165[%0 : i64] : vector<1xf32>
%167 = llvm.call @__nv_floorf(%166) : (f32) -> f32
%168 = llvm.extractvalue %10[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%169 = llvm.insertelement %167, %168[%0 : i64] : vector<1xf32>
%170 = llvm.extractvalue %26[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%171 = llvm.fcmp "uge" %154, %170 : vector<1xf32>
%172 = llvm.extractvalue %26[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%173 = llvm.fcmp "uge" %159, %172 : vector<1xf32>
%174 = llvm.extractvalue %26[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%175 = llvm.fcmp "uge" %164, %174 : vector<1xf32>
%176 = llvm.extractvalue %26[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%177 = llvm.fcmp "uge" %169, %176 : vector<1xf32>
%178 = llvm.extractvalue %26[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%179 = llvm.select %171, %154, %178 : vector<1xi1>, vector<1xf32>
%180 = llvm.extractvalue %26[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%181 = llvm.select %173, %159, %180 : vector<1xi1>, vector<1xf32>
%182 = llvm.extractvalue %26[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%183 = llvm.select %175, %164, %182 : vector<1xi1>, vector<1xf32>
%184 = llvm.extractvalue %26[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%185 = llvm.select %177, %169, %184 : vector<1xi1>, vector<1xf32>
%186 = llvm.extractvalue %25[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%187 = llvm.fcmp "ule" %179, %186 : vector<1xf32>
%188 = llvm.extractvalue %25[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%189 = llvm.fcmp "ule" %181, %188 : vector<1xf32>
%190 = llvm.extractvalue %25[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%191 = llvm.fcmp "ule" %183, %190 : vector<1xf32>
%192 = llvm.extractvalue %25[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%193 = llvm.fcmp "ule" %185, %192 : vector<1xf32>
%194 = llvm.extractvalue %25[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%195 = llvm.select %187, %179, %194 : vector<1xi1>, vector<1xf32>
%196 = llvm.insertvalue %195, %1[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%197 = llvm.extractvalue %25[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%198 = llvm.select %189, %181, %197 : vector<1xi1>, vector<1xf32>
%199 = llvm.insertvalue %198, %196[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%200 = llvm.extractvalue %25[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%201 = llvm.select %191, %183, %200 : vector<1xi1>, vector<1xf32>
%202 = llvm.insertvalue %201, %199[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%203 = llvm.extractvalue %25[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%204 = llvm.select %193, %185, %203 : vector<1xi1>, vector<1xf32>
%205 = llvm.insertvalue %204, %202[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%206 = builtin.unrealized_conversion_cast %205 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32>
%207 = math.fma %36, %206, %147 : vector<1x4x1xf32>
%208 = math.fma %35, %206, %207 : vector<1x4x1xf32>
%209 = builtin.unrealized_conversion_cast %208 : vector<1x4x1xf32> to !llvm.array<1 x array<4 x vector<1xf32>>>
%210 = math.fma %208, %34, %33 : vector<1x4x1xf32>
%211 = math.fma %210, %208, %32 : vector<1x4x1xf32>
%212 = math.fma %211, %208, %31 : vector<1x4x1xf32>
%213 = math.fma %212, %208, %30 : vector<1x4x1xf32>
%214 = math.fma %213, %208, %38 : vector<1x4x1xf32>
%215 = llvm.extractvalue %209[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%216 = llvm.extractvalue %209[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%217 = llvm.fmul %215, %216 : vector<1xf32>
%218 = llvm.insertvalue %217, %1[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%219 = llvm.extractvalue %209[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%220 = llvm.extractvalue %209[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%221 = llvm.fmul %219, %220 : vector<1xf32>
%222 = llvm.insertvalue %221, %218[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%223 = llvm.extractvalue %209[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%224 = llvm.extractvalue %209[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%225 = llvm.fmul %223, %224 : vector<1xf32>
%226 = llvm.insertvalue %225, %222[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%227 = llvm.extractvalue %209[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%228 = llvm.extractvalue %209[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%229 = llvm.fmul %227, %228 : vector<1xf32>
%230 = llvm.insertvalue %229, %226[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%231 = builtin.unrealized_conversion_cast %230 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32>
%232 = math.fma %214, %231, %208 : vector<1x4x1xf32>
%233 = builtin.unrealized_conversion_cast %232 : vector<1x4x1xf32> to !llvm.array<1 x array<4 x vector<1xf32>>>
%234 = llvm.extractvalue %233[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%235 = llvm.extractvalue %15[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>>
%236 = llvm.fadd %234, %235 : vector<1xf32>
%237 = llvm.extractvalue %233[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%238 = llvm.extractvalue %15[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>>
%239 = llvm.fadd %237, %238 : vector<1xf32>
%240 = llvm.extractvalue %233[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%241 = llvm.extractvalue %15[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>>
%242 = llvm.fadd %240, %241 : vector<1xf32>
%243 = llvm.extractvalue %233[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%244 = llvm.extractvalue %15[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>>
%245 = llvm.fadd %243, %244 : vector<1xf32>
%246 = llvm.fptosi %195 : vector<1xf32> to vector<1xi32>
%247 = llvm.fptosi %198 : vector<1xf32> to vector<1xi32>
%248 = llvm.fptosi %201 : vector<1xf32> to vector<1xi32>
%249 = llvm.fptosi %204 : vector<1xf32> to vector<1xi32>
%250 = llvm.extractvalue %23[0, 0] : !llvm.array<1 x array<4 x vector<1xi32>>>
%251 = llvm.add %246, %250 : vector<1xi32>
%252 = llvm.extractvalue %23[0, 1] : !llvm.array<1 x array<4 x vector<1xi32>>>
%253 = llvm.add %247, %252 : vector<1xi32>
%254 = llvm.extractvalue %23[0, 2] : !llvm.array<1 x array<4 x vector<1xi32>>>
%255 = llvm.add %248, %254 : vector<1xi32>
%256 = llvm.extractvalue %23[0, 3] : !llvm.array<1 x array<4 x vector<1xi32>>>
%257 = llvm.add %249, %256 : vector<1xi32>
%258 = llvm.extractvalue %24[0, 0] : !llvm.array<1 x array<4 x vector<1xi32>>>
%259 = llvm.shl %251, %258 : vector<1xi32>
%260 = llvm.extractvalue %24[0, 1] : !llvm.array<1 x array<4 x vector<1xi32>>>
%261 = llvm.shl %253, %260 : vector<1xi32>
%262 = llvm.extractvalue %24[0, 2] : !llvm.array<1 x array<4 x vector<1xi32>>>
%263 = llvm.shl %255, %262 : vector<1xi32>
%264 = llvm.extractvalue %24[0, 3] : !llvm.array<1 x array<4 x vector<1xi32>>>
%265 = llvm.shl %257, %264 : vector<1xi32>
%266 = llvm.bitcast %259 : vector<1xi32> to vector<1xf32>
%267 = llvm.bitcast %261 : vector<1xi32> to vector<1xf32>
%268 = llvm.bitcast %263 : vector<1xi32> to vector<1xf32>
%269 = llvm.bitcast %265 : vector<1xi32> to vector<1xf32>
%270 = llvm.fmul %236, %266 : vector<1xf32>
%271 = llvm.fmul %239, %267 : vector<1xf32>
%272 = llvm.fmul %242, %268 : vector<1xf32>
%273 = llvm.fmul %245, %269 : vector<1xf32>
%274 = llvm.fadd %270, %110 : vector<1xf32>
%275 = llvm.fadd %271, %274 : vector<1xf32>
%276 = llvm.fadd %272, %275 : vector<1xf32>
%277 = llvm.fadd %273, %276 : vector<1xf32>
%278 = llvm.extractelement %277[%0 : i64] : vector<1xf32>
%279 = llvm.mul %47, %9 : i64
%280 = llvm.add %279, %104 : i64
%281 = llvm.getelementptr %arg1[%280] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
llvm.store %278, %281 : f32, !llvm.ptr<1>
%282 = llvm.add %64, %7 : i64
llvm.br ^bb3(%282 : i64)
^bb5: // pred: ^bb3
nvvm.barrier0
llvm.return
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment