|
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152>>}> |
|
#translation = #iree_codegen.translation_info<None workgroup_size = [1, 64, 1] subgroup_size = 32> |
|
module attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} { |
|
llvm.func @__nv_floorf(f32) -> f32 |
|
llvm.func @test_logsoftmax_axis_1_expanded$async_dispatch_2_generic_3x5x4_f32(%arg0: !llvm.ptr<1> {llvm.align = 16 : i32, llvm.noalias, llvm.readonly}, %arg1: !llvm.ptr<1> {llvm.align = 16 : i32, llvm.noalias}) attributes {translation_info = #translation} { |
|
%0 = llvm.mlir.constant(0 : i64) : i64 |
|
%1 = llvm.mlir.undef : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%2 = llvm.mlir.constant(0 : i32) : i32 |
|
%3 = llvm.mlir.undef : vector<1xf32> |
|
%4 = llvm.mlir.constant(63 : index) : i64 |
|
%5 = llvm.mlir.constant(20 : index) : i64 |
|
%6 = llvm.mlir.constant(64 : index) : i64 |
|
%7 = llvm.mlir.constant(1 : index) : i64 |
|
%8 = llvm.mlir.constant(0 : index) : i64 |
|
%9 = llvm.mlir.constant(5 : index) : i64 |
|
%10 = llvm.mlir.constant(dense<0.000000e+00> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%11 = llvm.mlir.constant(2 : index) : i64 |
|
%12 = llvm.mlir.constant(3 : index) : i64 |
|
%13 = llvm.mlir.constant(0.000000e+00 : f32) : f32 |
|
%14 = llvm.mlir.constant(dense<5.000000e-01> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%15 = llvm.mlir.constant(dense<1.000000e+00> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%16 = llvm.mlir.constant(dense<1.44269502> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%17 = llvm.mlir.constant(dense<-0.693359375> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%18 = llvm.mlir.constant(dense<2.12194442E-4> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%19 = llvm.mlir.constant(dense<1.98756912E-4> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%20 = llvm.mlir.constant(dense<0.00139819994> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%21 = llvm.mlir.constant(dense<0.00833345205> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%22 = llvm.mlir.constant(dense<0.0416657962> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%23 = llvm.mlir.constant(dense<127> : vector<1x4x1xi32>) : !llvm.array<1 x array<4 x vector<1xi32>>> |
|
%24 = llvm.mlir.constant(dense<23> : vector<1x4x1xi32>) : !llvm.array<1 x array<4 x vector<1xi32>>> |
|
%25 = llvm.mlir.constant(dense<1.270000e+02> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%26 = llvm.mlir.constant(dense<-1.270000e+02> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%27 = llvm.mlir.constant(dense<8.880000e+01> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%28 = llvm.mlir.constant(dense<-8.780000e+01> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%29 = llvm.mlir.constant(dense<0.166666657> : vector<1x4x1xf32>) : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%30 = builtin.unrealized_conversion_cast %29 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32> |
|
%31 = builtin.unrealized_conversion_cast %22 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32> |
|
%32 = builtin.unrealized_conversion_cast %21 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32> |
|
%33 = builtin.unrealized_conversion_cast %20 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32> |
|
%34 = builtin.unrealized_conversion_cast %19 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32> |
|
%35 = builtin.unrealized_conversion_cast %18 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32> |
|
%36 = builtin.unrealized_conversion_cast %17 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32> |
|
%37 = builtin.unrealized_conversion_cast %16 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32> |
|
%38 = builtin.unrealized_conversion_cast %14 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32> |
|
%39 = llvm.getelementptr %arg0[16] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f32 |
|
%40 = llvm.ptrtoint %39 : !llvm.ptr<1> to i64 |
|
%41 = llvm.and %40, %4 : i64 |
|
%42 = llvm.icmp "eq" %41, %8 : i64 |
|
"llvm.intr.assume"(%42) : (i1) -> () |
|
%43 = llvm.ptrtoint %arg1 : !llvm.ptr<1> to i64 |
|
%44 = llvm.and %43, %4 : i64 |
|
%45 = llvm.icmp "eq" %44, %8 : i64 |
|
"llvm.intr.assume"(%45) : (i1) -> () |
|
%46 = nvvm.read.ptx.sreg.ctaid.x : i32 |
|
%47 = llvm.sext %46 : i32 to i64 |
|
%48 = nvvm.read.ptx.sreg.ctaid.y : i32 |
|
%49 = llvm.sext %48 : i32 to i64 |
|
%50 = llvm.mul %49, %6 : i64 |
|
%51 = nvvm.read.ptx.sreg.tid.y : i32 |
|
%52 = llvm.sext %51 : i32 to i64 |
|
%53 = llvm.sub %9, %52 : i64 |
|
%54 = llvm.intr.smin(%53, %7) : (i64, i64) -> i64 |
|
%55 = llvm.intr.smax(%54, %8) : (i64, i64) -> i64 |
|
%56 = llvm.add %50, %52 : i64 |
|
llvm.br ^bb1(%8 : i64) |
|
^bb1(%57: i64): // 2 preds: ^bb0, ^bb2 |
|
%58 = llvm.icmp "slt" %57, %55 : i64 |
|
llvm.cond_br %58, ^bb2, ^bb3(%8 : i64) |
|
^bb2: // pred: ^bb1 |
|
%59 = llvm.add %56, %57 : i64 |
|
%60 = llvm.mul %47, %9 : i64 |
|
%61 = llvm.add %60, %59 : i64 |
|
%62 = llvm.getelementptr %arg1[%61] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32 |
|
llvm.store %13, %62 : f32, !llvm.ptr<1> |
|
%63 = llvm.add %57, %7 : i64 |
|
llvm.br ^bb1(%63 : i64) |
|
^bb3(%64: i64): // 2 preds: ^bb1, ^bb4 |
|
%65 = llvm.icmp "slt" %64, %55 : i64 |
|
llvm.cond_br %65, ^bb4, ^bb5 |
|
^bb4: // pred: ^bb3 |
|
%66 = llvm.add %52, %64 : i64 |
|
%67 = llvm.add %66, %50 : i64 |
|
%68 = llvm.getelementptr %arg0[16] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f32 |
|
%69 = llvm.mul %47, %5 : i64 |
|
%70 = llvm.mul %8, %9 : i64 |
|
%71 = llvm.add %69, %70 : i64 |
|
%72 = llvm.add %71, %67 : i64 |
|
%73 = llvm.getelementptr %68[%72] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32 |
|
%74 = llvm.load %73 : !llvm.ptr<1> -> f32 |
|
%75 = llvm.insertelement %74, %3[%2 : i32] : vector<1xf32> |
|
%76 = llvm.shufflevector %75, %3 [0] : vector<1xf32> |
|
%77 = llvm.getelementptr %arg0[16] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f32 |
|
%78 = llvm.mul %47, %5 : i64 |
|
%79 = llvm.mul %7, %9 : i64 |
|
%80 = llvm.add %78, %79 : i64 |
|
%81 = llvm.add %80, %67 : i64 |
|
%82 = llvm.getelementptr %77[%81] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32 |
|
%83 = llvm.load %82 : !llvm.ptr<1> -> f32 |
|
%84 = llvm.insertelement %83, %3[%2 : i32] : vector<1xf32> |
|
%85 = llvm.shufflevector %84, %3 [0] : vector<1xf32> |
|
%86 = llvm.getelementptr %arg0[16] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f32 |
|
%87 = llvm.mul %47, %5 : i64 |
|
%88 = llvm.mul %11, %9 : i64 |
|
%89 = llvm.add %87, %88 : i64 |
|
%90 = llvm.add %89, %67 : i64 |
|
%91 = llvm.getelementptr %86[%90] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32 |
|
%92 = llvm.load %91 : !llvm.ptr<1> -> f32 |
|
%93 = llvm.insertelement %92, %3[%2 : i32] : vector<1xf32> |
|
%94 = llvm.shufflevector %93, %3 [0] : vector<1xf32> |
|
%95 = llvm.getelementptr %arg0[16] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f32 |
|
%96 = llvm.mul %47, %5 : i64 |
|
%97 = llvm.mul %12, %9 : i64 |
|
%98 = llvm.add %96, %97 : i64 |
|
%99 = llvm.add %98, %67 : i64 |
|
%100 = llvm.getelementptr %95[%99] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32 |
|
%101 = llvm.load %100 : !llvm.ptr<1> -> f32 |
|
%102 = llvm.insertelement %101, %3[%2 : i32] : vector<1xf32> |
|
%103 = llvm.shufflevector %102, %3 [0] : vector<1xf32> |
|
%104 = llvm.add %56, %64 : i64 |
|
%105 = llvm.mul %47, %9 : i64 |
|
%106 = llvm.add %105, %104 : i64 |
|
%107 = llvm.getelementptr %arg1[%106] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32 |
|
%108 = llvm.load %107 : !llvm.ptr<1> -> f32 |
|
%109 = llvm.insertelement %108, %3[%2 : i32] : vector<1xf32> |
|
%110 = llvm.shufflevector %109, %3 [0] : vector<1xf32> |
|
%111 = llvm.extractvalue %28[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%112 = llvm.fcmp "uge" %76, %111 : vector<1xf32> |
|
%113 = llvm.extractvalue %28[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%114 = llvm.fcmp "uge" %85, %113 : vector<1xf32> |
|
%115 = llvm.extractvalue %28[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%116 = llvm.fcmp "uge" %94, %115 : vector<1xf32> |
|
%117 = llvm.extractvalue %28[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%118 = llvm.fcmp "uge" %103, %117 : vector<1xf32> |
|
%119 = llvm.extractvalue %28[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%120 = llvm.select %112, %76, %119 : vector<1xi1>, vector<1xf32> |
|
%121 = llvm.extractvalue %28[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%122 = llvm.select %114, %85, %121 : vector<1xi1>, vector<1xf32> |
|
%123 = llvm.extractvalue %28[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%124 = llvm.select %116, %94, %123 : vector<1xi1>, vector<1xf32> |
|
%125 = llvm.extractvalue %28[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%126 = llvm.select %118, %103, %125 : vector<1xi1>, vector<1xf32> |
|
%127 = llvm.extractvalue %27[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%128 = llvm.fcmp "ule" %120, %127 : vector<1xf32> |
|
%129 = llvm.extractvalue %27[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%130 = llvm.fcmp "ule" %122, %129 : vector<1xf32> |
|
%131 = llvm.extractvalue %27[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%132 = llvm.fcmp "ule" %124, %131 : vector<1xf32> |
|
%133 = llvm.extractvalue %27[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%134 = llvm.fcmp "ule" %126, %133 : vector<1xf32> |
|
%135 = llvm.extractvalue %27[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%136 = llvm.select %128, %120, %135 : vector<1xi1>, vector<1xf32> |
|
%137 = llvm.insertvalue %136, %1[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%138 = llvm.extractvalue %27[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%139 = llvm.select %130, %122, %138 : vector<1xi1>, vector<1xf32> |
|
%140 = llvm.insertvalue %139, %137[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%141 = llvm.extractvalue %27[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%142 = llvm.select %132, %124, %141 : vector<1xi1>, vector<1xf32> |
|
%143 = llvm.insertvalue %142, %140[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%144 = llvm.extractvalue %27[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%145 = llvm.select %134, %126, %144 : vector<1xi1>, vector<1xf32> |
|
%146 = llvm.insertvalue %145, %143[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%147 = builtin.unrealized_conversion_cast %146 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32> |
|
%148 = math.fma %147, %37, %38 : vector<1x4x1xf32> |
|
%149 = builtin.unrealized_conversion_cast %148 : vector<1x4x1xf32> to !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%150 = llvm.extractvalue %149[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%151 = llvm.extractelement %150[%0 : i64] : vector<1xf32> |
|
%152 = llvm.call @__nv_floorf(%151) : (f32) -> f32 |
|
%153 = llvm.extractvalue %10[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%154 = llvm.insertelement %152, %153[%0 : i64] : vector<1xf32> |
|
%155 = llvm.extractvalue %149[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%156 = llvm.extractelement %155[%0 : i64] : vector<1xf32> |
|
%157 = llvm.call @__nv_floorf(%156) : (f32) -> f32 |
|
%158 = llvm.extractvalue %10[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%159 = llvm.insertelement %157, %158[%0 : i64] : vector<1xf32> |
|
%160 = llvm.extractvalue %149[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%161 = llvm.extractelement %160[%0 : i64] : vector<1xf32> |
|
%162 = llvm.call @__nv_floorf(%161) : (f32) -> f32 |
|
%163 = llvm.extractvalue %10[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%164 = llvm.insertelement %162, %163[%0 : i64] : vector<1xf32> |
|
%165 = llvm.extractvalue %149[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%166 = llvm.extractelement %165[%0 : i64] : vector<1xf32> |
|
%167 = llvm.call @__nv_floorf(%166) : (f32) -> f32 |
|
%168 = llvm.extractvalue %10[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%169 = llvm.insertelement %167, %168[%0 : i64] : vector<1xf32> |
|
%170 = llvm.extractvalue %26[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%171 = llvm.fcmp "uge" %154, %170 : vector<1xf32> |
|
%172 = llvm.extractvalue %26[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%173 = llvm.fcmp "uge" %159, %172 : vector<1xf32> |
|
%174 = llvm.extractvalue %26[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%175 = llvm.fcmp "uge" %164, %174 : vector<1xf32> |
|
%176 = llvm.extractvalue %26[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%177 = llvm.fcmp "uge" %169, %176 : vector<1xf32> |
|
%178 = llvm.extractvalue %26[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%179 = llvm.select %171, %154, %178 : vector<1xi1>, vector<1xf32> |
|
%180 = llvm.extractvalue %26[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%181 = llvm.select %173, %159, %180 : vector<1xi1>, vector<1xf32> |
|
%182 = llvm.extractvalue %26[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%183 = llvm.select %175, %164, %182 : vector<1xi1>, vector<1xf32> |
|
%184 = llvm.extractvalue %26[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%185 = llvm.select %177, %169, %184 : vector<1xi1>, vector<1xf32> |
|
%186 = llvm.extractvalue %25[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%187 = llvm.fcmp "ule" %179, %186 : vector<1xf32> |
|
%188 = llvm.extractvalue %25[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%189 = llvm.fcmp "ule" %181, %188 : vector<1xf32> |
|
%190 = llvm.extractvalue %25[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%191 = llvm.fcmp "ule" %183, %190 : vector<1xf32> |
|
%192 = llvm.extractvalue %25[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%193 = llvm.fcmp "ule" %185, %192 : vector<1xf32> |
|
%194 = llvm.extractvalue %25[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%195 = llvm.select %187, %179, %194 : vector<1xi1>, vector<1xf32> |
|
%196 = llvm.insertvalue %195, %1[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%197 = llvm.extractvalue %25[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%198 = llvm.select %189, %181, %197 : vector<1xi1>, vector<1xf32> |
|
%199 = llvm.insertvalue %198, %196[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%200 = llvm.extractvalue %25[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%201 = llvm.select %191, %183, %200 : vector<1xi1>, vector<1xf32> |
|
%202 = llvm.insertvalue %201, %199[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%203 = llvm.extractvalue %25[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%204 = llvm.select %193, %185, %203 : vector<1xi1>, vector<1xf32> |
|
%205 = llvm.insertvalue %204, %202[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%206 = builtin.unrealized_conversion_cast %205 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32> |
|
%207 = math.fma %36, %206, %147 : vector<1x4x1xf32> |
|
%208 = math.fma %35, %206, %207 : vector<1x4x1xf32> |
|
%209 = builtin.unrealized_conversion_cast %208 : vector<1x4x1xf32> to !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%210 = math.fma %208, %34, %33 : vector<1x4x1xf32> |
|
%211 = math.fma %210, %208, %32 : vector<1x4x1xf32> |
|
%212 = math.fma %211, %208, %31 : vector<1x4x1xf32> |
|
%213 = math.fma %212, %208, %30 : vector<1x4x1xf32> |
|
%214 = math.fma %213, %208, %38 : vector<1x4x1xf32> |
|
%215 = llvm.extractvalue %209[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%216 = llvm.extractvalue %209[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%217 = llvm.fmul %215, %216 : vector<1xf32> |
|
%218 = llvm.insertvalue %217, %1[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%219 = llvm.extractvalue %209[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%220 = llvm.extractvalue %209[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%221 = llvm.fmul %219, %220 : vector<1xf32> |
|
%222 = llvm.insertvalue %221, %218[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%223 = llvm.extractvalue %209[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%224 = llvm.extractvalue %209[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%225 = llvm.fmul %223, %224 : vector<1xf32> |
|
%226 = llvm.insertvalue %225, %222[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%227 = llvm.extractvalue %209[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%228 = llvm.extractvalue %209[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%229 = llvm.fmul %227, %228 : vector<1xf32> |
|
%230 = llvm.insertvalue %229, %226[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%231 = builtin.unrealized_conversion_cast %230 : !llvm.array<1 x array<4 x vector<1xf32>>> to vector<1x4x1xf32> |
|
%232 = math.fma %214, %231, %208 : vector<1x4x1xf32> |
|
%233 = builtin.unrealized_conversion_cast %232 : vector<1x4x1xf32> to !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%234 = llvm.extractvalue %233[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%235 = llvm.extractvalue %15[0, 0] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%236 = llvm.fadd %234, %235 : vector<1xf32> |
|
%237 = llvm.extractvalue %233[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%238 = llvm.extractvalue %15[0, 1] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%239 = llvm.fadd %237, %238 : vector<1xf32> |
|
%240 = llvm.extractvalue %233[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%241 = llvm.extractvalue %15[0, 2] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%242 = llvm.fadd %240, %241 : vector<1xf32> |
|
%243 = llvm.extractvalue %233[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%244 = llvm.extractvalue %15[0, 3] : !llvm.array<1 x array<4 x vector<1xf32>>> |
|
%245 = llvm.fadd %243, %244 : vector<1xf32> |
|
%246 = llvm.fptosi %195 : vector<1xf32> to vector<1xi32> |
|
%247 = llvm.fptosi %198 : vector<1xf32> to vector<1xi32> |
|
%248 = llvm.fptosi %201 : vector<1xf32> to vector<1xi32> |
|
%249 = llvm.fptosi %204 : vector<1xf32> to vector<1xi32> |
|
%250 = llvm.extractvalue %23[0, 0] : !llvm.array<1 x array<4 x vector<1xi32>>> |
|
%251 = llvm.add %246, %250 : vector<1xi32> |
|
%252 = llvm.extractvalue %23[0, 1] : !llvm.array<1 x array<4 x vector<1xi32>>> |
|
%253 = llvm.add %247, %252 : vector<1xi32> |
|
%254 = llvm.extractvalue %23[0, 2] : !llvm.array<1 x array<4 x vector<1xi32>>> |
|
%255 = llvm.add %248, %254 : vector<1xi32> |
|
%256 = llvm.extractvalue %23[0, 3] : !llvm.array<1 x array<4 x vector<1xi32>>> |
|
%257 = llvm.add %249, %256 : vector<1xi32> |
|
%258 = llvm.extractvalue %24[0, 0] : !llvm.array<1 x array<4 x vector<1xi32>>> |
|
%259 = llvm.shl %251, %258 : vector<1xi32> |
|
%260 = llvm.extractvalue %24[0, 1] : !llvm.array<1 x array<4 x vector<1xi32>>> |
|
%261 = llvm.shl %253, %260 : vector<1xi32> |
|
%262 = llvm.extractvalue %24[0, 2] : !llvm.array<1 x array<4 x vector<1xi32>>> |
|
%263 = llvm.shl %255, %262 : vector<1xi32> |
|
%264 = llvm.extractvalue %24[0, 3] : !llvm.array<1 x array<4 x vector<1xi32>>> |
|
%265 = llvm.shl %257, %264 : vector<1xi32> |
|
%266 = llvm.bitcast %259 : vector<1xi32> to vector<1xf32> |
|
%267 = llvm.bitcast %261 : vector<1xi32> to vector<1xf32> |
|
%268 = llvm.bitcast %263 : vector<1xi32> to vector<1xf32> |
|
%269 = llvm.bitcast %265 : vector<1xi32> to vector<1xf32> |
|
%270 = llvm.fmul %236, %266 : vector<1xf32> |
|
%271 = llvm.fmul %239, %267 : vector<1xf32> |
|
%272 = llvm.fmul %242, %268 : vector<1xf32> |
|
%273 = llvm.fmul %245, %269 : vector<1xf32> |
|
%274 = llvm.fadd %270, %110 : vector<1xf32> |
|
%275 = llvm.fadd %271, %274 : vector<1xf32> |
|
%276 = llvm.fadd %272, %275 : vector<1xf32> |
|
%277 = llvm.fadd %273, %276 : vector<1xf32> |
|
%278 = llvm.extractelement %277[%0 : i64] : vector<1xf32> |
|
%279 = llvm.mul %47, %9 : i64 |
|
%280 = llvm.add %279, %104 : i64 |
|
%281 = llvm.getelementptr %arg1[%280] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32 |
|
llvm.store %278, %281 : f32, !llvm.ptr<1> |
|
%282 = llvm.add %64, %7 : i64 |
|
llvm.br ^bb3(%282 : i64) |
|
^bb5: // pred: ^bb3 |
|
nvvm.barrier0 |
|
llvm.return |
|
} |
|
} |