Created
March 7, 2022 23:59
-
-
Save zxybazh/c90761de0ce986fd38eae56b2f5b560e to your computer and use it in GitHub Desktop.
Tune_relay cuda tuning log.
This file has been truncated, but you can view the full file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INFO:tvm.meta_schedule.tune:Working directory: /tmp/tmppqhiwht0 | |
INFO:tvm.meta_schedule.tune:Before task deduplication: 20 tasks | |
INFO:tvm.meta_schedule.tune:After task deduplication: 20 tasks | |
INFO:tvm.meta_schedule.builder.local_builder:LocalBuilder: max_workers = 24 | |
INFO:tvm.meta_schedule.runner.local_runner:LocalRunner: max_workers = 1 | |
[14:36:09] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #0: "vm_mod_fused_layout_transform", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 3, 224, 224), "float32"], T_layout_trans: T.Buffer[(1, 224, 224, 3), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
for i0, i1, i2, i3 in T.grid(1, 224, 224, 3): | |
with T.block("T_layout_trans"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder[ax0, ax3, ax1, ax2]) | |
T.writes(T_layout_trans[ax0, ax1, ax2, ax3]) | |
T_layout_trans[ax0, ax1, ax2, ax3] = T.if_then_else(ax0 < 1 and ax3 < 3 and ax1 < 224 and ax2 < 224, placeholder[ax0, ax3, ax1, ax2], T.float32(0), dtype="float32") | |
[14:36:09] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:09] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 3, 224, 224), "float32"], T_layout_trans: T.Buffer[(1, 224, 224, 3), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":16}) | |
for i0, i1, i2, i3 in T.grid(1, 224, 224, 3): | |
with T.block("T_layout_trans"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder[ax0, ax3, ax1, ax2]) | |
T.writes(T_layout_trans[ax0, ax1, ax2, ax3]) | |
T_layout_trans[ax0, ax1, ax2, ax3] = T.if_then_else(ax0 < 1 and ax3 < 3 and ax1 < 224 and ax2 < 224, placeholder[ax0, ax3, ax1, ax2], T.float32(0), dtype="float32") | |
b0 = sch.get_block(name="root", func_name="main") | |
v1 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=1) | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.unroll_explicit", ann_val=v1) | |
[14:36:09] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_1: T.Buffer[(7, 7, 3, 64), "float32"], placeholder_2: T.Buffer[(1, 224, 224, 3), "float32"], T_relu: T.Buffer[(1, 112, 112, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
pad_temp = T.alloc_buffer([1, 230, 230, 3], dtype="float32") | |
conv2d_nhwc = T.alloc_buffer([1, 112, 112, 64], dtype="float32") | |
T_add = T.alloc_buffer([1, 112, 112, 64], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 230, 230, 3): | |
with T.block("pad_temp"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_2[i0_1, i1_1 - 3, i2_1 - 3, i3_1]) | |
T.writes(pad_temp[i0_1, i1_1, i2_1, i3_1]) | |
pad_temp[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(3 <= i1_1 and i1_1 < 227 and 3 <= i2_1 and i2_1 < 227, placeholder_2[i0_1, i1_1 - 3, i2_1 - 3, i3_1], T.float32(0), dtype="float32") | |
for i0, i1, i2, i3, i4, i5, i6 in T.grid(1, 112, 112, 64, 7, 7, 3): | |
with T.block("conv2d_nhwc"): | |
nn, yy, xx, ff, ry, rx, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6]) | |
T.reads(conv2d_nhwc[nn, yy, xx, ff], pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_1[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc[nn, yy, xx, ff] = conv2d_nhwc[nn, yy, xx, ff] + pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_1[ry, rx, rc, ff] | |
for i0, i1, i2, i3 in T.grid(1, 112, 112, 64): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_nhwc[ax0, ax1, ax2, ax3], placeholder[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_nhwc[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 112, 112, 64): | |
with T.block("T_relu"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add[ax0, ax1, ax2, ax3]) | |
T.writes(T_relu[ax0, ax1, ax2, ax3]) | |
T_relu[ax0, ax1, ax2, ax3] = T.max(T_add[ax0, ax1, ax2, ax3], T.float32(0)) | |
[14:36:09] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:09] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_1: T.Buffer[(7, 7, 3, 64), "float32"], placeholder_2: T.Buffer[(1, 224, 224, 3), "float32"], T_relu: T.Buffer[(1, 112, 112, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":64}) | |
conv2d_nhwc_local = T.alloc_buffer([1, 112, 112, 64], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 230, 230, 3], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([7, 7, 3, 64], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(7, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(4, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(16, thread="threadIdx.x"): | |
for i4_0, i5_0, i6_0 in T.grid(1, 1, 3): | |
for ax0_ax1_ax2_ax3_fused in T.serial(8473): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(230, i0_0_i1_0_i2_0_i3_0_fused * 32 + ax0_ax1_ax2_ax3_fused % 8473 // 229) | |
v2 = T.axis.spatial(230, ax0_ax1_ax2_ax3_fused % 229) | |
v3 = T.axis.spatial(3, i6_0 + 0) | |
T.reads(placeholder_2[v0, v1 - 3, v2 - 3, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":1}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(3 <= v1 and v1 < 227 and 3 <= v2 and v2 < 227, placeholder_2[v0, v1 - 3, v2 - 3, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused in T.serial(3136): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(7, ax0_ax1_ax2_ax3_fused // 448) | |
v1 = T.axis.spatial(7, ax0_ax1_ax2_ax3_fused % 448 // 64) | |
v2 = T.axis.spatial(3, i6_0) | |
v3 = T.axis.spatial(64, ax0_ax1_ax2_ax3_fused % 64) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(7, 1, 1, 1, 1, 2, 2, 1, 7, 1, 1, 1, 28, 16): | |
with T.block("conv2d_nhwc"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused * 16 + i0_2_i1_2_i2_2_i3_2_fused) | |
xx = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_fused // 2 * 56 + i2_3 * 28 + i2_4) | |
ff = T.axis.spatial(64, i0_1_i1_1_i2_1_i3_1_fused % 2 * 32 + i3_3 * 16 + i3_4) | |
ry, rx, rc = T.axis.remap("RRR", [i4_1, i5_2, i6_0]) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 1, 56, 32): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused * 16 + i0_2_i1_2_i2_2_i3_2_fused + ax1) | |
v2 = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_fused // 2 * 56 + ax2) | |
v3 = T.axis.spatial(64, i0_1_i1_1_i2_1_i3_1_fused % 2 * 32 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[7, 1, 16, 1, 1]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[1, 2, 1, 2, 28]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[1, 2, 1, 2, 16]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[1, 7, 1]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[1, 1, 7]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[3, 1, 1]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=0) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=2) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=2) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
[14:36:09] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #2: "vm_mod_fused_nn_max_pool2d", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 112, 112, 64), "float32"], tensor: T.Buffer[(1, 56, 56, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
pad_temp = T.alloc_buffer([1, 114, 114, 64], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 114, 114, 64): | |
with T.block("pad_temp"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder[ax0, ax1 - 1, ax2 - 1, ax3]) | |
T.writes(pad_temp[ax0, ax1, ax2, ax3]) | |
pad_temp[ax0, ax1, ax2, ax3] = T.if_then_else(1 <= ax1 and ax1 < 113 and 1 <= ax2 and ax2 < 113, placeholder[ax0, ax1 - 1, ax2 - 1, ax3], T.float32(-3.4028234663852886e+38), dtype="float32") | |
for i0, i1, i2, i3, i4, i5 in T.grid(1, 56, 56, 64, 3, 3): | |
with T.block("tensor"): | |
ax0, ax1, ax2, ax3, rv0, rv1 = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(tensor[ax0, ax1, ax2, ax3], pad_temp[ax0, ax1 * 2 + rv0, ax2 * 2 + rv1, ax3]) | |
T.writes(tensor[ax0, ax1, ax2, ax3]) | |
with T.init(): | |
tensor[ax0, ax1, ax2, ax3] = T.float32(-3.4028234663852886e+38) | |
tensor[ax0, ax1, ax2, ax3] = T.max(tensor[ax0, ax1, ax2, ax3], pad_temp[ax0, ax1 * 2 + rv0, ax2 * 2 + rv1, ax3]) | |
[14:36:10] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:10] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 112, 112, 64), "float32"], tensor: T.Buffer[(1, 56, 56, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":0}) | |
for i0, i1, i2, i3, i4, i5 in T.grid(1, 56, 56, 64, 3, 3): | |
with T.block("tensor"): | |
ax0, ax1, ax2, ax3, rv0, rv1 = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(tensor[ax0, ax1, ax2, ax3], placeholder[ax0, ax1 * 2 + rv0 - 1, ax2 * 2 + rv1 - 1, ax3]) | |
T.writes(tensor[ax0, ax1, ax2, ax3]) | |
with T.init(): | |
tensor[ax0, ax1, ax2, ax3] = T.float32(-3.4028234663852886e+38) | |
tensor[ax0, ax1, ax2, ax3] = T.max(tensor[ax0, ax1, ax2, ax3], T.if_then_else(1 <= ax1 * 2 + rv0 and ax1 * 2 + rv0 < 113 and 1 <= ax2 * 2 + rv1 and ax2 * 2 + rv1 < 113, placeholder[ax0, ax1 * 2 + rv0 - 1, ax2 * 2 + rv1 - 1, ax3], T.float32(-3.4028234663852886e+38), dtype="float32")) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="root", func_name="main") | |
sch.compute_inline(block=b0) | |
v2 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=0) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.unroll_explicit", ann_val=v2) | |
[14:36:10] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_1: T.Buffer[(6, 6, 64, 64), "float32"], placeholder_2: T.Buffer[(1, 56, 56, 64), "float32"], T_relu: T.Buffer[(1, 56, 56, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
data_pad = T.alloc_buffer([1, 58, 58, 64], dtype="float32") | |
input_tile = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
B = T.alloc_buffer([6, 6], dtype="float32") | |
data_pack = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
bgemm = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
A = T.alloc_buffer([6, 4], dtype="float32") | |
inverse = T.alloc_buffer([4, 4, 196, 64], dtype="float32") | |
conv2d_winograd = T.alloc_buffer([1, 56, 56, 64], dtype="float32") | |
T_add = T.alloc_buffer([1, 56, 56, 64], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 58, 58, 64): | |
with T.block("data_pad"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) | |
T.writes(data_pad[i0_1, i1_1, i2_1, i3_1]) | |
T.block_attr({"schedule_rule":"None"}) | |
data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 57 and 1 <= i2_1 and i2_1 < 57, placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32") | |
for i0, i1, i2, i3 in T.grid(6, 6, 196, 64): | |
with T.block("input_tile"): | |
eps, nu, p, ci = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(data_pad[p // 196, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu, ci]) | |
T.writes(input_tile[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile[eps, nu, p, ci] = data_pad[p // 196, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu, ci] | |
for i0, i1 in T.grid(6, 6): | |
with T.block("B"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(B[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
B[i, j] = T.Select(i % 6 == 5 and j % 6 == 5, T.float32(1), T.Select(i % 6 == 5 and j % 6 == 4, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 3, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 2, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 1, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 0, T.float32(0), T.Select(i % 6 == 4 and j % 6 == 5, T.float32(1.5), T.Select(i % 6 == 4 and j % 6 == 4, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 3, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 2, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 1, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 0, T.float32(1), T.Select(i % 6 == 3 and j % 6 == 5, T.float32(-2), T.Select(i % 6 == 3 and j % 6 == 4, T.float32(-0.5), T.Select(i % 6 == 3 and j % 6 == 3, T.float32(2), T.Select(i % 6 == 3 and j % 6 == 2, T.float32(2.5), T.Select(i % 6 == 3 and j % 6 == 1, T.float32(0.5), T.Select(i % 6 == 3 and j % 6 == 0, T.float32(1.5), T.Select(i % 6 == 2 and j % 6 == 5, T.float32(-1.5), T.Select(i % 6 == 2 and j % 6 == 4, T.float32(-1), T.Select(i % 6 == 2 and j % 6 == 3, T.float32(-1), T.Select(i % 6 == 2 and j % 6 == 2, T.float32(0.5), T.Select(i % 6 == 2 and j % 6 == 1, T.float32(-2.5), T.Select(i % 6 == 2 and j % 6 == 0, T.float32(-2), T.Select(i % 6 == 1 and j % 6 == 5, T.float32(1), T.Select(i % 6 == 1 and j % 6 == 4, T.float32(0.5), T.Select(i % 6 == 1 and j % 6 == 3, T.float32(-2), T.Select(i % 6 == 1 and j % 6 == 2, T.float32(-1), T.Select(i % 6 == 1 and j % 6 == 1, T.float32(1), T.Select(i % 6 == 1 and j % 6 == 0, T.float32(-1.5), T.Select(i % 6 == 0 and j % 6 == 5, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 4, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 3, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 2, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 1, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(6, 6, 196, 64, 6, 6): | |
with T.block("data_pack"): | |
eps, nu, p, ci, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile[r_a, r_b, p, ci], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile[r_a, r_b, p, ci] * B[r_a, eps] * B[r_b, nu] | |
for i0, i1, i2, i3, i4 in T.grid(6, 6, 196, 64, 64): | |
with T.block("bgemm"): | |
eps, nu, p, co, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4]) | |
T.reads(bgemm[eps, nu, p, co], data_pack[eps, nu, p, ci], placeholder_1[eps, nu, co, ci]) | |
T.writes(bgemm[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1]}) | |
with T.init(): | |
bgemm[eps, nu, p, co] = T.float32(0) | |
bgemm[eps, nu, p, co] = bgemm[eps, nu, p, co] + data_pack[eps, nu, p, ci] * placeholder_1[eps, nu, co, ci] | |
for i0, i1 in T.grid(6, 4): | |
with T.block("A"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(A[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
A[i, j] = T.Select(i % 6 == 5 and j % 4 == 3, T.float32(1), T.Select(i % 6 == 5 and j % 4 == 2, T.float32(0), T.Select(i % 6 == 5 and j % 4 == 1, T.float32(0), T.Select(i % 6 == 5 and j % 4 == 0, T.float32(0), T.Select(i % 6 == 4 and j % 4 == 3, T.float32(-8), T.Select(i % 6 == 4 and j % 4 == 2, T.float32(4), T.Select(i % 6 == 4 and j % 4 == 1, T.float32(-2), T.Select(i % 6 == 4 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 3 and j % 4 == 3, T.float32(0.125), T.Select(i % 6 == 3 and j % 4 == 2, T.float32(0.25), T.Select(i % 6 == 3 and j % 4 == 1, T.float32(0.5), T.Select(i % 6 == 3 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 3, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 2, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 1, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 1 and j % 4 == 3, T.float32(-1), T.Select(i % 6 == 1 and j % 4 == 2, T.float32(1), T.Select(i % 6 == 1 and j % 4 == 1, T.float32(-1), T.Select(i % 6 == 1 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 0 and j % 4 == 3, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 2, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 1, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(4, 4, 196, 64, 6, 6): | |
with T.block("inverse"): | |
vh, vw, p, co, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * A[r_a, vh] * A[r_b, vw] | |
for i0, i1, i2, i3 in T.grid(1, 56, 56, 64): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co]) | |
T.writes(conv2d_winograd[n, h, w, co]) | |
conv2d_winograd[n, h, w, co] = inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co] | |
for i0, i1, i2, i3 in T.grid(1, 56, 56, 64): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], placeholder[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 56, 56, 64): | |
with T.block("T_relu"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add[ax0, ax1, ax2, ax3]) | |
T.writes(T_relu[ax0, ax1, ax2, ax3]) | |
T_relu[ax0, ax1, ax2, ax3] = T.max(T_add[ax0, ax1, ax2, ax3], T.float32(0)) | |
[14:36:10] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:10] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_1: T.Buffer[(6, 6, 64, 64), "float32"], placeholder_2: T.Buffer[(1, 56, 56, 64), "float32"], T_relu: T.Buffer[(1, 56, 56, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":64}) | |
input_tile_local = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="local") | |
data_pack = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
bgemm = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
inverse = T.alloc_buffer([4, 4, 196, 64], dtype="float32") | |
bgemm_local = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="local") | |
data_pack_shared = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([6, 6, 64, 64], dtype="float32", scope="shared") | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(14, 32, 14, 2): | |
for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1): | |
with T.block("input_tile"): | |
eps, nu = T.axis.remap("SS", [ax0, ax1]) | |
p = T.axis.spatial(196, i2_0 * 14 + i2_1 + ax2) | |
ci = T.axis.spatial(64, i3_0 * 2 + i3_1 + ax3) | |
T.reads(placeholder_2[p // 196, p % 196 // 14 * 4 + eps - 1, p % 14 * 4 + nu - 1, ci]) | |
T.writes(input_tile_local[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile_local[eps, nu, p, ci] = T.if_then_else(1 <= p % 196 // 14 * 4 + eps and p % 196 // 14 * 4 + eps < 57 and 1 <= p % 14 * 4 + nu and p % 14 * 4 + nu < 57, placeholder_2[p // 196, p % 196 // 14 * 4 + eps - 1, p % 14 * 4 + nu - 1, ci], T.float32(0), dtype="float32") | |
for i0 in T.unroll(6): | |
for i1 in T.unroll(6): | |
for i4 in T.unroll(6): | |
for i5 in T.unroll(6): | |
with T.block("data_pack"): | |
eps, nu = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(196, i2_0 * 14 + i2_1) | |
ci = T.axis.spatial(64, i3_0 * 2 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile_local[r_a, r_b, p, ci]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile_local[r_a, r_b, p, ci] * T.Select(r_a % 6 == 5 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 1, 
T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_b % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_b % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_b % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_b % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_b % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_b % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_b % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_b % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_b % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_b % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_b % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_b % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_b % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_b % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 1, 
T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(4, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(2, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(14, thread="threadIdx.x"): | |
for i4_0 in T.serial(64): | |
for ax0_ax1_ax2_ax3_fused in T.serial(7056): | |
with T.block("data_pack_shared"): | |
v0 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused // 1176) | |
v1 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused % 1176 // 196) | |
v2 = T.axis.spatial(196, ax0_ax1_ax2_ax3_fused % 196) | |
v3 = T.axis.spatial(64, i4_0) | |
T.reads(data_pack[v0, v1, v2, v3]) | |
T.writes(data_pack_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused in T.serial(576): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused // 96) | |
v1 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused % 96 // 16) | |
v2 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused * 16 + ax0_ax1_ax2_ax3_fused % 16) | |
v3 = T.axis.spatial(64, i4_0) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 4, 1, 1, 3, 3, 7, 16): | |
with T.block("bgemm"): | |
eps = T.axis.spatial(6, i0_1_i1_1_i2_1_i3_1_fused * 3 + i0_4) | |
nu = T.axis.spatial(6, i0_2_i1_2_i2_2_i3_2_fused // 7 * 3 + i1_4) | |
p = T.axis.spatial(196, i0_2_i1_2_i2_2_i3_2_fused % 7 * 28 + i2_3 * 7 + i2_4) | |
co = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused * 16 + i3_4) | |
ci = T.axis.reduce(64, i4_0) | |
T.reads(bgemm_local[eps, nu, p, co], data_pack_shared[eps, nu, p, ci], placeholder_shared[eps, nu, co, ci]) | |
T.writes(bgemm_local[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) | |
with T.init(): | |
bgemm_local[eps, nu, p, co] = T.float32(0) | |
bgemm_local[eps, nu, p, co] = bgemm_local[eps, nu, p, co] + data_pack_shared[eps, nu, p, ci] * placeholder_shared[eps, nu, co, ci] | |
for ax0, ax1, ax2, ax3 in T.grid(3, 3, 28, 16): | |
with T.block("bgemm_local"): | |
v0 = T.axis.spatial(6, i0_1_i1_1_i2_1_i3_1_fused * 3 + ax0) | |
v1 = T.axis.spatial(6, i0_2_i1_2_i2_2_i3_2_fused // 7 * 3 + ax1) | |
v2 = T.axis.spatial(196, i0_2_i1_2_i2_2_i3_2_fused % 7 * 28 + ax2) | |
v3 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused * 16 + ax3) | |
T.reads(bgemm_local[v0, v1, v2, v3]) | |
T.writes(bgemm[v0, v1, v2, v3]) | |
bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3] | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(4, 32, 49, 2): | |
for i0 in T.unroll(4): | |
for i1 in T.unroll(4): | |
for i4 in T.unroll(6): | |
for i5 in T.unroll(6): | |
with T.block("inverse"): | |
vh, vw = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(196, i2_0 * 49 + i2_1) | |
co = T.axis.spatial(64, i3_0 * 2 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 6 == 5 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 5 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 0, T.float32(0), T.Select(r_a % 6 == 4 and vh % 4 == 3, T.float32(-8), T.Select(r_a % 6 == 4 and vh % 4 == 2, T.float32(4), T.Select(r_a % 6 == 4 and vh % 4 == 1, T.float32(-2), T.Select(r_a % 6 == 4 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 3 and vh % 4 == 3, T.float32(0.125), T.Select(r_a % 6 == 3 and vh % 4 == 2, T.float32(0.25), T.Select(r_a % 6 == 3 and vh % 4 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 1, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 3, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 1, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 0 and vh % 4 == 3, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 5 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 0, T.float32(0), T.Select(r_b % 6 == 4 and vw % 4 == 3, T.float32(-8), T.Select(r_b % 6 == 4 and vw % 4 == 2, T.float32(4), T.Select(r_b % 6 == 4 and vw % 4 == 1, T.float32(-2), T.Select(r_b % 6 == 4 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 3 and vw % 4 == 3, T.float32(0.125), T.Select(r_b % 6 == 3 and vw % 4 == 2, T.float32(0.25), T.Select(r_b % 6 == 3 and vw % 4 == 1, T.float32(0.5), 
T.Select(r_b % 6 == 3 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 1, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 3, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 1, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 0 and vw % 4 == 3, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) | |
for i0, i1, i2, i3 in T.grid(1, 56, 56, 64): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co], placeholder[n, 0, 0, co]) | |
T.writes(T_relu[n, h, w, co]) | |
T_relu[n, h, w, co] = T.max(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co] + placeholder[n, 0, 0, co], T.float32(0)) | |
b0 = sch.get_block(name="B", func_name="main") | |
b1 = sch.get_block(name="data_pack", func_name="main") | |
b2 = sch.get_block(name="bgemm", func_name="main") | |
b3 = sch.get_block(name="A", func_name="main") | |
b4 = sch.get_block(name="inverse", func_name="main") | |
b5 = sch.get_block(name="T_add", func_name="main") | |
b6 = sch.get_block(name="T_relu", func_name="main") | |
b7 = sch.get_block(name="root", func_name="main") | |
sch.compute_inline(block=b0) | |
b8, = sch.get_producers(block=b1) | |
b9, = sch.get_producers(block=b8) | |
l10, l11, l12, l13, l14, l15 = sch.get_loops(block=b1) | |
v16, v17 = sch.sample_perfect_tile(loop=l12, n=2, max_innermost_factor=64, decision=[14, 14]) | |
l18, l19 = sch.split(loop=l12, factors=[v16, v17]) | |
v20, v21 = sch.sample_perfect_tile(loop=l13, n=2, max_innermost_factor=64, decision=[32, 2]) | |
l22, l23 = sch.split(loop=l13, factors=[v20, v21]) | |
sch.unroll(loop=l10) | |
sch.unroll(loop=l11) | |
sch.unroll(loop=l14) | |
sch.unroll(loop=l15) | |
sch.reorder(l18, l22, l19, l23, l10, l11, l14, l15) | |
sch.compute_at(block=b8, loop=l23, preserve_unit_loops=True) | |
sch.set_scope(block=b8, buffer_index=0, storage_scope="local") | |
sch.compute_inline(block=b9) | |
sch.compute_inline(block=b3) | |
l24, l25, l26, l27, l28, l29 = sch.get_loops(block=b4) | |
v30, v31 = sch.sample_perfect_tile(loop=l26, n=2, max_innermost_factor=64, decision=[4, 49]) | |
l32, l33 = sch.split(loop=l26, factors=[v30, v31]) | |
v34, v35 = sch.sample_perfect_tile(loop=l27, n=2, max_innermost_factor=64, decision=[32, 2]) | |
l36, l37 = sch.split(loop=l27, factors=[v34, v35]) | |
sch.unroll(loop=l24) | |
sch.unroll(loop=l25) | |
sch.unroll(loop=l28) | |
sch.unroll(loop=l29) | |
sch.reorder(l32, l36, l33, l37, l24, l25, l28, l29) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l38, l39, l40, l41, l42 = sch.get_loops(block=b2) | |
v43, v44, v45, v46, v47 = sch.sample_perfect_tile(loop=l38, n=5, max_innermost_factor=64, decision=[1, 2, 1, 1, 3]) | |
l48, l49, l50, l51, l52 = sch.split(loop=l38, factors=[v43, v44, v45, v46, v47]) | |
v53, v54, v55, v56, v57 = sch.sample_perfect_tile(loop=l39, n=5, max_innermost_factor=64, decision=[1, 1, 2, 1, 3]) | |
l58, l59, l60, l61, l62 = sch.split(loop=l39, factors=[v53, v54, v55, v56, v57]) | |
v63, v64, v65, v66, v67 = sch.sample_perfect_tile(loop=l40, n=5, max_innermost_factor=64, decision=[1, 1, 7, 4, 7]) | |
l68, l69, l70, l71, l72 = sch.split(loop=l40, factors=[v63, v64, v65, v66, v67]) | |
v73, v74, v75, v76, v77 = sch.sample_perfect_tile(loop=l41, n=5, max_innermost_factor=64, decision=[4, 1, 1, 1, 16]) | |
l78, l79, l80, l81, l82 = sch.split(loop=l41, factors=[v73, v74, v75, v76, v77]) | |
v83, v84, v85 = sch.sample_perfect_tile(loop=l42, n=3, max_innermost_factor=64, decision=[64, 1, 1]) | |
l86, l87, l88 = sch.split(loop=l42, factors=[v83, v84, v85]) | |
sch.reorder(l48, l58, l68, l78, l49, l59, l69, l79, l50, l60, l70, l80, l86, l87, l51, l61, l71, l81, l88, l52, l62, l72, l82) | |
l89 = sch.fuse(l48, l58, l68, l78) | |
sch.bind(loop=l89, thread_axis="blockIdx.x") | |
l90 = sch.fuse(l49, l59, l69, l79) | |
sch.bind(loop=l90, thread_axis="vthread.x") | |
l91 = sch.fuse(l50, l60, l70, l80) | |
sch.bind(loop=l91, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b92 = sch.cache_write(block=b2, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b92, loop=l91, preserve_unit_loops=True) | |
b93 = sch.cache_read(block=b2, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b93, loop=l86, preserve_unit_loops=True) | |
l94, l95, l96, l97, l98, l99, l100, l101 = sch.get_loops(block=b93) | |
l102 = sch.fuse(l98, l99, l100, l101) | |
v103 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b93, ann_key="meta_schedule.cooperative_fetch", ann_val=v103) | |
b104 = sch.cache_read(block=b2, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b104, loop=l86, preserve_unit_loops=True) | |
l105, l106, l107, l108, l109, l110, l111, l112 = sch.get_loops(block=b104) | |
l113 = sch.fuse(l109, l110, l111, l112) | |
v114 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b104, ann_key="meta_schedule.cooperative_fetch", ann_val=v114) | |
sch.reverse_compute_inline(block=b6) | |
sch.reverse_compute_inline(block=b5) | |
v115 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=2) | |
sch.annotate(block_or_loop=b7, ann_key="meta_schedule.unroll_explicit", ann_val=v115) | |
[14:36:10] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 56, 56, 64), "float32"], placeholder_1: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_2: T.Buffer[(6, 6, 64, 64), "float32"], placeholder_3: T.Buffer[(1, 56, 56, 64), "float32"], T_relu: T.Buffer[(1, 56, 56, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
data_pad = T.alloc_buffer([1, 58, 58, 64], dtype="float32") | |
input_tile = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
B = T.alloc_buffer([6, 6], dtype="float32") | |
data_pack = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
bgemm = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
A = T.alloc_buffer([6, 4], dtype="float32") | |
inverse = T.alloc_buffer([4, 4, 196, 64], dtype="float32") | |
conv2d_winograd = T.alloc_buffer([1, 56, 56, 64], dtype="float32") | |
T_add = T.alloc_buffer([1, 56, 56, 64], dtype="float32") | |
T_add_1 = T.alloc_buffer([1, 56, 56, 64], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 58, 58, 64): | |
with T.block("data_pad"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_3[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) | |
T.writes(data_pad[i0_1, i1_1, i2_1, i3_1]) | |
T.block_attr({"schedule_rule":"None"}) | |
data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 57 and 1 <= i2_1 and i2_1 < 57, placeholder_3[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32") | |
for i0, i1, i2, i3 in T.grid(6, 6, 196, 64): | |
with T.block("input_tile"): | |
eps, nu, p, ci = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(data_pad[p // 196, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu, ci]) | |
T.writes(input_tile[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile[eps, nu, p, ci] = data_pad[p // 196, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu, ci] | |
for i0, i1 in T.grid(6, 6): | |
with T.block("B"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(B[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
B[i, j] = T.Select(i % 6 == 5 and j % 6 == 5, T.float32(1), T.Select(i % 6 == 5 and j % 6 == 4, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 3, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 2, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 1, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 0, T.float32(0), T.Select(i % 6 == 4 and j % 6 == 5, T.float32(1.5), T.Select(i % 6 == 4 and j % 6 == 4, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 3, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 2, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 1, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 0, T.float32(1), T.Select(i % 6 == 3 and j % 6 == 5, T.float32(-2), T.Select(i % 6 == 3 and j % 6 == 4, T.float32(-0.5), T.Select(i % 6 == 3 and j % 6 == 3, T.float32(2), T.Select(i % 6 == 3 and j % 6 == 2, T.float32(2.5), T.Select(i % 6 == 3 and j % 6 == 1, T.float32(0.5), T.Select(i % 6 == 3 and j % 6 == 0, T.float32(1.5), T.Select(i % 6 == 2 and j % 6 == 5, T.float32(-1.5), T.Select(i % 6 == 2 and j % 6 == 4, T.float32(-1), T.Select(i % 6 == 2 and j % 6 == 3, T.float32(-1), T.Select(i % 6 == 2 and j % 6 == 2, T.float32(0.5), T.Select(i % 6 == 2 and j % 6 == 1, T.float32(-2.5), T.Select(i % 6 == 2 and j % 6 == 0, T.float32(-2), T.Select(i % 6 == 1 and j % 6 == 5, T.float32(1), T.Select(i % 6 == 1 and j % 6 == 4, T.float32(0.5), T.Select(i % 6 == 1 and j % 6 == 3, T.float32(-2), T.Select(i % 6 == 1 and j % 6 == 2, T.float32(-1), T.Select(i % 6 == 1 and j % 6 == 1, T.float32(1), T.Select(i % 6 == 1 and j % 6 == 0, T.float32(-1.5), T.Select(i % 6 == 0 and j % 6 == 5, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 4, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 3, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 2, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 1, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(6, 6, 196, 64, 6, 6): | |
with T.block("data_pack"): | |
eps, nu, p, ci, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile[r_a, r_b, p, ci], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile[r_a, r_b, p, ci] * B[r_a, eps] * B[r_b, nu] | |
for i0, i1, i2, i3, i4 in T.grid(6, 6, 196, 64, 64): | |
with T.block("bgemm"): | |
eps, nu, p, co, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4]) | |
T.reads(bgemm[eps, nu, p, co], data_pack[eps, nu, p, ci], placeholder_2[eps, nu, co, ci]) | |
T.writes(bgemm[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_2]}) | |
with T.init(): | |
bgemm[eps, nu, p, co] = T.float32(0) | |
bgemm[eps, nu, p, co] = bgemm[eps, nu, p, co] + data_pack[eps, nu, p, ci] * placeholder_2[eps, nu, co, ci] | |
for i0, i1 in T.grid(6, 4): | |
with T.block("A"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(A[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
A[i, j] = T.Select(i % 6 == 5 and j % 4 == 3, T.float32(1), T.Select(i % 6 == 5 and j % 4 == 2, T.float32(0), T.Select(i % 6 == 5 and j % 4 == 1, T.float32(0), T.Select(i % 6 == 5 and j % 4 == 0, T.float32(0), T.Select(i % 6 == 4 and j % 4 == 3, T.float32(-8), T.Select(i % 6 == 4 and j % 4 == 2, T.float32(4), T.Select(i % 6 == 4 and j % 4 == 1, T.float32(-2), T.Select(i % 6 == 4 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 3 and j % 4 == 3, T.float32(0.125), T.Select(i % 6 == 3 and j % 4 == 2, T.float32(0.25), T.Select(i % 6 == 3 and j % 4 == 1, T.float32(0.5), T.Select(i % 6 == 3 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 3, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 2, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 1, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 1 and j % 4 == 3, T.float32(-1), T.Select(i % 6 == 1 and j % 4 == 2, T.float32(1), T.Select(i % 6 == 1 and j % 4 == 1, T.float32(-1), T.Select(i % 6 == 1 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 0 and j % 4 == 3, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 2, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 1, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(4, 4, 196, 64, 6, 6): | |
with T.block("inverse"): | |
vh, vw, p, co, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * A[r_a, vh] * A[r_b, vw] | |
for i0, i1, i2, i3 in T.grid(1, 56, 56, 64): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co]) | |
T.writes(conv2d_winograd[n, h, w, co]) | |
conv2d_winograd[n, h, w, co] = inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co] | |
for i0, i1, i2, i3 in T.grid(1, 56, 56, 64): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], placeholder_1[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder_1[ax0, 0, 0, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 56, 56, 64): | |
with T.block("T_add_1"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add[ax0, ax1, ax2, ax3], placeholder[ax0, ax1, ax2, ax3]) | |
T.writes(T_add_1[ax0, ax1, ax2, ax3]) | |
T_add_1[ax0, ax1, ax2, ax3] = T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 56, 56, 64): | |
with T.block("T_relu"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add_1[ax0, ax1, ax2, ax3]) | |
T.writes(T_relu[ax0, ax1, ax2, ax3]) | |
T_relu[ax0, ax1, ax2, ax3] = T.max(T_add_1[ax0, ax1, ax2, ax3], T.float32(0)) | |
[14:36:10] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:10] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 56, 56, 64), "float32"], placeholder_1: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_2: T.Buffer[(6, 6, 64, 64), "float32"], placeholder_3: T.Buffer[(1, 56, 56, 64), "float32"], T_relu: T.Buffer[(1, 56, 56, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":0}) | |
input_tile_local = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="local") | |
data_pack = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
bgemm = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
inverse = T.alloc_buffer([4, 4, 196, 64], dtype="float32") | |
bgemm_local = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="local") | |
data_pack_shared = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([6, 6, 64, 64], dtype="float32", scope="shared") | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(4, 32, 49, 2): | |
for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1): | |
with T.block("input_tile"): | |
eps, nu = T.axis.remap("SS", [ax0, ax1]) | |
p = T.axis.spatial(196, i2_0 * 49 + i2_1 + ax2) | |
ci = T.axis.spatial(64, i3_0 * 2 + i3_1 + ax3) | |
T.reads(placeholder_3[p // 196, p % 196 // 14 * 4 + eps - 1, p % 14 * 4 + nu - 1, ci]) | |
T.writes(input_tile_local[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile_local[eps, nu, p, ci] = T.if_then_else(1 <= p % 196 // 14 * 4 + eps and p % 196 // 14 * 4 + eps < 57 and 1 <= p % 14 * 4 + nu and p % 14 * 4 + nu < 57, placeholder_3[p // 196, p % 196 // 14 * 4 + eps - 1, p % 14 * 4 + nu - 1, ci], T.float32(0), dtype="float32") | |
for i0 in T.unroll(6): | |
for i1 in T.unroll(6): | |
for i4 in T.unroll(6): | |
for i5 in T.unroll(6): | |
with T.block("data_pack"): | |
eps, nu = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(196, i2_0 * 49 + i2_1) | |
ci = T.axis.spatial(64, i3_0 * 2 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile_local[r_a, r_b, p, ci]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile_local[r_a, r_b, p, ci] * T.Select(r_a % 6 == 5 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 1, 
T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_b % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_b % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_b % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_b % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_b % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_b % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_b % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_b % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_b % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_b % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_b % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_b % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_b % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_b % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 1, 
T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(2, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(2, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(49, thread="threadIdx.x"): | |
for i4_0 in T.serial(2): | |
for ax0_ax1_ax2_ax3_fused in T.serial(112896): | |
with T.block("data_pack_shared"): | |
v0 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused // 18816) | |
v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused * 3 + ax0_ax1_ax2_ax3_fused % 18816 // 6272) | |
v2 = T.axis.spatial(196, ax0_ax1_ax2_ax3_fused % 6272 // 32) | |
v3 = T.axis.spatial(64, i4_0 * 32 + ax0_ax1_ax2_ax3_fused % 32) | |
T.reads(data_pack[v0, v1, v2, v3]) | |
T.writes(data_pack_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":1}) | |
data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused in T.serial(36864): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(6, ax0_ax1_ax2_ax3_fused // 6144) | |
v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused * 3 + ax0_ax1_ax2_ax3_fused % 6144 // 2048) | |
v2 = T.axis.spatial(64, ax0_ax1_ax2_ax3_fused % 2048 // 32) | |
v3 = T.axis.spatial(64, i4_0 * 32 + ax0_ax1_ax2_ax3_fused % 32) | |
T.reads(placeholder_2[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_2[v0, v1, v2, v3] | |
for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 1, 8, 32, 6, 3, 2, 8): | |
with T.block("bgemm"): | |
eps = T.axis.spatial(6, i0_4) | |
nu = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused * 3 + i1_4) | |
p = T.axis.spatial(196, i0_1_i1_1_i2_1_i3_1_fused * 98 + i0_2_i1_2_i2_2_i3_2_fused * 2 + i2_4) | |
co = T.axis.spatial(64, i3_3 * 8 + i3_4) | |
ci = T.axis.reduce(64, i4_0 * 32 + i4_2) | |
T.reads(bgemm_local[eps, nu, p, co], data_pack_shared[eps, nu, p, ci], placeholder_shared[eps, nu, co, ci]) | |
T.writes(bgemm_local[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_2], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) | |
with T.init(): | |
bgemm_local[eps, nu, p, co] = T.float32(0) | |
bgemm_local[eps, nu, p, co] = bgemm_local[eps, nu, p, co] + data_pack_shared[eps, nu, p, ci] * placeholder_shared[eps, nu, co, ci] | |
for ax0, ax1, ax2, ax3 in T.grid(6, 3, 2, 64): | |
with T.block("bgemm_local"): | |
v0 = T.axis.spatial(6, ax0) | |
v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused * 3 + ax1) | |
v2 = T.axis.spatial(196, i0_1_i1_1_i2_1_i3_1_fused * 98 + i0_2_i1_2_i2_2_i3_2_fused * 2 + ax2) | |
v3 = T.axis.spatial(64, ax3) | |
T.reads(bgemm_local[v0, v1, v2, v3]) | |
T.writes(bgemm[v0, v1, v2, v3]) | |
bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3] | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(98, 16, 2, 4): | |
for i0 in T.unroll(4): | |
for i1 in T.unroll(4): | |
for i4 in T.unroll(6): | |
for i5 in T.unroll(6): | |
with T.block("inverse"): | |
vh, vw = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(196, i2_0 * 2 + i2_1) | |
co = T.axis.spatial(64, i3_0 * 4 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 6 == 5 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 5 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 0, T.float32(0), T.Select(r_a % 6 == 4 and vh % 4 == 3, T.float32(-8), T.Select(r_a % 6 == 4 and vh % 4 == 2, T.float32(4), T.Select(r_a % 6 == 4 and vh % 4 == 1, T.float32(-2), T.Select(r_a % 6 == 4 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 3 and vh % 4 == 3, T.float32(0.125), T.Select(r_a % 6 == 3 and vh % 4 == 2, T.float32(0.25), T.Select(r_a % 6 == 3 and vh % 4 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 1, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 3, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 1, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 0 and vh % 4 == 3, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 5 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 0, T.float32(0), T.Select(r_b % 6 == 4 and vw % 4 == 3, T.float32(-8), T.Select(r_b % 6 == 4 and vw % 4 == 2, T.float32(4), T.Select(r_b % 6 == 4 and vw % 4 == 1, T.float32(-2), T.Select(r_b % 6 == 4 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 3 and vw % 4 == 3, T.float32(0.125), T.Select(r_b % 6 == 3 and vw % 4 == 2, T.float32(0.25), T.Select(r_b % 6 == 3 and vw % 4 == 1, T.float32(0.5), 
T.Select(r_b % 6 == 3 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 1, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 3, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 1, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 0 and vw % 4 == 3, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) | |
for i0, i1, i2, i3 in T.grid(1, 56, 56, 64): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co], placeholder_1[n, 0, 0, co], placeholder[n, h, w, co]) | |
T.writes(T_relu[n, h, w, co]) | |
T_relu[n, h, w, co] = T.max(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co] + placeholder_1[n, 0, 0, co] + placeholder[n, h, w, co], T.float32(0)) | |
b0 = sch.get_block(name="B", func_name="main") | |
b1 = sch.get_block(name="data_pack", func_name="main") | |
b2 = sch.get_block(name="bgemm", func_name="main") | |
b3 = sch.get_block(name="A", func_name="main") | |
b4 = sch.get_block(name="inverse", func_name="main") | |
b5 = sch.get_block(name="T_add", func_name="main") | |
b6 = sch.get_block(name="T_add_1", func_name="main") | |
b7 = sch.get_block(name="T_relu", func_name="main") | |
b8 = sch.get_block(name="root", func_name="main") | |
sch.compute_inline(block=b0) | |
b9, = sch.get_producers(block=b1) | |
b10, = sch.get_producers(block=b9) | |
l11, l12, l13, l14, l15, l16 = sch.get_loops(block=b1) | |
v17, v18 = sch.sample_perfect_tile(loop=l13, n=2, max_innermost_factor=64, decision=[4, 49]) | |
l19, l20 = sch.split(loop=l13, factors=[v17, v18]) | |
v21, v22 = sch.sample_perfect_tile(loop=l14, n=2, max_innermost_factor=64, decision=[32, 2]) | |
l23, l24 = sch.split(loop=l14, factors=[v21, v22]) | |
sch.unroll(loop=l11) | |
sch.unroll(loop=l12) | |
sch.unroll(loop=l15) | |
sch.unroll(loop=l16) | |
sch.reorder(l19, l23, l20, l24, l11, l12, l15, l16) | |
sch.compute_at(block=b9, loop=l24, preserve_unit_loops=True) | |
sch.set_scope(block=b9, buffer_index=0, storage_scope="local") | |
sch.compute_inline(block=b10) | |
sch.compute_inline(block=b3) | |
l25, l26, l27, l28, l29, l30 = sch.get_loops(block=b4) | |
v31, v32 = sch.sample_perfect_tile(loop=l27, n=2, max_innermost_factor=64, decision=[98, 2]) | |
l33, l34 = sch.split(loop=l27, factors=[v31, v32]) | |
v35, v36 = sch.sample_perfect_tile(loop=l28, n=2, max_innermost_factor=64, decision=[16, 4]) | |
l37, l38 = sch.split(loop=l28, factors=[v35, v36]) | |
sch.unroll(loop=l25) | |
sch.unroll(loop=l26) | |
sch.unroll(loop=l29) | |
sch.unroll(loop=l30) | |
sch.reorder(l33, l37, l34, l38, l25, l26, l29, l30) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l39, l40, l41, l42, l43 = sch.get_loops(block=b2) | |
v44, v45, v46, v47, v48 = sch.sample_perfect_tile(loop=l39, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 6]) | |
l49, l50, l51, l52, l53 = sch.split(loop=l39, factors=[v44, v45, v46, v47, v48]) | |
v54, v55, v56, v57, v58 = sch.sample_perfect_tile(loop=l40, n=5, max_innermost_factor=64, decision=[2, 1, 1, 1, 3]) | |
l59, l60, l61, l62, l63 = sch.split(loop=l40, factors=[v54, v55, v56, v57, v58]) | |
v64, v65, v66, v67, v68 = sch.sample_perfect_tile(loop=l41, n=5, max_innermost_factor=64, decision=[1, 2, 49, 1, 2]) | |
l69, l70, l71, l72, l73 = sch.split(loop=l41, factors=[v64, v65, v66, v67, v68]) | |
v74, v75, v76, v77, v78 = sch.sample_perfect_tile(loop=l42, n=5, max_innermost_factor=64, decision=[1, 1, 1, 8, 8]) | |
l79, l80, l81, l82, l83 = sch.split(loop=l42, factors=[v74, v75, v76, v77, v78]) | |
v84, v85, v86 = sch.sample_perfect_tile(loop=l43, n=3, max_innermost_factor=64, decision=[2, 1, 32]) | |
l87, l88, l89 = sch.split(loop=l43, factors=[v84, v85, v86]) | |
sch.reorder(l49, l59, l69, l79, l50, l60, l70, l80, l51, l61, l71, l81, l87, l88, l52, l62, l72, l82, l89, l53, l63, l73, l83) | |
l90 = sch.fuse(l49, l59, l69, l79) | |
sch.bind(loop=l90, thread_axis="blockIdx.x") | |
l91 = sch.fuse(l50, l60, l70, l80) | |
sch.bind(loop=l91, thread_axis="vthread.x") | |
l92 = sch.fuse(l51, l61, l71, l81) | |
sch.bind(loop=l92, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b93 = sch.cache_write(block=b2, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b93, loop=l92, preserve_unit_loops=True) | |
b94 = sch.cache_read(block=b2, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b94, loop=l87, preserve_unit_loops=True) | |
l95, l96, l97, l98, l99, l100, l101, l102 = sch.get_loops(block=b94) | |
l103 = sch.fuse(l99, l100, l101, l102) | |
v104 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=0) | |
sch.annotate(block_or_loop=b94, ann_key="meta_schedule.cooperative_fetch", ann_val=v104) | |
b105 = sch.cache_read(block=b2, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b105, loop=l87, preserve_unit_loops=True) | |
l106, l107, l108, l109, l110, l111, l112, l113 = sch.get_loops(block=b105) | |
l114 = sch.fuse(l110, l111, l112, l113) | |
v115 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=2) | |
sch.annotate(block_or_loop=b105, ann_key="meta_schedule.cooperative_fetch", ann_val=v115) | |
sch.reverse_compute_inline(block=b7) | |
sch.reverse_compute_inline(block=b6) | |
sch.reverse_compute_inline(block=b5) | |
v116 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=0) | |
sch.annotate(block_or_loop=b8, ann_key="meta_schedule.unroll_explicit", ann_val=v116) | |
[14:36:10] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 128), "float32"], placeholder_1: T.Buffer[(3, 3, 64, 128), "float32"], placeholder_2: T.Buffer[(1, 56, 56, 64), "float32"], T_relu: T.Buffer[(1, 28, 28, 128), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
pad_temp = T.alloc_buffer([1, 58, 58, 64], dtype="float32") | |
conv2d_nhwc = T.alloc_buffer([1, 28, 28, 128], dtype="float32") | |
T_add = T.alloc_buffer([1, 28, 28, 128], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 58, 58, 64): | |
with T.block("pad_temp"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) | |
T.writes(pad_temp[i0_1, i1_1, i2_1, i3_1]) | |
pad_temp[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 57 and 1 <= i2_1 and i2_1 < 57, placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32") | |
for i0, i1, i2, i3, i4, i5, i6 in T.grid(1, 28, 28, 128, 3, 3, 64): | |
with T.block("conv2d_nhwc"): | |
nn, yy, xx, ff, ry, rx, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6]) | |
T.reads(conv2d_nhwc[nn, yy, xx, ff], pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_1[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 56, 56, 64], "float32"], ["TENSOR", [3, 3, 64, 128], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc[nn, yy, xx, ff] = conv2d_nhwc[nn, yy, xx, ff] + pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_1[ry, rx, rc, ff] | |
for i0, i1, i2, i3 in T.grid(1, 28, 28, 128): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_nhwc[ax0, ax1, ax2, ax3], placeholder[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_nhwc[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 28, 28, 128): | |
with T.block("T_relu"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add[ax0, ax1, ax2, ax3]) | |
T.writes(T_relu[ax0, ax1, ax2, ax3]) | |
T_relu[ax0, ax1, ax2, ax3] = T.max(T_add[ax0, ax1, ax2, ax3], T.float32(0)) | |
[14:36:10] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:10] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 128), "float32"], placeholder_1: T.Buffer[(3, 3, 64, 128), "float32"], placeholder_2: T.Buffer[(1, 56, 56, 64), "float32"], T_relu: T.Buffer[(1, 28, 28, 128), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":1024}) | |
conv2d_nhwc_local = T.alloc_buffer([1, 28, 28, 128], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 58, 58, 64], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([3, 3, 64, 128], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(4, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(14, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(56, thread="threadIdx.x"): | |
for i4_0, i5_0, i6_0 in T.grid(1, 1, 4): | |
for ax0_ax1_ax2_ax3_fused in T.serial(13680): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(58, i0_0_i1_0_i2_0_i3_0_fused * 14 + ax0_ax1_ax2_ax3_fused % 13680 // 912) | |
v2 = T.axis.spatial(58, ax0_ax1_ax2_ax3_fused % 912 // 16) | |
v3 = T.axis.spatial(64, i6_0 * 16 + ax0_ax1_ax2_ax3_fused % 16) | |
T.reads(placeholder_2[v0, v1 - 1, v2 - 1, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":1}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(1 <= v1 and v1 < 57 and 1 <= v2 and v2 < 57, placeholder_2[v0, v1 - 1, v2 - 1, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused in T.serial(18432): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused // 6144) | |
v1 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused % 6144 // 2048) | |
v2 = T.axis.spatial(64, i6_0 * 16 + ax0_ax1_ax2_ax3_fused % 2048 // 128) | |
v3 = T.axis.spatial(128, ax0_ax1_ax2_ax3_fused % 128) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(3, 1, 1, 1, 1, 1, 2, 1, 3, 16, 1, 1, 2, 8): | |
with T.block("conv2d_nhwc"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused * 7 + i0_1_i1_1_i2_1_i3_1_fused // 2) | |
xx = T.axis.spatial(28, i0_2_i1_2_i2_2_i3_2_fused // 4 * 2 + i2_4) | |
ff = T.axis.spatial(128, i0_1_i1_1_i2_1_i3_1_fused % 2 * 64 + i0_2_i1_2_i2_2_i3_2_fused % 4 * 16 + i3_3 * 8 + i3_4) | |
ry, rx = T.axis.remap("RR", [i4_1, i5_2]) | |
rc = T.axis.reduce(64, i6_0 * 16 + i6_2) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 56, 56, 64], "float32"], ["TENSOR", [3, 3, 64, 128], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 1, 2, 16): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused * 7 + i0_1_i1_1_i2_1_i3_1_fused // 2 + ax1) | |
v2 = T.axis.spatial(28, i0_2_i1_2_i2_2_i3_2_fused // 4 * 2 + ax2) | |
v3 = T.axis.spatial(128, i0_1_i1_1_i2_1_i3_1_fused % 2 * 64 + i0_2_i1_2_i2_2_i3_2_fused % 4 * 16 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[4, 7, 1, 1, 1]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[1, 1, 14, 1, 2]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[1, 2, 4, 2, 8]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[1, 3, 1]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[1, 1, 3]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[4, 1, 16]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=0) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=4) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
[14:36:10] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #6: "vm_mod_fused_nn_conv2d_add", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 128), "float32"], placeholder_1: T.Buffer[(1, 1, 64, 128), "float32"], placeholder_2: T.Buffer[(1, 56, 56, 64), "float32"], T_add: T.Buffer[(1, 28, 28, 128), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
pad_temp = T.alloc_buffer([1, 56, 56, 64], dtype="float32") | |
conv2d_nhwc = T.alloc_buffer([1, 28, 28, 128], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 56, 56, 64): | |
with T.block("pad_temp"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_2[i0_1, i1_1, i2_1, i3_1]) | |
T.writes(pad_temp[i0_1, i1_1, i2_1, i3_1]) | |
pad_temp[i0_1, i1_1, i2_1, i3_1] = placeholder_2[i0_1, i1_1, i2_1, i3_1] | |
for i0, i1, i2, i3, i4, i5, i6 in T.grid(1, 28, 28, 128, 1, 1, 64): | |
with T.block("conv2d_nhwc"): | |
nn, yy, xx, ff, ry, rx, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6]) | |
T.reads(conv2d_nhwc[nn, yy, xx, ff], pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_1[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 56, 56, 64], "float32"], ["TENSOR", [1, 1, 64, 128], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc[nn, yy, xx, ff] = conv2d_nhwc[nn, yy, xx, ff] + pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_1[ry, rx, rc, ff] | |
for i0, i1, i2, i3 in T.grid(1, 28, 28, 128): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_nhwc[ax0, ax1, ax2, ax3], placeholder[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_nhwc[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3] | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 128), "float32"], placeholder_1: T.Buffer[(1, 1, 64, 128), "float32"], placeholder_2: T.Buffer[(1, 56, 56, 64), "float32"], T_add: T.Buffer[(1, 28, 28, 128), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":16}) | |
conv2d_nhwc_local = T.alloc_buffer([1, 28, 28, 128], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 56, 56, 64], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([1, 1, 64, 128], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(56, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(14, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(4, thread="threadIdx.x"): | |
for i4_0, i5_0, i6_0 in T.grid(1, 1, 1): | |
for ax0_ax1_ax2_ax3_fused in T.serial(24640): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(56, ax0_ax1_ax2_ax3_fused % 24640 // 448) | |
v2 = T.axis.spatial(56, i0_0_i1_0_i2_0_i3_0_fused // 8 * 8 + ax0_ax1_ax2_ax3_fused % 448 // 64) | |
v3 = T.axis.spatial(64, ax0_ax1_ax2_ax3_fused % 64) | |
T.reads(placeholder_2[v0, v1, v2, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
pad_temp_shared[v0, v1, v2, v3] = placeholder_2[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused in T.serial(1024): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(1, 0) | |
v2 = T.axis.spatial(64, ax0_ax1_ax2_ax3_fused // 16) | |
v3 = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_fused % 8 * 16 + ax0_ax1_ax2_ax3_fused % 16) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":1}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 2, 1, 1, 2, 2, 1, 1, 32, 1, 4, 1, 2): | |
with T.block("conv2d_nhwc"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(28, i0_1_i1_1_i2_1_i3_1_fused // 2 * 4 + i1_4) | |
xx = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused // 8 * 4 + i0_1_i1_1_i2_1_i3_1_fused % 2 * 2 + i2_3) | |
ff = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_fused % 8 * 16 + i0_2_i1_2_i2_2_i3_2_fused * 4 + i3_3 * 2 + i3_4) | |
ry = T.axis.reduce(1, 0) | |
rx = T.axis.reduce(1, 0) | |
rc = T.axis.reduce(64, i6_1 * 32 + i6_2) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 56, 56, 64], "float32"], ["TENSOR", [1, 1, 64, 128], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 4, 2, 4): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(28, i0_1_i1_1_i2_1_i3_1_fused // 2 * 4 + ax1) | |
v2 = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused // 8 * 4 + i0_1_i1_1_i2_1_i3_1_fused % 2 * 2 + ax2) | |
v3 = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_fused % 8 * 16 + i0_2_i1_2_i2_2_i3_2_fused * 4 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_add[v0, v1, v2, v3]) | |
T_add[v0, v1, v2, v3] = conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3] | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l4, l5, l6, l7, l8, l9, l10 = sch.get_loops(block=b1) | |
v11, v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l4, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l16, l17, l18, l19, l20 = sch.split(loop=l4, factors=[v11, v12, v13, v14, v15]) | |
v21, v22, v23, v24, v25 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 7, 1, 1, 4]) | |
l26, l27, l28, l29, l30 = sch.split(loop=l5, factors=[v21, v22, v23, v24, v25]) | |
v31, v32, v33, v34, v35 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[7, 2, 1, 2, 1]) | |
l36, l37, l38, l39, l40 = sch.split(loop=l6, factors=[v31, v32, v33, v34, v35]) | |
v41, v42, v43, v44, v45 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[8, 1, 4, 2, 2]) | |
l46, l47, l48, l49, l50 = sch.split(loop=l7, factors=[v41, v42, v43, v44, v45]) | |
v51, v52, v53 = sch.sample_perfect_tile(loop=l8, n=3, max_innermost_factor=64, decision=[1, 1, 1]) | |
l54, l55, l56 = sch.split(loop=l8, factors=[v51, v52, v53]) | |
v57, v58, v59 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[1, 1, 1]) | |
l60, l61, l62 = sch.split(loop=l9, factors=[v57, v58, v59]) | |
v63, v64, v65 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[1, 2, 32]) | |
l66, l67, l68 = sch.split(loop=l10, factors=[v63, v64, v65]) | |
sch.reorder(l16, l26, l36, l46, l17, l27, l37, l47, l18, l28, l38, l48, l54, l60, l66, l55, l61, l67, l19, l29, l39, l49, l56, l62, l68, l20, l30, l40, l50) | |
l69 = sch.fuse(l16, l26, l36, l46) | |
sch.bind(loop=l69, thread_axis="blockIdx.x") | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="vthread.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b72 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b72, loop=l71, preserve_unit_loops=True) | |
b73 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b73, loop=l66, preserve_unit_loops=True) | |
l74, l75, l76, l77, l78, l79, l80, l81, l82, l83 = sch.get_loops(block=b73) | |
l84 = sch.fuse(l80, l81, l82, l83) | |
v85 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v85) | |
b86 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b86, loop=l66, preserve_unit_loops=True) | |
l87, l88, l89, l90, l91, l92, l93, l94, l95, l96 = sch.get_loops(block=b86) | |
l97 = sch.fuse(l93, l94, l95, l96) | |
v98 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=0) | |
sch.annotate(block_or_loop=b86, ann_key="meta_schedule.cooperative_fetch", ann_val=v98) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v99 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=1) | |
sch.annotate(block_or_loop=b3, ann_key="meta_schedule.unroll_explicit", ann_val=v99) | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 28, 28, 128), "float32"], placeholder_1: T.Buffer[(1, 1, 1, 128), "float32"], placeholder_2: T.Buffer[(4, 4, 128, 128), "float32"], placeholder_3: T.Buffer[(1, 28, 28, 128), "float32"], T_relu: T.Buffer[(1, 28, 28, 128), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
data_pad = T.alloc_buffer([1, 30, 30, 128], dtype="float32") | |
input_tile = T.alloc_buffer([4, 4, 196, 128], dtype="float32") | |
B = T.alloc_buffer([4, 4], dtype="float32") | |
data_pack = T.alloc_buffer([4, 4, 196, 128], dtype="float32") | |
bgemm = T.alloc_buffer([4, 4, 196, 128], dtype="float32") | |
A = T.alloc_buffer([4, 2], dtype="float32") | |
inverse = T.alloc_buffer([2, 2, 196, 128], dtype="float32") | |
conv2d_winograd = T.alloc_buffer([1, 28, 28, 128], dtype="float32") | |
T_add = T.alloc_buffer([1, 28, 28, 128], dtype="float32") | |
T_add_1 = T.alloc_buffer([1, 28, 28, 128], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 30, 30, 128): | |
with T.block("data_pad"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_3[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) | |
T.writes(data_pad[i0_1, i1_1, i2_1, i3_1]) | |
T.block_attr({"schedule_rule":"None"}) | |
data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 29 and 1 <= i2_1 and i2_1 < 29, placeholder_3[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32") | |
for i0, i1, i2, i3 in T.grid(4, 4, 196, 128): | |
with T.block("input_tile"): | |
eps, nu, p, ci = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(data_pad[p // 196, p % 196 // 14 * 2 + eps, p % 14 * 2 + nu, ci]) | |
T.writes(input_tile[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile[eps, nu, p, ci] = data_pad[p // 196, p % 196 // 14 * 2 + eps, p % 14 * 2 + nu, ci] | |
for i0, i1 in T.grid(4, 4): | |
with T.block("B"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(B[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
B[i, j] = T.Select(i % 4 == 3 and j % 4 == 3, T.float32(1), T.Select(i % 4 == 3 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 0, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 3, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 1 and j % 4 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(4, 4, 196, 128, 4, 4): | |
with T.block("data_pack"): | |
eps, nu, p, ci, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile[r_a, r_b, p, ci], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile[r_a, r_b, p, ci] * B[r_a, eps] * B[r_b, nu] | |
for i0, i1, i2, i3, i4 in T.grid(4, 4, 196, 128, 128): | |
with T.block("bgemm"): | |
eps, nu, p, co, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4]) | |
T.reads(bgemm[eps, nu, p, co], data_pack[eps, nu, p, ci], placeholder_2[eps, nu, co, ci]) | |
T.writes(bgemm[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_2]}) | |
with T.init(): | |
bgemm[eps, nu, p, co] = T.float32(0) | |
bgemm[eps, nu, p, co] = bgemm[eps, nu, p, co] + data_pack[eps, nu, p, ci] * placeholder_2[eps, nu, co, ci] | |
for i0, i1 in T.grid(4, 2): | |
with T.block("A"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(A[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
A[i, j] = T.Select(i % 4 == 3 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 3 and j % 2 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 1 and j % 2 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 0 and j % 2 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 2 == 0, T.float32(1), T.float32(0))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(2, 2, 196, 128, 4, 4): | |
with T.block("inverse"): | |
vh, vw, p, co, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * A[r_a, vh] * A[r_b, vw] | |
for i0, i1, i2, i3 in T.grid(1, 28, 28, 128): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 2, w % 2, n * 196 + h // 2 * 14 + w // 2, co]) | |
T.writes(conv2d_winograd[n, h, w, co]) | |
conv2d_winograd[n, h, w, co] = inverse[h % 2, w % 2, n * 196 + h // 2 * 14 + w // 2, co] | |
for i0, i1, i2, i3 in T.grid(1, 28, 28, 128): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], placeholder_1[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder_1[ax0, 0, 0, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 28, 28, 128): | |
with T.block("T_add_1"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add[ax0, ax1, ax2, ax3], placeholder[ax0, ax1, ax2, ax3]) | |
T.writes(T_add_1[ax0, ax1, ax2, ax3]) | |
T_add_1[ax0, ax1, ax2, ax3] = T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 28, 28, 128): | |
with T.block("T_relu"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add_1[ax0, ax1, ax2, ax3]) | |
T.writes(T_relu[ax0, ax1, ax2, ax3]) | |
T_relu[ax0, ax1, ax2, ax3] = T.max(T_add_1[ax0, ax1, ax2, ax3], T.float32(0)) | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 28, 28, 128), "float32"], placeholder_1: T.Buffer[(1, 1, 1, 128), "float32"], placeholder_2: T.Buffer[(4, 4, 128, 128), "float32"], placeholder_3: T.Buffer[(1, 28, 28, 128), "float32"], T_relu: T.Buffer[(1, 28, 28, 128), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":1024}) | |
input_tile_local = T.alloc_buffer([4, 4, 196, 128], dtype="float32", scope="local") | |
data_pack = T.alloc_buffer([4, 4, 196, 128], dtype="float32") | |
bgemm = T.alloc_buffer([4, 4, 196, 128], dtype="float32") | |
inverse = T.alloc_buffer([2, 2, 196, 128], dtype="float32") | |
bgemm_local = T.alloc_buffer([4, 4, 196, 128], dtype="float32", scope="local") | |
data_pack_shared = T.alloc_buffer([4, 4, 196, 128], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([4, 4, 128, 128], dtype="float32", scope="shared") | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(4, 8, 49, 16): | |
for ax0, ax1, ax2, ax3 in T.grid(4, 4, 1, 1): | |
with T.block("input_tile"): | |
eps, nu = T.axis.remap("SS", [ax0, ax1]) | |
p = T.axis.spatial(196, i2_0 * 49 + i2_1 + ax2) | |
ci = T.axis.spatial(128, i3_0 * 16 + i3_1 + ax3) | |
T.reads(placeholder_3[p // 196, p % 196 // 14 * 2 + eps - 1, p % 14 * 2 + nu - 1, ci]) | |
T.writes(input_tile_local[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile_local[eps, nu, p, ci] = T.if_then_else(1 <= p % 196 // 14 * 2 + eps and p % 196 // 14 * 2 + eps < 29 and 1 <= p % 14 * 2 + nu and p % 14 * 2 + nu < 29, placeholder_3[p // 196, p % 196 // 14 * 2 + eps - 1, p % 14 * 2 + nu - 1, ci], T.float32(0), dtype="float32") | |
for i0 in T.unroll(4): | |
for i1 in T.unroll(4): | |
for i4 in T.unroll(4): | |
for i5 in T.unroll(4): | |
with T.block("data_pack"): | |
eps, nu = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(196, i2_0 * 49 + i2_1) | |
ci = T.axis.spatial(128, i3_0 * 16 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile_local[r_a, r_b, p, ci]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile_local[r_a, r_b, p, ci] * T.Select(r_a % 4 == 3 and eps % 4 == 3, T.float32(1), T.Select(r_a % 4 == 3 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 1, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 0, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 3, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 1 and eps % 4 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) * T.Select(r_b % 4 == 3 and nu % 4 == 3, T.float32(1), T.Select(r_b % 4 == 3 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 1, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 0, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 3, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 1 and nu % 4 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(14, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(4, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(2, thread="threadIdx.x"): | |
for i4_0 in T.serial(4): | |
for ax0_ax1_ax2_ax3_fused in T.serial(7168): | |
with T.block("data_pack_shared"): | |
v0 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused // 1792) | |
v1 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused % 1792 // 448) | |
v2 = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused * 14 + ax0_ax1_ax2_ax3_fused % 448 // 32) | |
v3 = T.axis.spatial(128, i4_0 * 32 + ax0_ax1_ax2_ax3_fused % 32) | |
T.reads(data_pack[v0, v1, v2, v3]) | |
T.writes(data_pack_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused in T.serial(65536): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused // 16384) | |
v1 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused % 16384 // 4096) | |
v2 = T.axis.spatial(128, ax0_ax1_ax2_ax3_fused % 4096 // 32) | |
v3 = T.axis.spatial(128, i4_0 * 32 + ax0_ax1_ax2_ax3_fused % 32) | |
T.reads(placeholder_2[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_2[v0, v1, v2, v3] | |
for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(8, 1, 1, 1, 2, 4, 4, 2, 14, 16): | |
with T.block("bgemm"): | |
eps = T.axis.spatial(4, i0_4) | |
nu = T.axis.spatial(4, i0_1_i1_1_i2_1_i3_1_fused // 2 * 2 + i1_4) | |
p = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused * 14 + i2_4) | |
co = T.axis.spatial(128, i0_1_i1_1_i2_1_i3_1_fused % 2 * 64 + i0_2_i1_2_i2_2_i3_2_fused * 32 + i3_3 * 16 + i3_4) | |
ci = T.axis.reduce(128, i4_0 * 32 + i4_1 * 4 + i4_2) | |
T.reads(bgemm_local[eps, nu, p, co], data_pack_shared[eps, nu, p, ci], placeholder_shared[eps, nu, co, ci]) | |
T.writes(bgemm_local[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_2], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) | |
with T.init(): | |
bgemm_local[eps, nu, p, co] = T.float32(0) | |
bgemm_local[eps, nu, p, co] = bgemm_local[eps, nu, p, co] + data_pack_shared[eps, nu, p, ci] * placeholder_shared[eps, nu, co, ci] | |
for ax0, ax1, ax2, ax3 in T.grid(4, 2, 14, 32): | |
with T.block("bgemm_local"): | |
v0 = T.axis.spatial(4, ax0) | |
v1 = T.axis.spatial(4, i0_1_i1_1_i2_1_i3_1_fused // 2 * 2 + ax1) | |
v2 = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused * 14 + ax2) | |
v3 = T.axis.spatial(128, i0_1_i1_1_i2_1_i3_1_fused % 2 * 64 + i0_2_i1_2_i2_2_i3_2_fused * 32 + ax3) | |
T.reads(bgemm_local[v0, v1, v2, v3]) | |
T.writes(bgemm[v0, v1, v2, v3]) | |
bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3] | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(98, 8, 2, 16): | |
for i0 in T.unroll(2): | |
for i1 in T.unroll(2): | |
for i4 in T.unroll(4): | |
for i5 in T.unroll(4): | |
with T.block("inverse"): | |
vh, vw = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(196, i2_0 * 2 + i2_1) | |
co = T.axis.spatial(128, i3_0 * 16 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 4 == 3 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 3 and vh % 2 == 0, T.float32(0), T.Select(r_a % 4 == 2 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 2 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 1 and vh % 2 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 0 and vh % 2 == 1, T.float32(0), T.Select(r_a % 4 == 0 and vh % 2 == 0, T.float32(1), T.float32(0))))))))) * T.Select(r_b % 4 == 3 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 3 and vw % 2 == 0, T.float32(0), T.Select(r_b % 4 == 2 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 2 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 1 and vw % 2 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 0 and vw % 2 == 1, T.float32(0), T.Select(r_b % 4 == 0 and vw % 2 == 0, T.float32(1), T.float32(0))))))))) | |
for i0, i1, i2, i3 in T.grid(1, 28, 28, 128): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 2, w % 2, n * 196 + h // 2 * 14 + w // 2, co], placeholder_1[n, 0, 0, co], placeholder[n, h, w, co]) | |
T.writes(T_relu[n, h, w, co]) | |
T_relu[n, h, w, co] = T.max(inverse[h % 2, w % 2, n * 196 + h // 2 * 14 + w // 2, co] + placeholder_1[n, 0, 0, co] + placeholder[n, h, w, co], T.float32(0)) | |
b0 = sch.get_block(name="B", func_name="main") | |
b1 = sch.get_block(name="data_pack", func_name="main") | |
b2 = sch.get_block(name="bgemm", func_name="main") | |
b3 = sch.get_block(name="A", func_name="main") | |
b4 = sch.get_block(name="inverse", func_name="main") | |
b5 = sch.get_block(name="T_add", func_name="main") | |
b6 = sch.get_block(name="T_add_1", func_name="main") | |
b7 = sch.get_block(name="T_relu", func_name="main") | |
b8 = sch.get_block(name="root", func_name="main") | |
sch.compute_inline(block=b0) | |
b9, = sch.get_producers(block=b1) | |
b10, = sch.get_producers(block=b9) | |
l11, l12, l13, l14, l15, l16 = sch.get_loops(block=b1) | |
v17, v18 = sch.sample_perfect_tile(loop=l13, n=2, max_innermost_factor=64, decision=[4, 49]) | |
l19, l20 = sch.split(loop=l13, factors=[v17, v18]) | |
v21, v22 = sch.sample_perfect_tile(loop=l14, n=2, max_innermost_factor=64, decision=[8, 16]) | |
l23, l24 = sch.split(loop=l14, factors=[v21, v22]) | |
sch.unroll(loop=l11) | |
sch.unroll(loop=l12) | |
sch.unroll(loop=l15) | |
sch.unroll(loop=l16) | |
sch.reorder(l19, l23, l20, l24, l11, l12, l15, l16) | |
sch.compute_at(block=b9, loop=l24, preserve_unit_loops=True) | |
sch.set_scope(block=b9, buffer_index=0, storage_scope="local") | |
sch.compute_inline(block=b10) | |
sch.compute_inline(block=b3) | |
l25, l26, l27, l28, l29, l30 = sch.get_loops(block=b4) | |
v31, v32 = sch.sample_perfect_tile(loop=l27, n=2, max_innermost_factor=64, decision=[98, 2]) | |
l33, l34 = sch.split(loop=l27, factors=[v31, v32]) | |
v35, v36 = sch.sample_perfect_tile(loop=l28, n=2, max_innermost_factor=64, decision=[8, 16]) | |
l37, l38 = sch.split(loop=l28, factors=[v35, v36]) | |
sch.unroll(loop=l25) | |
sch.unroll(loop=l26) | |
sch.unroll(loop=l29) | |
sch.unroll(loop=l30) | |
sch.reorder(l33, l37, l34, l38, l25, l26, l29, l30) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l39, l40, l41, l42, l43 = sch.get_loops(block=b2) | |
v44, v45, v46, v47, v48 = sch.sample_perfect_tile(loop=l39, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 4]) | |
l49, l50, l51, l52, l53 = sch.split(loop=l39, factors=[v44, v45, v46, v47, v48]) | |
v54, v55, v56, v57, v58 = sch.sample_perfect_tile(loop=l40, n=5, max_innermost_factor=64, decision=[1, 2, 1, 1, 2]) | |
l59, l60, l61, l62, l63 = sch.split(loop=l40, factors=[v54, v55, v56, v57, v58]) | |
v64, v65, v66, v67, v68 = sch.sample_perfect_tile(loop=l41, n=5, max_innermost_factor=64, decision=[14, 1, 1, 1, 14]) | |
l69, l70, l71, l72, l73 = sch.split(loop=l41, factors=[v64, v65, v66, v67, v68]) | |
v74, v75, v76, v77, v78 = sch.sample_perfect_tile(loop=l42, n=5, max_innermost_factor=64, decision=[1, 2, 2, 2, 16]) | |
l79, l80, l81, l82, l83 = sch.split(loop=l42, factors=[v74, v75, v76, v77, v78]) | |
v84, v85, v86 = sch.sample_perfect_tile(loop=l43, n=3, max_innermost_factor=64, decision=[4, 8, 4]) | |
l87, l88, l89 = sch.split(loop=l43, factors=[v84, v85, v86]) | |
sch.reorder(l49, l59, l69, l79, l50, l60, l70, l80, l51, l61, l71, l81, l87, l88, l52, l62, l72, l82, l89, l53, l63, l73, l83) | |
l90 = sch.fuse(l49, l59, l69, l79) | |
sch.bind(loop=l90, thread_axis="blockIdx.x") | |
l91 = sch.fuse(l50, l60, l70, l80) | |
sch.bind(loop=l91, thread_axis="vthread.x") | |
l92 = sch.fuse(l51, l61, l71, l81) | |
sch.bind(loop=l92, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b93 = sch.cache_write(block=b2, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b93, loop=l92, preserve_unit_loops=True) | |
b94 = sch.cache_read(block=b2, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b94, loop=l87, preserve_unit_loops=True) | |
l95, l96, l97, l98, l99, l100, l101, l102 = sch.get_loops(block=b94) | |
l103 = sch.fuse(l99, l100, l101, l102) | |
v104 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b94, ann_key="meta_schedule.cooperative_fetch", ann_val=v104) | |
b105 = sch.cache_read(block=b2, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b105, loop=l87, preserve_unit_loops=True) | |
l106, l107, l108, l109, l110, l111, l112, l113 = sch.get_loops(block=b105) | |
l114 = sch.fuse(l110, l111, l112, l113) | |
v115 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=2) | |
sch.annotate(block_or_loop=b105, ann_key="meta_schedule.cooperative_fetch", ann_val=v115) | |
sch.reverse_compute_inline(block=b7) | |
sch.reverse_compute_inline(block=b6) | |
sch.reverse_compute_inline(block=b5) | |
v116 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=4) | |
sch.annotate(block_or_loop=b8, ann_key="meta_schedule.unroll_explicit", ann_val=v116) | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 128), "float32"], placeholder_1: T.Buffer[(4, 4, 128, 128), "float32"], placeholder_2: T.Buffer[(1, 28, 28, 128), "float32"], T_relu: T.Buffer[(1, 28, 28, 128), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
data_pad = T.alloc_buffer([1, 30, 30, 128], dtype="float32") | |
input_tile = T.alloc_buffer([4, 4, 196, 128], dtype="float32") | |
B = T.alloc_buffer([4, 4], dtype="float32") | |
data_pack = T.alloc_buffer([4, 4, 196, 128], dtype="float32") | |
bgemm = T.alloc_buffer([4, 4, 196, 128], dtype="float32") | |
A = T.alloc_buffer([4, 2], dtype="float32") | |
inverse = T.alloc_buffer([2, 2, 196, 128], dtype="float32") | |
conv2d_winograd = T.alloc_buffer([1, 28, 28, 128], dtype="float32") | |
T_add = T.alloc_buffer([1, 28, 28, 128], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 30, 30, 128): | |
with T.block("data_pad"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) | |
T.writes(data_pad[i0_1, i1_1, i2_1, i3_1]) | |
T.block_attr({"schedule_rule":"None"}) | |
data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 29 and 1 <= i2_1 and i2_1 < 29, placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32") | |
for i0, i1, i2, i3 in T.grid(4, 4, 196, 128): | |
with T.block("input_tile"): | |
eps, nu, p, ci = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(data_pad[p // 196, p % 196 // 14 * 2 + eps, p % 14 * 2 + nu, ci]) | |
T.writes(input_tile[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile[eps, nu, p, ci] = data_pad[p // 196, p % 196 // 14 * 2 + eps, p % 14 * 2 + nu, ci] | |
for i0, i1 in T.grid(4, 4): | |
with T.block("B"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(B[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
B[i, j] = T.Select(i % 4 == 3 and j % 4 == 3, T.float32(1), T.Select(i % 4 == 3 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 0, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 3, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 1 and j % 4 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(4, 4, 196, 128, 4, 4): | |
with T.block("data_pack"): | |
eps, nu, p, ci, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile[r_a, r_b, p, ci], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile[r_a, r_b, p, ci] * B[r_a, eps] * B[r_b, nu] | |
for i0, i1, i2, i3, i4 in T.grid(4, 4, 196, 128, 128): | |
with T.block("bgemm"): | |
eps, nu, p, co, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4]) | |
T.reads(bgemm[eps, nu, p, co], data_pack[eps, nu, p, ci], placeholder_1[eps, nu, co, ci]) | |
T.writes(bgemm[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1]}) | |
with T.init(): | |
bgemm[eps, nu, p, co] = T.float32(0) | |
bgemm[eps, nu, p, co] = bgemm[eps, nu, p, co] + data_pack[eps, nu, p, ci] * placeholder_1[eps, nu, co, ci] | |
for i0, i1 in T.grid(4, 2): | |
with T.block("A"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(A[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
A[i, j] = T.Select(i % 4 == 3 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 3 and j % 2 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 1 and j % 2 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 0 and j % 2 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 2 == 0, T.float32(1), T.float32(0))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(2, 2, 196, 128, 4, 4): | |
with T.block("inverse"): | |
vh, vw, p, co, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * A[r_a, vh] * A[r_b, vw] | |
for i0, i1, i2, i3 in T.grid(1, 28, 28, 128): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 2, w % 2, n * 196 + h // 2 * 14 + w // 2, co]) | |
T.writes(conv2d_winograd[n, h, w, co]) | |
conv2d_winograd[n, h, w, co] = inverse[h % 2, w % 2, n * 196 + h // 2 * 14 + w // 2, co] | |
for i0, i1, i2, i3 in T.grid(1, 28, 28, 128): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], placeholder[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 28, 28, 128): | |
with T.block("T_relu"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add[ax0, ax1, ax2, ax3]) | |
T.writes(T_relu[ax0, ax1, ax2, ax3]) | |
T_relu[ax0, ax1, ax2, ax3] = T.max(T_add[ax0, ax1, ax2, ax3], T.float32(0)) | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 128), "float32"], placeholder_1: T.Buffer[(4, 4, 128, 128), "float32"], placeholder_2: T.Buffer[(1, 28, 28, 128), "float32"], T_relu: T.Buffer[(1, 28, 28, 128), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":512}) | |
input_tile_local = T.alloc_buffer([4, 4, 196, 128], dtype="float32", scope="local") | |
data_pack = T.alloc_buffer([4, 4, 196, 128], dtype="float32") | |
bgemm = T.alloc_buffer([4, 4, 196, 128], dtype="float32") | |
inverse = T.alloc_buffer([2, 2, 196, 128], dtype="float32") | |
bgemm_local = T.alloc_buffer([4, 4, 196, 128], dtype="float32", scope="local") | |
data_pack_shared = T.alloc_buffer([4, 4, 196, 128], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([4, 4, 128, 128], dtype="float32", scope="shared") | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(196, 8, 1, 16): | |
for ax0, ax1, ax2, ax3 in T.grid(4, 4, 1, 1): | |
with T.block("input_tile"): | |
eps, nu = T.axis.remap("SS", [ax0, ax1]) | |
p = T.axis.spatial(196, i2_0 + ax2) | |
ci = T.axis.spatial(128, i3_0 * 16 + i3_1 + ax3) | |
T.reads(placeholder_2[p // 196, p % 196 // 14 * 2 + eps - 1, p % 14 * 2 + nu - 1, ci]) | |
T.writes(input_tile_local[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile_local[eps, nu, p, ci] = T.if_then_else(1 <= p % 196 // 14 * 2 + eps and p % 196 // 14 * 2 + eps < 29 and 1 <= p % 14 * 2 + nu and p % 14 * 2 + nu < 29, placeholder_2[p // 196, p % 196 // 14 * 2 + eps - 1, p % 14 * 2 + nu - 1, ci], T.float32(0), dtype="float32") | |
for i0 in T.unroll(4): | |
for i1 in T.unroll(4): | |
for i4 in T.unroll(4): | |
for i5 in T.unroll(4): | |
with T.block("data_pack"): | |
eps, nu, p = T.axis.remap("SSS", [i0, i1, i2_0]) | |
ci = T.axis.spatial(128, i3_0 * 16 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile_local[r_a, r_b, p, ci]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile_local[r_a, r_b, p, ci] * T.Select(r_a % 4 == 3 and eps % 4 == 3, T.float32(1), T.Select(r_a % 4 == 3 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 1, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 0, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 3, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 1 and eps % 4 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) * T.Select(r_b % 4 == 3 and nu % 4 == 3, T.float32(1), T.Select(r_b % 4 == 3 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 1, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 0, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 3, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 1 and nu % 4 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(28, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(4, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(4, thread="threadIdx.x"): | |
for i4_0 in T.serial(16): | |
for ax0_ax1_ax2_ax3_fused in T.serial(896): | |
with T.block("data_pack_shared"): | |
v0 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 14 * 2 + ax0_ax1_ax2_ax3_fused // 448) | |
v1 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused % 448 // 112) | |
v2 = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused % 14 * 14 + ax0_ax1_ax2_ax3_fused % 112 // 8) | |
v3 = T.axis.spatial(128, i4_0 * 8 + ax0_ax1_ax2_ax3_fused % 8) | |
T.reads(data_pack[v0, v1, v2, v3]) | |
T.writes(data_pack_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused in T.serial(8192): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 14 * 2 + ax0_ax1_ax2_ax3_fused // 4096) | |
v1 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused % 4096 // 1024) | |
v2 = T.axis.spatial(128, ax0_ax1_ax2_ax3_fused % 1024 // 8) | |
v3 = T.axis.spatial(128, i4_0 * 8 + ax0_ax1_ax2_ax3_fused % 8) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(2, 1, 1, 2, 1, 4, 1, 1, 7, 64): | |
with T.block("bgemm"): | |
eps = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 14 * 2 + i0_2_i1_2_i2_2_i3_2_fused // 2) | |
nu = T.axis.spatial(4, i0_1_i1_1_i2_1_i3_1_fused) | |
p = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused % 14 * 14 + i2_3 * 7 + i2_4) | |
co = T.axis.spatial(128, i0_2_i1_2_i2_2_i3_2_fused % 2 * 64 + i3_4) | |
ci = T.axis.reduce(128, i4_0 * 8 + i4_1 * 4 + i4_2) | |
T.reads(bgemm_local[eps, nu, p, co], data_pack_shared[eps, nu, p, ci], placeholder_shared[eps, nu, co, ci]) | |
T.writes(bgemm_local[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) | |
with T.init(): | |
bgemm_local[eps, nu, p, co] = T.float32(0) | |
bgemm_local[eps, nu, p, co] = bgemm_local[eps, nu, p, co] + data_pack_shared[eps, nu, p, ci] * placeholder_shared[eps, nu, co, ci] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 1, 14, 64): | |
with T.block("bgemm_local"): | |
v0 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 14 * 2 + i0_2_i1_2_i2_2_i3_2_fused // 2 + ax0) | |
v1 = T.axis.spatial(4, i0_1_i1_1_i2_1_i3_1_fused + ax1) | |
v2 = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused % 14 * 14 + ax2) | |
v3 = T.axis.spatial(128, i0_2_i1_2_i2_2_i3_2_fused % 2 * 64 + ax3) | |
T.reads(bgemm_local[v0, v1, v2, v3]) | |
T.writes(bgemm[v0, v1, v2, v3]) | |
bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3] | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(98, 16, 2, 8): | |
for i0 in T.unroll(2): | |
for i1 in T.unroll(2): | |
for i4 in T.unroll(4): | |
for i5 in T.unroll(4): | |
with T.block("inverse"): | |
vh, vw = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(196, i2_0 * 2 + i2_1) | |
co = T.axis.spatial(128, i3_0 * 8 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 4 == 3 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 3 and vh % 2 == 0, T.float32(0), T.Select(r_a % 4 == 2 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 2 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 1 and vh % 2 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 0 and vh % 2 == 1, T.float32(0), T.Select(r_a % 4 == 0 and vh % 2 == 0, T.float32(1), T.float32(0))))))))) * T.Select(r_b % 4 == 3 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 3 and vw % 2 == 0, T.float32(0), T.Select(r_b % 4 == 2 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 2 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 1 and vw % 2 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 0 and vw % 2 == 1, T.float32(0), T.Select(r_b % 4 == 0 and vw % 2 == 0, T.float32(1), T.float32(0))))))))) | |
for i0, i1, i2, i3 in T.grid(1, 28, 28, 128): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 2, w % 2, n * 196 + h // 2 * 14 + w // 2, co], placeholder[n, 0, 0, co]) | |
T.writes(T_relu[n, h, w, co]) | |
T_relu[n, h, w, co] = T.max(inverse[h % 2, w % 2, n * 196 + h // 2 * 14 + w // 2, co] + placeholder[n, 0, 0, co], T.float32(0)) | |
b0 = sch.get_block(name="B", func_name="main") | |
b1 = sch.get_block(name="data_pack", func_name="main") | |
b2 = sch.get_block(name="bgemm", func_name="main") | |
b3 = sch.get_block(name="A", func_name="main") | |
b4 = sch.get_block(name="inverse", func_name="main") | |
b5 = sch.get_block(name="T_add", func_name="main") | |
b6 = sch.get_block(name="T_relu", func_name="main") | |
b7 = sch.get_block(name="root", func_name="main") | |
sch.compute_inline(block=b0) | |
b8, = sch.get_producers(block=b1) | |
b9, = sch.get_producers(block=b8) | |
l10, l11, l12, l13, l14, l15 = sch.get_loops(block=b1) | |
v16, v17 = sch.sample_perfect_tile(loop=l12, n=2, max_innermost_factor=64, decision=[196, 1]) | |
l18, l19 = sch.split(loop=l12, factors=[v16, v17]) | |
v20, v21 = sch.sample_perfect_tile(loop=l13, n=2, max_innermost_factor=64, decision=[8, 16]) | |
l22, l23 = sch.split(loop=l13, factors=[v20, v21]) | |
sch.unroll(loop=l10) | |
sch.unroll(loop=l11) | |
sch.unroll(loop=l14) | |
sch.unroll(loop=l15) | |
sch.reorder(l18, l22, l19, l23, l10, l11, l14, l15) | |
sch.compute_at(block=b8, loop=l23, preserve_unit_loops=True) | |
sch.set_scope(block=b8, buffer_index=0, storage_scope="local") | |
sch.compute_inline(block=b9) | |
sch.compute_inline(block=b3) | |
l24, l25, l26, l27, l28, l29 = sch.get_loops(block=b4) | |
v30, v31 = sch.sample_perfect_tile(loop=l26, n=2, max_innermost_factor=64, decision=[98, 2]) | |
l32, l33 = sch.split(loop=l26, factors=[v30, v31]) | |
v34, v35 = sch.sample_perfect_tile(loop=l27, n=2, max_innermost_factor=64, decision=[16, 8]) | |
l36, l37 = sch.split(loop=l27, factors=[v34, v35]) | |
sch.unroll(loop=l24) | |
sch.unroll(loop=l25) | |
sch.unroll(loop=l28) | |
sch.unroll(loop=l29) | |
sch.reorder(l32, l36, l33, l37, l24, l25, l28, l29) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l38, l39, l40, l41, l42 = sch.get_loops(block=b2) | |
v43, v44, v45, v46, v47 = sch.sample_perfect_tile(loop=l38, n=5, max_innermost_factor=64, decision=[2, 1, 2, 1, 1]) | |
l48, l49, l50, l51, l52 = sch.split(loop=l38, factors=[v43, v44, v45, v46, v47]) | |
v53, v54, v55, v56, v57 = sch.sample_perfect_tile(loop=l39, n=5, max_innermost_factor=64, decision=[1, 4, 1, 1, 1]) | |
l58, l59, l60, l61, l62 = sch.split(loop=l39, factors=[v53, v54, v55, v56, v57]) | |
v63, v64, v65, v66, v67 = sch.sample_perfect_tile(loop=l40, n=5, max_innermost_factor=64, decision=[14, 1, 1, 2, 7]) | |
l68, l69, l70, l71, l72 = sch.split(loop=l40, factors=[v63, v64, v65, v66, v67]) | |
v73, v74, v75, v76, v77 = sch.sample_perfect_tile(loop=l41, n=5, max_innermost_factor=64, decision=[1, 1, 2, 1, 64]) | |
l78, l79, l80, l81, l82 = sch.split(loop=l41, factors=[v73, v74, v75, v76, v77]) | |
v83, v84, v85 = sch.sample_perfect_tile(loop=l42, n=3, max_innermost_factor=64, decision=[16, 2, 4]) | |
l86, l87, l88 = sch.split(loop=l42, factors=[v83, v84, v85]) | |
sch.reorder(l48, l58, l68, l78, l49, l59, l69, l79, l50, l60, l70, l80, l86, l87, l51, l61, l71, l81, l88, l52, l62, l72, l82) | |
l89 = sch.fuse(l48, l58, l68, l78) | |
sch.bind(loop=l89, thread_axis="blockIdx.x") | |
l90 = sch.fuse(l49, l59, l69, l79) | |
sch.bind(loop=l90, thread_axis="vthread.x") | |
l91 = sch.fuse(l50, l60, l70, l80) | |
sch.bind(loop=l91, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b92 = sch.cache_write(block=b2, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b92, loop=l91, preserve_unit_loops=True) | |
b93 = sch.cache_read(block=b2, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b93, loop=l86, preserve_unit_loops=True) | |
l94, l95, l96, l97, l98, l99, l100, l101 = sch.get_loops(block=b93) | |
l102 = sch.fuse(l98, l99, l100, l101) | |
v103 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b93, ann_key="meta_schedule.cooperative_fetch", ann_val=v103) | |
b104 = sch.cache_read(block=b2, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b104, loop=l86, preserve_unit_loops=True) | |
l105, l106, l107, l108, l109, l110, l111, l112 = sch.get_loops(block=b104) | |
l113 = sch.fuse(l109, l110, l111, l112) | |
v114 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b104, ann_key="meta_schedule.cooperative_fetch", ann_val=v114) | |
sch.reverse_compute_inline(block=b6) | |
sch.reverse_compute_inline(block=b5) | |
v115 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=3) | |
sch.annotate(block_or_loop=b7, ann_key="meta_schedule.unroll_explicit", ann_val=v115) | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 256), "float32"], placeholder_1: T.Buffer[(3, 3, 128, 256), "float32"], placeholder_2: T.Buffer[(1, 28, 28, 128), "float32"], T_relu: T.Buffer[(1, 14, 14, 256), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
pad_temp = T.alloc_buffer([1, 30, 30, 128], dtype="float32") | |
conv2d_nhwc = T.alloc_buffer([1, 14, 14, 256], dtype="float32") | |
T_add = T.alloc_buffer([1, 14, 14, 256], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 30, 30, 128): | |
with T.block("pad_temp"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) | |
T.writes(pad_temp[i0_1, i1_1, i2_1, i3_1]) | |
pad_temp[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 29 and 1 <= i2_1 and i2_1 < 29, placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32") | |
for i0, i1, i2, i3, i4, i5, i6 in T.grid(1, 14, 14, 256, 3, 3, 128): | |
with T.block("conv2d_nhwc"): | |
nn, yy, xx, ff, ry, rx, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6]) | |
T.reads(conv2d_nhwc[nn, yy, xx, ff], pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_1[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 28, 28, 128], "float32"], ["TENSOR", [3, 3, 128, 256], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc[nn, yy, xx, ff] = conv2d_nhwc[nn, yy, xx, ff] + pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_1[ry, rx, rc, ff] | |
for i0, i1, i2, i3 in T.grid(1, 14, 14, 256): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_nhwc[ax0, ax1, ax2, ax3], placeholder[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_nhwc[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 14, 14, 256): | |
with T.block("T_relu"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add[ax0, ax1, ax2, ax3]) | |
T.writes(T_relu[ax0, ax1, ax2, ax3]) | |
T_relu[ax0, ax1, ax2, ax3] = T.max(T_add[ax0, ax1, ax2, ax3], T.float32(0)) | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 256), "float32"], placeholder_1: T.Buffer[(3, 3, 128, 256), "float32"], placeholder_2: T.Buffer[(1, 28, 28, 128), "float32"], T_relu: T.Buffer[(1, 14, 14, 256), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":512}) | |
conv2d_nhwc_local = T.alloc_buffer([1, 14, 14, 256], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 30, 30, 128], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([3, 3, 128, 256], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(1, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(2, thread="threadIdx.x"): | |
for i4_0, i5_0, i6_0 in T.grid(3, 3, 64): | |
for ax0_ax1_ax2_ax3_fused in T.serial(1458): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(30, i4_0 + ax0_ax1_ax2_ax3_fused % 1458 // 54) | |
v2 = T.axis.spatial(30, i5_0 + ax0_ax1_ax2_ax3_fused % 54 // 2) | |
v3 = T.axis.spatial(128, i6_0 * 2 + ax0_ax1_ax2_ax3_fused % 2) | |
T.reads(placeholder_2[v0, v1 - 1, v2 - 1, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(1 <= v1 and v1 < 29 and 1 <= v2 and v2 < 29, placeholder_2[v0, v1 - 1, v2 - 1, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused in T.serial(512): | |
with T.block("placeholder_shared"): | |
v0, v1 = T.axis.remap("SS", [i4_0, i5_0]) | |
v2 = T.axis.spatial(128, i6_0 * 2 + ax0_ax1_ax2_ax3_fused // 256) | |
v3 = T.axis.spatial(256, ax0_ax1_ax2_ax3_fused % 256) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":1}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 2, 1, 1, 1, 4, 1, 1, 1, 1, 14, 14, 32): | |
with T.block("conv2d_nhwc"): | |
nn = T.axis.spatial(1, 0) | |
yy, xx = T.axis.remap("SS", [i1_4, i2_4]) | |
ff = T.axis.spatial(256, i0_2_i1_2_i2_2_i3_2_fused * 128 + i3_3 * 32 + i3_4) | |
ry, rx = T.axis.remap("RR", [i4_0, i5_0]) | |
rc = T.axis.reduce(128, i6_0 * 2 + i6_1) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 28, 28, 128], "float32"], ["TENSOR", [3, 3, 128, 256], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 14, 14, 128): | |
with T.block("conv2d_nhwc_local"): | |
v0, v1, v2 = T.axis.remap("SSS", [ax0, ax1, ax2]) | |
v3 = T.axis.spatial(256, i0_2_i1_2_i2_2_i3_2_fused * 128 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 14]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 14]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[1, 1, 2, 4, 32]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[3, 1, 1]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[3, 1, 1]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[64, 2, 1]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=2) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=0) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=3) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #10: "vm_mod_fused_nn_conv2d_add_1", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 256), "float32"], placeholder_1: T.Buffer[(1, 1, 128, 256), "float32"], placeholder_2: T.Buffer[(1, 28, 28, 128), "float32"], T_add: T.Buffer[(1, 14, 14, 256), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
pad_temp = T.alloc_buffer([1, 28, 28, 128], dtype="float32") | |
conv2d_nhwc = T.alloc_buffer([1, 14, 14, 256], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 28, 28, 128): | |
with T.block("pad_temp"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_2[i0_1, i1_1, i2_1, i3_1]) | |
T.writes(pad_temp[i0_1, i1_1, i2_1, i3_1]) | |
pad_temp[i0_1, i1_1, i2_1, i3_1] = placeholder_2[i0_1, i1_1, i2_1, i3_1] | |
for i0, i1, i2, i3, i4, i5, i6 in T.grid(1, 14, 14, 256, 1, 1, 128): | |
with T.block("conv2d_nhwc"): | |
nn, yy, xx, ff, ry, rx, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6]) | |
T.reads(conv2d_nhwc[nn, yy, xx, ff], pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_1[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 28, 28, 128], "float32"], ["TENSOR", [1, 1, 128, 256], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc[nn, yy, xx, ff] = conv2d_nhwc[nn, yy, xx, ff] + pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_1[ry, rx, rc, ff] | |
for i0, i1, i2, i3 in T.grid(1, 14, 14, 256): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_nhwc[ax0, ax1, ax2, ax3], placeholder[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_nhwc[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3] | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 256), "float32"], placeholder_1: T.Buffer[(1, 1, 128, 256), "float32"], placeholder_2: T.Buffer[(1, 28, 28, 128), "float32"], T_add: T.Buffer[(1, 14, 14, 256), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":16}) | |
conv2d_nhwc_local = T.alloc_buffer([1, 14, 14, 256], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 28, 28, 128], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([1, 1, 128, 256], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(4, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(8, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(8, thread="threadIdx.x"): | |
for i4_0, i5_0, i6_0 in T.grid(1, 1, 16): | |
for ax0_ax1_ax2_ax3_fused in T.serial(2808): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused // 2 * 14 + ax0_ax1_ax2_ax3_fused % 2808 // 216) | |
v2 = T.axis.spatial(28, ax0_ax1_ax2_ax3_fused % 216 // 8) | |
v3 = T.axis.spatial(128, i6_0 * 8 + ax0_ax1_ax2_ax3_fused % 8) | |
T.reads(placeholder_2[v0, v1, v2, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
pad_temp_shared[v0, v1, v2, v3] = placeholder_2[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused in T.serial(1024): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(1, 0) | |
v2 = T.axis.spatial(128, i6_0 * 8 + ax0_ax1_ax2_ax3_fused // 128) | |
v3 = T.axis.spatial(256, i0_0_i1_0_i2_0_i3_0_fused % 2 * 128 + ax0_ax1_ax2_ax3_fused % 128) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":1}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 2, 1, 7, 1, 4, 1, 1, 4, 1, 1, 7, 1): | |
with T.block("conv2d_nhwc"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(14, i0_0_i1_0_i2_0_i3_0_fused // 2 * 7 + i1_3) | |
xx = T.axis.spatial(14, i0_1_i1_1_i2_1_i3_1_fused // 4 * 7 + i2_4) | |
ff = T.axis.spatial(256, i0_0_i1_0_i2_0_i3_0_fused % 2 * 128 + i0_1_i1_1_i2_1_i3_1_fused % 4 * 32 + i0_2_i1_2_i2_2_i3_2_fused * 4 + i3_3) | |
ry = T.axis.reduce(1, 0) | |
rx = T.axis.reduce(1, 0) | |
rc = T.axis.reduce(128, i6_0 * 8 + i6_1 * 4 + i6_2) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 28, 28, 128], "float32"], ["TENSOR", [1, 1, 128, 256], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 7, 7, 4): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(14, i0_0_i1_0_i2_0_i3_0_fused // 2 * 7 + ax1) | |
v2 = T.axis.spatial(14, i0_1_i1_1_i2_1_i3_1_fused // 4 * 7 + ax2) | |
v3 = T.axis.spatial(256, i0_0_i1_0_i2_0_i3_0_fused % 2 * 128 + i0_1_i1_1_i2_1_i3_1_fused % 4 * 32 + i0_2_i1_2_i2_2_i3_2_fused * 4 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_add[v0, v1, v2, v3]) | |
T_add[v0, v1, v2, v3] = conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3] | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l4, l5, l6, l7, l8, l9, l10 = sch.get_loops(block=b1) | |
v11, v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l4, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l16, l17, l18, l19, l20 = sch.split(loop=l4, factors=[v11, v12, v13, v14, v15]) | |
v21, v22, v23, v24, v25 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[2, 1, 1, 7, 1]) | |
l26, l27, l28, l29, l30 = sch.split(loop=l5, factors=[v21, v22, v23, v24, v25]) | |
v31, v32, v33, v34, v35 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[1, 2, 1, 1, 7]) | |
l36, l37, l38, l39, l40 = sch.split(loop=l6, factors=[v31, v32, v33, v34, v35]) | |
v41, v42, v43, v44, v45 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[2, 4, 8, 4, 1]) | |
l46, l47, l48, l49, l50 = sch.split(loop=l7, factors=[v41, v42, v43, v44, v45]) | |
v51, v52, v53 = sch.sample_perfect_tile(loop=l8, n=3, max_innermost_factor=64, decision=[1, 1, 1]) | |
l54, l55, l56 = sch.split(loop=l8, factors=[v51, v52, v53]) | |
v57, v58, v59 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[1, 1, 1]) | |
l60, l61, l62 = sch.split(loop=l9, factors=[v57, v58, v59]) | |
v63, v64, v65 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[16, 2, 4]) | |
l66, l67, l68 = sch.split(loop=l10, factors=[v63, v64, v65]) | |
sch.reorder(l16, l26, l36, l46, l17, l27, l37, l47, l18, l28, l38, l48, l54, l60, l66, l55, l61, l67, l19, l29, l39, l49, l56, l62, l68, l20, l30, l40, l50) | |
l69 = sch.fuse(l16, l26, l36, l46) | |
sch.bind(loop=l69, thread_axis="blockIdx.x") | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="vthread.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b72 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b72, loop=l71, preserve_unit_loops=True) | |
b73 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b73, loop=l66, preserve_unit_loops=True) | |
l74, l75, l76, l77, l78, l79, l80, l81, l82, l83 = sch.get_loops(block=b73) | |
l84 = sch.fuse(l80, l81, l82, l83) | |
v85 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=2) | |
sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v85) | |
b86 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b86, loop=l66, preserve_unit_loops=True) | |
l87, l88, l89, l90, l91, l92, l93, l94, l95, l96 = sch.get_loops(block=b86) | |
l97 = sch.fuse(l93, l94, l95, l96) | |
v98 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=0) | |
sch.annotate(block_or_loop=b86, ann_key="meta_schedule.cooperative_fetch", ann_val=v98) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v99 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=1) | |
sch.annotate(block_or_loop=b3, ann_key="meta_schedule.unroll_explicit", ann_val=v99) | |
[14:36:11] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 14, 14, 256), "float32"], placeholder_1: T.Buffer[(1, 1, 1, 256), "float32"], placeholder_2: T.Buffer[(4, 4, 256, 256), "float32"], placeholder_3: T.Buffer[(1, 14, 14, 256), "float32"], T_relu: T.Buffer[(1, 14, 14, 256), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
data_pad = T.alloc_buffer([1, 16, 16, 256], dtype="float32") | |
input_tile = T.alloc_buffer([4, 4, 49, 256], dtype="float32") | |
B = T.alloc_buffer([4, 4], dtype="float32") | |
data_pack = T.alloc_buffer([4, 4, 49, 256], dtype="float32") | |
bgemm = T.alloc_buffer([4, 4, 49, 256], dtype="float32") | |
A = T.alloc_buffer([4, 2], dtype="float32") | |
inverse = T.alloc_buffer([2, 2, 49, 256], dtype="float32") | |
conv2d_winograd = T.alloc_buffer([1, 14, 14, 256], dtype="float32") | |
T_add = T.alloc_buffer([1, 14, 14, 256], dtype="float32") | |
T_add_1 = T.alloc_buffer([1, 14, 14, 256], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 16, 16, 256): | |
with T.block("data_pad"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_3[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) | |
T.writes(data_pad[i0_1, i1_1, i2_1, i3_1]) | |
T.block_attr({"schedule_rule":"None"}) | |
data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 15 and 1 <= i2_1 and i2_1 < 15, placeholder_3[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32") | |
for i0, i1, i2, i3 in T.grid(4, 4, 49, 256): | |
with T.block("input_tile"): | |
eps, nu, p, ci = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(data_pad[p // 49, p % 49 // 7 * 2 + eps, p % 7 * 2 + nu, ci]) | |
T.writes(input_tile[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile[eps, nu, p, ci] = data_pad[p // 49, p % 49 // 7 * 2 + eps, p % 7 * 2 + nu, ci] | |
for i0, i1 in T.grid(4, 4): | |
with T.block("B"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(B[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
B[i, j] = T.Select(i % 4 == 3 and j % 4 == 3, T.float32(1), T.Select(i % 4 == 3 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 0, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 3, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 1 and j % 4 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(4, 4, 49, 256, 4, 4): | |
with T.block("data_pack"): | |
eps, nu, p, ci, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile[r_a, r_b, p, ci], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile[r_a, r_b, p, ci] * B[r_a, eps] * B[r_b, nu] | |
for i0, i1, i2, i3, i4 in T.grid(4, 4, 49, 256, 256): | |
with T.block("bgemm"): | |
eps, nu, p, co, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4]) | |
T.reads(bgemm[eps, nu, p, co], data_pack[eps, nu, p, ci], placeholder_2[eps, nu, co, ci]) | |
T.writes(bgemm[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_2]}) | |
with T.init(): | |
bgemm[eps, nu, p, co] = T.float32(0) | |
bgemm[eps, nu, p, co] = bgemm[eps, nu, p, co] + data_pack[eps, nu, p, ci] * placeholder_2[eps, nu, co, ci] | |
for i0, i1 in T.grid(4, 2): | |
with T.block("A"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(A[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
A[i, j] = T.Select(i % 4 == 3 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 3 and j % 2 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 1 and j % 2 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 0 and j % 2 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 2 == 0, T.float32(1), T.float32(0))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(2, 2, 49, 256, 4, 4): | |
with T.block("inverse"): | |
vh, vw, p, co, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * A[r_a, vh] * A[r_b, vw] | |
for i0, i1, i2, i3 in T.grid(1, 14, 14, 256): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 2, w % 2, n * 49 + h // 2 * 7 + w // 2, co]) | |
T.writes(conv2d_winograd[n, h, w, co]) | |
conv2d_winograd[n, h, w, co] = inverse[h % 2, w % 2, n * 49 + h // 2 * 7 + w // 2, co] | |
for i0, i1, i2, i3 in T.grid(1, 14, 14, 256): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], placeholder_1[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder_1[ax0, 0, 0, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 14, 14, 256): | |
with T.block("T_add_1"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add[ax0, ax1, ax2, ax3], placeholder[ax0, ax1, ax2, ax3]) | |
T.writes(T_add_1[ax0, ax1, ax2, ax3]) | |
T_add_1[ax0, ax1, ax2, ax3] = T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 14, 14, 256): | |
with T.block("T_relu"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add_1[ax0, ax1, ax2, ax3]) | |
T.writes(T_relu[ax0, ax1, ax2, ax3]) | |
T_relu[ax0, ax1, ax2, ax3] = T.max(T_add_1[ax0, ax1, ax2, ax3], T.float32(0)) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 14, 14, 256), "float32"], placeholder_1: T.Buffer[(1, 1, 1, 256), "float32"], placeholder_2: T.Buffer[(4, 4, 256, 256), "float32"], placeholder_3: T.Buffer[(1, 14, 14, 256), "float32"], T_relu: T.Buffer[(1, 14, 14, 256), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":0}) | |
input_tile_local = T.alloc_buffer([4, 4, 49, 256], dtype="float32", scope="local") | |
data_pack = T.alloc_buffer([4, 4, 49, 256], dtype="float32") | |
bgemm = T.alloc_buffer([4, 4, 49, 256], dtype="float32") | |
inverse = T.alloc_buffer([2, 2, 49, 256], dtype="float32") | |
bgemm_local = T.alloc_buffer([4, 4, 49, 256], dtype="float32", scope="local") | |
data_pack_shared = T.alloc_buffer([4, 4, 49, 256], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([4, 4, 256, 256], dtype="float32", scope="shared") | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(49, 16, 1, 16): | |
for ax0, ax1, ax2, ax3 in T.grid(4, 4, 1, 1): | |
with T.block("input_tile"): | |
eps, nu = T.axis.remap("SS", [ax0, ax1]) | |
p = T.axis.spatial(49, i2_0 + ax2) | |
ci = T.axis.spatial(256, i3_0 * 16 + i3_1 + ax3) | |
T.reads(placeholder_3[p // 49, p % 49 // 7 * 2 + eps - 1, p % 7 * 2 + nu - 1, ci]) | |
T.writes(input_tile_local[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile_local[eps, nu, p, ci] = T.if_then_else(1 <= p % 49 // 7 * 2 + eps and p % 49 // 7 * 2 + eps < 15 and 1 <= p % 7 * 2 + nu and p % 7 * 2 + nu < 15, placeholder_3[p // 49, p % 49 // 7 * 2 + eps - 1, p % 7 * 2 + nu - 1, ci], T.float32(0), dtype="float32") | |
for i0 in T.unroll(4): | |
for i1 in T.unroll(4): | |
for i4 in T.unroll(4): | |
for i5 in T.unroll(4): | |
with T.block("data_pack"): | |
eps, nu, p = T.axis.remap("SSS", [i0, i1, i2_0]) | |
ci = T.axis.spatial(256, i3_0 * 16 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile_local[r_a, r_b, p, ci]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile_local[r_a, r_b, p, ci] * T.Select(r_a % 4 == 3 and eps % 4 == 3, T.float32(1), T.Select(r_a % 4 == 3 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 1, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 0, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 3, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 1 and eps % 4 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) * T.Select(r_b % 4 == 3 and nu % 4 == 3, T.float32(1), T.Select(r_b % 4 == 3 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 1, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 0, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 3, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 1 and nu % 4 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(32, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(4, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(2, thread="threadIdx.x"): | |
for i4_0 in T.serial(4): | |
for ax0_ax1_ax2_ax3_fused in T.serial(50176): | |
with T.block("data_pack_shared"): | |
v0 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused // 12544) | |
v1 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused % 12544 // 3136) | |
v2 = T.axis.spatial(49, ax0_ax1_ax2_ax3_fused % 3136 // 64) | |
v3 = T.axis.spatial(256, i4_0 * 64 + ax0_ax1_ax2_ax3_fused % 64) | |
T.reads(data_pack[v0, v1, v2, v3]) | |
T.writes(data_pack_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused in T.serial(8192): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused // 2048) | |
v1 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused % 2048 // 512) | |
v2 = T.axis.spatial(256, i0_0_i1_0_i2_0_i3_0_fused * 8 + ax0_ax1_ax2_ax3_fused % 512 // 64) | |
v3 = T.axis.spatial(256, i4_0 * 64 + ax0_ax1_ax2_ax3_fused % 64) | |
T.reads(placeholder_2[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_2[v0, v1, v2, v3] | |
for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(16, 1, 1, 1, 1, 4, 1, 4, 49, 4): | |
with T.block("bgemm"): | |
eps = T.axis.spatial(4, i0_1_i1_1_i2_1_i3_1_fused // 2 * 2 + i0_2_i1_2_i2_2_i3_2_fused) | |
nu, p = T.axis.remap("SS", [i1_4, i2_4]) | |
co = T.axis.spatial(256, i0_0_i1_0_i2_0_i3_0_fused * 8 + i0_1_i1_1_i2_1_i3_1_fused % 2 * 4 + i3_4) | |
ci = T.axis.reduce(256, i4_0 * 64 + i4_1 * 4 + i4_2) | |
T.reads(bgemm_local[eps, nu, p, co], data_pack_shared[eps, nu, p, ci], placeholder_shared[eps, nu, co, ci]) | |
T.writes(bgemm_local[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_2], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) | |
with T.init(): | |
bgemm_local[eps, nu, p, co] = T.float32(0) | |
bgemm_local[eps, nu, p, co] = bgemm_local[eps, nu, p, co] + data_pack_shared[eps, nu, p, ci] * placeholder_shared[eps, nu, co, ci] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 4, 49, 4): | |
with T.block("bgemm_local"): | |
v0 = T.axis.spatial(4, i0_1_i1_1_i2_1_i3_1_fused // 2 * 2 + i0_2_i1_2_i2_2_i3_2_fused + ax0) | |
v1, v2 = T.axis.remap("SS", [ax1, ax2]) | |
v3 = T.axis.spatial(256, i0_0_i1_0_i2_0_i3_0_fused * 8 + i0_1_i1_1_i2_1_i3_1_fused % 2 * 4 + ax3) | |
T.reads(bgemm_local[v0, v1, v2, v3]) | |
T.writes(bgemm[v0, v1, v2, v3]) | |
bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3] | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(49, 4, 1, 64): | |
for i0 in T.unroll(2): | |
for i1 in T.unroll(2): | |
for i4 in T.unroll(4): | |
for i5 in T.unroll(4): | |
with T.block("inverse"): | |
vh, vw, p = T.axis.remap("SSS", [i0, i1, i2_0]) | |
co = T.axis.spatial(256, i3_0 * 64 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 4 == 3 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 3 and vh % 2 == 0, T.float32(0), T.Select(r_a % 4 == 2 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 2 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 1 and vh % 2 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 0 and vh % 2 == 1, T.float32(0), T.Select(r_a % 4 == 0 and vh % 2 == 0, T.float32(1), T.float32(0))))))))) * T.Select(r_b % 4 == 3 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 3 and vw % 2 == 0, T.float32(0), T.Select(r_b % 4 == 2 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 2 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 1 and vw % 2 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 0 and vw % 2 == 1, T.float32(0), T.Select(r_b % 4 == 0 and vw % 2 == 0, T.float32(1), T.float32(0))))))))) | |
for i0, i1, i2, i3 in T.grid(1, 14, 14, 256): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 2, w % 2, n * 49 + h // 2 * 7 + w // 2, co], placeholder_1[n, 0, 0, co], placeholder[n, h, w, co]) | |
T.writes(T_relu[n, h, w, co]) | |
T_relu[n, h, w, co] = T.max(inverse[h % 2, w % 2, n * 49 + h // 2 * 7 + w // 2, co] + placeholder_1[n, 0, 0, co] + placeholder[n, h, w, co], T.float32(0)) | |
b0 = sch.get_block(name="B", func_name="main") | |
b1 = sch.get_block(name="data_pack", func_name="main") | |
b2 = sch.get_block(name="bgemm", func_name="main") | |
b3 = sch.get_block(name="A", func_name="main") | |
b4 = sch.get_block(name="inverse", func_name="main") | |
b5 = sch.get_block(name="T_add", func_name="main") | |
b6 = sch.get_block(name="T_add_1", func_name="main") | |
b7 = sch.get_block(name="T_relu", func_name="main") | |
b8 = sch.get_block(name="root", func_name="main") | |
sch.compute_inline(block=b0) | |
b9, = sch.get_producers(block=b1) | |
b10, = sch.get_producers(block=b9) | |
l11, l12, l13, l14, l15, l16 = sch.get_loops(block=b1) | |
v17, v18 = sch.sample_perfect_tile(loop=l13, n=2, max_innermost_factor=64, decision=[49, 1]) | |
l19, l20 = sch.split(loop=l13, factors=[v17, v18]) | |
v21, v22 = sch.sample_perfect_tile(loop=l14, n=2, max_innermost_factor=64, decision=[16, 16]) | |
l23, l24 = sch.split(loop=l14, factors=[v21, v22]) | |
sch.unroll(loop=l11) | |
sch.unroll(loop=l12) | |
sch.unroll(loop=l15) | |
sch.unroll(loop=l16) | |
sch.reorder(l19, l23, l20, l24, l11, l12, l15, l16) | |
sch.compute_at(block=b9, loop=l24, preserve_unit_loops=True) | |
sch.set_scope(block=b9, buffer_index=0, storage_scope="local") | |
sch.compute_inline(block=b10) | |
sch.compute_inline(block=b3) | |
l25, l26, l27, l28, l29, l30 = sch.get_loops(block=b4) | |
v31, v32 = sch.sample_perfect_tile(loop=l27, n=2, max_innermost_factor=64, decision=[49, 1]) | |
l33, l34 = sch.split(loop=l27, factors=[v31, v32]) | |
v35, v36 = sch.sample_perfect_tile(loop=l28, n=2, max_innermost_factor=64, decision=[4, 64]) | |
l37, l38 = sch.split(loop=l28, factors=[v35, v36]) | |
sch.unroll(loop=l25) | |
sch.unroll(loop=l26) | |
sch.unroll(loop=l29) | |
sch.unroll(loop=l30) | |
sch.reorder(l33, l37, l34, l38, l25, l26, l29, l30) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l39, l40, l41, l42, l43 = sch.get_loops(block=b2) | |
v44, v45, v46, v47, v48 = sch.sample_perfect_tile(loop=l39, n=5, max_innermost_factor=64, decision=[1, 2, 2, 1, 1]) | |
l49, l50, l51, l52, l53 = sch.split(loop=l39, factors=[v44, v45, v46, v47, v48]) | |
v54, v55, v56, v57, v58 = sch.sample_perfect_tile(loop=l40, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 4]) | |
l59, l60, l61, l62, l63 = sch.split(loop=l40, factors=[v54, v55, v56, v57, v58]) | |
v64, v65, v66, v67, v68 = sch.sample_perfect_tile(loop=l41, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 49]) | |
l69, l70, l71, l72, l73 = sch.split(loop=l41, factors=[v64, v65, v66, v67, v68]) | |
v74, v75, v76, v77, v78 = sch.sample_perfect_tile(loop=l42, n=5, max_innermost_factor=64, decision=[32, 2, 1, 1, 4]) | |
l79, l80, l81, l82, l83 = sch.split(loop=l42, factors=[v74, v75, v76, v77, v78]) | |
v84, v85, v86 = sch.sample_perfect_tile(loop=l43, n=3, max_innermost_factor=64, decision=[4, 16, 4]) | |
l87, l88, l89 = sch.split(loop=l43, factors=[v84, v85, v86]) | |
sch.reorder(l49, l59, l69, l79, l50, l60, l70, l80, l51, l61, l71, l81, l87, l88, l52, l62, l72, l82, l89, l53, l63, l73, l83) | |
l90 = sch.fuse(l49, l59, l69, l79) | |
sch.bind(loop=l90, thread_axis="blockIdx.x") | |
l91 = sch.fuse(l50, l60, l70, l80) | |
sch.bind(loop=l91, thread_axis="vthread.x") | |
l92 = sch.fuse(l51, l61, l71, l81) | |
sch.bind(loop=l92, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b93 = sch.cache_write(block=b2, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b93, loop=l92, preserve_unit_loops=True) | |
b94 = sch.cache_read(block=b2, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b94, loop=l87, preserve_unit_loops=True) | |
l95, l96, l97, l98, l99, l100, l101, l102 = sch.get_loops(block=b94) | |
l103 = sch.fuse(l99, l100, l101, l102) | |
v104 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b94, ann_key="meta_schedule.cooperative_fetch", ann_val=v104) | |
b105 = sch.cache_read(block=b2, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b105, loop=l87, preserve_unit_loops=True) | |
l106, l107, l108, l109, l110, l111, l112, l113 = sch.get_loops(block=b105) | |
l114 = sch.fuse(l110, l111, l112, l113) | |
v115 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=2) | |
sch.annotate(block_or_loop=b105, ann_key="meta_schedule.cooperative_fetch", ann_val=v115) | |
sch.reverse_compute_inline(block=b7) | |
sch.reverse_compute_inline(block=b6) | |
sch.reverse_compute_inline(block=b5) | |
v116 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=0) | |
sch.annotate(block_or_loop=b8, ann_key="meta_schedule.unroll_explicit", ann_val=v116) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 256), "float32"], placeholder_1: T.Buffer[(4, 4, 256, 256), "float32"], placeholder_2: T.Buffer[(1, 14, 14, 256), "float32"], T_relu: T.Buffer[(1, 14, 14, 256), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
data_pad = T.alloc_buffer([1, 16, 16, 256], dtype="float32") | |
input_tile = T.alloc_buffer([4, 4, 49, 256], dtype="float32") | |
B = T.alloc_buffer([4, 4], dtype="float32") | |
data_pack = T.alloc_buffer([4, 4, 49, 256], dtype="float32") | |
bgemm = T.alloc_buffer([4, 4, 49, 256], dtype="float32") | |
A = T.alloc_buffer([4, 2], dtype="float32") | |
inverse = T.alloc_buffer([2, 2, 49, 256], dtype="float32") | |
conv2d_winograd = T.alloc_buffer([1, 14, 14, 256], dtype="float32") | |
T_add = T.alloc_buffer([1, 14, 14, 256], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 16, 16, 256): | |
with T.block("data_pad"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) | |
T.writes(data_pad[i0_1, i1_1, i2_1, i3_1]) | |
T.block_attr({"schedule_rule":"None"}) | |
data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 15 and 1 <= i2_1 and i2_1 < 15, placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32") | |
for i0, i1, i2, i3 in T.grid(4, 4, 49, 256): | |
with T.block("input_tile"): | |
eps, nu, p, ci = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(data_pad[p // 49, p % 49 // 7 * 2 + eps, p % 7 * 2 + nu, ci]) | |
T.writes(input_tile[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile[eps, nu, p, ci] = data_pad[p // 49, p % 49 // 7 * 2 + eps, p % 7 * 2 + nu, ci] | |
for i0, i1 in T.grid(4, 4): | |
with T.block("B"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(B[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
B[i, j] = T.Select(i % 4 == 3 and j % 4 == 3, T.float32(1), T.Select(i % 4 == 3 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 0, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 3, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 1 and j % 4 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(4, 4, 49, 256, 4, 4): | |
with T.block("data_pack"): | |
eps, nu, p, ci, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile[r_a, r_b, p, ci], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile[r_a, r_b, p, ci] * B[r_a, eps] * B[r_b, nu] | |
for i0, i1, i2, i3, i4 in T.grid(4, 4, 49, 256, 256): | |
with T.block("bgemm"): | |
eps, nu, p, co, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4]) | |
T.reads(bgemm[eps, nu, p, co], data_pack[eps, nu, p, ci], placeholder_1[eps, nu, co, ci]) | |
T.writes(bgemm[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1]}) | |
with T.init(): | |
bgemm[eps, nu, p, co] = T.float32(0) | |
bgemm[eps, nu, p, co] = bgemm[eps, nu, p, co] + data_pack[eps, nu, p, ci] * placeholder_1[eps, nu, co, ci] | |
for i0, i1 in T.grid(4, 2): | |
with T.block("A"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(A[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
A[i, j] = T.Select(i % 4 == 3 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 3 and j % 2 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 1 and j % 2 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 0 and j % 2 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 2 == 0, T.float32(1), T.float32(0))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(2, 2, 49, 256, 4, 4): | |
with T.block("inverse"): | |
vh, vw, p, co, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * A[r_a, vh] * A[r_b, vw] | |
for i0, i1, i2, i3 in T.grid(1, 14, 14, 256): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 2, w % 2, n * 49 + h // 2 * 7 + w // 2, co]) | |
T.writes(conv2d_winograd[n, h, w, co]) | |
conv2d_winograd[n, h, w, co] = inverse[h % 2, w % 2, n * 49 + h // 2 * 7 + w // 2, co] | |
for i0, i1, i2, i3 in T.grid(1, 14, 14, 256): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], placeholder[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 14, 14, 256): | |
with T.block("T_relu"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add[ax0, ax1, ax2, ax3]) | |
T.writes(T_relu[ax0, ax1, ax2, ax3]) | |
T_relu[ax0, ax1, ax2, ax3] = T.max(T_add[ax0, ax1, ax2, ax3], T.float32(0)) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 256), "float32"], placeholder_1: T.Buffer[(4, 4, 256, 256), "float32"], placeholder_2: T.Buffer[(1, 14, 14, 256), "float32"], T_relu: T.Buffer[(1, 14, 14, 256), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":1024}) | |
input_tile_local = T.alloc_buffer([4, 4, 49, 256], dtype="float32", scope="local") | |
data_pack = T.alloc_buffer([4, 4, 49, 256], dtype="float32") | |
bgemm = T.alloc_buffer([4, 4, 49, 256], dtype="float32") | |
inverse = T.alloc_buffer([2, 2, 49, 256], dtype="float32") | |
bgemm_local = T.alloc_buffer([4, 4, 49, 256], dtype="float32", scope="local") | |
data_pack_shared = T.alloc_buffer([4, 4, 49, 256], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([4, 4, 256, 256], dtype="float32", scope="shared") | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(1, 16, 49, 16): | |
for ax0, ax1, ax2, ax3 in T.grid(4, 4, 1, 1): | |
with T.block("input_tile"): | |
eps, nu = T.axis.remap("SS", [ax0, ax1]) | |
p = T.axis.spatial(49, i2_1 + ax2) | |
ci = T.axis.spatial(256, i3_0 * 16 + i3_1 + ax3) | |
T.reads(placeholder_2[p // 49, p % 49 // 7 * 2 + eps - 1, p % 7 * 2 + nu - 1, ci]) | |
T.writes(input_tile_local[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile_local[eps, nu, p, ci] = T.if_then_else(1 <= p % 49 // 7 * 2 + eps and p % 49 // 7 * 2 + eps < 15 and 1 <= p % 7 * 2 + nu and p % 7 * 2 + nu < 15, placeholder_2[p // 49, p % 49 // 7 * 2 + eps - 1, p % 7 * 2 + nu - 1, ci], T.float32(0), dtype="float32") | |
for i0 in T.unroll(4): | |
for i1 in T.unroll(4): | |
for i4 in T.unroll(4): | |
for i5 in T.unroll(4): | |
with T.block("data_pack"): | |
eps, nu, p = T.axis.remap("SSS", [i0, i1, i2_1]) | |
ci = T.axis.spatial(256, i3_0 * 16 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile_local[r_a, r_b, p, ci]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile_local[r_a, r_b, p, ci] * T.Select(r_a % 4 == 3 and eps % 4 == 3, T.float32(1), T.Select(r_a % 4 == 3 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 1, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 0, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 3, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 1 and eps % 4 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) * T.Select(r_b % 4 == 3 and nu % 4 == 3, T.float32(1), T.Select(r_b % 4 == 3 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 1, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 0, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 3, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 1 and nu % 4 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(2, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(4, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(1, thread="threadIdx.x"): | |
for i4_0 in T.serial(2): | |
for ax0_ax1_ax2_ax3_fused in T.serial(100352): | |
with T.block("data_pack_shared"): | |
v0 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused // 25088) | |
v1 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused % 25088 // 6272) | |
v2 = T.axis.spatial(49, ax0_ax1_ax2_ax3_fused % 6272 // 128) | |
v3 = T.axis.spatial(256, i4_0 * 128 + ax0_ax1_ax2_ax3_fused % 128) | |
T.reads(data_pack[v0, v1, v2, v3]) | |
T.writes(data_pack_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused in T.serial(262144): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused // 65536) | |
v1 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused % 65536 // 16384) | |
v2 = T.axis.spatial(256, i0_0_i1_0_i2_0_i3_0_fused * 128 + ax0_ax1_ax2_ax3_fused % 16384 // 128) | |
v3 = T.axis.spatial(256, i4_0 * 128 + ax0_ax1_ax2_ax3_fused % 128) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(2, 1, 1, 1, 2, 64, 4, 2, 49, 32): | |
with T.block("bgemm"): | |
eps = T.axis.spatial(4, i0_4) | |
nu = T.axis.spatial(4, i0_1_i1_1_i2_1_i3_1_fused // 2 * 2 + i1_4) | |
p = T.axis.spatial(49, i2_4) | |
co = T.axis.spatial(256, i0_0_i1_0_i2_0_i3_0_fused * 128 + i0_1_i1_1_i2_1_i3_1_fused % 2 * 64 + i3_3 * 32 + i3_4) | |
ci = T.axis.reduce(256, i4_0 * 128 + i4_1 * 64 + i4_2) | |
T.reads(bgemm_local[eps, nu, p, co], data_pack_shared[eps, nu, p, ci], placeholder_shared[eps, nu, co, ci]) | |
T.writes(bgemm_local[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) | |
with T.init(): | |
bgemm_local[eps, nu, p, co] = T.float32(0) | |
bgemm_local[eps, nu, p, co] = bgemm_local[eps, nu, p, co] + data_pack_shared[eps, nu, p, ci] * placeholder_shared[eps, nu, co, ci] | |
for ax0, ax1, ax2, ax3 in T.grid(4, 2, 49, 64): | |
with T.block("bgemm_local"): | |
v0 = T.axis.spatial(4, ax0) | |
v1 = T.axis.spatial(4, i0_1_i1_1_i2_1_i3_1_fused // 2 * 2 + ax1) | |
v2 = T.axis.spatial(49, ax2) | |
v3 = T.axis.spatial(256, i0_0_i1_0_i2_0_i3_0_fused * 128 + i0_1_i1_1_i2_1_i3_1_fused % 2 * 64 + ax3) | |
T.reads(bgemm_local[v0, v1, v2, v3]) | |
T.writes(bgemm[v0, v1, v2, v3]) | |
bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3] | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(7, 8, 7, 32): | |
for i0 in T.unroll(2): | |
for i1 in T.unroll(2): | |
for i4 in T.unroll(4): | |
for i5 in T.unroll(4): | |
with T.block("inverse"): | |
vh, vw = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(49, i2_0 * 7 + i2_1) | |
co = T.axis.spatial(256, i3_0 * 32 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 4 == 3 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 3 and vh % 2 == 0, T.float32(0), T.Select(r_a % 4 == 2 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 2 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 1 and vh % 2 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 0 and vh % 2 == 1, T.float32(0), T.Select(r_a % 4 == 0 and vh % 2 == 0, T.float32(1), T.float32(0))))))))) * T.Select(r_b % 4 == 3 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 3 and vw % 2 == 0, T.float32(0), T.Select(r_b % 4 == 2 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 2 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 1 and vw % 2 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 0 and vw % 2 == 1, T.float32(0), T.Select(r_b % 4 == 0 and vw % 2 == 0, T.float32(1), T.float32(0))))))))) | |
for i0, i1, i2, i3 in T.grid(1, 14, 14, 256): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 2, w % 2, n * 49 + h // 2 * 7 + w // 2, co], placeholder[n, 0, 0, co]) | |
T.writes(T_relu[n, h, w, co]) | |
T_relu[n, h, w, co] = T.max(inverse[h % 2, w % 2, n * 49 + h // 2 * 7 + w // 2, co] + placeholder[n, 0, 0, co], T.float32(0)) | |
b0 = sch.get_block(name="B", func_name="main") | |
b1 = sch.get_block(name="data_pack", func_name="main") | |
b2 = sch.get_block(name="bgemm", func_name="main") | |
b3 = sch.get_block(name="A", func_name="main") | |
b4 = sch.get_block(name="inverse", func_name="main") | |
b5 = sch.get_block(name="T_add", func_name="main") | |
b6 = sch.get_block(name="T_relu", func_name="main") | |
b7 = sch.get_block(name="root", func_name="main") | |
sch.compute_inline(block=b0) | |
b8, = sch.get_producers(block=b1) | |
b9, = sch.get_producers(block=b8) | |
l10, l11, l12, l13, l14, l15 = sch.get_loops(block=b1) | |
v16, v17 = sch.sample_perfect_tile(loop=l12, n=2, max_innermost_factor=64, decision=[1, 49]) | |
l18, l19 = sch.split(loop=l12, factors=[v16, v17]) | |
v20, v21 = sch.sample_perfect_tile(loop=l13, n=2, max_innermost_factor=64, decision=[16, 16]) | |
l22, l23 = sch.split(loop=l13, factors=[v20, v21]) | |
sch.unroll(loop=l10) | |
sch.unroll(loop=l11) | |
sch.unroll(loop=l14) | |
sch.unroll(loop=l15) | |
sch.reorder(l18, l22, l19, l23, l10, l11, l14, l15) | |
sch.compute_at(block=b8, loop=l23, preserve_unit_loops=True) | |
sch.set_scope(block=b8, buffer_index=0, storage_scope="local") | |
sch.compute_inline(block=b9) | |
sch.compute_inline(block=b3) | |
l24, l25, l26, l27, l28, l29 = sch.get_loops(block=b4) | |
v30, v31 = sch.sample_perfect_tile(loop=l26, n=2, max_innermost_factor=64, decision=[7, 7]) | |
l32, l33 = sch.split(loop=l26, factors=[v30, v31]) | |
v34, v35 = sch.sample_perfect_tile(loop=l27, n=2, max_innermost_factor=64, decision=[8, 32]) | |
l36, l37 = sch.split(loop=l27, factors=[v34, v35]) | |
sch.unroll(loop=l24) | |
sch.unroll(loop=l25) | |
sch.unroll(loop=l28) | |
sch.unroll(loop=l29) | |
sch.reorder(l32, l36, l33, l37, l24, l25, l28, l29) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l38, l39, l40, l41, l42 = sch.get_loops(block=b2) | |
v43, v44, v45, v46, v47 = sch.sample_perfect_tile(loop=l38, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 4]) | |
l48, l49, l50, l51, l52 = sch.split(loop=l38, factors=[v43, v44, v45, v46, v47]) | |
v53, v54, v55, v56, v57 = sch.sample_perfect_tile(loop=l39, n=5, max_innermost_factor=64, decision=[1, 2, 1, 1, 2]) | |
l58, l59, l60, l61, l62 = sch.split(loop=l39, factors=[v53, v54, v55, v56, v57]) | |
v63, v64, v65, v66, v67 = sch.sample_perfect_tile(loop=l40, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 49]) | |
l68, l69, l70, l71, l72 = sch.split(loop=l40, factors=[v63, v64, v65, v66, v67]) | |
v73, v74, v75, v76, v77 = sch.sample_perfect_tile(loop=l41, n=5, max_innermost_factor=64, decision=[2, 2, 1, 2, 32]) | |
l78, l79, l80, l81, l82 = sch.split(loop=l41, factors=[v73, v74, v75, v76, v77]) | |
v83, v84, v85 = sch.sample_perfect_tile(loop=l42, n=3, max_innermost_factor=64, decision=[2, 2, 64]) | |
l86, l87, l88 = sch.split(loop=l42, factors=[v83, v84, v85]) | |
sch.reorder(l48, l58, l68, l78, l49, l59, l69, l79, l50, l60, l70, l80, l86, l87, l51, l61, l71, l81, l88, l52, l62, l72, l82) | |
l89 = sch.fuse(l48, l58, l68, l78) | |
sch.bind(loop=l89, thread_axis="blockIdx.x") | |
l90 = sch.fuse(l49, l59, l69, l79) | |
sch.bind(loop=l90, thread_axis="vthread.x") | |
l91 = sch.fuse(l50, l60, l70, l80) | |
sch.bind(loop=l91, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b92 = sch.cache_write(block=b2, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b92, loop=l91, preserve_unit_loops=True) | |
b93 = sch.cache_read(block=b2, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b93, loop=l86, preserve_unit_loops=True) | |
l94, l95, l96, l97, l98, l99, l100, l101 = sch.get_loops(block=b93) | |
l102 = sch.fuse(l98, l99, l100, l101) | |
v103 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=2) | |
sch.annotate(block_or_loop=b93, ann_key="meta_schedule.cooperative_fetch", ann_val=v103) | |
b104 = sch.cache_read(block=b2, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b104, loop=l86, preserve_unit_loops=True) | |
l105, l106, l107, l108, l109, l110, l111, l112 = sch.get_loops(block=b104) | |
l113 = sch.fuse(l109, l110, l111, l112) | |
v114 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b104, ann_key="meta_schedule.cooperative_fetch", ann_val=v114) | |
sch.reverse_compute_inline(block=b6) | |
sch.reverse_compute_inline(block=b5) | |
v115 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=4) | |
sch.annotate(block_or_loop=b7, ann_key="meta_schedule.unroll_explicit", ann_val=v115) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 512), "float32"], placeholder_1: T.Buffer[(3, 3, 256, 512), "float32"], placeholder_2: T.Buffer[(1, 14, 14, 256), "float32"], T_relu: T.Buffer[(1, 7, 7, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
pad_temp = T.alloc_buffer([1, 16, 16, 256], dtype="float32") | |
conv2d_nhwc = T.alloc_buffer([1, 7, 7, 512], dtype="float32") | |
T_add = T.alloc_buffer([1, 7, 7, 512], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 16, 16, 256): | |
with T.block("pad_temp"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) | |
T.writes(pad_temp[i0_1, i1_1, i2_1, i3_1]) | |
pad_temp[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 15 and 1 <= i2_1 and i2_1 < 15, placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32") | |
for i0, i1, i2, i3, i4, i5, i6 in T.grid(1, 7, 7, 512, 3, 3, 256): | |
with T.block("conv2d_nhwc"): | |
nn, yy, xx, ff, ry, rx, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6]) | |
T.reads(conv2d_nhwc[nn, yy, xx, ff], pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_1[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 14, 14, 256], "float32"], ["TENSOR", [3, 3, 256, 512], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc[nn, yy, xx, ff] = conv2d_nhwc[nn, yy, xx, ff] + pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_1[ry, rx, rc, ff] | |
for i0, i1, i2, i3 in T.grid(1, 7, 7, 512): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_nhwc[ax0, ax1, ax2, ax3], placeholder[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_nhwc[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 7, 7, 512): | |
with T.block("T_relu"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add[ax0, ax1, ax2, ax3]) | |
T.writes(T_relu[ax0, ax1, ax2, ax3]) | |
T_relu[ax0, ax1, ax2, ax3] = T.max(T_add[ax0, ax1, ax2, ax3], T.float32(0)) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 512), "float32"], placeholder_1: T.Buffer[(3, 3, 256, 512), "float32"], placeholder_2: T.Buffer[(1, 14, 14, 256), "float32"], T_relu: T.Buffer[(1, 7, 7, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":64}) | |
conv2d_nhwc_local = T.alloc_buffer([1, 7, 7, 512], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 16, 16, 256], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([3, 3, 256, 512], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(28, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(4, thread="threadIdx.x"): | |
for i4_0, i5_0, i6_0 in T.grid(1, 1, 8): | |
for ax0_ax1_ax2_ax3_fused in T.serial(1440): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(16, i0_0_i1_0_i2_0_i3_0_fused // 4 * 2 + ax0_ax1_ax2_ax3_fused % 1440 // 480) | |
v2 = T.axis.spatial(16, ax0_ax1_ax2_ax3_fused % 480 // 32) | |
v3 = T.axis.spatial(256, i6_0 * 32 + ax0_ax1_ax2_ax3_fused % 32) | |
T.reads(placeholder_2[v0, v1 - 1, v2 - 1, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(1 <= v1 and v1 < 15 and 1 <= v2 and v2 < 15, placeholder_2[v0, v1 - 1, v2 - 1, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused in T.serial(36864): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused // 12288) | |
v1 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused % 12288 // 4096) | |
v2 = T.axis.spatial(256, i6_0 * 32 + ax0_ax1_ax2_ax3_fused % 4096 // 128) | |
v3 = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused % 4 * 128 + ax0_ax1_ax2_ax3_fused % 128) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 3, 1, 1, 1, 1, 2, 3, 1, 32, 1, 1, 7, 16): | |
with T.block("conv2d_nhwc"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(7, i0_0_i1_0_i2_0_i3_0_fused // 4) | |
xx = T.axis.spatial(7, i2_4) | |
ff = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused % 4 * 128 + i0_2_i1_2_i2_2_i3_2_fused * 32 + i3_3 * 16 + i3_4) | |
ry, rx = T.axis.remap("RR", [i4_2, i5_1]) | |
rc = T.axis.reduce(256, i6_0 * 32 + i6_2) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 14, 14, 256], "float32"], ["TENSOR", [3, 3, 256, 512], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 1, 7, 32): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(7, i0_0_i1_0_i2_0_i3_0_fused // 4 + ax1) | |
v2 = T.axis.spatial(7, ax2) | |
v3 = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused % 4 * 128 + i0_2_i1_2_i2_2_i3_2_fused * 32 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[7, 1, 1, 1, 1]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 7]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[4, 1, 4, 2, 16]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[1, 1, 3]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[1, 3, 1]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[8, 1, 32]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=2) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=2) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #14: "vm_mod_fused_nn_conv2d_add_2", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 512), "float32"], placeholder_1: T.Buffer[(1, 1, 256, 512), "float32"], placeholder_2: T.Buffer[(1, 14, 14, 256), "float32"], T_add: T.Buffer[(1, 7, 7, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
pad_temp = T.alloc_buffer([1, 14, 14, 256], dtype="float32") | |
conv2d_nhwc = T.alloc_buffer([1, 7, 7, 512], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 14, 14, 256): | |
with T.block("pad_temp"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_2[i0_1, i1_1, i2_1, i3_1]) | |
T.writes(pad_temp[i0_1, i1_1, i2_1, i3_1]) | |
pad_temp[i0_1, i1_1, i2_1, i3_1] = placeholder_2[i0_1, i1_1, i2_1, i3_1] | |
for i0, i1, i2, i3, i4, i5, i6 in T.grid(1, 7, 7, 512, 1, 1, 256): | |
with T.block("conv2d_nhwc"): | |
nn, yy, xx, ff, ry, rx, rc = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6]) | |
T.reads(conv2d_nhwc[nn, yy, xx, ff], pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_1[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 14, 14, 256], "float32"], ["TENSOR", [1, 1, 256, 512], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc[nn, yy, xx, ff] = conv2d_nhwc[nn, yy, xx, ff] + pad_temp[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_1[ry, rx, rc, ff] | |
for i0, i1, i2, i3 in T.grid(1, 7, 7, 512): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_nhwc[ax0, ax1, ax2, ax3], placeholder[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_nhwc[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3] | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 512), "float32"], placeholder_1: T.Buffer[(1, 1, 256, 512), "float32"], placeholder_2: T.Buffer[(1, 14, 14, 256), "float32"], T_add: T.Buffer[(1, 7, 7, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":0}) | |
conv2d_nhwc_local = T.alloc_buffer([1, 7, 7, 512], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 14, 14, 256], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([1, 1, 256, 512], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(28, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(4, thread="threadIdx.x"): | |
for i4_0, i5_0, i6_0 in T.grid(1, 1, 32): | |
for ax0_ax1_ax2_ax3_fused in T.serial(104): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(14, i0_0_i1_0_i2_0_i3_0_fused // 4 * 2) | |
v2 = T.axis.spatial(14, ax0_ax1_ax2_ax3_fused // 8) | |
v3 = T.axis.spatial(256, i6_0 * 8 + ax0_ax1_ax2_ax3_fused % 8) | |
T.reads(placeholder_2[v0, v1, v2, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
pad_temp_shared[v0, v1, v2, v3] = placeholder_2[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused in T.serial(1024): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(1, 0) | |
v2 = T.axis.spatial(256, i6_0 * 8 + ax0_ax1_ax2_ax3_fused // 128) | |
v3 = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused % 4 * 128 + ax0_ax1_ax2_ax3_fused % 128) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 4, 1, 1, 7, 2, 1, 1, 2, 1, 1, 1, 16): | |
with T.block("conv2d_nhwc"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(7, i0_0_i1_0_i2_0_i3_0_fused // 4) | |
xx = T.axis.spatial(7, i2_3) | |
ff = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused % 4 * 128 + i0_2_i1_2_i2_2_i3_2_fused * 32 + i3_3 * 16 + i3_4) | |
ry = T.axis.reduce(1, 0) | |
rx = T.axis.reduce(1, 0) | |
rc = T.axis.reduce(256, i6_0 * 8 + i6_1 * 2 + i6_2) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 14, 14, 256], "float32"], ["TENSOR", [1, 1, 256, 512], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"]}) | |
with T.init(): | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 1, 7, 32): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(7, i0_0_i1_0_i2_0_i3_0_fused // 4 + ax1) | |
v2 = T.axis.spatial(7, ax2) | |
v3 = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused % 4 * 128 + i0_2_i1_2_i2_2_i3_2_fused * 32 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_add[v0, v1, v2, v3]) | |
T_add[v0, v1, v2, v3] = conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3] | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l4, l5, l6, l7, l8, l9, l10 = sch.get_loops(block=b1) | |
v11, v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l4, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l16, l17, l18, l19, l20 = sch.split(loop=l4, factors=[v11, v12, v13, v14, v15]) | |
v21, v22, v23, v24, v25 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[7, 1, 1, 1, 1]) | |
l26, l27, l28, l29, l30 = sch.split(loop=l5, factors=[v21, v22, v23, v24, v25]) | |
v31, v32, v33, v34, v35 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[1, 1, 1, 7, 1]) | |
l36, l37, l38, l39, l40 = sch.split(loop=l6, factors=[v31, v32, v33, v34, v35]) | |
v41, v42, v43, v44, v45 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[4, 1, 4, 2, 16]) | |
l46, l47, l48, l49, l50 = sch.split(loop=l7, factors=[v41, v42, v43, v44, v45]) | |
v51, v52, v53 = sch.sample_perfect_tile(loop=l8, n=3, max_innermost_factor=64, decision=[1, 1, 1]) | |
l54, l55, l56 = sch.split(loop=l8, factors=[v51, v52, v53]) | |
v57, v58, v59 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[1, 1, 1]) | |
l60, l61, l62 = sch.split(loop=l9, factors=[v57, v58, v59]) | |
v63, v64, v65 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[32, 4, 2]) | |
l66, l67, l68 = sch.split(loop=l10, factors=[v63, v64, v65]) | |
sch.reorder(l16, l26, l36, l46, l17, l27, l37, l47, l18, l28, l38, l48, l54, l60, l66, l55, l61, l67, l19, l29, l39, l49, l56, l62, l68, l20, l30, l40, l50) | |
l69 = sch.fuse(l16, l26, l36, l46) | |
sch.bind(loop=l69, thread_axis="blockIdx.x") | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="vthread.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b72 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b72, loop=l71, preserve_unit_loops=True) | |
b73 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b73, loop=l66, preserve_unit_loops=True) | |
l74, l75, l76, l77, l78, l79, l80, l81, l82, l83 = sch.get_loops(block=b73) | |
l84 = sch.fuse(l80, l81, l82, l83) | |
v85 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v85) | |
b86 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b86, loop=l66, preserve_unit_loops=True) | |
l87, l88, l89, l90, l91, l92, l93, l94, l95, l96 = sch.get_loops(block=b86) | |
l97 = sch.fuse(l93, l94, l95, l96) | |
v98 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b86, ann_key="meta_schedule.cooperative_fetch", ann_val=v98) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v99 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=0) | |
sch.annotate(block_or_loop=b3, ann_key="meta_schedule.unroll_explicit", ann_val=v99) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 7, 7, 512), "float32"], placeholder_1: T.Buffer[(1, 1, 1, 512), "float32"], placeholder_2: T.Buffer[(4, 4, 512, 512), "float32"], placeholder_3: T.Buffer[(1, 7, 7, 512), "float32"], T_relu: T.Buffer[(1, 7, 7, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
data_pad = T.alloc_buffer([1, 10, 10, 512], dtype="float32") | |
input_tile = T.alloc_buffer([4, 4, 16, 512], dtype="float32") | |
B = T.alloc_buffer([4, 4], dtype="float32") | |
data_pack = T.alloc_buffer([4, 4, 16, 512], dtype="float32") | |
bgemm = T.alloc_buffer([4, 4, 16, 512], dtype="float32") | |
A = T.alloc_buffer([4, 2], dtype="float32") | |
inverse = T.alloc_buffer([2, 2, 16, 512], dtype="float32") | |
conv2d_winograd = T.alloc_buffer([1, 7, 7, 512], dtype="float32") | |
T_add = T.alloc_buffer([1, 7, 7, 512], dtype="float32") | |
T_add_1 = T.alloc_buffer([1, 7, 7, 512], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 10, 10, 512): | |
with T.block("data_pad"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_3[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) | |
T.writes(data_pad[i0_1, i1_1, i2_1, i3_1]) | |
T.block_attr({"schedule_rule":"None"}) | |
data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 8 and 1 <= i2_1 and i2_1 < 8, placeholder_3[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32") | |
for i0, i1, i2, i3 in T.grid(4, 4, 16, 512): | |
with T.block("input_tile"): | |
eps, nu, p, ci = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(data_pad[p // 16, p % 16 // 4 * 2 + eps, p % 4 * 2 + nu, ci]) | |
T.writes(input_tile[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile[eps, nu, p, ci] = data_pad[p // 16, p % 16 // 4 * 2 + eps, p % 4 * 2 + nu, ci] | |
for i0, i1 in T.grid(4, 4): | |
with T.block("B"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(B[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
B[i, j] = T.Select(i % 4 == 3 and j % 4 == 3, T.float32(1), T.Select(i % 4 == 3 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 0, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 3, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 1 and j % 4 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(4, 4, 16, 512, 4, 4): | |
with T.block("data_pack"): | |
eps, nu, p, ci, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile[r_a, r_b, p, ci], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile[r_a, r_b, p, ci] * B[r_a, eps] * B[r_b, nu] | |
for i0, i1, i2, i3, i4 in T.grid(4, 4, 16, 512, 512): | |
with T.block("bgemm"): | |
eps, nu, p, co, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4]) | |
T.reads(bgemm[eps, nu, p, co], data_pack[eps, nu, p, ci], placeholder_2[eps, nu, co, ci]) | |
T.writes(bgemm[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_2]}) | |
with T.init(): | |
bgemm[eps, nu, p, co] = T.float32(0) | |
bgemm[eps, nu, p, co] = bgemm[eps, nu, p, co] + data_pack[eps, nu, p, ci] * placeholder_2[eps, nu, co, ci] | |
for i0, i1 in T.grid(4, 2): | |
with T.block("A"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(A[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
A[i, j] = T.Select(i % 4 == 3 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 3 and j % 2 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 1 and j % 2 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 0 and j % 2 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 2 == 0, T.float32(1), T.float32(0))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(2, 2, 16, 512, 4, 4): | |
with T.block("inverse"): | |
vh, vw, p, co, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * A[r_a, vh] * A[r_b, vw] | |
for i0, i1, i2, i3 in T.grid(1, 7, 7, 512): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 2, w % 2, n * 16 + h // 2 * 4 + w // 2, co]) | |
T.writes(conv2d_winograd[n, h, w, co]) | |
conv2d_winograd[n, h, w, co] = inverse[h % 2, w % 2, n * 16 + h // 2 * 4 + w // 2, co] | |
for i0, i1, i2, i3 in T.grid(1, 7, 7, 512): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], placeholder_1[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder_1[ax0, 0, 0, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 7, 7, 512): | |
with T.block("T_add_1"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add[ax0, ax1, ax2, ax3], placeholder[ax0, ax1, ax2, ax3]) | |
T.writes(T_add_1[ax0, ax1, ax2, ax3]) | |
T_add_1[ax0, ax1, ax2, ax3] = T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 7, 7, 512): | |
with T.block("T_relu"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add_1[ax0, ax1, ax2, ax3]) | |
T.writes(T_relu[ax0, ax1, ax2, ax3]) | |
T_relu[ax0, ax1, ax2, ax3] = T.max(T_add_1[ax0, ax1, ax2, ax3], T.float32(0)) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 7, 7, 512), "float32"], placeholder_1: T.Buffer[(1, 1, 1, 512), "float32"], placeholder_2: T.Buffer[(4, 4, 512, 512), "float32"], placeholder_3: T.Buffer[(1, 7, 7, 512), "float32"], T_relu: T.Buffer[(1, 7, 7, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":0}) | |
input_tile_local = T.alloc_buffer([4, 4, 16, 512], dtype="float32", scope="local") | |
data_pack = T.alloc_buffer([4, 4, 16, 512], dtype="float32") | |
bgemm = T.alloc_buffer([4, 4, 16, 512], dtype="float32") | |
inverse = T.alloc_buffer([2, 2, 16, 512], dtype="float32") | |
bgemm_local = T.alloc_buffer([4, 4, 16, 512], dtype="float32", scope="local") | |
data_pack_shared = T.alloc_buffer([4, 4, 16, 512], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([4, 4, 512, 512], dtype="float32", scope="shared") | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(16, 32, 1, 16): | |
for ax0, ax1, ax2, ax3 in T.grid(4, 4, 1, 1): | |
with T.block("input_tile"): | |
eps, nu = T.axis.remap("SS", [ax0, ax1]) | |
p = T.axis.spatial(16, i2_0 + ax2) | |
ci = T.axis.spatial(512, i3_0 * 16 + i3_1 + ax3) | |
T.reads(placeholder_3[p // 16, p % 16 // 4 * 2 + eps - 1, p % 4 * 2 + nu - 1, ci]) | |
T.writes(input_tile_local[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile_local[eps, nu, p, ci] = T.if_then_else(1 <= p % 16 // 4 * 2 + eps and p % 16 // 4 * 2 + eps < 8 and 1 <= p % 4 * 2 + nu and p % 4 * 2 + nu < 8, placeholder_3[p // 16, p % 16 // 4 * 2 + eps - 1, p % 4 * 2 + nu - 1, ci], T.float32(0), dtype="float32") | |
for i0 in T.unroll(4): | |
for i1 in T.unroll(4): | |
for i4 in T.unroll(4): | |
for i5 in T.unroll(4): | |
with T.block("data_pack"): | |
eps, nu, p = T.axis.remap("SSS", [i0, i1, i2_0]) | |
ci = T.axis.spatial(512, i3_0 * 16 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile_local[r_a, r_b, p, ci]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile_local[r_a, r_b, p, ci] * T.Select(r_a % 4 == 3 and eps % 4 == 3, T.float32(1), T.Select(r_a % 4 == 3 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 1, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 0, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 3, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 1 and eps % 4 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) * T.Select(r_b % 4 == 3 and nu % 4 == 3, T.float32(1), T.Select(r_b % 4 == 3 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 1, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 0, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 3, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 1 and nu % 4 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(4, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(256, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(4, thread="threadIdx.x"): | |
for i4_0 in T.serial(2): | |
for ax0_ax1_ax2_ax3_fused in T.serial(65536): | |
with T.block("data_pack_shared"): | |
v0 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused // 16384) | |
v1 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused % 16384 // 4096) | |
v2 = T.axis.spatial(16, ax0_ax1_ax2_ax3_fused % 4096 // 256) | |
v3 = T.axis.spatial(512, i4_0 * 256 + ax0_ax1_ax2_ax3_fused % 256) | |
T.reads(data_pack[v0, v1, v2, v3]) | |
T.writes(data_pack_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":1}) | |
data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused in T.serial(524288): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused // 131072) | |
v1 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused % 131072 // 32768) | |
v2 = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused * 128 + ax0_ax1_ax2_ax3_fused % 32768 // 256) | |
v3 = T.axis.spatial(512, i4_0 * 256 + ax0_ax1_ax2_ax3_fused % 256) | |
T.reads(placeholder_2[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_2[v0, v1, v2, v3] | |
for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(32, 2, 1, 2, 1, 8, 1, 4, 1, 2): | |
with T.block("bgemm"): | |
eps = T.axis.spatial(4, i0_2_i1_2_i2_2_i3_2_fused // 2 * 2 + i0_3) | |
nu = T.axis.spatial(4, i1_4) | |
p = T.axis.spatial(16, i0_1_i1_1_i2_1_i3_1_fused // 32 * 2 + i2_3) | |
co = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused * 128 + i0_1_i1_1_i2_1_i3_1_fused % 32 * 4 + i0_2_i1_2_i2_2_i3_2_fused % 2 * 2 + i3_4) | |
ci = T.axis.reduce(512, i4_0 * 256 + i4_1 * 8 + i4_2) | |
T.reads(bgemm_local[eps, nu, p, co], data_pack_shared[eps, nu, p, ci], placeholder_shared[eps, nu, co, ci]) | |
T.writes(bgemm_local[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_2], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) | |
with T.init(): | |
bgemm_local[eps, nu, p, co] = T.float32(0) | |
bgemm_local[eps, nu, p, co] = bgemm_local[eps, nu, p, co] + data_pack_shared[eps, nu, p, ci] * placeholder_shared[eps, nu, co, ci] | |
for ax0, ax1, ax2, ax3 in T.grid(2, 4, 2, 2): | |
with T.block("bgemm_local"): | |
v0 = T.axis.spatial(4, i0_2_i1_2_i2_2_i3_2_fused // 2 * 2 + ax0) | |
v1 = T.axis.spatial(4, ax1) | |
v2 = T.axis.spatial(16, i0_1_i1_1_i2_1_i3_1_fused // 32 * 2 + ax2) | |
v3 = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused * 128 + i0_1_i1_1_i2_1_i3_1_fused % 32 * 4 + i0_2_i1_2_i2_2_i3_2_fused % 2 * 2 + ax3) | |
T.reads(bgemm_local[v0, v1, v2, v3]) | |
T.writes(bgemm[v0, v1, v2, v3]) | |
bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3] | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(2, 512, 8, 1): | |
for i0 in T.unroll(2): | |
for i1 in T.unroll(2): | |
for i4 in T.unroll(4): | |
for i5 in T.unroll(4): | |
with T.block("inverse"): | |
vh, vw = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(16, i2_0 * 8 + i2_1) | |
co, r_a, r_b = T.axis.remap("SRR", [i3_0, i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 4 == 3 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 3 and vh % 2 == 0, T.float32(0), T.Select(r_a % 4 == 2 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 2 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 1 and vh % 2 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 0 and vh % 2 == 1, T.float32(0), T.Select(r_a % 4 == 0 and vh % 2 == 0, T.float32(1), T.float32(0))))))))) * T.Select(r_b % 4 == 3 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 3 and vw % 2 == 0, T.float32(0), T.Select(r_b % 4 == 2 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 2 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 1 and vw % 2 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 0 and vw % 2 == 1, T.float32(0), T.Select(r_b % 4 == 0 and vw % 2 == 0, T.float32(1), T.float32(0))))))))) | |
for i0, i1, i2, i3 in T.grid(1, 7, 7, 512): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 2, w % 2, n * 16 + h // 2 * 4 + w // 2, co], placeholder_1[n, 0, 0, co], placeholder[n, h, w, co]) | |
T.writes(T_relu[n, h, w, co]) | |
T_relu[n, h, w, co] = T.max(inverse[h % 2, w % 2, n * 16 + h // 2 * 4 + w // 2, co] + placeholder_1[n, 0, 0, co] + placeholder[n, h, w, co], T.float32(0)) | |
b0 = sch.get_block(name="B", func_name="main") | |
b1 = sch.get_block(name="data_pack", func_name="main") | |
b2 = sch.get_block(name="bgemm", func_name="main") | |
b3 = sch.get_block(name="A", func_name="main") | |
b4 = sch.get_block(name="inverse", func_name="main") | |
b5 = sch.get_block(name="T_add", func_name="main") | |
b6 = sch.get_block(name="T_add_1", func_name="main") | |
b7 = sch.get_block(name="T_relu", func_name="main") | |
b8 = sch.get_block(name="root", func_name="main") | |
sch.compute_inline(block=b0) | |
b9, = sch.get_producers(block=b1) | |
b10, = sch.get_producers(block=b9) | |
l11, l12, l13, l14, l15, l16 = sch.get_loops(block=b1) | |
v17, v18 = sch.sample_perfect_tile(loop=l13, n=2, max_innermost_factor=64, decision=[16, 1]) | |
l19, l20 = sch.split(loop=l13, factors=[v17, v18]) | |
v21, v22 = sch.sample_perfect_tile(loop=l14, n=2, max_innermost_factor=64, decision=[32, 16]) | |
l23, l24 = sch.split(loop=l14, factors=[v21, v22]) | |
sch.unroll(loop=l11) | |
sch.unroll(loop=l12) | |
sch.unroll(loop=l15) | |
sch.unroll(loop=l16) | |
sch.reorder(l19, l23, l20, l24, l11, l12, l15, l16) | |
sch.compute_at(block=b9, loop=l24, preserve_unit_loops=True) | |
sch.set_scope(block=b9, buffer_index=0, storage_scope="local") | |
sch.compute_inline(block=b10) | |
sch.compute_inline(block=b3) | |
l25, l26, l27, l28, l29, l30 = sch.get_loops(block=b4) | |
v31, v32 = sch.sample_perfect_tile(loop=l27, n=2, max_innermost_factor=64, decision=[2, 8]) | |
l33, l34 = sch.split(loop=l27, factors=[v31, v32]) | |
v35, v36 = sch.sample_perfect_tile(loop=l28, n=2, max_innermost_factor=64, decision=[512, 1]) | |
l37, l38 = sch.split(loop=l28, factors=[v35, v36]) | |
sch.unroll(loop=l25) | |
sch.unroll(loop=l26) | |
sch.unroll(loop=l29) | |
sch.unroll(loop=l30) | |
sch.reorder(l33, l37, l34, l38, l25, l26, l29, l30) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l39, l40, l41, l42, l43 = sch.get_loops(block=b2) | |
v44, v45, v46, v47, v48 = sch.sample_perfect_tile(loop=l39, n=5, max_innermost_factor=64, decision=[1, 1, 2, 2, 1]) | |
l49, l50, l51, l52, l53 = sch.split(loop=l39, factors=[v44, v45, v46, v47, v48]) | |
v54, v55, v56, v57, v58 = sch.sample_perfect_tile(loop=l40, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 4]) | |
l59, l60, l61, l62, l63 = sch.split(loop=l40, factors=[v54, v55, v56, v57, v58]) | |
v64, v65, v66, v67, v68 = sch.sample_perfect_tile(loop=l41, n=5, max_innermost_factor=64, decision=[1, 8, 1, 2, 1]) | |
l69, l70, l71, l72, l73 = sch.split(loop=l41, factors=[v64, v65, v66, v67, v68]) | |
v74, v75, v76, v77, v78 = sch.sample_perfect_tile(loop=l42, n=5, max_innermost_factor=64, decision=[4, 32, 2, 1, 2]) | |
l79, l80, l81, l82, l83 = sch.split(loop=l42, factors=[v74, v75, v76, v77, v78]) | |
v84, v85, v86 = sch.sample_perfect_tile(loop=l43, n=3, max_innermost_factor=64, decision=[2, 32, 8]) | |
l87, l88, l89 = sch.split(loop=l43, factors=[v84, v85, v86]) | |
sch.reorder(l49, l59, l69, l79, l50, l60, l70, l80, l51, l61, l71, l81, l87, l88, l52, l62, l72, l82, l89, l53, l63, l73, l83) | |
l90 = sch.fuse(l49, l59, l69, l79) | |
sch.bind(loop=l90, thread_axis="blockIdx.x") | |
l91 = sch.fuse(l50, l60, l70, l80) | |
sch.bind(loop=l91, thread_axis="vthread.x") | |
l92 = sch.fuse(l51, l61, l71, l81) | |
sch.bind(loop=l92, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b93 = sch.cache_write(block=b2, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b93, loop=l92, preserve_unit_loops=True) | |
b94 = sch.cache_read(block=b2, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b94, loop=l87, preserve_unit_loops=True) | |
l95, l96, l97, l98, l99, l100, l101, l102 = sch.get_loops(block=b94) | |
l103 = sch.fuse(l99, l100, l101, l102) | |
v104 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=0) | |
sch.annotate(block_or_loop=b94, ann_key="meta_schedule.cooperative_fetch", ann_val=v104) | |
b105 = sch.cache_read(block=b2, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b105, loop=l87, preserve_unit_loops=True) | |
l106, l107, l108, l109, l110, l111, l112, l113 = sch.get_loops(block=b105) | |
l114 = sch.fuse(l110, l111, l112, l113) | |
v115 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b105, ann_key="meta_schedule.cooperative_fetch", ann_val=v115) | |
sch.reverse_compute_inline(block=b7) | |
sch.reverse_compute_inline(block=b6) | |
sch.reverse_compute_inline(block=b5) | |
v116 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=0) | |
sch.annotate(block_or_loop=b8, ann_key="meta_schedule.unroll_explicit", ann_val=v116) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 512), "float32"], placeholder_1: T.Buffer[(4, 4, 512, 512), "float32"], placeholder_2: T.Buffer[(1, 7, 7, 512), "float32"], T_relu: T.Buffer[(1, 7, 7, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
data_pad = T.alloc_buffer([1, 10, 10, 512], dtype="float32") | |
input_tile = T.alloc_buffer([4, 4, 16, 512], dtype="float32") | |
B = T.alloc_buffer([4, 4], dtype="float32") | |
data_pack = T.alloc_buffer([4, 4, 16, 512], dtype="float32") | |
bgemm = T.alloc_buffer([4, 4, 16, 512], dtype="float32") | |
A = T.alloc_buffer([4, 2], dtype="float32") | |
inverse = T.alloc_buffer([2, 2, 16, 512], dtype="float32") | |
conv2d_winograd = T.alloc_buffer([1, 7, 7, 512], dtype="float32") | |
T_add = T.alloc_buffer([1, 7, 7, 512], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 10, 10, 512): | |
with T.block("data_pad"): | |
i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) | |
T.writes(data_pad[i0_1, i1_1, i2_1, i3_1]) | |
T.block_attr({"schedule_rule":"None"}) | |
data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 8 and 1 <= i2_1 and i2_1 < 8, placeholder_2[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32") | |
for i0, i1, i2, i3 in T.grid(4, 4, 16, 512): | |
with T.block("input_tile"): | |
eps, nu, p, ci = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(data_pad[p // 16, p % 16 // 4 * 2 + eps, p % 4 * 2 + nu, ci]) | |
T.writes(input_tile[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile[eps, nu, p, ci] = data_pad[p // 16, p % 16 // 4 * 2 + eps, p % 4 * 2 + nu, ci] | |
for i0, i1 in T.grid(4, 4): | |
with T.block("B"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(B[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
B[i, j] = T.Select(i % 4 == 3 and j % 4 == 3, T.float32(1), T.Select(i % 4 == 3 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 3 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 2 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 4 == 0, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 3, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 2, T.float32(1), T.Select(i % 4 == 1 and j % 4 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 4 == 0, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 3, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 2, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(4, 4, 16, 512, 4, 4): | |
with T.block("data_pack"): | |
eps, nu, p, ci, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile[r_a, r_b, p, ci], B[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(eps, nu) : T.max(eps, nu) + 1]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile[r_a, r_b, p, ci] * B[r_a, eps] * B[r_b, nu] | |
for i0, i1, i2, i3, i4 in T.grid(4, 4, 16, 512, 512): | |
with T.block("bgemm"): | |
eps, nu, p, co, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4]) | |
T.reads(bgemm[eps, nu, p, co], data_pack[eps, nu, p, ci], placeholder_1[eps, nu, co, ci]) | |
T.writes(bgemm[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1]}) | |
with T.init(): | |
bgemm[eps, nu, p, co] = T.float32(0) | |
bgemm[eps, nu, p, co] = bgemm[eps, nu, p, co] + data_pack[eps, nu, p, ci] * placeholder_1[eps, nu, co, ci] | |
for i0, i1 in T.grid(4, 2): | |
with T.block("A"): | |
i, j = T.axis.remap("SS", [i0, i1]) | |
T.reads() | |
T.writes(A[i, j]) | |
T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"}) | |
A[i, j] = T.Select(i % 4 == 3 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 3 and j % 2 == 0, T.float32(0), T.Select(i % 4 == 2 and j % 2 == 1, T.float32(1), T.Select(i % 4 == 2 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 1 and j % 2 == 1, T.float32(-1), T.Select(i % 4 == 1 and j % 2 == 0, T.float32(1), T.Select(i % 4 == 0 and j % 2 == 1, T.float32(0), T.Select(i % 4 == 0 and j % 2 == 0, T.float32(1), T.float32(0))))))))) | |
for i0, i1, i2, i3, i4, i5 in T.grid(2, 2, 16, 512, 4, 4): | |
with T.block("inverse"): | |
vh, vw, p, co, r_a, r_b = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co], A[T.min(r_a, r_b) : T.max(r_a, r_b) + 1, T.min(vh, vw) : T.max(vh, vw) + 1]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * A[r_a, vh] * A[r_b, vw] | |
for i0, i1, i2, i3 in T.grid(1, 7, 7, 512): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 2, w % 2, n * 16 + h // 2 * 4 + w // 2, co]) | |
T.writes(conv2d_winograd[n, h, w, co]) | |
conv2d_winograd[n, h, w, co] = inverse[h % 2, w % 2, n * 16 + h // 2 * 4 + w // 2, co] | |
for i0, i1, i2, i3 in T.grid(1, 7, 7, 512): | |
with T.block("T_add"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(conv2d_winograd[ax0, ax1, ax2, ax3], placeholder[ax0, 0, 0, ax3]) | |
T.writes(T_add[ax0, ax1, ax2, ax3]) | |
T_add[ax0, ax1, ax2, ax3] = conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 7, 7, 512): | |
with T.block("T_relu"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_add[ax0, ax1, ax2, ax3]) | |
T.writes(T_relu[ax0, ax1, ax2, ax3]) | |
T_relu[ax0, ax1, ax2, ax3] = T.max(T_add[ax0, ax1, ax2, ax3], T.float32(0)) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 512), "float32"], placeholder_1: T.Buffer[(4, 4, 512, 512), "float32"], placeholder_2: T.Buffer[(1, 7, 7, 512), "float32"], T_relu: T.Buffer[(1, 7, 7, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":16}) | |
input_tile_local = T.alloc_buffer([4, 4, 16, 512], dtype="float32", scope="local") | |
data_pack = T.alloc_buffer([4, 4, 16, 512], dtype="float32") | |
bgemm = T.alloc_buffer([4, 4, 16, 512], dtype="float32") | |
inverse = T.alloc_buffer([2, 2, 16, 512], dtype="float32") | |
bgemm_local = T.alloc_buffer([4, 4, 16, 512], dtype="float32", scope="local") | |
data_pack_shared = T.alloc_buffer([4, 4, 16, 512], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([4, 4, 512, 512], dtype="float32", scope="shared") | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(4, 16, 4, 32): | |
for ax0, ax1, ax2, ax3 in T.grid(4, 4, 1, 1): | |
with T.block("input_tile"): | |
eps, nu = T.axis.remap("SS", [ax0, ax1]) | |
p = T.axis.spatial(16, i2_0 * 4 + i2_1 + ax2) | |
ci = T.axis.spatial(512, i3_0 * 32 + i3_1 + ax3) | |
T.reads(placeholder_2[p // 16, p % 16 // 4 * 2 + eps - 1, p % 4 * 2 + nu - 1, ci]) | |
T.writes(input_tile_local[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile_local[eps, nu, p, ci] = T.if_then_else(1 <= p % 16 // 4 * 2 + eps and p % 16 // 4 * 2 + eps < 8 and 1 <= p % 4 * 2 + nu and p % 4 * 2 + nu < 8, placeholder_2[p // 16, p % 16 // 4 * 2 + eps - 1, p % 4 * 2 + nu - 1, ci], T.float32(0), dtype="float32") | |
for i0 in T.unroll(4): | |
for i1 in T.unroll(4): | |
for i4 in T.unroll(4): | |
for i5 in T.unroll(4): | |
with T.block("data_pack"): | |
eps, nu = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(16, i2_0 * 4 + i2_1) | |
ci = T.axis.spatial(512, i3_0 * 32 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile_local[r_a, r_b, p, ci]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
with T.init(): | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile_local[r_a, r_b, p, ci] * T.Select(r_a % 4 == 3 and eps % 4 == 3, T.float32(1), T.Select(r_a % 4 == 3 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 1, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 0, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 3, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 1 and eps % 4 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) * T.Select(r_b % 4 == 3 and nu % 4 == 3, T.float32(1), T.Select(r_b % 4 == 3 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 1, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 0, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 3, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 1 and nu % 4 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(16, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(4, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(2, thread="threadIdx.x"): | |
for i4_0 in T.serial(1): | |
for ax0_ax1_ax2_ax3_fused in T.serial(65536): | |
with T.block("data_pack_shared"): | |
v0 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused // 16384) | |
v1 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 8 * 2 + ax0_ax1_ax2_ax3_fused % 16384 // 8192) | |
v2 = T.axis.spatial(16, ax0_ax1_ax2_ax3_fused % 8192 // 512) | |
v3 = T.axis.spatial(512, ax0_ax1_ax2_ax3_fused % 512) | |
T.reads(data_pack[v0, v1, v2, v3]) | |
T.writes(data_pack_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused in T.serial(262144): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(4, ax0_ax1_ax2_ax3_fused // 65536) | |
v1 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 8 * 2 + ax0_ax1_ax2_ax3_fused % 65536 // 32768) | |
v2 = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused % 8 * 64 + ax0_ax1_ax2_ax3_fused % 32768 // 512) | |
v3 = T.axis.spatial(512, ax0_ax1_ax2_ax3_fused % 512) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(16, 1, 1, 1, 1, 32, 4, 1, 16, 16): | |
with T.block("bgemm"): | |
eps = T.axis.spatial(4, i0_4) | |
nu = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 8 * 2 + i0_2_i1_2_i2_2_i3_2_fused) | |
p = T.axis.spatial(16, i2_4) | |
co = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused % 8 * 64 + i0_1_i1_1_i2_1_i3_1_fused * 16 + i3_4) | |
ci = T.axis.reduce(512, i4_1 * 32 + i4_2) | |
T.reads(bgemm_local[eps, nu, p, co], data_pack_shared[eps, nu, p, ci], placeholder_shared[eps, nu, co, ci]) | |
T.writes(bgemm_local[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) | |
with T.init(): | |
bgemm_local[eps, nu, p, co] = T.float32(0) | |
bgemm_local[eps, nu, p, co] = bgemm_local[eps, nu, p, co] + data_pack_shared[eps, nu, p, ci] * placeholder_shared[eps, nu, co, ci] | |
for ax0, ax1, ax2, ax3 in T.grid(4, 1, 16, 16): | |
with T.block("bgemm_local"): | |
v0 = T.axis.spatial(4, ax0) | |
v1 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 8 * 2 + i0_2_i1_2_i2_2_i3_2_fused + ax1) | |
v2 = T.axis.spatial(16, ax2) | |
v3 = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused % 8 * 64 + i0_1_i1_1_i2_1_i3_1_fused * 16 + ax3) | |
T.reads(bgemm_local[v0, v1, v2, v3]) | |
T.writes(bgemm[v0, v1, v2, v3]) | |
bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3] | |
for i2_0, i3_0, i2_1, i3_1 in T.grid(8, 32, 2, 16): | |
for i0 in T.unroll(2): | |
for i1 in T.unroll(2): | |
for i4 in T.unroll(4): | |
for i5 in T.unroll(4): | |
with T.block("inverse"): | |
vh, vw = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(16, i2_0 * 2 + i2_1) | |
co = T.axis.spatial(512, i3_0 * 16 + i3_1) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
with T.init(): | |
inverse[vh, vw, p, co] = T.float32(0) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 4 == 3 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 3 and vh % 2 == 0, T.float32(0), T.Select(r_a % 4 == 2 and vh % 2 == 1, T.float32(1), T.Select(r_a % 4 == 2 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 1 and vh % 2 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and vh % 2 == 0, T.float32(1), T.Select(r_a % 4 == 0 and vh % 2 == 1, T.float32(0), T.Select(r_a % 4 == 0 and vh % 2 == 0, T.float32(1), T.float32(0))))))))) * T.Select(r_b % 4 == 3 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 3 and vw % 2 == 0, T.float32(0), T.Select(r_b % 4 == 2 and vw % 2 == 1, T.float32(1), T.Select(r_b % 4 == 2 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 1 and vw % 2 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and vw % 2 == 0, T.float32(1), T.Select(r_b % 4 == 0 and vw % 2 == 1, T.float32(0), T.Select(r_b % 4 == 0 and vw % 2 == 0, T.float32(1), T.float32(0))))))))) | |
for i0, i1, i2, i3 in T.grid(1, 7, 7, 512): | |
with T.block("conv2d_winograd"): | |
n, h, w, co = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(inverse[h % 2, w % 2, n * 16 + h // 2 * 4 + w // 2, co], placeholder[n, 0, 0, co]) | |
T.writes(T_relu[n, h, w, co]) | |
T_relu[n, h, w, co] = T.max(inverse[h % 2, w % 2, n * 16 + h // 2 * 4 + w // 2, co] + placeholder[n, 0, 0, co], T.float32(0)) | |
b0 = sch.get_block(name="B", func_name="main") | |
b1 = sch.get_block(name="data_pack", func_name="main") | |
b2 = sch.get_block(name="bgemm", func_name="main") | |
b3 = sch.get_block(name="A", func_name="main") | |
b4 = sch.get_block(name="inverse", func_name="main") | |
b5 = sch.get_block(name="T_add", func_name="main") | |
b6 = sch.get_block(name="T_relu", func_name="main") | |
b7 = sch.get_block(name="root", func_name="main") | |
sch.compute_inline(block=b0) | |
b8, = sch.get_producers(block=b1) | |
b9, = sch.get_producers(block=b8) | |
l10, l11, l12, l13, l14, l15 = sch.get_loops(block=b1) | |
v16, v17 = sch.sample_perfect_tile(loop=l12, n=2, max_innermost_factor=64, decision=[4, 4]) | |
l18, l19 = sch.split(loop=l12, factors=[v16, v17]) | |
v20, v21 = sch.sample_perfect_tile(loop=l13, n=2, max_innermost_factor=64, decision=[16, 32]) | |
l22, l23 = sch.split(loop=l13, factors=[v20, v21]) | |
sch.unroll(loop=l10) | |
sch.unroll(loop=l11) | |
sch.unroll(loop=l14) | |
sch.unroll(loop=l15) | |
sch.reorder(l18, l22, l19, l23, l10, l11, l14, l15) | |
sch.compute_at(block=b8, loop=l23, preserve_unit_loops=True) | |
sch.set_scope(block=b8, buffer_index=0, storage_scope="local") | |
sch.compute_inline(block=b9) | |
sch.compute_inline(block=b3) | |
l24, l25, l26, l27, l28, l29 = sch.get_loops(block=b4) | |
v30, v31 = sch.sample_perfect_tile(loop=l26, n=2, max_innermost_factor=64, decision=[8, 2]) | |
l32, l33 = sch.split(loop=l26, factors=[v30, v31]) | |
v34, v35 = sch.sample_perfect_tile(loop=l27, n=2, max_innermost_factor=64, decision=[32, 16]) | |
l36, l37 = sch.split(loop=l27, factors=[v34, v35]) | |
sch.unroll(loop=l24) | |
sch.unroll(loop=l25) | |
sch.unroll(loop=l28) | |
sch.unroll(loop=l29) | |
sch.reorder(l32, l36, l33, l37, l24, l25, l28, l29) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l38, l39, l40, l41, l42 = sch.get_loops(block=b2) | |
v43, v44, v45, v46, v47 = sch.sample_perfect_tile(loop=l38, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 4]) | |
l48, l49, l50, l51, l52 = sch.split(loop=l38, factors=[v43, v44, v45, v46, v47]) | |
v53, v54, v55, v56, v57 = sch.sample_perfect_tile(loop=l39, n=5, max_innermost_factor=64, decision=[2, 1, 2, 1, 1]) | |
l58, l59, l60, l61, l62 = sch.split(loop=l39, factors=[v53, v54, v55, v56, v57]) | |
v63, v64, v65, v66, v67 = sch.sample_perfect_tile(loop=l40, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 16]) | |
l68, l69, l70, l71, l72 = sch.split(loop=l40, factors=[v63, v64, v65, v66, v67]) | |
v73, v74, v75, v76, v77 = sch.sample_perfect_tile(loop=l41, n=5, max_innermost_factor=64, decision=[8, 4, 1, 1, 16]) | |
l78, l79, l80, l81, l82 = sch.split(loop=l41, factors=[v73, v74, v75, v76, v77]) | |
v83, v84, v85 = sch.sample_perfect_tile(loop=l42, n=3, max_innermost_factor=64, decision=[1, 16, 32]) | |
l86, l87, l88 = sch.split(loop=l42, factors=[v83, v84, v85]) | |
sch.reorder(l48, l58, l68, l78, l49, l59, l69, l79, l50, l60, l70, l80, l86, l87, l51, l61, l71, l81, l88, l52, l62, l72, l82) | |
l89 = sch.fuse(l48, l58, l68, l78) | |
sch.bind(loop=l89, thread_axis="blockIdx.x") | |
l90 = sch.fuse(l49, l59, l69, l79) | |
sch.bind(loop=l90, thread_axis="vthread.x") | |
l91 = sch.fuse(l50, l60, l70, l80) | |
sch.bind(loop=l91, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b92 = sch.cache_write(block=b2, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b92, loop=l91, preserve_unit_loops=True) | |
b93 = sch.cache_read(block=b2, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b93, loop=l86, preserve_unit_loops=True) | |
l94, l95, l96, l97, l98, l99, l100, l101 = sch.get_loops(block=b93) | |
l102 = sch.fuse(l98, l99, l100, l101) | |
v103 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b93, ann_key="meta_schedule.cooperative_fetch", ann_val=v103) | |
b104 = sch.cache_read(block=b2, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b104, loop=l86, preserve_unit_loops=True) | |
l105, l106, l107, l108, l109, l110, l111, l112 = sch.get_loops(block=b104) | |
l113 = sch.fuse(l109, l110, l111, l112) | |
v114 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=2) | |
sch.annotate(block_or_loop=b104, ann_key="meta_schedule.cooperative_fetch", ann_val=v114) | |
sch.reverse_compute_inline(block=b6) | |
sch.reverse_compute_inline(block=b5) | |
v115 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=1) | |
sch.annotate(block_or_loop=b7, ann_key="meta_schedule.unroll_explicit", ann_val=v115) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 7, 7, 512), "float32"], tensor: T.Buffer[(1, 1, 1, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
tensor_1 = T.alloc_buffer([1, 1, 1, 512], dtype="float32") | |
for i0, i1, i2, i3, i4, i5 in T.grid(1, 1, 1, 512, 7, 7): | |
with T.block("tensor"): | |
ax0, ax1, ax2, ax3, rv0, rv1 = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(tensor_1[ax0, ax1, ax2, ax3], placeholder[ax0, ax1 * 7 + rv0, ax2 * 7 + rv1, ax3]) | |
T.writes(tensor_1[ax0, ax1, ax2, ax3]) | |
with T.init(): | |
tensor_1[ax0, ax1, ax2, ax3] = T.float32(0) | |
tensor_1[ax0, ax1, ax2, ax3] = tensor_1[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1 * 7 + rv0, ax2 * 7 + rv1, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 1, 1, 512): | |
with T.block("tensor_1"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(tensor_1[ax0, ax1, ax2, ax3]) | |
T.writes(tensor[ax0, ax1, ax2, ax3]) | |
tensor[ax0, ax1, ax2, ax3] = tensor_1[ax0, ax1, ax2, ax3] * T.float32(0.020408163265306121) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 2 design space(s) generated | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 7, 7, 512), "float32"], tensor: T.Buffer[(1, 1, 1, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":16}) | |
tensor_shared = T.alloc_buffer([1, 1, 1, 512], dtype="float32", scope="shared") | |
for i0, i1, i2, i3_0 in T.grid(1, 1, 1, 16): | |
for ax0, ax1, ax2, ax3, ax4_ax5_fused_0 in T.grid(1, 1, 1, 32, 2): | |
for ax4_ax5_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
with T.block("tensor"): | |
ax0_1 = T.axis.spatial(1, 0) | |
ax1_1 = T.axis.spatial(1, 0) | |
ax2_1 = T.axis.spatial(1, 0) | |
ax3_1 = T.axis.spatial(512, i3_0 * 32 + ax3) | |
rv0 = T.axis.reduce(7, (ax4_ax5_fused_0 * 32 + ax4_ax5_fused_1) // 7) | |
rv1 = T.axis.reduce(7, (ax4_ax5_fused_0 * 32 + ax4_ax5_fused_1) % 7) | |
T.where(ax4_ax5_fused_0 * 32 + ax4_ax5_fused_1 < 49) | |
T.reads(tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1], placeholder[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1]) | |
T.writes(tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1]) | |
with T.init(): | |
tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] = T.float32(0) | |
tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] = tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] + placeholder[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1] | |
for i3_1 in T.thread_binding(32, thread="threadIdx.x"): | |
with T.block("tensor_1"): | |
ax0 = T.axis.spatial(1, 0) | |
ax1 = T.axis.spatial(1, 0) | |
ax2 = T.axis.spatial(1, 0) | |
ax3 = T.axis.spatial(512, i3_0 * 32 + i3_1) | |
T.reads(tensor_shared[ax0, ax1, ax2, ax3]) | |
T.writes(tensor[ax0, ax1, ax2, ax3]) | |
tensor[ax0, ax1, ax2, ax3] = tensor_shared[ax0, ax1, ax2, ax3] * T.float32(0.020408163265306121) | |
b0 = sch.get_block(name="tensor", func_name="main") | |
b1 = sch.get_block(name="root", func_name="main") | |
b2, = sch.get_consumers(block=b0) | |
l3, l4, l5, l6 = sch.get_loops(block=b2) | |
v7 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125], decision=3) | |
l8, l9 = sch.split(loop=l6, factors=[None, v7]) | |
sch.bind(loop=l9, thread_axis="threadIdx.x") | |
sch.compute_at(block=b0, loop=l8, preserve_unit_loops=True) | |
sch.set_scope(block=b0, buffer_index=0, storage_scope="shared") | |
l10, l11, l12, l13, l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0) | |
l20 = sch.fuse(l18, l19) | |
l21, l22 = sch.split(loop=l20, factors=[None, v7]) | |
sch.bind(loop=l22, thread_axis="threadIdx.x") | |
v23 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=1) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.unroll_explicit", ann_val=v23) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #1: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 7, 7, 512), "float32"], tensor: T.Buffer[(1, 1, 1, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":0}) | |
tensor_1 = T.alloc_buffer([1, 1, 1, 512], dtype="float32") | |
for i0, i1, i2, i3, i4, i5 in T.grid(1, 1, 1, 512, 7, 7): | |
with T.block("tensor"): | |
ax0, ax1, ax2, ax3, rv0, rv1 = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5]) | |
T.reads(tensor_1[ax0, ax1, ax2, ax3], placeholder[ax0, ax1 * 7 + rv0, ax2 * 7 + rv1, ax3]) | |
T.writes(tensor_1[ax0, ax1, ax2, ax3]) | |
with T.init(): | |
tensor_1[ax0, ax1, ax2, ax3] = T.float32(0) | |
tensor_1[ax0, ax1, ax2, ax3] = tensor_1[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1 * 7 + rv0, ax2 * 7 + rv1, ax3] | |
for i0, i1, i2, i3 in T.grid(1, 1, 1, 512): | |
with T.block("tensor_1"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(tensor_1[ax0, ax1, ax2, ax3]) | |
T.writes(tensor[ax0, ax1, ax2, ax3]) | |
tensor[ax0, ax1, ax2, ax3] = tensor_1[ax0, ax1, ax2, ax3] * T.float32(0.020408163265306121) | |
b0 = sch.get_block(name="root", func_name="main") | |
v1 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=0) | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.unroll_explicit", ann_val=v1) | |
[14:36:12] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #18: "vm_mod_fused_layout_transform_reshape_squeeze", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 512), "float32"], T_squeeze: T.Buffer[(1, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
T_layout_trans = T.alloc_buffer([1, 512, 1, 1], dtype="float32") | |
T_reshape = T.alloc_buffer([1, 512, 1, 1], dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 512, 1, 1): | |
with T.block("T_layout_trans"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(placeholder[ax0, ax2, ax3, ax1]) | |
T.writes(T_layout_trans[ax0, ax1, ax2, ax3]) | |
T_layout_trans[ax0, ax1, ax2, ax3] = T.if_then_else(ax0 < 1 and ax2 < 1 and ax3 < 1 and ax1 < 512, placeholder[ax0, ax2, ax3, ax1], T.float32(0), dtype="float32") | |
for i0, i1, i2, i3 in T.grid(1, 512, 1, 1): | |
with T.block("T_reshape"): | |
ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) | |
T.reads(T_layout_trans[0, (ax1 + ax2 + ax3) % 512, 0, 0]) | |
T.writes(T_reshape[ax0, ax1, ax2, ax3]) | |
T_reshape[ax0, ax1, ax2, ax3] = T_layout_trans[0, (ax1 + ax2 + ax3) % 512, 0, 0] | |
for i0, i1 in T.grid(1, 512): | |
with T.block("T_squeeze"): | |
ax0, ax1 = T.axis.remap("SS", [i0, i1]) | |
T.reads(T_reshape[ax0, ax1, 0, 0]) | |
T.writes(T_squeeze[ax0, ax1]) | |
T_squeeze[ax0, ax1] = T_reshape[ax0, ax1, 0, 0] | |
[14:36:13] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:13] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 512), "float32"], T_squeeze: T.Buffer[(1, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":64}) | |
for i0, i1 in T.grid(1, 512): | |
with T.block("T_squeeze"): | |
ax0, ax1 = T.axis.remap("SS", [i0, i1]) | |
T.reads(placeholder[0, 0, 0, ax1 % 512]) | |
T.writes(T_squeeze[ax0, ax1]) | |
T_squeeze[ax0, ax1] = T.if_then_else(0 < 1 and 0 < 1 and 0 < 1 and (ax1 + 0 + 0) % 512 < 512, placeholder[0, 0, 0, (ax1 + 0 + 0) % 512], T.float32(0), dtype="float32") | |
b0 = sch.get_block(name="T_layout_trans", func_name="main") | |
b1 = sch.get_block(name="T_reshape", func_name="main") | |
b2 = sch.get_block(name="root", func_name="main") | |
sch.compute_inline(block=b1) | |
sch.compute_inline(block=b0) | |
v3 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=2) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.unroll_explicit", ann_val=v3) | |
[14:36:13] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:96: Initializing Task #19: "vm_mod_fused_nn_dense_add", mod = | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1000), "float32"], placeholder_1: T.Buffer[(1000, 512), "float32"], placeholder_2: T.Buffer[(1, 512), "float32"], T_add: T.Buffer[(1, 1000), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
T_matmul_NT = T.alloc_buffer([1, 1000], dtype="float32") | |
for i0, i1, i2 in T.grid(1, 1000, 512): | |
with T.block("T_matmul_NT"): | |
i, j, k = T.axis.remap("SSR", [i0, i1, i2]) | |
T.reads(T_matmul_NT[i, j], placeholder_2[i, k], placeholder_1[j, k]) | |
T.writes(T_matmul_NT[i, j]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "workload":["dense_small_batch.gpu", ["TENSOR", [1, 512], "float32"], ["TENSOR", [1000, 512], "float32"], None, "float32"]}) | |
with T.init(): | |
T_matmul_NT[i, j] = T.float32(0) | |
T_matmul_NT[i, j] = T_matmul_NT[i, j] + placeholder_2[i, k] * placeholder_1[j, k] | |
for i0, i1 in T.grid(1, 1000): | |
with T.block("T_add"): | |
ax0, ax1 = T.axis.remap("SS", [i0, i1]) | |
T.reads(T_matmul_NT[ax0, ax1], placeholder[ax0, ax1]) | |
T.writes(T_add[ax0, ax1]) | |
T_add[ax0, ax1] = T_matmul_NT[ax0, ax1] + placeholder[ax0, ax1] | |
[14:36:13] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:113: Total 1 design space(s) generated | |
[14:36:13] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:118: Design space #0: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1000), "float32"], placeholder_1: T.Buffer[(1000, 512), "float32"], placeholder_2: T.Buffer[(1, 512), "float32"], T_add: T.Buffer[(1, 1000), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
with T.block("root"): | |
T.reads() | |
T.writes() | |
T.block_attr({"meta_schedule.unroll_explicit":16}) | |
T_matmul_NT_local = T.alloc_buffer([1, 1000], dtype="float32", scope="local") | |
placeholder_shared = T.alloc_buffer([1, 512], dtype="float32", scope="shared") | |
placeholder_shared_1 = T.alloc_buffer([1000, 512], dtype="float32", scope="shared") | |
for i0_0_i1_0_fused in T.thread_binding(10, thread="blockIdx.x"): | |
for i0_1_i1_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_fused in T.thread_binding(2, thread="threadIdx.x"): | |
for i2_0 in T.serial(2): | |
for ax0_ax1_fused in T.serial(256): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(512, i2_0 * 256 + ax0_ax1_fused) | |
T.reads(placeholder_2[v0, v1]) | |
T.writes(placeholder_shared[v0, v1]) | |
T.block_attr({"meta_schedule.cooperative_fetch":1}) | |
placeholder_shared[v0, v1] = placeholder_2[v0, v1] | |
for ax0_ax1_fused in T.serial(25600): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1000, i0_0_i1_0_fused * 100 + ax0_ax1_fused // 256) | |
v1 = T.axis.spatial(512, i2_0 * 256 + ax0_ax1_fused % 256) | |
T.reads(placeholder_1[v0, v1]) | |
T.writes(placeholder_shared_1[v0, v1]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared_1[v0, v1] = placeholder_1[v0, v1] | |
for i2_1, i0_3, i1_3, i2_2, i0_4, i1_4 in T.grid(4, 1, 2, 64, 1, 25): | |
with T.block("T_matmul_NT"): | |
i = T.axis.spatial(1, 0) | |
j = T.axis.spatial(1000, i0_0_i1_0_fused * 100 + i0_2_i1_2_fused * 50 + i1_3 * 25 + i1_4) | |
k = T.axis.reduce(512, i2_0 * 256 + i2_1 * 64 + i2_2) | |
T.reads(T_matmul_NT_local[i, j], placeholder_shared[i, k], placeholder_shared_1[j, k]) | |
T.writes(T_matmul_NT_local[i, j]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["dense_small_batch.gpu", ["TENSOR", [1, 512], "float32"], ["TENSOR", [1000, 512], "float32"], None, "float32"]}) | |
with T.init(): | |
T_matmul_NT_local[i, j] = T.float32(0) | |
T_matmul_NT_local[i, j] = T_matmul_NT_local[i, j] + placeholder_shared[i, k] * placeholder_shared_1[j, k] | |
for ax0, ax1 in T.grid(1, 50): | |
with T.block("T_matmul_NT_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(1000, i0_0_i1_0_fused * 100 + i0_2_i1_2_fused * 50 + ax1) | |
T.reads(T_matmul_NT_local[v0, v1], placeholder[v0, v1]) | |
T.writes(T_add[v0, v1]) | |
T_add[v0, v1] = T_matmul_NT_local[v0, v1] + placeholder[v0, v1] | |
b0 = sch.get_block(name="T_matmul_NT", func_name="main") | |
b1 = sch.get_block(name="T_add", func_name="main") | |
b2 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l3, l4, l5 = sch.get_loops(block=b0) | |
v6, v7, v8, v9, v10 = sch.sample_perfect_tile(loop=l3, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l11, l12, l13, l14, l15 = sch.split(loop=l3, factors=[v6, v7, v8, v9, v10]) | |
v16, v17, v18, v19, v20 = sch.sample_perfect_tile(loop=l4, n=5, max_innermost_factor=64, decision=[10, 1, 2, 2, 25]) | |
l21, l22, l23, l24, l25 = sch.split(loop=l4, factors=[v16, v17, v18, v19, v20]) | |
v26, v27, v28 = sch.sample_perfect_tile(loop=l5, n=3, max_innermost_factor=64, decision=[2, 4, 64]) | |
l29, l30, l31 = sch.split(loop=l5, factors=[v26, v27, v28]) | |
sch.reorder(l11, l21, l12, l22, l13, l23, l29, l30, l14, l24, l31, l15, l25) | |
l32 = sch.fuse(l11, l21) | |
sch.bind(loop=l32, thread_axis="blockIdx.x") | |
l33 = sch.fuse(l12, l22) | |
sch.bind(loop=l33, thread_axis="vthread.x") | |
l34 = sch.fuse(l13, l23) | |
sch.bind(loop=l34, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b35 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b35, loop=l34, preserve_unit_loops=True) | |
b36 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b36, loop=l29, preserve_unit_loops=True) | |
l37, l38, l39, l40, l41, l42 = sch.get_loops(block=b36) | |
l43 = sch.fuse(l41, l42) | |
v44 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=0) | |
sch.annotate(block_or_loop=b36, ann_key="meta_schedule.cooperative_fetch", ann_val=v44) | |
b45 = sch.cache_read(block=b0, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b45, loop=l29, preserve_unit_loops=True) | |
l46, l47, l48, l49, l50, l51 = sch.get_loops(block=b45) | |
l52 = sch.fuse(l50, l51) | |
v53 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b45, ann_key="meta_schedule.cooperative_fetch", ann_val=v53) | |
sch.reverse_compute_inline(block=b1) | |
v54 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=1) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.unroll_explicit", ann_val=v54) | |
[14:36:13] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #0: "vm_mod_fused_layout_transform" | |
[14:36:13] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:36:14] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:36:21] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu" | |
[14:36:22] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:36:36] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:36:37] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x00005596bdcd4fe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:36:40] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x00005615e8a8afe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:36:42] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x000056086d264fe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:36:44] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x0000560899b1ffe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:36:48] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x00005647470b2fe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
[14:36:48] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #2: "vm_mod_fused_nn_max_pool2d" | |
[14:36:48] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:36:49] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:36:57] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu" | |
[14:37:02] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:37:17] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:37:24] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu" | |
[14:37:29] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:37:38] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:37:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1" | |
[14:37:47] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:37:54] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:37:56] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x00005616df15ffe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
[14:38:01] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #6: "vm_mod_fused_nn_conv2d_add" | |
[14:38:02] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:38:06] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:38:15] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1" | |
[14:38:17] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:38:30] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:38:37] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1" | |
[14:38:39] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:38:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:38:59] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2" | |
[14:38:59] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:39:05] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:39:10] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x000055aa71739fe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:39:12] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x0000561c4a245fe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
[14:39:14] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #10: "vm_mod_fused_nn_conv2d_add_1" | |
[14:39:14] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:39:21] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:39:28] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x0000557f218c5fe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
[14:39:29] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2" | |
[14:39:31] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:39:38] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:39:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2" | |
[14:39:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:39:54] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:40:00] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3" | |
[14:40:01] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:40:07] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:40:11] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x000055c34961ffe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:40:14] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x00005586598eafe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
[14:40:15] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #14: "vm_mod_fused_nn_conv2d_add_2" | |
[14:40:15] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:40:18] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:40:26] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3" | |
[14:40:29] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:40:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:40:51] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3" | |
[14:40:55] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:41:00] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:41:08] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d" | |
[14:41:08] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:41:08] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:41:16] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #18: "vm_mod_fused_layout_transform_reshape_squeeze" | |
[14:41:16] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:41:16] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
[14:41:26] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #19: "vm_mod_fused_nn_dense_add" | |
[14:41:27] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:33: Sending 32 sample(s) to builder | |
[14:41:29] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:54: Sending 32 sample(s) to runner | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:41:30] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x00005645c2064fe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:41:32] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x000055644bb7efe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:41:36] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x000055ef4bebefe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
terminate called after throwing an instance of 'tvm::runtime::InternalError' | |
what(): [14:41:39] /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:61: CUDAError: cuModuleUnload(module_[i]) failed with error: CUDA_ERROR_MISALIGNED_ADDRESS | |
Stack trace: | |
0: tvm::runtime::CUDAModuleNode::~CUDAModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/cuda/cuda_module.cc:60 | |
1: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::CUDAModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
2: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
3: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
4: tvm::runtime::ObjectPtr<tvm::runtime::Object>::reset() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:451 | |
5: tvm::runtime::ObjectPtr<tvm::runtime::Object>::~ObjectPtr() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:400 | |
6: tvm::runtime::ObjectRef::~ObjectRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:511 | |
7: tvm::runtime::Module::~Module() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:48 | |
8: void std::_Destroy<tvm::runtime::Module>(tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:140 | |
9: void std::_Destroy_aux<false>::__destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:152 | |
10: void std::_Destroy<tvm::runtime::Module*>(tvm::runtime::Module*, tvm::runtime::Module*) | |
at /usr/include/c++/11/bits/stl_construct.h:185 | |
11: void std::_Destroy<tvm::runtime::Module*, tvm::runtime::Module>(tvm::runtime::Module*, tvm::runtime::Module*, std::allocator<tvm::runtime::Module>&) | |
at /usr/include/c++/11/bits/alloc_traits.h:746 | |
12: std::vector<tvm::runtime::Module, std::allocator<tvm::runtime::Module> >::~vector() | |
at /usr/include/c++/11/bits/stl_vector.h:680 | |
13: tvm::runtime::ModuleNode::~ModuleNode() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/module.h:114 | |
14: tvm::runtime::LibraryModuleNode::~LibraryModuleNode() | |
at /home/zxybazh/tvm-tensorir/src/runtime/library_module.cc:38 | |
15: tvm::runtime::SimpleObjAllocator::Handler<tvm::runtime::LibraryModuleNode>::Deleter_(tvm::runtime::Object*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/memory.h:138 | |
16: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:805 | |
17: tvm::runtime::Object::DecRef() | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/object.h:801 | |
18: tvm::runtime::ObjectInternal::ObjectFree(void*) | |
at /home/zxybazh/tvm-tensorir/src/runtime/object_internal.h:56 | |
19: TVMObjectFree | |
at /home/zxybazh/tvm-tensorir/src/runtime/object.cc:249 | |
20: ffi_call_unix64 | |
21: ffi_call_int | |
22: _call_function_pointer | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:858 | |
23: _ctypes_callproc | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/callproc.c:1201 | |
24: PyCFuncPtr_call | |
at /usr/local/src/conda/python-3.8.8/Modules/_ctypes/_ctypes.c:4201 | |
25: _PyObject_MakeTpCall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:159 | |
26: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:125 | |
27: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
28: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3469 | |
29: function_code_fastcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:284 | |
30: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:411 | |
31: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
32: _PyObject_FastCall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:147 | |
33: call_unbound_noarg | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1465 | |
34: slot_tp_finalize | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:6838 | |
35: PyObject_CallFinalizer | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:310 | |
36: PyObject_CallFinalizerFromDealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/object.c:328 | |
37: subtype_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/typeobject.c:1221 | |
38: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
39: frame_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/frameobject.c:430 | |
40: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
41: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
42: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:168 | |
43: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
44: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
45: tb_dealloc | |
at /tmp/build/80754af9/python_1618343417471/work/Python/traceback.c:167 | |
46: _Py_DECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:478 | |
47: _Py_XDECREF | |
at /tmp/build/80754af9/python_1618343417471/work/Include/object.h:541 | |
48: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:2119 | |
49: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
50: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
51: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
52: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
53: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
54: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
55: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
56: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
57: PyEval_EvalCodeEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4327 | |
58: PyEval_EvalCode | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:718 | |
59: builtin_exec_impl.isra.15 | |
at /tmp/build/80754af9/python_1618343417471/work/Python/bltinmodule.c:1034 | |
60: builtin_exec | |
at /tmp/build/80754af9/python_1618343417471/work/Python/clinic/bltinmodule.c.h:396 | |
61: cfunction_vectorcall_FASTCALL | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/methodobject.c:422 | |
62: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
63: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
64: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
65: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
66: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
67: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
68: _PyObject_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Include/cpython/abstract.h:127 | |
69: call_function | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4963 | |
70: _PyEval_EvalFrameDefault | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:3500 | |
71: PyEval_EvalFrameEx | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:741 | |
72: _PyEval_EvalCodeWithName | |
at /tmp/build/80754af9/python_1618343417471/work/Python/ceval.c:4298 | |
73: _PyFunction_Vectorcall | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:436 | |
74: PyVectorcall_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:200 | |
75: PyObject_Call | |
at /tmp/build/80754af9/python_1618343417471/work/Objects/call.c:228 | |
76: pymain_run_module | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:309 | |
77: pymain_run_python | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:610 | |
78: Py_RunMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:695 | |
79: Py_BytesMain | |
at /tmp/build/80754af9/python_1618343417471/work/Modules/main.c:1141 | |
80: __libc_start_main | |
81: 0x0000560c569b1fe4 | |
at ../sysdeps/x86_64/elf/start.S:103 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #0: GFLOPs: 0.0000. Time: 0.0054 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #1: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #2: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #3: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #4: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #5: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #6: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #7: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #8: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #9: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #10: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #11: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #12: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #13: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #14: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #15: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #16: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #17: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #18: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #19: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #20: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #21: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #22: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #23: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #24: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #25: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #26: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #27: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #28: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #29: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #30: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #0: "vm_mod_fused_layout_transform"] Trial #31: GFLOPs: 0.0000. Time: 0.0047 ms. Best GFLOPs: 0.0000 | |
/home/zxybazh/anaconda3/lib/python3.8/site-packages/xgboost/training.py:17: UserWarning: Old style callback is deprecated. See: https://xgboost.readthedocs.io/en/latest/python/callbacks.html | |
warnings.warn(f'Old style callback is deprecated. See: {link}', UserWarning) | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.400038 tr-a-peak@32: 0.999986 tr-rmse: 0.400495 tr-rmse: 0.400495 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.023301 tr-a-peak@32: 0.999986 tr-rmse: 0.021828 tr-rmse: 0.021828 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.023272 tr-a-peak@32: 0.999986 tr-rmse: 0.021752 tr-rmse: 0.021752 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 75: tr-p-rmse: 0.023272 tr-a-peak@32: 0.999986 tr-rmse: 0.021752 tr-rmse: 0.021752 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [30] tr-p-rmse:0.02327 tr-a-peak@32:0.99999 tr-rmse:0.02176 tr-rmse:0.02176 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #0: "vm_mod_fused_layout_transform" | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #0 has finished. Remaining task(s): 19 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #0: GFLOPs: 180.5319. Time: 1.3163 ms. Best GFLOPs: 180.5319 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #1: GFLOPs: 63.7932. Time: 3.7251 ms. Best GFLOPs: 180.5319 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #2: GFLOPs: 862.0247. Time: 0.2757 ms. Best GFLOPs: 862.0247 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #3: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_1: T.Buffer[(7, 7, 3, 64), "float32"], placeholder_2: T.Buffer[(1, 224, 224, 3), "float32"], T_relu: T.Buffer[(1, 112, 112, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 112, 112, 64], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 230, 230, 3], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([7, 7, 3, 64], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(8, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(2, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(56, thread="threadIdx.x"): | |
for i3_3_init, i2_4_init, i3_4_init in T.grid(16, 14, 4): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_fused * 56 + i0_2_i1_2_i2_2_i3_2_fused) | |
xx = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused * 14 + i2_4_init) | |
ff = T.axis.spatial(64, i3_3_init * 4 + i3_4_init) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i4_0, i5_0, i6_0 in T.grid(7, 1, 3): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(33): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(56, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(230, i4_0 + ((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2) % 7359 // 33) | |
v2 = T.axis.spatial(230, i0_0_i1_0_i2_0_i3_0_fused * 28 + ((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2) % 33) | |
v3 = T.axis.spatial(3, i6_0 + 0) | |
T.where((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2 < 7359) | |
T.reads(placeholder_2[v0, v1 - 3, v2 - 3, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(3 <= v1 and v1 < 227 and 3 <= v2 and v2 < 227, placeholder_2[v0, v1 - 3, v2 - 3, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(4): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(56, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(7, i4_0) | |
v1 = T.axis.spatial(7, (ax0_ax1_ax2_ax3_fused_0 * 112 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) // 64) | |
v2 = T.axis.spatial(3, i6_0) | |
v3 = T.axis.spatial(64, (ax0_ax1_ax2_ax3_fused_0 * 112 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) % 64) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 1, 1, 1, 16, 1, 7, 1, 1, 1, 14, 4): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_fused * 56 + i0_2_i1_2_i2_2_i3_2_fused) | |
xx = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused * 14 + i2_4) | |
ff = T.axis.spatial(64, i3_3 * 4 + i3_4) | |
ry, rx, rc = T.axis.remap("RRR", [i4_0, i5_2, i6_0]) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 1, 14, 64): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_fused * 56 + i0_2_i1_2_i2_2_i3_2_fused + ax1) | |
v2 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused * 14 + ax2) | |
v3 = T.axis.spatial(64, ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[1, 2, 56, 1, 1]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[8, 1, 1, 1, 14]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[1, 1, 1, 16, 4]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[7, 1, 1]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[1, 1, 7]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[3, 1, 1]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=3) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
sch.enter_postproc() | |
l101, l102, l103, l104, l105, l106, l107 = sch.get_loops(block=b74) | |
l108, l109, l110 = sch.split(loop=l107, factors=[None, 56, 4]) | |
sch.vectorize(loop=l110) | |
sch.bind(loop=l109, thread_axis="threadIdx.x") | |
l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b87) | |
l118, l119, l120 = sch.split(loop=l117, factors=[None, 56, 2]) | |
sch.vectorize(loop=l120) | |
sch.bind(loop=l119, thread_axis="threadIdx.x") | |
b121 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit") | |
b122, b123, b124, b125 = sch.get_child_blocks(b121) | |
l126, l127, l128, l129, l130, l131, l132, l133, l134 = sch.get_loops(block=b122) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_unroll_explicit", ann_val=1) | |
l135, l136, l137, l138, l139, l140, l141, l142, l143 = sch.get_loops(block=b123) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_unroll_explicit", ann_val=1) | |
l144, l145, l146, l147, l148, l149, l150, l151, l152, l153, l154, l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b124) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_unroll_explicit", ann_val=1) | |
l164, l165, l166, l167, l168, l169, l170 = sch.get_loops(block=b125) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_unroll_explicit", ann_val=1) | |
b171 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
l172, l173, l174, l175, l176, l177, l178, l179, l180, l181, l182, l183, l184, l185, l186, l187, l188, l189, l190, l191 = sch.get_loops(block=b171) | |
b192 = sch.decompose_reduction(block=b171, loop=l175) | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #4: GFLOPs: 3030.8113. Time: 0.0784 ms. Best GFLOPs: 3030.8113 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #5: GFLOPs: 1506.1816. Time: 0.1578 ms. Best GFLOPs: 3030.8113 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #6: GFLOPs: 2617.7577. Time: 0.0908 ms. Best GFLOPs: 3030.8113 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #7: GFLOPs: 439.0994. Time: 0.5412 ms. Best GFLOPs: 3030.8113 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #8: GFLOPs: 1251.9068. Time: 0.1898 ms. Best GFLOPs: 3030.8113 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #9: GFLOPs: 66.6375. Time: 3.5661 ms. Best GFLOPs: 3030.8113 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #10: GFLOPs: 1080.8004. Time: 0.2199 ms. Best GFLOPs: 3030.8113 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #11: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_1: T.Buffer[(7, 7, 3, 64), "float32"], placeholder_2: T.Buffer[(1, 224, 224, 3), "float32"], T_relu: T.Buffer[(1, 112, 112, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 112, 112, 64], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 230, 230, 3], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([7, 7, 3, 64], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(28, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":1024, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(56, thread="threadIdx.x"): | |
for i1_4_init, i2_4_init, i3_4_init in T.grid(2, 16, 16): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 2 * 8 + i0_2_i1_2_i2_2_i3_2_fused // 14 * 2 + i1_4_init) | |
xx = T.axis.spatial(112, i0_2_i1_2_i2_2_i3_2_fused % 14 // 2 * 16 + i2_4_init) | |
ff = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 2 * 32 + i0_2_i1_2_i2_2_i3_2_fused % 2 * 16 + i3_4_init) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i4_0, i5_0, i6_0 in T.grid(7, 1, 1): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(93): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(56, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(230, i0_0_i1_0_i2_0_i3_0_fused // 2 * 16 + i4_0 + ((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2) % 10305 // 687) | |
v2 = T.axis.spatial(230, ((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2) % 687 // 3) | |
v3 = T.axis.spatial(3, ((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2) % 3) | |
T.where((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2 < 10305) | |
T.reads(placeholder_2[v0, v1 - 3, v2 - 3, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(3 <= v1 and v1 < 227 and 3 <= v2 and v2 < 227, placeholder_2[v0, v1 - 3, v2 - 3, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(6): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(56, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(7, i4_0) | |
v1 = T.axis.spatial(7, (ax0_ax1_ax2_ax3_fused_0 * 112 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) // 96) | |
v2 = T.axis.spatial(3, (ax0_ax1_ax2_ax3_fused_0 * 112 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) % 96 // 32) | |
v3 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 2 * 32 + (ax0_ax1_ax2_ax3_fused_0 * 112 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) % 32) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 7, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 16, 16): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 2 * 8 + i0_2_i1_2_i2_2_i3_2_fused // 14 * 2 + i1_4) | |
xx = T.axis.spatial(112, i0_2_i1_2_i2_2_i3_2_fused % 14 // 2 * 16 + i2_4) | |
ff = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 2 * 32 + i0_2_i1_2_i2_2_i3_2_fused % 2 * 16 + i3_4) | |
ry, rx, rc = T.axis.remap("RRR", [i4_0, i5_1, i6_2]) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 2, 16, 16): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 2 * 8 + i0_2_i1_2_i2_2_i3_2_fused // 14 * 2 + ax1) | |
v2 = T.axis.spatial(112, i0_2_i1_2_i2_2_i3_2_fused % 14 // 2 * 16 + ax2) | |
v3 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 2 * 32 + i0_2_i1_2_i2_2_i3_2_fused % 2 * 16 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[14, 1, 4, 1, 2]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[1, 1, 7, 1, 16]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[2, 1, 2, 1, 16]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[7, 1, 1]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[1, 7, 1]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[1, 1, 3]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=4) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
sch.enter_postproc() | |
l101, l102, l103, l104, l105, l106, l107 = sch.get_loops(block=b74) | |
l108, l109, l110 = sch.split(loop=l107, factors=[None, 56, 2]) | |
sch.vectorize(loop=l110) | |
sch.bind(loop=l109, thread_axis="threadIdx.x") | |
l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b87) | |
l118, l119, l120 = sch.split(loop=l117, factors=[None, 56, 2]) | |
sch.vectorize(loop=l120) | |
sch.bind(loop=l119, thread_axis="threadIdx.x") | |
b121 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit") | |
b122, b123, b124, b125 = sch.get_child_blocks(b121) | |
l126, l127, l128, l129, l130, l131, l132, l133, l134 = sch.get_loops(block=b122) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_unroll_explicit", ann_val=1) | |
l135, l136, l137, l138, l139, l140, l141, l142, l143 = sch.get_loops(block=b123) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_unroll_explicit", ann_val=1) | |
l144, l145, l146, l147, l148, l149, l150, l151, l152, l153, l154, l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b124) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_unroll_explicit", ann_val=1) | |
l164, l165, l166, l167, l168, l169, l170 = sch.get_loops(block=b125) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_unroll_explicit", ann_val=1) | |
b171 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
l172, l173, l174, l175, l176, l177, l178, l179, l180, l181, l182, l183, l184, l185, l186, l187, l188, l189, l190, l191 = sch.get_loops(block=b171) | |
b192 = sch.decompose_reduction(block=b171, loop=l175) | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #12: GFLOPs: 228.6223. Time: 1.0394 ms. Best GFLOPs: 3030.8113 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #13: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_1: T.Buffer[(7, 7, 3, 64), "float32"], placeholder_2: T.Buffer[(1, 224, 224, 3), "float32"], T_relu: T.Buffer[(1, 112, 112, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 112, 112, 64], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 230, 230, 3], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([7, 7, 3, 64], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(14, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(4, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(32, thread="threadIdx.x"): | |
for i1_3_init, i2_3_init, i3_3_init, i2_4_init, i3_4_init in T.grid(4, 2, 2, 7, 4): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused * 8 + i0_1_i1_1_i2_1_i3_1_fused // 2 * 4 + i1_3_init) | |
xx = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_fused % 2 * 56 + i0_2_i1_2_i2_2_i3_2_fused // 8 * 14 + i2_3_init * 7 + i2_4_init) | |
ff = T.axis.spatial(64, i0_2_i1_2_i2_2_i3_2_fused % 8 * 8 + i3_3_init * 4 + i3_4_init) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i4_0, i5_0, i6_0 in T.grid(7, 7, 1): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(79): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(230, i0_0_i1_0_i2_0_i3_0_fused * 16 + i4_0 + ((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2) % 10035 // 669) | |
v2 = T.axis.spatial(230, i5_0 + ((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2) % 669 // 3) | |
v3 = T.axis.spatial(3, ((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2) % 3) | |
T.where((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2 < 10035) | |
T.reads(placeholder_2[v0, v1 - 3, v2 - 3, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(3 <= v1 and v1 < 227 and 3 <= v2 and v2 < 227, placeholder_2[v0, v1 - 3, v2 - 3, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(3): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("placeholder_shared"): | |
v0, v1 = T.axis.remap("SS", [i4_0, i5_0]) | |
v2 = T.axis.spatial(3, (ax0_ax1_ax2_ax3_fused_0 * 64 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) // 64) | |
v3 = T.axis.spatial(64, (ax0_ax1_ax2_ax3_fused_0 * 64 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) % 64) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 1, 4, 2, 2, 1, 1, 3, 1, 1, 7, 4): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused * 8 + i0_1_i1_1_i2_1_i3_1_fused // 2 * 4 + i1_3) | |
xx = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_fused % 2 * 56 + i0_2_i1_2_i2_2_i3_2_fused // 8 * 14 + i2_3 * 7 + i2_4) | |
ff = T.axis.spatial(64, i0_2_i1_2_i2_2_i3_2_fused % 8 * 8 + i3_3 * 4 + i3_4) | |
ry, rx, rc = T.axis.remap("RRR", [i4_0, i5_0, i6_2]) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 4, 14, 8): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused * 8 + i0_1_i1_1_i2_1_i3_1_fused // 2 * 4 + ax1) | |
v2 = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_fused % 2 * 56 + i0_2_i1_2_i2_2_i3_2_fused // 8 * 14 + ax2) | |
v3 = T.axis.spatial(64, i0_2_i1_2_i2_2_i3_2_fused % 8 * 8 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[14, 2, 1, 4, 1]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[1, 2, 4, 2, 7]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[1, 1, 8, 2, 4]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[7, 1, 1]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[7, 1, 1]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[1, 1, 3]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=3) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
sch.enter_postproc() | |
l101, l102, l103, l104, l105, l106, l107 = sch.get_loops(block=b74) | |
l108, l109, l110 = sch.split(loop=l107, factors=[None, 32, 4]) | |
sch.vectorize(loop=l110) | |
sch.bind(loop=l109, thread_axis="threadIdx.x") | |
l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b87) | |
l118, l119, l120 = sch.split(loop=l117, factors=[None, 32, 2]) | |
sch.vectorize(loop=l120) | |
sch.bind(loop=l119, thread_axis="threadIdx.x") | |
b121 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit") | |
b122, b123, b124, b125 = sch.get_child_blocks(b121) | |
l126, l127, l128, l129, l130, l131, l132, l133, l134 = sch.get_loops(block=b122) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_unroll_explicit", ann_val=1) | |
l135, l136, l137, l138, l139, l140, l141, l142, l143 = sch.get_loops(block=b123) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_unroll_explicit", ann_val=1) | |
l144, l145, l146, l147, l148, l149, l150, l151, l152, l153, l154, l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b124) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_unroll_explicit", ann_val=1) | |
l164, l165, l166, l167, l168, l169, l170 = sch.get_loops(block=b125) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_unroll_explicit", ann_val=1) | |
b171 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
l172, l173, l174, l175, l176, l177, l178, l179, l180, l181, l182, l183, l184, l185, l186, l187, l188, l189, l190, l191 = sch.get_loops(block=b171) | |
b192 = sch.decompose_reduction(block=b171, loop=l175) | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #14: GFLOPs: 998.6803. Time: 0.2379 ms. Best GFLOPs: 3030.8113 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #15: GFLOPs: 87.4917. Time: 2.7161 ms. Best GFLOPs: 3030.8113 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #16: GFLOPs: 257.3368. Time: 0.9234 ms. Best GFLOPs: 3030.8113 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #17: GFLOPs: 3198.9460. Time: 0.0743 ms. Best GFLOPs: 3198.9460 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #18: GFLOPs: 883.0038. Time: 0.2691 ms. Best GFLOPs: 3198.9460 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #19: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_1: T.Buffer[(7, 7, 3, 64), "float32"], placeholder_2: T.Buffer[(1, 224, 224, 3), "float32"], T_relu: T.Buffer[(1, 112, 112, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 112, 112, 64], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 230, 230, 3], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([7, 7, 3, 64], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(8, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(2, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(56, thread="threadIdx.x"): | |
for i4_0 in T.serial(1): | |
for i2_3_init, i1_4_init, i2_4_init, i3_4_init in T.grid(2, 14, 4, 8): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused * 14 + i1_4_init) | |
xx = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_fused * 56 + i0_2_i1_2_i2_2_i3_2_fused // 8 * 8 + i2_3_init * 4 + i2_4_init) | |
ff = T.axis.spatial(64, i0_2_i1_2_i2_2_i3_2_fused % 8 * 8 + i3_4_init) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i5_0, i6_0 in T.grid(7, 3): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(66): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(56, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(230, i0_0_i1_0_i2_0_i3_0_fused * 28 + ((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2) % 7359 // 223) | |
v2 = T.axis.spatial(230, i5_0 + ((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2) % 223) | |
v3 = T.axis.spatial(3, i6_0 + 0) | |
T.where((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2 < 7359) | |
T.reads(placeholder_2[v0, v1 - 3, v2 - 3, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(3 <= v1 and v1 < 227 and 3 <= v2 and v2 < 227, placeholder_2[v0, v1 - 3, v2 - 3, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(4): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(56, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(7, (ax0_ax1_ax2_ax3_fused_0 * 112 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) // 64) | |
v1, v2 = T.axis.remap("SS", [i5_0, i6_0]) | |
v3 = T.axis.spatial(64, (ax0_ax1_ax2_ax3_fused_0 * 112 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) % 64) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 1, 1, 2, 1, 7, 1, 1, 1, 14, 4, 8): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused * 14 + i1_4) | |
xx = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_fused * 56 + i0_2_i1_2_i2_2_i3_2_fused // 8 * 8 + i2_3 * 4 + i2_4) | |
ff = T.axis.spatial(64, i0_2_i1_2_i2_2_i3_2_fused % 8 * 8 + i3_4) | |
ry, rx, rc = T.axis.remap("RRR", [i4_2, i5_0, i6_0]) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 14, 8, 8): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused * 14 + ax1) | |
v2 = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_fused * 56 + i0_2_i1_2_i2_2_i3_2_fused // 8 * 8 + ax2) | |
v3 = T.axis.spatial(64, i0_2_i1_2_i2_2_i3_2_fused % 8 * 8 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[8, 1, 1, 1, 14]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[1, 2, 7, 2, 4]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[1, 1, 8, 1, 8]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[1, 1, 7]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[7, 1, 1]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[3, 1, 1]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=2) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
sch.enter_postproc() | |
l101, l102, l103, l104, l105, l106, l107 = sch.get_loops(block=b74) | |
l108, l109, l110 = sch.split(loop=l107, factors=[None, 56, 2]) | |
sch.vectorize(loop=l110) | |
sch.bind(loop=l109, thread_axis="threadIdx.x") | |
l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b87) | |
l118, l119, l120 = sch.split(loop=l117, factors=[None, 56, 2]) | |
sch.vectorize(loop=l120) | |
sch.bind(loop=l119, thread_axis="threadIdx.x") | |
b121 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit") | |
b122, b123, b124, b125 = sch.get_child_blocks(b121) | |
l126, l127, l128, l129, l130, l131, l132, l133, l134 = sch.get_loops(block=b122) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_auto_unroll_max_step", ann_val=64) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_unroll_explicit", ann_val=1) | |
l135, l136, l137, l138, l139, l140, l141, l142, l143 = sch.get_loops(block=b123) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_auto_unroll_max_step", ann_val=64) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_unroll_explicit", ann_val=1) | |
l144, l145, l146, l147, l148, l149, l150, l151, l152, l153, l154, l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b124) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_auto_unroll_max_step", ann_val=64) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_unroll_explicit", ann_val=1) | |
l164, l165, l166, l167, l168, l169, l170 = sch.get_loops(block=b125) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_auto_unroll_max_step", ann_val=64) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_unroll_explicit", ann_val=1) | |
b171 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
l172, l173, l174, l175, l176, l177, l178, l179, l180, l181, l182, l183, l184, l185, l186, l187, l188, l189, l190, l191 = sch.get_loops(block=b171) | |
b192 = sch.decompose_reduction(block=b171, loop=l176) | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #20: GFLOPs: 704.5566. Time: 0.3373 ms. Best GFLOPs: 3198.9460 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #21: GFLOPs: 874.5672. Time: 0.2717 ms. Best GFLOPs: 3198.9460 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #22: GFLOPs: 32.4109. Time: 7.3319 ms. Best GFLOPs: 3198.9460 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #23: GFLOPs: 123.4548. Time: 1.9249 ms. Best GFLOPs: 3198.9460 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #24: GFLOPs: 2620.3134. Time: 0.0907 ms. Best GFLOPs: 3198.9460 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #25: GFLOPs: 1450.6291. Time: 0.1638 ms. Best GFLOPs: 3198.9460 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #26: GFLOPs: 211.2150. Time: 1.1251 ms. Best GFLOPs: 3198.9460 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #27: GFLOPs: 268.4614. Time: 0.8852 ms. Best GFLOPs: 3198.9460 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #28: GFLOPs: 130.4696. Time: 1.8214 ms. Best GFLOPs: 3198.9460 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #29: GFLOPs: 55.7257. Time: 4.2643 ms. Best GFLOPs: 3198.9460 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #30: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_1: T.Buffer[(7, 7, 3, 64), "float32"], placeholder_2: T.Buffer[(1, 224, 224, 3), "float32"], T_relu: T.Buffer[(1, 112, 112, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 112, 112, 64], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 230, 230, 3], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([7, 7, 3, 64], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(8, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":16, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(8, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(56, thread="threadIdx.x"): | |
for i4_0, i5_0 in T.grid(1, 1): | |
for i1_3_init, i2_3_init, i1_4_init, i3_4_init in T.grid(2, 2, 14, 4): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 2 * 28 + i1_3_init * 14 + i1_4_init) | |
xx = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused % 2 * 56 + i0_1_i1_1_i2_1_i3_1_fused // 2 * 14 + i0_2_i1_2_i2_2_i3_2_fused // 8 * 2 + i2_3_init) | |
ff = T.axis.spatial(64, i0_1_i1_1_i2_1_i3_1_fused % 2 * 32 + i0_2_i1_2_i2_2_i3_2_fused % 8 * 4 + i3_4_init) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i6_0 in T.serial(3): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(43): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(56, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(3): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(230, i0_0_i1_0_i2_0_i3_0_fused // 2 * 56 + ((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 3 + ax0_ax1_ax2_ax3_fused_2) % 7137 // 117) | |
v2 = T.axis.spatial(230, i0_0_i1_0_i2_0_i3_0_fused % 2 * 112 + ((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 3 + ax0_ax1_ax2_ax3_fused_2) % 117) | |
v3 = T.axis.spatial(3, i6_0 + 0) | |
T.where((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 3 + ax0_ax1_ax2_ax3_fused_2 < 7137) | |
T.reads(placeholder_2[v0, v1 - 3, v2 - 3, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(3 <= v1 and v1 < 227 and 3 <= v2 and v2 < 227, placeholder_2[v0, v1 - 3, v2 - 3, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(28): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(56, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(7, (ax0_ax1_ax2_ax3_fused_0 * 112 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) // 448) | |
v1 = T.axis.spatial(7, (ax0_ax1_ax2_ax3_fused_0 * 112 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) % 448 // 64) | |
v2 = T.axis.spatial(3, i6_0) | |
v3 = T.axis.spatial(64, (ax0_ax1_ax2_ax3_fused_0 * 112 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) % 64) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 1, 2, 2, 1, 7, 7, 1, 1, 14, 1, 4): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 2 * 28 + i1_3 * 14 + i1_4) | |
xx = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused % 2 * 56 + i0_1_i1_1_i2_1_i3_1_fused // 2 * 14 + i0_2_i1_2_i2_2_i3_2_fused // 8 * 2 + i2_3) | |
ff = T.axis.spatial(64, i0_1_i1_1_i2_1_i3_1_fused % 2 * 32 + i0_2_i1_2_i2_2_i3_2_fused % 8 * 4 + i3_4) | |
ry, rx, rc = T.axis.remap("RRR", [i4_2, i5_2, i6_0]) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 28, 2, 4): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 2 * 28 + ax1) | |
v2 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused % 2 * 56 + i0_1_i1_1_i2_1_i3_1_fused // 2 * 14 + i0_2_i1_2_i2_2_i3_2_fused // 8 * 2 + ax2) | |
v3 = T.axis.spatial(64, i0_1_i1_1_i2_1_i3_1_fused % 2 * 32 + i0_2_i1_2_i2_2_i3_2_fused % 8 * 4 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[4, 1, 1, 2, 14]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[2, 4, 7, 2, 1]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[1, 2, 8, 1, 4]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[1, 1, 7]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[1, 1, 7]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[3, 1, 1]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=2) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=1) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
sch.enter_postproc() | |
l101, l102, l103, l104, l105, l106, l107 = sch.get_loops(block=b74) | |
l108, l109, l110 = sch.split(loop=l107, factors=[None, 56, 3]) | |
sch.vectorize(loop=l110) | |
sch.bind(loop=l109, thread_axis="threadIdx.x") | |
l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b87) | |
l118, l119, l120 = sch.split(loop=l117, factors=[None, 56, 2]) | |
sch.vectorize(loop=l120) | |
sch.bind(loop=l119, thread_axis="threadIdx.x") | |
b121 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit") | |
b122, b123, b124, b125 = sch.get_child_blocks(b121) | |
l126, l127, l128, l129, l130, l131, l132, l133, l134 = sch.get_loops(block=b122) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_auto_unroll_max_step", ann_val=16) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_unroll_explicit", ann_val=1) | |
l135, l136, l137, l138, l139, l140, l141, l142, l143 = sch.get_loops(block=b123) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_auto_unroll_max_step", ann_val=16) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_unroll_explicit", ann_val=1) | |
l144, l145, l146, l147, l148, l149, l150, l151, l152, l153, l154, l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b124) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_auto_unroll_max_step", ann_val=16) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_unroll_explicit", ann_val=1) | |
l164, l165, l166, l167, l168, l169, l170 = sch.get_loops(block=b125) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_auto_unroll_max_step", ann_val=16) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_unroll_explicit", ann_val=1) | |
b171 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
l172, l173, l174, l175, l176, l177, l178, l179, l180, l181, l182, l183, l184, l185, l186, l187, l188, l189, l190, l191 = sch.get_loops(block=b171) | |
b192 = sch.decompose_reduction(block=b171, loop=l177) | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu"] Trial #31: GFLOPs: 812.9236. Time: 0.2923 ms. Best GFLOPs: 3198.9460 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 3.969706 a-peak@32: 0.863064 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.875849 tr-a-peak@32: 0.050899 tr-rmse: 0.392205 tr-rmse: 0.392205 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.034387 tr-a-peak@32: 0.999999 tr-rmse: 0.022466 tr-rmse: 0.022466 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.038638 tr-a-peak@32: 0.999999 tr-rmse: 0.022278 tr-rmse: 0.022278 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [14] tr-p-rmse:0.02555 tr-a-peak@32:1.00000 tr-rmse:0.02951 tr-rmse:0.02951 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #1: "vm_mod_fused_nn_conv2d_add_nn_relu" | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #1 has finished. Remaining task(s): 18 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #0: GFLOPs: 286.8822. Time: 0.0063 ms. Best GFLOPs: 286.8822 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #1: GFLOPs: 286.5829. Time: 0.0063 ms. Best GFLOPs: 286.8822 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #2: GFLOPs: 286.9773. Time: 0.0063 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #3: GFLOPs: 286.7742. Time: 0.0063 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #4: GFLOPs: 284.2400. Time: 0.0064 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #5: GFLOPs: 284.1844. Time: 0.0064 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #6: GFLOPs: 284.1610. Time: 0.0064 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #7: GFLOPs: 284.3318. Time: 0.0064 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #8: GFLOPs: 284.2536. Time: 0.0064 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #9: GFLOPs: 284.3603. Time: 0.0064 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #10: GFLOPs: 284.3180. Time: 0.0064 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #11: GFLOPs: 284.3937. Time: 0.0064 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #12: GFLOPs: 284.3008. Time: 0.0064 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #13: GFLOPs: 284.3790. Time: 0.0064 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #14: GFLOPs: 284.3845. Time: 0.0064 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #15: GFLOPs: 284.0858. Time: 0.0064 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #16: GFLOPs: 284.3989. Time: 0.0064 ms. Best GFLOPs: 286.9773 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #17: GFLOPs: 292.8340. Time: 0.0062 ms. Best GFLOPs: 292.8340 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #18: GFLOPs: 284.5754. Time: 0.0063 ms. Best GFLOPs: 292.8340 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #19: GFLOPs: 284.1538. Time: 0.0064 ms. Best GFLOPs: 292.8340 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #20: GFLOPs: 293.0965. Time: 0.0062 ms. Best GFLOPs: 293.0965 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #21: GFLOPs: 284.4264. Time: 0.0064 ms. Best GFLOPs: 293.0965 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #22: GFLOPs: 293.0809. Time: 0.0062 ms. Best GFLOPs: 293.0965 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #23: GFLOPs: 284.4304. Time: 0.0064 ms. Best GFLOPs: 293.0965 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #24: GFLOPs: 284.4495. Time: 0.0064 ms. Best GFLOPs: 293.0965 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #25: GFLOPs: 284.4162. Time: 0.0064 ms. Best GFLOPs: 293.0965 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #26: GFLOPs: 284.4193. Time: 0.0064 ms. Best GFLOPs: 293.0965 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #27: GFLOPs: 284.4024. Time: 0.0064 ms. Best GFLOPs: 293.0965 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #28: GFLOPs: 284.4777. Time: 0.0063 ms. Best GFLOPs: 293.0965 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #29: GFLOPs: 283.9976. Time: 0.0064 ms. Best GFLOPs: 293.0965 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #30: GFLOPs: 284.4067. Time: 0.0064 ms. Best GFLOPs: 293.0965 | |
[14:41:40] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #2: "vm_mod_fused_nn_max_pool2d"] Trial #31: GFLOPs: 284.0343. Time: 0.0064 ms. Best GFLOPs: 293.0965 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.246248 a-peak@32: 0.987542 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.803377 tr-a-peak@32: 0.060932 tr-rmse: 0.325151 tr-rmse: 0.325151 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.088199 tr-a-peak@32: 0.999982 tr-rmse: 0.017458 tr-rmse: 0.017458 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.090216 tr-a-peak@32: 0.999982 tr-rmse: 0.017325 tr-rmse: 0.017325 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [12] tr-p-rmse:0.08089 tr-a-peak@32:0.99581 tr-rmse:0.03017 tr-rmse:0.03017 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #2: "vm_mod_fused_nn_max_pool2d" | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #2 has finished. Remaining task(s): 17 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #0: GFLOPs: 330.3803. Time: 0.3894 ms. Best GFLOPs: 330.3803 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #1: GFLOPs: 635.3665. Time: 0.2025 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #2: GFLOPs: 65.1227. Time: 1.9755 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #3: GFLOPs: 92.5347. Time: 1.3903 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #4: GFLOPs: 597.6308. Time: 0.2153 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #5: GFLOPs: 124.3644. Time: 1.0345 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #6: GFLOPs: 555.5140. Time: 0.2316 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #7: GFLOPs: 431.0264. Time: 0.2985 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #8: GFLOPs: 234.9563. Time: 0.5476 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #9: GFLOPs: 296.8389. Time: 0.4334 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #10: GFLOPs: 305.2682. Time: 0.4214 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #11: GFLOPs: 286.5480. Time: 0.4490 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #12: GFLOPs: 18.7452. Time: 6.8631 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #13: GFLOPs: 245.0988. Time: 0.5249 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #14: GFLOPs: 480.6540. Time: 0.2677 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #15: GFLOPs: 231.6771. Time: 0.5553 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #16: GFLOPs: 558.1659. Time: 0.2305 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #17: GFLOPs: 551.6620. Time: 0.2332 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #18: GFLOPs: 383.8231. Time: 0.3352 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #19: GFLOPs: 140.9558. Time: 0.9127 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #20: GFLOPs: 81.8270. Time: 1.5722 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #21: GFLOPs: 53.2334. Time: 2.4167 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #22: GFLOPs: 132.2959. Time: 0.9725 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #23: GFLOPs: 75.1869. Time: 1.7111 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #24: GFLOPs: 4.3807. Time: 29.3675 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #25: GFLOPs: 52.6769. Time: 2.4423 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #26: GFLOPs: 115.4836. Time: 1.1140 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #27: GFLOPs: 211.1236. Time: 0.6094 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #28: GFLOPs: 69.2966. Time: 1.8565 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #29: GFLOPs: 296.2717. Time: 0.4342 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #30: GFLOPs: 85.1386. Time: 1.5111 ms. Best GFLOPs: 635.3665 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu"] Trial #31: GFLOPs: 32.7273. Time: 3.9310 ms. Best GFLOPs: 635.3665 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 1.358813 a-peak@32: 0.975116 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.212889 tr-a-peak@32: 0.995814 tr-rmse: 0.318102 tr-rmse: 0.318102 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.090687 tr-a-peak@32: 0.999982 tr-rmse: 0.032430 tr-rmse: 0.032430 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.092890 tr-a-peak@32: 0.999982 tr-rmse: 0.032326 tr-rmse: 0.032326 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [13] tr-p-rmse:0.08855 tr-a-peak@32:0.99581 tr-rmse:0.03755 tr-rmse:0.03755 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #3: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu" | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #3 has finished. Remaining task(s): 16 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #0: GFLOPs: 479.5018. Time: 0.2687 ms. Best GFLOPs: 479.5018 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #1: GFLOPs: 400.7620. Time: 0.3215 ms. Best GFLOPs: 479.5018 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #2: GFLOPs: 67.7644. Time: 1.9015 ms. Best GFLOPs: 479.5018 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #3: GFLOPs: 541.9501. Time: 0.2378 ms. Best GFLOPs: 541.9501 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #4: GFLOPs: 93.5163. Time: 1.3779 ms. Best GFLOPs: 541.9501 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #5: GFLOPs: 332.1356. Time: 0.3879 ms. Best GFLOPs: 541.9501 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #6: GFLOPs: 159.3918. Time: 0.8084 ms. Best GFLOPs: 541.9501 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #7: GFLOPs: 530.7538. Time: 0.2428 ms. Best GFLOPs: 541.9501 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #8: GFLOPs: 494.5874. Time: 0.2605 ms. Best GFLOPs: 541.9501 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #9: GFLOPs: 30.4532. Time: 4.2311 ms. Best GFLOPs: 541.9501 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #10: GFLOPs: 359.5534. Time: 0.3584 ms. Best GFLOPs: 541.9501 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #11: GFLOPs: 41.0293. Time: 3.1405 ms. Best GFLOPs: 541.9501 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #12: GFLOPs: 32.7001. Time: 3.9404 ms. Best GFLOPs: 541.9501 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #13: GFLOPs: 302.0808. Time: 0.4265 ms. Best GFLOPs: 541.9501 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #14: GFLOPs: 349.1851. Time: 0.3690 ms. Best GFLOPs: 541.9501 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #15: GFLOPs: 326.7315. Time: 0.3944 ms. Best GFLOPs: 541.9501 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #16: GFLOPs: 595.7175. Time: 0.2163 ms. Best GFLOPs: 595.7175 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #17: GFLOPs: 276.5458. Time: 0.4659 ms. Best GFLOPs: 595.7175 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #18: GFLOPs: 72.9879. Time: 1.7654 ms. Best GFLOPs: 595.7175 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #19: GFLOPs: 225.3864. Time: 0.5717 ms. Best GFLOPs: 595.7175 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #20: GFLOPs: 563.5579. Time: 0.2286 ms. Best GFLOPs: 595.7175 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #21: GFLOPs: 82.1950. Time: 1.5676 ms. Best GFLOPs: 595.7175 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #22: GFLOPs: 410.1290. Time: 0.3142 ms. Best GFLOPs: 595.7175 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #23: GFLOPs: 58.9400. Time: 2.1862 ms. Best GFLOPs: 595.7175 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #24: GFLOPs: 71.3278. Time: 1.8065 ms. Best GFLOPs: 595.7175 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #25: GFLOPs: 139.5297. Time: 0.9235 ms. Best GFLOPs: 595.7175 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #26: GFLOPs: 38.3549. Time: 3.3595 ms. Best GFLOPs: 595.7175 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #27: GFLOPs: 297.6361. Time: 0.4329 ms. Best GFLOPs: 595.7175 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #28: GFLOPs: 80.3955. Time: 1.6027 ms. Best GFLOPs: 595.7175 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #29: GFLOPs: 632.6474. Time: 0.2037 ms. Best GFLOPs: 632.6474 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #30: GFLOPs: 54.5540. Time: 2.3619 ms. Best GFLOPs: 632.6474 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu"] Trial #31: GFLOPs: 189.6052. Time: 0.6796 ms. Best GFLOPs: 632.6474 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.049624 a-peak@32: 0.926147 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.704695 tr-a-peak@32: 0.995814 tr-rmse: 0.314108 tr-rmse: 0.314108 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.063973 tr-a-peak@32: 0.999981 tr-rmse: 0.037766 tr-rmse: 0.037766 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.063420 tr-a-peak@32: 0.999981 tr-rmse: 0.037767 tr-rmse: 0.037767 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [9] tr-p-rmse:0.06079 tr-a-peak@32:0.99581 tr-rmse:0.05846 tr-rmse:0.05846 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #4: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu" | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #4 has finished. Remaining task(s): 15 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #0: GFLOPs: 104.3495. Time: 1.1098 ms. Best GFLOPs: 104.3495 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #1: GFLOPs: 106.6877. Time: 1.0855 ms. Best GFLOPs: 106.6877 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #2: GFLOPs: 54.0173. Time: 2.1439 ms. Best GFLOPs: 106.6877 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #3: GFLOPs: 347.3480. Time: 0.3334 ms. Best GFLOPs: 347.3480 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #4: GFLOPs: 36.4929. Time: 3.1734 ms. Best GFLOPs: 347.3480 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #5: GFLOPs: 350.8541. Time: 0.3301 ms. Best GFLOPs: 350.8541 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #6: GFLOPs: 21.5840. Time: 5.3654 ms. Best GFLOPs: 350.8541 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #7: GFLOPs: 548.8997. Time: 0.2110 ms. Best GFLOPs: 548.8997 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #8: GFLOPs: 20.4343. Time: 5.6672 ms. Best GFLOPs: 548.8997 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #9: GFLOPs: 1552.9004. Time: 0.0746 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #10: GFLOPs: 50.3252. Time: 2.3012 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #11: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 128), "float32"], placeholder_1: T.Buffer[(3, 3, 64, 128), "float32"], placeholder_2: T.Buffer[(1, 56, 56, 64), "float32"], T_relu: T.Buffer[(1, 28, 28, 128), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 28, 28, 128], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 58, 58, 64], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([3, 3, 64, 128], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(1, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":1024, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(14, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(32, thread="threadIdx.x"): | |
for i2_3_init, i2_4_init, i3_4_init in T.grid(14, 2, 8): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(28, i0_1_i1_1_i2_1_i3_1_fused // 2 * 4 + i0_2_i1_2_i2_2_i3_2_fused // 8) | |
xx = T.axis.spatial(28, i2_3_init * 2 + i2_4_init) | |
ff = T.axis.spatial(128, i0_1_i1_1_i2_1_i3_1_fused % 2 * 64 + i0_2_i1_2_i2_2_i3_2_fused % 8 * 8 + i3_4_init) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 56, 56, 64], "float32"], ["TENSOR", [3, 3, 64, 128], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i4_0, i5_0, i6_0 in T.grid(3, 1, 64): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(49): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(58, i4_0 + ((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2) % 3135 // 57) | |
v2 = T.axis.spatial(58, ((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2) % 57) | |
v3 = T.axis.spatial(64, i6_0 + 0) | |
T.where((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2 < 3135) | |
T.reads(placeholder_2[v0, v1 - 1, v2 - 1, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(1 <= v1 and v1 < 57 and 1 <= v2 and v2 < 57, placeholder_2[v0, v1 - 1, v2 - 1, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(6): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(3, i4_0) | |
v1 = T.axis.spatial(3, (ax0_ax1_ax2_ax3_fused_0 * 64 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) // 128) | |
v2 = T.axis.spatial(64, i6_0) | |
v3 = T.axis.spatial(128, (ax0_ax1_ax2_ax3_fused_0 * 64 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) % 128) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 3, 1, 1, 1, 14, 1, 1, 1, 1, 1, 1, 2, 8): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(28, i0_1_i1_1_i2_1_i3_1_fused // 2 * 4 + i0_2_i1_2_i2_2_i3_2_fused // 8) | |
xx = T.axis.spatial(28, i2_3 * 2 + i2_4) | |
ff = T.axis.spatial(128, i0_1_i1_1_i2_1_i3_1_fused % 2 * 64 + i0_2_i1_2_i2_2_i3_2_fused % 8 * 8 + i3_4) | |
ry, rx, rc = T.axis.remap("RRR", [i4_0, i5_1, i6_0]) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 56, 56, 64], "float32"], ["TENSOR", [3, 3, 64, 128], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 1, 28, 8): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(28, i0_1_i1_1_i2_1_i3_1_fused // 2 * 4 + i0_2_i1_2_i2_2_i3_2_fused // 8 + ax1) | |
v2 = T.axis.spatial(28, ax2) | |
v3 = T.axis.spatial(128, i0_1_i1_1_i2_1_i3_1_fused % 2 * 64 + i0_2_i1_2_i2_2_i3_2_fused % 8 * 8 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[1, 7, 4, 1, 1]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[1, 1, 1, 14, 2]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[1, 2, 8, 1, 8]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[3, 1, 1]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[1, 3, 1]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[64, 1, 1]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=4) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
sch.enter_postproc() | |
l101, l102, l103, l104, l105, l106, l107 = sch.get_loops(block=b74) | |
l108, l109, l110 = sch.split(loop=l107, factors=[None, 32, 2]) | |
sch.vectorize(loop=l110) | |
sch.bind(loop=l109, thread_axis="threadIdx.x") | |
l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b87) | |
l118, l119, l120 = sch.split(loop=l117, factors=[None, 32, 2]) | |
sch.vectorize(loop=l120) | |
sch.bind(loop=l119, thread_axis="threadIdx.x") | |
b121 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit") | |
b122, b123, b124, b125 = sch.get_child_blocks(b121) | |
l126, l127, l128, l129, l130, l131, l132, l133, l134 = sch.get_loops(block=b122) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_unroll_explicit", ann_val=1) | |
l135, l136, l137, l138, l139, l140, l141, l142, l143 = sch.get_loops(block=b123) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_unroll_explicit", ann_val=1) | |
l144, l145, l146, l147, l148, l149, l150, l151, l152, l153, l154, l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b124) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_unroll_explicit", ann_val=1) | |
l164, l165, l166, l167, l168, l169, l170 = sch.get_loops(block=b125) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_unroll_explicit", ann_val=1) | |
b171 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
l172, l173, l174, l175, l176, l177, l178, l179, l180, l181, l182, l183, l184, l185, l186, l187, l188, l189, l190, l191 = sch.get_loops(block=b171) | |
b192 = sch.decompose_reduction(block=b171, loop=l175) | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #12: GFLOPs: 29.3420. Time: 3.9468 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #13: GFLOPs: 225.2848. Time: 0.5140 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #14: GFLOPs: 1478.4790. Time: 0.0783 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #15: GFLOPs: 52.6852. Time: 2.1981 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #16: GFLOPs: 1134.9145. Time: 0.1020 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #17: GFLOPs: 1123.0330. Time: 0.1031 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #18: GFLOPs: 9.2174. Time: 12.5638 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #19: GFLOPs: 158.6061. Time: 0.7302 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #20: GFLOPs: 900.6749. Time: 0.1286 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #21: GFLOPs: 717.7440. Time: 0.1613 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #22: GFLOPs: 42.2763. Time: 2.7393 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #23: GFLOPs: 195.2390. Time: 0.5932 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #24: GFLOPs: 9.8909. Time: 11.7084 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #25: GFLOPs: 27.5552. Time: 4.2027 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #26: GFLOPs: 1098.3952. Time: 0.1054 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #27: GFLOPs: 175.3289. Time: 0.6605 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #28: GFLOPs: 42.6613. Time: 2.7145 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #29: GFLOPs: 213.0887. Time: 0.5435 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #30: GFLOPs: 156.7256. Time: 0.7389 ms. Best GFLOPs: 1552.9004 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1"] Trial #31: GFLOPs: 461.3285. Time: 0.2510 ms. Best GFLOPs: 1552.9004 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.108648 a-peak@32: 0.680826 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.402283 tr-a-peak@32: 0.995814 tr-rmse: 0.308661 tr-rmse: 0.308661 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.050787 tr-a-peak@32: 0.999982 tr-rmse: 0.035468 tr-rmse: 0.035468 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.052154 tr-a-peak@32: 0.999982 tr-rmse: 0.035414 tr-rmse: 0.035414 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [10] tr-p-rmse:0.04639 tr-a-peak@32:0.99581 tr-rmse:0.04990 tr-rmse:0.04990 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #5: "vm_mod_fused_nn_conv2d_add_nn_relu_1" | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #5 has finished. Remaining task(s): 14 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #0: GFLOPs: 122.6745. Time: 0.1055 ms. Best GFLOPs: 122.6745 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #1: GFLOPs: 16.4136. Time: 0.7887 ms. Best GFLOPs: 122.6745 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #2: GFLOPs: 53.5771. Time: 0.2416 ms. Best GFLOPs: 122.6745 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #3: GFLOPs: 23.2213. Time: 0.5575 ms. Best GFLOPs: 122.6745 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #4: GFLOPs: 564.9516. Time: 0.0229 ms. Best GFLOPs: 564.9516 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #5: GFLOPs: 39.0463. Time: 0.3315 ms. Best GFLOPs: 564.9516 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #6: GFLOPs: 404.8811. Time: 0.0320 ms. Best GFLOPs: 564.9516 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #7: GFLOPs: 35.1817. Time: 0.3680 ms. Best GFLOPs: 564.9516 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #8: GFLOPs: 124.2743. Time: 0.1042 ms. Best GFLOPs: 564.9516 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #9: GFLOPs: 22.8645. Time: 0.5662 ms. Best GFLOPs: 564.9516 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #10: GFLOPs: 7.8452. Time: 1.6501 ms. Best GFLOPs: 564.9516 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #11: GFLOPs: 531.2349. Time: 0.0244 ms. Best GFLOPs: 564.9516 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #12: GFLOPs: 1245.7964. Time: 0.0104 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #13: GFLOPs: 441.3877. Time: 0.0293 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #14: GFLOPs: 428.5374. Time: 0.0302 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #15: GFLOPs: 906.3594. Time: 0.0143 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #16: GFLOPs: 72.4367. Time: 0.1787 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #17: GFLOPs: 257.0078. Time: 0.0504 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #18: GFLOPs: 53.8819. Time: 0.2403 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #19: GFLOPs: 69.5943. Time: 0.1860 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #20: GFLOPs: 1049.3247. Time: 0.0123 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #21: GFLOPs: 18.3852. Time: 0.7041 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #22: GFLOPs: 209.5851. Time: 0.0618 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #23: GFLOPs: 610.9557. Time: 0.0212 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #24: GFLOPs: 699.4623. Time: 0.0185 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #25: GFLOPs: 229.4480. Time: 0.0564 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #26: GFLOPs: 25.9995. Time: 0.4979 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #27: GFLOPs: 20.0078. Time: 0.6470 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #28: GFLOPs: 196.2850. Time: 0.0660 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #29: GFLOPs: 28.4187. Time: 0.4555 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #30: GFLOPs: 806.3965. Time: 0.0161 ms. Best GFLOPs: 1245.7964 | |
[14:41:41] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #6: "vm_mod_fused_nn_conv2d_add"] Trial #31: GFLOPs: 303.6617. Time: 0.0426 ms. Best GFLOPs: 1245.7964 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.187410 a-peak@32: 0.639580 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.407119 tr-a-peak@32: 0.668149 tr-rmse: 0.289562 tr-rmse: 0.289562 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.080152 tr-a-peak@32: 0.999981 tr-rmse: 0.094175 tr-rmse: 0.094175 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.080146 tr-a-peak@32: 0.999981 tr-rmse: 0.094095 tr-rmse: 0.094095 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 75: tr-p-rmse: 0.080146 tr-a-peak@32: 0.999981 tr-rmse: 0.094095 tr-rmse: 0.094095 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [27] tr-p-rmse:0.08015 tr-a-peak@32:0.99998 tr-rmse:0.09412 tr-rmse:0.09412 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #6: "vm_mod_fused_nn_conv2d_add" | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #6 has finished. Remaining task(s): 13 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #0: GFLOPs: 742.2272. Time: 0.1713 ms. Best GFLOPs: 742.2272 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #1: GFLOPs: 113.2852. Time: 1.1224 ms. Best GFLOPs: 742.2272 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #2: GFLOPs: 75.0282. Time: 1.6946 ms. Best GFLOPs: 742.2272 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #3: GFLOPs: 40.8453. Time: 3.1129 ms. Best GFLOPs: 742.2272 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #4: GFLOPs: 16.4706. Time: 7.7196 ms. Best GFLOPs: 742.2272 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #5: GFLOPs: 94.8549. Time: 1.3404 ms. Best GFLOPs: 742.2272 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #6: GFLOPs: 1237.7699. Time: 0.1027 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #7: GFLOPs: 87.6553. Time: 1.4505 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #8: GFLOPs: 1003.9592. Time: 0.1266 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #9: GFLOPs: 80.3445. Time: 1.5825 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #10: GFLOPs: 11.0787. Time: 11.4766 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #11: GFLOPs: 565.8800. Time: 0.2247 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #12: GFLOPs: 42.4922. Time: 2.9922 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #13: GFLOPs: 322.1066. Time: 0.3947 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #14: GFLOPs: 15.9722. Time: 7.9604 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #15: GFLOPs: 48.1994. Time: 2.6379 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #16: GFLOPs: 14.8083. Time: 8.5861 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #17: GFLOPs: 117.5133. Time: 1.0820 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #18: GFLOPs: 97.8849. Time: 1.2989 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #19: GFLOPs: 1140.5067. Time: 0.1115 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #20: GFLOPs: 564.3063. Time: 0.2253 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #21: GFLOPs: 294.5971. Time: 0.4316 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #22: GFLOPs: 12.1724. Time: 10.4454 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #23: GFLOPs: 50.8344. Time: 2.5012 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #24: GFLOPs: 470.3097. Time: 0.2703 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #25: GFLOPs: 259.2796. Time: 0.4904 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #26: GFLOPs: 14.0353. Time: 9.0590 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #27: GFLOPs: 211.8448. Time: 0.6002 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #28: GFLOPs: 61.5464. Time: 2.0659 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #29: GFLOPs: 73.3675. Time: 1.7330 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #30: GFLOPs: 58.9696. Time: 2.1561 ms. Best GFLOPs: 1237.7699 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1"] Trial #31: GFLOPs: 63.6403. Time: 1.9979 ms. Best GFLOPs: 1237.7699 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.194619 a-peak@32: 1.000000 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.583827 tr-a-peak@32: 0.672398 tr-rmse: 0.288438 tr-rmse: 0.288438 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.067568 tr-a-peak@32: 0.999984 tr-rmse: 0.095708 tr-rmse: 0.095708 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.067690 tr-a-peak@32: 0.999982 tr-rmse: 0.095616 tr-rmse: 0.095616 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [18] tr-p-rmse:0.06724 tr-a-peak@32:0.99998 tr-rmse:0.09604 tr-rmse:0.09604 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #7: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1" | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #7 has finished. Remaining task(s): 12 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #0: GFLOPs: 88.3971. Time: 1.4372 ms. Best GFLOPs: 88.3971 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #1: GFLOPs: 27.6942. Time: 4.5874 ms. Best GFLOPs: 88.3971 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #2: GFLOPs: 47.8571. Time: 2.6547 ms. Best GFLOPs: 88.3971 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #3: GFLOPs: 18.6734. Time: 6.8036 ms. Best GFLOPs: 88.3971 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #4: GFLOPs: 138.5499. Time: 0.9170 ms. Best GFLOPs: 138.5499 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #5: GFLOPs: 149.7843. Time: 0.8482 ms. Best GFLOPs: 149.7843 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #6: GFLOPs: 45.9370. Time: 2.7656 ms. Best GFLOPs: 149.7843 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #7: GFLOPs: 23.5059. Time: 5.4048 ms. Best GFLOPs: 149.7843 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #8: GFLOPs: 61.9246. Time: 2.0516 ms. Best GFLOPs: 149.7843 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #9: GFLOPs: 232.9236. Time: 0.5454 ms. Best GFLOPs: 232.9236 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #10: GFLOPs: 762.3641. Time: 0.1666 ms. Best GFLOPs: 762.3641 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #11: GFLOPs: 21.5715. Time: 5.8895 ms. Best GFLOPs: 762.3641 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #12: GFLOPs: 14.5871. Time: 8.7094 ms. Best GFLOPs: 762.3641 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #13: GFLOPs: 13.1669. Time: 9.6489 ms. Best GFLOPs: 762.3641 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #14: GFLOPs: 7.9418. Time: 15.9970 ms. Best GFLOPs: 762.3641 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #15: GFLOPs: 599.8774. Time: 0.2118 ms. Best GFLOPs: 762.3641 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #16: GFLOPs: 199.4739. Time: 0.6369 ms. Best GFLOPs: 762.3641 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #17: GFLOPs: 996.5704. Time: 0.1275 ms. Best GFLOPs: 996.5704 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #18: GFLOPs: 75.1199. Time: 1.6912 ms. Best GFLOPs: 996.5704 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #19: GFLOPs: 459.3848. Time: 0.2766 ms. Best GFLOPs: 996.5704 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #20: GFLOPs: 36.2475. Time: 3.5050 ms. Best GFLOPs: 996.5704 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #21: GFLOPs: 429.3834. Time: 0.2959 ms. Best GFLOPs: 996.5704 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #22: GFLOPs: 1996.7256. Time: 0.0636 ms. Best GFLOPs: 1996.7256 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #23: GFLOPs: 641.8337. Time: 0.1979 ms. Best GFLOPs: 1996.7256 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #24: GFLOPs: 112.3634. Time: 1.1307 ms. Best GFLOPs: 1996.7256 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #25: GFLOPs: 97.0309. Time: 1.3093 ms. Best GFLOPs: 1996.7256 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #26: GFLOPs: 111.3724. Time: 1.1407 ms. Best GFLOPs: 1996.7256 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #27: GFLOPs: 39.1317. Time: 3.2466 ms. Best GFLOPs: 1996.7256 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #28: GFLOPs: 430.9527. Time: 0.2948 ms. Best GFLOPs: 1996.7256 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #29: GFLOPs: 67.5153. Time: 1.8817 ms. Best GFLOPs: 1996.7256 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #30: GFLOPs: 11.2542. Time: 11.2887 ms. Best GFLOPs: 1996.7256 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1"] Trial #31: GFLOPs: 199.0442. Time: 0.6383 ms. Best GFLOPs: 1996.7256 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.103402 a-peak@32: 0.480906 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.752528 tr-a-peak@32: 0.670863 tr-rmse: 0.288110 tr-rmse: 0.288110 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.073885 tr-a-peak@32: 0.999982 tr-rmse: 0.096901 tr-rmse: 0.096901 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.073938 tr-a-peak@32: 0.999982 tr-rmse: 0.096860 tr-rmse: 0.096860 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [22] tr-p-rmse:0.07374 tr-a-peak@32:0.99998 tr-rmse:0.09716 tr-rmse:0.09716 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #8: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_1" | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #8 has finished. Remaining task(s): 11 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #0: GFLOPs: 385.6584. Time: 0.3000 ms. Best GFLOPs: 385.6584 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #1: GFLOPs: 17.6814. Time: 6.5439 ms. Best GFLOPs: 385.6584 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #2: GFLOPs: 27.0263. Time: 4.2812 ms. Best GFLOPs: 385.6584 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #3: GFLOPs: 34.0306. Time: 3.4001 ms. Best GFLOPs: 385.6584 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #4: GFLOPs: 382.3983. Time: 0.3026 ms. Best GFLOPs: 385.6584 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #5: GFLOPs: 2291.7312. Time: 0.0505 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #6: GFLOPs: 1555.5300. Time: 0.0744 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #7: GFLOPs: 203.7158. Time: 0.5680 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #8: GFLOPs: 2280.3034. Time: 0.0507 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #9: GFLOPs: 79.9299. Time: 1.4476 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #10: GFLOPs: 1296.6258. Time: 0.0892 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #11: GFLOPs: 168.4271. Time: 0.6870 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #12: GFLOPs: 126.8220. Time: 0.9123 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #13: GFLOPs: 266.5798. Time: 0.4340 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #14: GFLOPs: 780.0108. Time: 0.1483 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #15: GFLOPs: 1880.4119. Time: 0.0615 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #16: GFLOPs: 26.4066. Time: 4.3817 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #17: GFLOPs: 501.8968. Time: 0.2305 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #18: GFLOPs: 558.9765. Time: 0.2070 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #19: GFLOPs: 25.7613. Time: 4.4915 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #20: GFLOPs: 788.3747. Time: 0.1468 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #21: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 256), "float32"], placeholder_1: T.Buffer[(3, 3, 128, 256), "float32"], placeholder_2: T.Buffer[(1, 28, 28, 128), "float32"], T_relu: T.Buffer[(1, 14, 14, 256), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 14, 14, 256], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 30, 30, 128], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([3, 3, 128, 256], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(2, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":16, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(2, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(32, thread="threadIdx.x"): | |
for i4_0, i5_0 in T.grid(1, 1): | |
for i1_3_init, i3_3_init, i1_4_init, i2_4_init, i3_4_init in T.grid(2, 2, 7, 7, 2): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(14, i1_3_init * 7 + i1_4_init) | |
xx = T.axis.spatial(14, i0_0_i1_0_i2_0_i3_0_fused * 7 + i2_4_init) | |
ff = T.axis.spatial(256, i0_1_i1_1_i2_1_i3_1_fused * 128 + i0_2_i1_2_i2_2_i3_2_fused * 4 + i3_3_init * 2 + i3_4_init) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 28, 28, 128], "float32"], ["TENSOR", [3, 3, 128, 256], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i6_0 in T.serial(64): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(7): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(30, ((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2) % 870 // 30) | |
v2 = T.axis.spatial(30, i0_0_i1_0_i2_0_i3_0_fused * 14 + ((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2) % 30 // 2) | |
v3 = T.axis.spatial(128, i6_0 * 2 + ((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2) % 2) | |
T.where((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2 < 870) | |
T.reads(placeholder_2[v0, v1 - 1, v2 - 1, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(1 <= v1 and v1 < 29 and 1 <= v2 and v2 < 29, placeholder_2[v0, v1 - 1, v2 - 1, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(36): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(3, (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) // 1536) | |
v1 = T.axis.spatial(3, (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 1536 // 512) | |
v2 = T.axis.spatial(128, i6_0 * 2 + (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 512 // 256) | |
v3 = T.axis.spatial(256, (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 256) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 2, 1, 2, 1, 2, 3, 3, 1, 1, 7, 7, 2): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(14, i1_3 * 7 + i1_4) | |
xx = T.axis.spatial(14, i0_0_i1_0_i2_0_i3_0_fused * 7 + i2_4) | |
ff = T.axis.spatial(256, i0_1_i1_1_i2_1_i3_1_fused * 128 + i0_2_i1_2_i2_2_i3_2_fused * 4 + i3_3 * 2 + i3_4) | |
ry, rx = T.axis.remap("RR", [i4_2, i5_2]) | |
rc = T.axis.reduce(128, i6_0 * 2 + i6_1) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 28, 28, 128], "float32"], ["TENSOR", [3, 3, 128, 256], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 14, 7, 4): | |
with T.block("conv2d_nhwc_local"): | |
v0, v1 = T.axis.remap("SS", [ax0, ax1]) | |
v2 = T.axis.spatial(14, i0_0_i1_0_i2_0_i3_0_fused * 7 + ax2) | |
v3 = T.axis.spatial(256, i0_1_i1_1_i2_1_i3_1_fused * 128 + i0_2_i1_2_i2_2_i3_2_fused * 4 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[1, 1, 1, 2, 7]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[2, 1, 1, 1, 7]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[1, 2, 32, 2, 2]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[1, 1, 3]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[1, 1, 3]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[64, 2, 1]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=1) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
sch.enter_postproc() | |
l101, l102, l103, l104, l105, l106, l107 = sch.get_loops(block=b74) | |
l108, l109, l110 = sch.split(loop=l107, factors=[None, 32, 4]) | |
sch.vectorize(loop=l110) | |
sch.bind(loop=l109, thread_axis="threadIdx.x") | |
l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b87) | |
l118, l119, l120 = sch.split(loop=l117, factors=[None, 32, 4]) | |
sch.vectorize(loop=l120) | |
sch.bind(loop=l119, thread_axis="threadIdx.x") | |
b121 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit") | |
b122, b123, b124, b125 = sch.get_child_blocks(b121) | |
l126, l127, l128, l129, l130, l131, l132, l133, l134 = sch.get_loops(block=b122) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_auto_unroll_max_step", ann_val=16) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_unroll_explicit", ann_val=1) | |
l135, l136, l137, l138, l139, l140, l141, l142, l143 = sch.get_loops(block=b123) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_auto_unroll_max_step", ann_val=16) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_unroll_explicit", ann_val=1) | |
l144, l145, l146, l147, l148, l149, l150, l151, l152, l153, l154, l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b124) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_auto_unroll_max_step", ann_val=16) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_unroll_explicit", ann_val=1) | |
l164, l165, l166, l167, l168, l169, l170 = sch.get_loops(block=b125) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_auto_unroll_max_step", ann_val=16) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_unroll_explicit", ann_val=1) | |
b171 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
l172, l173, l174, l175, l176, l177, l178, l179, l180, l181, l182, l183, l184, l185, l186, l187, l188, l189, l190, l191 = sch.get_loops(block=b171) | |
b192 = sch.decompose_reduction(block=b171, loop=l177) | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #22: GFLOPs: 1851.0858. Time: 0.0625 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #23: GFLOPs: 81.7298. Time: 1.4157 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #24: GFLOPs: 78.4198. Time: 1.4755 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #25: GFLOPs: 266.4723. Time: 0.4342 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #26: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 256), "float32"], placeholder_1: T.Buffer[(3, 3, 128, 256), "float32"], placeholder_2: T.Buffer[(1, 28, 28, 128), "float32"], T_relu: T.Buffer[(1, 14, 14, 256), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 14, 14, 256], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 30, 30, 128], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([3, 3, 128, 256], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(7, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(32, thread="threadIdx.x"): | |
for i4_0, i5_0 in T.grid(1, 1): | |
for i1_4_init, i2_4_init, i3_4_init in T.grid(2, 14, 8): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(14, i0_0_i1_0_i2_0_i3_0_fused * 2 + i1_4_init) | |
xx = T.axis.spatial(14, i2_4_init) | |
ff = T.axis.spatial(256, i0_2_i1_2_i2_2_i3_2_fused * 8 + i3_4_init) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 28, 28, 128], "float32"], ["TENSOR", [3, 3, 128, 256], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i6_0 in T.serial(128): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(5): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(30, i0_0_i1_0_i2_0_i3_0_fused * 4 + (ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) % 145 // 29) | |
v2 = T.axis.spatial(30, (ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) % 29) | |
v3 = T.axis.spatial(128, i6_0 + 0) | |
T.where(ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1 < 145) | |
T.reads(placeholder_2[v0, v1 - 1, v2 - 1, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":1}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(1 <= v1 and v1 < 29 and 1 <= v2 and v2 < 29, placeholder_2[v0, v1 - 1, v2 - 1, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(18): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(3, (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) // 768) | |
v1 = T.axis.spatial(3, (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 768 // 256) | |
v2 = T.axis.spatial(128, i6_0) | |
v3 = T.axis.spatial(256, (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 256) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 2, 14, 8): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(14, i0_0_i1_0_i2_0_i3_0_fused * 2 + i1_4) | |
xx = T.axis.spatial(14, i2_4) | |
ff = T.axis.spatial(256, i0_2_i1_2_i2_2_i3_2_fused * 8 + i3_4) | |
ry, rx, rc = T.axis.remap("RRR", [i4_2, i5_2, i6_0]) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 28, 28, 128], "float32"], ["TENSOR", [3, 3, 128, 256], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 2, 14, 8): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(14, i0_0_i1_0_i2_0_i3_0_fused * 2 + ax1) | |
v2 = T.axis.spatial(14, ax2) | |
v3 = T.axis.spatial(256, i0_2_i1_2_i2_2_i3_2_fused * 8 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[7, 1, 1, 1, 2]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 14]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[1, 1, 32, 1, 8]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[1, 1, 3]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[1, 1, 3]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[128, 1, 1]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=0) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=0) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
sch.enter_postproc() | |
l101, l102, l103, l104, l105, l106, l107 = sch.get_loops(block=b74) | |
l108, l109 = sch.split(loop=l107, factors=[None, 32]) | |
sch.bind(loop=l109, thread_axis="threadIdx.x") | |
l110, l111, l112, l113, l114, l115, l116 = sch.get_loops(block=b87) | |
l117, l118, l119 = sch.split(loop=l116, factors=[None, 32, 4]) | |
sch.vectorize(loop=l119) | |
sch.bind(loop=l118, thread_axis="threadIdx.x") | |
b120 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b120, ann_key="meta_schedule.unroll_explicit") | |
b121, b122, b123, b124 = sch.get_child_blocks(b120) | |
l125, l126, l127, l128, l129, l130, l131, l132 = sch.get_loops(block=b121) | |
l133, l134, l135, l136, l137, l138, l139, l140, l141 = sch.get_loops(block=b122) | |
l142, l143, l144, l145, l146, l147, l148, l149, l150, l151, l152, l153, l154, l155, l156, l157, l158, l159, l160, l161 = sch.get_loops(block=b123) | |
l162, l163, l164, l165, l166, l167, l168 = sch.get_loops(block=b124) | |
b169 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
l170, l171, l172, l173, l174, l175, l176, l177, l178, l179, l180, l181, l182, l183, l184, l185, l186, l187, l188, l189 = sch.get_loops(block=b169) | |
b190 = sch.decompose_reduction(block=b169, loop=l175) | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #27: GFLOPs: 1072.7530. Time: 0.1079 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #28: GFLOPs: 26.9562. Time: 4.2924 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #29: GFLOPs: 997.0992. Time: 0.1160 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #30: GFLOPs: 289.1136. Time: 0.4002 ms. Best GFLOPs: 2291.7312 | |
[14:41:42] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2"] Trial #31: GFLOPs: 21.4354. Time: 5.3979 ms. Best GFLOPs: 2291.7312 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.095176 a-peak@32: 0.937789 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.616692 tr-a-peak@32: 0.666516 tr-rmse: 0.282541 tr-rmse: 0.282541 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.083896 tr-a-peak@32: 0.995813 tr-rmse: 0.096244 tr-rmse: 0.096244 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.083564 tr-a-peak@32: 0.995813 tr-rmse: 0.096124 tr-rmse: 0.096124 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [23] tr-p-rmse:0.08350 tr-a-peak@32:0.99581 tr-rmse:0.09606 tr-rmse:0.09606 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #9: "vm_mod_fused_nn_conv2d_add_nn_relu_2" | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #9 has finished. Remaining task(s): 10 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #0: GFLOPs: 22.7857. Time: 0.5659 ms. Best GFLOPs: 22.7857 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #1: GFLOPs: 1238.4849. Time: 0.0104 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #2: GFLOPs: 178.4079. Time: 0.0723 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #3: GFLOPs: 30.6690. Time: 0.4205 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #4: GFLOPs: 692.1987. Time: 0.0186 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #5: GFLOPs: 1148.0013. Time: 0.0112 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #6: GFLOPs: 780.8190. Time: 0.0165 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #7: GFLOPs: 10.6236. Time: 1.2138 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #8: GFLOPs: 55.1357. Time: 0.2339 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #9: GFLOPs: 14.2388. Time: 0.9056 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #10: GFLOPs: 20.9431. Time: 0.6157 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #11: GFLOPs: 237.0230. Time: 0.0544 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #12: GFLOPs: 218.5046. Time: 0.0590 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #13: GFLOPs: 18.1518. Time: 0.7104 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #14: GFLOPs: 101.0683. Time: 0.1276 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #15: GFLOPs: 14.6653. Time: 0.8793 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #16: GFLOPs: 133.3059. Time: 0.0967 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #17: GFLOPs: 226.5678. Time: 0.0569 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #18: GFLOPs: 9.5797. Time: 1.3461 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #19: GFLOPs: 78.7110. Time: 0.1638 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #20: GFLOPs: 34.1988. Time: 0.3771 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #21: GFLOPs: 102.0015. Time: 0.1264 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #22: GFLOPs: 9.5524. Time: 1.3499 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #23: GFLOPs: 192.1871. Time: 0.0671 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #24: GFLOPs: 59.6661. Time: 0.2161 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #25: GFLOPs: 570.5707. Time: 0.0226 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #26: GFLOPs: 223.9202. Time: 0.0576 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #27: GFLOPs: 488.9247. Time: 0.0264 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #28: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 256), "float32"], placeholder_1: T.Buffer[(1, 1, 128, 256), "float32"], placeholder_2: T.Buffer[(1, 28, 28, 128), "float32"], T_add: T.Buffer[(1, 14, 14, 256), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 14, 14, 256], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 28, 28, 128], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([1, 1, 128, 256], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(1, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":16, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(14, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(128, thread="threadIdx.x"): | |
for i4_0, i5_0 in T.grid(1, 1): | |
for i1_3_init, i3_3_init, i1_4_init in T.grid(7, 2, 2): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(14, i1_3_init * 2 + i1_4_init) | |
xx = T.axis.spatial(14, i0_1_i1_1_i2_1_i3_1_fused) | |
ff = T.axis.spatial(256, i0_2_i1_2_i2_2_i3_2_fused * 2 + i3_3_init) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 28, 28, 128], "float32"], ["TENSOR", [1, 1, 128, 256], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i6_0 in T.serial(128): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(6): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(128, thread="threadIdx.x"): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(28, (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1) // 27) | |
v2 = T.axis.spatial(28, (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1) % 27) | |
v3 = T.axis.spatial(128, i6_0) | |
T.where(ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 < 729) | |
T.reads(placeholder_2[v0, v1, v2, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":1}) | |
pad_temp_shared[v0, v1, v2, v3] = placeholder_2[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(1): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(128, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(1, 0) | |
v2 = T.axis.spatial(128, i6_0) | |
v3 = T.axis.spatial(256, ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 1, 7, 1, 2, 1, 1, 1, 1, 2, 1, 1): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(14, i1_3 * 2 + i1_4) | |
xx = T.axis.spatial(14, i0_1_i1_1_i2_1_i3_1_fused) | |
ff = T.axis.spatial(256, i0_2_i1_2_i2_2_i3_2_fused * 2 + i3_3) | |
ry = T.axis.reduce(1, 0) | |
rx = T.axis.reduce(1, 0) | |
rc = T.axis.reduce(128, i6_0) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 28, 28, 128], "float32"], ["TENSOR", [1, 1, 128, 256], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 14, 1, 2): | |
with T.block("conv2d_nhwc_local"): | |
v0, v1 = T.axis.remap("SS", [ax0, ax1]) | |
v2 = T.axis.spatial(14, i0_1_i1_1_i2_1_i3_1_fused + ax2) | |
v3 = T.axis.spatial(256, i0_2_i1_2_i2_2_i3_2_fused * 2 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_add[v0, v1, v2, v3]) | |
T_add[v0, v1, v2, v3] = conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3] | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l4, l5, l6, l7, l8, l9, l10 = sch.get_loops(block=b1) | |
v11, v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l4, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l16, l17, l18, l19, l20 = sch.split(loop=l4, factors=[v11, v12, v13, v14, v15]) | |
v21, v22, v23, v24, v25 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 7, 2]) | |
l26, l27, l28, l29, l30 = sch.split(loop=l5, factors=[v21, v22, v23, v24, v25]) | |
v31, v32, v33, v34, v35 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[1, 14, 1, 1, 1]) | |
l36, l37, l38, l39, l40 = sch.split(loop=l6, factors=[v31, v32, v33, v34, v35]) | |
v41, v42, v43, v44, v45 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[1, 1, 128, 2, 1]) | |
l46, l47, l48, l49, l50 = sch.split(loop=l7, factors=[v41, v42, v43, v44, v45]) | |
v51, v52, v53 = sch.sample_perfect_tile(loop=l8, n=3, max_innermost_factor=64, decision=[1, 1, 1]) | |
l54, l55, l56 = sch.split(loop=l8, factors=[v51, v52, v53]) | |
v57, v58, v59 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[1, 1, 1]) | |
l60, l61, l62 = sch.split(loop=l9, factors=[v57, v58, v59]) | |
v63, v64, v65 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[128, 1, 1]) | |
l66, l67, l68 = sch.split(loop=l10, factors=[v63, v64, v65]) | |
sch.reorder(l16, l26, l36, l46, l17, l27, l37, l47, l18, l28, l38, l48, l54, l60, l66, l55, l61, l67, l19, l29, l39, l49, l56, l62, l68, l20, l30, l40, l50) | |
l69 = sch.fuse(l16, l26, l36, l46) | |
sch.bind(loop=l69, thread_axis="blockIdx.x") | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="vthread.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b72 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b72, loop=l71, preserve_unit_loops=True) | |
b73 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b73, loop=l66, preserve_unit_loops=True) | |
l74, l75, l76, l77, l78, l79, l80, l81, l82, l83 = sch.get_loops(block=b73) | |
l84 = sch.fuse(l80, l81, l82, l83) | |
v85 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=0) | |
sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v85) | |
b86 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b86, loop=l66, preserve_unit_loops=True) | |
l87, l88, l89, l90, l91, l92, l93, l94, l95, l96 = sch.get_loops(block=b86) | |
l97 = sch.fuse(l93, l94, l95, l96) | |
v98 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b86, ann_key="meta_schedule.cooperative_fetch", ann_val=v98) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v99 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=1) | |
sch.annotate(block_or_loop=b3, ann_key="meta_schedule.unroll_explicit", ann_val=v99) | |
sch.enter_postproc() | |
l100, l101, l102, l103, l104, l105, l106 = sch.get_loops(block=b73) | |
l107, l108 = sch.split(loop=l106, factors=[None, 128]) | |
sch.bind(loop=l108, thread_axis="threadIdx.x") | |
l109, l110, l111, l112, l113, l114, l115 = sch.get_loops(block=b86) | |
l116, l117, l118 = sch.split(loop=l115, factors=[None, 128, 2]) | |
sch.vectorize(loop=l118) | |
sch.bind(loop=l117, thread_axis="threadIdx.x") | |
b119 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b119, ann_key="meta_schedule.unroll_explicit") | |
b120, b121, b122, b123 = sch.get_child_blocks(b119) | |
l124, l125, l126, l127, l128, l129, l130, l131 = sch.get_loops(block=b120) | |
sch.annotate(block_or_loop=l124, ann_key="pragma_auto_unroll_max_step", ann_val=16) | |
sch.annotate(block_or_loop=l124, ann_key="pragma_unroll_explicit", ann_val=1) | |
l132, l133, l134, l135, l136, l137, l138, l139, l140 = sch.get_loops(block=b121) | |
sch.annotate(block_or_loop=l132, ann_key="pragma_auto_unroll_max_step", ann_val=16) | |
sch.annotate(block_or_loop=l132, ann_key="pragma_unroll_explicit", ann_val=1) | |
l141, l142, l143, l144, l145, l146, l147, l148, l149, l150, l151, l152, l153, l154, l155, l156, l157, l158, l159, l160 = sch.get_loops(block=b122) | |
sch.annotate(block_or_loop=l141, ann_key="pragma_auto_unroll_max_step", ann_val=16) | |
sch.annotate(block_or_loop=l141, ann_key="pragma_unroll_explicit", ann_val=1) | |
l161, l162, l163, l164, l165, l166, l167 = sch.get_loops(block=b123) | |
sch.annotate(block_or_loop=l161, ann_key="pragma_auto_unroll_max_step", ann_val=16) | |
sch.annotate(block_or_loop=l161, ann_key="pragma_unroll_explicit", ann_val=1) | |
b168 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
l169, l170, l171, l172, l173, l174, l175, l176, l177, l178, l179, l180, l181, l182, l183, l184, l185, l186, l187, l188 = sch.get_loops(block=b168) | |
b189 = sch.decompose_reduction(block=b168, loop=l174) | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #29: GFLOPs: 189.7959. Time: 0.0679 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #30: GFLOPs: 218.6293. Time: 0.0590 ms. Best GFLOPs: 1238.4849 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #10: "vm_mod_fused_nn_conv2d_add_1"] Trial #31: GFLOPs: 217.6956. Time: 0.0592 ms. Best GFLOPs: 1238.4849 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.124635 a-peak@32: 0.932545 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.592826 tr-a-peak@32: 0.667930 tr-rmse: 0.273255 tr-rmse: 0.273255 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.081284 tr-a-peak@32: 0.999986 tr-rmse: 0.115487 tr-rmse: 0.115487 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.081462 tr-a-peak@32: 0.999986 tr-rmse: 0.115491 tr-rmse: 0.115491 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 75: tr-p-rmse: 0.081462 tr-a-peak@32: 0.999986 tr-rmse: 0.115491 tr-rmse: 0.115491 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [25] tr-p-rmse:0.08128 tr-a-peak@32:0.99999 tr-rmse:0.11549 tr-rmse:0.11549 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #10: "vm_mod_fused_nn_conv2d_add_1" | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #10 has finished. Remaining task(s): 9 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #0: GFLOPs: 309.0906. Time: 0.3719 ms. Best GFLOPs: 309.0906 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #1: GFLOPs: 67.8481. Time: 1.6943 ms. Best GFLOPs: 309.0906 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #2: GFLOPs: 14.3524. Time: 8.0093 ms. Best GFLOPs: 309.0906 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #3: GFLOPs: 13.8562. Time: 8.2962 ms. Best GFLOPs: 309.0906 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #4: GFLOPs: 109.8259. Time: 1.0467 ms. Best GFLOPs: 309.0906 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #5: GFLOPs: 22.4905. Time: 5.1112 ms. Best GFLOPs: 309.0906 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #6: GFLOPs: 13.5658. Time: 8.4738 ms. Best GFLOPs: 309.0906 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #7: GFLOPs: 12.2544. Time: 9.3805 ms. Best GFLOPs: 309.0906 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #8: GFLOPs: 120.3279. Time: 0.9553 ms. Best GFLOPs: 309.0906 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #9: GFLOPs: 22.3912. Time: 5.1339 ms. Best GFLOPs: 309.0906 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #10: GFLOPs: 18.1296. Time: 6.3406 ms. Best GFLOPs: 309.0906 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #11: GFLOPs: 16.8280. Time: 6.8311 ms. Best GFLOPs: 309.0906 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #12: GFLOPs: 279.8690. Time: 0.4107 ms. Best GFLOPs: 309.0906 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #13: GFLOPs: 351.2187. Time: 0.3273 ms. Best GFLOPs: 351.2187 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #14: GFLOPs: 57.6130. Time: 1.9953 ms. Best GFLOPs: 351.2187 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #15: GFLOPs: 174.6020. Time: 0.6584 ms. Best GFLOPs: 351.2187 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #16: GFLOPs: 47.0580. Time: 2.4428 ms. Best GFLOPs: 351.2187 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #17: GFLOPs: 164.1515. Time: 0.7003 ms. Best GFLOPs: 351.2187 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #18: GFLOPs: 9.9546. Time: 11.5477 ms. Best GFLOPs: 351.2187 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #19: GFLOPs: 26.9861. Time: 4.2597 ms. Best GFLOPs: 351.2187 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #20: GFLOPs: 55.9224. Time: 2.0556 ms. Best GFLOPs: 351.2187 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #21: GFLOPs: 64.8493. Time: 1.7726 ms. Best GFLOPs: 351.2187 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #22: GFLOPs: 592.9306. Time: 0.1939 ms. Best GFLOPs: 592.9306 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #23: GFLOPs: 6.8606. Time: 16.7555 ms. Best GFLOPs: 592.9306 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #24: GFLOPs: 37.2099. Time: 3.0893 ms. Best GFLOPs: 592.9306 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #25: GFLOPs: 139.6683. Time: 0.8230 ms. Best GFLOPs: 592.9306 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #26: GFLOPs: 563.1596. Time: 0.2041 ms. Best GFLOPs: 592.9306 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #27: GFLOPs: 329.9689. Time: 0.3484 ms. Best GFLOPs: 592.9306 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #28: GFLOPs: 82.6947. Time: 1.3901 ms. Best GFLOPs: 592.9306 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #29: GFLOPs: 298.7382. Time: 0.3848 ms. Best GFLOPs: 592.9306 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #30: GFLOPs: 75.0749. Time: 1.5312 ms. Best GFLOPs: 592.9306 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2"] Trial #31: GFLOPs: 2.1485. Time: 53.5033 ms. Best GFLOPs: 592.9306 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.371838 a-peak@32: 0.891431 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.627512 tr-a-peak@32: 0.667930 tr-rmse: 0.273228 tr-rmse: 0.273228 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.076899 tr-a-peak@32: 0.999986 tr-rmse: 0.115848 tr-rmse: 0.115848 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.076869 tr-a-peak@32: 0.999986 tr-rmse: 0.115820 tr-rmse: 0.115820 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 75: tr-p-rmse: 0.076869 tr-a-peak@32: 0.999986 tr-rmse: 0.115820 tr-rmse: 0.115820 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [33] tr-p-rmse:0.07687 tr-a-peak@32:0.99999 tr-rmse:0.11582 tr-rmse:0.11582 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #11: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_2" | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #11 has finished. Remaining task(s): 8 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #0: GFLOPs: 71.3301. Time: 1.6109 ms. Best GFLOPs: 71.3301 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #1: GFLOPs: 15.4883. Time: 7.4187 ms. Best GFLOPs: 71.3301 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #2: GFLOPs: 48.9681. Time: 2.3465 ms. Best GFLOPs: 71.3301 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #3: GFLOPs: 19.1799. Time: 5.9908 ms. Best GFLOPs: 71.3301 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #4: GFLOPs: 483.5489. Time: 0.2376 ms. Best GFLOPs: 483.5489 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #5: GFLOPs: 15.5635. Time: 7.3829 ms. Best GFLOPs: 483.5489 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #6: GFLOPs: 11.6915. Time: 9.8279 ms. Best GFLOPs: 483.5489 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #7: GFLOPs: 8.9453. Time: 12.8450 ms. Best GFLOPs: 483.5489 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #8: GFLOPs: 11.9221. Time: 9.6378 ms. Best GFLOPs: 483.5489 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #9: GFLOPs: 59.2422. Time: 1.9395 ms. Best GFLOPs: 483.5489 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #10: GFLOPs: 750.4066. Time: 0.1531 ms. Best GFLOPs: 750.4066 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #11: GFLOPs: 220.0120. Time: 0.5223 ms. Best GFLOPs: 750.4066 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #12: GFLOPs: 42.8067. Time: 2.6842 ms. Best GFLOPs: 750.4066 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #13: GFLOPs: 529.0477. Time: 0.2172 ms. Best GFLOPs: 750.4066 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #14: GFLOPs: 21.4663. Time: 5.3527 ms. Best GFLOPs: 750.4066 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #15: GFLOPs: 1192.0679. Time: 0.0964 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #16: GFLOPs: 16.5844. Time: 6.9284 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #17: GFLOPs: 101.7458. Time: 1.1293 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #18: GFLOPs: 201.0007. Time: 0.5717 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #19: GFLOPs: 27.1742. Time: 4.2284 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #20: GFLOPs: 100.3793. Time: 1.1447 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #21: GFLOPs: 26.1055. Time: 4.4015 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #22: GFLOPs: 216.9639. Time: 0.5296 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #23: GFLOPs: 10.6665. Time: 10.7723 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #24: GFLOPs: 205.2974. Time: 0.5597 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #25: GFLOPs: 82.4651. Time: 1.3934 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #26: GFLOPs: 69.7178. Time: 1.6481 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #27: GFLOPs: 568.3458. Time: 0.2022 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #28: GFLOPs: 12.9994. Time: 8.8391 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #29: GFLOPs: 77.5164. Time: 1.4823 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #30: GFLOPs: 12.3624. Time: 9.2946 ms. Best GFLOPs: 1192.0679 | |
[14:41:43] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2"] Trial #31: GFLOPs: 87.9660. Time: 1.3062 ms. Best GFLOPs: 1192.0679 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.109196 a-peak@32: 0.911815 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.678573 tr-a-peak@32: 0.667930 tr-rmse: 0.273333 tr-rmse: 0.273333 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.077316 tr-a-peak@32: 0.999982 tr-rmse: 0.116884 tr-rmse: 0.116884 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.077287 tr-a-peak@32: 0.999982 tr-rmse: 0.116860 tr-rmse: 0.116860 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [16] tr-p-rmse:0.07695 tr-a-peak@32:0.99581 tr-rmse:0.11759 tr-rmse:0.11759 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #12: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_2" | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #12 has finished. Remaining task(s): 7 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #0: GFLOPs: 596.2629. Time: 0.1940 ms. Best GFLOPs: 596.2629 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #1: GFLOPs: 116.8903. Time: 0.9894 ms. Best GFLOPs: 596.2629 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #2: GFLOPs: 11.1097. Time: 10.4104 ms. Best GFLOPs: 596.2629 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #3: GFLOPs: 316.7659. Time: 0.3651 ms. Best GFLOPs: 596.2629 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #4: GFLOPs: 184.7872. Time: 0.6259 ms. Best GFLOPs: 596.2629 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #5: GFLOPs: 317.4154. Time: 0.3644 ms. Best GFLOPs: 596.2629 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #6: GFLOPs: 249.3298. Time: 0.4639 ms. Best GFLOPs: 596.2629 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #7: GFLOPs: 138.0053. Time: 0.8381 ms. Best GFLOPs: 596.2629 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #8: GFLOPs: 199.0005. Time: 0.5812 ms. Best GFLOPs: 596.2629 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #9: GFLOPs: 2453.0249. Time: 0.0471 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #10: GFLOPs: 27.5749. Time: 4.1942 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #11: GFLOPs: 1190.5859. Time: 0.0971 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #12: GFLOPs: 559.5036. Time: 0.2067 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #13: GFLOPs: 21.9074. Time: 5.2793 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #14: GFLOPs: 300.9421. Time: 0.3843 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #15: GFLOPs: 450.2700. Time: 0.2569 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #16: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 512), "float32"], placeholder_1: T.Buffer[(3, 3, 256, 512), "float32"], placeholder_2: T.Buffer[(1, 14, 14, 256), "float32"], T_relu: T.Buffer[(1, 7, 7, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 7, 7, 512], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 16, 16, 256], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([3, 3, 256, 512], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(28, thread="blockIdx.x"): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(32, thread="threadIdx.x"): | |
for i4_0, i5_0 in T.grid(1, 1): | |
for i3_3_init, i1_4_init, i3_4_init in T.grid(2, 7, 2): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(7, i1_4_init) | |
xx = T.axis.spatial(7, i0_0_i1_0_i2_0_i3_0_fused // 4) | |
ff = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused % 4 * 128 + i0_2_i1_2_i2_2_i3_2_fused * 4 + i3_3_init * 2 + i3_4_init) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 14, 14, 256], "float32"], ["TENSOR", [3, 3, 256, 512], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i6_0 in T.serial(256): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(1): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(16, (ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 45 // 3) | |
v2 = T.axis.spatial(16, i0_0_i1_0_i2_0_i3_0_fused // 4 * 2 + (ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 3) | |
v3 = T.axis.spatial(256, i6_0 + 0) | |
T.where(ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2 < 45) | |
T.reads(placeholder_2[v0, v1 - 1, v2 - 1, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(1 <= v1 and v1 < 15 and 1 <= v2 and v2 < 15, placeholder_2[v0, v1 - 1, v2 - 1, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(9): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(3, (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) // 384) | |
v1 = T.axis.spatial(3, (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 384 // 128) | |
v2 = T.axis.spatial(256, i6_0) | |
v3 = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused % 4 * 128 + (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 128) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 1, 1, 1, 2, 3, 3, 1, 1, 7, 1, 2): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(7, i1_4) | |
xx = T.axis.spatial(7, i0_0_i1_0_i2_0_i3_0_fused // 4) | |
ff = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused % 4 * 128 + i0_2_i1_2_i2_2_i3_2_fused * 4 + i3_3 * 2 + i3_4) | |
ry, rx, rc = T.axis.remap("RRR", [i4_2, i5_2, i6_0]) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 14, 14, 256], "float32"], ["TENSOR", [3, 3, 256, 512], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 7, 1, 4): | |
with T.block("conv2d_nhwc_local"): | |
v0, v1 = T.axis.remap("SS", [ax0, ax1]) | |
v2 = T.axis.spatial(7, i0_0_i1_0_i2_0_i3_0_fused // 4 + ax2) | |
v3 = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused % 4 * 128 + i0_2_i1_2_i2_2_i3_2_fused * 4 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 7]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[7, 1, 1, 1, 1]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[4, 1, 32, 2, 2]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[1, 1, 3]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[1, 1, 3]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[256, 1, 1]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=0) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
sch.enter_postproc() | |
l101, l102, l103, l104, l105, l106, l107 = sch.get_loops(block=b74) | |
l108, l109, l110 = sch.split(loop=l107, factors=[None, 32, 4]) | |
sch.vectorize(loop=l110) | |
sch.bind(loop=l109, thread_axis="threadIdx.x") | |
l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b87) | |
l118, l119, l120 = sch.split(loop=l117, factors=[None, 32, 4]) | |
sch.vectorize(loop=l120) | |
sch.bind(loop=l119, thread_axis="threadIdx.x") | |
b121 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit") | |
b122, b123, b124, b125 = sch.get_child_blocks(b121) | |
l126, l127, l128, l129, l130, l131, l132, l133, l134 = sch.get_loops(block=b122) | |
l135, l136, l137, l138, l139, l140, l141, l142, l143 = sch.get_loops(block=b123) | |
l144, l145, l146, l147, l148, l149, l150, l151, l152, l153, l154, l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b124) | |
l164, l165, l166, l167, l168, l169, l170 = sch.get_loops(block=b125) | |
b171 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
l172, l173, l174, l175, l176, l177, l178, l179, l180, l181, l182, l183, l184, l185, l186, l187, l188, l189, l190, l191 = sch.get_loops(block=b171) | |
b192 = sch.decompose_reduction(block=b171, loop=l177) | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #17: GFLOPs: 139.7718. Time: 0.8275 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #18: GFLOPs: 78.0116. Time: 1.4825 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #19: GFLOPs: 239.4988. Time: 0.4829 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #20: GFLOPs: 96.5618. Time: 1.1977 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #21: GFLOPs: 482.9486. Time: 0.2395 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #22: GFLOPs: 80.5699. Time: 1.4355 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #23: GFLOPs: 17.0295. Time: 6.7915 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #24: GFLOPs: 815.3738. Time: 0.1418 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #25: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1, 1, 512), "float32"], placeholder_1: T.Buffer[(3, 3, 256, 512), "float32"], placeholder_2: T.Buffer[(1, 14, 14, 256), "float32"], T_relu: T.Buffer[(1, 7, 7, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 7, 7, 512], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 16, 16, 256], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([3, 3, 256, 512], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(8, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(2, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(32, thread="threadIdx.x"): | |
for i1_4_init, i2_4_init in T.grid(7, 7): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy, xx = T.axis.remap("SS", [i1_4_init, i2_4_init]) | |
ff = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused * 64 + i0_1_i1_1_i2_1_i3_1_fused * 32 + i0_2_i1_2_i2_2_i3_2_fused) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 14, 14, 256], "float32"], ["TENSOR", [3, 3, 256, 512], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i4_0, i5_0, i6_0 in T.grid(3, 1, 128): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(7): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(16, i4_0 + ((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2) % 390 // 30) | |
v2 = T.axis.spatial(16, ((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2) % 30 // 2) | |
v3 = T.axis.spatial(256, i6_0 * 2 + ((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2) % 2) | |
T.where((ax0_ax1_ax2_ax3_fused_0 * 32 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2 < 390) | |
T.reads(placeholder_2[v0, v1 - 1, v2 - 1, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(1 <= v1 and v1 < 15 and 1 <= v2 and v2 < 15, placeholder_2[v0, v1 - 1, v2 - 1, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(3): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(3, i4_0) | |
v1 = T.axis.spatial(3, (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) // 128) | |
v2 = T.axis.spatial(256, i6_0 * 2 + (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 128 // 64) | |
v3 = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused * 64 + (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 64) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 7, 7, 1): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy, xx = T.axis.remap("SS", [i1_4, i2_4]) | |
ff = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused * 64 + i0_1_i1_1_i2_1_i3_1_fused * 32 + i0_2_i1_2_i2_2_i3_2_fused) | |
ry, rx = T.axis.remap("RR", [i4_0, i5_2]) | |
rc = T.axis.reduce(256, i6_0 * 2 + i6_2) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 14, 14, 256], "float32"], ["TENSOR", [3, 3, 256, 512], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 7, 7, 1): | |
with T.block("conv2d_nhwc_local"): | |
v0, v1, v2 = T.axis.remap("SSS", [ax0, ax1, ax2]) | |
v3 = T.axis.spatial(512, i0_0_i1_0_i2_0_i3_0_fused * 64 + i0_1_i1_1_i2_1_i3_1_fused * 32 + i0_2_i1_2_i2_2_i3_2_fused + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
b0 = sch.get_block(name="pad_temp", func_name="main") | |
b1 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
b2 = sch.get_block(name="T_add", func_name="main") | |
b3 = sch.get_block(name="T_relu", func_name="main") | |
b4 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l5, l6, l7, l8, l9, l10, l11 = sch.get_loops(block=b1) | |
v12, v13, v14, v15, v16 = sch.sample_perfect_tile(loop=l5, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l17, l18, l19, l20, l21 = sch.split(loop=l5, factors=[v12, v13, v14, v15, v16]) | |
v22, v23, v24, v25, v26 = sch.sample_perfect_tile(loop=l6, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 7]) | |
l27, l28, l29, l30, l31 = sch.split(loop=l6, factors=[v22, v23, v24, v25, v26]) | |
v32, v33, v34, v35, v36 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 7]) | |
l37, l38, l39, l40, l41 = sch.split(loop=l7, factors=[v32, v33, v34, v35, v36]) | |
v42, v43, v44, v45, v46 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64, decision=[8, 2, 32, 1, 1]) | |
l47, l48, l49, l50, l51 = sch.split(loop=l8, factors=[v42, v43, v44, v45, v46]) | |
v52, v53, v54 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64, decision=[3, 1, 1]) | |
l55, l56, l57 = sch.split(loop=l9, factors=[v52, v53, v54]) | |
v58, v59, v60 = sch.sample_perfect_tile(loop=l10, n=3, max_innermost_factor=64, decision=[1, 1, 3]) | |
l61, l62, l63 = sch.split(loop=l10, factors=[v58, v59, v60]) | |
v64, v65, v66 = sch.sample_perfect_tile(loop=l11, n=3, max_innermost_factor=64, decision=[128, 1, 2]) | |
l67, l68, l69 = sch.split(loop=l11, factors=[v64, v65, v66]) | |
sch.reorder(l17, l27, l37, l47, l18, l28, l38, l48, l19, l29, l39, l49, l55, l61, l67, l56, l62, l68, l20, l30, l40, l50, l57, l63, l69, l21, l31, l41, l51) | |
l70 = sch.fuse(l17, l27, l37, l47) | |
sch.bind(loop=l70, thread_axis="blockIdx.x") | |
l71 = sch.fuse(l18, l28, l38, l48) | |
sch.bind(loop=l71, thread_axis="vthread.x") | |
l72 = sch.fuse(l19, l29, l39, l49) | |
sch.bind(loop=l72, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b73 = sch.cache_write(block=b1, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b73, loop=l72, preserve_unit_loops=True) | |
b74 = sch.cache_read(block=b1, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b74, loop=l67, preserve_unit_loops=True) | |
l75, l76, l77, l78, l79, l80, l81, l82, l83, l84 = sch.get_loops(block=b74) | |
l85 = sch.fuse(l81, l82, l83, l84) | |
v86 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b74, ann_key="meta_schedule.cooperative_fetch", ann_val=v86) | |
b87 = sch.cache_read(block=b1, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b87, loop=l67, preserve_unit_loops=True) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b87) | |
l98 = sch.fuse(l94, l95, l96, l97) | |
v99 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b87, ann_key="meta_schedule.cooperative_fetch", ann_val=v99) | |
sch.reverse_compute_inline(block=b3) | |
sch.reverse_compute_inline(block=b2) | |
sch.compute_inline(block=b0) | |
v100 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=3) | |
sch.annotate(block_or_loop=b4, ann_key="meta_schedule.unroll_explicit", ann_val=v100) | |
sch.enter_postproc() | |
l101, l102, l103, l104, l105, l106, l107 = sch.get_loops(block=b74) | |
l108, l109, l110 = sch.split(loop=l107, factors=[None, 32, 2]) | |
sch.vectorize(loop=l110) | |
sch.bind(loop=l109, thread_axis="threadIdx.x") | |
l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b87) | |
l118, l119, l120 = sch.split(loop=l117, factors=[None, 32, 4]) | |
sch.vectorize(loop=l120) | |
sch.bind(loop=l119, thread_axis="threadIdx.x") | |
b121 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit") | |
b122, b123, b124, b125 = sch.get_child_blocks(b121) | |
l126, l127, l128, l129, l130, l131, l132, l133, l134 = sch.get_loops(block=b122) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l126, ann_key="pragma_unroll_explicit", ann_val=1) | |
l135, l136, l137, l138, l139, l140, l141, l142, l143 = sch.get_loops(block=b123) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l135, ann_key="pragma_unroll_explicit", ann_val=1) | |
l144, l145, l146, l147, l148, l149, l150, l151, l152, l153, l154, l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b124) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l144, ann_key="pragma_unroll_explicit", ann_val=1) | |
l164, l165, l166, l167, l168, l169, l170 = sch.get_loops(block=b125) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l164, ann_key="pragma_unroll_explicit", ann_val=1) | |
b171 = sch.get_block(name="conv2d_nhwc", func_name="main") | |
l172, l173, l174, l175, l176, l177, l178, l179, l180, l181, l182, l183, l184, l185, l186, l187, l188, l189, l190, l191 = sch.get_loops(block=b171) | |
b192 = sch.decompose_reduction(block=b171, loop=l175) | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #26: GFLOPs: 36.8893. Time: 3.1352 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #27: GFLOPs: 70.4352. Time: 1.6420 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #28: GFLOPs: 1314.1522. Time: 0.0880 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #29: GFLOPs: 15.7267. Time: 7.3541 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #30: GFLOPs: 46.4368. Time: 2.4906 ms. Best GFLOPs: 2453.0249 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3"] Trial #31: GFLOPs: 43.9264. Time: 2.6329 ms. Best GFLOPs: 2453.0249 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.053054 a-peak@32: 0.815011 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.614588 tr-a-peak@32: 0.667930 tr-rmse: 0.270702 tr-rmse: 0.270702 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.073371 tr-a-peak@32: 0.999981 tr-rmse: 0.116373 tr-rmse: 0.116373 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.073251 tr-a-peak@32: 0.999981 tr-rmse: 0.116304 tr-rmse: 0.116304 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 75: tr-p-rmse: 0.073251 tr-a-peak@32: 0.999981 tr-rmse: 0.116304 tr-rmse: 0.116304 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [33] tr-p-rmse:0.07325 tr-a-peak@32:0.99998 tr-rmse:0.11630 tr-rmse:0.11630 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #13: "vm_mod_fused_nn_conv2d_add_nn_relu_3" | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #13 has finished. Remaining task(s): 6 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #0: GFLOPs: 240.4378. Time: 0.0535 ms. Best GFLOPs: 240.4378 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #1: GFLOPs: 32.0247. Time: 0.4019 ms. Best GFLOPs: 240.4378 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #2: GFLOPs: 127.9255. Time: 0.1006 ms. Best GFLOPs: 240.4378 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #3: GFLOPs: 170.9018. Time: 0.0753 ms. Best GFLOPs: 240.4378 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #4: GFLOPs: 36.5041. Time: 0.3526 ms. Best GFLOPs: 240.4378 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #5: GFLOPs: 361.7411. Time: 0.0356 ms. Best GFLOPs: 361.7411 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #6: GFLOPs: 767.8493. Time: 0.0168 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #7: GFLOPs: 119.1770. Time: 0.1080 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #8: GFLOPs: 31.0390. Time: 0.4146 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #9: GFLOPs: 419.7186. Time: 0.0307 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #10: GFLOPs: 147.9207. Time: 0.0870 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #11: GFLOPs: 6.5236. Time: 1.9729 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #12: GFLOPs: 249.3347. Time: 0.0516 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #13: GFLOPs: 13.7646. Time: 0.9350 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #14: GFLOPs: 17.0636. Time: 0.7542 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #15: GFLOPs: 251.2138. Time: 0.0512 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #16: GFLOPs: 216.0762. Time: 0.0596 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #17: GFLOPs: 579.1628. Time: 0.0222 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #18: GFLOPs: 356.3166. Time: 0.0361 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #19: GFLOPs: 194.3035. Time: 0.0662 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #20: GFLOPs: 80.4331. Time: 0.1600 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #21: GFLOPs: 26.3781. Time: 0.4879 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #22: GFLOPs: 22.0600. Time: 0.5834 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #23: GFLOPs: 254.5465. Time: 0.0506 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #24: GFLOPs: 19.4952. Time: 0.6602 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #25: GFLOPs: 122.9990. Time: 0.1046 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #26: GFLOPs: 8.3544. Time: 1.5405 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #27: GFLOPs: 164.8650. Time: 0.0781 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #28: GFLOPs: 20.5769. Time: 0.6255 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #29: GFLOPs: 732.4067. Time: 0.0176 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #30: GFLOPs: 711.5856. Time: 0.0181 ms. Best GFLOPs: 767.8493 | |
[14:41:44] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #14: "vm_mod_fused_nn_conv2d_add_2"] Trial #31: GFLOPs: 325.6769. Time: 0.0395 ms. Best GFLOPs: 767.8493 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.075838 a-peak@32: 0.998558 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.492410 tr-a-peak@32: 0.289076 tr-rmse: 0.259580 tr-rmse: 0.259580 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.065746 tr-a-peak@32: 0.995812 tr-rmse: 0.118815 tr-rmse: 0.118815 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.065784 tr-a-peak@32: 0.995812 tr-rmse: 0.118787 tr-rmse: 0.118787 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [21] tr-p-rmse:0.06554 tr-a-peak@32:0.99581 tr-rmse:0.11897 tr-rmse:0.11897 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #14: "vm_mod_fused_nn_conv2d_add_2" | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #14 has finished. Remaining task(s): 5 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #0: GFLOPs: 886.2563. Time: 0.1604 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #1: GFLOPs: 8.4302. Time: 16.8628 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #2: GFLOPs: 420.7827. Time: 0.3378 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #3: GFLOPs: 71.6908. Time: 1.9829 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #4: GFLOPs: 86.4290. Time: 1.6448 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #5: GFLOPs: 90.6875. Time: 1.5676 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #6: GFLOPs: 32.1024. Time: 4.4282 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #7: GFLOPs: 380.1925. Time: 0.3739 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #8: GFLOPs: 134.4144. Time: 1.0576 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #9: GFLOPs: 5.6767. Time: 25.0421 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #10: GFLOPs: 222.9267. Time: 0.6377 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #11: GFLOPs: 76.3513. Time: 1.8619 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #12: GFLOPs: 200.8334. Time: 0.7078 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #13: GFLOPs: 25.2808. Time: 5.6231 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #14: GFLOPs: 121.8328. Time: 1.1668 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #15: GFLOPs: 22.7065. Time: 6.2606 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #16: GFLOPs: 344.0170. Time: 0.4132 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #17: GFLOPs: 60.8067. Time: 2.3379 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #18: GFLOPs: 6.5152. Time: 21.8194 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #19: GFLOPs: 40.4329. Time: 3.5159 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #20: GFLOPs: 59.5360. Time: 2.3878 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #21: GFLOPs: 161.3625. Time: 0.8810 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #22: GFLOPs: 62.0761. Time: 2.2901 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #23: GFLOPs: 103.8801. Time: 1.3685 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #24: GFLOPs: 201.7295. Time: 0.7047 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #25: GFLOPs: 25.7671. Time: 5.5170 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #26: GFLOPs: 611.2298. Time: 0.2326 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #27: GFLOPs: 30.1336. Time: 4.7176 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #28: GFLOPs: 111.1923. Time: 1.2785 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #29: GFLOPs: 29.7951. Time: 4.7712 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #30: GFLOPs: 129.6729. Time: 1.0963 ms. Best GFLOPs: 886.2563 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3"] Trial #31: GFLOPs: 135.4140. Time: 1.0498 ms. Best GFLOPs: 886.2563 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.654823 a-peak@32: 0.714204 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.721405 tr-a-peak@32: 0.285969 tr-rmse: 0.256993 tr-rmse: 0.256993 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.083865 tr-a-peak@32: 0.999983 tr-rmse: 0.114271 tr-rmse: 0.114271 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.083868 tr-a-peak@32: 0.999983 tr-rmse: 0.114205 tr-rmse: 0.114205 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 75: tr-p-rmse: 0.083868 tr-a-peak@32: 0.999983 tr-rmse: 0.114205 tr-rmse: 0.114205 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [25] tr-p-rmse:0.08386 tr-a-peak@32:0.99998 tr-rmse:0.11427 tr-rmse:0.11427 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #15: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_3" | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #15 has finished. Remaining task(s): 4 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #0: GFLOPs: 254.3890. Time: 0.5587 ms. Best GFLOPs: 254.3890 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #1: GFLOPs: 33.5555. Time: 4.2357 ms. Best GFLOPs: 254.3890 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #2: GFLOPs: 162.0835. Time: 0.8769 ms. Best GFLOPs: 254.3890 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #3: GFLOPs: 76.8640. Time: 1.8491 ms. Best GFLOPs: 254.3890 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #4: GFLOPs: 19.0624. Time: 7.4562 ms. Best GFLOPs: 254.3890 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #5: GFLOPs: 169.5174. Time: 0.8385 ms. Best GFLOPs: 254.3890 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #6: GFLOPs: 42.0480. Time: 3.3802 ms. Best GFLOPs: 254.3890 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #7: GFLOPs: 105.2331. Time: 1.3506 ms. Best GFLOPs: 254.3890 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #8: GFLOPs: 84.8939. Time: 1.6742 ms. Best GFLOPs: 254.3890 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #9: GFLOPs: 88.7395. Time: 1.6017 ms. Best GFLOPs: 254.3890 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #10: GFLOPs: 444.4138. Time: 0.3198 ms. Best GFLOPs: 444.4138 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #11: GFLOPs: 98.1610. Time: 1.4479 ms. Best GFLOPs: 444.4138 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #12: GFLOPs: 399.4625. Time: 0.3558 ms. Best GFLOPs: 444.4138 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #13: GFLOPs: 694.1051. Time: 0.2048 ms. Best GFLOPs: 694.1051 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #14: GFLOPs: 456.2132. Time: 0.3115 ms. Best GFLOPs: 694.1051 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #15: GFLOPs: 17.7858. Time: 7.9913 ms. Best GFLOPs: 694.1051 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #16: GFLOPs: 43.9105. Time: 3.2369 ms. Best GFLOPs: 694.1051 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #17: GFLOPs: 1118.0260. Time: 0.1271 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #18: GFLOPs: 165.9637. Time: 0.8564 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #19: GFLOPs: 17.5630. Time: 8.0927 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #20: GFLOPs: 734.3318. Time: 0.1936 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #21: GFLOPs: 123.0688. Time: 1.1549 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #22: GFLOPs: 21.8863. Time: 6.4941 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #23: GFLOPs: 45.3236. Time: 3.1359 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #24: GFLOPs: 28.4894. Time: 4.9889 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #25: GFLOPs: 182.7606. Time: 0.7777 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #26: GFLOPs: 31.2919. Time: 4.5421 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #27: GFLOPs: 21.3581. Time: 6.6547 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #28: GFLOPs: 225.1360. Time: 0.6313 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #29: GFLOPs: 534.5363. Time: 0.2659 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #30: GFLOPs: 266.4977. Time: 0.5333 ms. Best GFLOPs: 1118.0260 | |
[14:41:45] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3"] Trial #31: GFLOPs: 247.2026. Time: 0.5750 ms. Best GFLOPs: 1118.0260 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.175225 a-peak@32: 0.723258 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.734785 tr-a-peak@32: 0.313960 tr-rmse: 0.257027 tr-rmse: 0.257027 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.073689 tr-a-peak@32: 0.999986 tr-rmse: 0.112820 tr-rmse: 0.112820 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.073638 tr-a-peak@32: 0.999986 tr-rmse: 0.112748 tr-rmse: 0.112748 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [23] tr-p-rmse:0.07361 tr-a-peak@32:0.99999 tr-rmse:0.11273 tr-rmse:0.11273 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #16: "vm_mod_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu_3" | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #16 has finished. Remaining task(s): 3 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #0: GFLOPs: 8.1984. Time: 0.0031 ms. Best GFLOPs: 8.1984 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #1: GFLOPs: 6.3809. Time: 0.0040 ms. Best GFLOPs: 8.1984 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #2: GFLOPs: 11.1733. Time: 0.0023 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #3: Error in building: LocalBuilder: An exception occurred | |
Traceback (most recent call last): | |
File "/home/zxybazh/tvm-tensorir/python/tvm/exec/popen_worker.py", line 87, in main | |
result = fn(*args, **kwargs) | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 155, in <lambda> | |
lambda x: _worker_func(*x), | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 213, in _worker_func | |
rt_mod: Module = f_build(mod, target, _deserialize_params(params)) | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 323, in tvm._ffi._cy3.core.PackedFuncBase.__call__ | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 257, in tvm._ffi._cy3.core.FuncCall | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 246, in tvm._ffi._cy3.core.FuncCall3 | |
File "tvm/_ffi/_cython/./base.pxi", line 163, in tvm._ffi._cy3.core.CALL | |
tvm._ffi.base.TVMError: Traceback (most recent call last): | |
3: TVMFuncCall | |
at /home/zxybazh/tvm-tensorir/src/runtime/c_runtime_api.cc:477 | |
2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/packed_func.h:1217 | |
1: Call | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/packed_func.h:1213 | |
0: operator() | |
at /home/zxybazh/tvm-tensorir/src/runtime/c_runtime_api.cc:534 | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 243, in default_build | |
return tvm_build(mod, target=target) | |
File "/home/zxybazh/tvm-tensorir/python/tvm/driver/build_module.py", line 278, in build | |
... | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:112 | |
7: tvm::tir::StmtMutator::VisitStmt_(tvm::tir::AttrStmtNode const*) | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:77 | |
6: tvm::tir::StmtMutator::VisitStmt(tvm::tir::Stmt const&) | |
at /home/zxybazh/tvm-tensorir/src/tir/ir/stmt_functor.cc:257 | |
5: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::VisitStmt(tvm::tir::Stmt const&) | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:243 | |
4: tvm::NodeFunctor<tvm::tir::Stmt (tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)>::operator()(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:81 | |
3: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::InitVTable()::{lambda(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)#14}::_FUN(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/node/functor.h:97 | |
2: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::InitVTable()::{lambda(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)#14}::operator()(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:124 | |
1: tvm::tir::ThreadAllreduceBuilder::VisitStmt_(tvm::tir::EvaluateNode const*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:124 | |
0: tvm::tir::ThreadAllreduceBuilder::MakeAllreduce(tvm::tir::CallNode const*) | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:89 | |
File "/home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc", line 440 | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:440 | |
TVMError: | |
--------------------------------------------------------------- | |
An error occurred during the execution of TVM. | |
For more information, please see: https://tvm.apache.org/docs/errors.html | |
--------------------------------------------------------------- | |
Check failed: (!load_remap_.count(buffers[i]->data.get())) is false: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 7, 7, 512), "float32"], tensor: T.Buffer[(1, 1, 1, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
tensor_shared = T.alloc_buffer([1, 1, 1, 512], dtype="float32", scope="shared") | |
for i0_i1_i2_i3_0_fused in T.thread_binding(64, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":1024, "pragma_unroll_explicit":1}): | |
for ax0, ax1, ax2, ax3, ax4_ax5_fused_0 in T.grid(1, 1, 1, 8, 7): | |
for ax4_ax5_fused_1 in T.thread_binding(8, thread="threadIdx.x"): | |
with T.block("tensor"): | |
ax0_1 = T.axis.spatial(1, 0) | |
ax1_1 = T.axis.spatial(1, 0) | |
ax2_1 = T.axis.spatial(1, 0) | |
ax3_1 = T.axis.spatial(512, i0_i1_i2_i3_0_fused * 8 + ax3) | |
rv0 = T.axis.reduce(7, (ax4_ax5_fused_0 * 8 + ax4_ax5_fused_1) // 7) | |
rv1 = T.axis.reduce(7, (ax4_ax5_fused_0 * 8 + ax4_ax5_fused_1) % 7) | |
T.where(ax4_ax5_fused_0 * 8 + ax4_ax5_fused_1 < 49) | |
T.reads(tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1], placeholder[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1]) | |
T.writes(tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1]) | |
with T.init(): | |
tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] = T.float32(0) | |
tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] = tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] + placeholder[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1] | |
for i3_1 in T.thread_binding(8, thread="threadIdx.x"): | |
with T.block("tensor_1"): | |
ax0 = T.axis.spatial(1, 0) | |
ax1 = T.axis.spatial(1, 0) | |
ax2 = T.axis.spatial(1, 0) | |
ax3 = T.axis.spatial(512, i0_i1_i2_i3_0_fused * 8 + i3_1) | |
T.reads(tensor_shared[ax0, ax1, ax2, ax3]) | |
T.writes(tensor[ax0, ax1, ax2, ax3]) | |
tensor[ax0, ax1, ax2, ax3] = tensor_shared[ax0, ax1, ax2, ax3] * T.float32(0.020408163265306121) | |
b0 = sch.get_block(name="tensor", func_name="main") | |
b1 = sch.get_block(name="root", func_name="main") | |
b2, = sch.get_consumers(block=b0) | |
l3, l4, l5, l6 = sch.get_loops(block=b2) | |
v7 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125], decision=1) | |
l8, l9 = sch.split(loop=l6, factors=[None, v7]) | |
sch.bind(loop=l9, thread_axis="threadIdx.x") | |
sch.compute_at(block=b0, loop=l8, preserve_unit_loops=True) | |
sch.set_scope(block=b0, buffer_index=0, storage_scope="shared") | |
l10, l11, l12, l13, l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0) | |
l20 = sch.fuse(l18, l19) | |
l21, l22 = sch.split(loop=l20, factors=[None, v7]) | |
sch.bind(loop=l22, thread_axis="threadIdx.x") | |
v23 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=4) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.unroll_explicit", ann_val=v23) | |
sch.enter_postproc() | |
b24 = sch.get_block(name="tensor", func_name="main") | |
l25, l26, l27, l28, l29, l30, l31, l32, l33, l34 = sch.get_loops(block=b24) | |
l35 = sch.fuse(l25, l26, l27, l28) | |
sch.bind(loop=l35, thread_axis="blockIdx.x") | |
b36 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b36, ann_key="meta_schedule.unroll_explicit") | |
b37, b38 = sch.get_child_blocks(b36) | |
l39, l40, l41, l42, l43, l44, l45 = sch.get_loops(block=b37) | |
sch.annotate(block_or_loop=l39, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l39, ann_key="pragma_unroll_explicit", ann_val=1) | |
l46, l47 = sch.get_loops(block=b38) | |
sch.annotate(block_or_loop=l46, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l46, ann_key="pragma_unroll_explicit", ann_val=1) | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #4: GFLOPs: 11.1731. Time: 0.0023 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #5: GFLOPs: 8.2004. Time: 0.0031 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #6: GFLOPs: 8.1967. Time: 0.0031 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #7: GFLOPs: 8.2164. Time: 0.0031 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #8: GFLOPs: 0.2856. Time: 0.0897 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #9: Error in building: LocalBuilder: An exception occurred | |
Traceback (most recent call last): | |
File "/home/zxybazh/tvm-tensorir/python/tvm/exec/popen_worker.py", line 87, in main | |
result = fn(*args, **kwargs) | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 155, in <lambda> | |
lambda x: _worker_func(*x), | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 213, in _worker_func | |
rt_mod: Module = f_build(mod, target, _deserialize_params(params)) | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 323, in tvm._ffi._cy3.core.PackedFuncBase.__call__ | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 257, in tvm._ffi._cy3.core.FuncCall | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 246, in tvm._ffi._cy3.core.FuncCall3 | |
File "tvm/_ffi/_cython/./base.pxi", line 163, in tvm._ffi._cy3.core.CALL | |
tvm._ffi.base.TVMError: Traceback (most recent call last): | |
3: TVMFuncCall | |
at /home/zxybazh/tvm-tensorir/src/runtime/c_runtime_api.cc:477 | |
2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/packed_func.h:1217 | |
1: Call | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/packed_func.h:1213 | |
0: operator() | |
at /home/zxybazh/tvm-tensorir/src/runtime/c_runtime_api.cc:534 | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 243, in default_build | |
return tvm_build(mod, target=target) | |
File "/home/zxybazh/tvm-tensorir/python/tvm/driver/build_module.py", line 278, in build | |
... | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:112 | |
7: tvm::tir::StmtMutator::VisitStmt_(tvm::tir::AttrStmtNode const*) | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:77 | |
6: tvm::tir::StmtMutator::VisitStmt(tvm::tir::Stmt const&) | |
at /home/zxybazh/tvm-tensorir/src/tir/ir/stmt_functor.cc:257 | |
5: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::VisitStmt(tvm::tir::Stmt const&) | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:243 | |
4: tvm::NodeFunctor<tvm::tir::Stmt (tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)>::operator()(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:81 | |
3: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::InitVTable()::{lambda(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)#14}::_FUN(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/node/functor.h:97 | |
2: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::InitVTable()::{lambda(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)#14}::operator()(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:124 | |
1: tvm::tir::ThreadAllreduceBuilder::VisitStmt_(tvm::tir::EvaluateNode const*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:124 | |
0: tvm::tir::ThreadAllreduceBuilder::MakeAllreduce(tvm::tir::CallNode const*) | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:89 | |
File "/home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc", line 479 | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:479 | |
TVMError: | |
--------------------------------------------------------------- | |
An error occurred during the execution of TVM. | |
For more information, please see: https://tvm.apache.org/docs/errors.html | |
--------------------------------------------------------------- | |
Check failed: (!load_remap_.count(buffers[idx]->data.get())) is false: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 7, 7, 512), "float32"], tensor: T.Buffer[(1, 1, 1, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
tensor_shared = T.alloc_buffer([1, 1, 1, 512], dtype="float32", scope="shared") | |
for i0_i1_i2_i3_0_fused in T.thread_binding(2, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":1024, "pragma_unroll_explicit":1}): | |
for ax0, ax1, ax2, ax3, ax4_ax5_fused_0 in T.grid(1, 1, 1, 256, 1): | |
for ax4_ax5_fused_1 in T.thread_binding(256, thread="threadIdx.x"): | |
with T.block("tensor"): | |
ax0_1 = T.axis.spatial(1, 0) | |
ax1_1 = T.axis.spatial(1, 0) | |
ax2_1 = T.axis.spatial(1, 0) | |
ax3_1 = T.axis.spatial(512, i0_i1_i2_i3_0_fused * 256 + ax3) | |
rv0 = T.axis.reduce(7, ax4_ax5_fused_1 // 7) | |
rv1 = T.axis.reduce(7, ax4_ax5_fused_1 % 7) | |
T.where(ax4_ax5_fused_1 < 49) | |
T.reads(tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1], placeholder[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1]) | |
T.writes(tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1]) | |
with T.init(): | |
tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] = T.float32(0) | |
tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] = tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] + placeholder[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1] | |
for i3_1 in T.thread_binding(256, thread="threadIdx.x"): | |
with T.block("tensor_1"): | |
ax0 = T.axis.spatial(1, 0) | |
ax1 = T.axis.spatial(1, 0) | |
ax2 = T.axis.spatial(1, 0) | |
ax3 = T.axis.spatial(512, i0_i1_i2_i3_0_fused * 256 + i3_1) | |
T.reads(tensor_shared[ax0, ax1, ax2, ax3]) | |
T.writes(tensor[ax0, ax1, ax2, ax3]) | |
tensor[ax0, ax1, ax2, ax3] = tensor_shared[ax0, ax1, ax2, ax3] * T.float32(0.020408163265306121) | |
b0 = sch.get_block(name="tensor", func_name="main") | |
b1 = sch.get_block(name="root", func_name="main") | |
b2, = sch.get_consumers(block=b0) | |
l3, l4, l5, l6 = sch.get_loops(block=b2) | |
v7 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125], decision=6) | |
l8, l9 = sch.split(loop=l6, factors=[None, v7]) | |
sch.bind(loop=l9, thread_axis="threadIdx.x") | |
sch.compute_at(block=b0, loop=l8, preserve_unit_loops=True) | |
sch.set_scope(block=b0, buffer_index=0, storage_scope="shared") | |
l10, l11, l12, l13, l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0) | |
l20 = sch.fuse(l18, l19) | |
l21, l22 = sch.split(loop=l20, factors=[None, v7]) | |
sch.bind(loop=l22, thread_axis="threadIdx.x") | |
v23 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=4) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.unroll_explicit", ann_val=v23) | |
sch.enter_postproc() | |
b24 = sch.get_block(name="tensor", func_name="main") | |
l25, l26, l27, l28, l29, l30, l31, l32, l33, l34 = sch.get_loops(block=b24) | |
l35 = sch.fuse(l25, l26, l27, l28) | |
sch.bind(loop=l35, thread_axis="blockIdx.x") | |
b36 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b36, ann_key="meta_schedule.unroll_explicit") | |
b37, b38 = sch.get_child_blocks(b36) | |
l39, l40, l41, l42, l43, l44, l45 = sch.get_loops(block=b37) | |
sch.annotate(block_or_loop=l39, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l39, ann_key="pragma_unroll_explicit", ann_val=1) | |
l46, l47 = sch.get_loops(block=b38) | |
sch.annotate(block_or_loop=l46, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l46, ann_key="pragma_unroll_explicit", ann_val=1) | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #10: GFLOPs: 0.1021. Time: 0.2506 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #11: Error in building: LocalBuilder: An exception occurred | |
Traceback (most recent call last): | |
File "/home/zxybazh/tvm-tensorir/python/tvm/exec/popen_worker.py", line 87, in main | |
result = fn(*args, **kwargs) | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 155, in <lambda> | |
lambda x: _worker_func(*x), | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 213, in _worker_func | |
rt_mod: Module = f_build(mod, target, _deserialize_params(params)) | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 323, in tvm._ffi._cy3.core.PackedFuncBase.__call__ | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 257, in tvm._ffi._cy3.core.FuncCall | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 246, in tvm._ffi._cy3.core.FuncCall3 | |
File "tvm/_ffi/_cython/./base.pxi", line 163, in tvm._ffi._cy3.core.CALL | |
tvm._ffi.base.TVMError: Traceback (most recent call last): | |
3: TVMFuncCall | |
at /home/zxybazh/tvm-tensorir/src/runtime/c_runtime_api.cc:477 | |
2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/packed_func.h:1217 | |
1: Call | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/packed_func.h:1213 | |
0: operator() | |
at /home/zxybazh/tvm-tensorir/src/runtime/c_runtime_api.cc:534 | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 243, in default_build | |
return tvm_build(mod, target=target) | |
File "/home/zxybazh/tvm-tensorir/python/tvm/driver/build_module.py", line 278, in build | |
... | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:112 | |
7: tvm::tir::StmtMutator::VisitStmt_(tvm::tir::AttrStmtNode const*) | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:77 | |
6: tvm::tir::StmtMutator::VisitStmt(tvm::tir::Stmt const&) | |
at /home/zxybazh/tvm-tensorir/src/tir/ir/stmt_functor.cc:257 | |
5: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::VisitStmt(tvm::tir::Stmt const&) | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:243 | |
4: tvm::NodeFunctor<tvm::tir::Stmt (tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)>::operator()(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:81 | |
3: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::InitVTable()::{lambda(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)#14}::_FUN(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/node/functor.h:97 | |
2: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::InitVTable()::{lambda(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)#14}::operator()(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:124 | |
1: tvm::tir::ThreadAllreduceBuilder::VisitStmt_(tvm::tir::EvaluateNode const*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:124 | |
0: tvm::tir::ThreadAllreduceBuilder::MakeAllreduce(tvm::tir::CallNode const*) | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:89 | |
File "/home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc", line 440 | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:440 | |
TVMError: | |
--------------------------------------------------------------- | |
An error occurred during the execution of TVM. | |
For more information, please see: https://tvm.apache.org/docs/errors.html | |
--------------------------------------------------------------- | |
Check failed: (!load_remap_.count(buffers[i]->data.get())) is false: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 7, 7, 512), "float32"], tensor: T.Buffer[(1, 1, 1, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
tensor_shared = T.alloc_buffer([1, 1, 1, 512], dtype="float32", scope="shared") | |
for i0_i1_i2_i3_0_fused in T.thread_binding(32, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":1024, "pragma_unroll_explicit":1}): | |
for ax0, ax1, ax2, ax3, ax4_ax5_fused_0 in T.grid(1, 1, 1, 16, 4): | |
for ax4_ax5_fused_1 in T.thread_binding(16, thread="threadIdx.x"): | |
with T.block("tensor"): | |
ax0_1 = T.axis.spatial(1, 0) | |
ax1_1 = T.axis.spatial(1, 0) | |
ax2_1 = T.axis.spatial(1, 0) | |
ax3_1 = T.axis.spatial(512, i0_i1_i2_i3_0_fused * 16 + ax3) | |
rv0 = T.axis.reduce(7, (ax4_ax5_fused_0 * 16 + ax4_ax5_fused_1) // 7) | |
rv1 = T.axis.reduce(7, (ax4_ax5_fused_0 * 16 + ax4_ax5_fused_1) % 7) | |
T.where(ax4_ax5_fused_0 * 16 + ax4_ax5_fused_1 < 49) | |
T.reads(tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1], placeholder[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1]) | |
T.writes(tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1]) | |
with T.init(): | |
tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] = T.float32(0) | |
tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] = tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] + placeholder[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1] | |
for i3_1 in T.thread_binding(16, thread="threadIdx.x"): | |
with T.block("tensor_1"): | |
ax0 = T.axis.spatial(1, 0) | |
ax1 = T.axis.spatial(1, 0) | |
ax2 = T.axis.spatial(1, 0) | |
ax3 = T.axis.spatial(512, i0_i1_i2_i3_0_fused * 16 + i3_1) | |
T.reads(tensor_shared[ax0, ax1, ax2, ax3]) | |
T.writes(tensor[ax0, ax1, ax2, ax3]) | |
tensor[ax0, ax1, ax2, ax3] = tensor_shared[ax0, ax1, ax2, ax3] * T.float32(0.020408163265306121) | |
b0 = sch.get_block(name="tensor", func_name="main") | |
b1 = sch.get_block(name="root", func_name="main") | |
b2, = sch.get_consumers(block=b0) | |
l3, l4, l5, l6 = sch.get_loops(block=b2) | |
v7 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125], decision=2) | |
l8, l9 = sch.split(loop=l6, factors=[None, v7]) | |
sch.bind(loop=l9, thread_axis="threadIdx.x") | |
sch.compute_at(block=b0, loop=l8, preserve_unit_loops=True) | |
sch.set_scope(block=b0, buffer_index=0, storage_scope="shared") | |
l10, l11, l12, l13, l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0) | |
l20 = sch.fuse(l18, l19) | |
l21, l22 = sch.split(loop=l20, factors=[None, v7]) | |
sch.bind(loop=l22, thread_axis="threadIdx.x") | |
v23 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=4) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.unroll_explicit", ann_val=v23) | |
sch.enter_postproc() | |
b24 = sch.get_block(name="tensor", func_name="main") | |
l25, l26, l27, l28, l29, l30, l31, l32, l33, l34 = sch.get_loops(block=b24) | |
l35 = sch.fuse(l25, l26, l27, l28) | |
sch.bind(loop=l35, thread_axis="blockIdx.x") | |
b36 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b36, ann_key="meta_schedule.unroll_explicit") | |
b37, b38 = sch.get_child_blocks(b36) | |
l39, l40, l41, l42, l43, l44, l45 = sch.get_loops(block=b37) | |
sch.annotate(block_or_loop=l39, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l39, ann_key="pragma_unroll_explicit", ann_val=1) | |
l46, l47 = sch.get_loops(block=b38) | |
sch.annotate(block_or_loop=l46, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l46, ann_key="pragma_unroll_explicit", ann_val=1) | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #12: GFLOPs: 1.6608. Time: 0.0154 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #13: GFLOPs: 8.0880. Time: 0.0032 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #14: GFLOPs: 8.2177. Time: 0.0031 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #15: GFLOPs: 6.3811. Time: 0.0040 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #16: GFLOPs: 0.2855. Time: 0.0897 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #17: GFLOPs: 8.1926. Time: 0.0031 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #18: GFLOPs: 0.1021. Time: 0.2506 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #19: GFLOPs: 8.1932. Time: 0.0031 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #20: GFLOPs: 8.2308. Time: 0.0031 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #21: GFLOPs: 8.2004. Time: 0.0031 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #22: Error in building: LocalBuilder: An exception occurred | |
Traceback (most recent call last): | |
File "/home/zxybazh/tvm-tensorir/python/tvm/exec/popen_worker.py", line 87, in main | |
result = fn(*args, **kwargs) | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 155, in <lambda> | |
lambda x: _worker_func(*x), | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 213, in _worker_func | |
rt_mod: Module = f_build(mod, target, _deserialize_params(params)) | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 323, in tvm._ffi._cy3.core.PackedFuncBase.__call__ | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 257, in tvm._ffi._cy3.core.FuncCall | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 246, in tvm._ffi._cy3.core.FuncCall3 | |
File "tvm/_ffi/_cython/./base.pxi", line 163, in tvm._ffi._cy3.core.CALL | |
tvm._ffi.base.TVMError: Traceback (most recent call last): | |
3: TVMFuncCall | |
at /home/zxybazh/tvm-tensorir/src/runtime/c_runtime_api.cc:477 | |
2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/packed_func.h:1217 | |
1: Call | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/packed_func.h:1213 | |
0: operator() | |
at /home/zxybazh/tvm-tensorir/src/runtime/c_runtime_api.cc:534 | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 243, in default_build | |
return tvm_build(mod, target=target) | |
File "/home/zxybazh/tvm-tensorir/python/tvm/driver/build_module.py", line 278, in build | |
... | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:112 | |
7: tvm::tir::StmtMutator::VisitStmt_(tvm::tir::AttrStmtNode const*) | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:77 | |
6: tvm::tir::StmtMutator::VisitStmt(tvm::tir::Stmt const&) | |
at /home/zxybazh/tvm-tensorir/src/tir/ir/stmt_functor.cc:257 | |
5: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::VisitStmt(tvm::tir::Stmt const&) | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:243 | |
4: tvm::NodeFunctor<tvm::tir::Stmt (tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)>::operator()(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:81 | |
3: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::InitVTable()::{lambda(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)#14}::_FUN(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/node/functor.h:97 | |
2: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::InitVTable()::{lambda(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)#14}::operator()(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:124 | |
1: tvm::tir::ThreadAllreduceBuilder::VisitStmt_(tvm::tir::EvaluateNode const*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:124 | |
0: tvm::tir::ThreadAllreduceBuilder::MakeAllreduce(tvm::tir::CallNode const*) | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:89 | |
File "/home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc", line 479 | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:479 | |
TVMError: | |
--------------------------------------------------------------- | |
An error occurred during the execution of TVM. | |
For more information, please see: https://tvm.apache.org/docs/errors.html | |
--------------------------------------------------------------- | |
Check failed: (!load_remap_.count(buffers[idx]->data.get())) is false: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 7, 7, 512), "float32"], tensor: T.Buffer[(1, 1, 1, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
tensor_shared = T.alloc_buffer([1, 1, 1, 512], dtype="float32", scope="shared") | |
for i0_i1_i2_i3_0_fused in T.thread_binding(8, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":1024, "pragma_unroll_explicit":1}): | |
for ax0, ax1, ax2, ax3, ax4_ax5_fused_0 in T.grid(1, 1, 1, 64, 1): | |
for ax4_ax5_fused_1 in T.thread_binding(64, thread="threadIdx.x"): | |
with T.block("tensor"): | |
ax0_1 = T.axis.spatial(1, 0) | |
ax1_1 = T.axis.spatial(1, 0) | |
ax2_1 = T.axis.spatial(1, 0) | |
ax3_1 = T.axis.spatial(512, i0_i1_i2_i3_0_fused * 64 + ax3) | |
rv0 = T.axis.reduce(7, ax4_ax5_fused_1 // 7) | |
rv1 = T.axis.reduce(7, ax4_ax5_fused_1 % 7) | |
T.where(ax4_ax5_fused_1 < 49) | |
T.reads(tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1], placeholder[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1]) | |
T.writes(tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1]) | |
with T.init(): | |
tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] = T.float32(0) | |
tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] = tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] + placeholder[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1] | |
for i3_1 in T.thread_binding(64, thread="threadIdx.x"): | |
with T.block("tensor_1"): | |
ax0 = T.axis.spatial(1, 0) | |
ax1 = T.axis.spatial(1, 0) | |
ax2 = T.axis.spatial(1, 0) | |
ax3 = T.axis.spatial(512, i0_i1_i2_i3_0_fused * 64 + i3_1) | |
T.reads(tensor_shared[ax0, ax1, ax2, ax3]) | |
T.writes(tensor[ax0, ax1, ax2, ax3]) | |
tensor[ax0, ax1, ax2, ax3] = tensor_shared[ax0, ax1, ax2, ax3] * T.float32(0.020408163265306121) | |
b0 = sch.get_block(name="tensor", func_name="main") | |
b1 = sch.get_block(name="root", func_name="main") | |
b2, = sch.get_consumers(block=b0) | |
l3, l4, l5, l6 = sch.get_loops(block=b2) | |
v7 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125], decision=4) | |
l8, l9 = sch.split(loop=l6, factors=[None, v7]) | |
sch.bind(loop=l9, thread_axis="threadIdx.x") | |
sch.compute_at(block=b0, loop=l8, preserve_unit_loops=True) | |
sch.set_scope(block=b0, buffer_index=0, storage_scope="shared") | |
l10, l11, l12, l13, l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0) | |
l20 = sch.fuse(l18, l19) | |
l21, l22 = sch.split(loop=l20, factors=[None, v7]) | |
sch.bind(loop=l22, thread_axis="threadIdx.x") | |
v23 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=4) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.unroll_explicit", ann_val=v23) | |
sch.enter_postproc() | |
b24 = sch.get_block(name="tensor", func_name="main") | |
l25, l26, l27, l28, l29, l30, l31, l32, l33, l34 = sch.get_loops(block=b24) | |
l35 = sch.fuse(l25, l26, l27, l28) | |
sch.bind(loop=l35, thread_axis="blockIdx.x") | |
b36 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b36, ann_key="meta_schedule.unroll_explicit") | |
b37, b38 = sch.get_child_blocks(b36) | |
l39, l40, l41, l42, l43, l44, l45 = sch.get_loops(block=b37) | |
sch.annotate(block_or_loop=l39, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l39, ann_key="pragma_unroll_explicit", ann_val=1) | |
l46, l47 = sch.get_loops(block=b38) | |
sch.annotate(block_or_loop=l46, ann_key="pragma_auto_unroll_max_step", ann_val=1024) | |
sch.annotate(block_or_loop=l46, ann_key="pragma_unroll_explicit", ann_val=1) | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #23: GFLOPs: 0.7041. Time: 0.0364 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #24: GFLOPs: 8.1883. Time: 0.0031 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #25: GFLOPs: 11.1732. Time: 0.0023 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #26: GFLOPs: 8.2042. Time: 0.0031 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #27: Error in building: LocalBuilder: An exception occurred | |
Traceback (most recent call last): | |
File "/home/zxybazh/tvm-tensorir/python/tvm/exec/popen_worker.py", line 87, in main | |
result = fn(*args, **kwargs) | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 155, in <lambda> | |
lambda x: _worker_func(*x), | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 213, in _worker_func | |
rt_mod: Module = f_build(mod, target, _deserialize_params(params)) | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 323, in tvm._ffi._cy3.core.PackedFuncBase.__call__ | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 257, in tvm._ffi._cy3.core.FuncCall | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 246, in tvm._ffi._cy3.core.FuncCall3 | |
File "tvm/_ffi/_cython/./base.pxi", line 163, in tvm._ffi._cy3.core.CALL | |
tvm._ffi.base.TVMError: Traceback (most recent call last): | |
3: TVMFuncCall | |
at /home/zxybazh/tvm-tensorir/src/runtime/c_runtime_api.cc:477 | |
2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/packed_func.h:1217 | |
1: Call | |
at /home/zxybazh/tvm-tensorir/include/tvm/runtime/packed_func.h:1213 | |
0: operator() | |
at /home/zxybazh/tvm-tensorir/src/runtime/c_runtime_api.cc:534 | |
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback | |
File "/home/zxybazh/tvm-tensorir/python/tvm/meta_schedule/builder/local_builder.py", line 243, in default_build | |
return tvm_build(mod, target=target) | |
File "/home/zxybazh/tvm-tensorir/python/tvm/driver/build_module.py", line 278, in build | |
... | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:112 | |
7: tvm::tir::StmtMutator::VisitStmt_(tvm::tir::AttrStmtNode const*) | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:77 | |
6: tvm::tir::StmtMutator::VisitStmt(tvm::tir::Stmt const&) | |
at /home/zxybazh/tvm-tensorir/src/tir/ir/stmt_functor.cc:257 | |
5: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::VisitStmt(tvm::tir::Stmt const&) | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:243 | |
4: tvm::NodeFunctor<tvm::tir::Stmt (tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)>::operator()(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:81 | |
3: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::InitVTable()::{lambda(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)#14}::_FUN(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/node/functor.h:97 | |
2: tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>::InitVTable()::{lambda(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*)#14}::operator()(tvm::runtime::ObjectRef const&, tvm::tir::StmtFunctor<tvm::tir::Stmt (tvm::tir::Stmt const&)>*) const | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:124 | |
1: tvm::tir::ThreadAllreduceBuilder::VisitStmt_(tvm::tir::EvaluateNode const*) | |
at /home/zxybazh/tvm-tensorir/include/tvm/tir/stmt_functor.h:124 | |
0: tvm::tir::ThreadAllreduceBuilder::MakeAllreduce(tvm::tir::CallNode const*) | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:89 | |
File "/home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc", line 479 | |
at /home/zxybazh/tvm-tensorir/src/tir/transforms/lower_thread_allreduce.cc:479 | |
TVMError: | |
--------------------------------------------------------------- | |
An error occurred during the execution of TVM. | |
For more information, please see: https://tvm.apache.org/docs/errors.html | |
--------------------------------------------------------------- | |
Check failed: (!load_remap_.count(buffers[idx]->data.get())) is false: | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 7, 7, 512), "float32"], tensor: T.Buffer[(1, 1, 1, 512), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
tensor_shared = T.alloc_buffer([1, 1, 1, 512], dtype="float32", scope="shared") | |
for i0_i1_i2_i3_0_fused in T.thread_binding(4, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for ax0, ax1, ax2, ax3, ax4_ax5_fused_0 in T.grid(1, 1, 1, 128, 1): | |
for ax4_ax5_fused_1 in T.thread_binding(128, thread="threadIdx.x"): | |
with T.block("tensor"): | |
ax0_1 = T.axis.spatial(1, 0) | |
ax1_1 = T.axis.spatial(1, 0) | |
ax2_1 = T.axis.spatial(1, 0) | |
ax3_1 = T.axis.spatial(512, i0_i1_i2_i3_0_fused * 128 + ax3) | |
rv0 = T.axis.reduce(7, ax4_ax5_fused_1 // 7) | |
rv1 = T.axis.reduce(7, ax4_ax5_fused_1 % 7) | |
T.where(ax4_ax5_fused_1 < 49) | |
T.reads(tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1], placeholder[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1]) | |
T.writes(tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1]) | |
with T.init(): | |
tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] = T.float32(0) | |
tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] = tensor_shared[ax0_1, ax1_1, ax2_1, ax3_1] + placeholder[ax0_1, ax1_1 * 7 + rv0, ax2_1 * 7 + rv1, ax3_1] | |
for i3_1 in T.thread_binding(128, thread="threadIdx.x"): | |
with T.block("tensor_1"): | |
ax0 = T.axis.spatial(1, 0) | |
ax1 = T.axis.spatial(1, 0) | |
ax2 = T.axis.spatial(1, 0) | |
ax3 = T.axis.spatial(512, i0_i1_i2_i3_0_fused * 128 + i3_1) | |
T.reads(tensor_shared[ax0, ax1, ax2, ax3]) | |
T.writes(tensor[ax0, ax1, ax2, ax3]) | |
tensor[ax0, ax1, ax2, ax3] = tensor_shared[ax0, ax1, ax2, ax3] * T.float32(0.020408163265306121) | |
b0 = sch.get_block(name="tensor", func_name="main") | |
b1 = sch.get_block(name="root", func_name="main") | |
b2, = sch.get_consumers(block=b0) | |
l3, l4, l5, l6 = sch.get_loops(block=b2) | |
v7 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125], decision=5) | |
l8, l9 = sch.split(loop=l6, factors=[None, v7]) | |
sch.bind(loop=l9, thread_axis="threadIdx.x") | |
sch.compute_at(block=b0, loop=l8, preserve_unit_loops=True) | |
sch.set_scope(block=b0, buffer_index=0, storage_scope="shared") | |
l10, l11, l12, l13, l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0) | |
l20 = sch.fuse(l18, l19) | |
l21, l22 = sch.split(loop=l20, factors=[None, v7]) | |
sch.bind(loop=l22, thread_axis="threadIdx.x") | |
v23 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=3) | |
sch.annotate(block_or_loop=b1, ann_key="meta_schedule.unroll_explicit", ann_val=v23) | |
sch.enter_postproc() | |
b24 = sch.get_block(name="tensor", func_name="main") | |
l25, l26, l27, l28, l29, l30, l31, l32, l33, l34 = sch.get_loops(block=b24) | |
l35 = sch.fuse(l25, l26, l27, l28) | |
sch.bind(loop=l35, thread_axis="blockIdx.x") | |
b36 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b36, ann_key="meta_schedule.unroll_explicit") | |
b37, b38 = sch.get_child_blocks(b36) | |
l39, l40, l41, l42, l43, l44, l45 = sch.get_loops(block=b37) | |
sch.annotate(block_or_loop=l39, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l39, ann_key="pragma_unroll_explicit", ann_val=1) | |
l46, l47 = sch.get_loops(block=b38) | |
sch.annotate(block_or_loop=l46, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l46, ann_key="pragma_unroll_explicit", ann_val=1) | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #28: GFLOPs: 8.1921. Time: 0.0031 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #29: GFLOPs: 7.9581. Time: 0.0032 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #30: GFLOPs: 8.2135. Time: 0.0031 ms. Best GFLOPs: 11.1733 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d"] Trial #31: GFLOPs: 11.1733. Time: 0.0023 ms. Best GFLOPs: 11.1733 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 1.185886 a-peak@32: 1.000000 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.289579 tr-a-peak@32: 1.000000 tr-rmse: 0.266490 tr-rmse: 0.266490 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.076388 tr-a-peak@32: 1.000000 tr-rmse: 0.313033 tr-rmse: 0.313033 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.076389 tr-a-peak@32: 1.000000 tr-rmse: 0.313033 tr-rmse: 0.313033 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [17] tr-p-rmse:0.07618 tr-a-peak@32:1.00000 tr-rmse:0.31286 tr-rmse:0.31286 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #17: "vm_mod_fused_nn_adaptive_avg_pool2d" | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #17 has finished. Remaining task(s): 2 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #0: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #1: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #2: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #3: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #4: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #5: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #6: GFLOPs: 0.0000. Time: 0.0017 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #7: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #8: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #9: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #10: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #11: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #12: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #13: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #14: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #15: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #16: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #17: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #18: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #19: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #20: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #21: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #22: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #23: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #24: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #25: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #26: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #27: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #28: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #29: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #30: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #18: "vm_mod_fused_layout_transform_reshape_squeeze"] Trial #31: GFLOPs: 0.0000. Time: 0.0016 ms. Best GFLOPs: 0.0000 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 1.062308 a-peak@32: 1.000000 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.387184 tr-a-peak@32: 0.683282 tr-rmse: 0.260160 tr-rmse: 0.260160 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.053254 tr-a-peak@32: 1.000000 tr-rmse: 0.175234 tr-rmse: 0.175234 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.053229 tr-a-peak@32: 1.000000 tr-rmse: 0.175223 tr-rmse: 0.175223 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [15] tr-p-rmse:0.05282 tr-a-peak@32:1.00000 tr-rmse:0.17583 tr-rmse:0.17583 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #18: "vm_mod_fused_layout_transform_reshape_squeeze" | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #18 has finished. Remaining task(s): 1 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #0: GFLOPs: 25.5750. Time: 0.0401 ms. Best GFLOPs: 25.5750 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #1: GFLOPs: 3.0031. Time: 0.3413 ms. Best GFLOPs: 25.5750 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #2: GFLOPs: 9.1288. Time: 0.1123 ms. Best GFLOPs: 25.5750 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #3: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1000), "float32"], placeholder_1: T.Buffer[(1000, 512), "float32"], placeholder_2: T.Buffer[(1, 512), "float32"], T_add: T.Buffer[(1, 1000), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
T_matmul_NT_local = T.alloc_buffer([1, 1000], dtype="float32", scope="local") | |
placeholder_shared = T.alloc_buffer([1, 512], dtype="float32", scope="shared") | |
placeholder_shared_1 = T.alloc_buffer([1000, 512], dtype="float32", scope="shared") | |
for i0_0_i1_0_fused in T.thread_binding(1, thread="blockIdx.x"): | |
for i0_1_i1_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_fused in T.thread_binding(125, thread="threadIdx.x"): | |
for i1_3_init, i1_4_init in T.grid(2, 4): | |
with T.block("T_matmul_NT_init"): | |
i = T.axis.spatial(1, 0) | |
j = T.axis.spatial(1000, i0_2_i1_2_fused * 8 + i1_3_init * 4 + i1_4_init) | |
T.reads() | |
T.writes(T_matmul_NT_local[i, j]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["dense_small_batch.gpu", ["TENSOR", [1, 512], "float32"], ["TENSOR", [1000, 512], "float32"], None, "float32"]}) | |
T_matmul_NT_local[i, j] = T.float32(0) | |
for i2_0 in T.serial(128): | |
for ax0_ax1_fused_0 in T.serial(1): | |
for ax0_ax1_fused_1 in T.thread_binding(125, thread="threadIdx.x"): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(512, i2_0 * 4 + ax0_ax1_fused_1) | |
T.where(ax0_ax1_fused_1 < 4) | |
T.reads(placeholder_2[v0, v1]) | |
T.writes(placeholder_shared[v0, v1]) | |
T.block_attr({"meta_schedule.cooperative_fetch":1}) | |
placeholder_shared[v0, v1] = placeholder_2[v0, v1] | |
for ax0_ax1_fused_0 in T.serial(16): | |
for ax0_ax1_fused_1 in T.thread_binding(125, thread="threadIdx.x"): | |
for ax0_ax1_fused_2 in T.vectorized(2): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1000, (ax0_ax1_fused_0 * 250 + ax0_ax1_fused_1 * 2 + ax0_ax1_fused_2) // 4) | |
v1 = T.axis.spatial(512, i2_0 * 4 + (ax0_ax1_fused_0 * 250 + ax0_ax1_fused_1 * 2 + ax0_ax1_fused_2) % 4) | |
T.reads(placeholder_1[v0, v1]) | |
T.writes(placeholder_shared_1[v0, v1]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared_1[v0, v1] = placeholder_1[v0, v1] | |
for i2_1, i0_3, i1_3, i2_2, i0_4, i1_4 in T.grid(2, 1, 2, 2, 1, 4): | |
with T.block("T_matmul_NT_update"): | |
i = T.axis.spatial(1, 0) | |
j = T.axis.spatial(1000, i0_2_i1_2_fused * 8 + i1_3 * 4 + i1_4) | |
k = T.axis.reduce(512, i2_0 * 4 + i2_1 * 2 + i2_2) | |
T.reads(T_matmul_NT_local[i, j], placeholder_shared[i, k], placeholder_shared_1[j, k]) | |
T.writes(T_matmul_NT_local[i, j]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["dense_small_batch.gpu", ["TENSOR", [1, 512], "float32"], ["TENSOR", [1000, 512], "float32"], None, "float32"]}) | |
T_matmul_NT_local[i, j] = T_matmul_NT_local[i, j] + placeholder_shared[i, k] * placeholder_shared_1[j, k] | |
for ax0, ax1 in T.grid(1, 8): | |
with T.block("T_matmul_NT_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(1000, i0_2_i1_2_fused * 8 + ax1) | |
T.reads(T_matmul_NT_local[v0, v1], placeholder[v0, v1]) | |
T.writes(T_add[v0, v1]) | |
T_add[v0, v1] = T_matmul_NT_local[v0, v1] + placeholder[v0, v1] | |
b0 = sch.get_block(name="T_matmul_NT", func_name="main") | |
b1 = sch.get_block(name="T_add", func_name="main") | |
b2 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l3, l4, l5 = sch.get_loops(block=b0) | |
v6, v7, v8, v9, v10 = sch.sample_perfect_tile(loop=l3, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l11, l12, l13, l14, l15 = sch.split(loop=l3, factors=[v6, v7, v8, v9, v10]) | |
v16, v17, v18, v19, v20 = sch.sample_perfect_tile(loop=l4, n=5, max_innermost_factor=64, decision=[1, 1, 125, 2, 4]) | |
l21, l22, l23, l24, l25 = sch.split(loop=l4, factors=[v16, v17, v18, v19, v20]) | |
v26, v27, v28 = sch.sample_perfect_tile(loop=l5, n=3, max_innermost_factor=64, decision=[128, 2, 2]) | |
l29, l30, l31 = sch.split(loop=l5, factors=[v26, v27, v28]) | |
sch.reorder(l11, l21, l12, l22, l13, l23, l29, l30, l14, l24, l31, l15, l25) | |
l32 = sch.fuse(l11, l21) | |
sch.bind(loop=l32, thread_axis="blockIdx.x") | |
l33 = sch.fuse(l12, l22) | |
sch.bind(loop=l33, thread_axis="vthread.x") | |
l34 = sch.fuse(l13, l23) | |
sch.bind(loop=l34, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b35 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b35, loop=l34, preserve_unit_loops=True) | |
b36 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b36, loop=l29, preserve_unit_loops=True) | |
l37, l38, l39, l40, l41, l42 = sch.get_loops(block=b36) | |
l43 = sch.fuse(l41, l42) | |
v44 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=0) | |
sch.annotate(block_or_loop=b36, ann_key="meta_schedule.cooperative_fetch", ann_val=v44) | |
b45 = sch.cache_read(block=b0, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b45, loop=l29, preserve_unit_loops=True) | |
l46, l47, l48, l49, l50, l51 = sch.get_loops(block=b45) | |
l52 = sch.fuse(l50, l51) | |
v53 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=1) | |
sch.annotate(block_or_loop=b45, ann_key="meta_schedule.cooperative_fetch", ann_val=v53) | |
sch.reverse_compute_inline(block=b1) | |
v54 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=0) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.unroll_explicit", ann_val=v54) | |
sch.enter_postproc() | |
l55, l56, l57, l58, l59 = sch.get_loops(block=b36) | |
l60, l61 = sch.split(loop=l59, factors=[None, 125]) | |
sch.bind(loop=l61, thread_axis="threadIdx.x") | |
l62, l63, l64, l65, l66 = sch.get_loops(block=b45) | |
l67, l68, l69 = sch.split(loop=l66, factors=[None, 125, 2]) | |
sch.vectorize(loop=l69) | |
sch.bind(loop=l68, thread_axis="threadIdx.x") | |
b70 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b70, ann_key="meta_schedule.unroll_explicit") | |
b71, b72, b73, b74 = sch.get_child_blocks(b70) | |
l75, l76, l77, l78, l79, l80 = sch.get_loops(block=b71) | |
l81, l82, l83, l84, l85, l86, l87 = sch.get_loops(block=b72) | |
l88, l89, l90, l91, l92, l93, l94, l95, l96, l97 = sch.get_loops(block=b73) | |
l98, l99, l100, l101, l102 = sch.get_loops(block=b74) | |
b103 = sch.get_block(name="T_matmul_NT", func_name="main") | |
l104, l105, l106, l107, l108, l109, l110, l111, l112, l113 = sch.get_loops(block=b103) | |
b114 = sch.decompose_reduction(block=b103, loop=l107) | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #4: GFLOPs: 16.2363. Time: 0.0631 ms. Best GFLOPs: 25.5750 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #5: GFLOPs: 36.2412. Time: 0.0283 ms. Best GFLOPs: 36.2412 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #6: GFLOPs: 23.6285. Time: 0.0434 ms. Best GFLOPs: 36.2412 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #7: GFLOPs: 27.1935. Time: 0.0377 ms. Best GFLOPs: 36.2412 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #8: GFLOPs: 7.8880. Time: 0.1299 ms. Best GFLOPs: 36.2412 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #9: GFLOPs: 4.6527. Time: 0.2203 ms. Best GFLOPs: 36.2412 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #10: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1000), "float32"], placeholder_1: T.Buffer[(1000, 512), "float32"], placeholder_2: T.Buffer[(1, 512), "float32"], T_add: T.Buffer[(1, 1000), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
T_matmul_NT_local = T.alloc_buffer([1, 1000], dtype="float32", scope="local") | |
placeholder_shared = T.alloc_buffer([1, 512], dtype="float32", scope="shared") | |
placeholder_shared_1 = T.alloc_buffer([1000, 512], dtype="float32", scope="shared") | |
for i0_0_i1_0_fused in T.thread_binding(4, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_fused in T.thread_binding(5, thread="vthread.x"): | |
for i0_2_i1_2_fused in T.thread_binding(50, thread="threadIdx.x"): | |
with T.block("T_matmul_NT_init"): | |
i = T.axis.spatial(1, 0) | |
j = T.axis.spatial(1000, i0_0_i1_0_fused * 250 + i0_1_i1_1_fused * 50 + i0_2_i1_2_fused) | |
T.reads() | |
T.writes(T_matmul_NT_local[i, j]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["dense_small_batch.gpu", ["TENSOR", [1, 512], "float32"], ["TENSOR", [1000, 512], "float32"], None, "float32"]}) | |
T_matmul_NT_local[i, j] = T.float32(0) | |
for i2_0 in T.serial(32): | |
for ax0_ax1_fused_0 in T.serial(1): | |
for ax0_ax1_fused_1 in T.thread_binding(50, thread="threadIdx.x"): | |
for ax0_ax1_fused_2 in T.vectorized(3): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(512, i2_0 * 16 + (ax0_ax1_fused_1 * 3 + ax0_ax1_fused_2)) | |
T.where(ax0_ax1_fused_1 * 3 + ax0_ax1_fused_2 < 16) | |
T.reads(placeholder_2[v0, v1]) | |
T.writes(placeholder_shared[v0, v1]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
placeholder_shared[v0, v1] = placeholder_2[v0, v1] | |
for ax0_ax1_fused_0 in T.serial(20): | |
for ax0_ax1_fused_1 in T.thread_binding(50, thread="threadIdx.x"): | |
for ax0_ax1_fused_2 in T.vectorized(4): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1000, i0_0_i1_0_fused * 250 + (ax0_ax1_fused_0 * 200 + ax0_ax1_fused_1 * 4 + ax0_ax1_fused_2) // 16) | |
v1 = T.axis.spatial(512, i2_0 * 16 + (ax0_ax1_fused_0 * 200 + ax0_ax1_fused_1 * 4 + ax0_ax1_fused_2) % 16) | |
T.reads(placeholder_1[v0, v1]) | |
T.writes(placeholder_shared_1[v0, v1]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
placeholder_shared_1[v0, v1] = placeholder_1[v0, v1] | |
for i2_1, i0_3, i1_3, i2_2, i0_4, i1_4 in T.grid(2, 1, 1, 8, 1, 1): | |
with T.block("T_matmul_NT_update"): | |
i = T.axis.spatial(1, 0) | |
j = T.axis.spatial(1000, i0_0_i1_0_fused * 250 + i0_1_i1_1_fused * 50 + i0_2_i1_2_fused) | |
k = T.axis.reduce(512, i2_0 * 16 + i2_1 * 8 + i2_2) | |
T.reads(T_matmul_NT_local[i, j], placeholder_shared[i, k], placeholder_shared_1[j, k]) | |
T.writes(T_matmul_NT_local[i, j]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["dense_small_batch.gpu", ["TENSOR", [1, 512], "float32"], ["TENSOR", [1000, 512], "float32"], None, "float32"]}) | |
T_matmul_NT_local[i, j] = T_matmul_NT_local[i, j] + placeholder_shared[i, k] * placeholder_shared_1[j, k] | |
for ax0, ax1 in T.grid(1, 1): | |
with T.block("T_matmul_NT_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(1000, i0_0_i1_0_fused * 250 + i0_1_i1_1_fused * 50 + i0_2_i1_2_fused + ax1) | |
T.reads(T_matmul_NT_local[v0, v1], placeholder[v0, v1]) | |
T.writes(T_add[v0, v1]) | |
T_add[v0, v1] = T_matmul_NT_local[v0, v1] + placeholder[v0, v1] | |
b0 = sch.get_block(name="T_matmul_NT", func_name="main") | |
b1 = sch.get_block(name="T_add", func_name="main") | |
b2 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l3, l4, l5 = sch.get_loops(block=b0) | |
v6, v7, v8, v9, v10 = sch.sample_perfect_tile(loop=l3, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l11, l12, l13, l14, l15 = sch.split(loop=l3, factors=[v6, v7, v8, v9, v10]) | |
v16, v17, v18, v19, v20 = sch.sample_perfect_tile(loop=l4, n=5, max_innermost_factor=64, decision=[4, 5, 50, 1, 1]) | |
l21, l22, l23, l24, l25 = sch.split(loop=l4, factors=[v16, v17, v18, v19, v20]) | |
v26, v27, v28 = sch.sample_perfect_tile(loop=l5, n=3, max_innermost_factor=64, decision=[32, 2, 8]) | |
l29, l30, l31 = sch.split(loop=l5, factors=[v26, v27, v28]) | |
sch.reorder(l11, l21, l12, l22, l13, l23, l29, l30, l14, l24, l31, l15, l25) | |
l32 = sch.fuse(l11, l21) | |
sch.bind(loop=l32, thread_axis="blockIdx.x") | |
l33 = sch.fuse(l12, l22) | |
sch.bind(loop=l33, thread_axis="vthread.x") | |
l34 = sch.fuse(l13, l23) | |
sch.bind(loop=l34, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b35 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b35, loop=l34, preserve_unit_loops=True) | |
b36 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b36, loop=l29, preserve_unit_loops=True) | |
l37, l38, l39, l40, l41, l42 = sch.get_loops(block=b36) | |
l43 = sch.fuse(l41, l42) | |
v44 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=2) | |
sch.annotate(block_or_loop=b36, ann_key="meta_schedule.cooperative_fetch", ann_val=v44) | |
b45 = sch.cache_read(block=b0, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b45, loop=l29, preserve_unit_loops=True) | |
l46, l47, l48, l49, l50, l51 = sch.get_loops(block=b45) | |
l52 = sch.fuse(l50, l51) | |
v53 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b45, ann_key="meta_schedule.cooperative_fetch", ann_val=v53) | |
sch.reverse_compute_inline(block=b1) | |
v54 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=3) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.unroll_explicit", ann_val=v54) | |
sch.enter_postproc() | |
l55, l56, l57, l58, l59 = sch.get_loops(block=b36) | |
l60, l61, l62 = sch.split(loop=l59, factors=[None, 50, 3]) | |
sch.vectorize(loop=l62) | |
sch.bind(loop=l61, thread_axis="threadIdx.x") | |
l63, l64, l65, l66, l67 = sch.get_loops(block=b45) | |
l68, l69, l70 = sch.split(loop=l67, factors=[None, 50, 4]) | |
sch.vectorize(loop=l70) | |
sch.bind(loop=l69, thread_axis="threadIdx.x") | |
b71 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b71, ann_key="meta_schedule.unroll_explicit") | |
b72, b73, b74, b75 = sch.get_child_blocks(b71) | |
l76, l77, l78, l79, l80, l81, l82 = sch.get_loops(block=b72) | |
sch.annotate(block_or_loop=l76, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l76, ann_key="pragma_unroll_explicit", ann_val=1) | |
l83, l84, l85, l86, l87, l88, l89 = sch.get_loops(block=b73) | |
sch.annotate(block_or_loop=l83, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l83, ann_key="pragma_unroll_explicit", ann_val=1) | |
l90, l91, l92, l93, l94, l95, l96, l97, l98, l99 = sch.get_loops(block=b74) | |
sch.annotate(block_or_loop=l90, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l90, ann_key="pragma_unroll_explicit", ann_val=1) | |
l100, l101, l102, l103, l104 = sch.get_loops(block=b75) | |
sch.annotate(block_or_loop=l100, ann_key="pragma_auto_unroll_max_step", ann_val=512) | |
sch.annotate(block_or_loop=l100, ann_key="pragma_unroll_explicit", ann_val=1) | |
b105 = sch.get_block(name="T_matmul_NT", func_name="main") | |
l106, l107, l108, l109, l110, l111, l112, l113, l114, l115 = sch.get_loops(block=b105) | |
b116 = sch.decompose_reduction(block=b105, loop=l109) | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #11: GFLOPs: 3.0364. Time: 0.3376 ms. Best GFLOPs: 36.2412 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #12: GFLOPs: 12.7588. Time: 0.0803 ms. Best GFLOPs: 36.2412 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #13: GFLOPs: 11.6082. Time: 0.0883 ms. Best GFLOPs: 36.2412 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #14: GFLOPs: 5.9217. Time: 0.1731 ms. Best GFLOPs: 36.2412 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #15: GFLOPs: 4.2015. Time: 0.2440 ms. Best GFLOPs: 36.2412 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #16: GFLOPs: 24.6821. Time: 0.0415 ms. Best GFLOPs: 36.2412 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #17: GFLOPs: 37.5386. Time: 0.0273 ms. Best GFLOPs: 37.5386 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #18: GFLOPs: 3.9471. Time: 0.2597 ms. Best GFLOPs: 37.5386 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #19: GFLOPs: 8.4574. Time: 0.1212 ms. Best GFLOPs: 37.5386 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #20: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1000), "float32"], placeholder_1: T.Buffer[(1000, 512), "float32"], placeholder_2: T.Buffer[(1, 512), "float32"], T_add: T.Buffer[(1, 1000), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
T_matmul_NT_local = T.alloc_buffer([1, 1000], dtype="float32", scope="local") | |
placeholder_shared = T.alloc_buffer([1, 512], dtype="float32", scope="shared") | |
placeholder_shared_1 = T.alloc_buffer([1000, 512], dtype="float32", scope="shared") | |
for i0_0_i1_0_fused in T.thread_binding(20, thread="blockIdx.x"): | |
for i0_1_i1_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_fused in T.thread_binding(50, thread="threadIdx.x"): | |
with T.block("T_matmul_NT_init"): | |
i = T.axis.spatial(1, 0) | |
j = T.axis.spatial(1000, i0_0_i1_0_fused * 50 + i0_2_i1_2_fused) | |
T.reads() | |
T.writes(T_matmul_NT_local[i, j]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["dense_small_batch.gpu", ["TENSOR", [1, 512], "float32"], ["TENSOR", [1000, 512], "float32"], None, "float32"]}) | |
T_matmul_NT_local[i, j] = T.float32(0) | |
for i2_0 in T.serial(8): | |
for ax0_ax1_fused_0 in T.serial(1): | |
for ax0_ax1_fused_1 in T.thread_binding(50, thread="threadIdx.x"): | |
for ax0_ax1_fused_2 in T.vectorized(3): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(512, i2_0 * 64 + (ax0_ax1_fused_1 * 3 + ax0_ax1_fused_2)) | |
T.where(ax0_ax1_fused_1 * 3 + ax0_ax1_fused_2 < 64) | |
T.reads(placeholder_2[v0, v1]) | |
T.writes(placeholder_shared[v0, v1]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
placeholder_shared[v0, v1] = placeholder_2[v0, v1] | |
for ax0_ax1_fused_0 in T.serial(16): | |
for ax0_ax1_fused_1 in T.thread_binding(50, thread="threadIdx.x"): | |
for ax0_ax1_fused_2 in T.vectorized(4): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1000, i0_0_i1_0_fused * 50 + (ax0_ax1_fused_0 * 200 + ax0_ax1_fused_1 * 4 + ax0_ax1_fused_2) // 64) | |
v1 = T.axis.spatial(512, i2_0 * 64 + (ax0_ax1_fused_0 * 200 + ax0_ax1_fused_1 * 4 + ax0_ax1_fused_2) % 64) | |
T.reads(placeholder_1[v0, v1]) | |
T.writes(placeholder_shared_1[v0, v1]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
placeholder_shared_1[v0, v1] = placeholder_1[v0, v1] | |
for i2_1, i0_3, i1_3, i2_2, i0_4, i1_4 in T.grid(8, 1, 1, 8, 1, 1): | |
with T.block("T_matmul_NT_update"): | |
i = T.axis.spatial(1, 0) | |
j = T.axis.spatial(1000, i0_0_i1_0_fused * 50 + i0_2_i1_2_fused) | |
k = T.axis.reduce(512, i2_0 * 64 + i2_1 * 8 + i2_2) | |
T.reads(T_matmul_NT_local[i, j], placeholder_shared[i, k], placeholder_shared_1[j, k]) | |
T.writes(T_matmul_NT_local[i, j]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["dense_small_batch.gpu", ["TENSOR", [1, 512], "float32"], ["TENSOR", [1000, 512], "float32"], None, "float32"]}) | |
T_matmul_NT_local[i, j] = T_matmul_NT_local[i, j] + placeholder_shared[i, k] * placeholder_shared_1[j, k] | |
for ax0, ax1 in T.grid(1, 1): | |
with T.block("T_matmul_NT_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(1000, i0_0_i1_0_fused * 50 + i0_2_i1_2_fused + ax1) | |
T.reads(T_matmul_NT_local[v0, v1], placeholder[v0, v1]) | |
T.writes(T_add[v0, v1]) | |
T_add[v0, v1] = T_matmul_NT_local[v0, v1] + placeholder[v0, v1] | |
b0 = sch.get_block(name="T_matmul_NT", func_name="main") | |
b1 = sch.get_block(name="T_add", func_name="main") | |
b2 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l3, l4, l5 = sch.get_loops(block=b0) | |
v6, v7, v8, v9, v10 = sch.sample_perfect_tile(loop=l3, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l11, l12, l13, l14, l15 = sch.split(loop=l3, factors=[v6, v7, v8, v9, v10]) | |
v16, v17, v18, v19, v20 = sch.sample_perfect_tile(loop=l4, n=5, max_innermost_factor=64, decision=[20, 1, 50, 1, 1]) | |
l21, l22, l23, l24, l25 = sch.split(loop=l4, factors=[v16, v17, v18, v19, v20]) | |
v26, v27, v28 = sch.sample_perfect_tile(loop=l5, n=3, max_innermost_factor=64, decision=[8, 8, 8]) | |
l29, l30, l31 = sch.split(loop=l5, factors=[v26, v27, v28]) | |
sch.reorder(l11, l21, l12, l22, l13, l23, l29, l30, l14, l24, l31, l15, l25) | |
l32 = sch.fuse(l11, l21) | |
sch.bind(loop=l32, thread_axis="blockIdx.x") | |
l33 = sch.fuse(l12, l22) | |
sch.bind(loop=l33, thread_axis="vthread.x") | |
l34 = sch.fuse(l13, l23) | |
sch.bind(loop=l34, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b35 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b35, loop=l34, preserve_unit_loops=True) | |
b36 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b36, loop=l29, preserve_unit_loops=True) | |
l37, l38, l39, l40, l41, l42 = sch.get_loops(block=b36) | |
l43 = sch.fuse(l41, l42) | |
v44 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=2) | |
sch.annotate(block_or_loop=b36, ann_key="meta_schedule.cooperative_fetch", ann_val=v44) | |
b45 = sch.cache_read(block=b0, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b45, loop=l29, preserve_unit_loops=True) | |
l46, l47, l48, l49, l50, l51 = sch.get_loops(block=b45) | |
l52 = sch.fuse(l50, l51) | |
v53 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b45, ann_key="meta_schedule.cooperative_fetch", ann_val=v53) | |
sch.reverse_compute_inline(block=b1) | |
v54 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=0) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.unroll_explicit", ann_val=v54) | |
sch.enter_postproc() | |
l55, l56, l57, l58, l59 = sch.get_loops(block=b36) | |
l60, l61, l62 = sch.split(loop=l59, factors=[None, 50, 3]) | |
sch.vectorize(loop=l62) | |
sch.bind(loop=l61, thread_axis="threadIdx.x") | |
l63, l64, l65, l66, l67 = sch.get_loops(block=b45) | |
l68, l69, l70 = sch.split(loop=l67, factors=[None, 50, 4]) | |
sch.vectorize(loop=l70) | |
sch.bind(loop=l69, thread_axis="threadIdx.x") | |
b71 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b71, ann_key="meta_schedule.unroll_explicit") | |
b72, b73, b74, b75 = sch.get_child_blocks(b71) | |
l76, l77, l78, l79, l80, l81, l82 = sch.get_loops(block=b72) | |
l83, l84, l85, l86, l87, l88, l89 = sch.get_loops(block=b73) | |
l90, l91, l92, l93, l94, l95, l96, l97, l98, l99 = sch.get_loops(block=b74) | |
l100, l101, l102, l103, l104 = sch.get_loops(block=b75) | |
b105 = sch.get_block(name="T_matmul_NT", func_name="main") | |
l106, l107, l108, l109, l110, l111, l112, l113, l114, l115 = sch.get_loops(block=b105) | |
b116 = sch.decompose_reduction(block=b105, loop=l109) | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #21: GFLOPs: 6.1992. Time: 0.1653 ms. Best GFLOPs: 37.5386 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #22: GFLOPs: 2.0912. Time: 0.4901 ms. Best GFLOPs: 37.5386 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #23: GFLOPs: 3.4037. Time: 0.3011 ms. Best GFLOPs: 37.5386 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #24: GFLOPs: 3.6185. Time: 0.2833 ms. Best GFLOPs: 37.5386 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #25: GFLOPs: 31.5238. Time: 0.0325 ms. Best GFLOPs: 37.5386 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #26: GFLOPs: 39.2177. Time: 0.0261 ms. Best GFLOPs: 39.2177 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #27: GFLOPs: 3.1124. Time: 0.3293 ms. Best GFLOPs: 39.2177 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #28: GFLOPs: 7.5388. Time: 0.1360 ms. Best GFLOPs: 39.2177 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:271: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #29: Error in building: LocalRunner: An exception occurred | |
Subprocess terminated | |
# from tvm.script import tir as T | |
@tvm.script.ir_module | |
class Module: | |
@T.prim_func | |
def main(placeholder: T.Buffer[(1, 1000), "float32"], placeholder_1: T.Buffer[(1000, 512), "float32"], placeholder_2: T.Buffer[(1, 512), "float32"], T_add: T.Buffer[(1, 1000), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"tir.noalias": True, "global_symbol": "main"}) | |
# body | |
# with T.block("root") | |
T_matmul_NT_local = T.alloc_buffer([1, 1000], dtype="float32", scope="local") | |
placeholder_shared = T.alloc_buffer([1, 512], dtype="float32", scope="shared") | |
placeholder_shared_1 = T.alloc_buffer([1000, 512], dtype="float32", scope="shared") | |
for i0_0_i1_0_fused in T.thread_binding(1, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_fused in T.thread_binding(50, thread="threadIdx.x"): | |
for i1_3_init, i1_4_init in T.grid(2, 10): | |
with T.block("T_matmul_NT_init"): | |
i = T.axis.spatial(1, 0) | |
j = T.axis.spatial(1000, i0_2_i1_2_fused * 20 + i1_3_init * 10 + i1_4_init) | |
T.reads() | |
T.writes(T_matmul_NT_local[i, j]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["dense_small_batch.gpu", ["TENSOR", [1, 512], "float32"], ["TENSOR", [1000, 512], "float32"], None, "float32"]}) | |
T_matmul_NT_local[i, j] = T.float32(0) | |
for i2_0 in T.serial(64): | |
for ax0_ax1_fused_0 in T.serial(1): | |
for ax0_ax1_fused_1 in T.thread_binding(50, thread="threadIdx.x"): | |
for ax0_ax1_fused_2 in T.vectorized(3): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(512, i2_0 * 8 + (ax0_ax1_fused_1 * 3 + ax0_ax1_fused_2)) | |
T.where(ax0_ax1_fused_1 * 3 + ax0_ax1_fused_2 < 8) | |
T.reads(placeholder_2[v0, v1]) | |
T.writes(placeholder_shared[v0, v1]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
placeholder_shared[v0, v1] = placeholder_2[v0, v1] | |
for ax0_ax1_fused_0 in T.serial(40): | |
for ax0_ax1_fused_1 in T.thread_binding(50, thread="threadIdx.x"): | |
for ax0_ax1_fused_2 in T.vectorized(4): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1000, (ax0_ax1_fused_0 * 200 + ax0_ax1_fused_1 * 4 + ax0_ax1_fused_2) // 8) | |
v1 = T.axis.spatial(512, i2_0 * 8 + (ax0_ax1_fused_0 * 200 + ax0_ax1_fused_1 * 4 + ax0_ax1_fused_2) % 8) | |
T.reads(placeholder_1[v0, v1]) | |
T.writes(placeholder_shared_1[v0, v1]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
placeholder_shared_1[v0, v1] = placeholder_1[v0, v1] | |
for i2_1, i0_3, i1_3, i2_2, i0_4, i1_4 in T.grid(4, 1, 2, 2, 1, 10): | |
with T.block("T_matmul_NT_update"): | |
i = T.axis.spatial(1, 0) | |
j = T.axis.spatial(1000, i0_2_i1_2_fused * 20 + i1_3 * 10 + i1_4) | |
k = T.axis.reduce(512, i2_0 * 8 + i2_1 * 2 + i2_2) | |
T.reads(T_matmul_NT_local[i, j], placeholder_shared[i, k], placeholder_shared_1[j, k]) | |
T.writes(T_matmul_NT_local[i, j]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["dense_small_batch.gpu", ["TENSOR", [1, 512], "float32"], ["TENSOR", [1000, 512], "float32"], None, "float32"]}) | |
T_matmul_NT_local[i, j] = T_matmul_NT_local[i, j] + placeholder_shared[i, k] * placeholder_shared_1[j, k] | |
for ax0, ax1 in T.grid(1, 20): | |
with T.block("T_matmul_NT_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(1000, i0_2_i1_2_fused * 20 + ax1) | |
T.reads(T_matmul_NT_local[v0, v1], placeholder[v0, v1]) | |
T.writes(T_add[v0, v1]) | |
T_add[v0, v1] = T_matmul_NT_local[v0, v1] + placeholder[v0, v1] | |
b0 = sch.get_block(name="T_matmul_NT", func_name="main") | |
b1 = sch.get_block(name="T_add", func_name="main") | |
b2 = sch.get_block(name="root", func_name="main") | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") | |
l3, l4, l5 = sch.get_loops(block=b0) | |
v6, v7, v8, v9, v10 = sch.sample_perfect_tile(loop=l3, n=5, max_innermost_factor=64, decision=[1, 1, 1, 1, 1]) | |
l11, l12, l13, l14, l15 = sch.split(loop=l3, factors=[v6, v7, v8, v9, v10]) | |
v16, v17, v18, v19, v20 = sch.sample_perfect_tile(loop=l4, n=5, max_innermost_factor=64, decision=[1, 1, 50, 2, 10]) | |
l21, l22, l23, l24, l25 = sch.split(loop=l4, factors=[v16, v17, v18, v19, v20]) | |
v26, v27, v28 = sch.sample_perfect_tile(loop=l5, n=3, max_innermost_factor=64, decision=[64, 4, 2]) | |
l29, l30, l31 = sch.split(loop=l5, factors=[v26, v27, v28]) | |
sch.reorder(l11, l21, l12, l22, l13, l23, l29, l30, l14, l24, l31, l15, l25) | |
l32 = sch.fuse(l11, l21) | |
sch.bind(loop=l32, thread_axis="blockIdx.x") | |
l33 = sch.fuse(l12, l22) | |
sch.bind(loop=l33, thread_axis="vthread.x") | |
l34 = sch.fuse(l13, l23) | |
sch.bind(loop=l34, thread_axis="threadIdx.x") | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32) | |
sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024) | |
b35 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local") | |
sch.reverse_compute_at(block=b35, loop=l34, preserve_unit_loops=True) | |
b36 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared") | |
sch.compute_at(block=b36, loop=l29, preserve_unit_loops=True) | |
l37, l38, l39, l40, l41, l42 = sch.get_loops(block=b36) | |
l43 = sch.fuse(l41, l42) | |
v44 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=2) | |
sch.annotate(block_or_loop=b36, ann_key="meta_schedule.cooperative_fetch", ann_val=v44) | |
b45 = sch.cache_read(block=b0, read_buffer_index=2, storage_scope="shared") | |
sch.compute_at(block=b45, loop=l29, preserve_unit_loops=True) | |
l46, l47, l48, l49, l50, l51 = sch.get_loops(block=b45) | |
l52 = sch.fuse(l50, l51) | |
v53 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25], decision=3) | |
sch.annotate(block_or_loop=b45, ann_key="meta_schedule.cooperative_fetch", ann_val=v53) | |
sch.reverse_compute_inline(block=b1) | |
v54 = sch.sample_categorical(candidates=[0, 16, 64, 512, 1024], probs=[0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001, 0.20000000000000001], decision=2) | |
sch.annotate(block_or_loop=b2, ann_key="meta_schedule.unroll_explicit", ann_val=v54) | |
sch.enter_postproc() | |
l55, l56, l57, l58, l59 = sch.get_loops(block=b36) | |
l60, l61, l62 = sch.split(loop=l59, factors=[None, 50, 3]) | |
sch.vectorize(loop=l62) | |
sch.bind(loop=l61, thread_axis="threadIdx.x") | |
l63, l64, l65, l66, l67 = sch.get_loops(block=b45) | |
l68, l69, l70 = sch.split(loop=l67, factors=[None, 50, 4]) | |
sch.vectorize(loop=l70) | |
sch.bind(loop=l69, thread_axis="threadIdx.x") | |
b71 = sch.get_block(name="root", func_name="main") | |
sch.unannotate(block_or_loop=b71, ann_key="meta_schedule.unroll_explicit") | |
b72, b73, b74, b75 = sch.get_child_blocks(b71) | |
l76, l77, l78, l79, l80, l81, l82 = sch.get_loops(block=b72) | |
sch.annotate(block_or_loop=l76, ann_key="pragma_auto_unroll_max_step", ann_val=64) | |
sch.annotate(block_or_loop=l76, ann_key="pragma_unroll_explicit", ann_val=1) | |
l83, l84, l85, l86, l87, l88, l89 = sch.get_loops(block=b73) | |
sch.annotate(block_or_loop=l83, ann_key="pragma_auto_unroll_max_step", ann_val=64) | |
sch.annotate(block_or_loop=l83, ann_key="pragma_unroll_explicit", ann_val=1) | |
l90, l91, l92, l93, l94, l95, l96, l97, l98, l99 = sch.get_loops(block=b74) | |
sch.annotate(block_or_loop=l90, ann_key="pragma_auto_unroll_max_step", ann_val=64) | |
sch.annotate(block_or_loop=l90, ann_key="pragma_unroll_explicit", ann_val=1) | |
l100, l101, l102, l103, l104 = sch.get_loops(block=b75) | |
sch.annotate(block_or_loop=l100, ann_key="pragma_auto_unroll_max_step", ann_val=64) | |
sch.annotate(block_or_loop=l100, ann_key="pragma_unroll_explicit", ann_val=1) | |
b105 = sch.get_block(name="T_matmul_NT", func_name="main") | |
l106, l107, l108, l109, l110, l111, l112, l113, l114, l115 = sch.get_loops(block=b105) | |
b116 = sch.decompose_reduction(block=b105, loop=l109) | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #30: GFLOPs: 5.4127. Time: 0.1894 ms. Best GFLOPs: 39.2177 | |
[14:41:46] /home/zxybazh/tvm-tensorir/src/meta_schedule/measure_callback/echo_statistics.cc:258: [Task #19: "vm_mod_fused_nn_dense_add"] Trial #31: GFLOPs: 9.8170. Time: 0.1044 ms. Best GFLOPs: 39.2177 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB validation: p-rmse: 0.645415 a-peak@32: 0.963876 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 0: tr-p-rmse: 0.409151 tr-a-peak@32: 0.652678 tr-rmse: 0.257918 tr-rmse: 0.257918 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 25: tr-p-rmse: 0.054936 tr-a-peak@32: 0.992026 tr-rmse: 0.173700 tr-rmse: 0.173700 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB iter 50: tr-p-rmse: 0.054902 tr-a-peak@32: 0.992026 tr-rmse: 0.173684 tr-rmse: 0.173684 | |
DEBUG:tvm.meta_schedule.cost_model.xgb_model:XGB stopped. Best iteration: [17] tr-p-rmse:0.05480 tr-a-peak@32:1.00000 tr-rmse:0.17382 tr-rmse:0.17382 | |
[14:41:47] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:127: Scheduler picks Task #19: "vm_mod_fused_nn_dense_add" | |
[14:41:47] /home/zxybazh/tvm-tensorir/src/meta_schedule/task_scheduler/task_scheduler.cc:141: Task #19 has finished. Remaining task(s): 0 | |
WARNING:autotvm:One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details. | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:139: Applied history best for: tvmgen_default_fused_layout_transform | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:145: | |
# from tvm.script import tir as T | |
@T.prim_func | |
def func(placeholder: T.Buffer[(1, 3, 224, 224), "float32"], T_layout_trans: T.Buffer[(1, 224, 224, 3), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
for i0_i1_i2_i3_fused_0 in T.thread_binding(4704, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":16, "pragma_unroll_explicit":1}): | |
for i0_i1_i2_i3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
with T.block("T_layout_trans"): | |
ax0 = T.axis.spatial(1, 0) | |
ax1 = T.axis.spatial(224, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) // 672) | |
ax2 = T.axis.spatial(224, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) % 672 // 3) | |
ax3 = T.axis.spatial(3, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) % 3) | |
T.reads(placeholder[ax0, ax3, ax1, ax2]) | |
T.writes(T_layout_trans[ax0, ax1, ax2, ax3]) | |
T_layout_trans[ax0, ax1, ax2, ax3] = T.if_then_else(ax0 < 1 and ax3 < 3 and ax1 < 224 and ax2 < 224, placeholder[ax0, ax3, ax1, ax2], T.float32(0), dtype="float32") | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:139: Applied history best for: tvmgen_default_fused_nn_conv2d_add_nn_relu | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:145: | |
# from tvm.script import tir as T | |
@T.prim_func | |
def func(placeholder: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_1: T.Buffer[(7, 7, 3, 64), "float32"], placeholder_2: T.Buffer[(1, 224, 224, 3), "float32"], T_relu: T.Buffer[(1, 112, 112, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 112, 112, 64], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 230, 230, 3], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([7, 7, 3, 64], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(112, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(4, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(224, thread="threadIdx.x"): | |
for i4_0, i5_0 in T.grid(1, 1): | |
for i1_3_init, i3_4_init in T.grid(2, 4): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(112, i0_2_i1_2_i2_2_i3_2_fused // 4 * 2 + i1_3_init) | |
xx = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 4 * 4 + i0_2_i1_2_i2_2_i3_2_fused % 4) | |
ff = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 4 * 16 + i0_1_i1_1_i2_1_i3_1_fused * 4 + i3_4_init) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i6_0 in T.serial(3): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(4): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(224, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(230, ((ax0_ax1_ax2_ax3_fused_0 * 224 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2) % 2977 // 13) | |
v2 = T.axis.spatial(230, i0_0_i1_0_i2_0_i3_0_fused // 4 * 8 + ((ax0_ax1_ax2_ax3_fused_0 * 224 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2) % 13) | |
v3 = T.axis.spatial(3, i6_0 + 0) | |
T.where((ax0_ax1_ax2_ax3_fused_0 * 224 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2 < 2977) | |
T.reads(placeholder_2[v0, v1 - 3, v2 - 3, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(3 <= v1 and v1 < 227 and 3 <= v2 and v2 < 227, placeholder_2[v0, v1 - 3, v2 - 3, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(1): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(224, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(7, (ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) // 112) | |
v1 = T.axis.spatial(7, (ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 112 // 16) | |
v2 = T.axis.spatial(3, i6_0) | |
v3 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 4 * 16 + (ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 16) | |
T.where(ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2 < 784) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 1, 2, 1, 1, 7, 7, 1, 1, 1, 1, 4): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(112, i0_2_i1_2_i2_2_i3_2_fused // 4 * 2 + i1_3) | |
xx = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 4 * 4 + i0_2_i1_2_i2_2_i3_2_fused % 4) | |
ff = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 4 * 16 + i0_1_i1_1_i2_1_i3_1_fused * 4 + i3_4) | |
ry, rx, rc = T.axis.remap("RRR", [i4_2, i5_2, i6_0]) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 224, 224, 3], "float32"], ["TENSOR", [7, 7, 3, 64], "float32"], [2, 2], [3, 3, 3, 3], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 2, 1, 4): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(112, i0_2_i1_2_i2_2_i3_2_fused // 4 * 2 + ax1) | |
v2 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 4 * 4 + i0_2_i1_2_i2_2_i3_2_fused % 4 + ax2) | |
v3 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 4 * 16 + i0_1_i1_1_i2_1_i3_1_fused * 4 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:139: Applied history best for: tvmgen_default_fused_nn_max_pool2d | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:145: | |
# from tvm.script import tir as T | |
@T.prim_func | |
def func(placeholder: T.Buffer[(1, 112, 112, 64), "float32"], tensor: T.Buffer[(1, 56, 56, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
for i0_i1_i2_i3_fused_0 in T.thread_binding(6272, thread="blockIdx.x"): | |
for i0_i1_i2_i3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
with T.block("tensor_init"): | |
ax0 = T.axis.spatial(1, 0) | |
ax1 = T.axis.spatial(56, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) // 3584) | |
ax2 = T.axis.spatial(56, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) % 3584 // 64) | |
ax3 = T.axis.spatial(64, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) % 64) | |
T.reads() | |
T.writes(tensor[ax0, ax1, ax2, ax3]) | |
tensor[ax0, ax1, ax2, ax3] = T.float32(-3.4028234663852886e+38) | |
for i4, i5 in T.grid(3, 3): | |
with T.block("tensor_update"): | |
ax0 = T.axis.spatial(1, 0) | |
ax1 = T.axis.spatial(56, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) // 3584) | |
ax2 = T.axis.spatial(56, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) % 3584 // 64) | |
ax3 = T.axis.spatial(64, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) % 64) | |
rv0, rv1 = T.axis.remap("RR", [i4, i5]) | |
T.reads(tensor[ax0, ax1, ax2, ax3], placeholder[ax0, ax1 * 2 + rv0 - 1, ax2 * 2 + rv1 - 1, ax3]) | |
T.writes(tensor[ax0, ax1, ax2, ax3]) | |
tensor[ax0, ax1, ax2, ax3] = T.max(tensor[ax0, ax1, ax2, ax3], T.if_then_else(1 <= ax1 * 2 + rv0 and ax1 * 2 + rv0 < 113 and 1 <= ax2 * 2 + rv1 and ax2 * 2 + rv1 < 113, placeholder[ax0, ax1 * 2 + rv0 - 1, ax2 * 2 + rv1 - 1, ax3], T.float32(-3.4028234663852886e+38), dtype="float32")) | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:139: Applied history best for: tvmgen_default_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_nn_relu | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:145: | |
# from tvm.script import tir as T | |
@T.prim_func | |
def func(placeholder: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_1: T.Buffer[(6, 6, 64, 64), "float32"], placeholder_2: T.Buffer[(1, 56, 56, 64), "float32"], T_relu: T.Buffer[(1, 56, 56, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
input_tile_local = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="local") | |
data_pack = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
bgemm = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
inverse = T.alloc_buffer([4, 4, 196, 64], dtype="float32") | |
bgemm_local = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="local") | |
data_pack_shared = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([6, 6, 64, 64], dtype="float32", scope="shared") | |
for i2_0_i3_0_i2_1_i3_1_fused_0 in T.thread_binding(392, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for i2_0_i3_0_i2_1_i3_1_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1): | |
with T.block("input_tile"): | |
eps, nu = T.axis.remap("SS", [ax0, ax1]) | |
p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) // 128 * 2 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 64 // 32) | |
ci = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 128 // 64 * 32 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 32) | |
T.reads(placeholder_2[p // 196, p % 196 // 14 * 4 + eps - 1, p % 14 * 4 + nu - 1, ci]) | |
T.writes(input_tile_local[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile_local[eps, nu, p, ci] = T.if_then_else(1 <= p % 196 // 14 * 4 + eps and p % 196 // 14 * 4 + eps < 57 and 1 <= p % 14 * 4 + nu and p % 14 * 4 + nu < 57, placeholder_2[p // 196, p % 196 // 14 * 4 + eps - 1, p % 14 * 4 + nu - 1, ci], T.float32(0), dtype="float32") | |
for i0 in T.unroll(6): | |
for i1 in T.unroll(6): | |
with T.block("data_pack_init"): | |
eps, nu = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) // 128 * 2 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 64 // 32) | |
ci = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 128 // 64 * 32 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 32) | |
T.reads() | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
for i4 in T.unroll(6): | |
for i5 in T.unroll(6): | |
with T.block("data_pack_update"): | |
eps, nu = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) // 128 * 2 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 64 // 32) | |
ci = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 128 // 64 * 32 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 32) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile_local[r_a, r_b, p, ci]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile_local[r_a, r_b, p, ci] * T.Select(r_a % 6 == 5 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 1, 
T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_b % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_b % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_b % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_b % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_b % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_b % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_b % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_b % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_b % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_b % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_b % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_b % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_b % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_b % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 1, 
T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(84, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(336, thread="threadIdx.x"): | |
for i3_3_init, i2_4_init, i3_4_init in T.grid(2, 4, 2): | |
with T.block("bgemm_init"): | |
eps = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 42 * 3 + i0_2_i1_2_i2_2_i3_2_fused // 112) | |
nu = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused % 42 // 7) | |
p = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused % 7 * 28 + i0_2_i1_2_i2_2_i3_2_fused % 112 // 16 * 4 + i2_4_init) | |
co = T.axis.spatial(64, i0_2_i1_2_i2_2_i3_2_fused % 16 * 4 + i3_3_init * 2 + i3_4_init) | |
T.reads() | |
T.writes(bgemm_local[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) | |
bgemm_local[eps, nu, p, co] = T.float32(0) | |
for i4_0 in T.serial(8): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(1): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(336, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4): | |
with T.block("data_pack_shared"): | |
v0 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 42 * 3 + (ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) // 224) | |
v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused % 42 // 7) | |
v2 = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused % 7 * 28 + (ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 224 // 8) | |
v3 = T.axis.spatial(64, i4_0 * 8 + (ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2) % 8) | |
T.where(ax0_ax1_ax2_ax3_fused_1 * 4 + ax0_ax1_ax2_ax3_fused_2 < 672) | |
T.reads(data_pack[v0, v1, v2, v3]) | |
T.writes(data_pack_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(2): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(336, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(3): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 42 * 3 + (ax0_ax1_ax2_ax3_fused_0 * 1008 + ax0_ax1_ax2_ax3_fused_1 * 3 + ax0_ax1_ax2_ax3_fused_2) // 512) | |
v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused % 42 // 7) | |
v2 = T.axis.spatial(64, (ax0_ax1_ax2_ax3_fused_0 * 1008 + ax0_ax1_ax2_ax3_fused_1 * 3 + ax0_ax1_ax2_ax3_fused_2) % 512 // 8) | |
v3 = T.axis.spatial(64, i4_0 * 8 + (ax0_ax1_ax2_ax3_fused_0 * 1008 + ax0_ax1_ax2_ax3_fused_1 * 3 + ax0_ax1_ax2_ax3_fused_2) % 8) | |
T.where((ax0_ax1_ax2_ax3_fused_0 * 336 + ax0_ax1_ax2_ax3_fused_1) * 3 + ax0_ax1_ax2_ax3_fused_2 < 1536) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 1, 2, 8, 1, 1, 4, 2): | |
with T.block("bgemm_update"): | |
eps = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 42 * 3 + i0_2_i1_2_i2_2_i3_2_fused // 112) | |
nu = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused % 42 // 7) | |
p = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused % 7 * 28 + i0_2_i1_2_i2_2_i3_2_fused % 112 // 16 * 4 + i2_4) | |
co = T.axis.spatial(64, i0_2_i1_2_i2_2_i3_2_fused % 16 * 4 + i3_3 * 2 + i3_4) | |
ci = T.axis.reduce(64, i4_0 * 8 + i4_2) | |
T.reads(bgemm_local[eps, nu, p, co], data_pack_shared[eps, nu, p, ci], placeholder_shared[eps, nu, co, ci]) | |
T.writes(bgemm_local[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) | |
bgemm_local[eps, nu, p, co] = bgemm_local[eps, nu, p, co] + data_pack_shared[eps, nu, p, ci] * placeholder_shared[eps, nu, co, ci] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 1, 4, 4): | |
with T.block("bgemm_local"): | |
v0 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 42 * 3 + i0_2_i1_2_i2_2_i3_2_fused // 112 + ax0) | |
v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused % 42 // 7 + ax1) | |
v2 = T.axis.spatial(196, i0_0_i1_0_i2_0_i3_0_fused % 7 * 28 + i0_2_i1_2_i2_2_i3_2_fused % 112 // 16 * 4 + ax2) | |
v3 = T.axis.spatial(64, i0_2_i1_2_i2_2_i3_2_fused % 16 * 4 + ax3) | |
T.reads(bgemm_local[v0, v1, v2, v3]) | |
T.writes(bgemm[v0, v1, v2, v3]) | |
bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3] | |
for i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 in T.thread_binding(6272, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
with T.block("inverse_init"): | |
vh = T.axis.spatial(4, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 16 // 4) | |
vw = T.axis.spatial(4, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 4) | |
p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) // 7168 * 7 + (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 448 // 64) | |
co = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 7168 // 448 * 4 + (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 64 // 16) | |
T.reads() | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
inverse[vh, vw, p, co] = T.float32(0) | |
for i4 in T.unroll(6): | |
for i5 in T.unroll(6): | |
with T.block("inverse_update"): | |
vh = T.axis.spatial(4, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 16 // 4) | |
vw = T.axis.spatial(4, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 4) | |
p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) // 7168 * 7 + (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 448 // 64) | |
co = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 7168 // 448 * 4 + (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 64 // 16) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 6 == 5 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 5 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 0, T.float32(0), T.Select(r_a % 6 == 4 and vh % 4 == 3, T.float32(-8), T.Select(r_a % 6 == 4 and vh % 4 == 2, T.float32(4), T.Select(r_a % 6 == 4 and vh % 4 == 1, T.float32(-2), T.Select(r_a % 6 == 4 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 3 and vh % 4 == 3, T.float32(0.125), T.Select(r_a % 6 == 3 and vh % 4 == 2, T.float32(0.25), T.Select(r_a % 6 == 3 and vh % 4 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 1, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 3, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 1, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 0 and vh % 4 == 3, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 5 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 0, T.float32(0), T.Select(r_b % 6 == 4 and vw % 4 == 3, T.float32(-8), T.Select(r_b % 6 == 4 and vw % 4 == 2, T.float32(4), T.Select(r_b % 6 == 4 and vw % 4 == 1, T.float32(-2), T.Select(r_b % 6 == 4 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 3 and vw % 4 == 3, T.float32(0.125), T.Select(r_b % 6 == 3 and vw % 4 == 2, T.float32(0.25), T.Select(r_b % 6 == 3 and vw % 4 == 1, T.float32(0.5), 
T.Select(r_b % 6 == 3 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 1, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 3, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 1, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 0 and vw % 4 == 3, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) | |
for i0_i1_i2_i3_fused_0 in T.thread_binding(6272, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for i0_i1_i2_i3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
with T.block("conv2d_winograd"): | |
n = T.axis.spatial(1, 0) | |
h = T.axis.spatial(56, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) // 3584) | |
w = T.axis.spatial(56, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) % 3584 // 64) | |
co = T.axis.spatial(64, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) % 64) | |
T.reads(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co], placeholder[n, 0, 0, co]) | |
T.writes(T_relu[n, h, w, co]) | |
T_relu[n, h, w, co] = T.max(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co] + placeholder[n, 0, 0, co], T.float32(0)) | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:139: Applied history best for: tvmgen_default_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:145: | |
# from tvm.script import tir as T | |
@T.prim_func | |
def func(placeholder: T.Buffer[(1, 56, 56, 64), "float32"], placeholder_1: T.Buffer[(1, 1, 1, 64), "float32"], placeholder_2: T.Buffer[(6, 6, 64, 64), "float32"], placeholder_3: T.Buffer[(1, 56, 56, 64), "float32"], T_relu: T.Buffer[(1, 56, 56, 64), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
input_tile_local = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="local") | |
data_pack = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
bgemm = T.alloc_buffer([6, 6, 196, 64], dtype="float32") | |
inverse = T.alloc_buffer([4, 4, 196, 64], dtype="float32") | |
bgemm_local = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="local") | |
data_pack_shared = T.alloc_buffer([6, 6, 196, 64], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([6, 6, 64, 64], dtype="float32", scope="shared") | |
for i2_0_i3_0_i2_1_i3_1_fused_0 in T.thread_binding(392, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}): | |
for i2_0_i3_0_i2_1_i3_1_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1): | |
with T.block("input_tile"): | |
eps, nu = T.axis.remap("SS", [ax0, ax1]) | |
p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) // 64) | |
ci = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 64 // 2 * 2 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 2) | |
T.reads(placeholder_3[p // 196, p % 196 // 14 * 4 + eps - 1, p % 14 * 4 + nu - 1, ci]) | |
T.writes(input_tile_local[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile_local[eps, nu, p, ci] = T.if_then_else(1 <= p % 196 // 14 * 4 + eps and p % 196 // 14 * 4 + eps < 57 and 1 <= p % 14 * 4 + nu and p % 14 * 4 + nu < 57, placeholder_3[p // 196, p % 196 // 14 * 4 + eps - 1, p % 14 * 4 + nu - 1, ci], T.float32(0), dtype="float32") | |
for i0 in T.unroll(6): | |
for i1 in T.unroll(6): | |
with T.block("data_pack_init"): | |
eps, nu = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) // 64) | |
ci = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 64 // 2 * 2 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 2) | |
T.reads() | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
for i4 in T.unroll(6): | |
for i5 in T.unroll(6): | |
with T.block("data_pack_update"): | |
eps, nu = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) // 64) | |
ci = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 64 // 2 * 2 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 2) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile_local[r_a, r_b, p, ci]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile_local[r_a, r_b, p, ci] * T.Select(r_a % 6 == 5 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 1, 
T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_b % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_b % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_b % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_b % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_b % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_b % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_b % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_b % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_b % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_b % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_b % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_b % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_b % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_b % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_b % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_b % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_b % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_b % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_b % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 2, T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 1, 
T.float32(0), T.Select(r_b % 6 == 0 and nu % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(18, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(224, thread="threadIdx.x"): | |
for i2_3_init, i3_3_init, i0_4_init, i2_4_init in T.grid(14, 2, 2, 2): | |
with T.block("bgemm_init"): | |
eps = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 6 * 2 + i0_4_init) | |
nu = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused % 6) | |
p = T.axis.spatial(196, i0_2_i1_2_i2_2_i3_2_fused // 32 * 28 + i2_3_init * 2 + i2_4_init) | |
co = T.axis.spatial(64, i0_2_i1_2_i2_2_i3_2_fused % 32 * 2 + i3_3_init) | |
T.reads() | |
T.writes(bgemm_local[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_2], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) | |
bgemm_local[eps, nu, p, co] = T.float32(0) | |
for i4_0 in T.serial(32): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(2): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(224, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("data_pack_shared"): | |
v0 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 6 * 2 + (ax0_ax1_ax2_ax3_fused_0 * 448 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) // 392) | |
v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused % 6) | |
v2 = T.axis.spatial(196, (ax0_ax1_ax2_ax3_fused_0 * 448 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) % 392 // 2) | |
v3 = T.axis.spatial(64, i4_0 * 2 + (ax0_ax1_ax2_ax3_fused_0 * 448 + ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) % 2) | |
T.where((ax0_ax1_ax2_ax3_fused_0 * 224 + ax0_ax1_ax2_ax3_fused_1) * 2 + ax0_ax1_ax2_ax3_fused_2 < 784) | |
T.reads(data_pack[v0, v1, v2, v3]) | |
T.writes(data_pack_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
data_pack_shared[v0, v1, v2, v3] = data_pack[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(1): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(224, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 6 * 2 + (ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) // 128) | |
v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused % 6) | |
v2 = T.axis.spatial(64, (ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) % 128 // 2) | |
v3 = T.axis.spatial(64, i4_0 * 2 + (ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) % 2) | |
T.where(ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2 < 256) | |
T.reads(placeholder_2[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_2[v0, v1, v2, v3] | |
for i4_1, i0_3, i1_3, i2_3, i3_3, i4_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 14, 2, 2, 2, 1, 2, 1): | |
with T.block("bgemm_update"): | |
eps = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 6 * 2 + i0_4) | |
nu = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused % 6) | |
p = T.axis.spatial(196, i0_2_i1_2_i2_2_i3_2_fused // 32 * 28 + i2_3 * 2 + i2_4) | |
co = T.axis.spatial(64, i0_2_i1_2_i2_2_i3_2_fused % 32 * 2 + i3_3) | |
ci = T.axis.reduce(64, i4_0 * 2 + i4_2) | |
T.reads(bgemm_local[eps, nu, p, co], data_pack_shared[eps, nu, p, ci], placeholder_shared[eps, nu, co, ci]) | |
T.writes(bgemm_local[eps, nu, p, co]) | |
T.block_attr({"layout_free_placeholders":[placeholder_2], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) | |
bgemm_local[eps, nu, p, co] = bgemm_local[eps, nu, p, co] + data_pack_shared[eps, nu, p, ci] * placeholder_shared[eps, nu, co, ci] | |
for ax0, ax1, ax2, ax3 in T.grid(2, 1, 28, 2): | |
with T.block("bgemm_local"): | |
v0 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused // 6 * 2 + ax0) | |
v1 = T.axis.spatial(6, i0_0_i1_0_i2_0_i3_0_fused % 6 + ax1) | |
v2 = T.axis.spatial(196, i0_2_i1_2_i2_2_i3_2_fused // 32 * 28 + ax2) | |
v3 = T.axis.spatial(64, i0_2_i1_2_i2_2_i3_2_fused % 32 * 2 + ax3) | |
T.reads(bgemm_local[v0, v1, v2, v3]) | |
T.writes(bgemm[v0, v1, v2, v3]) | |
bgemm[v0, v1, v2, v3] = bgemm_local[v0, v1, v2, v3] | |
for i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 in T.thread_binding(6272, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}): | |
for i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
with T.block("inverse_init"): | |
vh = T.axis.spatial(4, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 16 // 4) | |
vw = T.axis.spatial(4, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 4) | |
p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) // 1024) | |
co = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 1024 // 16) | |
T.reads() | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
inverse[vh, vw, p, co] = T.float32(0) | |
for i4 in T.unroll(6): | |
for i5 in T.unroll(6): | |
with T.block("inverse_update"): | |
vh = T.axis.spatial(4, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 16 // 4) | |
vw = T.axis.spatial(4, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 4) | |
p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) // 1024) | |
co = T.axis.spatial(64, (i2_0_i3_0_i2_1_i3_1_i0_i1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_i0_i1_fused_1) % 1024 // 16) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(inverse[vh, vw, p, co], bgemm[r_a, r_b, p, co]) | |
T.writes(inverse[vh, vw, p, co]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["vh", "vw", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_inverse"}) | |
inverse[vh, vw, p, co] = inverse[vh, vw, p, co] + bgemm[r_a, r_b, p, co] * T.Select(r_a % 6 == 5 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 5 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 5 and vh % 4 == 0, T.float32(0), T.Select(r_a % 6 == 4 and vh % 4 == 3, T.float32(-8), T.Select(r_a % 6 == 4 and vh % 4 == 2, T.float32(4), T.Select(r_a % 6 == 4 and vh % 4 == 1, T.float32(-2), T.Select(r_a % 6 == 4 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 3 and vh % 4 == 3, T.float32(0.125), T.Select(r_a % 6 == 3 and vh % 4 == 2, T.float32(0.25), T.Select(r_a % 6 == 3 and vh % 4 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 3, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 1, T.float32(1), T.Select(r_a % 6 == 2 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 3, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 2, T.float32(1), T.Select(r_a % 6 == 1 and vh % 4 == 1, T.float32(-1), T.Select(r_a % 6 == 1 and vh % 4 == 0, T.float32(1), T.Select(r_a % 6 == 0 and vh % 4 == 3, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 2, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 1, T.float32(0), T.Select(r_a % 6 == 0 and vh % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) * T.Select(r_b % 6 == 5 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 5 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 5 and vw % 4 == 0, T.float32(0), T.Select(r_b % 6 == 4 and vw % 4 == 3, T.float32(-8), T.Select(r_b % 6 == 4 and vw % 4 == 2, T.float32(4), T.Select(r_b % 6 == 4 and vw % 4 == 1, T.float32(-2), T.Select(r_b % 6 == 4 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 3 and vw % 4 == 3, T.float32(0.125), T.Select(r_b % 6 == 3 and vw % 4 == 2, T.float32(0.25), T.Select(r_b % 6 == 3 and vw % 4 == 1, T.float32(0.5), 
T.Select(r_b % 6 == 3 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 3, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 1, T.float32(1), T.Select(r_b % 6 == 2 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 3, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 2, T.float32(1), T.Select(r_b % 6 == 1 and vw % 4 == 1, T.float32(-1), T.Select(r_b % 6 == 1 and vw % 4 == 0, T.float32(1), T.Select(r_b % 6 == 0 and vw % 4 == 3, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 2, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 1, T.float32(0), T.Select(r_b % 6 == 0 and vw % 4 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))) | |
for i0_i1_i2_i3_fused_0 in T.thread_binding(6272, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}): | |
for i0_i1_i2_i3_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
with T.block("conv2d_winograd"): | |
n = T.axis.spatial(1, 0) | |
h = T.axis.spatial(56, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) // 3584) | |
w = T.axis.spatial(56, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) % 3584 // 64) | |
co = T.axis.spatial(64, (i0_i1_i2_i3_fused_0 * 32 + i0_i1_i2_i3_fused_1) % 64) | |
T.reads(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co], placeholder_1[n, 0, 0, co], placeholder[n, h, w, co]) | |
T.writes(T_relu[n, h, w, co]) | |
T_relu[n, h, w, co] = T.max(inverse[h % 4, w % 4, n * 196 + h // 4 * 14 + w // 4, co] + placeholder_1[n, 0, 0, co] + placeholder[n, h, w, co], T.float32(0)) | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:139: Applied history best for: tvmgen_default_fused_nn_conv2d_add_nn_relu_1 | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:145: | |
# from tvm.script import tir as T | |
@T.prim_func | |
def func(placeholder: T.Buffer[(1, 1, 1, 128), "float32"], placeholder_1: T.Buffer[(3, 3, 64, 128), "float32"], placeholder_2: T.Buffer[(1, 56, 56, 64), "float32"], T_relu: T.Buffer[(1, 28, 28, 128), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 28, 28, 128], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 58, 58, 64], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([3, 3, 64, 128], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(56, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(56, thread="threadIdx.x"): | |
for i3_3_init, i1_4_init, i3_4_init in T.grid(8, 2, 2): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused // 4 * 2 + i1_4_init) | |
xx = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused % 4 // 2 * 14 + i0_2_i1_2_i2_2_i3_2_fused // 4) | |
ff = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_fused % 2 * 64 + i0_2_i1_2_i2_2_i3_2_fused % 4 * 16 + i3_3_init * 2 + i3_4_init) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 56, 56, 64], "float32"], ["TENSOR", [3, 3, 64, 128], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i4_0, i5_0, i6_0 in T.grid(3, 3, 1): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(24): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(56, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(4): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(58, i0_0_i1_0_i2_0_i3_0_fused // 4 * 4 + i4_0 + ((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2) % 5184 // 1728) | |
v2 = T.axis.spatial(58, i0_0_i1_0_i2_0_i3_0_fused % 4 // 2 * 28 + i5_0 + ((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2) % 1728 // 64) | |
v3 = T.axis.spatial(64, ((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2) % 64) | |
T.where((ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) * 4 + ax0_ax1_ax2_ax3_fused_2 < 5184) | |
T.reads(placeholder_2[v0, v1 - 1, v2 - 1, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":4}) | |
pad_temp_shared[v0, v1, v2, v3] = T.if_then_else(1 <= v1 and v1 < 57 and 1 <= v2 and v2 < 57, placeholder_2[v0, v1 - 1, v2 - 1, v3], T.float32(0), dtype="float32") | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(74): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(56, thread="threadIdx.x"): | |
with T.block("placeholder_shared"): | |
v0, v1 = T.axis.remap("SS", [i4_0, i5_0]) | |
v2 = T.axis.spatial(64, (ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) // 64) | |
v3 = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_fused % 2 * 64 + (ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1) % 64) | |
T.where(ax0_ax1_ax2_ax3_fused_0 * 56 + ax0_ax1_ax2_ax3_fused_1 < 4096) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":1}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 1, 1, 1, 8, 1, 1, 64, 1, 2, 1, 2): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused // 4 * 2 + i1_4) | |
xx = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused % 4 // 2 * 14 + i0_2_i1_2_i2_2_i3_2_fused // 4) | |
ff = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_fused % 2 * 64 + i0_2_i1_2_i2_2_i3_2_fused % 4 * 16 + i3_3 * 2 + i3_4) | |
ry, rx, rc = T.axis.remap("RRR", [i4_0, i5_0, i6_2]) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 56, 56, 64], "float32"], ["TENSOR", [3, 3, 64, 128], "float32"], [2, 2], [1, 1, 1, 1], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 2, 1, 16): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused // 4 * 2 + ax1) | |
v2 = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused % 4 // 2 * 14 + i0_2_i1_2_i2_2_i3_2_fused // 4 + ax2) | |
v3 = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_fused % 2 * 64 + i0_2_i1_2_i2_2_i3_2_fused % 4 * 16 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_relu[v0, v1, v2, v3]) | |
T_relu[v0, v1, v2, v3] = T.max(conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3], T.float32(0)) | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:139: Applied history best for: tvmgen_default_fused_nn_conv2d_add | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:145: | |
# from tvm.script import tir as T | |
@T.prim_func | |
def func(placeholder: T.Buffer[(1, 1, 1, 128), "float32"], placeholder_1: T.Buffer[(1, 1, 64, 128), "float32"], placeholder_2: T.Buffer[(1, 56, 56, 64), "float32"], T_add: T.Buffer[(1, 28, 28, 128), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
conv2d_nhwc_local = T.alloc_buffer([1, 28, 28, 128], dtype="float32", scope="local") | |
pad_temp_shared = T.alloc_buffer([1, 56, 56, 64], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([1, 1, 64, 128], dtype="float32", scope="shared") | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(56, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(4, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(224, thread="threadIdx.x"): | |
for i4_0, i5_0 in T.grid(1, 1): | |
for i3_3_init in T.serial(2): | |
with T.block("conv2d_nhwc_init"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused // 14 * 7 + i0_2_i1_2_i2_2_i3_2_fused // 32) | |
xx = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused % 14 // 2 * 4 + i0_2_i1_2_i2_2_i3_2_fused % 32 // 8) | |
ff = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_fused % 2 * 64 + i0_1_i1_1_i2_1_i3_1_fused * 16 + i0_2_i1_2_i2_2_i3_2_fused % 8 * 2 + i3_3_init) | |
T.reads() | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 56, 56, 64], "float32"], ["TENSOR", [1, 1, 64, 128], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = T.float32(0) | |
for i6_0 in T.serial(16): | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(1): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(224, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(3): | |
with T.block("pad_temp_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(56, i0_0_i1_0_i2_0_i3_0_fused // 14 * 14 + (ax0_ax1_ax2_ax3_fused_1 * 3 + ax0_ax1_ax2_ax3_fused_2) % 364 // 28) | |
v2 = T.axis.spatial(56, i0_0_i1_0_i2_0_i3_0_fused % 14 // 2 * 8 + (ax0_ax1_ax2_ax3_fused_1 * 3 + ax0_ax1_ax2_ax3_fused_2) % 28 // 4) | |
v3 = T.axis.spatial(64, i6_0 * 4 + (ax0_ax1_ax2_ax3_fused_1 * 3 + ax0_ax1_ax2_ax3_fused_2) % 4) | |
T.where(ax0_ax1_ax2_ax3_fused_1 * 3 + ax0_ax1_ax2_ax3_fused_2 < 364) | |
T.reads(placeholder_2[v0, v1, v2, v3]) | |
T.writes(pad_temp_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":3}) | |
pad_temp_shared[v0, v1, v2, v3] = placeholder_2[v0, v1, v2, v3] | |
for ax0_ax1_ax2_ax3_fused_0 in T.serial(1): | |
for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(224, thread="threadIdx.x"): | |
for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(2): | |
with T.block("placeholder_shared"): | |
v0 = T.axis.spatial(1, 0) | |
v1 = T.axis.spatial(1, 0) | |
v2 = T.axis.spatial(64, i6_0 * 4 + (ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) // 64) | |
v3 = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_fused % 2 * 64 + (ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2) % 64) | |
T.where(ax0_ax1_ax2_ax3_fused_1 * 2 + ax0_ax1_ax2_ax3_fused_2 < 256) | |
T.reads(placeholder_1[v0, v1, v2, v3]) | |
T.writes(placeholder_shared[v0, v1, v2, v3]) | |
T.block_attr({"meta_schedule.cooperative_fetch":2}) | |
placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] | |
for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1): | |
with T.block("conv2d_nhwc_update"): | |
nn = T.axis.spatial(1, 0) | |
yy = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused // 14 * 7 + i0_2_i1_2_i2_2_i3_2_fused // 32) | |
xx = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused % 14 // 2 * 4 + i0_2_i1_2_i2_2_i3_2_fused % 32 // 8) | |
ff = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_fused % 2 * 64 + i0_1_i1_1_i2_1_i3_1_fused * 16 + i0_2_i1_2_i2_2_i3_2_fused % 8 * 2 + i3_3) | |
ry = T.axis.reduce(1, 0) | |
rx = T.axis.reduce(1, 0) | |
rc = T.axis.reduce(64, i6_0 * 4 + i6_1) | |
T.reads(conv2d_nhwc_local[nn, yy, xx, ff], pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc], placeholder_shared[ry, rx, rc, ff]) | |
T.writes(conv2d_nhwc_local[nn, yy, xx, ff]) | |
T.block_attr({"layout_free_placeholders":[placeholder_1], "meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS", "workload":["conv2d_nhwc.gpu", ["TENSOR", [1, 56, 56, 64], "float32"], ["TENSOR", [1, 1, 64, 128], "float32"], [2, 2], [0, 0, 0, 0], [1, 1], "float32"]}) | |
conv2d_nhwc_local[nn, yy, xx, ff] = conv2d_nhwc_local[nn, yy, xx, ff] + pad_temp_shared[nn, yy * 2 + ry, xx * 2 + rx, rc] * placeholder_shared[ry, rx, rc, ff] | |
for ax0, ax1, ax2, ax3 in T.grid(1, 1, 1, 2): | |
with T.block("conv2d_nhwc_local"): | |
v0 = T.axis.spatial(1, ax0) | |
v1 = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused // 14 * 7 + i0_2_i1_2_i2_2_i3_2_fused // 32 + ax1) | |
v2 = T.axis.spatial(28, i0_0_i1_0_i2_0_i3_0_fused % 14 // 2 * 4 + i0_2_i1_2_i2_2_i3_2_fused % 32 // 8 + ax2) | |
v3 = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_fused % 2 * 64 + i0_1_i1_1_i2_1_i3_1_fused * 16 + i0_2_i1_2_i2_2_i3_2_fused % 8 * 2 + ax3) | |
T.reads(conv2d_nhwc_local[v0, v1, v2, v3], placeholder[v0, 0, 0, v3]) | |
T.writes(T_add[v0, v1, v2, v3]) | |
T_add[v0, v1, v2, v3] = conv2d_nhwc_local[v0, v1, v2, v3] + placeholder[v0, 0, 0, v3] | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:139: Applied history best for: tvmgen_default_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_add_nn_relu_1 | |
[14:41:53] /home/zxybazh/tvm-tensorir/src/meta_schedule/integration.cc:145: | |
# from tvm.script import tir as T | |
@T.prim_func | |
def func(placeholder: T.Buffer[(1, 28, 28, 128), "float32"], placeholder_1: T.Buffer[(1, 1, 1, 128), "float32"], placeholder_2: T.Buffer[(4, 4, 128, 128), "float32"], placeholder_3: T.Buffer[(1, 28, 28, 128), "float32"], T_relu: T.Buffer[(1, 28, 28, 128), "float32"]) -> None: | |
# function attr dict | |
T.func_attr({"global_symbol": "main", "tir.noalias": True}) | |
# body | |
# with T.block("root") | |
input_tile_local = T.alloc_buffer([4, 4, 196, 128], dtype="float32", scope="local") | |
data_pack = T.alloc_buffer([4, 4, 196, 128], dtype="float32") | |
bgemm = T.alloc_buffer([4, 4, 196, 128], dtype="float32") | |
inverse = T.alloc_buffer([2, 2, 196, 128], dtype="float32") | |
bgemm_local = T.alloc_buffer([4, 4, 196, 128], dtype="float32", scope="local") | |
data_pack_shared = T.alloc_buffer([4, 4, 196, 128], dtype="float32", scope="shared") | |
placeholder_shared = T.alloc_buffer([4, 4, 128, 128], dtype="float32", scope="shared") | |
for i2_0_i3_0_i2_1_i3_1_fused_0 in T.thread_binding(784, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for i2_0_i3_0_i2_1_i3_1_fused_1 in T.thread_binding(32, thread="threadIdx.x"): | |
for ax0, ax1, ax2, ax3 in T.grid(4, 4, 1, 1): | |
with T.block("input_tile"): | |
eps, nu = T.axis.remap("SS", [ax0, ax1]) | |
p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) // 6272 * 49 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 392 // 8) | |
ci = T.axis.spatial(128, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 6272 // 392 * 8 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 8) | |
T.reads(placeholder_3[p // 196, p % 196 // 14 * 2 + eps - 1, p % 14 * 2 + nu - 1, ci]) | |
T.writes(input_tile_local[eps, nu, p, ci]) | |
T.block_attr({"schedule_rule":"None"}) | |
input_tile_local[eps, nu, p, ci] = T.if_then_else(1 <= p % 196 // 14 * 2 + eps and p % 196 // 14 * 2 + eps < 29 and 1 <= p % 14 * 2 + nu and p % 14 * 2 + nu < 29, placeholder_3[p // 196, p % 196 // 14 * 2 + eps - 1, p % 14 * 2 + nu - 1, ci], T.float32(0), dtype="float32") | |
for i0 in T.unroll(4): | |
for i1 in T.unroll(4): | |
with T.block("data_pack_init"): | |
eps, nu = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) // 6272 * 49 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 392 // 8) | |
ci = T.axis.spatial(128, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 6272 // 392 * 8 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 8) | |
T.reads() | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
data_pack[eps, nu, p, ci] = T.float32(0) | |
for i4 in T.unroll(4): | |
for i5 in T.unroll(4): | |
with T.block("data_pack_update"): | |
eps, nu = T.axis.remap("SS", [i0, i1]) | |
p = T.axis.spatial(196, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) // 6272 * 49 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 392 // 8) | |
ci = T.axis.spatial(128, (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 6272 // 392 * 8 + (i2_0_i3_0_i2_1_i3_1_fused_0 * 32 + i2_0_i3_0_i2_1_i3_1_fused_1) % 8) | |
r_a, r_b = T.axis.remap("RR", [i4, i5]) | |
T.reads(data_pack[eps, nu, p, ci], input_tile_local[r_a, r_b, p, ci]) | |
T.writes(data_pack[eps, nu, p, ci]) | |
T.block_attr({"auto_scheduler_simplify_const_tensor_indices":["eps", "nu", "r_a", "r_b"], "schedule_rule":"meta_schedule.winograd_data_pack.cuda"}) | |
data_pack[eps, nu, p, ci] = data_pack[eps, nu, p, ci] + input_tile_local[r_a, r_b, p, ci] * T.Select(r_a % 4 == 3 and eps % 4 == 3, T.float32(1), T.Select(r_a % 4 == 3 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 3 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 2 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 1, T.float32(1), T.Select(r_a % 4 == 2 and eps % 4 == 0, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 3, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 2, T.float32(1), T.Select(r_a % 4 == 1 and eps % 4 == 1, T.float32(-1), T.Select(r_a % 4 == 1 and eps % 4 == 0, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 3, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 2, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 1, T.float32(0), T.Select(r_a % 4 == 0 and eps % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) * T.Select(r_b % 4 == 3 and nu % 4 == 3, T.float32(1), T.Select(r_b % 4 == 3 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 3 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 2 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 1, T.float32(1), T.Select(r_b % 4 == 2 and nu % 4 == 0, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 3, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 2, T.float32(1), T.Select(r_b % 4 == 1 and nu % 4 == 1, T.float32(-1), T.Select(r_b % 4 == 1 and nu % 4 == 0, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 3, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 2, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 1, T.float32(0), T.Select(r_b % 4 == 0 and nu % 4 == 0, T.float32(1), T.float32(0))))))))))))))))) | |
for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(28, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): | |
for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(1, thread="vthread.x"): | |
for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(896, thread="threadIdx.x"): | |
for i2_3_init, i3_3_init, i2_4_init in T.grid(2, 4, 2): | |
with T.block("bgemm_init"): | |
eps = T.axis.spatial(4, i0_2_i1_2 |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.