Yizhi Liu yzhliu

## benchmark_fp8_tflops.py
import argparse

import torch


pt_dtype_mappings = {
    "float": torch.float,
    "half": torch.half,
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,

## test_pyobj_compile.py
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#

## arm_benchmark.py
import numpy as np

import tvm
from tvm.contrib import graph_runtime
from tvm.contrib.util import tempdir
from tvm import autotvm
from tvm import relay
import tvm.relay.testing

#import mxnet

## normal.cu
extern "C" __global__ void tvmop_kernel0( float* __restrict__ buffer,  float* __restrict__ buffer1,  float* __restrict__ buffer2, int tindex, int tindex1, int tindex2, int stride, int stride1, int stride2, int stride3, int stride4, int stride5, int stride6, int stride7, int stride8) {
  if (((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex) / tindex1) < tindex2) {
    if (((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex) % tindex1) < tindex1) {
      if ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) % tindex) < tindex) {
        if ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex) < (tindex2 * tindex1)) {
          if (((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) < ((tindex2 * tindex1) * tindex)) {
            if (((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex) / tindex1) < tindex2) {
              if (0 <= ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex) % tindex1)) {
                if (((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex) % ti

## slow.cu
extern "C" __global__ void tvmop_kernel0( float* __restrict__ buffer,  float* __restrict__ buffer1,  float* __restrict__ buffer2, int tindex, int tindex1, int tindex2, int stride, int stride1, int stride2, int stride3, int stride4, int stride5, int stride6, int stride7, int stride8) {
  if (((int)blockIdx.x) < (((tindex * tindex1) * tindex2) >> 6)) {
    if (((((tindex1 >= 0) && ((((((tindex2 >= 0) && ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) % tindex2) >= 0)) || ((tindex2 < 0) && ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) % tindex2) <= 0))) ? (((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex2) : ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex2) - 1)) % tindex1) >= 0)) || ((tindex1 < 0) && ((((((tindex2 >= 0) && ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) % tindex2) >= 0)) || ((tindex2 < 0) && ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) % tindex2) <= 0))) ? (((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex2) : ((((((int)blockIdx.x) * 64) + ((int)threadIdx.

## arm_fp16.py
import tvm
import topi
from topi.util import get_const_tuple
import numpy as np

from topi.nn.pad import pad

# on a2: python3 -m tvm.exec.rpc_server --port=8499
# target = 'llvm -mcpu=core-avx2'
# target = 'llvm -device=arm_cpu -target=aarch64-linux-gnu -mattr=+v8.4a,+fp16fml,+fullfp16'

## reproduce_mali_patch.diff
diff --git a/src/arithmetic/const_fold.h b/src/arithmetic/const_fold.h
index fbf8fe7e..1c397f40 100644
--- a/src/arithmetic/const_fold.h
+++ b/src/arithmetic/const_fold.h
@@ -101,33 +101,28 @@ inline bool IsIndexType(const Type& type) {
 // specialization of constant folders.
 template<>
 inline Expr TryConstFold<ir::Add>(Expr a, Expr b) {
-  TVM_ARITH_CONST_PROPAGATION({
+  TVM_INDEX_CONST_PROPAGATION({

## mobilenet_x86_arm.log
{"i": ["llvm -mcpu=skylake-avx512", "topi_nn_conv2d", [["TENSOR", [1, 1024, 7, 7], "float32"], ["TENSOR", [1024, 1024, 1, 1], "float32"], [1, 1], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 1024, 7, 7, "float32"], [1024, 1024, 1, 1, "float32"], [1, 1], [0, 0], "NCHW", "float32"], {"i": 417, "c": null, "e": [["tile_ic", "sp", [1, 1024]], ["tile_oc", "sp", [64, 16]], ["tile_ow", "sp", [1, 7]], ["tile_oh", "ot", 2]], "t": "direct"}], "r": [[0.00010386519659715739], 0, 1.2961008548736572, 1541133775.742406], "v": 0.1}
{"i": ["llvm -mcpu=skylake-avx512", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [1, 1024, 7, 7], "float32"], ["TENSOR", [1024, 1, 3, 3], "float32"], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [1, 1024, 7, 7, "float32"], [1024, 1, 3, 3, "float32"], [1, 1], [1, 1], "float32"], {"i": 777271, "c": null, "e": [["tile_co", "sp", [1024, 1]], ["tile_oh", "sp", [1, 7]], ["tile_ow", "sp", [7, 1]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 8, 6, 7]], ["reorder_1", "re", [0, 1, 2, 3, 4, 5, 6]],

## nohup_autotvm_mobilenet.log
Extract tasks...
Tuning...
[Task  1/19]  Current/Best:    0.00/  79.93 GFLOPS | Progress: (484/1000) | 2325.08 s Done.
[Task  2/19]  Current/Best:    7.72/  13.22 GFLOPS | Progress: (756/1000) | 1381.61 s Done.
[Task  3/19]  Current/Best:   10.29/  80.49 GFLOPS | Progress: (440/1000) | 1910.56 s Done.
[Task  4/19]  Current/Best:    2.18/   6.38 GFLOPS | Progress: (1000/1000) | 1627.55 s Done.
[Task  5/19]  Current/Best:   13.69/ 116.10 GFLOPS | Progress: (504/1000) | 1762.37 s Done.
[Task  6/19]  Current/Best:    8.73/   8.73 GFLOPS | Progress: (36/1000) | 66.31 sLLVM ERROR: Cannot select: 0x56385e2c8368: i32 = X86ISD::CMP 0x56385e2ca1e8, 0x56385e2c9550
  0x56385e2ca1e8: v16i1 = and 0x56385e202548, 0x56385e213948
    0x56385e202548: v16i1 = bitcast 0x56385e202a28

## mobilenet.log
{"i": ["llvm -mcpu=skylake-avx512", "topi_nn_conv2d", [["TENSOR", [1, 1024, 7, 7], "float32"], ["TENSOR", [1024, 1024, 1, 1], "float32"], [1, 1], [0, 0], "NCHW", "float32"], {}, ["conv2d", [1, 1024, 7, 7, "float32"], [1024, 1024, 1, 1, "float32"], [1, 1], [0, 0], "NCHW", "float32"], {"i": 416, "c": null, "e": [["tile_ic", "sp", [2, 512]], ["tile_oc", "sp", [64, 16]], ["tile_ow", "sp", [1, 7]], ["tile_oh", "ot", 2]], "t": "direct"}], "r": [[0.0012855558738853504], 0, 1.3173747062683105, 1541097220.10334], "v": 0.1}
{"i": ["llvm -mcpu=skylake-avx512", "topi_nn_depthwise_conv2d_nchw", [["TENSOR", [1, 1024, 7, 7], "float32"], ["TENSOR", [1024, 1, 3, 3], "float32"], [1, 1], [1, 1], "float32"], {}, ["depthwise_conv2d_nchw", [1, 1024, 7, 7, "float32"], [1024, 1, 3, 3, "float32"], [1, 1], [1, 1], "float32"], {"i": 354673, "c": null, "e": [["tile_co", "sp", [1024, 1]], ["tile_oh", "sp", [1, 7]], ["tile_ow", "sp", [1, 7]], ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 6, 7, 8]], ["reorder_1", "re", [0, 1, 2, 3, 6, 4, 5]], ["a
	import argparse

	import torch


	pt_dtype_mappings = {
	"float": torch.float,
	"half": torch.half,
	"float16": torch.float16,
	"bfloat16": torch.bfloat16,
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	import numpy as np

	import tvm
	from tvm.contrib import graph_runtime
	from tvm.contrib.util import tempdir
	from tvm import autotvm
	from tvm import relay
	import tvm.relay.testing

	#import mxnet
	extern "C" __global__ void tvmop_kernel0( float* __restrict__ buffer, float* __restrict__ buffer1, float* __restrict__ buffer2, int tindex, int tindex1, int tindex2, int stride, int stride1, int stride2, int stride3, int stride4, int stride5, int stride6, int stride7, int stride8) {
	if (((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex) / tindex1) < tindex2) {
	if (((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex) % tindex1) < tindex1) {
	if ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) % tindex) < tindex) {
	if ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex) < (tindex2 * tindex1)) {
	if (((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) < ((tindex2 * tindex1) * tindex)) {
	if (((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex) / tindex1) < tindex2) {
	if (0 <= ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex) % tindex1)) {
	if (((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex) % ti
	extern "C" __global__ void tvmop_kernel0( float* __restrict__ buffer, float* __restrict__ buffer1, float* __restrict__ buffer2, int tindex, int tindex1, int tindex2, int stride, int stride1, int stride2, int stride3, int stride4, int stride5, int stride6, int stride7, int stride8) {
	if (((int)blockIdx.x) < (((tindex * tindex1) * tindex2) >> 6)) {
	if (((((tindex1 >= 0) && ((((((tindex2 >= 0) && ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) % tindex2) >= 0)) \|\| ((tindex2 < 0) && ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) % tindex2) <= 0))) ? (((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex2) : ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex2) - 1)) % tindex1) >= 0)) \|\| ((tindex1 < 0) && ((((((tindex2 >= 0) && ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) % tindex2) >= 0)) \|\| ((tindex2 < 0) && ((((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) % tindex2) <= 0))) ? (((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) / tindex2) : ((((((int)blockIdx.x) * 64) + ((int)threadIdx.
	import tvm
	import topi
	from topi.util import get_const_tuple
	import numpy as np

	from topi.nn.pad import pad

	# on a2: python3 -m tvm.exec.rpc_server --port=8499
	# target = 'llvm -mcpu=core-avx2'
	# target = 'llvm -device=arm_cpu -target=aarch64-linux-gnu -mattr=+v8.4a,+fp16fml,+fullfp16'
	diff --git a/src/arithmetic/const_fold.h b/src/arithmetic/const_fold.h
	index fbf8fe7e..1c397f40 100644
	--- a/src/arithmetic/const_fold.h
	+++ b/src/arithmetic/const_fold.h
	@@ -101,33 +101,28 @@ inline bool IsIndexType(const Type& type) {
	// specialization of constant folders.
	template<>
	inline Expr TryConstFold<ir::Add>(Expr a, Expr b) {
	- TVM_ARITH_CONST_PROPAGATION({
	+ TVM_INDEX_CONST_PROPAGATION({
	Extract tasks...
	Tuning...
	[Task 1/19] Current/Best: 0.00/ 79.93 GFLOPS \| Progress: (484/1000) \| 2325.08 s Done.
	[Task 2/19] Current/Best: 7.72/ 13.22 GFLOPS \| Progress: (756/1000) \| 1381.61 s Done.
	[Task 3/19] Current/Best: 10.29/ 80.49 GFLOPS \| Progress: (440/1000) \| 1910.56 s Done.
	[Task 4/19] Current/Best: 2.18/ 6.38 GFLOPS \| Progress: (1000/1000) \| 1627.55 s Done.
	[Task 5/19] Current/Best: 13.69/ 116.10 GFLOPS \| Progress: (504/1000) \| 1762.37 s Done.
	[Task 6/19] Current/Best: 8.73/ 8.73 GFLOPS \| Progress: (36/1000) \| 66.31 sLLVM ERROR: Cannot select: 0x56385e2c8368: i32 = X86ISD::CMP 0x56385e2ca1e8, 0x56385e2c9550
	0x56385e2ca1e8: v16i1 = and 0x56385e202548, 0x56385e213948
	0x56385e202548: v16i1 = bitcast 0x56385e202a28