@JoeyTPChou
Created January 6, 2020 22:10
Autotuning warnings/errors on RK3399 Mali using RPC.
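The script below drives autotvm over RPC, so it assumes an RPC tracker is already listening on _LOCAL_IP:_PORT (0.0.0.0:9190 here) and that the RK3399 board has registered itself under the device key 'rk3399'. As a quick sanity check before launching the tuning, a minimal sketch along these lines (not part of the original gist; it only uses TVM's public RPC API) can confirm the board is visible to the tracker:

    # Hedged sketch: query the tracker the script expects at 0.0.0.0:9190.
    from tvm import rpc

    tracker = rpc.connect_tracker('0.0.0.0', 9190)
    # The summary should list a free device under the key 'rk3399';
    # otherwise the RPCRunner used during tuning will simply time out.
    print(tracker.text_summary())
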
#! /usr/bin/env python3
import tvm
import tvm.relay as relay
import tvm.contrib.graph_runtime as runtime
from tvm import autotvm
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.util import tempdir
import os
import numpy as np
import logging
import pprint
from keras.models import Sequential
from keras.layers import Conv2D
_LOCAL_IP = '0.0.0.0'
_PORT = 9190


def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True,
               try_winograd=True):
    if try_winograd:
        for i in range(len(tasks)):
            try:  # try winograd template
                tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
                                          tasks[i].target, tasks[i].target_host, 'winograd')
                tasks.append(tsk)
            except Exception:
                pass

    # Create tmp log file
    tmp_log_file = log_filename + ".tmp"
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    for i, tsk in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # Create tuner
        if tuner == 'xgb' or tuner == 'xgb-rank':
            tuner_obj = XGBTuner(tsk, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(tsk, pop_size=50)
        elif tuner == 'random':
            tuner_obj = RandomTuner(tsk)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(tsk)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        if use_transfer_learning:
            if os.path.isfile(tmp_log_file):
                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

        # Do tuning
        n_trial = min(n_trial, len(tsk.config_space))
        tuner_obj.tune(n_trial=n_trial,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
                           autotvm.callback.log_to_file(tmp_log_file)])

    # Pick best records to a cache file
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)


def tune_and_evaluate(model_name,
                      func,
                      params,
                      input_shape,
                      use_mali=False,
                      use_android=False):
    # Replace this with the device key in your tracker
    device_key = 'rk3399'

    if use_mali:
        target = tvm.target.create('opencl -model=rk3399 -device=mali')
        log_prefix = device_key + '_mali'
    else:
        target = tvm.target.arm_cpu('rk3399')
        log_prefix = device_key + '_cpu'

    # Replace "aarch64-linux-gnu" with the correct target of your board.
    # This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device.
    target_host = 'llvm -target=aarch64-linux-gnu'

    log_file = "%s.%s.log" % (log_prefix, model_name)
    dtype = 'float32'

    # Get tuning options
    tuning_opt = {
        'log_filename': log_file,
        'tuner': 'xgb',
        'n_trial': 1000,
        'early_stopping': 600,
        'measure_option': autotvm.measure_option(
            builder=autotvm.LocalBuilder(
                build_func='ndk' if use_android else 'default'),
            runner=autotvm.RPCRunner(
                device_key, host=_LOCAL_IP, port=_PORT,
                number=10,
                timeout=5,
            ),
        ),
    }

    # Extract workloads from relay program
    print("Extract tasks...")
    tasks = autotvm.task.extract_from_program(func["main"],
                                              target=target,
                                              target_host=target_host,
                                              params=params, ops=(relay.op.nn.conv2d,))
    print("Total {0} layers to be tuned ...".format(len(tasks)))
    pprint.pprint(tasks)

    tune_tasks(tasks, **tuning_opt)

    # Compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                func, target=target, params=params, target_host=target_host)

        # Export library
        tmp = tempdir()
        if use_android:
            from tvm.contrib import ndk
            filename = "net.so"
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

        # Upload module to device
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key, _LOCAL_IP, _PORT,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # Upload parameters to device
        ctx = remote.context(str(target), 0)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('input_1', data_tvm)
        module.set_input(**params)

        # Evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=30)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))


def _main_():
    # Use mali
    use_mali = True

    ###############################
    # Create a one-layer keras model
    ###############################
    """ This one-layer model causes a lot of warning messages
    """
    input_shape = (8, 8, 32)
    keras_model = Sequential([
        Conv2D(input_shape=input_shape, filters=32, kernel_size=1, padding='SAME')
    ])
    model_name = 'error_model'
    input_name = keras_model.input_names[0]

    ###############################
    # Prepare data
    ###############################
    data = np.ones(input_shape, dtype=np.float32)
    data = np.array(data)[np.newaxis, :]
    data = data.transpose([0, 3, 1, 2])  # NCHW

    ###############################
    # Compile tvm model
    ###############################
    # Prepare input data
    shape_dict = {input_name: data.shape}
    func, params = relay.frontend.from_keras(keras_model, shape_dict)

    ###############################
    # Auto Tune
    ###############################
    # Begin tuning
    tune_and_evaluate(model_name=model_name,
                      func=func,
                      params=params,
                      input_shape=shape_dict[input_name],
                      use_mali=use_mali)


if __name__ == "__main__":
    _main_()
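
For completeness: before the script can request a remote under device_key 'rk3399', the board itself has to register with the tracker. This is usually done on the device with "python -m tvm.exec.rpc_server --tracker=<tracker-ip>:9190 --key=rk3399"; a rough Python equivalent, sketched under the assumption that the tracker address is reachable from the board ('<tracker-ip>' is a placeholder, not a value from the gist), would be:

    # Device-side sketch (not part of the gist): register the Mali board with the tracker.
    from tvm import rpc

    server = rpc.Server('0.0.0.0', port=9090, port_end=9099,
                        key='rk3399', tracker_addr=('<tracker-ip>', 9190))
    # The server process started here keeps the board registered with the tracker.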

clinfo output from the RK3399 board (Mali-T860 GPU):

Number of platforms 1
Platform Name ARM Platform
Platform Vendor ARM
Platform Version OpenCL 1.2 v1.r14p0-01rel0-git(966ed26).f44c85cb3d2ceb87e8be88e7592755c3
Platform Profile FULL_PROFILE
Platform Extensions cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_3d_image_writes cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp16 cl_khr_gl_sharing cl_khr_icd cl_khr_egl_event cl_khr_egl_image cl_khr_image2d_from_buffer cl_arm_core_id cl_arm_printf cl_arm_thread_limit_hint cl_arm_non_uniform_work_group_size cl_arm_import_memory
Platform Extensions function suffix ARM
Platform Name ARM Platform
Number of devices 1
Device Name Mali-T860
Device Vendor ARM
Device Vendor ID 0x8602000
Device Version OpenCL 1.2 v1.r14p0-01rel0-git(966ed26).f44c85cb3d2ceb87e8be88e7592755c3
Driver Version 1.2
Device OpenCL C Version OpenCL C 1.2 v1.r14p0-01rel0-git(966ed26).f44c85cb3d2ceb87e8be88e7592755c3
Device Type GPU
Device Profile FULL_PROFILE
Max compute units 4
Max clock frequency 200MHz
Device Partition (core)
Max number of sub-devices 0
Supported partition types None
Max work item dimensions 3
Max work item sizes 256x256x256
Max work group size 256
Preferred work group size multiple 4
Preferred / native vector sizes
char 16 / 16
short 8 / 8
int 4 / 4
long 2 / 2
half 8 / 8 (cl_khr_fp16)
float 4 / 4
double 2 / 2 (cl_khr_fp64)
Half-precision Floating-point support (cl_khr_fp16)
Denormals Yes
Infinity and NANs Yes
Round to nearest Yes
Round to zero Yes
Round to infinity Yes
IEEE754-2008 fused multiply-add Yes
Support is emulated in software No
Correctly-rounded divide and sqrt operations No
Single-precision Floating-point support (core)
Denormals Yes
Infinity and NANs Yes
Round to nearest Yes
Round to zero Yes
Round to infinity Yes
IEEE754-2008 fused multiply-add Yes
Support is emulated in software No
Correctly-rounded divide and sqrt operations No
Double-precision Floating-point support (cl_khr_fp64)
Denormals Yes
Infinity and NANs Yes
Round to nearest Yes
Round to zero Yes
Round to infinity Yes
IEEE754-2008 fused multiply-add Yes
Support is emulated in software No
Correctly-rounded divide and sqrt operations No
Address bits 64, Little-Endian
Global memory size 4033777664 (3.757GiB)
Error Correction support No
Max memory allocation 1008444416 (961.7MiB)
Unified memory for Host and Device Yes
Minimum alignment for any data type 128 bytes
Alignment of base address 1024 bits (128 bytes)
Global Memory cache type Read/Write
Global Memory cache size 262144
Global Memory cache line 64 bytes
Image support Yes
Max number of samplers per kernel 16
Max size for 1D images from buffer 65536 pixels
Max 1D or 2D image array size 2048 images
Base address alignment for 2D image buffers 32 bytes
Pitch alignment for 2D image buffers 16 bytes
Max 2D image size 65536x65536 pixels
Max 3D image size 65536x65536x65536 pixels
Max number of read image args 128
Max number of write image args 8
Local memory type Global
Local memory size 32768 (32KiB)
Max constant buffer size 65536 (64KiB)
Max number of constant args 8
Max size of kernel argument 1024
Queue properties
Out-of-order execution Yes
Profiling Yes
Prefer user sync for interop No
Profiling timer resolution 1000ns
Execution capabilities
Run OpenCL kernels Yes
Run native kernels No
printf() buffer size 1048576 (1024KiB)
Built-in kernels
Device Available Yes
Compiler Available Yes
Linker Available Yes
Device Extensions cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_3d_image_writes cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp16 cl_khr_gl_sharing cl_khr_icd cl_khr_egl_event cl_khr_egl_image cl_khr_image2d_from_buffer cl_arm_core_id cl_arm_printf cl_arm_thread_limit_hint cl_arm_non_uniform_work_group_size cl_arm_import_memory
NULL platform behavior
clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) ARM Platform
clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) Success [ARM]
clCreateContext(NULL, ...) [default] Success [ARM]
clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No devices found in platform
clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) Success (1)
Platform Name ARM Platform
Device Name Mali-T860
clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No devices found in platform
clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No devices found in platform
clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) Success (1)
Platform Name ARM Platform
Device Name Mali-T860
ICD loader properties
ICD loader Name OpenCL ICD Loader
ICD loader Vendor OCL Icd free software
ICD loader Version 2.2.11
ICD loader Profile OpenCL 2.1

Console output from the tuning run:

Extract tasks...
Total 1 layers to be tuned ...
[Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 32, 8, 8), 'float32'), ('TENSOR', (32, 32, 1, 1), 'float32'), (1, 1), (0, 0), (1, 1), 'NCHW', 'float32'), kwargs={}, workload=('conv2d', (1, 32, 8, 8, 'float32'), (32, 32, 1, 1, 'float32'), (1, 1), (0, 0), (1, 1), 'NCHW', 'float32'))]
[Task 1/ 1] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (160/1000) | 58.22 s
WARNING:autotvm:Too many errors happen in the tuning. Now is in debug mode
DEBUG:autotvm:No: 161 GFLOPS: 0.00/0.00 result: MeasureResult(costs=(RuntimeError('Traceback (most recent call last):\n [bt] (8) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::ServerLoop()+0xe4) [0x7f81fa763c]\n [bt] (7) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::HandleUntilReturnEvent(tvm::runtime::TVMRetValue*, bool, tvm::runtime::PackedFunc const*)+0x180) [0x7f81fa7368]\n [bt] (6) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandleNextEvent(tvm::runtime::TVMRetValue*, bool, tvm::runtime::PackedFunc const*)+0x240) [0x7f81fadf78]\n [bt] (5) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandleRecvPackedSeqArg()+0x318) [0x7f81fad8b8]\n [bt] (4) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::SwitchToState(tvm::runtime::RPCSession::EventHandler::State)+0x32c) [0x7f81fac44c]\n [bt] (3) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandlePackedCall()+0x6c8) [0x7f81fa6',),), error_no=4, all_cost=1.460705280303955, timestamp=1578348078.2993999) [('tile_co', [-1, 2, 1]), ('tile_oh', [-1, 4, 1]), ('tile_ow', [-1, 1, 8]), ('reorder_0', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), ('ann_reduce', ['none', 'none']), ('ann_spatial', ['unroll', 'unroll', 'vec'])],direct,None,220333
DEBUG:autotvm:No: 162 GFLOPS: 0.00/0.00 result: MeasureResult(costs=(RuntimeError('Traceback (most recent call last):\n [bt] (8) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::ServerLoop()+0xe4) [0x7f81fa763c]\n [bt] (7) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::HandleUntilReturnEvent(tvm::runtime::TVMRetValue*, bool, tvm::runtime::PackedFunc const*)+0x180) [0x7f81fa7368]\n [bt] (6) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandleNextEvent(tvm::runtime::TVMRetValue*, bool, tvm::runtime::PackedFunc const*)+0x240) [0x7f81fadf78]\n [bt] (5) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandleRecvPackedSeqArg()+0x318) [0x7f81fad8b8]\n [bt] (4) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::SwitchToState(tvm::runtime::RPCSession::EventHandler::State)+0x32c) [0x7f81fac44c]\n [bt] (3) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandlePackedCall()+0x6c8) [0x7f81fa6',),), error_no=4, all_cost=3.148015260696411, timestamp=1578348078.9530125) [('tile_co', [-1, 16, 1]), ('tile_oh', [-1, 1, 4]), ('tile_ow', [-1, 1, 4]), ('reorder_0', [0, 1, 2, 3, 4, 5, 6, 9, 7, 8]), ('ann_reduce', ['unroll', 'none']), ('ann_spatial', ['none', 'unroll', 'vec'])],direct,None,96121
DEBUG:autotvm:No: 163 GFLOPS: 0.00/0.00 result: MeasureResult(costs=(InstantiationError(['Too large factor for unrolling'],),), error_no=1, all_cost=0.03780198097229004, timestamp=1578348075.7359238) [('tile_co', [-1, 1, 32]), ('tile_oh', [-1, 1, 4]), ('tile_ow', [-1, 4, 1]), ('reorder_0', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), ('ann_reduce', ['unroll', 'unroll']), ('ann_spatial', ['vec', 'unroll', 'unroll'])],direct,None,332387
DEBUG:autotvm:No: 164 GFLOPS: 0.00/0.00 result: MeasureResult(costs=(TVMError('Traceback (most recent call last):\n [bt] (8) /home/fpga/jchou/TVM/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7fbedd2905c1]\n [bt] (7) /home/fpga/jchou/TVM/tvm/build/libtvm.so(+0x41af15) [0x7fbedcaf1f15]\n [bt] (6) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::codegen::Build(tvm::Array<tvm::LoweredFunc, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0xc85) [0x7fbedcc072a5]\n [bt] (5) /home/fpga/jchou/TVM/tvm/build/libtvm.so(std::_Function_handler<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*), void tvm::runtime::TypedPackedFunc<tvm::runtime::Module (tvm::Array<tvm::LoweredFunc, void>)>::AssignTypedLambda<tvm::runtime::Module (*)(tvm::Array<tvm::LoweredFunc, void>)>(tvm::runtime::Module (*)(tvm::Array<tvm::LoweredFunc, void>))::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}>::_M_invoke(std::_Any_data const&, tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&)+0x4e) [0x7fbedcbfec4e]\n [bt] (4) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::codegen::BuildOpenCL(tvm::Array<tvm::LoweredFunc, void>)+0x384) [0x7fbedcbfe004]\n [bt] (3) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::runtime::OpenCLModuleCreate(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::runtime::FunctionInfo, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, tvm::runtime::FunctionInfo> > >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)+0x33d) [0x7fbedd32dc5d]\n [bt] (2) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::runtime::OpenCLModuleNode::Init()+0x130) [0x7fbedd32d4e0]\n [bt] (1) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::runtime::cl::OpenCLWorkspace::Init(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x351) [0x7fbedd337691]\n [bt] (0) /home/fpga/jchou/TVM/tvm/build/libtvm.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x32) [0x7fbedca811d2]\n File "/home/fpga/jchou/TVM/tvm/src/runtime/opencl/opencl_device_api.cc", line 292\nTVMError: Check failed: err_code == CL_SUCCESS: OpenCL Error, code=-6: CL_OUT_OF_HOST_MEMORY',),), error_no=2, all_cost=3.4754340648651123, timestamp=1578348077.3251393) [('tile_co', [-1, 4, 2]), ('tile_oh', [-1, 1, 8]), ('tile_ow', [-1, 2, 4]), ('reorder_0', [0, 1, 2, 3, 4, 5, 6, 9, 7, 8]), ('ann_reduce', ['none', 'none']), ('ann_spatial', ['vec', 'none', 'none'])],direct,None,272777
...