@JoeyTPChou
Created January 6, 2020 22:10
Autotuning warnings/errors on RK3399 Mali using RPC.
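The script below drives autotvm over RPC, so it assumes an RPC tracker is already listening on _LOCAL_IP:_PORT (0.0.0.0:9190 here) and that the RK3399 board has registered itself under the device key 'rk3399'. As a quick sanity check before launching the tuning, a minimal sketch along these lines (not part of the original gist; it only uses TVM's public RPC API) can confirm the board is visible to the tracker:

    # Hedged sketch: query the tracker the script expects at 0.0.0.0:9190.
    from tvm import rpc

    tracker = rpc.connect_tracker('0.0.0.0', 9190)
    # The summary should list a free device under the key 'rk3399';
    # otherwise the RPCRunner used during tuning will simply time out.
    print(tracker.text_summary())
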
#! /usr/bin/env python3
import tvm
import tvm.relay as relay
import tvm.contrib.graph_runtime as runtime
from tvm import autotvm
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.util import tempdir
import os
import numpy as np
import logging
import pprint
from keras.models import Sequential
from keras.layers import Conv2D
_LOCAL_IP = '0.0.0.0'
_PORT = 9190


def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True,
               try_winograd=True):
    if try_winograd:
        for i in range(len(tasks)):
            try:  # try winograd template
                tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
                                          tasks[i].target, tasks[i].target_host, 'winograd')
                tasks.append(tsk)
            except Exception:
                pass

    # Create tmp log file
    tmp_log_file = log_filename + ".tmp"
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    for i, tsk in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # Create tuner
        if tuner == 'xgb' or tuner == 'xgb-rank':
            tuner_obj = XGBTuner(tsk, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(tsk, pop_size=50)
        elif tuner == 'random':
            tuner_obj = RandomTuner(tsk)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(tsk)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        if use_transfer_learning:
            if os.path.isfile(tmp_log_file):
                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

        # Do tuning
        n_trial = min(n_trial, len(tsk.config_space))
        tuner_obj.tune(n_trial=n_trial,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
                           autotvm.callback.log_to_file(tmp_log_file)])

    # Pick best records to a cache file
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)


def tune_and_evaluate(model_name,
                      func,
                      params,
                      input_shape,
                      use_mali=False,
                      use_android=False):
    # Replace this with the device key in your tracker
    device_key = 'rk3399'

    if use_mali:
        target = tvm.target.create('opencl -model=rk3399 -device=mali')
        log_prefix = device_key + '_mali'
    else:
        target = tvm.target.arm_cpu('rk3399')
        log_prefix = device_key + '_cpu'

    # Replace "aarch64-linux-gnu" with the correct target of your board.
    # This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device.
    target_host = 'llvm -target=aarch64-linux-gnu'

    log_file = "%s.%s.log" % (log_prefix, model_name)
    dtype = 'float32'

    # Get tuning options
    tuning_opt = {
        'log_filename': log_file,
        'tuner': 'xgb',
        'n_trial': 1000,
        'early_stopping': 600,
        'measure_option': autotvm.measure_option(
            builder=autotvm.LocalBuilder(
                build_func='ndk' if use_android else 'default'),
            runner=autotvm.RPCRunner(
                device_key, host=_LOCAL_IP, port=_PORT,
                number=10,
                timeout=5,
            ),
        ),
    }

    # Extract workloads from relay program
    print("Extract tasks...")
    tasks = autotvm.task.extract_from_program(func["main"],
                                              target=target,
                                              target_host=target_host,
                                              params=params, ops=(relay.op.nn.conv2d,))
    print("Total {0} layers to be tuned ...".format(len(tasks)))
    pprint.pprint(tasks)

    tune_tasks(tasks, **tuning_opt)

    # Compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                func, target=target, params=params, target_host=target_host)

        # Export library
        tmp = tempdir()
        if use_android:
            from tvm.contrib import ndk
            filename = "net.so"
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

        # Upload module to device
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key, _LOCAL_IP, _PORT,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # Upload parameters to device
        ctx = remote.context(str(target), 0)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('input_1', data_tvm)
        module.set_input(**params)

        # Evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=30)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))


def _main_():
    # Use mali
    use_mali = True

    ###############################
    # Create a one-layer keras model
    ###############################
    """ This one-layer model causes a lot of warning messages
    """
    input_shape = (8, 8, 32)
    keras_model = Sequential([
        Conv2D(input_shape=input_shape, filters=32, kernel_size=1, padding='SAME')
    ])
    model_name = 'error_model'
    input_name = keras_model.input_names[0]

    ###############################
    # Prepare data
    ###############################
    data = np.ones(input_shape, dtype=np.float32)
    data = np.array(data)[np.newaxis, :]
    data = data.transpose([0, 3, 1, 2])  # NCHW

    ###############################
    # Compile tvm model
    ###############################
    # Prepare input data
    shape_dict = {input_name: data.shape}
    func, params = relay.frontend.from_keras(keras_model, shape_dict)

    ###############################
    # Auto Tune
    ###############################
    # Begin tuning
    tune_and_evaluate(model_name=model_name,
                      func=func,
                      params=params,
                      input_shape=shape_dict[input_name],
                      use_mali=use_mali)


if __name__ == "__main__":
    _main_()
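
For completeness: before the script can request a remote under device_key 'rk3399', the board itself has to register with the tracker. This is usually done on the device with "python -m tvm.exec.rpc_server --tracker=<tracker-ip>:9190 --key=rk3399"; a rough Python equivalent, sketched under the assumption that the tracker address is reachable from the board ('<tracker-ip>' is a placeholder, not a value from the gist), would be:

    # Device-side sketch (not part of the gist): register the Mali board with the tracker.
    from tvm import rpc

    server = rpc.Server('0.0.0.0', port=9090, port_end=9099,
                        key='rk3399', tracker_addr=('<tracker-ip>', 9190))
    # The server process started here keeps the board registered with the tracker.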

clinfo output from the RK3399 board (Mali-T860 GPU):

Number of platforms 1
Platform Name ARM Platform
Platform Vendor ARM
Platform Version OpenCL 1.2 v1.r14p0-01rel0-git(966ed26).f44c85cb3d2ceb87e8be88e7592755c3
Platform Profile FULL_PROFILE
Platform Extensions cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_3d_image_writes cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp16 cl_khr_gl_sharing cl_khr_icd cl_khr_egl_event cl_khr_egl_image cl_khr_image2d_from_buffer cl_arm_core_id cl_arm_printf cl_arm_thread_limit_hint cl_arm_non_uniform_work_group_size cl_arm_import_memory
Platform Extensions function suffix ARM
Platform Name ARM Platform
Number of devices 1
Device Name Mali-T860
Device Vendor ARM
Device Vendor ID 0x8602000
Device Version OpenCL 1.2 v1.r14p0-01rel0-git(966ed26).f44c85cb3d2ceb87e8be88e7592755c3
Driver Version 1.2
Device OpenCL C Version OpenCL C 1.2 v1.r14p0-01rel0-git(966ed26).f44c85cb3d2ceb87e8be88e7592755c3
Device Type GPU
Device Profile FULL_PROFILE
Max compute units 4
Max clock frequency 200MHz
Device Partition (core)
Max number of sub-devices 0
Supported partition types None
Max work item dimensions 3
Max work item sizes 256x256x256
Max work group size 256
Preferred work group size multiple 4
Preferred / native vector sizes
char 16 / 16
short 8 / 8
int 4 / 4
long 2 / 2
half 8 / 8 (cl_khr_fp16)
float 4 / 4
double 2 / 2 (cl_khr_fp64)
Half-precision Floating-point support (cl_khr_fp16)
Denormals Yes
Infinity and NANs Yes
Round to nearest Yes
Round to zero Yes
Round to infinity Yes
IEEE754-2008 fused multiply-add Yes
Support is emulated in software No
Correctly-rounded divide and sqrt operations No
Single-precision Floating-point support (core)
Denormals Yes
Infinity and NANs Yes
Round to nearest Yes
Round to zero Yes
Round to infinity Yes
IEEE754-2008 fused multiply-add Yes
Support is emulated in software No
Correctly-rounded divide and sqrt operations No
Double-precision Floating-point support (cl_khr_fp64)
Denormals Yes
Infinity and NANs Yes
Round to nearest Yes
Round to zero Yes
Round to infinity Yes
IEEE754-2008 fused multiply-add Yes
Support is emulated in software No
Correctly-rounded divide and sqrt operations No
Address bits 64, Little-Endian
Global memory size 4033777664 (3.757GiB)
Error Correction support No
Max memory allocation 1008444416 (961.7MiB)
Unified memory for Host and Device Yes
Minimum alignment for any data type 128 bytes
Alignment of base address 1024 bits (128 bytes)
Global Memory cache type Read/Write
Global Memory cache size 262144
Global Memory cache line 64 bytes
Image support Yes
Max number of samplers per kernel 16
Max size for 1D images from buffer 65536 pixels
Max 1D or 2D image array size 2048 images
Base address alignment for 2D image buffers 32 bytes
Pitch alignment for 2D image buffers 16 bytes
Max 2D image size 65536x65536 pixels
Max 3D image size 65536x65536x65536 pixels
Max number of read image args 128
Max number of write image args 8
Local memory type Global
Local memory size 32768 (32KiB)
Max constant buffer size 65536 (64KiB)
Max number of constant args 8
Max size of kernel argument 1024
Queue properties
Out-of-order execution Yes
Profiling Yes
Prefer user sync for interop No
Profiling timer resolution 1000ns
Execution capabilities
Run OpenCL kernels Yes
Run native kernels No
printf() buffer size 1048576 (1024KiB)
Built-in kernels
Device Available Yes
Compiler Available Yes
Linker Available Yes
Device Extensions cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_3d_image_writes cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp16 cl_khr_gl_sharing cl_khr_icd cl_khr_egl_event cl_khr_egl_image cl_khr_image2d_from_buffer cl_arm_core_id cl_arm_printf cl_arm_thread_limit_hint cl_arm_non_uniform_work_group_size cl_arm_import_memory
NULL platform behavior
clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) ARM Platform
clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) Success [ARM]
clCreateContext(NULL, ...) [default] Success [ARM]
clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No devices found in platform
clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) Success (1)
Platform Name ARM Platform
Device Name Mali-T860
clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No devices found in platform
clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No devices found in platform
clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) Success (1)
Platform Name ARM Platform
Device Name Mali-T860
ICD loader properties
ICD loader Name OpenCL ICD Loader
ICD loader Vendor OCL Icd free software
ICD loader Version 2.2.11
ICD loader Profile OpenCL 2.1

Console output from the tuning run:

Extract tasks...
Total 1 layers to be tuned ...
[Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 32, 8, 8), 'float32'), ('TENSOR', (32, 32, 1, 1), 'float32'), (1, 1), (0, 0), (1, 1), 'NCHW', 'float32'), kwargs={}, workload=('conv2d', (1, 32, 8, 8, 'float32'), (32, 32, 1, 1, 'float32'), (1, 1), (0, 0), (1, 1), 'NCHW', 'float32'))]
[Task 1/ 1] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (160/1000) | 58.22 s
WARNING:autotvm:Too many errors happen in the tuning. Now is in debug mode
DEBUG:autotvm:No: 161 GFLOPS: 0.00/0.00 result: MeasureResult(costs=(RuntimeError('Traceback (most recent call last):\n [bt] (8) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::ServerLoop()+0xe4) [0x7f81fa763c]\n [bt] (7) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::HandleUntilReturnEvent(tvm::runtime::TVMRetValue*, bool, tvm::runtime::PackedFunc const*)+0x180) [0x7f81fa7368]\n [bt] (6) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandleNextEvent(tvm::runtime::TVMRetValue*, bool, tvm::runtime::PackedFunc const*)+0x240) [0x7f81fadf78]\n [bt] (5) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandleRecvPackedSeqArg()+0x318) [0x7f81fad8b8]\n [bt] (4) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::SwitchToState(tvm::runtime::RPCSession::EventHandler::State)+0x32c) [0x7f81fac44c]\n [bt] (3) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandlePackedCall()+0x6c8) [0x7f81fa6',),), error_no=4, all_cost=1.460705280303955, timestamp=1578348078.2993999) [('tile_co', [-1, 2, 1]), ('tile_oh', [-1, 4, 1]), ('tile_ow', [-1, 1, 8]), ('reorder_0', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), ('ann_reduce', ['none', 'none']), ('ann_spatial', ['unroll', 'unroll', 'vec'])],direct,None,220333
DEBUG:autotvm:No: 162 GFLOPS: 0.00/0.00 result: MeasureResult(costs=(RuntimeError('Traceback (most recent call last):\n [bt] (8) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::ServerLoop()+0xe4) [0x7f81fa763c]\n [bt] (7) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::HandleUntilReturnEvent(tvm::runtime::TVMRetValue*, bool, tvm::runtime::PackedFunc const*)+0x180) [0x7f81fa7368]\n [bt] (6) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandleNextEvent(tvm::runtime::TVMRetValue*, bool, tvm::runtime::PackedFunc const*)+0x240) [0x7f81fadf78]\n [bt] (5) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandleRecvPackedSeqArg()+0x318) [0x7f81fad8b8]\n [bt] (4) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::SwitchToState(tvm::runtime::RPCSession::EventHandler::State)+0x32c) [0x7f81fac44c]\n [bt] (3) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandlePackedCall()+0x6c8) [0x7f81fa6',),), error_no=4, all_cost=3.148015260696411, timestamp=1578348078.9530125) [('tile_co', [-1, 16, 1]), ('tile_oh', [-1, 1, 4]), ('tile_ow', [-1, 1, 4]), ('reorder_0', [0, 1, 2, 3, 4, 5, 6, 9, 7, 8]), ('ann_reduce', ['unroll', 'none']), ('ann_spatial', ['none', 'unroll', 'vec'])],direct,None,96121
DEBUG:autotvm:No: 163 GFLOPS: 0.00/0.00 result: MeasureResult(costs=(InstantiationError(['Too large factor for unrolling'],),), error_no=1, all_cost=0.03780198097229004, timestamp=1578348075.7359238) [('tile_co', [-1, 1, 32]), ('tile_oh', [-1, 1, 4]), ('tile_ow', [-1, 4, 1]), ('reorder_0', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), ('ann_reduce', ['unroll', 'unroll']), ('ann_spatial', ['vec', 'unroll', 'unroll'])],direct,None,332387
DEBUG:autotvm:No: 164 GFLOPS: 0.00/0.00 result: MeasureResult(costs=(TVMError('Traceback (most recent call last):\n [bt] (8) /home/fpga/jchou/TVM/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7fbedd2905c1]\n [bt] (7) /home/fpga/jchou/TVM/tvm/build/libtvm.so(+0x41af15) [0x7fbedcaf1f15]\n [bt] (6) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::codegen::Build(tvm::Array<tvm::LoweredFunc, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0xc85) [0x7fbedcc072a5]\n [bt] (5) /home/fpga/jchou/TVM/tvm/build/libtvm.so(std::_Function_handler<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*), void tvm::runtime::TypedPackedFunc<tvm::runtime::Module (tvm::Array<tvm::LoweredFunc, void>)>::AssignTypedLambda<tvm::runtime::Module (*)(tvm::Array<tvm::LoweredFunc, void>)>(tvm::runtime::Module (*)(tvm::Array<tvm::LoweredFunc, void>))::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}>::_M_invoke(std::_Any_data const&, tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&)+0x4e) [0x7fbedcbfec4e]\n [bt] (4) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::codegen::BuildOpenCL(tvm::Array<tvm::LoweredFunc, void>)+0x384) [0x7fbedcbfe004]\n [bt] (3) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::runtime::OpenCLModuleCreate(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::runtime::FunctionInfo, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, tvm::runtime::FunctionInfo> > >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)+0x33d) [0x7fbedd32dc5d]\n [bt] (2) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::runtime::OpenCLModuleNode::Init()+0x130) [0x7fbedd32d4e0]\n [bt] (1) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::runtime::cl::OpenCLWorkspace::Init(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x351) [0x7fbedd337691]\n [bt] (0) /home/fpga/jchou/TVM/tvm/build/libtvm.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x32) [0x7fbedca811d2]\n File "/home/fpga/jchou/TVM/tvm/src/runtime/opencl/opencl_device_api.cc", line 292\nTVMError: Check failed: err_code == CL_SUCCESS: OpenCL Error, code=-6: CL_OUT_OF_HOST_MEMORY',),), error_no=2, all_cost=3.4754340648651123, timestamp=1578348077.3251393) [('tile_co', [-1, 4, 2]), ('tile_oh', [-1, 1, 8]), ('tile_ow', [-1, 2, 4]), ('reorder_0', [0, 1, 2, 3, 4, 5, 6, 9, 7, 8]), ('ann_reduce', ['none', 'none']), ('ann_spatial', ['vec', 'none', 'none'])],direct,None,272777
...