Autotuning warnings/errors on RK3399 Mali using RPC.
The tuning script:
#! /usr/bin/env python3
import os
import logging
import pprint

import numpy as np
from keras.models import Sequential
from keras.layers import Conv2D

import tvm
import tvm.relay as relay
import tvm.contrib.graph_runtime as runtime
from tvm import autotvm
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.util import tempdir

_LOCAL_IP = '0.0.0.0'
_PORT = 9190
def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True,
               try_winograd=True):
    if try_winograd:
        for i in range(len(tasks)):
            try:  # Try the winograd template for each task.
                tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
                                          tasks[i].target, tasks[i].target_host,
                                          'winograd')
                tasks.append(tsk)
            except Exception:
                pass

    # Create a tmp log file.
    tmp_log_file = log_filename + ".tmp"
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    for i, tsk in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # Create the tuner.
        if tuner == 'xgb' or tuner == 'xgb-rank':
            tuner_obj = XGBTuner(tsk, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(tsk, pop_size=50)
        elif tuner == 'random':
            tuner_obj = RandomTuner(tsk)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(tsk)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        if use_transfer_learning:
            if os.path.isfile(tmp_log_file):
                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

        # Do tuning.
        n_trial = min(n_trial, len(tsk.config_space))
        tuner_obj.tune(n_trial=n_trial,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
                           autotvm.callback.log_to_file(tmp_log_file)])

    # Pick the best records into the final log file.
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)
def tune_and_evaluate(model_name,
                      func,
                      params,
                      input_shape,
                      input_name,
                      use_mali=False,
                      use_android=False):
    # Replace this with the device key in your tracker.
    device_key = 'rk3399'
    if use_mali:
        target = tvm.target.create('opencl -model=rk3399 -device=mali')
        log_prefix = device_key + '_mali'
    else:
        target = tvm.target.arm_cpu('rk3399')
        log_prefix = device_key + '_cpu'

    # Replace "aarch64-linux-gnu" with the correct triple for your board.
    # This target host is used for cross compilation. You can query it by
    # running `gcc -v` on your device.
    target_host = 'llvm -target=aarch64-linux-gnu'

    log_file = "%s.%s.log" % (log_prefix, model_name)
    dtype = 'float32'

    # Tuning options.
    tuning_opt = {
        'log_filename': log_file,
        'tuner': 'xgb',
        'n_trial': 1000,
        'early_stopping': 600,
        'measure_option': autotvm.measure_option(
            builder=autotvm.LocalBuilder(
                build_func='ndk' if use_android else 'default'),
            runner=autotvm.RPCRunner(
                device_key, host=_LOCAL_IP, port=_PORT,
                number=10,
                timeout=5,
            ),
        ),
    }

    # Extract workloads from the relay program.
    print("Extract tasks...")
    tasks = autotvm.task.extract_from_program(func["main"],
                                              target=target,
                                              target_host=target_host,
                                              params=params,
                                              ops=(relay.op.nn.conv2d,))
    print("Total {0} layers to be tuned ...".format(len(tasks)))
    pprint.pprint(tasks)
    tune_tasks(tasks, **tuning_opt)

    # Compile kernels with the best records from the tuning history.
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                func, target=target, params=params, target_host=target_host)

        # Export the library.
        tmp = tempdir()
        if use_android:
            from tvm.contrib import ndk
            filename = "net.so"
            lib.export_library(tmp.relpath(filename), ndk.create_shared)
        else:
            filename = "net.tar"
            lib.export_library(tmp.relpath(filename))

        # Upload the module to the device.
        print("Upload...")
        remote = autotvm.measure.request_remote(device_key, _LOCAL_IP, _PORT,
                                                timeout=10000)
        remote.upload(tmp.relpath(filename))
        rlib = remote.load_module(filename)

        # Upload parameters to the device.
        ctx = remote.context(str(target), 0)
        module = runtime.create(graph, rlib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input(input_name, data_tvm)
        module.set_input(**params)

        # Evaluate inference time.
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=30)
        prof_res = np.array(ftimer().results) * 1000  # Convert to milliseconds.
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
def main():
    # Use Mali.
    use_mali = True

    ###############################
    # Create a one-layer Keras model
    ###############################
    # This one-layer model causes a lot of warning messages.
    input_shape = (8, 8, 32)
    keras_model = Sequential([
        Conv2D(input_shape=input_shape, filters=32, kernel_size=1, padding='SAME')
    ])
    model_name = 'error_model'
    input_name = keras_model.input_names[0]

    ###############################
    # Prepare data
    ###############################
    data = np.ones(input_shape, dtype=np.float32)
    data = np.array(data)[np.newaxis, :]
    data = data.transpose([0, 3, 1, 2])  # NCHW

    ###############################
    # Compile tvm model
    ###############################
    # Prepare input shapes.
    shape_dict = {input_name: data.shape}
    func, params = relay.frontend.from_keras(keras_model, shape_dict)

    ###############################
    # Auto Tune
    ###############################
    # Begin tuning.
    tune_and_evaluate(model_name=model_name,
                      func=func,
                      params=params,
                      input_shape=shape_dict[input_name],
                      input_name=input_name,
                      use_mali=use_mali)


if __name__ == "__main__":
    main()
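The script assumes an RPC tracker on the host and an RPC server on the board, registered under the key `rk3399`, as in the TVM autotuning tutorials: `python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190` on the host, and `python -m tvm.exec.rpc_server --tracker=<host-ip>:9190 --key=rk3399` on the RK3399. A minimal sanity check, assuming that setup, to confirm the key is registered before starting a long tuning run:

#! /usr/bin/env python3
# Sketch: query the tracker the script above points at (_LOCAL_IP/_PORT)
# and confirm a device is registered under the 'rk3399' key.
from tvm import rpc

tracker = rpc.connect_tracker('0.0.0.0', 9190)
# text_summary() lists registered server keys and the free/pending queues.
print(tracker.text_summary())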
clinfo output from the RK3399 board:
Number of platforms                               1
  Platform Name                                   ARM Platform
  Platform Vendor                                 ARM
  Platform Version                                OpenCL 1.2 v1.r14p0-01rel0-git(966ed26).f44c85cb3d2ceb87e8be88e7592755c3
  Platform Profile                                FULL_PROFILE
  Platform Extensions                             cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_3d_image_writes cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp16 cl_khr_gl_sharing cl_khr_icd cl_khr_egl_event cl_khr_egl_image cl_khr_image2d_from_buffer cl_arm_core_id cl_arm_printf cl_arm_thread_limit_hint cl_arm_non_uniform_work_group_size cl_arm_import_memory
  Platform Extensions function suffix             ARM

  Platform Name                                   ARM Platform
Number of devices                                 1
  Device Name                                     Mali-T860
  Device Vendor                                   ARM
  Device Vendor ID                                0x8602000
  Device Version                                  OpenCL 1.2 v1.r14p0-01rel0-git(966ed26).f44c85cb3d2ceb87e8be88e7592755c3
  Driver Version                                  1.2
  Device OpenCL C Version                         OpenCL C 1.2 v1.r14p0-01rel0-git(966ed26).f44c85cb3d2ceb87e8be88e7592755c3
  Device Type                                     GPU
  Device Profile                                  FULL_PROFILE
  Max compute units                               4
  Max clock frequency                             200MHz
  Device Partition                                (core)
    Max number of sub-devices                     0
    Supported partition types                     None
  Max work item dimensions                        3
  Max work item sizes                             256x256x256
  Max work group size                             256
  Preferred work group size multiple              4
  Preferred / native vector sizes
    char                                          16 / 16
    short                                         8 / 8
    int                                           4 / 4
    long                                          2 / 2
    half                                          8 / 8        (cl_khr_fp16)
    float                                         4 / 4
    double                                        2 / 2        (cl_khr_fp64)
  Half-precision Floating-point support           (cl_khr_fp16)
    Denormals                                     Yes
    Infinity and NANs                             Yes
    Round to nearest                              Yes
    Round to zero                                 Yes
    Round to infinity                             Yes
    IEEE754-2008 fused multiply-add               Yes
    Support is emulated in software               No
    Correctly-rounded divide and sqrt operations  No
  Single-precision Floating-point support         (core)
    Denormals                                     Yes
    Infinity and NANs                             Yes
    Round to nearest                              Yes
    Round to zero                                 Yes
    Round to infinity                             Yes
    IEEE754-2008 fused multiply-add               Yes
    Support is emulated in software               No
    Correctly-rounded divide and sqrt operations  No
  Double-precision Floating-point support         (cl_khr_fp64)
    Denormals                                     Yes
    Infinity and NANs                             Yes
    Round to nearest                              Yes
    Round to zero                                 Yes
    Round to infinity                             Yes
    IEEE754-2008 fused multiply-add               Yes
    Support is emulated in software               No
    Correctly-rounded divide and sqrt operations  No
  Address bits                                    64, Little-Endian
  Global memory size                              4033777664 (3.757GiB)
  Error Correction support                        No
  Max memory allocation                           1008444416 (961.7MiB)
  Unified memory for Host and Device              Yes
  Minimum alignment for any data type             128 bytes
  Alignment of base address                       1024 bits (128 bytes)
  Global Memory cache type                        Read/Write
  Global Memory cache size                        262144
  Global Memory cache line                        64 bytes
  Image support                                   Yes
    Max number of samplers per kernel             16
    Max size for 1D images from buffer            65536 pixels
    Max 1D or 2D image array size                 2048 images
    Base address alignment for 2D image buffers   32 bytes
    Pitch alignment for 2D image buffers          16 bytes
    Max 2D image size                             65536x65536 pixels
    Max 3D image size                             65536x65536x65536 pixels
    Max number of read image args                 128
    Max number of write image args                8
  Local memory type                               Global
  Local memory size                               32768 (32KiB)
  Max constant buffer size                        65536 (64KiB)
  Max number of constant args                     8
  Max size of kernel argument                     1024
  Queue properties
    Out-of-order execution                        Yes
    Profiling                                     Yes
  Prefer user sync for interop                    No
  Profiling timer resolution                      1000ns
  Execution capabilities
    Run OpenCL kernels                            Yes
    Run native kernels                            No
  printf() buffer size                            1048576 (1024KiB)
  Built-in kernels
  Device Available                                Yes
  Compiler Available                              Yes
  Linker Available                                Yes
  Device Extensions                               cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_3d_image_writes cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp16 cl_khr_gl_sharing cl_khr_icd cl_khr_egl_event cl_khr_egl_image cl_khr_image2d_from_buffer cl_arm_core_id cl_arm_printf cl_arm_thread_limit_hint cl_arm_non_uniform_work_group_size cl_arm_import_memory

NULL platform behavior
  clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...)  ARM Platform
  clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...)   Success [ARM]
  clCreateContext(NULL, ...) [default]            Success [ARM]
  clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU)          No devices found in platform
  clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU)          Success (1)
    Platform Name                                 ARM Platform
    Device Name                                   Mali-T860
  clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR)  No devices found in platform
  clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM)       No devices found in platform
  clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL)          Success (1)
    Platform Name                                 ARM Platform
    Device Name                                   Mali-T860

ICD loader properties
  ICD loader Name                                 OpenCL ICD Loader
  ICD loader Vendor                               OCL Icd free software
  ICD loader Version                              2.2.11
  ICD loader Profile                              OpenCL 2.1
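clinfo shows the ICD loader sees the Mali-T860, yet tuning still fails at measure time. A quick check, assuming the same tracker setup as above, that TVM's own remote OpenCL runtime initializes on the board (the CL_OUT_OF_HOST_MEMORY failures in the log below come from that layer, not from clinfo):

# Sketch, assuming the tracker/server setup above: confirm TVM's remote
# OpenCL runtime can open the Mali-T860 that clinfo reports.
from tvm import rpc

tracker = rpc.connect_tracker('0.0.0.0', 9190)
remote = tracker.request('rk3399', priority=0, session_timeout=60)

ctx = remote.cl(0)
print("OpenCL device exists:", ctx.exist)
# Device attributes are queried through the remote runtime.
print("Max threads per block:", ctx.max_threads_per_block)
print("Warp size:", ctx.warp_size)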
Tuning output showing the warnings/errors (truncated):
Extract tasks...
Total 1 layers to be tuned ...
[Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 32, 8, 8), 'float32'), ('TENSOR', (32, 32, 1, 1), 'float32'), (1, 1), (0, 0), (1, 1), 'NCHW', 'float32'), kwargs={}, workload=('conv2d', (1, 32, 8, 8, 'float32'), (32, 32, 1, 1, 'float32'), (1, 1), (0, 0), (1, 1), 'NCHW', 'float32'))]
[Task 1/ 1] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (160/1000) | 58.22 s
WARNING:autotvm:Too many errors happen in the tuning. Now is in debug mode
DEBUG:autotvm:No: 161 GFLOPS: 0.00/0.00 result: MeasureResult(costs=(RuntimeError('Traceback (most recent call last):\n [bt] (8) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::ServerLoop()+0xe4) [0x7f81fa763c]\n [bt] (7) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::HandleUntilReturnEvent(tvm::runtime::TVMRetValue*, bool, tvm::runtime::PackedFunc const*)+0x180) [0x7f81fa7368]\n [bt] (6) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandleNextEvent(tvm::runtime::TVMRetValue*, bool, tvm::runtime::PackedFunc const*)+0x240) [0x7f81fadf78]\n [bt] (5) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandleRecvPackedSeqArg()+0x318) [0x7f81fad8b8]\n [bt] (4) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::SwitchToState(tvm::runtime::RPCSession::EventHandler::State)+0x32c) [0x7f81fac44c]\n [bt] (3) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandlePackedCall()+0x6c8) [0x7f81fa6',),), error_no=4, all_cost=1.460705280303955, timestamp=1578348078.2993999) [('tile_co', [-1, 2, 1]), ('tile_oh', [-1, 4, 1]), ('tile_ow', [-1, 1, 8]), ('reorder_0', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), ('ann_reduce', ['none', 'none']), ('ann_spatial', ['unroll', 'unroll', 'vec'])],direct,None,220333
DEBUG:autotvm:No: 162 GFLOPS: 0.00/0.00 result: MeasureResult(costs=(RuntimeError('Traceback (most recent call last):\n [bt] (8) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::ServerLoop()+0xe4) [0x7f81fa763c]\n [bt] (7) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::HandleUntilReturnEvent(tvm::runtime::TVMRetValue*, bool, tvm::runtime::PackedFunc const*)+0x180) [0x7f81fa7368]\n [bt] (6) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandleNextEvent(tvm::runtime::TVMRetValue*, bool, tvm::runtime::PackedFunc const*)+0x240) [0x7f81fadf78]\n [bt] (5) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandleRecvPackedSeqArg()+0x318) [0x7f81fad8b8]\n [bt] (4) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::SwitchToState(tvm::runtime::RPCSession::EventHandler::State)+0x32c) [0x7f81fac44c]\n [bt] (3) /root/data/jchou/TVM/tvm/build/libtvm_runtime.so(tvm::runtime::RPCSession::EventHandler::HandlePackedCall()+0x6c8) [0x7f81fa6',),), error_no=4, all_cost=3.148015260696411, timestamp=1578348078.9530125) [('tile_co', [-1, 16, 1]), ('tile_oh', [-1, 1, 4]), ('tile_ow', [-1, 1, 4]), ('reorder_0', [0, 1, 2, 3, 4, 5, 6, 9, 7, 8]), ('ann_reduce', ['unroll', 'none']), ('ann_spatial', ['none', 'unroll', 'vec'])],direct,None,96121
DEBUG:autotvm:No: 163 GFLOPS: 0.00/0.00 result: MeasureResult(costs=(InstantiationError(['Too large factor for unrolling'],),), error_no=1, all_cost=0.03780198097229004, timestamp=1578348075.7359238) [('tile_co', [-1, 1, 32]), ('tile_oh', [-1, 1, 4]), ('tile_ow', [-1, 4, 1]), ('reorder_0', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), ('ann_reduce', ['unroll', 'unroll']), ('ann_spatial', ['vec', 'unroll', 'unroll'])],direct,None,332387
DEBUG:autotvm:No: 164 GFLOPS: 0.00/0.00 result: MeasureResult(costs=(TVMError('Traceback (most recent call last):\n [bt] (8) /home/fpga/jchou/TVM/tvm/build/libtvm.so(TVMFuncCall+0x61) [0x7fbedd2905c1]\n [bt] (7) /home/fpga/jchou/TVM/tvm/build/libtvm.so(+0x41af15) [0x7fbedcaf1f15]\n [bt] (6) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::codegen::Build(tvm::Array<tvm::LoweredFunc, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0xc85) [0x7fbedcc072a5]\n [bt] (5) /home/fpga/jchou/TVM/tvm/build/libtvm.so(std::_Function_handler<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*), void tvm::runtime::TypedPackedFunc<tvm::runtime::Module (tvm::Array<tvm::LoweredFunc, void>)>::AssignTypedLambda<tvm::runtime::Module (*)(tvm::Array<tvm::LoweredFunc, void>)>(tvm::runtime::Module (*)(tvm::Array<tvm::LoweredFunc, void>))::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}>::_M_invoke(std::_Any_data const&, tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&)+0x4e) [0x7fbedcbfec4e]\n [bt] (4) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::codegen::BuildOpenCL(tvm::Array<tvm::LoweredFunc, void>)+0x384) [0x7fbedcbfe004]\n [bt] (3) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::runtime::OpenCLModuleCreate(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::runtime::FunctionInfo, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, tvm::runtime::FunctionInfo> > >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)+0x33d) [0x7fbedd32dc5d]\n [bt] (2) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::runtime::OpenCLModuleNode::Init()+0x130) [0x7fbedd32d4e0]\n [bt] (1) /home/fpga/jchou/TVM/tvm/build/libtvm.so(tvm::runtime::cl::OpenCLWorkspace::Init(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x351) [0x7fbedd337691]\n [bt] (0) /home/fpga/jchou/TVM/tvm/build/libtvm.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x32) [0x7fbedca811d2]\n File "/home/fpga/jchou/TVM/tvm/src/runtime/opencl/opencl_device_api.cc", line 292\nTVMError: Check failed: err_code == CL_SUCCESS: OpenCL Error, code=-6: CL_OUT_OF_HOST_MEMORY',),), error_no=2, all_cost=3.4754340648651123, timestamp=1578348077.3251393) [('tile_co', [-1, 4, 2]), ('tile_oh', [-1, 1, 8]), ('tile_ow', [-1, 2, 4]), ('reorder_0', [0, 1, 2, 3, 4, 5, 6, 9, 7, 8]), ('ann_reduce', ['none', 'none']), ('ann_spatial', ['vec', 'none', 'none'])],direct,None,272777
...
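In this log, the error_no=1 entries (InstantiationError, "Too large factor for unrolling") are expected: those configs are invalid for the hardware and autotvm simply skips them. The error_no=2 (build failure, here CL_OUT_OF_HOST_MEMORY) and error_no=4 (runtime error over RPC) entries are what trip the "Too many errors" warning. While the temporary log still exists (the script deletes it after pick_best), the failures can be tallied with autotvm.record.load_from_file. A sketch; the file name follows the script's log_filename + ".tmp" convention:

# Sketch: tally MeasureResult error codes from the temporary tuning log.
# error_no values (see tvm.autotvm.measure.MeasureErrorNo): 0 = ok,
# 1 = instantiation error (invalid config, harmless), 2 = build error,
# 4 = runtime error on the device.
from collections import Counter
from tvm import autotvm

counts = Counter(res.error_no
                 for _, res in autotvm.record.load_from_file(
                     'rk3399_mali.error_model.log.tmp'))
print(counts)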