Instantly share code, notes, and snippets.

Embed
What would you like to do?
TVM C++ deployment example: CMakeLists.txt, tvm_deploy_gpu_sample.cpp, and a locally adapted tvm_runtime_pack.cc
# BUG FIX: 3.2 is too low for the imported targets used below
# (OpenGL::OpenGL requires CMake 3.10, Vulkan::Vulkan 3.7, OpenCL::OpenCL 3.1).
cmake_minimum_required(VERSION 3.10)
project(tvm-deploy-gpu-sample)

# Root of the TVM checkout (previously hardcoded in three places).
# Override with -DTVM_ROOT=/path/to/tvm for other layouts.
set(TVM_ROOT "/dl/mxnet/3rdparty/tvm" CACHE PATH "Path to the TVM source tree")

# NOTE(review): directory-scoped include_directories kept because both the
# sample executable and tvm_runtime_pack.cc rely on these headers; prefer
# target_include_directories in new code.
include_directories(${TVM_ROOT}/3rdparty/dlpack/include)
include_directories(${TVM_ROOT}/3rdparty/dmlc-core/include)
include_directories(${TVM_ROOT}/include)
# Debug helper: print every currently defined CMake variable, sorted by name,
# as "name=value" STATUS messages. No arguments; no side effects.
function(print_cmake_vars)
  get_cmake_property(variable_names VARIABLES)
  list(SORT variable_names)
  foreach(name IN LISTS variable_names)
    message(STATUS "${name}=${${name}}")
  endforeach()
endfunction()
# Choose between linking a prebuilt shared libtvm_runtime and compiling the
# runtime directly into the executable via the amalgamated tvm_runtime_pack.cc.
option(TUI_USE_SHARED_TVM_RUNTIME "use shared tvm runtime lib" OFF)

if(TUI_USE_SHARED_TVM_RUNTIME)
  add_executable(tvm_deploy_gpu_sample tvm_deploy_gpu_sample.cpp)
  # NOTE(review): hardcoded absolute path to a prebuilt runtime; consider
  # wrapping it in an IMPORTED target keyed off a cache variable.
  target_link_libraries(
    tvm_deploy_gpu_sample
    PUBLIC
    /dl/mxnet/3rdparty/tvm/_builds/libtvm_runtime.so
  )
else()
  # add include so that relative paths in the upstream tvm_runtime_pack.cc will work
  include_directories(/dl/mxnet/3rdparty/tvm/apps/howto_deploy)
  add_executable(
    tvm_deploy_gpu_sample
    tvm_deploy_gpu_sample.cpp
    # Adopt tvm_runtime_pack.cc for local modifications
    tvm_runtime_pack.cc
    # This could eventually go back here:
    #/dl/mxnet/3rdparty/tvm/apps/howto_deploy/tvm_runtime_pack.cc
  )

  # The embedded runtime needs a threading backend and dlopen support.
  find_package(Threads REQUIRED)
  target_link_libraries(tvm_deploy_gpu_sample PUBLIC Threads::Threads ${CMAKE_DL_LIBS})

  # Each TUI_USE_* option enables exactly one device backend; the matching
  # TVM_*_RUNTIME define selects both the runtime sources compiled into
  # tvm_runtime_pack.cc and the device_type used in tvm_deploy_gpu_sample.cpp.
  option(TUI_USE_CPU "Use cpu runtime" OFF)
  if(TUI_USE_CPU)
    # currently nothing is required beyond the preprocessor switch
    target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_CPU_RUNTIME=1)
  endif()

  option(TUI_USE_OPENGL "Use opengl runtime" OFF)
  if(TUI_USE_OPENGL)
    find_package(OpenGL REQUIRED)
    target_link_libraries(tvm_deploy_gpu_sample PUBLIC OpenGL::OpenGL)
    # BUG FIX: glfw3 was found without REQUIRED but the glfw target was linked
    # unconditionally; fail clearly at configure time instead of at link time.
    find_package(glfw3 REQUIRED)
    target_link_libraries(tvm_deploy_gpu_sample PUBLIC glfw)
    target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_OPENGL_RUNTIME=1)
  endif()

  option(TUI_USE_OPENCL "Use opencl runtime" OFF)
  if(TUI_USE_OPENCL)
    find_package(OpenCL REQUIRED)
    target_link_libraries(tvm_deploy_gpu_sample PUBLIC OpenCL::OpenCL)
    target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_OPENCL_RUNTIME=1)
  endif()

  option(TUI_USE_VULKAN "Use vulkan runtime" OFF)
  if(TUI_USE_VULKAN)
    find_package(Vulkan REQUIRED)
    target_link_libraries(tvm_deploy_gpu_sample PUBLIC Vulkan::Vulkan)
    target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_VULKAN_RUNTIME=1)
  endif()

  option(TUI_USE_CUDA "Use cuda runtime" OFF)
  if(TUI_USE_CUDA)
    find_package(CUDA REQUIRED)
    #print_cmake_vars()
    target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_CUDA_RUNTIME=1)
    target_include_directories(tvm_deploy_gpu_sample PUBLIC ${CUDA_INCLUDE_DIRS})
    target_link_libraries(tvm_deploy_gpu_sample PUBLIC
      ${CUDA_CUDA_LIBRARY}
      ${CUDA_CUDART_LIBRARY}
      ${CUDA_CURAND_LIBRARY})
  endif()

  option(TUI_USE_METAL "Use metal runtime" OFF)
  if(TUI_USE_METAL)
    find_package(Metal REQUIRED)
    target_link_libraries(tvm_deploy_gpu_sample PUBLIC Metal::Metal)
    target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_METAL_RUNTIME=1)
  endif()
endif()

option(TUI_USE_GRAPH_RUNTIME_DEBUG "use debug runtime" ON)
# BUG FIX: the definition was previously applied unconditionally, making the
# option above a no-op. Honor the option.
if(TUI_USE_GRAPH_RUNTIME_DEBUG)
  target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_USE_GRAPH_RUNTIME_DEBUG=1)
endif()

enable_testing()
# COMMAND uses the target name, which CMake expands to the built binary path.
add_test(NAME tvm_deploy_gpu_sample COMMAND tvm_deploy_gpu_sample)
#define TUI_SAVE_LAYERS 0
#define TUI_TEST_DEBUG_GET_OUTPUT 1
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>

#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/packed_func.h>
#if TUI_SAVE_LAYERS
#include <dmlc/json.h>
#include <tvm/runtime/ndarray.h>
using tvm::runtime::NDArray;
/*
Here we borrow a partial implementation of GraphRuntime from here:
tvm/src/runtime/graph/graph_runtime.h
Since there doesn't seem to be a public hook to access the layer info
that is required for logging layer output at runtime. In particular,
we need access to:
1) the number of layers : needed to iterate over all layers
2) the layer attributes : needed to allocate a DLTensor * if using debug_get_output
This info is all contained in private member variables of GraphRuntime after
loading the JSON graph, so we simply duplicate the parts we need here for
the C++ test and serialization.
*/
// Partial mirror of tvm::runtime::GraphRuntime's private state, copied from
// tvm/src/runtime/graph/graph_runtime.h so this test can load the graph JSON
// itself and enumerate layers. Kept intentionally close to upstream (including
// member names) so diffs against the TVM source stay small — do not "fix"
// naming here without re-syncing against upstream.
struct GraphRuntimePrivateStuffWeNeed
{
/*! \brief operator attributes about tvm op */
struct TVMOpParam
{
std::string func_name;
uint32_t num_inputs;
uint32_t num_outputs;
uint32_t flatten_data;
};
// Entry in the memory pool: byte size plus the DLDeviceType it lives on.
struct PoolEntry
{
size_t size;
int device_type;
PoolEntry(int s, int dev_type)
: size(s)
, device_type(dev_type)
{
}
};
// Node entry
// A reference to one output of one node: (node_id, output index, version).
struct NodeEntry
{
uint32_t node_id;
uint32_t index;
uint32_t version;
// JSON Loader
// Parses the 2- or 3-element array form [node_id, index(, version)];
// version defaults to 0 when the third element is absent.
void Load(dmlc::JSONReader* reader)
{
reader->BeginArray();
CHECK(reader->NextArrayItem()) << "invalid json format";
reader->Read(&node_id);
CHECK(reader->NextArrayItem()) << "invalid json format";
reader->Read(&index);
if (reader->NextArrayItem())
{
reader->Read(&version);
// A fourth element would be malformed.
CHECK(!reader->NextArrayItem()) << "invalid json format";
}
else
{
version = 0;
}
}
};
// Node
struct Node
{
// operator type in string
std::string op_type;
// name of the op
std::string name;
// parameters
TVMOpParam param;
// inputs
std::vector<NodeEntry> inputs;
// control deps
std::vector<uint32_t> control_deps;
// JSON Loader
// Parses the node's "attr"/"attrs" object into *param. The bitmask records
// which of the four required keys were seen (1=func_name, 2=num_inputs,
// 4=num_outputs, 8=flatten_data); all four must be present.
void LoadAttrs(dmlc::JSONReader* reader, TVMOpParam* param)
{
int bitmask = 0;
std::string key, value;
reader->BeginObject();
while (reader->NextObjectItem(&key))
{
// All attr values arrive as JSON strings; numeric ones are parsed below.
reader->Read(&value);
if (key == "func_name")
{
param->func_name = value;
bitmask |= 1;
}
else if (key == "num_inputs")
{
param->num_inputs = strtoul(value.c_str(), nullptr, 10);
bitmask |= 2;
}
else if (key == "num_outputs")
{
param->num_outputs = strtoul(value.c_str(), nullptr, 10);
bitmask |= 4;
}
else if (key == "flatten_data")
{
param->flatten_data = strtoul(value.c_str(), nullptr, 10);
bitmask |= 8;
}
}
CHECK_EQ(bitmask, 1 | 2 | 4 | 8) << "invalid format";
}
// JSON Loader
// Parses one node object. Required keys: "op" (1), "name" (2), "inputs" (4);
// "attr"/"attrs" and "control_deps" are optional; anything else is fatal.
void Load(dmlc::JSONReader* reader)
{
reader->BeginObject();
int bitmask = 0;
std::string key;
while (reader->NextObjectItem(&key))
{
if (key == "op")
{
reader->Read(&op_type);
bitmask |= 1;
}
else if (key == "name")
{
reader->Read(&name);
bitmask |= 2;
}
else if (key == "inputs")
{
reader->Read(&inputs);
bitmask |= 4;
}
else if (key == "attr" || key == "attrs")
{
this->LoadAttrs(reader, &param);
}
else if (key == "control_deps")
{
reader->Read(&control_deps);
}
else
{
LOG(FATAL) << "do not support key " << key;
}
}
CHECK_EQ(bitmask, 1 | 2 | 4) << "invalid format";
}
};
// Per-entry graph metadata: for each node entry its storage slot, optional
// device index, dtype name, and shape. Parallel arrays indexed by entry id.
struct GraphAttr
{
// NOTE: typo ("alloctaed") preserved from upstream TVM for diff parity.
size_t storage_num_not_alloctaed{ 0 };
std::vector<int> storage_id;
std::vector<int> device_index;
std::vector<std::string> dltype;
std::vector<std::vector<int64_t>> shape;
// The graph attribute fields.
// Each attrs value is encoded as ["<type tag>", <payload>]; the tag is
// checked before the payload is read. Required: dltype (1), storage_id (2),
// shape (4); device_index is optional; unknown list_int/size_t attrs are
// read and discarded, anything else is fatal.
void Load(dmlc::JSONReader* reader)
{
reader->BeginObject();
int bitmask = 0;
std::string key, type;
while (reader->NextObjectItem(&key))
{
if (key == "dltype")
{
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_str");
CHECK(reader->NextArrayItem());
reader->Read(&dltype);
CHECK(!reader->NextArrayItem());
bitmask |= 1;
}
else if (key == "storage_id")
{
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_int");
CHECK(reader->NextArrayItem());
reader->Read(&storage_id);
CHECK(!reader->NextArrayItem());
bitmask |= 2;
}
else if (key == "shape")
{
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_shape");
CHECK(reader->NextArrayItem());
reader->Read(&shape);
CHECK(!reader->NextArrayItem());
bitmask |= 4;
}
else if (key == "device_index")
{
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_int");
CHECK(reader->NextArrayItem());
reader->Read(&device_index);
CHECK(!reader->NextArrayItem());
}
else
{
// Unknown attribute: consume it so the stream stays positioned,
// but only tolerate the two simple encodings we know how to skip.
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
if (type == "list_int")
{
CHECK(reader->NextArrayItem());
std::vector<int> temp;
reader->Read(&temp);
}
else if (type == "size_t")
{
CHECK(reader->NextArrayItem());
size_t temp;
reader->Read(&temp);
}
else
{
LOG(FATAL) << "cannot skip graph attr " << key;
}
CHECK(!reader->NextArrayItem());
}
}
CHECK_EQ(bitmask, 1 | 2 | 4) << "invalid format";
}
};
// The graph attribute fields.
// Top-level loader for the graph JSON object. All five keys are required:
// nodes (1), arg_nodes (2), node_row_ptr (4), heads (8), attrs (16).
void Load(dmlc::JSONReader* reader)
{
reader->BeginObject();
int bitmask = 0;
std::string key;
while (reader->NextObjectItem(&key))
{
if (key == "nodes")
{
reader->Read(&nodes_);
bitmask |= 1;
}
else if (key == "arg_nodes")
{
reader->Read(&input_nodes_);
bitmask |= 2;
}
else if (key == "node_row_ptr")
{
reader->Read(&node_row_ptr_);
bitmask |= 4;
}
else if (key == "heads")
{
reader->Read(&outputs_);
bitmask |= 8;
}
else if (key == "attrs")
{
reader->Read(&attrs_);
bitmask |= 16;
}
else
{
LOG(FATAL) << "key " << key << " is not supported";
}
}
CHECK_EQ(bitmask, 1 | 2 | 4 | 8 | 16) << "invalid format";
}
// Get node entry index.
// Flattens (node, output index) into a global entry id via node_row_ptr_.
uint32_t entry_id(uint32_t nid, uint32_t index) const
{
return node_row_ptr_[nid] + index;
}
// Get node entry index.
uint32_t entry_id(const NodeEntry& e) const
{
return entry_id(e.node_id, e.index);
}
// Number of node entries.
// Precondition: node_row_ptr_ is non-empty (i.e. Load() has succeeded).
uint32_t num_node_entries() const
{
return node_row_ptr_.back();
}
/*! \brief The graph nodes. */
std::vector<Node> nodes_;
/*! \brief The argument nodes. */
std::vector<uint32_t> input_nodes_;
/*! \brief Used for quick entry indexing. */
std::vector<uint32_t> node_row_ptr_;
/*! \brief Output entries. */
std::vector<NodeEntry> outputs_;
/*! \brief Additional graph attributes. */
GraphAttr attrs_;
/*! \brief The code module that contains both host and device code. */
tvm::runtime::Module module_;
/*! \brief Execution context of all devices including the host. */
std::vector<TVMContext> ctxs_;
/*! \brief Common storage pool for all devices. */
std::vector<NDArray> storage_pool_;
/*! \brief Data entry of each node. */
std::vector<NDArray> data_entry_;
};
#endif // TUI_SAVE_LAYERS
// Load a TVM-compiled model (from_mxnet.so given on the command line, plus
// from_mxnet.json / from_mxnet.params from the working directory), run one
// 1x3x224x224 float image ("cat.bin") through it, and verify the argmax of the
// 1x1000 output vector is class 282 ('tiger cat' in synset.txt).
// Returns 0 on success, 1 on any failure.
int main(int argc, char** argv) try
{
    using Clock = std::chrono::high_resolution_clock;
    using Duration = std::chrono::duration<double>;

    if (argc < 2)
    {
        std::cerr << "usage: tvm_deploy_gpu_sample /full/path/to/from_mxnet.so " << std::endl;
        return 1;
    }
    std::string lib = argv[1];
    const std::string json_file("from_mxnet.json");
    const std::string param_file("from_mxnet.params");

    tvm::runtime::Module mod_syslib = tvm::runtime::Module::LoadFromFile(lib);

#if TUI_SAVE_LAYERS
    GraphRuntimePrivateStuffWeNeed info;
#endif

    // Read the graph JSON (optionally parsing it first for per-layer metadata).
    std::string json_data;
    {
        std::ifstream json_in(json_file.c_str(), std::ios::in);
        if (!json_in)
        {
            std::cerr << "Failed to read json file " << json_file << std::endl;
            return 1;
        }
#if TUI_SAVE_LAYERS
        dmlc::JSONReader json(&json_in);
        info.Load(&json);
        json_in.clear();                 // clear fail and eof bits
        json_in.seekg(0, std::ios::beg); // back to the start
#endif
        json_data.assign((std::istreambuf_iterator<char>(json_in)), std::istreambuf_iterator<char>());
    }

    // Read the serialized parameter blob.
    std::string params_data;
    {
        std::ifstream params_in(param_file.c_str(), std::ios::binary);
        if (!params_in)
        {
            std::cerr << "Failed to read param file " << param_file << std::endl;
            return 1;
        }
        params_data.assign((std::istreambuf_iterator<char>(params_in)), std::istreambuf_iterator<char>());
    }

    // params_arr aliases params_data; keep params_data alive past load_params().
    TVMByteArray params_arr;
    params_arr.data = params_data.data();
    params_arr.size = params_data.length();

    constexpr int dtype_code = kDLFloat;
    constexpr int dtype_bits = 32;
    constexpr int dtype_lanes = 1;

    // Device selection is fixed at compile time by the TVM_*_RUNTIME macro the
    // build enabled (see CMakeLists.txt TUI_USE_* options).
#if defined(TVM_OPENCL_RUNTIME)
    constexpr int device_type = kDLOpenCL;
#elif defined(TVM_OPENGL_RUNTIME)
    constexpr int device_type = 11; // kDLOpenGL;
#elif defined(TVM_VULKAN_RUNTIME)
    constexpr int device_type = kDLVulkan;
#elif defined(TVM_METAL_RUNTIME)
    constexpr int device_type = kDLMetal;
#elif defined(TVM_CUDA_RUNTIME)
    constexpr int device_type = kDLGPU;
#elif defined(TVM_CPU_RUNTIME)
    constexpr int device_type = kDLCPU;
#endif
    std::cout << "device_type " << int(device_type) << std::endl;
    constexpr int device_id = 0;

    // The debug graph runtime exposes get_output_by_layer / debug_get_output.
    //const char * runtime = "tvm.graph_runtime.create";
    const char* runtime = "tvm.graph_runtime_debug.create";
    tvm::runtime::Module mod = (*tvm::runtime::Registry::Get(runtime))(json_data, mod_syslib, device_type, device_id);

    std::cout << "load_params" << std::endl;
    tvm::runtime::PackedFunc load_params = mod.GetFunction("load_params");
    load_params(params_arr);

    DLTensor* x = nullptr;
    DLTensor* y = nullptr;
    const int n_samples = 1;

    // Configure input tensor for single 1x3x224x224 RGB image (floating point)
    const int in_ndim = 4;
    const int64_t in_shape[] = { 1, 3, 224, 224 };
    TVMArrayAlloc(in_shape, in_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &x);
    const size_t in_size = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3];
    std::vector<float> tvm_input(1 * in_size, 0);

    // load image data saved in binary to tvm_input array
    // (raw floats, assumed already preprocessed — TODO confirm layout/scaling)
    std::ifstream data_fin("cat.bin", std::ios::binary);
    if (!data_fin)
    {
        std::cerr << "Failed to read input file cat.bin" << std::endl;
        return 1;
    }
    data_fin.read(reinterpret_cast<char*>(tvm_input.data()), in_size * sizeof(float));

    // Configure output tensor for 1x1000 softmax class "probability" vector
    // (BUG FIX: comment previously said 1x100)
    const int out_ndim = 2;
    const int64_t out_shape[] = { 1, 1000 };
    TVMArrayAlloc(out_shape, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);
    const size_t out_size = out_shape[0] * out_shape[1];
    std::vector<float> tvm_output(1 * out_size, 0);

    tvm::runtime::PackedFunc set_input = mod.GetFunction("set_input");
    tvm::runtime::PackedFunc run = mod.GetFunction("run");
    tvm::runtime::PackedFunc get_output = mod.GetFunction("get_output");
    tvm::runtime::PackedFunc get_output_by_layer = mod.GetFunction("get_output_by_layer");
#if TUI_TEST_DEBUG_GET_OUTPUT
    tvm::runtime::PackedFunc debug_get_output = mod.GetFunction("debug_get_output");
#endif

    const int warmup = 10;
    int count = 0;
    double total = 0.0;
    for (int i = 0; i < n_samples; ++i)
    {
        std::cout << "iteration " << i << std::endl;
        std::cout << "set_input(data, x)" << std::endl;
        TVMArrayCopyFromBytes(x, tvm_input.data(), in_size * sizeof(float));
        set_input("data", x);

        std::cout << "run()" << std::endl;
        auto tic = Clock::now();
        run();
        auto toc = Clock::now();
        auto elapsed = Duration(toc - tic).count();
        // BUG FIX: was `i > warmup`, which discarded warmup+1 iterations.
        if (i >= warmup)
        {
            count++;
            total += elapsed;
            std::cout << "elapsed: " << elapsed << std::endl;
        }

        std::cout << "get_output(0, y)" << std::endl;
        get_output(0, y);

#if TUI_SAVE_LAYERS
        // Dump every layer's output tensor to tvm_NNNN_<name>.txt for offline diffing.
        for (int j = 0, k = 0; j < info.attrs_.shape.size(); j += 1, k++)
        {
            DLTensor* layer_output = nullptr;
            DLTensor* layer_output2 = nullptr;
            auto& shape = info.attrs_.shape[j];
            // Element count = product of this layer's dims.
            // (Renamed from `total`, which shadowed the timing accumulator.)
            std::size_t elem_count = shape.front();
            for (auto iter = shape.begin() + 1; iter != shape.end(); iter++)
            {
                elem_count *= (*iter);
            }
            std::cout << "N=" << k << " total = " << elem_count << std::endl;
            std::cout << "get_output_by_layer(" << j << ", layer_output);" << std::endl;
            std::vector<float> values(elem_count);
            layer_output = get_output_by_layer(j, 0);
            TVMArrayCopyToBytes(layer_output, values.data(), values.size() * sizeof(float));
            //TVMArrayFree(layer_output); // needed? NOTE(review): likely owned by the runtime — confirm
#if TUI_TEST_DEBUG_GET_OUTPUT
            // debug_get_output requires a pre-allocated destination tensor:
            TVMArrayAlloc(shape.data(), shape.size(), dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &layer_output2);
            std::cout << "debug_get_output(" << j << ", layer_output);" << std::endl;
            debug_get_output(j, layer_output2);
            std::vector<float> values2(elem_count);
            TVMArrayCopyToBytes(layer_output2, values2.data(), values2.size() * sizeof(float));
            TVMArrayFree(layer_output2);
#endif // TUI_TEST_DEBUG_GET_OUTPUT
            std::stringstream ss;
            ss << "tvm_" << std::setw(4) << std::setfill('0') << j << '_' << info.nodes_[j].name << ".txt";
            std::ofstream ofs(ss.str());
            if (ofs)
            {
                for (std::size_t v = 0; v < values.size(); v++)
                {
                    std::cout << "value[" << v << "] = "
                              << values[v]
#if TUI_TEST_DEBUG_GET_OUTPUT
                              << " "
                              << values2[v]
#endif
                              << std::endl;
                    ofs << values[v] << std::endl;
                }
            }
        }
#endif // TUI_SAVE_LAYERS

        std::cout << "TVMArrayCopyToBytes(y, y_iter, out_size * sizeof(float));" << std::endl;
        float* y_iter = tvm_output.data();
        TVMArrayCopyToBytes(y, y_iter, out_size * sizeof(float));
        for (std::size_t s = 0; s < tvm_output.size(); s++)
        {
            std::cout << "score[" << s << "] = " << tvm_output[s] << std::endl;
        }

        // get the maximum position in output vector
        // BUG FIX: bound was hardcoded as 1000; use out_size so it tracks out_shape.
        auto max_iter = std::max_element(y_iter, y_iter + out_size);
        auto max_index = std::distance(y_iter, max_iter);
        std::cout << "The maximum position in output vector is: " << max_index << std::endl;
        if (max_index != 282) // 282: 'tiger cat' (see synset.txt)
        {
            std::cerr << "Expected 282 but got: " << max_index << std::endl;
            // BUG FIX: was exit(1), which skipped destructors and leaked x/y.
            TVMArrayFree(x);
            TVMArrayFree(y);
            return 1;
        }
    }

    TVMArrayFree(x);
    TVMArrayFree(y);

    // BUG FIX: with n_samples <= warmup, count is 0 and the old code printed
    // total/0 (NaN). Only report an average when timed iterations exist.
    if (count > 0)
    {
        std::cout << "average: " << total / static_cast<double>(count) << std::endl;
    }
    return 0; // BUG FIX: was exit(0), which skipped local destructors
}
catch (const dmlc::Error& e)
{
    std::cerr << "error: " << e.what() << std::endl;
    return 1; // BUG FIX: handler previously fell off the end without a status
}
catch (const std::exception& e)
{
    std::cerr << "exception: " << e.what() << std::endl;
    return 1; // BUG FIX: handler previously fell off the end without a status
}
/*!
* \brief This is an all in one TVM runtime file.
*
* You only have to use this file to compile libtvm_runtime to
* include in your project.
*
* - Copy this file into your project which depends on tvm runtime.
* - Compile with -std=c++11
* - Add the following include path
* - /path/to/tvm/include/
* - /path/to/tvm/3rdparty/dmlc-core/include/
* - /path/to/tvm/3rdparty/dlpack/include/
* - Add -lpthread -ldl to the linked library.
* - You are good to go.
* - See the Makefile in the same folder for example.
*
* The include files here are presented with relative path
* You need to remember to change it to point to the right file.
*
*/
#include "../../src/runtime/c_runtime_api.cc"
#include "../../src/runtime/cpu_device_api.cc"
#include "../../src/runtime/workspace_pool.cc"
#include "../../src/runtime/module_util.cc"
#include "../../src/runtime/module.cc"
#include "../../src/runtime/registry.cc"
#include "../../src/runtime/file_util.cc"
#include "../../src/runtime/threading_backend.cc"
#include "../../src/runtime/thread_pool.cc"
#include "../../src/runtime/ndarray.cc"
// NOTE: all the files after this are optional modules
// that you can include or remove, depending on how many features you use.
// Likely we only need to enable one of the following
// If you use Module::Load, use dso_module
// For system packed library, use system_lib_module
#include "../../src/runtime/dso_module.cc"
#include "../../src/runtime/system_lib_module.cc"
// Graph runtime
#include "../../src/runtime/graph/graph_runtime.cc"
#if defined(TVM_USE_GRAPH_RUNTIME_DEBUG)
# include "../../src/runtime/graph/debug/graph_runtime_debug.cc"
#endif
#if defined(TVM_USE_RPC)
# include "../../src/runtime/rpc/rpc_session.cc"
# include "../../src/runtime/rpc/rpc_event_impl.cc"
# include "../../src/runtime/rpc/rpc_server_env.cc"
#endif
#if defined(TVM_CUDA_RUNTIME)
# include "../../src/runtime/cuda/cuda_device_api.cc"
# include "../../src/runtime/cuda/cuda_module.cc"
#endif
#if defined(TVM_METAL_RUNTIME)
# include "../../src/runtime/metal/metal_device_api.mm"
# include "../../src/runtime/metal/metal_module.mm"
#endif
#if defined(TVM_OPENCL_RUNTIME)
# include "../../src/runtime/opencl/opencl_device_api.cc"
# include "../../src/runtime/opencl/opencl_module.cc"
#endif
#if defined(TVM_OPENGL_RUNTIME)
# include "../../src/runtime/opengl/opengl_device_api.cc"
# include "../../src/runtime/opengl/opengl_module.cc"
#endif
#if defined(TVM_VULKAN_RUNTIME)
# include "../../src/runtime/vulkan/vulkan_device_api.cc"
# include "../../src/runtime/vulkan/vulkan_module.cc"
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment