Instantly share code, notes, and snippets.

Embed
What would you like to do?
TVM C++ deployment example: CMakeLists.txt, tvm_deploy_gpu_sample.cpp, and a locally adapted tvm_runtime_pack.cc
# BUG FIX: 3.2 is too low for the imported targets used below
# (OpenGL::OpenGL requires CMake 3.10, Vulkan::Vulkan 3.7, OpenCL::OpenCL 3.1).
cmake_minimum_required(VERSION 3.10)
project(tvm-deploy-gpu-sample)

# Root of the TVM checkout (previously hardcoded in three places).
# Override with -DTVM_ROOT=/path/to/tvm for other layouts.
set(TVM_ROOT "/dl/mxnet/3rdparty/tvm" CACHE PATH "Path to the TVM source tree")

# NOTE(review): directory-scoped include_directories kept because both the
# sample executable and tvm_runtime_pack.cc rely on these headers; prefer
# target_include_directories in new code.
include_directories(${TVM_ROOT}/3rdparty/dlpack/include)
include_directories(${TVM_ROOT}/3rdparty/dmlc-core/include)
include_directories(${TVM_ROOT}/include)
# Debug helper: print every currently defined CMake variable, sorted by name,
# as "name=value" STATUS messages. No arguments; no side effects.
function(print_cmake_vars)
  get_cmake_property(variable_names VARIABLES)
  list(SORT variable_names)
  foreach(name IN LISTS variable_names)
    message(STATUS "${name}=${${name}}")
  endforeach()
endfunction()
# Choose between linking a prebuilt shared libtvm_runtime and compiling the
# runtime directly into the executable via the amalgamated tvm_runtime_pack.cc.
option(TUI_USE_SHARED_TVM_RUNTIME "use shared tvm runtime lib" OFF)

if(TUI_USE_SHARED_TVM_RUNTIME)
  add_executable(tvm_deploy_gpu_sample tvm_deploy_gpu_sample.cpp)
  # NOTE(review): hardcoded absolute path to a prebuilt runtime; consider
  # wrapping it in an IMPORTED target keyed off a cache variable.
  target_link_libraries(
    tvm_deploy_gpu_sample
    PUBLIC
    /dl/mxnet/3rdparty/tvm/_builds/libtvm_runtime.so
  )
else()
  # add include so that relative paths in the upstream tvm_runtime_pack.cc will work
  include_directories(/dl/mxnet/3rdparty/tvm/apps/howto_deploy)
  add_executable(
    tvm_deploy_gpu_sample
    tvm_deploy_gpu_sample.cpp
    # Adopt tvm_runtime_pack.cc for local modifications
    tvm_runtime_pack.cc
    # This could eventually go back here:
    #/dl/mxnet/3rdparty/tvm/apps/howto_deploy/tvm_runtime_pack.cc
  )

  # The embedded runtime needs a threading backend and dlopen support.
  find_package(Threads REQUIRED)
  target_link_libraries(tvm_deploy_gpu_sample PUBLIC Threads::Threads ${CMAKE_DL_LIBS})

  # Each TUI_USE_* option enables exactly one device backend; the matching
  # TVM_*_RUNTIME define selects both the runtime sources compiled into
  # tvm_runtime_pack.cc and the device_type used in tvm_deploy_gpu_sample.cpp.
  option(TUI_USE_CPU "Use cpu runtime" OFF)
  if(TUI_USE_CPU)
    # currently nothing is required beyond the preprocessor switch
    target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_CPU_RUNTIME=1)
  endif()

  option(TUI_USE_OPENGL "Use opengl runtime" OFF)
  if(TUI_USE_OPENGL)
    find_package(OpenGL REQUIRED)
    target_link_libraries(tvm_deploy_gpu_sample PUBLIC OpenGL::OpenGL)
    # BUG FIX: glfw3 was found without REQUIRED but the glfw target was linked
    # unconditionally; fail clearly at configure time instead of at link time.
    find_package(glfw3 REQUIRED)
    target_link_libraries(tvm_deploy_gpu_sample PUBLIC glfw)
    target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_OPENGL_RUNTIME=1)
  endif()

  option(TUI_USE_OPENCL "Use opencl runtime" OFF)
  if(TUI_USE_OPENCL)
    find_package(OpenCL REQUIRED)
    target_link_libraries(tvm_deploy_gpu_sample PUBLIC OpenCL::OpenCL)
    target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_OPENCL_RUNTIME=1)
  endif()

  option(TUI_USE_VULKAN "Use vulkan runtime" OFF)
  if(TUI_USE_VULKAN)
    find_package(Vulkan REQUIRED)
    target_link_libraries(tvm_deploy_gpu_sample PUBLIC Vulkan::Vulkan)
    target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_VULKAN_RUNTIME=1)
  endif()

  option(TUI_USE_CUDA "Use cuda runtime" OFF)
  if(TUI_USE_CUDA)
    find_package(CUDA REQUIRED)
    #print_cmake_vars()
    target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_CUDA_RUNTIME=1)
    target_include_directories(tvm_deploy_gpu_sample PUBLIC ${CUDA_INCLUDE_DIRS})
    target_link_libraries(tvm_deploy_gpu_sample PUBLIC
      ${CUDA_CUDA_LIBRARY}
      ${CUDA_CUDART_LIBRARY}
      ${CUDA_CURAND_LIBRARY})
  endif()

  option(TUI_USE_METAL "Use metal runtime" OFF)
  if(TUI_USE_METAL)
    find_package(Metal REQUIRED)
    target_link_libraries(tvm_deploy_gpu_sample PUBLIC Metal::Metal)
    target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_METAL_RUNTIME=1)
  endif()
endif()

option(TUI_USE_GRAPH_RUNTIME_DEBUG "use debug runtime" ON)
# BUG FIX: the definition was previously applied unconditionally, making the
# option above a no-op. Honor the option.
if(TUI_USE_GRAPH_RUNTIME_DEBUG)
  target_compile_definitions(tvm_deploy_gpu_sample PUBLIC TVM_USE_GRAPH_RUNTIME_DEBUG=1)
endif()

enable_testing()
# COMMAND uses the target name, which CMake expands to the built binary path.
add_test(NAME tvm_deploy_gpu_sample COMMAND tvm_deploy_gpu_sample)
#define TUI_SAVE_LAYERS 0
#define TUI_TEST_DEBUG_GET_OUTPUT 1
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>

#include <dlpack/dlpack.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/packed_func.h>
#if TUI_SAVE_LAYERS
#include <dmlc/json.h>
#include <tvm/runtime/ndarray.h>
using tvm::runtime::NDArray;
/*
Here we borrow a partial implementation of GraphRuntime from here:
tvm/src/runtime/graph/graph_runtime.h
Since there doesn't seem to be a public hook to access the layer info
that is required for logging layer output at runtime. In particular,
we need access to:
1) the number of layers : needed to iterate over all layers
2) the layer attributes : needed to allocate a DLTensor * if using debug_get_output
This info is all contained in private member variables of GraphRuntime after
loading the JSON graph, so we simply duplicate the parts we need here for
the C++ test and serialization.
*/
// Partial mirror of tvm::runtime::GraphRuntime's private state, copied from
// tvm/src/runtime/graph/graph_runtime.h so this test can load the graph JSON
// itself and enumerate layers. Kept intentionally close to upstream (including
// member names) so diffs against the TVM source stay small — do not "fix"
// naming here without re-syncing against upstream.
struct GraphRuntimePrivateStuffWeNeed
{
/*! \brief operator attributes about tvm op */
struct TVMOpParam
{
std::string func_name;
uint32_t num_inputs;
uint32_t num_outputs;
uint32_t flatten_data;
};
// Entry in the memory pool: byte size plus the DLDeviceType it lives on.
struct PoolEntry
{
size_t size;
int device_type;
PoolEntry(int s, int dev_type)
: size(s)
, device_type(dev_type)
{
}
};
// Node entry
// A reference to one output of one node: (node_id, output index, version).
struct NodeEntry
{
uint32_t node_id;
uint32_t index;
uint32_t version;
// JSON Loader
// Parses the 2- or 3-element array form [node_id, index(, version)];
// version defaults to 0 when the third element is absent.
void Load(dmlc::JSONReader* reader)
{
reader->BeginArray();
CHECK(reader->NextArrayItem()) << "invalid json format";
reader->Read(&node_id);
CHECK(reader->NextArrayItem()) << "invalid json format";
reader->Read(&index);
if (reader->NextArrayItem())
{
reader->Read(&version);
// A fourth element would be malformed.
CHECK(!reader->NextArrayItem()) << "invalid json format";
}
else
{
version = 0;
}
}
};
// Node
struct Node
{
// operator type in string
std::string op_type;
// name of the op
std::string name;
// parameters
TVMOpParam param;
// inputs
std::vector<NodeEntry> inputs;
// control deps
std::vector<uint32_t> control_deps;
// JSON Loader
// Parses the node's "attr"/"attrs" object into *param. The bitmask records
// which of the four required keys were seen (1=func_name, 2=num_inputs,
// 4=num_outputs, 8=flatten_data); all four must be present.
void LoadAttrs(dmlc::JSONReader* reader, TVMOpParam* param)
{
int bitmask = 0;
std::string key, value;
reader->BeginObject();
while (reader->NextObjectItem(&key))
{
// All attr values arrive as JSON strings; numeric ones are parsed below.
reader->Read(&value);
if (key == "func_name")
{
param->func_name = value;
bitmask |= 1;
}
else if (key == "num_inputs")
{
param->num_inputs = strtoul(value.c_str(), nullptr, 10);
bitmask |= 2;
}
else if (key == "num_outputs")
{
param->num_outputs = strtoul(value.c_str(), nullptr, 10);
bitmask |= 4;
}
else if (key == "flatten_data")
{
param->flatten_data = strtoul(value.c_str(), nullptr, 10);
bitmask |= 8;
}
}
CHECK_EQ(bitmask, 1 | 2 | 4 | 8) << "invalid format";
}
// JSON Loader
// Parses one node object. Required keys: "op" (1), "name" (2), "inputs" (4);
// "attr"/"attrs" and "control_deps" are optional; anything else is fatal.
void Load(dmlc::JSONReader* reader)
{
reader->BeginObject();
int bitmask = 0;
std::string key;
while (reader->NextObjectItem(&key))
{
if (key == "op")
{
reader->Read(&op_type);
bitmask |= 1;
}
else if (key == "name")
{
reader->Read(&name);
bitmask |= 2;
}
else if (key == "inputs")
{
reader->Read(&inputs);
bitmask |= 4;
}
else if (key == "attr" || key == "attrs")
{
this->LoadAttrs(reader, &param);
}
else if (key == "control_deps")
{
reader->Read(&control_deps);
}
else
{
LOG(FATAL) << "do not support key " << key;
}
}
CHECK_EQ(bitmask, 1 | 2 | 4) << "invalid format";
}
};
// Per-entry graph metadata: for each node entry its storage slot, optional
// device index, dtype name, and shape. Parallel arrays indexed by entry id.
struct GraphAttr
{
// NOTE: typo ("alloctaed") preserved from upstream TVM for diff parity.
size_t storage_num_not_alloctaed{ 0 };
std::vector<int> storage_id;
std::vector<int> device_index;
std::vector<std::string> dltype;
std::vector<std::vector<int64_t>> shape;
// The graph attribute fields.
// Each attrs value is encoded as ["<type tag>", <payload>]; the tag is
// checked before the payload is read. Required: dltype (1), storage_id (2),
// shape (4); device_index is optional; unknown list_int/size_t attrs are
// read and discarded, anything else is fatal.
void Load(dmlc::JSONReader* reader)
{
reader->BeginObject();
int bitmask = 0;
std::string key, type;
while (reader->NextObjectItem(&key))
{
if (key == "dltype")
{
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_str");
CHECK(reader->NextArrayItem());
reader->Read(&dltype);
CHECK(!reader->NextArrayItem());
bitmask |= 1;
}
else if (key == "storage_id")
{
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_int");
CHECK(reader->NextArrayItem());
reader->Read(&storage_id);
CHECK(!reader->NextArrayItem());
bitmask |= 2;
}
else if (key == "shape")
{
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_shape");
CHECK(reader->NextArrayItem());
reader->Read(&shape);
CHECK(!reader->NextArrayItem());
bitmask |= 4;
}
else if (key == "device_index")
{
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
CHECK_EQ(type, "list_int");
CHECK(reader->NextArrayItem());
reader->Read(&device_index);
CHECK(!reader->NextArrayItem());
}
else
{
// Unknown attribute: consume it so the stream stays positioned,
// but only tolerate the two simple encodings we know how to skip.
reader->BeginArray();
CHECK(reader->NextArrayItem());
reader->Read(&type);
if (type == "list_int")
{
CHECK(reader->NextArrayItem());
std::vector<int> temp;
reader->Read(&temp);
}
else if (type == "size_t")
{
CHECK(reader->NextArrayItem());
size_t temp;
reader->Read(&temp);
}
else
{
LOG(FATAL) << "cannot skip graph attr " << key;
}
CHECK(!reader->NextArrayItem());
}
}
CHECK_EQ(bitmask, 1 | 2 | 4) << "invalid format";
}
};
// The graph attribute fields.
// Top-level loader for the graph JSON object. All five keys are required:
// nodes (1), arg_nodes (2), node_row_ptr (4), heads (8), attrs (16).
void Load(dmlc::JSONReader* reader)
{
reader->BeginObject();
int bitmask = 0;
std::string key;
while (reader->NextObjectItem(&key))
{
if (key == "nodes")
{
reader->Read(&nodes_);
bitmask |= 1;
}
else if (key == "arg_nodes")
{
reader->Read(&input_nodes_);
bitmask |= 2;
}
else if (key == "node_row_ptr")
{
reader->Read(&node_row_ptr_);
bitmask |= 4;
}
else if (key == "heads")
{
reader->Read(&outputs_);
bitmask |= 8;
}
else if (key == "attrs")
{
reader->Read(&attrs_);
bitmask |= 16;
}
else
{
LOG(FATAL) << "key " << key << " is not supported";
}
}
CHECK_EQ(bitmask, 1 | 2 | 4 | 8 | 16) << "invalid format";
}
// Get node entry index.
// Flattens (node, output index) into a global entry id via node_row_ptr_.
uint32_t entry_id(uint32_t nid, uint32_t index) const
{
return node_row_ptr_[nid] + index;
}
// Get node entry index.
uint32_t entry_id(const NodeEntry& e) const
{
return entry_id(e.node_id, e.index);
}
// Number of node entries.
// Precondition: node_row_ptr_ is non-empty (i.e. Load() has succeeded).
uint32_t num_node_entries() const
{
return node_row_ptr_.back();
}
/*! \brief The graph nodes. */
std::vector<Node> nodes_;
/*! \brief The argument nodes. */
std::vector<uint32_t> input_nodes_;
/*! \brief Used for quick entry indexing. */
std::vector<uint32_t> node_row_ptr_;
/*! \brief Output entries. */
std::vector<NodeEntry> outputs_;
/*! \brief Additional graph attributes. */
GraphAttr attrs_;
/*! \brief The code module that contains both host and device code. */
tvm::runtime::Module module_;
/*! \brief Execution context of all devices including the host. */
std::vector<TVMContext> ctxs_;
/*! \brief Common storage pool for all devices. */
std::vector<NDArray> storage_pool_;
/*! \brief Data entry of each node. */
std::vector<NDArray> data_entry_;
};
#endif // TUI_SAVE_LAYERS
// Load a TVM-compiled model (from_mxnet.so given on the command line, plus
// from_mxnet.json / from_mxnet.params from the working directory), run one
// 1x3x224x224 float image ("cat.bin") through it, and verify the argmax of the
// 1x1000 output vector is class 282 ('tiger cat' in synset.txt).
// Returns 0 on success, 1 on any failure.
int main(int argc, char** argv) try
{
    using Clock = std::chrono::high_resolution_clock;
    using Duration = std::chrono::duration<double>;

    if (argc < 2)
    {
        std::cerr << "usage: tvm_deploy_gpu_sample /full/path/to/from_mxnet.so " << std::endl;
        return 1;
    }
    std::string lib = argv[1];
    const std::string json_file("from_mxnet.json");
    const std::string param_file("from_mxnet.params");

    tvm::runtime::Module mod_syslib = tvm::runtime::Module::LoadFromFile(lib);

#if TUI_SAVE_LAYERS
    GraphRuntimePrivateStuffWeNeed info;
#endif

    // Read the graph JSON (optionally parsing it first for per-layer metadata).
    std::string json_data;
    {
        std::ifstream json_in(json_file.c_str(), std::ios::in);
        if (!json_in)
        {
            std::cerr << "Failed to read json file " << json_file << std::endl;
            return 1;
        }
#if TUI_SAVE_LAYERS
        dmlc::JSONReader json(&json_in);
        info.Load(&json);
        json_in.clear();                 // clear fail and eof bits
        json_in.seekg(0, std::ios::beg); // back to the start
#endif
        json_data.assign((std::istreambuf_iterator<char>(json_in)), std::istreambuf_iterator<char>());
    }

    // Read the serialized parameter blob.
    std::string params_data;
    {
        std::ifstream params_in(param_file.c_str(), std::ios::binary);
        if (!params_in)
        {
            std::cerr << "Failed to read param file " << param_file << std::endl;
            return 1;
        }
        params_data.assign((std::istreambuf_iterator<char>(params_in)), std::istreambuf_iterator<char>());
    }

    // params_arr aliases params_data; keep params_data alive past load_params().
    TVMByteArray params_arr;
    params_arr.data = params_data.data();
    params_arr.size = params_data.length();

    constexpr int dtype_code = kDLFloat;
    constexpr int dtype_bits = 32;
    constexpr int dtype_lanes = 1;

    // Device selection is fixed at compile time by the TVM_*_RUNTIME macro the
    // build enabled (see CMakeLists.txt TUI_USE_* options).
#if defined(TVM_OPENCL_RUNTIME)
    constexpr int device_type = kDLOpenCL;
#elif defined(TVM_OPENGL_RUNTIME)
    constexpr int device_type = 11; // kDLOpenGL;
#elif defined(TVM_VULKAN_RUNTIME)
    constexpr int device_type = kDLVulkan;
#elif defined(TVM_METAL_RUNTIME)
    constexpr int device_type = kDLMetal;
#elif defined(TVM_CUDA_RUNTIME)
    constexpr int device_type = kDLGPU;
#elif defined(TVM_CPU_RUNTIME)
    constexpr int device_type = kDLCPU;
#endif
    std::cout << "device_type " << int(device_type) << std::endl;
    constexpr int device_id = 0;

    // The debug graph runtime exposes get_output_by_layer / debug_get_output.
    //const char * runtime = "tvm.graph_runtime.create";
    const char* runtime = "tvm.graph_runtime_debug.create";
    tvm::runtime::Module mod = (*tvm::runtime::Registry::Get(runtime))(json_data, mod_syslib, device_type, device_id);

    std::cout << "load_params" << std::endl;
    tvm::runtime::PackedFunc load_params = mod.GetFunction("load_params");
    load_params(params_arr);

    DLTensor* x = nullptr;
    DLTensor* y = nullptr;
    const int n_samples = 1;

    // Configure input tensor for single 1x3x224x224 RGB image (floating point)
    const int in_ndim = 4;
    const int64_t in_shape[] = { 1, 3, 224, 224 };
    TVMArrayAlloc(in_shape, in_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &x);
    const size_t in_size = in_shape[0] * in_shape[1] * in_shape[2] * in_shape[3];
    std::vector<float> tvm_input(1 * in_size, 0);

    // load image data saved in binary to tvm_input array
    // (raw floats, assumed already preprocessed — TODO confirm layout/scaling)
    std::ifstream data_fin("cat.bin", std::ios::binary);
    if (!data_fin)
    {
        std::cerr << "Failed to read input file cat.bin" << std::endl;
        return 1;
    }
    data_fin.read(reinterpret_cast<char*>(tvm_input.data()), in_size * sizeof(float));

    // Configure output tensor for 1x1000 softmax class "probability" vector
    // (BUG FIX: comment previously said 1x100)
    const int out_ndim = 2;
    const int64_t out_shape[] = { 1, 1000 };
    TVMArrayAlloc(out_shape, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);
    const size_t out_size = out_shape[0] * out_shape[1];
    std::vector<float> tvm_output(1 * out_size, 0);

    tvm::runtime::PackedFunc set_input = mod.GetFunction("set_input");
    tvm::runtime::PackedFunc run = mod.GetFunction("run");
    tvm::runtime::PackedFunc get_output = mod.GetFunction("get_output");
    tvm::runtime::PackedFunc get_output_by_layer = mod.GetFunction("get_output_by_layer");
#if TUI_TEST_DEBUG_GET_OUTPUT
    tvm::runtime::PackedFunc debug_get_output = mod.GetFunction("debug_get_output");
#endif

    const int warmup = 10;
    int count = 0;
    double total = 0.0;
    for (int i = 0; i < n_samples; ++i)
    {
        std::cout << "iteration " << i << std::endl;
        std::cout << "set_input(data, x)" << std::endl;
        TVMArrayCopyFromBytes(x, tvm_input.data(), in_size * sizeof(float));
        set_input("data", x);

        std::cout << "run()" << std::endl;
        auto tic = Clock::now();
        run();
        auto toc = Clock::now();
        auto elapsed = Duration(toc - tic).count();
        // BUG FIX: was `i > warmup`, which discarded warmup+1 iterations.
        if (i >= warmup)
        {
            count++;
            total += elapsed;
            std::cout << "elapsed: " << elapsed << std::endl;
        }

        std::cout << "get_output(0, y)" << std::endl;
        get_output(0, y);

#if TUI_SAVE_LAYERS
        // Dump every layer's output tensor to tvm_NNNN_<name>.txt for offline diffing.
        for (int j = 0, k = 0; j < info.attrs_.shape.size(); j += 1, k++)
        {
            DLTensor* layer_output = nullptr;
            DLTensor* layer_output2 = nullptr;
            auto& shape = info.attrs_.shape[j];
            // Element count = product of this layer's dims.
            // (Renamed from `total`, which shadowed the timing accumulator.)
            std::size_t elem_count = shape.front();
            for (auto iter = shape.begin() + 1; iter != shape.end(); iter++)
            {
                elem_count *= (*iter);
            }
            std::cout << "N=" << k << " total = " << elem_count << std::endl;
            std::cout << "get_output_by_layer(" << j << ", layer_output);" << std::endl;
            std::vector<float> values(elem_count);
            layer_output = get_output_by_layer(j, 0);
            TVMArrayCopyToBytes(layer_output, values.data(), values.size() * sizeof(float));
            //TVMArrayFree(layer_output); // needed? NOTE(review): likely owned by the runtime — confirm
#if TUI_TEST_DEBUG_GET_OUTPUT
            // debug_get_output requires a pre-allocated destination tensor:
            TVMArrayAlloc(shape.data(), shape.size(), dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &layer_output2);
            std::cout << "debug_get_output(" << j << ", layer_output);" << std::endl;
            debug_get_output(j, layer_output2);
            std::vector<float> values2(elem_count);
            TVMArrayCopyToBytes(layer_output2, values2.data(), values2.size() * sizeof(float));
            TVMArrayFree(layer_output2);
#endif // TUI_TEST_DEBUG_GET_OUTPUT
            std::stringstream ss;
            ss << "tvm_" << std::setw(4) << std::setfill('0') << j << '_' << info.nodes_[j].name << ".txt";
            std::ofstream ofs(ss.str());
            if (ofs)
            {
                for (std::size_t v = 0; v < values.size(); v++)
                {
                    std::cout << "value[" << v << "] = "
                              << values[v]
#if TUI_TEST_DEBUG_GET_OUTPUT
                              << " "
                              << values2[v]
#endif
                              << std::endl;
                    ofs << values[v] << std::endl;
                }
            }
        }
#endif // TUI_SAVE_LAYERS

        std::cout << "TVMArrayCopyToBytes(y, y_iter, out_size * sizeof(float));" << std::endl;
        float* y_iter = tvm_output.data();
        TVMArrayCopyToBytes(y, y_iter, out_size * sizeof(float));
        for (std::size_t s = 0; s < tvm_output.size(); s++)
        {
            std::cout << "score[" << s << "] = " << tvm_output[s] << std::endl;
        }

        // get the maximum position in output vector
        // BUG FIX: bound was hardcoded as 1000; use out_size so it tracks out_shape.
        auto max_iter = std::max_element(y_iter, y_iter + out_size);
        auto max_index = std::distance(y_iter, max_iter);
        std::cout << "The maximum position in output vector is: " << max_index << std::endl;
        if (max_index != 282) // 282: 'tiger cat' (see synset.txt)
        {
            std::cerr << "Expected 282 but got: " << max_index << std::endl;
            // BUG FIX: was exit(1), which skipped destructors and leaked x/y.
            TVMArrayFree(x);
            TVMArrayFree(y);
            return 1;
        }
    }

    TVMArrayFree(x);
    TVMArrayFree(y);

    // BUG FIX: with n_samples <= warmup, count is 0 and the old code printed
    // total/0 (NaN). Only report an average when timed iterations exist.
    if (count > 0)
    {
        std::cout << "average: " << total / static_cast<double>(count) << std::endl;
    }
    return 0; // BUG FIX: was exit(0), which skipped local destructors
}
catch (const dmlc::Error& e)
{
    std::cerr << "error: " << e.what() << std::endl;
    return 1; // BUG FIX: handler previously fell off the end without a status
}
catch (const std::exception& e)
{
    std::cerr << "exception: " << e.what() << std::endl;
    return 1; // BUG FIX: handler previously fell off the end without a status
}
/*!
* \brief This is an all in one TVM runtime file.
*
* You only have to use this file to compile libtvm_runtime to
* include in your project.
*
* - Copy this file into your project which depends on tvm runtime.
* - Compile with -std=c++11
* - Add the following include path
* - /path/to/tvm/include/
* - /path/to/tvm/3rdparty/dmlc-core/include/
* - /path/to/tvm/3rdparty/dlpack/include/
* - Add -lpthread -ldl to the linked library.
* - You are good to go.
* - See the Makefile in the same folder for example.
*
* The include files here are presented with relative path
* You need to remember to change it to point to the right file.
*
*/
#include "../../src/runtime/c_runtime_api.cc"
#include "../../src/runtime/cpu_device_api.cc"
#include "../../src/runtime/workspace_pool.cc"
#include "../../src/runtime/module_util.cc"
#include "../../src/runtime/module.cc"
#include "../../src/runtime/registry.cc"
#include "../../src/runtime/file_util.cc"
#include "../../src/runtime/threading_backend.cc"
#include "../../src/runtime/thread_pool.cc"
#include "../../src/runtime/ndarray.cc"
// NOTE: all the files after this are optional modules
// that you can include or remove, depending on how many features you use.
// Likely we only need to enable one of the following
// If you use Module::Load, use dso_module
// For system packed library, use system_lib_module
#include "../../src/runtime/dso_module.cc"
#include "../../src/runtime/system_lib_module.cc"
// Graph runtime
#include "../../src/runtime/graph/graph_runtime.cc"
#if defined(TVM_USE_GRAPH_RUNTIME_DEBUG)
# include "../../src/runtime/graph/debug/graph_runtime_debug.cc"
#endif
#if defined(TVM_USE_RPC)
# include "../../src/runtime/rpc/rpc_session.cc"
# include "../../src/runtime/rpc/rpc_event_impl.cc"
# include "../../src/runtime/rpc/rpc_server_env.cc"
#endif
#if defined(TVM_CUDA_RUNTIME)
# include "../../src/runtime/cuda/cuda_device_api.cc"
# include "../../src/runtime/cuda/cuda_module.cc"
#endif
#if defined(TVM_METAL_RUNTIME)
# include "../../src/runtime/metal/metal_device_api.mm"
# include "../../src/runtime/metal/metal_module.mm"
#endif
#if defined(TVM_OPENCL_RUNTIME)
# include "../../src/runtime/opencl/opencl_device_api.cc"
# include "../../src/runtime/opencl/opencl_module.cc"
#endif
#if defined(TVM_OPENGL_RUNTIME)
# include "../../src/runtime/opengl/opengl_device_api.cc"
# include "../../src/runtime/opengl/opengl_module.cc"
#endif
#if defined(TVM_VULKAN_RUNTIME)
# include "../../src/runtime/vulkan/vulkan_device_api.cc"
# include "../../src/runtime/vulkan/vulkan_module.cc"
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment