Skip to content

Instantly share code, notes, and snippets.

@fengyuentau
Created July 28, 2022 02:45
Show Gist options
  • Save fengyuentau/6a5a4946758e2640912ce6992cbaba4b to your computer and use it in GitHub Desktop.
Save fengyuentau/6a5a4946758e2640912ce6992cbaba4b to your computer and use it in GitHub Desktop.
Single operator inference with Ascend and OpenCV for input and output
#include "acl/acl.h"
#include "acl/ops/acl_cblas.h"
#include "acl/acl_op_compiler.h"
#include "opencv2/imgproc.hpp"
#include <algorithm> // for transform
#include <cstdint>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
// Extracts the value of the 'descr' field (the dtype string, e.g. "<f4")
// from an NPY header dictionary.
//
// @param header  Text of the NPY header dictionary.
// @return The text between the first pair of single quotes that follows
//         "'descr':", or an empty string when the field or either quote
//         is missing.
static std::string getType(const std::string& header)
{
    const std::string field = "'descr':";
    const std::string::size_type idx = header.find(field);
    if (idx == std::string::npos)
        return std::string();

    // Opening quote of the value; the value itself starts one past it.
    const std::string::size_type open = header.find('\'', idx + field.size());
    if (open == std::string::npos)
        return std::string();

    const std::string::size_type from = open + 1;
    const std::string::size_type to = header.find('\'', from);
    if (to == std::string::npos)
        return std::string();

    return header.substr(from, to - from);
}
// Extracts the value of the 'fortran_order' field ("True" or "False" in
// files written by NumPy) from an NPY header dictionary.
//
// The value is located by scanning forward from the colon, so it is found
// whether or not a space follows the colon.
//
// @param header  Text of the NPY header dictionary.
// @return The field's value with surrounding blanks removed, or an empty
//         string when the field is missing.
static std::string getFortranOrder(const std::string& header)
{
    const std::string field = "'fortran_order':";
    const std::string::size_type idx = header.find(field);
    if (idx == std::string::npos)
        return std::string();

    // Skip the blanks between the colon and the value.
    const std::string::size_type from =
        header.find_first_not_of(" \t", idx + field.size());
    if (from == std::string::npos)
        return std::string();

    // The value ends at the next separator, or at the closing brace when
    // it is the last entry. With neither present, take the rest.
    const std::string::size_type to = header.find_first_of(",}", from);
    std::string value = header.substr(
        from, to == std::string::npos ? std::string::npos : to - from);

    // Drop trailing blanks (e.g. "False ,").
    const std::string::size_type last = value.find_last_not_of(" \t");
    value.erase(last == std::string::npos ? 0 : last + 1);
    return value;
}
// Parses the 'shape' tuple of an NPY header dictionary.
//
// @param header  Text of the NPY header dictionary.
// @return The dimensions in the order they are written. An empty tuple
//         "()" yields {1}. An empty vector is returned when the field or
//         either of its parentheses is missing.
static std::vector<int> getShape(const std::string& header)
{
    const std::string field = "'shape':";
    const std::string::size_type idx = header.find(field);
    if (idx == std::string::npos)
        return std::vector<int>();

    const std::string::size_type open = header.find('(', idx + field.size());
    if (open == std::string::npos)
        return std::vector<int>();

    const std::string::size_type close = header.find(')', open + 1);
    if (close == std::string::npos)
        return std::vector<int>();

    std::string shapeStr = header.substr(open + 1, close - open - 1);
    if (shapeStr.empty())
        return std::vector<int>(1, 1);

    // Turn commas into blanks rather than deleting them: deleting would
    // fuse "1,3" into the single number 13 when no space follows a comma.
    std::replace(shapeStr.begin(), shapeStr.end(), ',', ' ');

    std::istringstream ss(shapeStr);
    std::vector<int> shape;
    int value;
    while (ss >> value)
        shape.push_back(value);
    return shape;
}
// Loads a float32 array stored in NumPy's .npy format into a cv::Mat.
//
// @param path  Path of the .npy file.
// @return A CV_32F matrix whose dimensions are taken from the header's
//         'shape' entry and whose data is read from the file.
// @throws std::runtime_error if the file cannot be opened, is not an NPY
//         file, is truncated, has no usable shape, or its header does not
//         mention the "<f4" dtype.
//
// NOTE(review): 'fortran_order' is not inspected; the payload is copied as
// stored, so the file is assumed to be in C order.
cv::Mat blobFromNPY(const std::string& path)
{
    std::ifstream ifs(path.c_str(), std::ios::binary);
    if (!ifs.is_open())
        throw std::runtime_error("blobFromNPY: cannot open " + path);

    std::string magic(6, '*');
    ifs.read(&magic[0], magic.size());
    if (!ifs || magic != "\x93NUMPY")
        throw std::runtime_error("blobFromNPY: not an NPY file: " + path);

    // Major and minor format version bytes.
    unsigned char version[2] = {0, 0};
    ifs.read(reinterpret_cast<char*>(version), sizeof(version));

    // The header length is a little-endian unsigned integer: 2 bytes in
    // format version 1.x, 4 bytes from version 2.0 on. It is assembled
    // byte by byte so the result does not depend on host endianness.
    const size_t lengthBytes = (version[0] >= 2) ? 4 : 2;
    unsigned char lengthField[4] = {0, 0, 0, 0};
    ifs.read(reinterpret_cast<char*>(lengthField), lengthBytes);
    if (!ifs)
        throw std::runtime_error("blobFromNPY: truncated preamble in " + path);
    size_t headerSize = 0;
    for (size_t i = lengthBytes; i-- > 0;)
        headerSize = (headerSize << 8) | lengthField[i];

    std::string header(headerSize, '*');
    if (headerSize > 0)
        ifs.read(&header[0], header.size());
    if (!ifs)
        throw std::runtime_error("blobFromNPY: truncated header in " + path);

    // The payload is read straight into a CV_32F matrix, so any other
    // dtype would be silently misinterpreted.
    if (header.find("<f4") == std::string::npos)
        throw std::runtime_error("blobFromNPY: only '<f4' data is supported: " + path);

    const std::vector<int> shape = getShape(header);
    if (shape.empty())
        throw std::runtime_error("blobFromNPY: no usable 'shape' in " + path);

    cv::Mat blob(shape, CV_32F);
    const size_t dataSize = blob.total() * blob.elemSize();
    ifs.read(reinterpret_cast<char*>(blob.data), dataSize);
    if (static_cast<size_t>(ifs.gcount()) != dataSize)
        throw std::runtime_error("blobFromNPY: truncated data in " + path);
    return blob;
}
// Prints the leading elements of a matrix to stdout on one line, each
// followed by a space, then ends the line.
//
// @param m    Matrix whose data buffer is read as a flat array of float.
//             NOTE(review): assumes single-channel CV_32F data stored
//             contiguously - confirm against callers.
// @param end  Number of elements requested. It is clamped to the number of
//             elements in m so the loop never reads past the buffer;
//             non-positive values print only the line break.
void printBlob(const cv::Mat& m, int end)
{
    const float* mptr = reinterpret_cast<const float*>(m.data);
    const size_t available = (mptr != nullptr) ? m.total() : 0;
    const size_t count =
        (end > 0) ? std::min(static_cast<size_t>(end), available) : 0;
    for (size_t i = 0; i < count; ++i)
        std::cout << mptr[i] << " ";
    std::cout << std::endl;
}
// Conv2D
// * input shape [1, 3, 10, 10]
// * kernel shape [5, 3, 5, 5]
// * output shape [1, 5, 4, 4]
int main()
{
// Ascend resource initialization
// * init ascend
aclInit(NULL);
// * set device
int deviceID = 0;
aclrtSetDevice(deviceID);
// * create context
aclrtContext context = nullptr;
aclrtCreateContext(&context, deviceID);
// * create stream
aclrtStream stream = nullptr;
aclrtCreateStream(&stream);
int ret;
// Inputs
// * get input pointer
std::vector<int64_t> shape = {1, 3, 10, 10};
size_t inputSizeInByte = sizeof(float) * 1 * 3 * 10 * 10;
cv::Mat inputMat = blobFromNPY("./input_convolution.npy");
//printBlob(inputMat, 300);
const void* inputOnHost = (const void*)inputMat.data;
// * alloc buffer for input on device
void* inputOnDevice = nullptr;
ret = aclrtMalloc(&inputOnDevice, inputSizeInByte, ACL_MEM_MALLOC_NORMAL_ONLY);
std::cout << "inputOnDevice malloc status: " << ret << std::endl;
// * send the input data from host to device
ret = aclrtMemcpy(inputOnDevice, inputSizeInByte, inputOnHost, inputSizeInByte, ACL_MEMCPY_HOST_TO_DEVICE);
std::cout << "inputOnDevice memcpy status: " << ret << std::endl;
// Model parameters
// * w
std::vector<int64_t> w_shape = {5, 3, 5, 5};
size_t wSizeInByte = sizeof(float) * 5 * 3 * 5 * 5;
cv::Mat wMat = blobFromNPY("./convolution_w.npy");
const void* wOnHost = (const void*)wMat.data;
//printBlob(wMat, 25);
void* wOnDevice = nullptr;
ret = aclrtMalloc(&wOnDevice, wSizeInByte, ACL_MEM_MALLOC_NORMAL_ONLY);
std::cout << "wOnDevice malloc status: " << ret << std::endl;
ret = aclrtMemcpy(wOnDevice, wSizeInByte, wOnHost, wSizeInByte, ACL_MEMCPY_HOST_TO_DEVICE);
std::cout << "wOnDevice memcpy status: " << ret << std::endl;
// * b
std::vector<int64_t> b_shape = {5};
size_t bSizeInByte = sizeof(float) * 5;
cv::Mat bMat = blobFromNPY("./convolution_b.npy");
//printBlob(bMat, 5);
const void* bOnHost = (const void*)bMat.data;
void* bOnDeivce = nullptr;
ret = aclrtMalloc(&bOnDeivce, bSizeInByte, ACL_MEM_MALLOC_NORMAL_ONLY);
std::cout << "bOnDevice malloc status: " << ret << std::endl;
ret = aclrtMemcpy(bOnDeivce, bSizeInByte, bOnHost, bSizeInByte, ACL_MEMCPY_HOST_TO_DEVICE);
std::cout << "bOnDevice memcpy status: " << ret << std::endl;
// Model output
// * get output shape
std::vector<int64_t> output_shape = {1, 5, 4, 4};
size_t outputSizeInByte = sizeof(float) * 1 * 5 * 4 * 4;
// * alloc buffer for output on device
void* outputOnDevice = nullptr;
ret = aclrtMalloc(&outputOnDevice, outputSizeInByte, ACL_MEM_MALLOC_HUGE_FIRST);
std::cout << "outputonDevice malloc status: " << ret << std::endl;
// Create model
std::string opName("Conv2D");
// * set attr, stides, pads, dilations
aclopAttr* opAttr = aclopCreateAttr();
std::vector<int64_t> stridesValue = {1, 1, 2, 2}; // strides
ret = aclopSetAttrListInt(opAttr, "strides", stridesValue.size(), stridesValue.data());
std::cout << "attr set strides: " << ret << std::endl;
std::vector<int64_t> padsValue = {1, 1, 1, 1}; // pads
ret = aclopSetAttrListInt(opAttr, "pads", padsValue.size(), padsValue.data());
std::cout << "attr set pads: " << ret << std::endl;
std::vector<int64_t> dilationsValue = {1, 1, 1, 1}; // dilations
ret = aclopSetAttrListInt(opAttr, "dilations", dilationsValue.size(), dilationsValue.data());
std::cout << "attr set dilations: " << ret << std::endl;
int groups = 1;
ret = aclopSetAttrInt(opAttr, "groups", groups);
std::cout << "attr set groups: " << ret << std::endl;
int offset_x = 0;
ret = aclopSetAttrInt(opAttr, "offset_x", offset_x);
std::cout << "attr set offset_x: " << ret << std::endl;
// * set inputTensor (description)
std::vector<aclTensorDesc*> inputTensorDesc;
inputTensorDesc.push_back(aclCreateTensorDesc(ACL_FLOAT, // ACL data type
shape.size(), // num of dim
shape.data(), // dims
ACL_FORMAT_NCHW)); // ACL tensor format
inputTensorDesc.push_back(aclCreateTensorDesc(ACL_FLOAT,
w_shape.size(),
w_shape.data(),
ACL_FORMAT_NCHW));
inputTensorDesc.push_back(aclCreateTensorDesc(ACL_FLOAT,
b_shape.size(),
b_shape.data(),
ACL_FORMAT_ND));
// * set outputTensor (description), similar above
std::vector<aclTensorDesc*> outputTensorDesc;
outputTensorDesc.push_back(aclCreateTensorDesc(ACL_FLOAT,
output_shape.size(),
output_shape.data(),
ACL_FORMAT_NCHW));
// Inference
// * create data buffer for input
std::vector<aclDataBuffer*> inputBuffers;
inputBuffers.push_back(aclCreateDataBuffer(inputOnDevice, inputSizeInByte));
inputBuffers.push_back(aclCreateDataBuffer(wOnDevice, wSizeInByte));
inputBuffers.push_back(aclCreateDataBuffer(bOnDeivce, bSizeInByte));
// * create data buffer for output
std::vector<aclDataBuffer*> outputBuffers;
outputBuffers.push_back(aclCreateDataBuffer(outputOnDevice, outputSizeInByte));
// * forward: call aclopExecute()
ret = aclopCompileAndExecute(opName.c_str(),
inputTensorDesc.size(), inputTensorDesc.data(), inputBuffers.data(),
outputTensorDesc.size(), outputTensorDesc.data(), outputBuffers.data(),
opAttr, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, stream);
std::cout << "op execute: " << ret << std::endl;
// * synchronize stream
aclrtSynchronizeStream(stream);
// Get output - move from device to host
// * send the output data from device to host
void* outputOnHost = nullptr;
aclrtMallocHost(&outputOnHost, outputSizeInByte);
aclrtMemcpy(outputOnHost, outputSizeInByte, outputOnDevice, outputSizeInByte, ACL_MEMCPY_DEVICE_TO_HOST);
// * construct outputMat
std::vector<int> output_shape_int = {1, 5, 4, 4};
cv::Mat tmp(output_shape_int, CV_32FC1, outputOnHost);
cv::Mat outputMat;
tmp.copyTo(outputMat);
// * print
std::cout << outputMat.size << std::endl;
printBlob(outputMat, 5*4*4);
// * write to file
//ofstream outstr("res.out", ios::out | ios::binary);
//outstr.write((char*)
// Release Ascend resource
// * release stream
aclrtDestroyStream(stream);
stream = nullptr;
// * release context
aclrtDestroyContext(context);
context = nullptr;
// * reset device
aclrtResetDevice(deviceID);
// * de-init ascend
aclFinalize();
}
# Builds the `conv2d` demo, which needs OpenCV and the Ascend toolkit.
# Environment:
#   ASCEND_INSTALL_DIR  (required) root of the Ascend toolkit installation
#   ASCEND_DRIVER_DIR   (optional) driver root; its lib64 is added to the
#                       link search path when set
cmake_minimum_required(VERSION 3.16.3)
project(ascend-conv2d)

# The source uses C++11 features (nullptr, brace-initialised vectors).
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Find OpenCV
find_package(OpenCV 4.5.4 REQUIRED)

# Find Ascend
if(NOT DEFINED ENV{ASCEND_INSTALL_DIR})
  message(FATAL_ERROR "Environment variable ASCEND_INSTALL_DIR is not set")
endif()
set(ASCEND_INSTALL_DIR $ENV{ASCEND_INSTALL_DIR})
set(ASCEND_INCLUDE_DIR "${ASCEND_INSTALL_DIR}/include")
#set(ASCEND_LIBRARY_ASCENDCL "${ASCEND_INSTALL_DIR}/acllib/lib64/libascendcl.so")
find_library(ASCEND_LIBRARY_ASCENDCL NAMES ascendcl PATHS "${ASCEND_INSTALL_DIR}/acllib/lib64" NO_DEFAULT_PATH)
find_library(ASCEND_LIBRARY_ACLOPCOMPILER NAMES acl_op_compiler PATHS "${ASCEND_INSTALL_DIR}/compiler/lib64" NO_DEFAULT_PATH)
# Fail here with a readable message instead of at generate time.
if(NOT ASCEND_LIBRARY_ASCENDCL)
  message(FATAL_ERROR "libascendcl not found under ${ASCEND_INSTALL_DIR}/acllib/lib64")
endif()
if(NOT ASCEND_LIBRARY_ACLOPCOMPILER)
  message(FATAL_ERROR "libacl_op_compiler not found under ${ASCEND_INSTALL_DIR}/compiler/lib64")
endif()

add_executable(conv2d ./ascend_conv2d.cpp)
target_include_directories(conv2d PRIVATE ${OpenCV_INCLUDE_DIRS} ${ASCEND_INCLUDE_DIR})
if(DEFINED ENV{ASCEND_DRIVER_DIR})
  set(ASCEND_DRIVER_DIR $ENV{ASCEND_DRIVER_DIR}/lib64)
  target_link_directories(conv2d PRIVATE ${ASCEND_DRIVER_DIR})
endif()
target_link_libraries(conv2d PRIVATE ${OpenCV_LIBS} ${ASCEND_LIBRARY_ASCENDCL} ${ASCEND_LIBRARY_ACLOPCOMPILER})
@fengyuentau
Copy link
Author

The .npy files can be found at https://github.com/opencv/opencv_extra/tree/4.x/testdata/dnn/onnx. The .npy files for the weights and bias were extracted from convolution.npy.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment