yd2102/test.cpp
NEGEMMLowpMatrixMultiplyCore performance
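A standalone benchmark comparing ARM Compute Library's FP32 NEGEMM against the quantised NEGEMMLowpMatrixMultiplyCore on a 10×768×768 GEMM, sweeping 1, 2, 4 and 8 threads. Build it like the other ACL examples, compiling together with utils/Utils.cpp and linking against libarm_compute (exact flags depend on your install).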
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "utils/Utils.h"
#include <cstdlib>
#include <chrono>
using namespace arm_compute;
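// Problem size: dst(M x N) = alpha * src0(M x K) * src1(K x N). With beta = 0
// and no bias tensor passed to the GEMMs, the beta term is effectively unused.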
static const size_t M = 10;
static const size_t N = 768;
static const size_t K = 768;
static const size_t iterations = 10000;
static const float alpha = 1.0f;
static const float beta = 0.0f;
void benchmark(IFunction *kernel, const std::string& name, const int threads)
{
    printf("[%s] Using %d threads...\n", name.c_str(), threads);

    // Use the specified number of threads
    NEScheduler::get().set_num_threads(threads);

    // Warm up the kernel
    for (int i = 0; i < 100; i++)
    {
        kernel->run();
    }

    // Scale the total iteration count with the thread count so that each
    // configuration runs for a comparable wall-clock time when scaling is good.
    const size_t total = threads * iterations;

    // Execute and time the kernel
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < total; i++)
    {
        kernel->run();
    }
    auto stop = std::chrono::steady_clock::now();

    std::chrono::duration<double> diff = stop - start;
    double time = diff.count();
    // A GEMM performs 2*M*N*K operations (one multiply plus one add per
    // multiply-accumulate), so dividing by elapsed time and 1e9 gives Gop/s.
    double tp = 2 * M * N * K * total / time / 1e9;
    printf("[%s] %f ms/iter, %f Gop/s\n", name.c_str(), 1e3 * time / total, tp);
}
// Find min and max value in a float array
void find_min_max(int size, const float *data, float *min, float *max)
{
    *min = *max = data[0];
    for (int i = 1; i < size; i++)
    {
        const float val = data[i];
        *min = std::min(*min, val);
        *max = std::max(*max, val);
    }
}
// Return reasonable quantisation parameters to use for an array of floats
// based on min and max values
QuantizationInfo choose_quantization_params(float min, float max)
{
    // Extend the [min, max] interval to contain 0 so we can represent it exactly
    min = std::min(min, 0.f);
    max = std::max(max, 0.f);

    // Set the quantised min and max in float values
    const float qmin = 0;
    const float qmax = 255;

    // Determine the scale
    const float scale = (max - min) / (qmax - qmin);

    // Determine the zero point from the affine equation val = (qval - zero_point) * scale
    const float zero_point_real = qmin - min / scale;

    // Nudge the zero point to an integer (an exact quantised value)
    std::uint8_t zero_point_nudged = 0;
    if (zero_point_real < qmin)
    {
        zero_point_nudged = qmin;
    }
    else if (zero_point_real > qmax)
    {
        zero_point_nudged = qmax;
    }
    else
    {
        zero_point_nudged = static_cast<std::uint8_t>(support::cpp11::round(zero_point_real));
    }

    return QuantizationInfo(scale, zero_point_nudged);
}
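// Worked example (illustrative): if an input spans [-1, 1], as the random fill
// below roughly produces, then scale = (1 - (-1)) / 255 ≈ 0.00784 and
// zero_point_real = 0 - (-1 / 0.00784) ≈ 127.5, which nudges to 128, giving
// QuantizationInfo(0.00784, 128).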
int main(int argc, char **argv)
{
    Tensor src0;
    Tensor src1;
    Tensor dst0;
    NEGEMM fgemm;

    // Populate tensor information
    src0.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::F32, DataLayout::NHWC));
    src1.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32, DataLayout::NHWC));
    dst0.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32, DataLayout::NHWC));

    // Configure kernel
    fgemm.configure(&src0, &src1, nullptr, &dst0, alpha, beta);

    // Allocate all tensors
    src0.allocator()->allocate();
    src1.allocator()->allocate();
    dst0.allocator()->allocate();

    auto *src0_ptr = reinterpret_cast<float *>(src0.buffer());
    auto *src1_ptr = reinterpret_cast<float *>(src1.buffer());

    // Initialize random inputs
    utils::fill_random_tensor(src0, -1.f, 1.f);
    utils::fill_random_tensor(src1, -1.f, 1.f);

    // Benchmark the FP32 GEMM at each thread count
    for (auto i : {1, 2, 4, 8})
    {
        benchmark(&fgemm, "NEGEMM", i);
    }
    // Find the value ranges of the FP32 inputs
    float src0_min;
    float src0_max;
    float src1_min;
    float src1_max;
    find_min_max(M * K, src0_ptr, &src0_min, &src0_max);
    find_min_max(K * N, src1_ptr, &src1_min, &src1_max);

    // Get quantization parameters
    const QuantizationInfo src0_qinfo = choose_quantization_params(src0_min, src0_max);
    const QuantizationInfo src1_qinfo = choose_quantization_params(src1_min, src1_max);
    std::cout << "Matrix 1: min=" << src0_min << ", max=" << src0_max << ", ";
    std::cout << "QuantisationInfo(" << src0_qinfo.scale()[0] << ", " << src0_qinfo.offset()[0] << ")\n";
    std::cout << "Matrix 2: min=" << src1_min << ", max=" << src1_max << ", ";
    std::cout << "QuantisationInfo(" << src1_qinfo.scale()[0] << ", " << src1_qinfo.offset()[0] << ")\n";
    // Populate tensor information
    Tensor q_src0;
    Tensor q_src1;
    Tensor q_acc;
    TensorInfo q_src0_tinfo = TensorInfo(TensorShape(K, M), 1, DataType::QASYMM8, DataLayout::NHWC);
    TensorInfo q_src1_tinfo = TensorInfo(TensorShape(N, K), 1, DataType::QASYMM8, DataLayout::NHWC);
    TensorInfo q_acc_tinfo = TensorInfo(TensorShape(N, M), 1, DataType::S32, DataLayout::NHWC);
    q_src0_tinfo.set_quantization_info(src0_qinfo);
    q_src1_tinfo.set_quantization_info(src1_qinfo);
    q_src0.allocator()->init(q_src0_tinfo);
    q_src1.allocator()->init(q_src1_tinfo);
    q_acc.allocator()->init(q_acc_tinfo);

    // Quantize inputs
    NEQuantizationLayer ql_src0;
    NEQuantizationLayer ql_src1;
    ql_src0.configure(&src0, &q_src0);
    ql_src1.configure(&src1, &q_src1);

    // Configure kernel
    NEGEMMLowpMatrixMultiplyCore qgemm;
    qgemm.configure(&q_src0, &q_src1, nullptr, &q_acc);

    // Allocate all tensors
    q_src0.allocator()->allocate();
    q_src1.allocator()->allocate();
    q_acc.allocator()->allocate();

    // Run quantization layers
    ql_src0.run();
    ql_src1.run();
    // Benchmark the quantised GEMM at each thread count
    for (auto i : {1, 2, 4, 8})
    {
        benchmark(&qgemm, "NEGEMMLowpMatrixMultiplyCore", i);
    }

    return 0;
}
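Each benchmark line reports the average latency per iteration in milliseconds and the achieved throughput in Gop/s, so the FP32 and quantised figures can be compared directly at every thread count.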