yd2102/test.cpp
NEGEMMLowpMatrixMultiplyCore performance
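A standalone benchmark comparing ARM Compute Library's FP32 NEGEMM against the quantised NEGEMMLowpMatrixMultiplyCore on a 10×768×768 GEMM, sweeping 1, 2, 4 and 8 threads. Build it like the other ACL examples, compiling together with utils/Utils.cpp and linking against libarm_compute (exact flags depend on your install).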
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "utils/Utils.h"
#include <cstdlib>
#include <chrono>
using namespace arm_compute;
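// Problem size: dst(M x N) = alpha * src0(M x K) * src1(K x N). With beta = 0
// and no bias tensor passed to the GEMMs, the beta term is effectively unused.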
static const size_t M = 10;
static const size_t N = 768;
static const size_t K = 768;
static const size_t iterations = 10000;
static const float alpha = 1.0f;
static const float beta = 0.0f;
void benchmark(IFunction *kernel, const std::string& name, const int threads)
{
    printf("[%s] Using %d threads...\n", name.c_str(), threads);

    // Use the specified number of threads
    NEScheduler::get().set_num_threads(threads);

    // Warm up the kernel
    for (int i = 0; i < 100; i++)
    {
        kernel->run();
    }

    // Scale the total iteration count with the thread count so that each
    // configuration runs for a comparable wall-clock time when scaling is good.
    const size_t total = threads * iterations;

    // Execute and time the kernel
    auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < total; i++)
    {
        kernel->run();
    }
    auto stop = std::chrono::steady_clock::now();

    std::chrono::duration<double> diff = stop - start;
    double time = diff.count();
    // A GEMM performs 2*M*N*K operations (one multiply plus one add per
    // multiply-accumulate), so dividing by elapsed time and 1e9 gives Gop/s.
    double tp = 2 * M * N * K * total / time / 1e9;
    printf("[%s] %f ms/iter, %f Gop/s\n", name.c_str(), 1e3 * time / total, tp);
}
// Find min and max value in a float array
void find_min_max(int size, const float *data, float *min, float *max)
{
    *min = *max = data[0];
    for (int i = 1; i < size; i++)
    {
        const float val = data[i];
        *min = std::min(*min, val);
        *max = std::max(*max, val);
    }
}
// Return reasonable quantisation parameters to use for an array of floats
// based on min and max values
QuantizationInfo choose_quantization_params(float min, float max)
{
    // Extend the [min, max] interval to contain 0 so we can represent it exactly
    min = std::min(min, 0.f);
    max = std::max(max, 0.f);

    // Set the quantised min and max in float values
    const float qmin = 0;
    const float qmax = 255;

    // Determine the scale
    const float scale = (max - min) / (qmax - qmin);

    // Determine the zero point from the affine equation val = (qval - zero_point) * scale
    const float zero_point_real = qmin - min / scale;

    // Nudge the zero point to an integer (an exact quantised value)
    std::uint8_t zero_point_nudged = 0;
    if (zero_point_real < qmin)
    {
        zero_point_nudged = qmin;
    }
    else if (zero_point_real > qmax)
    {
        zero_point_nudged = qmax;
    }
    else
    {
        zero_point_nudged = static_cast<std::uint8_t>(support::cpp11::round(zero_point_real));
    }

    return QuantizationInfo(scale, zero_point_nudged);
}
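// Worked example (illustrative): if an input spans [-1, 1], as the random fill
// below roughly produces, then scale = (1 - (-1)) / 255 ≈ 0.00784 and
// zero_point_real = 0 - (-1 / 0.00784) ≈ 127.5, which nudges to 128, giving
// QuantizationInfo(0.00784, 128).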
int main(int argc, char **argv)
{
    Tensor src0;
    Tensor src1;
    Tensor dst0;
    NEGEMM fgemm;

    // Populate tensor information
    src0.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::F32, DataLayout::NHWC));
    src1.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32, DataLayout::NHWC));
    dst0.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32, DataLayout::NHWC));

    // Configure kernel
    fgemm.configure(&src0, &src1, nullptr, &dst0, alpha, beta);

    // Allocate all tensors
    src0.allocator()->allocate();
    src1.allocator()->allocate();
    dst0.allocator()->allocate();

    auto *src0_ptr = reinterpret_cast<float *>(src0.buffer());
    auto *src1_ptr = reinterpret_cast<float *>(src1.buffer());

    // Initialize random inputs
    utils::fill_random_tensor(src0, -1.f, 1.f);
    utils::fill_random_tensor(src1, -1.f, 1.f);

    // Benchmark the FP32 GEMM at each thread count
    for (auto i : {1, 2, 4, 8})
    {
        benchmark(&fgemm, "NEGEMM", i);
    }
    // Find the value ranges of the FP32 inputs
    float src0_min;
    float src0_max;
    float src1_min;
    float src1_max;
    find_min_max(M * K, src0_ptr, &src0_min, &src0_max);
    find_min_max(K * N, src1_ptr, &src1_min, &src1_max);

    // Get quantization parameters
    const QuantizationInfo src0_qinfo = choose_quantization_params(src0_min, src0_max);
    const QuantizationInfo src1_qinfo = choose_quantization_params(src1_min, src1_max);
    std::cout << "Matrix 1: min=" << src0_min << ", max=" << src0_max << ", ";
    std::cout << "QuantisationInfo(" << src0_qinfo.scale()[0] << ", " << src0_qinfo.offset()[0] << ")\n";
    std::cout << "Matrix 2: min=" << src1_min << ", max=" << src1_max << ", ";
    std::cout << "QuantisationInfo(" << src1_qinfo.scale()[0] << ", " << src1_qinfo.offset()[0] << ")\n";
    // Populate tensor information
    Tensor q_src0;
    Tensor q_src1;
    Tensor q_acc;
    TensorInfo q_src0_tinfo = TensorInfo(TensorShape(K, M), 1, DataType::QASYMM8, DataLayout::NHWC);
    TensorInfo q_src1_tinfo = TensorInfo(TensorShape(N, K), 1, DataType::QASYMM8, DataLayout::NHWC);
    TensorInfo q_acc_tinfo = TensorInfo(TensorShape(N, M), 1, DataType::S32, DataLayout::NHWC);
    q_src0_tinfo.set_quantization_info(src0_qinfo);
    q_src1_tinfo.set_quantization_info(src1_qinfo);
    q_src0.allocator()->init(q_src0_tinfo);
    q_src1.allocator()->init(q_src1_tinfo);
    q_acc.allocator()->init(q_acc_tinfo);

    // Quantize inputs
    NEQuantizationLayer ql_src0;
    NEQuantizationLayer ql_src1;
    ql_src0.configure(&src0, &q_src0);
    ql_src1.configure(&src1, &q_src1);

    // Configure kernel
    NEGEMMLowpMatrixMultiplyCore qgemm;
    qgemm.configure(&q_src0, &q_src1, nullptr, &q_acc);

    // Allocate all tensors
    q_src0.allocator()->allocate();
    q_src1.allocator()->allocate();
    q_acc.allocator()->allocate();

    // Run quantization layers
    ql_src0.run();
    ql_src1.run();
    // Benchmark the quantised GEMM at each thread count
    for (auto i : {1, 2, 4, 8})
    {
        benchmark(&qgemm, "NEGEMMLowpMatrixMultiplyCore", i);
    }

    return 0;
}
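Each benchmark line reports the average latency per iteration in milliseconds and the achieved throughput in Gop/s, so the FP32 and quantised figures can be compared directly at every thread count.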