# SOFIE: Code generation for fast inference of Deep Learning models

ROOT/TMVA SOFIE (“System for Optimized Fast Inference code Emit”) is a new package, introduced in ROOT 6.26, that generates C++ functions which can easily be invoked for fast inference of trained neural network models. It takes ONNX model files as input and produces C++ header files that can be included and used in a “plug-and-go” style. This is a new development and still at an experimental stage, but SOFIE can take your trained ONNX model and generate blazingly fast C++ code from it, depending only on BLAS.
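As a sketch of the workflow (roughly following the example in the announcement linked below; the file names here are placeholders I picked), a short ROOT macro parses the ONNX file and emits the inference header:

```cpp
// ROOT macro: parse a trained ONNX model with SOFIE and emit C++ inference code.
// "Linear_2.onnx" / "Linear_2.hxx" are placeholder file names.
using namespace TMVA::Experimental;

void generate_model()
{
   SOFIE::RModelParser_ONNX parser;
   SOFIE::RModel model = parser.Parse("Linear_2.onnx"); // read the trained model
   model.Generate();                                    // generate the inference code
   model.OutputGenerated("Linear_2.hxx");               // write the header to disk
}
```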
- Announcement https://root.cern/doc/v626/release-notes.html#sofie-code-generation-for-fast-inference-of-deep-learning-models
- SOFIE was created by Sitong An (https://sitongan.github.io/), a Marie Curie Fellow at CERN
- Supported ONNX operators: https://github.com/root-project/root/blob/master/tmva/sofie/inc/TMVA/OperatorList.hxx
- SOFIE is part of TMVA, the ROOT Machine Learning library https://root.cern/manual/tmva/
- Sitong's demo models can be found here: https://github.com/sitongan/TMVAFastInferencePrototype
Requires building ROOT from source with experimental flags: https://root.cern/install/build_from_source/

```bash
git clone git@github.com:root-project/root.git
cd root
mkdir root_install root_build
cd root_build
cmake -DCMAKE_INSTALL_PREFIX=../root_install ../ -Dtmva-sofie=ON -Dtmva-pymva=ON -DPython3_EXECUTABLE=CHOOSEYOURPYTHONBINARY
cmake --build . --target install -j2
source ../root_install/bin/thisroot.sh
```
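Once ROOT is built and a header has been generated, it can be included directly in a project. A minimal sketch of the calling side, assuming a model named `Linear_2`, the `TMVA_SOFIE_<model>` namespace and `Session` pattern used by the generated headers, and a made-up input size:

```cpp
#include "Linear_2.hxx" // header generated by SOFIE

#include <vector>

int main()
{
   float input[2] = {0.5f, -0.5f}; // made-up input; match your model's input shape

   // The generated header wraps inference in a Session object that loads the
   // trained weights from the .dat file written alongside the header.
   TMVA_SOFIE_Linear_2::Session session("Linear_2.dat");
   std::vector<float> output = session.infer(input);
   return output.empty();
}
```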
Requires the Eigen library to be installed on Bela. I installed it via `apt-get install libeigen3-dev`, but for some reason could only get it included by editing the model header files to use `#include "../../../usr/include/eigen3/Eigen/Eigen"` :/. (Adding `/usr/include/eigen3` to the compiler's include path, e.g. with `-I/usr/include/eigen3`, would presumably be the cleaner fix.)
| ONNX model | Time (µs) | CPU (%) | Notes |
|---|---|---|---|
| Linear_2 | 650 | 9.3 | |
| Linear_4 | 720 | 9.5 | |
| Linear_8 | 920 | 10 | |
| Linear_16 | 1480 | 10.3 | |
| Linear_32 | | | segfault |
| Linear_64 | | | no .hxx file available |
| Linear_event | | | undefined reference to `sgemv_` |
| Linear_RDF | | | undefined reference to `sgemv_` |
| LinearNN | | | segfault |
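`sgemv_` is the single-precision matrix-vector multiply from BLAS, which the generated code depends on (see the announcement above), so the undefined references presumably mean those binaries weren't linked against a BLAS library (e.g. adding `-lblas` to the linker flags).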
Tried with this model: https://github.com/rodrigodzf/DeepLearningForBela/blob/main/python/mlp_pytorch.py and got ~570 ms per pass, apparently 25x slower than ArmNN!