Created
July 31, 2018 21:31
-
-
Save mustafaxfe/7a3b26da2d28e72c933f15a05f5db85c to your computer and use it in GitHub Desktop.
Edited interactive_face_detection_sample code to work with Intel Realsense R200 Camera
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
// Copyright (c) 2018 Intel Corporation | |
// | |
// Licensed under the Apache License, Version 2.0 (the "License"); | |
// you may not use this file except in compliance with the License. | |
// You may obtain a copy of the License at | |
// | |
// http://www.apache.org/licenses/LICENSE-2.0 | |
// | |
// Unless required by applicable law or agreed to in writing, software | |
// distributed under the License is distributed on an "AS IS" BASIS, | |
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
// See the License for the specific language governing permissions and | |
// limitations under the License. | |
*/ | |
/** | |
* \brief The entry point for the Inference Engine interactive_face_detection sample application | |
* \file interactive_face_detection_sample/main.cpp | |
* \example interactive_face_detection_sample/main.cpp | |
*/ | |
// First include the librealsense C++ header file | |
#include <librealsense/rs.hpp> | |
#include <gflags/gflags.h> | |
#include <functional> | |
#include <iostream> | |
#include <fstream> | |
#include <random> | |
#include <memory> | |
#include <chrono> | |
#include <vector> | |
#include <string> | |
#include <utility> | |
#include <algorithm> | |
#include <iterator> | |
#include <map> | |
#include <inference_engine.hpp> | |
#include <samples/common.hpp> | |
#include <samples/slog.hpp> | |
#include "interactive_face_detection.hpp" | |
#include "mkldnn/mkldnn_extension_ptr.hpp" | |
#include <ext_list.hpp> | |
#include <opencv2/opencv.hpp> | |
#include <opencv2/highgui.hpp> | |
using namespace InferenceEngine; | |
using namespace rs; | |
// Window size and frame rate | |
int const INPUT_WIDTH = 320; | |
int const INPUT_HEIGHT = 240; | |
int const FRAMERATE = 60; | |
// Named windows | |
char const *WINDOW_DEPTH = "Depth Image"; | |
char const *WINDOW_RGB = "RGB Image"; | |
context _rs_ctx; | |
//device * _rs_camera = _rs_ctx.get_device(0); | |
device* _rs_camera = NULL; | |
intrinsics _depth_intrin; | |
intrinsics _color_intrin; | |
bool _loop = true; | |
///////////////////////////////////////////////////////////////////////////// | |
// Called every frame gets the data from streams and displays them using OpenCV. | |
///////////////////////////////////////////////////////////////////////////// | |
bool display_next_frame( ) | |
{ | |
// Get current frames intrinsic data. | |
_depth_intrin = _rs_camera->get_stream_intrinsics( rs::stream::depth ); | |
_color_intrin = _rs_camera->get_stream_intrinsics( rs::stream::color ); | |
// Create depth image | |
cv::Mat depth16( _depth_intrin.height, | |
_depth_intrin.width, | |
CV_16U, | |
(uchar *)_rs_camera->get_frame_data( rs::stream::depth ) ); | |
// Create color image | |
cv::Mat rgb( _color_intrin.height, | |
_color_intrin.width, | |
CV_8UC3, | |
(uchar *)_rs_camera->get_frame_data( rs::stream::color ) ); | |
// < 800 | |
cv::Mat depth8u = depth16; | |
depth8u.convertTo( depth8u, CV_8UC1, 255.0/1000 ); | |
imshow( WINDOW_DEPTH, depth8u ); | |
cvWaitKey( 1 ); | |
cv::cvtColor( rgb, rgb, cv::COLOR_BGR2RGB ); | |
imshow( WINDOW_RGB, rgb ); | |
cvWaitKey( 1 ); | |
return true; | |
} | |
// Initialize the application state. Upon success will return the static app_state vars address | |
bool initialize_streaming( ) | |
{ | |
bool success = false; | |
if( _rs_ctx.get_device_count( ) > 0 ) | |
{ | |
_rs_camera = _rs_ctx.get_device( 0 ); | |
_rs_camera->enable_stream( rs::stream::color, INPUT_WIDTH, INPUT_HEIGHT, rs::format::rgb8, FRAMERATE ); | |
_rs_camera->start( ); | |
success = true; | |
} | |
return success; | |
} | |
bool ParseAndCheckCommandLine(int argc, char *argv[]) { | |
// ---------------------------Parsing and validation of input args-------------------------------------- | |
gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true); | |
if (FLAGS_h) { | |
showUsage(); | |
return false; | |
} | |
slog::info << "Parsing input parameters" << slog::endl; | |
if (FLAGS_i.empty()) { | |
throw std::logic_error("Parameter -i is not set"); | |
} | |
if (FLAGS_m.empty()) { | |
throw std::logic_error("Parameter -m is not set"); | |
} | |
if (FLAGS_n_ag < 1) { | |
throw std::logic_error("Parameter -n_ag cannot be 0"); | |
} | |
if (FLAGS_n_hp < 1) { | |
throw std::logic_error("Parameter -n_hp cannot be 0"); | |
} | |
return true; | |
} | |
template <typename T> | |
void matU8ToBlob(const cv::Mat& orig_image, Blob::Ptr& blob, float scaleFactor = 1.0, int batchIndex = 0) { | |
SizeVector blobSize = blob->getTensorDesc().getDims(); | |
const size_t width = blobSize[3]; | |
const size_t height = blobSize[2]; | |
const size_t channels = blobSize[1]; | |
T* blob_data = blob->buffer().as<T*>(); | |
cv::Mat resized_image(orig_image); | |
if (width != orig_image.size().width || height!= orig_image.size().height) { | |
cv::resize(orig_image, resized_image, cv::Size(width, height)); | |
} | |
int batchOffset = batchIndex * width * height * channels; | |
for (size_t c = 0; c < channels; c++) { | |
for (size_t h = 0; h < height; h++) { | |
for (size_t w = 0; w < width; w++) { | |
blob_data[batchOffset + c * width * height + h * width + w] = | |
resized_image.at<cv::Vec3b>(h, w)[c] * scaleFactor; | |
} | |
} | |
} | |
} | |
// -------------------------Generic routines for detection networks------------------------------------------------- | |
struct BaseDetection { | |
ExecutableNetwork net; | |
InferencePlugin * plugin; | |
InferRequest::Ptr request; | |
std::string & commandLineFlag; | |
std::string topoName; | |
const int maxBatch; | |
BaseDetection(std::string &commandLineFlag, std::string topoName, int maxBatch) | |
: commandLineFlag(commandLineFlag), topoName(topoName), maxBatch(maxBatch) {} | |
virtual ~BaseDetection() {} | |
ExecutableNetwork* operator ->() { | |
return &net; | |
} | |
virtual CNNNetwork read() = 0; | |
virtual void submitRequest() { | |
if (!enabled() || request == nullptr) return; | |
request->StartAsync(); | |
} | |
virtual void wait() { | |
if (!enabled()|| !request) return; | |
request->Wait(IInferRequest::WaitMode::RESULT_READY); | |
} | |
mutable bool enablingChecked = false; | |
mutable bool _enabled = false; | |
bool enabled() const { | |
if (!enablingChecked) { | |
_enabled = !commandLineFlag.empty(); | |
if (!_enabled) { | |
slog::info << topoName << " DISABLED" << slog::endl; | |
} | |
enablingChecked = true; | |
} | |
return _enabled; | |
} | |
void printPerformanceCounts() { | |
if (!enabled()) { | |
return; | |
} | |
slog::info << "Performance counts for " << topoName << slog::endl << slog::endl; | |
::printPerformanceCounts(request->GetPerformanceCounts(), std::cout, false); | |
} | |
}; | |
struct FaceDetectionClass : BaseDetection { | |
std::string input; | |
std::string output; | |
int maxProposalCount; | |
int objectSize; | |
int enquedFrames = 0; | |
float width = 0; | |
float height = 0; | |
bool resultsFetched = false; | |
std::vector<std::string> labels; | |
struct Result { | |
int label; | |
float confidence; | |
cv::Rect location; | |
}; | |
std::vector<Result> results; | |
void submitRequest() override { | |
if (!enquedFrames) return; | |
enquedFrames = 0; | |
resultsFetched = false; | |
results.clear(); | |
BaseDetection::submitRequest(); | |
} | |
void enqueue(const cv::Mat &frame) { | |
if (!enabled()) return; | |
if (!request) { | |
request = net.CreateInferRequestPtr(); | |
} | |
width = frame.cols; | |
height = frame.rows; | |
Blob::Ptr inputBlob = request->GetBlob(input); | |
matU8ToBlob<uint8_t >(frame, inputBlob); | |
enquedFrames = 1; | |
} | |
FaceDetectionClass() : BaseDetection(FLAGS_m, "Face Detection", 1) {} | |
CNNNetwork read() override { | |
slog::info << "Loading network files for Face Detection" << slog::endl; | |
CNNNetReader netReader; | |
/** Read network model **/ | |
netReader.ReadNetwork(FLAGS_m); | |
/** Set batch size to 1 **/ | |
slog::info << "Batch size is set to "<< maxBatch << slog::endl; | |
netReader.getNetwork().setBatchSize(maxBatch); | |
/** Extract model name and load it's weights **/ | |
std::string binFileName = fileNameNoExt(FLAGS_m) + ".bin"; | |
netReader.ReadWeights(binFileName); | |
/** Read labels (if any)**/ | |
std::string labelFileName = fileNameNoExt(FLAGS_m) + ".labels"; | |
std::ifstream inputFile(labelFileName); | |
std::copy(std::istream_iterator<std::string>(inputFile), | |
std::istream_iterator<std::string>(), | |
std::back_inserter(labels)); | |
// ----------------------------------------------------------------------------------------------------- | |
/** SSD-based network should have one input and one output **/ | |
// ---------------------------Check inputs ------------------------------------------------------ | |
slog::info << "Checking Face Detection inputs" << slog::endl; | |
InputsDataMap inputInfo(netReader.getNetwork().getInputsInfo()); | |
if (inputInfo.size() != 1) { | |
throw std::logic_error("Face Detection network should have only one input"); | |
} | |
InputInfo::Ptr inputInfoFirst = inputInfo.begin()->second; | |
inputInfoFirst->setPrecision(Precision::U8); | |
inputInfoFirst->getInputData()->setLayout(Layout::NCHW); | |
// ----------------------------------------------------------------------------------------------------- | |
// ---------------------------Check outputs ------------------------------------------------------ | |
slog::info << "Checking Face Detection outputs" << slog::endl; | |
OutputsDataMap outputInfo(netReader.getNetwork().getOutputsInfo()); | |
if (outputInfo.size() != 1) { | |
throw std::logic_error("Face Detection network should have only one output"); | |
} | |
DataPtr& _output = outputInfo.begin()->second; | |
output = outputInfo.begin()->first; | |
const CNNLayerPtr outputLayer = netReader.getNetwork().getLayerByName(output.c_str()); | |
if (outputLayer->type != "DetectionOutput") { | |
throw std::logic_error("Face Detection network output layer(" + outputLayer->name + | |
") should be DetectionOutput, but was " + outputLayer->type); | |
} | |
if (outputLayer->params.find("num_classes") == outputLayer->params.end()) { | |
throw std::logic_error("Face Detection network output layer (" + | |
output + ") should have num_classes integer attribute"); | |
} | |
const int num_classes = outputLayer->GetParamAsInt("num_classes"); | |
if (labels.size() != num_classes) { | |
if (labels.size() == (num_classes - 1)) // if network assumes default "background" class, having no label | |
labels.insert(labels.begin(), "fake"); | |
else | |
labels.clear(); | |
} | |
const SizeVector outputDims = _output->getTensorDesc().getDims(); | |
maxProposalCount = outputDims[2]; | |
objectSize = outputDims[3]; | |
if (objectSize != 7) { | |
throw std::logic_error("Face Detection network output layer should have 7 as a last dimension"); | |
} | |
if (outputDims.size() != 4) { | |
throw std::logic_error("Face Detection network output dimensions not compatible shoulld be 4, but was " + | |
std::to_string(outputDims.size())); | |
} | |
_output->setPrecision(Precision::FP32); | |
_output->setLayout(Layout::NCHW); | |
slog::info << "Loading Face Detection model to the "<< FLAGS_d << " plugin" << slog::endl; | |
input = inputInfo.begin()->first; | |
return netReader.getNetwork(); | |
} | |
void fetchResults() { | |
if (!enabled()) return; | |
results.clear(); | |
if (resultsFetched) return; | |
resultsFetched = true; | |
const float *detections = request->GetBlob(output)->buffer().as<float *>(); | |
for (int i = 0; i < maxProposalCount; i++) { | |
float image_id = detections[i * objectSize + 0]; | |
Result r; | |
r.label = static_cast<int>(detections[i * objectSize + 1]); | |
r.confidence = detections[i * objectSize + 2]; | |
if (r.confidence <= FLAGS_t) { | |
continue; | |
} | |
r.location.x = detections[i * objectSize + 3] * width; | |
r.location.y = detections[i * objectSize + 4] * height; | |
r.location.width = detections[i * objectSize + 5] * width - r.location.x; | |
r.location.height = detections[i * objectSize + 6] * height - r.location.y; | |
if (image_id < 0) { | |
break; | |
} | |
if (FLAGS_r) { | |
std::cout << "[" << i << "," << r.label << "] element, prob = " << r.confidence << | |
" (" << r.location.x << "," << r.location.y << ")-(" << r.location.width << "," | |
<< r.location.height << ")" | |
<< ((r.confidence > FLAGS_t) ? " WILL BE RENDERED!" : "") << std::endl; | |
} | |
results.push_back(r); | |
} | |
} | |
}; | |
struct AgeGenderDetection : BaseDetection { | |
std::string input; | |
std::string outputAge; | |
std::string outputGender; | |
int enquedFaces = 0; | |
AgeGenderDetection() : BaseDetection(FLAGS_m_ag, "Age Gender", FLAGS_n_ag) {} | |
void submitRequest() override { | |
if (!enquedFaces) return; | |
BaseDetection::submitRequest(); | |
enquedFaces = 0; | |
} | |
void enqueue(const cv::Mat &face) { | |
if (!enabled()) { | |
return; | |
} | |
if (enquedFaces == maxBatch) { | |
slog::warn << "Number of detected faces more than maximum(" << maxBatch << ") processed by Age Gender detector" << slog::endl; | |
return; | |
} | |
if (!request) { | |
request = net.CreateInferRequestPtr(); | |
} | |
Blob::Ptr inputBlob = request->GetBlob(input); | |
matU8ToBlob<float>(face, inputBlob, 1.0f, enquedFaces); | |
enquedFaces++; | |
} | |
struct Result { float age; float maleProb;}; | |
Result operator[] (int idx) const { | |
Blob::Ptr genderBlob = request->GetBlob(outputGender); | |
Blob::Ptr ageBlob = request->GetBlob(outputAge); | |
return {ageBlob->buffer().as<float*>()[idx] * 100, | |
genderBlob->buffer().as<float*>()[idx * 2 + 1]}; | |
} | |
CNNNetwork read() override { | |
slog::info << "Loading network files for AgeGender" << slog::endl; | |
CNNNetReader netReader; | |
/** Read network model **/ | |
netReader.ReadNetwork(FLAGS_m_ag); | |
/** Set batch size to 16 **/ | |
netReader.getNetwork().setBatchSize(maxBatch); | |
slog::info << "Batch size is set to " << netReader.getNetwork().getBatchSize() << " for Age Gender" << slog::endl; | |
/** Extract model name and load it's weights **/ | |
std::string binFileName = fileNameNoExt(FLAGS_m_ag) + ".bin"; | |
netReader.ReadWeights(binFileName); | |
// ----------------------------------------------------------------------------------------------------- | |
/** Age Gender network should have one input two outputs **/ | |
// ---------------------------Check inputs ------------------------------------------------------ | |
slog::info << "Checking Age Gender inputs" << slog::endl; | |
InputsDataMap inputInfo(netReader.getNetwork().getInputsInfo()); | |
if (inputInfo.size() != 1) { | |
throw std::logic_error("Age gender topology should have only one input"); | |
} | |
InputInfo::Ptr& inputInfoFirst = inputInfo.begin()->second; | |
inputInfoFirst->setPrecision(Precision::FP32); | |
inputInfoFirst->getInputData()->setLayout(Layout::NCHW); | |
input = inputInfo.begin()->first; | |
// ----------------------------------------------------------------------------------------------------- | |
// ---------------------------Check outputs ------------------------------------------------------ | |
slog::info << "Checking Age Gender outputs" << slog::endl; | |
OutputsDataMap outputInfo(netReader.getNetwork().getOutputsInfo()); | |
if (outputInfo.size() != 2) { | |
throw std::logic_error("Age Gender network should have two output layers"); | |
} | |
auto it = outputInfo.begin(); | |
DataPtr ptrAgeOutput = (it++)->second; | |
DataPtr ptrGenderOutput = (it++)->second; | |
if (!ptrAgeOutput) { | |
throw std::logic_error("Age output data pointer is not valid"); | |
} | |
if (!ptrGenderOutput) { | |
throw std::logic_error("Gender output data pointer is not valid"); | |
} | |
auto genderCreatorLayer = ptrGenderOutput->getCreatorLayer().lock(); | |
auto ageCreatorLayer = ptrAgeOutput->getCreatorLayer().lock(); | |
if (!ageCreatorLayer) { | |
throw std::logic_error("Age's creator layer pointer is not valid"); | |
} | |
if (!genderCreatorLayer) { | |
throw std::logic_error("Gender's creator layer pointer is not valid"); | |
} | |
// if gender output is convolution, it can be swapped with age | |
if (genderCreatorLayer->type == "Convolution") { | |
std::swap(ptrAgeOutput, ptrGenderOutput); | |
} | |
if (ptrAgeOutput->getCreatorLayer().lock()->type != "Convolution") { | |
throw std::logic_error("In Age Gender network, age layer (" + ageCreatorLayer->name + | |
") should be a Convolution, but was: " + ageCreatorLayer->type); | |
} | |
if (ptrGenderOutput->getCreatorLayer().lock()->type != "SoftMax") { | |
throw std::logic_error("In Age Gender network, gender layer (" + genderCreatorLayer->name + | |
") should be a SoftMax, but was: " + genderCreatorLayer->type); | |
} | |
slog::info << "Age layer: " << ageCreatorLayer->name<< slog::endl; | |
slog::info << "Gender layer: " << genderCreatorLayer->name<< slog::endl; | |
outputAge = ptrAgeOutput->name; | |
outputGender = ptrGenderOutput->name; | |
slog::info << "Loading Age Gender model to the "<< FLAGS_d_ag << " plugin" << slog::endl; | |
_enabled = true; | |
return netReader.getNetwork(); | |
} | |
}; | |
struct HeadPoseDetection : BaseDetection { | |
std::string input; | |
std::string outputAngleR = "angle_r_fc"; | |
std::string outputAngleP = "angle_p_fc"; | |
std::string outputAngleY = "angle_y_fc"; | |
int enquedFaces = 0; | |
cv::Mat cameraMatrix; | |
HeadPoseDetection() : BaseDetection(FLAGS_m_hp, "Head Pose", FLAGS_n_hp) {} | |
void submitRequest() override { | |
if (!enquedFaces) return; | |
BaseDetection::submitRequest(); | |
enquedFaces = 0; | |
} | |
void enqueue(const cv::Mat &face) { | |
if (!enabled()) { | |
return; | |
} | |
if (enquedFaces == maxBatch) { | |
slog::warn << "Number of detected faces more than maximum(" << maxBatch << ") processed by Head Pose detector" << slog::endl; | |
return; | |
} | |
if (!request) { | |
request = net.CreateInferRequestPtr(); | |
} | |
Blob::Ptr inputBlob = request->GetBlob(input); | |
matU8ToBlob<float>(face, inputBlob, 1.0f, enquedFaces); | |
enquedFaces++; | |
} | |
struct Results { | |
float angle_r; | |
float angle_p; | |
float angle_y; | |
}; | |
Results operator[] (int idx) const { | |
Blob::Ptr angleR = request->GetBlob(outputAngleR); | |
Blob::Ptr angleP = request->GetBlob(outputAngleP); | |
Blob::Ptr angleY = request->GetBlob(outputAngleY); | |
return {angleR->buffer().as<float*>()[idx], | |
angleP->buffer().as<float*>()[idx], | |
angleY->buffer().as<float*>()[idx]}; | |
} | |
CNNNetwork read() override { | |
slog::info << "Loading network files for Head Pose detection " << slog::endl; | |
CNNNetReader netReader; | |
/** Read network model **/ | |
netReader.ReadNetwork(FLAGS_m_hp); | |
/** Set batch size to maximum currently set to one provided from command line **/ | |
netReader.getNetwork().setBatchSize(maxBatch); | |
netReader.getNetwork().setBatchSize(maxBatch); | |
slog::info << "Batch size is sey to " << netReader.getNetwork().getBatchSize() << " for Head Pose Network" << slog::endl; | |
/** Extract model name and load it's weights **/ | |
std::string binFileName = fileNameNoExt(FLAGS_m_hp) + ".bin"; | |
netReader.ReadWeights(binFileName); | |
/** Age Gender network should have one input two outputs **/ | |
// ---------------------------Check inputs ------------------------------------------------------ | |
slog::info << "Checking Head Pose Network inputs" << slog::endl; | |
InputsDataMap inputInfo(netReader.getNetwork().getInputsInfo()); | |
if (inputInfo.size() != 1) { | |
throw std::logic_error("Head Pose topology should have only one input"); | |
} | |
InputInfo::Ptr& inputInfoFirst = inputInfo.begin()->second; | |
inputInfoFirst->setPrecision(Precision::FP32); | |
inputInfoFirst->getInputData()->setLayout(Layout::NCHW); | |
input = inputInfo.begin()->first; | |
// ----------------------------------------------------------------------------------------------------- | |
// ---------------------------Check outputs ------------------------------------------------------ | |
slog::info << "Checking Head Pose network outputs" << slog::endl; | |
OutputsDataMap outputInfo(netReader.getNetwork().getOutputsInfo()); | |
if (outputInfo.size() != 3) { | |
throw std::logic_error("Head Pose network should have 3 outputs"); | |
} | |
std::map<std::string, bool> layerNames = { | |
{outputAngleR, false}, | |
{outputAngleP, false}, | |
{outputAngleY, false} | |
}; | |
for (auto && output : outputInfo) { | |
CNNLayerPtr layer = output.second->getCreatorLayer().lock(); | |
if (!layer) { | |
throw std::logic_error("Layer pointer is invalid"); | |
} | |
if (layerNames.find(layer->name) == layerNames.end()) { | |
throw std::logic_error("Head Pose network output layer unknown: " + layer->name + ", should be " + | |
outputAngleR + " or " + outputAngleP + " or " + outputAngleY); | |
} | |
if (layer->type != "FullyConnected") { | |
throw std::logic_error("Head Pose network output layer (" + layer->name + ") has invalid type: " + | |
layer->type + ", should be FullyConnected"); | |
} | |
auto fc = dynamic_cast<FullyConnectedLayer*>(layer.get()); | |
if (!fc) { | |
throw std::logic_error("Fully connected layer is not valid"); | |
} | |
if (fc->_out_num != 1) { | |
throw std::logic_error("Head Pose network output layer (" + layer->name + ") has invalid out-size=" + | |
std::to_string(fc->_out_num) + ", should be 1"); | |
} | |
layerNames[layer->name] = true; | |
} | |
slog::info << "Loading Head Pose model to the "<< FLAGS_d_hp << " plugin" << slog::endl; | |
_enabled = true; | |
return netReader.getNetwork(); | |
} | |
void buildCameraMatrix(int cx, int cy, float focalLength) { | |
if (!cameraMatrix.empty()) return; | |
cameraMatrix = cv::Mat::zeros(3, 3, CV_32F); | |
cameraMatrix.at<float>(0) = focalLength; | |
cameraMatrix.at<float>(2) = static_cast<float>(cx); | |
cameraMatrix.at<float>(4) = focalLength; | |
cameraMatrix.at<float>(5) = static_cast<float>(cy); | |
cameraMatrix.at<float>(8) = 1; | |
} | |
void drawAxes(cv::Mat& frame, cv::Point3f cpoint, Results headPose, float scale) { | |
double yaw = headPose.angle_y; | |
double pitch = headPose.angle_p; | |
double roll = headPose.angle_r; | |
if (FLAGS_r) { | |
std::cout << "Head pose results: yaw, pitch, roll = " << yaw << ";" << pitch << ";" << roll << std::endl; | |
} | |
pitch *= CV_PI / 180.0; | |
yaw *= CV_PI / 180.0; | |
roll *= CV_PI / 180.0; | |
cv::Matx33f Rx(1, 0, 0, | |
0, cos(pitch), -sin(pitch), | |
0, sin(pitch), cos(pitch)); | |
cv::Matx33f Ry(cos(yaw), 0, -sin(yaw), | |
0, 1, 0, | |
sin(yaw), 0, cos(yaw)); | |
cv::Matx33f Rz(cos(roll), -sin(roll), 0, | |
sin(roll), cos(roll), 0, | |
0, 0, 1); | |
auto r = cv::Mat(Rz*Ry*Rx); | |
buildCameraMatrix(frame.cols / 2, frame.rows / 2, 950.0); | |
cv::Mat xAxis(3, 1, CV_32F), yAxis(3, 1, CV_32F), zAxis(3, 1, CV_32F), zAxis1(3, 1, CV_32F); | |
xAxis.at<float>(0) = 1 * scale; | |
xAxis.at<float>(1) = 0; | |
xAxis.at<float>(2) = 0; | |
yAxis.at<float>(0) = 0; | |
yAxis.at<float>(1) = -1 * scale; | |
yAxis.at<float>(2) = 0; | |
zAxis.at<float>(0) = 0; | |
zAxis.at<float>(1) = 0; | |
zAxis.at<float>(2) = -1 * scale; | |
zAxis1.at<float>(0) = 0; | |
zAxis1.at<float>(1) = 0; | |
zAxis1.at<float>(2) = 1 * scale; | |
cv::Mat o(3, 1, CV_32F, cv::Scalar(0)); | |
o.at<float>(2) = cameraMatrix.at<float>(0); | |
xAxis = r * xAxis + o; | |
yAxis = r * yAxis + o; | |
zAxis = r * zAxis + o; | |
zAxis1 = r * zAxis1 + o; | |
cv::Point p1, p2; | |
p2.x = static_cast<int>((xAxis.at<float>(0) / xAxis.at<float>(2) * cameraMatrix.at<float>(0)) + cpoint.x); | |
p2.y = static_cast<int>((xAxis.at<float>(1) / xAxis.at<float>(2) * cameraMatrix.at<float>(4)) + cpoint.y); | |
cv::line(frame, cv::Point(cpoint.x, cpoint.y), p2, cv::Scalar(0, 0, 255), 2); | |
p2.x = static_cast<int>((yAxis.at<float>(0) / yAxis.at<float>(2) * cameraMatrix.at<float>(0)) + cpoint.x); | |
p2.y = static_cast<int>((yAxis.at<float>(1) / yAxis.at<float>(2) * cameraMatrix.at<float>(4)) + cpoint.y); | |
cv::line(frame, cv::Point(cpoint.x, cpoint.y), p2, cv::Scalar(0, 255, 0), 2); | |
p1.x = static_cast<int>((zAxis1.at<float>(0) / zAxis1.at<float>(2) * cameraMatrix.at<float>(0)) + cpoint.x); | |
p1.y = static_cast<int>((zAxis1.at<float>(1) / zAxis1.at<float>(2) * cameraMatrix.at<float>(4)) + cpoint.y); | |
p2.x = static_cast<int>((zAxis.at<float>(0) / zAxis.at<float>(2) * cameraMatrix.at<float>(0)) + cpoint.x); | |
p2.y = static_cast<int>((zAxis.at<float>(1) / zAxis.at<float>(2) * cameraMatrix.at<float>(4)) + cpoint.y); | |
cv::line(frame, p1, p2, cv::Scalar(255, 0, 0), 2); | |
cv::circle(frame, p2, 3, cv::Scalar(255, 0, 0), 2); | |
} | |
}; | |
struct EmotionsDetectionClass : BaseDetection { | |
std::string input; | |
std::string outputEmotions; | |
int enquedFaces = 0; | |
EmotionsDetectionClass() : BaseDetection(FLAGS_m_em, "Emotions Recognition", FLAGS_n_em) {} | |
void submitRequest() override { | |
if (!enquedFaces) return; | |
BaseDetection::submitRequest(); | |
enquedFaces = 0; | |
} | |
void enqueue(const cv::Mat &face) { | |
if (!enabled()) { | |
return; | |
} | |
if (enquedFaces == maxBatch) { | |
slog::warn << "Number of detected faces more than maximum(" << maxBatch << ") processed by Emotions detector" << slog::endl; | |
return; | |
} | |
if (!request) { | |
request = net.CreateInferRequestPtr(); | |
} | |
Blob::Ptr inputBlob = request->GetBlob(input); | |
matU8ToBlob<float>(face, inputBlob, 1.0f, enquedFaces); | |
enquedFaces++; | |
} | |
std::string operator[] (int idx) const { | |
/* vector of supported emotions */ | |
static const std::vector<std::string> emotionsVec = {"neutral", "happy", "sad", "surprise", "anger"}; | |
auto emotionsVecSize = emotionsVec.size(); | |
Blob::Ptr emotionsBlob = request->GetBlob(outputEmotions); | |
/* emotions vector must have the same size as number of channels | |
* in model output. Default output format is NCHW so we check index 1. */ | |
int numOfChannels = emotionsBlob->getTensorDesc().getDims().at(1); | |
if (numOfChannels != emotionsVec.size()) { | |
throw std::logic_error("Output size (" + std::to_string(numOfChannels) + | |
") of the Emotions Recognition network is not equal " | |
"to used emotions vector size (" + | |
std::to_string(emotionsVec.size()) + ")"); | |
} | |
auto emotionsValues = emotionsBlob->buffer().as<float *>(); | |
auto outputIdxPos = emotionsValues + idx; | |
/* we identify an index of the most probable emotion in output array | |
for idx image to return appropriate emotion name */ | |
int maxProbEmotionIx = std::max_element(outputIdxPos, outputIdxPos + emotionsVecSize) - outputIdxPos; | |
return emotionsVec[maxProbEmotionIx]; | |
} | |
CNNNetwork read() override { | |
slog::info << "Loading network files for Emotions recognition" << slog::endl; | |
InferenceEngine::CNNNetReader netReader; | |
/** Read network model **/ | |
netReader.ReadNetwork(FLAGS_m_em); | |
/** Default batch size is 16 **/ | |
netReader.getNetwork().setBatchSize(maxBatch); | |
slog::info << "Batch size is set to " << netReader.getNetwork().getBatchSize() << " for Emotions recognition" << slog::endl; | |
/** Extract model name and load it's weights **/ | |
std::string binFileName = fileNameNoExt(FLAGS_m_em) + ".bin"; | |
netReader.ReadWeights(binFileName); | |
// ----------------------------------------------------------------------------------------------------- | |
/** Emotions recognition network should have one input and one output **/ | |
// ---------------------------Check inputs ------------------------------------------------------ | |
slog::info << "Checking Emotions Recognition inputs" << slog::endl; | |
InferenceEngine::InputsDataMap inputInfo(netReader.getNetwork().getInputsInfo()); | |
if (inputInfo.size() != 1) { | |
throw std::logic_error("Emotions Recognition topology should have only one input"); | |
} | |
auto& inputInfoFirst = inputInfo.begin()->second; | |
inputInfoFirst->setPrecision(Precision::FP32); | |
inputInfoFirst->getInputData()->setLayout(Layout::NCHW); | |
input = inputInfo.begin()->first; | |
// ----------------------------------------------------------------------------------------------------- | |
// ---------------------------Check outputs ------------------------------------------------------ | |
slog::info << "Checking Emotions Recognition outputs" << slog::endl; | |
InferenceEngine::OutputsDataMap outputInfo(netReader.getNetwork().getOutputsInfo()); | |
if (outputInfo.size() != 1) { | |
throw std::logic_error("Emotions Recognition network should have one output layer"); | |
} | |
DataPtr emotionsOutput = outputInfo.begin()->second; | |
if (!emotionsOutput) { | |
throw std::logic_error("Emotions output data pointer is invalid"); | |
} | |
auto emotionsCreatorLayer = emotionsOutput->getCreatorLayer().lock(); | |
if (!emotionsCreatorLayer) { | |
throw std::logic_error("Emotions creator layer pointer is invalid"); | |
} | |
if (emotionsCreatorLayer->type != "SoftMax") { | |
throw std::logic_error("In Emotions Recognition network, Emotion layer (" | |
+ emotionsCreatorLayer->name + | |
") should be a SoftMax, but was: " + | |
emotionsCreatorLayer->type); | |
} | |
slog::info << "Emotions layer: " << emotionsCreatorLayer->name<< slog::endl; | |
outputEmotions = emotionsOutput->name; | |
slog::info << "Loading Emotions Recognition model to the "<< FLAGS_d_em << " plugin" << slog::endl; | |
_enabled = true; | |
return netReader.getNetwork(); | |
} | |
}; | |
struct Load { | |
BaseDetection& detector; | |
explicit Load(BaseDetection& detector) : detector(detector) { } | |
void into(InferencePlugin & plg) const { | |
if (detector.enabled()) { | |
detector.net = plg.LoadNetwork(detector.read(), {}); | |
detector.plugin = &plg; | |
} | |
} | |
}; | |
int main(int argc, char *argv[]) { | |
std::cout << "Deneme"; | |
try { | |
/** This sample covers 3 certain topologies and cannot be generalized **/ | |
std::cout << "InferenceEngine: " << GetInferenceEngineVersion() << std::endl; | |
// ------------------------------ Parsing and validation of input args --------------------------------- | |
if (!ParseAndCheckCommandLine(argc, argv)) { | |
return 0; | |
} | |
rs::log_to_console( rs::log_severity::warn ); | |
if( !initialize_streaming( ) ) { | |
std::cout << "Unable to locate a camera" << std::endl; | |
rs::log_to_console( rs::log_severity::fatal ); | |
return EXIT_FAILURE; | |
} | |
slog::info << "Reading input" << slog::endl; | |
initialize_streaming(); | |
//const bool isCamera = FLAGS_i == "cam"; | |
//if (!(FLAGS_i == "cam" ? cap.open(0) : cap.open(FLAGS_i))) { | |
// throw std::logic_error("Cannot open input file or camera: " + FLAGS_i); | |
//} | |
const size_t width = (size_t) _color_intrin.width;//_color_intrin.get(CV_CAP_PROP_FRAME_WIDTH); | |
const size_t height = (size_t) _color_intrin.height;//_color_intrin.get(CV_CAP_PROP_FRAME_HEIGHT); | |
// read input (video) frame | |
//cv::Mat frame; | |
cv::Mat rgb( _color_intrin.height, | |
_color_intrin.width, | |
CV_8UC3, | |
(uchar *)_rs_camera->get_frame_data( rs::stream::color )); | |
// if (!cap.read(frame)) { | |
// throw std::logic_error("Failed to get frame from cv::VideoCapture"); | |
//} | |
// ----------------------------------------------------------------------------------------------------- | |
// --------------------------- 1. Load Plugin for inference engine ------------------------------------- | |
std::map<std::string, InferencePlugin> pluginsForDevices; | |
std::vector<std::pair<std::string, std::string>> cmdOptions = { | |
{FLAGS_d, FLAGS_m}, {FLAGS_d_ag, FLAGS_m_ag}, {FLAGS_d_hp, FLAGS_m_hp}, | |
{FLAGS_d_em, FLAGS_m_em} | |
}; | |
FaceDetectionClass FaceDetection; | |
AgeGenderDetection AgeGender; | |
HeadPoseDetection HeadPose; | |
EmotionsDetectionClass EmotionsDetection; | |
for (auto && option : cmdOptions) { | |
auto deviceName = option.first; | |
auto networkName = option.second; | |
if (deviceName == "" || networkName == "") { | |
continue; | |
} | |
if (pluginsForDevices.find(deviceName) != pluginsForDevices.end()) { | |
continue; | |
} | |
slog::info << "Loading plugin " << deviceName << slog::endl; | |
InferencePlugin plugin = PluginDispatcher({"../../../lib/intel64", ""}).getPluginByDevice(deviceName); | |
/** Printing plugin version **/ | |
printPluginVersion(plugin, std::cout); | |
/** Load extensions for the CPU plugin **/ | |
if ((deviceName.find("CPU") != std::string::npos)) { | |
plugin.AddExtension(std::make_shared<Extensions::Cpu::CpuExtensions>()); | |
if (!FLAGS_l.empty()) { | |
// CPU(MKLDNN) extensions are loaded as a shared library and passed as a pointer to base extension | |
auto extension_ptr = make_so_pointer<MKLDNNPlugin::IMKLDNNExtension>(FLAGS_l); | |
plugin.AddExtension(std::static_pointer_cast<IExtension>(extension_ptr)); | |
} | |
} else if (!FLAGS_c.empty()) { | |
// Load Extensions for other plugins not CPU | |
plugin.SetConfig({ { PluginConfigParams::KEY_CONFIG_FILE, FLAGS_c } }); | |
} | |
pluginsForDevices[deviceName] = plugin; | |
} | |
/** Per layer metrics **/ | |
if (FLAGS_pc) { | |
for (auto && plugin : pluginsForDevices) { | |
plugin.second.SetConfig({{PluginConfigParams::KEY_PERF_COUNT, PluginConfigParams::YES}}); | |
} | |
} | |
// ----------------------------------------------------------------------------------------------------- | |
// --------------------------- 2. Read IR models and load them to plugins ------------------------------ | |
Load(FaceDetection).into(pluginsForDevices[FLAGS_d]); | |
Load(AgeGender).into(pluginsForDevices[FLAGS_d_ag]); | |
Load(HeadPose).into(pluginsForDevices[FLAGS_d_hp]); | |
Load(EmotionsDetection).into(pluginsForDevices[FLAGS_d_em]); | |
// ----------------------------------------------------------------------------------------------------- | |
// --------------------------- 3. Do inference --------------------------------------------------------- | |
slog::info << "Start inference " << slog::endl; | |
typedef std::chrono::duration<double, std::ratio<1, 1000>> ms; | |
auto wallclock = std::chrono::high_resolution_clock::now(); | |
double ocv_decode_time = 0, ocv_render_time = 0; | |
bool firstFrame = true; | |
/** Start inference & calc performance **/ | |
while (true) { | |
/** requesting new frame if any*/ | |
//cap.grab(); | |
if( _rs_camera->is_streaming( ) ) | |
_rs_camera->wait_for_frames( ); | |
display_next_frame( ); | |
auto t0 = std::chrono::high_resolution_clock::now(); | |
FaceDetection.enqueue(rgb); | |
auto t1 = std::chrono::high_resolution_clock::now(); | |
ocv_decode_time = std::chrono::duration_cast<ms>(t1 - t0).count(); | |
t0 = std::chrono::high_resolution_clock::now(); | |
// ----------------------------Run face detection inference----------------------------------------- | |
FaceDetection.submitRequest(); | |
FaceDetection.wait(); | |
t1 = std::chrono::high_resolution_clock::now(); | |
ms detection = std::chrono::duration_cast<ms>(t1 - t0); | |
FaceDetection.fetchResults(); | |
for (auto && face : FaceDetection.results) { | |
if (AgeGender.enabled() || HeadPose.enabled() || EmotionsDetection.enabled()) { | |
auto clippedRect = face.location & cv::Rect(0, 0, width, height); | |
cv::Mat face = rgb(clippedRect); | |
AgeGender.enqueue(face); | |
HeadPose.enqueue(face); | |
EmotionsDetection.enqueue(face); | |
} | |
} | |
// ----------------------------Run age-gender, head pose, and emotions detection simultaneously----- | |
t0 = std::chrono::high_resolution_clock::now(); | |
if (AgeGender.enabled() || HeadPose.enabled() || EmotionsDetection.enabled()) { | |
AgeGender.submitRequest(); | |
HeadPose.submitRequest(); | |
EmotionsDetection.submitRequest(); | |
AgeGender.wait(); | |
HeadPose.wait(); | |
EmotionsDetection.wait(); | |
} | |
t1 = std::chrono::high_resolution_clock::now(); | |
ms secondDetection = std::chrono::duration_cast<ms>(t1 - t0); | |
// ----------------------------Processing outputs--------------------------------------------------- | |
std::ostringstream out; | |
out << "OpenCV cap/render time: " << std::fixed << std::setprecision(2) | |
<< (ocv_decode_time + ocv_render_time) << " ms"; | |
cv::putText(rgb, out.str(), cv::Point2f(0, 25), cv::FONT_HERSHEY_TRIPLEX, 0.5, cv::Scalar(255, 0, 0)); | |
out.str(""); | |
out << "Face detection time: " << std::fixed << std::setprecision(2) << detection.count() | |
<< " ms (" | |
<< 1000.f / detection.count() << " fps)"; | |
cv::putText(rgb, out.str(), cv::Point2f(0, 45), cv::FONT_HERSHEY_TRIPLEX, 0.5, | |
cv::Scalar(255, 0, 0)); | |
if (HeadPose.enabled() || AgeGender.enabled() || EmotionsDetection.enabled()) { | |
out.str(""); | |
out << (AgeGender.enabled() ? "Age Gender " : "") | |
<< (AgeGender.enabled() && (HeadPose.enabled() || EmotionsDetection.enabled()) ? "+ " : "") | |
<< (HeadPose.enabled() ? "Head Pose " : "") | |
<< (HeadPose.enabled() && EmotionsDetection.enabled() ? "+ " : "") | |
<< (EmotionsDetection.enabled() ? "Emotions Recognition " : "") | |
<< "time: "<< std::fixed << std::setprecision(2) << secondDetection.count() | |
<< " ms "; | |
if (!FaceDetection.results.empty()) { | |
out << "(" << 1000.f / secondDetection.count() << " fps)"; | |
} | |
cv::putText(rgb, out.str(), cv::Point2f(0, 65), cv::FONT_HERSHEY_TRIPLEX, 0.5, cv::Scalar(255, 0, 0)); | |
} | |
int i = 0; | |
for (auto & result : FaceDetection.results) { | |
cv::Rect rect = result.location; | |
out.str(""); | |
if (AgeGender.enabled() && i < AgeGender.maxBatch) { | |
out << (AgeGender[i].maleProb > 0.5 ? "M" : "F"); | |
out << std::fixed << std::setprecision(0) << "," << AgeGender[i].age; | |
if (FLAGS_r) { | |
std::cout << "Predicted gender, age = " << out.str() << std::endl; | |
} | |
} else { | |
out << (result.label < FaceDetection.labels.size() ? FaceDetection.labels[result.label] : | |
std::string("label #") + std::to_string(result.label)) | |
<< ": " << std::fixed << std::setprecision(3) << result.confidence; | |
} | |
if (EmotionsDetection.enabled()) { | |
/* currently we display only most probable emotion */ | |
std::string emotion = EmotionsDetection[i]; | |
if (FLAGS_r) { | |
std::cout << "Predicted emotion = " << emotion << std::endl; | |
} | |
out << "," << emotion; | |
} | |
cv::putText(rgb, | |
out.str(), | |
cv::Point2f(result.location.x, result.location.y - 15), | |
cv::FONT_HERSHEY_COMPLEX_SMALL, | |
0.8, | |
cv::Scalar(0, 0, 255)); | |
if (HeadPose.enabled() && i < HeadPose.maxBatch) { | |
cv::Point3f center(rect.x + rect.width / 2, rect.y + rect.height / 2, 0); | |
HeadPose.drawAxes(rgb, center, HeadPose[i], 50); | |
} | |
auto genderColor = (AgeGender.enabled() && (i < AgeGender.maxBatch)) ? | |
((AgeGender[i].maleProb < 0.5) ? cv::Scalar(147, 20, 255) : cv::Scalar(255, 0, 0)) : | |
cv::Scalar(100, 100, 100); | |
cv::rectangle(rgb, result.location, genderColor, 1); | |
i++; | |
} | |
if (-1 != cv::waitKey(1)) | |
break; | |
t0 = std::chrono::high_resolution_clock::now(); | |
if (!FLAGS_no_show) { | |
cv::imshow("Detection results", rgb); | |
} | |
t1 = std::chrono::high_resolution_clock::now(); | |
ocv_render_time = std::chrono::duration_cast<ms>(t1 - t0).count(); | |
// end of file, for single frame file, like image we just keep it displayed to let user check what was shown | |
/* | |
if (!cap.retrieve(rgb)) { | |
if (!FLAGS_no_wait) { | |
slog::info << "Press any key to exit" << slog::endl; | |
cv::waitKey(0); | |
} | |
break; | |
} | |
*/ | |
if (firstFrame) { | |
slog::info << "Press any key to stop" << slog::endl; | |
} | |
firstFrame = false; | |
} | |
/** Show performance results **/ | |
if (FLAGS_pc) { | |
FaceDetection.printPerformanceCounts(); | |
AgeGender.printPerformanceCounts(); | |
HeadPose.printPerformanceCounts(); | |
} | |
// ----------------------------------------------------------------------------------------------------- | |
} | |
catch (const std::exception& error) { | |
slog::err << error.what() << slog::endl; | |
return 1; | |
} | |
catch (...) { | |
slog::err << "Unknown/internal exception happened." << slog::endl; | |
return 1; | |
} | |
slog::info << "Execution successful" << slog::endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment