mustafaxfe/main.cpp

## main.cpp
/*
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

/**
* \brief The entry point for the Inference Engine interactive_face_detection sample application
* \file interactive_face_detection_sample/main.cpp
* \example interactive_face_detection_sample/main.cpp
*/
// First include the librealsense C++ header file
#include <librealsense/rs.hpp>
#include <gflags/gflags.h>
#include <functional>
#include <iostream>
#include <fstream>
#include <random>
#include <memory>
#include <chrono>
#include <vector>
#include <string>
#include <utility>
#include <algorithm>
#include <iterator>
#include <map>

#include <inference_engine.hpp>

#include <samples/common.hpp>
#include <samples/slog.hpp>

#include "interactive_face_detection.hpp"
#include "mkldnn/mkldnn_extension_ptr.hpp"
#include <ext_list.hpp>

#include <opencv2/opencv.hpp>
#include <opencv2/highgui.hpp>

using namespace InferenceEngine;
using namespace rs;

// Resolution (pixels) and frame rate requested for the RealSense color stream
// in initialize_streaming().
int const INPUT_WIDTH 	= 320;
int const INPUT_HEIGHT 	= 240;
int const FRAMERATE 	= 60;

// Titles of the OpenCV windows used by display_next_frame().
char const *WINDOW_DEPTH = "Depth Image";
char const *WINDOW_RGB	 = "RGB Image";


// librealsense context used to enumerate the connected devices.
context 	_rs_ctx;
// Device selected by initialize_streaming(); stays NULL until that call finds a camera.
device* 	_rs_camera = NULL;

// Stream intrinsics (image width/height among others) refreshed by display_next_frame().
intrinsics 	_depth_intrin;
intrinsics  _color_intrin;
// NOTE(review): presumably the main-loop run flag; it is not used in the visible part of the file.
bool 		_loop = true;
/////////////////////////////////////////////////////////////////////////////
// Called every frame gets the data from streams and displays them using OpenCV.
/////////////////////////////////////////////////////////////////////////////
bool display_next_frame( )
{
	// Get current frames intrinsic data.
	_depth_intrin 	= _rs_camera->get_stream_intrinsics( rs::stream::depth );
	_color_intrin 	= _rs_camera->get_stream_intrinsics( rs::stream::color );

	// Create depth image
	cv::Mat depth16( _depth_intrin.height,
					 _depth_intrin.width,
					 CV_16U,
					 (uchar *)_rs_camera->get_frame_data( rs::stream::depth ) );

	// Create color image
	cv::Mat rgb( _color_intrin.height,
				 _color_intrin.width,
				 CV_8UC3,
				 (uchar *)_rs_camera->get_frame_data( rs::stream::color ) );

	// < 800
	cv::Mat depth8u = depth16;
	depth8u.convertTo( depth8u, CV_8UC1, 255.0/1000 );

	imshow( WINDOW_DEPTH, depth8u );
	cvWaitKey( 1 );

	cv::cvtColor( rgb, rgb, cv::COLOR_BGR2RGB );
	imshow( WINDOW_RGB, rgb );
	cvWaitKey( 1 );

	return true;
}


// Initialize the application state. Upon success will return the static app_state vars address
bool initialize_streaming( )
{
	bool success = false;
	if( _rs_ctx.get_device_count( ) > 0 )
	{
		_rs_camera = _rs_ctx.get_device( 0 );

		_rs_camera->enable_stream( rs::stream::color, INPUT_WIDTH, INPUT_HEIGHT, rs::format::rgb8, FRAMERATE );

		_rs_camera->start( );

		success = true;
	}
	return success;
}
bool ParseAndCheckCommandLine(int argc, char *argv[]) {
    // ---------------------------Parsing and validation of input args--------------------------------------
    gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true);
    if (FLAGS_h) {
        showUsage();
        return false;
    }
    slog::info << "Parsing input parameters" << slog::endl;

    if (FLAGS_i.empty()) {
        throw std::logic_error("Parameter -i is not set");
    }

    if (FLAGS_m.empty()) {
        throw std::logic_error("Parameter -m is not set");
    }

    if (FLAGS_n_ag < 1) {
        throw std::logic_error("Parameter -n_ag cannot be 0");
    }

    if (FLAGS_n_hp < 1) {
        throw std::logic_error("Parameter -n_hp cannot be 0");
    }

    return true;
}

/**
 * Copies an 8-bit, 3-channel image into one batch slot of a planar (NCHW) blob.
 *
 * The image is resized to the blob's width/height when the sizes differ. Every
 * sample is multiplied by scaleFactor and converted to T.
 *
 * @tparam T          element type of the blob buffer
 * @param orig_image  source image; pixels are read as cv::Vec3b
 * @param blob        destination blob; dims are read as [N, C, H, W]
 * @param scaleFactor multiplier applied to every sample
 * @param batchIndex  index of the batch slot to fill
 * @throws std::logic_error when the blob has more than 3 channels
 */
template <typename T>
void matU8ToBlob(const cv::Mat& orig_image, Blob::Ptr& blob, float scaleFactor = 1.0, int batchIndex = 0) {
    const SizeVector blobSize = blob->getTensorDesc().getDims();
    const size_t width = blobSize[3];
    const size_t height = blobSize[2];
    const size_t channels = blobSize[1];
    T* blob_data = blob->buffer().as<T*>();

    // Pixels are accessed as cv::Vec3b, so a channel index above 2 would read out of bounds.
    if (channels > 3) {
        throw std::logic_error("matU8ToBlob supports at most 3 channels, but blob has " +
                               std::to_string(channels));
    }

    const int targetWidth = static_cast<int>(width);
    const int targetHeight = static_cast<int>(height);

    cv::Mat resized_image(orig_image);
    if (targetWidth != orig_image.cols || targetHeight != orig_image.rows) {
        cv::resize(orig_image, resized_image, cv::Size(targetWidth, targetHeight));
    }

    // Keep the offset in size_t: the product can exceed the range of int.
    const size_t planeSize = width * height;
    T* const batchStart = blob_data + static_cast<size_t>(batchIndex) * planeSize * channels;

    for (size_t c = 0; c < channels; c++) {
        T* const plane = batchStart + c * planeSize;
        for (size_t h = 0; h < height; h++) {
            const cv::Vec3b* const row = resized_image.ptr<cv::Vec3b>(static_cast<int>(h));
            for (size_t w = 0; w < width; w++) {
                plane[h * width + w] = row[w][static_cast<int>(c)] * scaleFactor;
            }
        }
    }
}

// -------------------------Generic routines for detection networks-------------------------------------------------

/**
 * Common state and behaviour shared by all detectors in this sample: the loaded
 * network, its (lazily created) infer request and the command-line flag that
 * decides whether the detector is enabled.
 */
struct BaseDetection {
    ExecutableNetwork net;
    InferencePlugin * plugin = nullptr;   // set by Load::into(); null until the network is loaded
    InferRequest::Ptr request;            // created by the derived classes on first enqueue
    std::string & commandLineFlag;        // model path flag; an empty value disables the detector
    std::string topoName;                 // human readable name used in log messages
    const int maxBatch;

    BaseDetection(std::string &commandLineFlag, std::string topoName, int maxBatch)
        : commandLineFlag(commandLineFlag), topoName(std::move(topoName)), maxBatch(maxBatch) {}

    virtual ~BaseDetection() = default;

    /** Gives direct access to the underlying executable network. */
    ExecutableNetwork* operator ->() {
        return &net;
    }

    /** Reads and validates the network; implemented by every concrete detector. */
    virtual CNNNetwork read()  = 0;

    /** Starts asynchronous inference when the detector is enabled and a request exists. */
    virtual void submitRequest() {
        if (!enabled() || request == nullptr) return;
        request->StartAsync();
    }

    /** Blocks until the result of the current request is ready (no-op without a request). */
    virtual void wait() {
        if (!enabled()|| !request) return;
        request->Wait(IInferRequest::WaitMode::RESULT_READY);
    }

    // Cached result of the enabled() check; mutable so that enabled() can stay const.
    mutable bool enablingChecked = false;
    mutable bool _enabled = false;

    /**
     * Returns whether the detector is enabled. The flag is evaluated once, on
     * the first call, and a "DISABLED" message is logged when it is empty.
     */
    bool enabled() const  {
        if (!enablingChecked) {
            _enabled = !commandLineFlag.empty();
            if (!_enabled) {
                slog::info << topoName << " DISABLED" << slog::endl;
            }
            enablingChecked = true;
        }
        return _enabled;
    }

    /** Prints the performance counters of the infer request, if there is one. */
    void printPerformanceCounts() {
        // The request only exists after the first enqueue; without it there is nothing to report.
        if (!enabled() || !request) {
            return;
        }
        slog::info << "Performance counts for " << topoName << slog::endl << slog::endl;
        ::printPerformanceCounts(request->GetPerformanceCounts(), std::cout, false);
    }
};

/**
 * Face detector built on a network whose single output is a DetectionOutput
 * layer. Holds the blob names, the size of the last enqueued frame and the
 * detections decoded by fetchResults().
 */
struct FaceDetectionClass : BaseDetection {
    std::string input;             // name of the network input blob
    std::string output;            // name of the network output blob
    int maxProposalCount = 0;      // number of rows in the output (outputDims[2])
    int objectSize = 0;            // values per row (outputDims[3], must be 7)
    int enquedFrames = 0;
    float width = 0;               // size of the last enqueued frame, used to scale boxes
    float height = 0;
    bool resultsFetched = false;
    std::vector<std::string> labels;

    struct Result {
        int label;
        float confidence;
        cv::Rect location;
    };

    std::vector<Result> results;

    /** Starts inference for the enqueued frame; does nothing when no frame is pending. */
    void submitRequest() override {
        if (!enquedFrames) return;
        enquedFrames = 0;
        resultsFetched = false;
        results.clear();
        BaseDetection::submitRequest();
    }

    /** Copies the frame into the input blob, creating the infer request on first use. */
    void enqueue(const cv::Mat &frame) {
        if (!enabled()) return;

        if (!request) {
            request = net.CreateInferRequestPtr();
        }

        width = frame.cols;
        height = frame.rows;

        Blob::Ptr  inputBlob = request->GetBlob(input);

        matU8ToBlob<uint8_t >(frame, inputBlob);

        enquedFrames = 1;
    }


    FaceDetectionClass() : BaseDetection(FLAGS_m, "Face Detection", 1) {}

    /**
     * Reads the model given by -m together with its weights and optional
     * ".labels" file, validates the topology and configures input/output
     * precision and layout.
     *
     * @throws std::logic_error when the network does not have exactly one input
     *         and one 4-dimensional DetectionOutput output with 7 values per row
     */
    CNNNetwork read() override {
        slog::info << "Loading network files for Face Detection" << slog::endl;
        CNNNetReader netReader;
        /** Read network model **/
        netReader.ReadNetwork(FLAGS_m);
        /** Set batch size to 1 **/
        slog::info << "Batch size is set to  "<< maxBatch << slog::endl;
        netReader.getNetwork().setBatchSize(maxBatch);
        /** Extract model name and load it's weights **/
        std::string binFileName = fileNameNoExt(FLAGS_m) + ".bin";
        netReader.ReadWeights(binFileName);
        /** Read labels (if any)**/
        std::string labelFileName = fileNameNoExt(FLAGS_m) + ".labels";

        std::ifstream inputFile(labelFileName);
        std::copy(std::istream_iterator<std::string>(inputFile),
                  std::istream_iterator<std::string>(),
                  std::back_inserter(labels));
        // -----------------------------------------------------------------------------------------------------

        /** SSD-based network should have one input and one output **/
        // ---------------------------Check inputs ------------------------------------------------------
        slog::info << "Checking Face Detection inputs" << slog::endl;
        InputsDataMap inputInfo(netReader.getNetwork().getInputsInfo());
        if (inputInfo.size() != 1) {
            throw std::logic_error("Face Detection network should have only one input");
        }
        InputInfo::Ptr inputInfoFirst = inputInfo.begin()->second;
        inputInfoFirst->setPrecision(Precision::U8);
        inputInfoFirst->getInputData()->setLayout(Layout::NCHW);
        // -----------------------------------------------------------------------------------------------------

        // ---------------------------Check outputs ------------------------------------------------------
        slog::info << "Checking Face Detection outputs" << slog::endl;
        OutputsDataMap outputInfo(netReader.getNetwork().getOutputsInfo());
        if (outputInfo.size() != 1) {
            throw std::logic_error("Face Detection network should have only one output");
        }
        DataPtr& _output = outputInfo.begin()->second;
        output = outputInfo.begin()->first;

        const CNNLayerPtr outputLayer = netReader.getNetwork().getLayerByName(output.c_str());
        if (outputLayer->type != "DetectionOutput") {
            throw std::logic_error("Face Detection network output layer(" + outputLayer->name +
                ") should be DetectionOutput, but was " +  outputLayer->type);
        }

        if (outputLayer->params.find("num_classes") == outputLayer->params.end()) {
            throw std::logic_error("Face Detection network output layer (" +
                output + ") should have num_classes integer attribute");
        }

        const int num_classes = outputLayer->GetParamAsInt("num_classes");
        const int labelCount = static_cast<int>(labels.size());
        if (labelCount != num_classes) {
            if (labelCount == (num_classes - 1))  // if network assumes default "background" class, having no label
                labels.insert(labels.begin(), "fake");
            else
                labels.clear();
        }

        const SizeVector outputDims = _output->getTensorDesc().getDims();
        // Validate the rank before indexing into the dimensions.
        if (outputDims.size() != 4) {
            throw std::logic_error("Face Detection network output dimensions not compatible shoulld be 4, but was " +
                                           std::to_string(outputDims.size()));
        }
        maxProposalCount = static_cast<int>(outputDims[2]);
        objectSize = static_cast<int>(outputDims[3]);
        if (objectSize != 7) {
            throw std::logic_error("Face Detection network output layer should have 7 as a last dimension");
        }
        _output->setPrecision(Precision::FP32);
        _output->setLayout(Layout::NCHW);

        slog::info << "Loading Face Detection model to the "<< FLAGS_d << " plugin" << slog::endl;
        input = inputInfo.begin()->first;
        return netReader.getNetwork();
    }

    /**
     * Decodes the output blob of the last inference into `results`, keeping
     * only detections whose confidence is above -t.
     *
     * `results` is always cleared first; when the current inference has already
     * been fetched the function returns with the list left empty.
     */
    void fetchResults() {
        if (!enabled()) return;
        results.clear();
        if (resultsFetched) return;
        // No request means nothing was ever enqueued, so there is no output to read.
        if (!request) return;
        resultsFetched = true;
        const float *detections = request->GetBlob(output)->buffer().as<float *>();

        for (int i = 0; i < maxProposalCount; i++) {
            const float *row = detections + i * objectSize;

            // A negative image id marks the end of the valid detections. It has to be
            // checked before anything else, otherwise the confidence filter below could
            // skip the terminator and rows behind it would be decoded.
            const float image_id = row[0];
            if (image_id < 0) {
                break;
            }

            Result r;
            r.label = static_cast<int>(row[1]);
            r.confidence = row[2];
            if (r.confidence <= FLAGS_t) {
                continue;
            }

            // Rows hold two corner points scaled by the frame size here; convert to x/y/width/height.
            r.location.x = row[3] * width;
            r.location.y = row[4] * height;
            r.location.width = row[5] * width - r.location.x;
            r.location.height = row[6] * height - r.location.y;

            if (FLAGS_r) {
                std::cout << "[" << i << "," << r.label << "] element, prob = " << r.confidence <<
                          "    (" << r.location.x << "," << r.location.y << ")-(" << r.location.width << ","
                          << r.location.height << ")"
                          << ((r.confidence > FLAGS_t) ? " WILL BE RENDERED!" : "") << std::endl;
            }

            results.push_back(r);
        }
    }
};

/**
 * Age/gender estimator. Faces are collected into a batch of up to maxBatch
 * entries; per-face results are read back with operator[].
 */
struct AgeGenderDetection : BaseDetection {
    std::string input;          // name of the network input blob
    std::string outputAge;      // name of the age output blob
    std::string outputGender;   // name of the gender output blob
    int enquedFaces = 0;        // number of faces currently in the batch

    AgeGenderDetection() : BaseDetection(FLAGS_m_ag, "Age Gender", FLAGS_n_ag) {}

    /** Starts inference for the collected faces; does nothing when the batch is empty. */
    void submitRequest() override {
        if (!enquedFaces) return;
        BaseDetection::submitRequest();
        enquedFaces = 0;
    }

    /** Adds a face crop to the next free batch slot; extra faces beyond maxBatch are dropped with a warning. */
    void enqueue(const cv::Mat &face) {
        if (!enabled()) {
            return;
        }
        if (enquedFaces == maxBatch) {
            slog::warn << "Number of detected faces more than maximum(" << maxBatch << ") processed by Age Gender detector" << slog::endl;
            return;
        }
        if (!request) {
            request = net.CreateInferRequestPtr();
        }

        Blob::Ptr  inputBlob = request->GetBlob(input);

        matU8ToBlob<float>(face, inputBlob, 1.0f, enquedFaces);

        enquedFaces++;
    }

    struct Result { float age; float maleProb;};

    /**
     * Returns the result for batch entry idx: the age output multiplied by 100
     * and the second of the two gender values stored for that entry.
     */
    Result operator[] (int idx) const {
        Blob::Ptr  genderBlob = request->GetBlob(outputGender);
        Blob::Ptr  ageBlob    = request->GetBlob(outputAge);

        return {ageBlob->buffer().as<float*>()[idx] * 100,
                genderBlob->buffer().as<float*>()[idx * 2 + 1]};
    }

    /**
     * Reads the model given by -m_ag and its weights, validates that it has one
     * input and two outputs (a Convolution for age, a SoftMax for gender) and
     * records the blob names.
     *
     * @throws std::logic_error when the topology does not match
     */
    CNNNetwork read() override {
        slog::info << "Loading network files for AgeGender" << slog::endl;
        CNNNetReader netReader;
        /** Read network model **/
        netReader.ReadNetwork(FLAGS_m_ag);

        /** Set batch size to the maximum number of faces processed at once (maxBatch) **/
        netReader.getNetwork().setBatchSize(maxBatch);
        slog::info << "Batch size is set to " << netReader.getNetwork().getBatchSize() << " for Age Gender" << slog::endl;


        /** Extract model name and load it's weights **/
        std::string binFileName = fileNameNoExt(FLAGS_m_ag) + ".bin";
        netReader.ReadWeights(binFileName);

        // -----------------------------------------------------------------------------------------------------

        /** Age Gender network should have one input two outputs **/
        // ---------------------------Check inputs ------------------------------------------------------
        slog::info << "Checking Age Gender inputs" << slog::endl;
        InputsDataMap inputInfo(netReader.getNetwork().getInputsInfo());
        if (inputInfo.size() != 1) {
            throw std::logic_error("Age gender topology should have only one input");
        }
        InputInfo::Ptr& inputInfoFirst = inputInfo.begin()->second;
        inputInfoFirst->setPrecision(Precision::FP32);
        inputInfoFirst->getInputData()->setLayout(Layout::NCHW);
        input = inputInfo.begin()->first;
        // -----------------------------------------------------------------------------------------------------

        // ---------------------------Check outputs ------------------------------------------------------
        slog::info << "Checking Age Gender outputs" << slog::endl;
        OutputsDataMap outputInfo(netReader.getNetwork().getOutputsInfo());
        if (outputInfo.size() != 2) {
            throw std::logic_error("Age Gender network should have two output layers");
        }
        auto it = outputInfo.begin();
        DataPtr ptrAgeOutput = (it++)->second;
        DataPtr ptrGenderOutput = (it++)->second;

        if (!ptrAgeOutput) {
            throw std::logic_error("Age output data pointer is not valid");
        }
        if (!ptrGenderOutput) {
            throw std::logic_error("Gender output data pointer is not valid");
        }

        auto genderCreatorLayer = ptrGenderOutput->getCreatorLayer().lock();
        auto ageCreatorLayer = ptrAgeOutput->getCreatorLayer().lock();

        if (!ageCreatorLayer) {
            throw std::logic_error("Age's creator layer pointer is not valid");
        }
        if (!genderCreatorLayer) {
            throw std::logic_error("Gender's creator layer pointer is not valid");
        }

        // if gender output is convolution, it can be swapped with age.
        // The creator layers are swapped together with the data pointers so that the
        // checks and log messages below always describe the layer they name.
        if (genderCreatorLayer->type == "Convolution") {
            std::swap(ptrAgeOutput, ptrGenderOutput);
            std::swap(ageCreatorLayer, genderCreatorLayer);
        }

        if (ageCreatorLayer->type != "Convolution") {
            throw std::logic_error("In Age Gender network, age layer (" + ageCreatorLayer->name +
                ") should be a Convolution, but was: " + ageCreatorLayer->type);
        }

        if (genderCreatorLayer->type != "SoftMax") {
            throw std::logic_error("In Age Gender network, gender layer (" + genderCreatorLayer->name +
                ") should be a SoftMax, but was: " + genderCreatorLayer->type);
        }
        slog::info << "Age layer: " << ageCreatorLayer->name<< slog::endl;
        slog::info << "Gender layer: " << genderCreatorLayer->name<< slog::endl;

        outputAge = ptrAgeOutput->name;
        outputGender = ptrGenderOutput->name;

        slog::info << "Loading Age Gender model to the "<< FLAGS_d_ag << " plugin" << slog::endl;
        _enabled = true;
        return netReader.getNetwork();
    }
};

/**
 * Head pose estimator. Faces are collected into a batch of up to maxBatch
 * entries; the three angles of a face are read back with operator[] and can be
 * visualised with drawAxes().
 */
struct HeadPoseDetection : BaseDetection {
    std::string input;                        // name of the network input blob
    std::string outputAngleR = "angle_r_fc";  // roll output
    std::string outputAngleP = "angle_p_fc";  // pitch output
    std::string outputAngleY = "angle_y_fc";  // yaw output
    int enquedFaces = 0;                      // number of faces currently in the batch
    cv::Mat cameraMatrix;                     // built lazily by buildCameraMatrix()
    HeadPoseDetection() : BaseDetection(FLAGS_m_hp, "Head Pose", FLAGS_n_hp) {}

    /** Starts inference for the collected faces; does nothing when the batch is empty. */
    void submitRequest() override {
        if (!enquedFaces) return;
        BaseDetection::submitRequest();
        enquedFaces = 0;
    }

    /** Adds a face crop to the next free batch slot; extra faces beyond maxBatch are dropped with a warning. */
    void enqueue(const cv::Mat &face) {
        if (!enabled()) {
            return;
        }
        if (enquedFaces == maxBatch) {
            slog::warn << "Number of detected faces more than maximum(" << maxBatch << ") processed by Head Pose detector" << slog::endl;
            return;
        }
        if (!request) {
            request = net.CreateInferRequestPtr();
        }

        Blob::Ptr  inputBlob = request->GetBlob(input);

        matU8ToBlob<float>(face, inputBlob, 1.0f, enquedFaces);

        enquedFaces++;
    }

    struct Results {
        float angle_r;
        float angle_p;
        float angle_y;
    };

    /** Returns the roll/pitch/yaw outputs for batch entry idx. */
    Results operator[] (int idx) const {
        Blob::Ptr  angleR = request->GetBlob(outputAngleR);
        Blob::Ptr  angleP = request->GetBlob(outputAngleP);
        Blob::Ptr  angleY = request->GetBlob(outputAngleY);

        return {angleR->buffer().as<float*>()[idx],
                angleP->buffer().as<float*>()[idx],
                angleY->buffer().as<float*>()[idx]};
    }

    /**
     * Reads the model given by -m_hp and its weights and validates that it has
     * one input and exactly the three expected FullyConnected outputs, each
     * producing a single value.
     *
     * @throws std::logic_error when the topology does not match
     */
    CNNNetwork read() override {
        slog::info << "Loading network files for Head Pose detection " << slog::endl;
        CNNNetReader netReader;
        /** Read network model **/
        netReader.ReadNetwork(FLAGS_m_hp);
        /** Set batch size to maximum currently set to one provided from command line **/
        netReader.getNetwork().setBatchSize(maxBatch);
        slog::info << "Batch size is sey to  " << netReader.getNetwork().getBatchSize() << " for Head Pose Network" << slog::endl;
        /** Extract model name and load it's weights **/
        std::string binFileName = fileNameNoExt(FLAGS_m_hp) + ".bin";
        netReader.ReadWeights(binFileName);


        /** Head Pose network should have one input and three outputs **/
        // ---------------------------Check inputs ------------------------------------------------------
        slog::info << "Checking Head Pose Network inputs" << slog::endl;
        InputsDataMap inputInfo(netReader.getNetwork().getInputsInfo());
        if (inputInfo.size() != 1) {
            throw std::logic_error("Head Pose topology should have only one input");
        }
        InputInfo::Ptr& inputInfoFirst = inputInfo.begin()->second;
        inputInfoFirst->setPrecision(Precision::FP32);
        inputInfoFirst->getInputData()->setLayout(Layout::NCHW);
        input = inputInfo.begin()->first;
        // -----------------------------------------------------------------------------------------------------

        // ---------------------------Check outputs ------------------------------------------------------
        slog::info << "Checking Head Pose network outputs" << slog::endl;
        OutputsDataMap outputInfo(netReader.getNetwork().getOutputsInfo());
        if (outputInfo.size() != 3) {
            throw std::logic_error("Head Pose network should have 3 outputs");
        }
        // Expected output layers; the flag records whether the layer was seen.
        std::map<std::string, bool> layerNames = {
            {outputAngleR, false},
            {outputAngleP, false},
            {outputAngleY, false}
        };

        for (auto && output : outputInfo) {
            CNNLayerPtr layer = output.second->getCreatorLayer().lock();
            if (!layer) {
                throw std::logic_error("Layer pointer is invalid");
            }
            if (layerNames.find(layer->name) == layerNames.end()) {
                throw std::logic_error("Head Pose network output layer unknown: " + layer->name + ", should be " +
                    outputAngleR + " or " + outputAngleP + " or " + outputAngleY);
            }
            if (layer->type != "FullyConnected") {
                throw std::logic_error("Head Pose network output layer (" + layer->name + ") has invalid type: " +
                    layer->type + ", should be FullyConnected");
            }
            auto fc = dynamic_cast<FullyConnectedLayer*>(layer.get());
            if (!fc) {
                throw std::logic_error("Fully connected layer is not valid");
            }
            if (fc->_out_num != 1) {
                throw std::logic_error("Head Pose network output layer (" + layer->name + ") has invalid out-size=" +
                    std::to_string(fc->_out_num) + ", should be 1");
            }
            layerNames[layer->name] = true;
        }

        // Three outputs were found, but make sure each expected layer is among them
        // (two outputs created by the same layer would leave one of them missing).
        for (const auto &expected : layerNames) {
            if (!expected.second) {
                throw std::logic_error("Head Pose network output layer missing: " + expected.first);
            }
        }

        slog::info << "Loading Head Pose model to the "<< FLAGS_d_hp << " plugin" << slog::endl;

        _enabled = true;
        return netReader.getNetwork();
    }

    /**
     * Builds the 3x3 camera matrix (focal length on the diagonal, principal
     * point cx/cy) once; later calls keep the existing matrix.
     */
    void buildCameraMatrix(int cx, int cy, float focalLength) {
        if (!cameraMatrix.empty()) return;
        cameraMatrix = cv::Mat::zeros(3, 3, CV_32F);
        cameraMatrix.at<float>(0) = focalLength;
        cameraMatrix.at<float>(2) = static_cast<float>(cx);
        cameraMatrix.at<float>(4) = focalLength;
        cameraMatrix.at<float>(5) = static_cast<float>(cy);
        cameraMatrix.at<float>(8) = 1;
    }

    /**
     * Draws the three head axes onto frame, anchored at cpoint.
     *
     * @param frame    image to draw on
     * @param cpoint   2D anchor of the axes (only x and y are used)
     * @param headPose angles in degrees as returned by operator[]
     * @param scale    length of the axes before projection
     */
    void drawAxes(cv::Mat& frame, cv::Point3f cpoint, Results headPose, float scale) {
        double yaw   = headPose.angle_y;
        double pitch = headPose.angle_p;
        double roll  = headPose.angle_r;

        if (FLAGS_r) {
            std::cout << "Head pose results: yaw, pitch, roll = " << yaw << ";" << pitch << ";" << roll << std::endl;
        }

        // Degrees to radians.
        pitch *= CV_PI / 180.0;
        yaw   *= CV_PI / 180.0;
        roll  *= CV_PI / 180.0;

        cv::Matx33f        Rx(1,           0,            0,
                              0,  cos(pitch),  -sin(pitch),
                              0,  sin(pitch),  cos(pitch));
        cv::Matx33f Ry(cos(yaw),           0,    -sin(yaw),
                              0,           1,            0,
                       sin(yaw),           0,    cos(yaw));
        cv::Matx33f Rz(cos(roll), -sin(roll),            0,
                       sin(roll),  cos(roll),            0,
                              0,           0,            1);


        auto r = cv::Mat(Rz*Ry*Rx);
        buildCameraMatrix(frame.cols / 2, frame.rows / 2, 950.0);

        const float focalX = cameraMatrix.at<float>(0);
        const float focalY = cameraMatrix.at<float>(4);

        // Translation that places the axes at a depth equal to the focal length.
        cv::Mat o(3, 1, CV_32F, cv::Scalar(0));
        o.at<float>(2) = focalX;

        // Rotates the axis end point (x, y, z) and moves it in front of the camera.
        auto placeAxis = [&](float x, float y, float z) {
            cv::Mat axis(3, 1, CV_32F);
            axis.at<float>(0) = x;
            axis.at<float>(1) = y;
            axis.at<float>(2) = z;
            cv::Mat placed = r * axis + o;
            return placed;
        };

        // Perspective projection of a placed axis end point, offset by the anchor.
        auto project = [&](const cv::Mat &axis) {
            cv::Point p;
            p.x = static_cast<int>((axis.at<float>(0) / axis.at<float>(2) * focalX) + cpoint.x);
            p.y = static_cast<int>((axis.at<float>(1) / axis.at<float>(2) * focalY) + cpoint.y);
            return p;
        };

        const cv::Mat xAxis  = placeAxis(1 * scale, 0, 0);
        const cv::Mat yAxis  = placeAxis(0, -1 * scale, 0);
        const cv::Mat zAxis  = placeAxis(0, 0, -1 * scale);
        const cv::Mat zAxis1 = placeAxis(0, 0, 1 * scale);

        const cv::Point anchor(cpoint.x, cpoint.y);

        cv::line(frame, anchor, project(xAxis), cv::Scalar(0, 0, 255), 2);
        cv::line(frame, anchor, project(yAxis), cv::Scalar(0, 255, 0), 2);

        // The z axis is drawn between its two projected end points, with a circle on the -z end.
        const cv::Point p1 = project(zAxis1);
        const cv::Point p2 = project(zAxis);
        cv::line(frame, p1, p2, cv::Scalar(255, 0, 0), 2);
        cv::circle(frame, p2, 3, cv::Scalar(255, 0, 0), 2);
    }
};

/**
 * Emotion classifier. Faces are collected into a batch of up to maxBatch
 * entries; the most probable emotion of a face is read back with operator[].
 */
struct EmotionsDetectionClass : BaseDetection {
    std::string input;            // name of the network input blob
    std::string outputEmotions;   // name of the network output blob
    int enquedFaces = 0;          // number of faces currently in the batch

    EmotionsDetectionClass() : BaseDetection(FLAGS_m_em, "Emotions Recognition", FLAGS_n_em) {}

    /** Starts inference for the collected faces; does nothing when the batch is empty. */
    void submitRequest() override {
        if (!enquedFaces) return;
        BaseDetection::submitRequest();
        enquedFaces = 0;
    }

    /** Adds a face crop to the next free batch slot; extra faces beyond maxBatch are dropped with a warning. */
    void enqueue(const cv::Mat &face) {
        if (!enabled()) {
            return;
        }
        if (enquedFaces == maxBatch) {
            slog::warn << "Number of detected faces more than maximum(" << maxBatch << ") processed by Emotions detector" << slog::endl;
            return;
        }
        if (!request) {
            request = net.CreateInferRequestPtr();
        }

        Blob::Ptr inputBlob = request->GetBlob(input);

        matU8ToBlob<float>(face, inputBlob, 1.0f, enquedFaces);

        enquedFaces++;
    }

    /**
     * Returns the name of the most probable emotion for batch entry idx.
     *
     * @throws std::logic_error when the number of output channels differs from
     *         the number of supported emotions
     */
    std::string operator[] (int idx) const {
        /* vector of supported emotions */
        static const std::vector<std::string> emotionsVec = {"neutral", "happy", "sad", "surprise", "anger"};
        const size_t emotionsVecSize = emotionsVec.size();

        Blob::Ptr emotionsBlob = request->GetBlob(outputEmotions);

        /* emotions vector must have the same size as number of channels
         * in model output. Default output format is NCHW so we check index 1. */
        const size_t numOfChannels = emotionsBlob->getTensorDesc().getDims().at(1);
        if (numOfChannels != emotionsVecSize) {
            throw std::logic_error("Output size (" + std::to_string(numOfChannels) +
                                   ") of the Emotions Recognition network is not equal "
                                   "to used emotions vector size (" +
                                   std::to_string(emotionsVecSize) + ")");
        }

        const float *emotionsValues = emotionsBlob->buffer().as<float *>();
        /* every batch entry owns emotionsVecSize consecutive values, so the scores
           of entry idx start at idx * emotionsVecSize (not at idx) */
        const float *outputIdxPos = emotionsValues + static_cast<size_t>(idx) * emotionsVecSize;

        /* we identify an index of the most probable emotion in output array
           for idx image to return appropriate emotion name */
        const auto maxProbEmotionIx = std::max_element(outputIdxPos, outputIdxPos + emotionsVecSize) - outputIdxPos;
        return emotionsVec[maxProbEmotionIx];
    }

    /**
     * Reads the model given by -m_em and its weights and validates that it has
     * one input and a single SoftMax output.
     *
     * @throws std::logic_error when the topology does not match
     */
    CNNNetwork read() override {
        slog::info << "Loading network files for Emotions recognition" << slog::endl;
        InferenceEngine::CNNNetReader netReader;
        /** Read network model **/
        netReader.ReadNetwork(FLAGS_m_em);

        /** Set batch size to the maximum number of faces processed at once (maxBatch) **/
        netReader.getNetwork().setBatchSize(maxBatch);
        slog::info << "Batch size is set to " << netReader.getNetwork().getBatchSize() << " for Emotions recognition" << slog::endl;


        /** Extract model name and load it's weights **/
        std::string binFileName = fileNameNoExt(FLAGS_m_em) + ".bin";
        netReader.ReadWeights(binFileName);

        // -----------------------------------------------------------------------------------------------------

        /** Emotions recognition network should have one input and one output **/
        // ---------------------------Check inputs ------------------------------------------------------
        slog::info << "Checking Emotions Recognition inputs" << slog::endl;
        InferenceEngine::InputsDataMap inputInfo(netReader.getNetwork().getInputsInfo());
        if (inputInfo.size() != 1) {
            throw std::logic_error("Emotions Recognition topology should have only one input");
        }
        auto& inputInfoFirst = inputInfo.begin()->second;
        inputInfoFirst->setPrecision(Precision::FP32);
        inputInfoFirst->getInputData()->setLayout(Layout::NCHW);
        input = inputInfo.begin()->first;
        // -----------------------------------------------------------------------------------------------------

        // ---------------------------Check outputs ------------------------------------------------------
        slog::info << "Checking Emotions Recognition outputs" << slog::endl;
        InferenceEngine::OutputsDataMap outputInfo(netReader.getNetwork().getOutputsInfo());
        if (outputInfo.size() != 1) {
            throw std::logic_error("Emotions Recognition network should have one output layer");
        }

        DataPtr emotionsOutput = outputInfo.begin()->second;

        if (!emotionsOutput) {
            throw std::logic_error("Emotions output data pointer is invalid");
        }

        auto emotionsCreatorLayer = emotionsOutput->getCreatorLayer().lock();

        if (!emotionsCreatorLayer) {
            throw std::logic_error("Emotions creator layer pointer is invalid");
        }

        if (emotionsCreatorLayer->type != "SoftMax") {
            throw std::logic_error("In Emotions Recognition network, Emotion layer ("
                                   + emotionsCreatorLayer->name +
                                   ") should be a SoftMax, but was: " +
                                           emotionsCreatorLayer->type);
        }
        slog::info << "Emotions layer: " << emotionsCreatorLayer->name<< slog::endl;

        outputEmotions = emotionsOutput->name;

        slog::info << "Loading Emotions Recognition model to the "<< FLAGS_d_em << " plugin" << slog::endl;
        _enabled = true;
        return netReader.getNetwork();
    }
};

/**
 * \brief Small helper that binds a detector to the plugin it should run on.
 *
 * Usage: Load(detector).into(plugin);
 */
struct Load {
    /** Detector whose network is going to be loaded. */
    BaseDetection& detector;

    explicit Load(BaseDetection& target) : detector(target) { }

    /**
     * \brief Loads the detector's network into \p plg and remembers the plugin.
     *
     * Nothing happens when the detector reports that it is not enabled.
     *
     * \param plg plugin that receives the network; its address is stored in the detector
     */
    void into(InferencePlugin & plg) const {
        if (!detector.enabled()) {
            return;
        }

        detector.net = plg.LoadNetwork(detector.read(), {});
        detector.plugin = &plg;
    }
};

/**
 * \brief Entry point of the RealSense-based interactive face detection sample.
 *
 * Parses the command line, starts camera streaming through initialize_streaming(),
 * loads one Inference Engine plugin per requested device, loads the enabled networks
 * into those plugins and then runs the detection loop until a key is pressed.
 *
 * \param argc number of command-line arguments
 * \param argv command-line argument values
 * \return 0 on success or when ParseAndCheckCommandLine() returns false,
 *         EXIT_FAILURE when initialize_streaming() fails,
 *         1 when an exception is caught
 */
int main(int argc, char *argv[]) {
    try {
        /** This sample covers 3 certain topologies and cannot be generalized **/
        std::cout << "InferenceEngine: " << GetInferenceEngineVersion() << std::endl;

        // ------------------------------ Parsing and validation of input args ---------------------------------
        if (!ParseAndCheckCommandLine(argc, argv)) {
            return 0;
        }
        rs::log_to_console(rs::log_severity::warn);

        // Streaming is initialized exactly once; a second call on an already
        // configured device is at best redundant.
        if (!initialize_streaming()) {
            std::cout << "Unable to locate a camera" << std::endl;
            rs::log_to_console(rs::log_severity::fatal);
            return EXIT_FAILURE;
        }

        slog::info << "Reading input" << slog::endl;
        const int frameWidth  = static_cast<int>(_color_intrin.width);
        const int frameHeight = static_cast<int>(_color_intrin.height);
        // -----------------------------------------------------------------------------------------------------

        // --------------------------- 1. Load Plugin for inference engine -------------------------------------
        std::map<std::string, InferencePlugin> pluginsForDevices;
        std::vector<std::pair<std::string, std::string>> cmdOptions = {
            {FLAGS_d, FLAGS_m}, {FLAGS_d_ag, FLAGS_m_ag}, {FLAGS_d_hp, FLAGS_m_hp},
            {FLAGS_d_em, FLAGS_m_em}
        };

        FaceDetectionClass FaceDetection;
        AgeGenderDetection AgeGender;
        HeadPoseDetection HeadPose;
        EmotionsDetectionClass EmotionsDetection;

        for (auto && option : cmdOptions) {
            auto deviceName = option.first;
            auto networkName = option.second;

            // A detector without a device or a model needs no plugin.
            if (deviceName == "" || networkName == "") {
                continue;
            }

            // One plugin instance per device is shared by all networks targeting it.
            if (pluginsForDevices.find(deviceName) != pluginsForDevices.end()) {
                continue;
            }
            slog::info << "Loading plugin " << deviceName << slog::endl;
            InferencePlugin plugin = PluginDispatcher({"../../../lib/intel64", ""}).getPluginByDevice(deviceName);

            /** Printing plugin version **/
            printPluginVersion(plugin, std::cout);

            /** Load extensions for the CPU plugin **/
            if ((deviceName.find("CPU") != std::string::npos)) {
                plugin.AddExtension(std::make_shared<Extensions::Cpu::CpuExtensions>());

                if (!FLAGS_l.empty()) {
                    // CPU(MKLDNN) extensions are loaded as a shared library and passed as a pointer to base extension
                    auto extension_ptr = make_so_pointer<MKLDNNPlugin::IMKLDNNExtension>(FLAGS_l);
                    plugin.AddExtension(std::static_pointer_cast<IExtension>(extension_ptr));
                }
            } else if (!FLAGS_c.empty()) {
                // Load Extensions for other plugins not CPU
                plugin.SetConfig({ { PluginConfigParams::KEY_CONFIG_FILE, FLAGS_c } });
            }
            pluginsForDevices[deviceName] = plugin;
        }

        /** Per layer metrics **/
        if (FLAGS_pc) {
            for (auto && plugin : pluginsForDevices) {
                plugin.second.SetConfig({{PluginConfigParams::KEY_PERF_COUNT, PluginConfigParams::YES}});
            }
        }
        // -----------------------------------------------------------------------------------------------------

        // --------------------------- 2. Read IR models and load them to plugins ------------------------------
        Load(FaceDetection).into(pluginsForDevices[FLAGS_d]);
        Load(AgeGender).into(pluginsForDevices[FLAGS_d_ag]);
        Load(HeadPose).into(pluginsForDevices[FLAGS_d_hp]);
        Load(EmotionsDetection).into(pluginsForDevices[FLAGS_d_em]);
        // -----------------------------------------------------------------------------------------------------

        // --------------------------- 3. Do inference ---------------------------------------------------------
        slog::info << "Start inference " << slog::endl;
        typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
        auto wallclock = std::chrono::high_resolution_clock::now();

        double ocv_decode_time = 0, ocv_render_time = 0;
        bool firstFrame = true;
        /** Start inference & calc performance **/

        while (true) {
            /** requesting new frame if any*/
            if (_rs_camera->is_streaming()) {
                _rs_camera->wait_for_frames();
            }

            display_next_frame();

            auto t0 = std::chrono::high_resolution_clock::now();

            // Re-read the colour buffer on every iteration: a pointer obtained before
            // wait_for_frames() must not be assumed to stay valid or current.
            const void* colorData = _rs_camera->get_frame_data(rs::stream::color);
            if (colorData == nullptr) {
                throw std::logic_error("Failed to get color frame from the camera");
            }
            // Work on a private copy so the overlays drawn below never touch the
            // camera-owned buffer (the API hands it out as const).
            // NOTE(review): the channel order is whatever the colour stream was configured
            // with in initialize_streaming() - confirm it matches what the networks expect.
            cv::Mat rgb = cv::Mat(frameHeight,
                                  frameWidth,
                                  CV_8UC3,
                                  const_cast<void*>(colorData)).clone();

            FaceDetection.enqueue(rgb);
            auto t1 = std::chrono::high_resolution_clock::now();
            ocv_decode_time = std::chrono::duration_cast<ms>(t1 - t0).count();

            t0 = std::chrono::high_resolution_clock::now();
            // ----------------------------Run face detection inference-----------------------------------------
            FaceDetection.submitRequest();
            FaceDetection.wait();

            t1 = std::chrono::high_resolution_clock::now();
            ms detection = std::chrono::duration_cast<ms>(t1 - t0);

            FaceDetection.fetchResults();

            for (auto && face : FaceDetection.results) {
                if (AgeGender.enabled() || HeadPose.enabled() || EmotionsDetection.enabled()) {
                    // Clip to the frame so the ROI below is always inside the image.
                    auto clippedRect = face.location & cv::Rect(0, 0, frameWidth, frameHeight);
                    cv::Mat faceRoi = rgb(clippedRect);
                    AgeGender.enqueue(faceRoi);
                    HeadPose.enqueue(faceRoi);
                    EmotionsDetection.enqueue(faceRoi);
                }
            }
            // ----------------------------Run age-gender, and head pose detection simultaneously---------------
            t0 = std::chrono::high_resolution_clock::now();
            if (AgeGender.enabled() || HeadPose.enabled() || EmotionsDetection.enabled()) {
                AgeGender.submitRequest();
                HeadPose.submitRequest();
                EmotionsDetection.submitRequest();

                AgeGender.wait();
                HeadPose.wait();
                EmotionsDetection.wait();
            }
            t1 = std::chrono::high_resolution_clock::now();
            ms secondDetection = std::chrono::duration_cast<ms>(t1 - t0);

            // ----------------------------Processing outputs---------------------------------------------------
            std::ostringstream out;
            out << "OpenCV cap/render time: " << std::fixed << std::setprecision(2)
                << (ocv_decode_time + ocv_render_time) << " ms";
            cv::putText(rgb, out.str(), cv::Point2f(0, 25), cv::FONT_HERSHEY_TRIPLEX, 0.5, cv::Scalar(255, 0, 0));

            out.str("");
            out << "Face detection time: " << std::fixed << std::setprecision(2) << detection.count()
                << " ms ("
                << 1000.f / detection.count() << " fps)";
            cv::putText(rgb, out.str(), cv::Point2f(0, 45), cv::FONT_HERSHEY_TRIPLEX, 0.5,
                        cv::Scalar(255, 0, 0));

            if (HeadPose.enabled() || AgeGender.enabled() || EmotionsDetection.enabled()) {
                out.str("");
                out << (AgeGender.enabled() ? "Age Gender "  : "")
                    << (AgeGender.enabled() && (HeadPose.enabled() || EmotionsDetection.enabled()) ? "+ "  : "")
                    << (HeadPose.enabled() ? "Head Pose "  : "")
                    << (HeadPose.enabled() && EmotionsDetection.enabled() ? "+ " : "")
                    << (EmotionsDetection.enabled() ? "Emotions Recognition " : "")
                    << "time: "<< std::fixed << std::setprecision(2) << secondDetection.count()
                    << " ms ";
                if (!FaceDetection.results.empty()) {
                    out << "(" << 1000.f / secondDetection.count() << " fps)";
                }
                cv::putText(rgb, out.str(), cv::Point2f(0, 65), cv::FONT_HERSHEY_TRIPLEX, 0.5, cv::Scalar(255, 0, 0));
            }

            int i = 0;
            for (auto & result : FaceDetection.results) {
                cv::Rect rect = result.location;

                out.str("");

                if (AgeGender.enabled() && i < AgeGender.maxBatch) {
                    out << (AgeGender[i].maleProb > 0.5 ? "M" : "F");
                    out << std::fixed << std::setprecision(0) << "," << AgeGender[i].age;
                    if (FLAGS_r) {
                        std::cout << "Predicted gender, age = " << out.str() << std::endl;
                    }
                } else {
                    out << (result.label < FaceDetection.labels.size() ? FaceDetection.labels[result.label] :
                             std::string("label #") + std::to_string(result.label))
                        << ": " << std::fixed << std::setprecision(3) << result.confidence;
                }

                // Same batch bound as the other per-face networks: faces beyond the
                // batch size have no emotion result to read.
                if (EmotionsDetection.enabled() && i < EmotionsDetection.maxBatch) {
                    /* currently we display only most probable emotion */
                    std::string emotion = EmotionsDetection[i];
                    if (FLAGS_r) {
                        std::cout << "Predicted emotion = " << emotion << std::endl;
                    }
                    out << "," << emotion;
                }

                cv::putText(rgb,
                            out.str(),
                            cv::Point2f(result.location.x, result.location.y - 15),
                            cv::FONT_HERSHEY_COMPLEX_SMALL,
                            0.8,
                            cv::Scalar(0, 0, 255));

                if (HeadPose.enabled() && i < HeadPose.maxBatch) {
                    cv::Point3f center(rect.x + rect.width / 2, rect.y + rect.height / 2, 0);
                    HeadPose.drawAxes(rgb, center, HeadPose[i], 50);
                }

                auto genderColor = (AgeGender.enabled() && (i < AgeGender.maxBatch)) ?
                              ((AgeGender[i].maleProb < 0.5) ? cv::Scalar(147, 20, 255) : cv::Scalar(255, 0, 0)) :
                              cv::Scalar(100, 100, 100);

                cv::rectangle(rgb, result.location, genderColor, 1);

                i++;
            }

            // Any key press ends the loop.
            if (-1 != cv::waitKey(1))
                break;

            t0 = std::chrono::high_resolution_clock::now();
            if (!FLAGS_no_show) {
                cv::imshow("Detection results", rgb);
            }
            t1 = std::chrono::high_resolution_clock::now();
            ocv_render_time = std::chrono::duration_cast<ms>(t1 - t0).count();

            if (firstFrame) {
                slog::info << "Press any key to stop" << slog::endl;
            }

            firstFrame = false;
        }

        /** Show performance results **/
        if (FLAGS_pc) {
            FaceDetection.printPerformanceCounts();
            AgeGender.printPerformanceCounts();
            HeadPose.printPerformanceCounts();
            EmotionsDetection.printPerformanceCounts();
        }
        // -----------------------------------------------------------------------------------------------------
    }
    catch (const std::exception& error) {
        slog::err << error.what() << slog::endl;
        return 1;
    }
    catch (...) {
        slog::err << "Unknown/internal exception happened." << slog::endl;
        return 1;
    }

    slog::info << "Execution successful" << slog::endl;
    return 0;
}