marty1885/hog.cpp Secret

## hog.cpp
//NOTE: The following source code is only for educational purpose. DO NOT use it for commerical purpose.
//However. You can use it as a referenece to implement commerical code.

//NOTE: Remember to serailize parallel_for_ sice the detect can and will be called in parallel
//otherwise. That is bad for FPGA code. OpenCL buffers are shared between threads. We don't
//want race condition.
//Add these to the end of HOGDescriptor's declartion
/*
public:
    bool initCL();

    cl::Platform mPlatform;
    cl::Device mDevice;
    cl::Context mContext;
    cl::Program mProgram;
    cl::CommandQueue mCommandQueue;
    cl::Kernel mSVMKernel;
    cl::Buffer mSVMBuffer;
    cl::Buffer mVecBuffer;
    cl::Buffer mResultBuffer;

    bool mInitedBuffer;
    float* mResult;
    float* mVecData;
    float* mSVMData;

*/
#define ON_FPGA //Comment this line if running on GPU
//Call thins functoin after creating the HOGDescriptor object
bool HOGDescriptor::initCL()
{
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    if(platforms.size() == 0)
    {
        std::cout << "Error: no OpenCL platform found." << std::endl;
        return false;
    }
    std::cout << "Platform get!" << std::endl;
    std::cout << platforms.size() << " platforms found." << std::endl;
    mPlatform = platforms[0];

    std::vector<cl::Device> devices;
    mPlatform.getDevices(CL_DEVICE_TYPE_ALL,&devices);
    if(devices.size() == 0)
    {
        std::cout << "Error: no OpenCL device found." << std::endl;
        return false;
    }
    std::cout << devices.size() << " devieces found." << std::endl;
    mDevice = devices[0];
    std::cout << "Device get!" << std::endl;

    cl::Context context({mDevice});
    mContext = context;
    std::cout << "Context created!" << std::endl;

    #ifndef ON_FPGA
    std::ifstream sourceFile("svm.cl");
    if(sourceFile.good() == false)
    {
        std::cout << "Error: cannot load kernel source" << std::endl;
        return false;
    }

    cl::Program::Sources sources;
    std::string kernelSource((std::istreambuf_iterator<char>(sourceFile)),
                 std::istreambuf_iterator<char>());

    sources.push_back({kernelSource.c_str(),kernelSource.length()});
    cl::Program program(mContext,sources);
    mProgram = program;
    if(mProgram.build({mDevice})!=CL_SUCCESS)
    {
        std::cout<<" Error building: "<<mProgram.getBuildInfo<CL_PROGRAM_BUILD_LOG>(mDevice)<<"\n";
        return false;
    }

    #else//On FPGA
    std::ifstream in("svm.aocx",std::ios::binary);//NOTE: Load this kernel using the aoc command first
    std::vector<unsigned char> binaryVal;
    if(in.good() == false)
    {
    	std::cout << "Cannot open AOCX file" << std::endl;
        return false;
    }
    unsigned char b;
    while(!in.eof())
    {
        in.read((char*)&b,1);
        binaryVal.push_back(b);
    }

    typedef std::pair<void*,size_t> Binary;
    Binary binary;
    binary.first = (void*)&binaryVal[0];
    binary.second = binaryVal.size();
    //binaryProgram.push_back(binary);
    cl_int err;
    cl::Program program(mContext,{mDevice},{binary},nullptr,&err);
    if(err != 0)
    {
        std::cout << "Error loading binary program!!!!" << std::endl;
        return false;
    }
    mProgram = program;

    #endif
    std::cout << "Program loaded" << std::endl;
    cl::CommandQueue commandQueue(mContext,mDevice);
    mCommandQueue = commandQueue;
    std::cout << "CommandQueue created" << std::endl;

    cl::Kernel kernel(mProgram, "compute");
    mSVMKernel = kernel;
    std::cout << "Kernel created" << std::endl;

    mInitedBuffer = false;

    return true;
}

//Where the computing happens
void HOGDescriptor::detect(const Mat& img,
    std::vector<Point>& hits, std::vector<double>& weights, double hitThreshold,
    Size winStride, Size padding, const std::vector<Point>& locations)
{
    //auto time1 = high_resolution_clock::now();
    CV_INSTRUMENT_REGION()

    hits.clear();
    weights.clear();
    if( svmDetector.empty() )
        return;

    if( winStride == Size() )
        winStride = cellSize;
    Size cacheStride(gcd(winStride.width, blockStride.width),
        gcd(winStride.height, blockStride.height));

    size_t nwindows = locations.size();
    padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
    padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
    Size paddedImgSize(img.cols + padding.width*2, img.rows + padding.height*2);

    HOGCache cache(this, img, padding, padding, nwindows == 0, cacheStride);

    if( !nwindows )
        nwindows = cache.windowsInImage(paddedImgSize, winStride).area();

    const HOGCache::BlockData* blockData = &cache.blockData[0];

    int nblocks = cache.nblocks.area();
    int blockHistogramSize = cache.blockHistogramSize;
    size_t dsize = getDescriptorSize();

    double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
    std::vector<float> blockHist(blockHistogramSize);

    const int MAX_SIM_WINDOWS = 64;
    size_t size = blockHistogramSize*nblocks;
    size_t vecSize = size*MAX_SIM_WINDOWS;
    float result[nblocks*MAX_SIM_WINDOWS];
    float vecData[vecSize];
    float svmData[vecSize];

    //Second phase init
    if(mInitedBuffer == false)
    {
        std::cout << "Initing buffers..." << std::endl;
        //Initialize host buffer. XXX: Memory leak
        mResult = new float[nblocks*MAX_SIM_WINDOWS];
        mVecData = new float[vecSize];
        mSVMData = new float[vecSize];
        std::cout << "Buffers allocated on host" << std::endl;

        mResultBuffer = cl::Buffer(mContext, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, sizeof(result));
        std::cout << "ResultBuffer created" << std::endl;
        mVecBuffer = cl::Buffer(mContext, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, sizeof(vecData),mVecData);
        std::cout << "SVMBUffer created" << std::endl;
        mSVMBuffer = cl::Buffer(mContext, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, sizeof(svmData),mSVMData);
        std::cout << "VecBuffer created" << std::endl;

        mSVMKernel.setArg(0, blockHistogramSize);
        std::cout << "hisgram size assigned" << std::endl;
        mSVMKernel.setArg(3,mResultBuffer);
        std::cout << "ResultBuffer assigned" << std::endl;
        mSVMKernel.setArg(1,mVecBuffer);
        std::cout << "VecBuffer assigned" << std::endl;
        mSVMKernel.setArg(2,mSVMBuffer);
        std::cout << "SVMBuffer assigned" << std::endl;

        //Load target SVM Vector
        for(int h=0;h<MAX_SIM_WINDOWS;h++)
        {
            float* svmVec = (float*)&svmDetector[0];
            for(int j = 0; j < nblocks; j++, svmVec += blockHistogramSize )
                memcpy(mSVMData+(h*size+j*blockHistogramSize), svmVec, sizeof(float)*blockHistogramSize);
        }
        mCommandQueue.enqueueWriteBuffer(mSVMBuffer, false, 0, sizeof(svmData),mSVMData);
        std::cout << "SVMBuffer written" << std::endl;

	    int err;
        //Map the buffer instead of writing it. This is faster on the C5SoC
	    mVecData = (float*)mCommandQueue.enqueueMapBuffer(mVecBuffer, false,CL_MAP_READ,0,sizeof(mVecData), NULL, NULL, &err);
	    std::cout << "MapBuffer done" << std::endl;
	    std::cout << "ptr = " << mVecData << ", err = " << err << std::endl;
        mInitedBuffer = true;
    }

    size_t i = 0;
    Point points[MAX_SIM_WINDOWS];
    for(;;)
    {
        size_t end = std::min(i+MAX_SIM_WINDOWS,nwindows);
        size_t computeNum = 0;
        if(i>=nwindows)
            break;
        memset(mResult,0,sizeof(result));

	//#pragma omp parallel for
        for(size_t h=i;h<end;h++)
        {
            Point pt0;
            if( !locations.empty() )
            {
                pt0 = locations[h];
                if( pt0.x < -padding.width || pt0.x > img.cols + padding.width - winSize.width ||
                        pt0.y < -padding.height || pt0.y > img.rows + padding.height - winSize.height )
                    continue;
            }
            else
            {
                pt0 = cache.getWindow(paddedImgSize, winStride, (int)h).tl() - Point(padding);
                CV_Assert(pt0.x % cacheStride.width == 0 && pt0.y % cacheStride.height == 0);
            }
            size_t index = 0;
            //#pragma omp critical
            {
                index = computeNum;
                points[computeNum] = pt0;
                computeNum++;
            }


            //FIXME: This is VERY slow.
            for(int j = 0; j < nblocks; j++)
            {
                const HOGCache::BlockData& bj = blockData[j];
                Point pt = pt0 + bj.imgOffset;

                float* vec = (float*)cache.getBlock(pt, &blockHist[0]);
                memcpy(mVecData+(index*size+j*blockHistogramSize), vec, sizeof(float)*blockHistogramSize);
            }

        }
        if(computeNum == 0)
            continue;
        //No need to write buffer now. :)
	    //mCommandQueue.enqueueWriteBuffer(mVecBuffer, false, 0, sizeof(vecData),mVecData);


	    mCommandQueue.enqueueNDRangeKernel(mSVMKernel,
	        cl::NullRange,cl::NDRange(nblocks*computeNum),cl::NDRange(computeNum));
	    mCommandQueue.enqueueReadBuffer(mResultBuffer, true,0
	        ,sizeof(result) ,mResult);
	    //mCommandQueue.finish();
	    std::cout << i << "\t" << end << std::endl;
        for(int h=0;h<computeNum;h++)
        {
            float s = rho;
            for(int j=0;j<nblocks;j++)
                s += mResult[h*nblocks+j];

            if( s >= hitThreshold )
            {
                hits.push_back(points[h]);
                weights.push_back(s);
            }
        }
        i += MAX_SIM_WINDOWS;
    }
    //auto time2 = high_resolution_clock::now();
    //auto time_span = duration_cast<duration<float>>(time2 - time1);
    //printf("Detect takes %f s.\n",time_span.count());
}
	//NOTE: The following source code is only for educational purpose. DO NOT use it for commerical purpose.
	//However. You can use it as a referenece to implement commerical code.

	//NOTE: Remember to serailize parallel_for_ sice the detect can and will be called in parallel
	//otherwise. That is bad for FPGA code. OpenCL buffers are shared between threads. We don't
	//want race condition.
	//Add these to the end of HOGDescriptor's declartion
	/*
	public:
	bool initCL();

	cl::Platform mPlatform;
	cl::Device mDevice;
	cl::Context mContext;
	cl::Program mProgram;
	cl::CommandQueue mCommandQueue;
	cl::Kernel mSVMKernel;
	cl::Buffer mSVMBuffer;
	cl::Buffer mVecBuffer;
	cl::Buffer mResultBuffer;

	bool mInitedBuffer;
	float* mResult;
	float* mVecData;
	float* mSVMData;

	*/
	#define ON_FPGA //Comment this line if running on GPU
	//Call thins functoin after creating the HOGDescriptor object
	bool HOGDescriptor::initCL()
	{
	std::vector<cl::Platform> platforms;
	cl::Platform::get(&platforms);
	if(platforms.size() == 0)
	{
	std::cout << "Error: no OpenCL platform found." << std::endl;
	return false;
	}
	std::cout << "Platform get!" << std::endl;
	std::cout << platforms.size() << " platforms found." << std::endl;
	mPlatform = platforms[0];

	std::vector<cl::Device> devices;
	mPlatform.getDevices(CL_DEVICE_TYPE_ALL,&devices);
	if(devices.size() == 0)
	{
	std::cout << "Error: no OpenCL device found." << std::endl;
	return false;
	}
	std::cout << devices.size() << " devieces found." << std::endl;
	mDevice = devices[0];
	std::cout << "Device get!" << std::endl;

	cl::Context context({mDevice});
	mContext = context;
	std::cout << "Context created!" << std::endl;

	#ifndef ON_FPGA
	std::ifstream sourceFile("svm.cl");
	if(sourceFile.good() == false)
	{
	std::cout << "Error: cannot load kernel source" << std::endl;
	return false;
	}

	cl::Program::Sources sources;
	std::string kernelSource((std::istreambuf_iterator<char>(sourceFile)),
	std::istreambuf_iterator<char>());

	sources.push_back({kernelSource.c_str(),kernelSource.length()});
	cl::Program program(mContext,sources);
	mProgram = program;
	if(mProgram.build({mDevice})!=CL_SUCCESS)
	{
	std::cout<<" Error building: "<<mProgram.getBuildInfo<CL_PROGRAM_BUILD_LOG>(mDevice)<<"\n";
	return false;
	}

	#else//On FPGA
	std::ifstream in("svm.aocx",std::ios::binary);//NOTE: Load this kernel using the aoc command first
	std::vector<unsigned char> binaryVal;
	if(in.good() == false)
	{
	std::cout << "Cannot open AOCX file" << std::endl;
	return false;
	}
	unsigned char b;
	while(!in.eof())
	{
	in.read((char*)&b,1);
	binaryVal.push_back(b);
	}

	typedef std::pair<void*,size_t> Binary;
	Binary binary;
	binary.first = (void*)&binaryVal[0];
	binary.second = binaryVal.size();
	//binaryProgram.push_back(binary);
	cl_int err;
	cl::Program program(mContext,{mDevice},{binary},nullptr,&err);
	if(err != 0)
	{
	std::cout << "Error loading binary program!!!!" << std::endl;
	return false;
	}
	mProgram = program;

	#endif
	std::cout << "Program loaded" << std::endl;
	cl::CommandQueue commandQueue(mContext,mDevice);
	mCommandQueue = commandQueue;
	std::cout << "CommandQueue created" << std::endl;

	cl::Kernel kernel(mProgram, "compute");
	mSVMKernel = kernel;
	std::cout << "Kernel created" << std::endl;

	mInitedBuffer = false;

	return true;
	}

	//Where the computing happens
	void HOGDescriptor::detect(const Mat& img,
	std::vector<Point>& hits, std::vector<double>& weights, double hitThreshold,
	Size winStride, Size padding, const std::vector<Point>& locations)
	{
	//auto time1 = high_resolution_clock::now();
	CV_INSTRUMENT_REGION()

	hits.clear();
	weights.clear();
	if( svmDetector.empty() )
	return;

	if( winStride == Size() )
	winStride = cellSize;
	Size cacheStride(gcd(winStride.width, blockStride.width),
	gcd(winStride.height, blockStride.height));

	size_t nwindows = locations.size();
	padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
	padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
	Size paddedImgSize(img.cols + padding.width2, img.rows + padding.height2);

	HOGCache cache(this, img, padding, padding, nwindows == 0, cacheStride);

	if( !nwindows )
	nwindows = cache.windowsInImage(paddedImgSize, winStride).area();

	const HOGCache::BlockData* blockData = &cache.blockData[0];

	int nblocks = cache.nblocks.area();
	int blockHistogramSize = cache.blockHistogramSize;
	size_t dsize = getDescriptorSize();

	double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
	std::vector<float> blockHist(blockHistogramSize);

	const int MAX_SIM_WINDOWS = 64;
	size_t size = blockHistogramSize*nblocks;
	size_t vecSize = size*MAX_SIM_WINDOWS;
	float result[nblocks*MAX_SIM_WINDOWS];
	float vecData[vecSize];
	float svmData[vecSize];

	//Second phase init
	if(mInitedBuffer == false)
	{
	std::cout << "Initing buffers..." << std::endl;
	//Initialize host buffer. XXX: Memory leak
	mResult = new float[nblocks*MAX_SIM_WINDOWS];
	mVecData = new float[vecSize];
	mSVMData = new float[vecSize];
	std::cout << "Buffers allocated on host" << std::endl;

	mResultBuffer = cl::Buffer(mContext, CL_MEM_WRITE_ONLY\|CL_MEM_ALLOC_HOST_PTR, sizeof(result));
	std::cout << "ResultBuffer created" << std::endl;
	mVecBuffer = cl::Buffer(mContext, CL_MEM_READ_ONLY\|CL_MEM_USE_HOST_PTR, sizeof(vecData),mVecData);
	std::cout << "SVMBUffer created" << std::endl;
	mSVMBuffer = cl::Buffer(mContext, CL_MEM_READ_ONLY\|CL_MEM_USE_HOST_PTR, sizeof(svmData),mSVMData);
	std::cout << "VecBuffer created" << std::endl;

	mSVMKernel.setArg(0, blockHistogramSize);
	std::cout << "hisgram size assigned" << std::endl;
	mSVMKernel.setArg(3,mResultBuffer);
	std::cout << "ResultBuffer assigned" << std::endl;
	mSVMKernel.setArg(1,mVecBuffer);
	std::cout << "VecBuffer assigned" << std::endl;
	mSVMKernel.setArg(2,mSVMBuffer);
	std::cout << "SVMBuffer assigned" << std::endl;

	//Load target SVM Vector
	for(int h=0;h<MAX_SIM_WINDOWS;h++)
	{
	float* svmVec = (float*)&svmDetector[0];
	for(int j = 0; j < nblocks; j++, svmVec += blockHistogramSize )
	memcpy(mSVMData+(hsize+jblockHistogramSize), svmVec, sizeof(float)*blockHistogramSize);
	}
	mCommandQueue.enqueueWriteBuffer(mSVMBuffer, false, 0, sizeof(svmData),mSVMData);
	std::cout << "SVMBuffer written" << std::endl;

	int err;
	//Map the buffer instead of writing it. This is faster on the C5SoC
	mVecData = (float*)mCommandQueue.enqueueMapBuffer(mVecBuffer, false,CL_MAP_READ,0,sizeof(mVecData), NULL, NULL, &err);
	std::cout << "MapBuffer done" << std::endl;
	std::cout << "ptr = " << mVecData << ", err = " << err << std::endl;
	mInitedBuffer = true;
	}

	size_t i = 0;
	Point points[MAX_SIM_WINDOWS];
	for(;;)
	{
	size_t end = std::min(i+MAX_SIM_WINDOWS,nwindows);
	size_t computeNum = 0;
	if(i>=nwindows)
	break;
	memset(mResult,0,sizeof(result));

	//#pragma omp parallel for
	for(size_t h=i;h<end;h++)
	{
	Point pt0;
	if( !locations.empty() )
	{
	pt0 = locations[h];
	if( pt0.x < -padding.width \|\| pt0.x > img.cols + padding.width - winSize.width \|\|
	pt0.y < -padding.height \|\| pt0.y > img.rows + padding.height - winSize.height )
	continue;
	}
	else
	{
	pt0 = cache.getWindow(paddedImgSize, winStride, (int)h).tl() - Point(padding);
	CV_Assert(pt0.x % cacheStride.width == 0 && pt0.y % cacheStride.height == 0);
	}
	size_t index = 0;
	//#pragma omp critical
	{
	index = computeNum;
	points[computeNum] = pt0;
	computeNum++;
	}


	//FIXME: This is VERY slow.
	for(int j = 0; j < nblocks; j++)
	{
	const HOGCache::BlockData& bj = blockData[j];
	Point pt = pt0 + bj.imgOffset;

	float* vec = (float*)cache.getBlock(pt, &blockHist[0]);
	memcpy(mVecData+(indexsize+jblockHistogramSize), vec, sizeof(float)*blockHistogramSize);
	}

	}
	if(computeNum == 0)
	continue;
	//No need to write buffer now. :)
	//mCommandQueue.enqueueWriteBuffer(mVecBuffer, false, 0, sizeof(vecData),mVecData);


	mCommandQueue.enqueueNDRangeKernel(mSVMKernel,
	cl::NullRange,cl::NDRange(nblocks*computeNum),cl::NDRange(computeNum));
	mCommandQueue.enqueueReadBuffer(mResultBuffer, true,0
	,sizeof(result) ,mResult);
	//mCommandQueue.finish();
	std::cout << i << "\t" << end << std::endl;
	for(int h=0;h<computeNum;h++)
	{
	float s = rho;
	for(int j=0;j<nblocks;j++)
	s += mResult[h*nblocks+j];

	if( s >= hitThreshold )
	{
	hits.push_back(points[h]);
	weights.push_back(s);
	}
	}
	i += MAX_SIM_WINDOWS;
	}
	//auto time2 = high_resolution_clock::now();
	//auto time_span = duration_cast<duration<float>>(time2 - time1);
	//printf("Detect takes %f s.\n",time_span.count());
	}