Skip to content

Instantly share code, notes, and snippets.

@marty1885
Created June 2, 2017 04:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save marty1885/75ca2e236cd5938146ec09f2a56eed0b to your computer and use it in GitHub Desktop.
Save marty1885/75ca2e236cd5938146ec09f2a56eed0b to your computer and use it in GitHub Desktop.
//NOTE: The following source code is only for educational purpose. DO NOT use it for commerical purpose.
//However. You can use it as a referenece to implement commerical code.
//NOTE: Remember to serailize parallel_for_ sice the detect can and will be called in parallel
//otherwise. That is bad for FPGA code. OpenCL buffers are shared between threads. We don't
//want race condition.
//Add these to the end of HOGDescriptor's declartion
/*
public:
bool initCL();
cl::Platform mPlatform;
cl::Device mDevice;
cl::Context mContext;
cl::Program mProgram;
cl::CommandQueue mCommandQueue;
cl::Kernel mSVMKernel;
cl::Buffer mSVMBuffer;
cl::Buffer mVecBuffer;
cl::Buffer mResultBuffer;
bool mInitedBuffer;
float* mResult;
float* mVecData;
float* mSVMData;
*/
#define ON_FPGA //Comment this line if running on GPU
//Call thins functoin after creating the HOGDescriptor object
bool HOGDescriptor::initCL()
{
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if(platforms.size() == 0)
{
std::cout << "Error: no OpenCL platform found." << std::endl;
return false;
}
std::cout << "Platform get!" << std::endl;
std::cout << platforms.size() << " platforms found." << std::endl;
mPlatform = platforms[0];
std::vector<cl::Device> devices;
mPlatform.getDevices(CL_DEVICE_TYPE_ALL,&devices);
if(devices.size() == 0)
{
std::cout << "Error: no OpenCL device found." << std::endl;
return false;
}
std::cout << devices.size() << " devieces found." << std::endl;
mDevice = devices[0];
std::cout << "Device get!" << std::endl;
cl::Context context({mDevice});
mContext = context;
std::cout << "Context created!" << std::endl;
#ifndef ON_FPGA
std::ifstream sourceFile("svm.cl");
if(sourceFile.good() == false)
{
std::cout << "Error: cannot load kernel source" << std::endl;
return false;
}
cl::Program::Sources sources;
std::string kernelSource((std::istreambuf_iterator<char>(sourceFile)),
std::istreambuf_iterator<char>());
sources.push_back({kernelSource.c_str(),kernelSource.length()});
cl::Program program(mContext,sources);
mProgram = program;
if(mProgram.build({mDevice})!=CL_SUCCESS)
{
std::cout<<" Error building: "<<mProgram.getBuildInfo<CL_PROGRAM_BUILD_LOG>(mDevice)<<"\n";
return false;
}
#else//On FPGA
std::ifstream in("svm.aocx",std::ios::binary);//NOTE: Load this kernel using the aoc command first
std::vector<unsigned char> binaryVal;
if(in.good() == false)
{
std::cout << "Cannot open AOCX file" << std::endl;
return false;
}
unsigned char b;
while(!in.eof())
{
in.read((char*)&b,1);
binaryVal.push_back(b);
}
typedef std::pair<void*,size_t> Binary;
Binary binary;
binary.first = (void*)&binaryVal[0];
binary.second = binaryVal.size();
//binaryProgram.push_back(binary);
cl_int err;
cl::Program program(mContext,{mDevice},{binary},nullptr,&err);
if(err != 0)
{
std::cout << "Error loading binary program!!!!" << std::endl;
return false;
}
mProgram = program;
#endif
std::cout << "Program loaded" << std::endl;
cl::CommandQueue commandQueue(mContext,mDevice);
mCommandQueue = commandQueue;
std::cout << "CommandQueue created" << std::endl;
cl::Kernel kernel(mProgram, "compute");
mSVMKernel = kernel;
std::cout << "Kernel created" << std::endl;
mInitedBuffer = false;
return true;
}
//Where the computing happens
void HOGDescriptor::detect(const Mat& img,
std::vector<Point>& hits, std::vector<double>& weights, double hitThreshold,
Size winStride, Size padding, const std::vector<Point>& locations)
{
//auto time1 = high_resolution_clock::now();
CV_INSTRUMENT_REGION()
hits.clear();
weights.clear();
if( svmDetector.empty() )
return;
if( winStride == Size() )
winStride = cellSize;
Size cacheStride(gcd(winStride.width, blockStride.width),
gcd(winStride.height, blockStride.height));
size_t nwindows = locations.size();
padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
Size paddedImgSize(img.cols + padding.width*2, img.rows + padding.height*2);
HOGCache cache(this, img, padding, padding, nwindows == 0, cacheStride);
if( !nwindows )
nwindows = cache.windowsInImage(paddedImgSize, winStride).area();
const HOGCache::BlockData* blockData = &cache.blockData[0];
int nblocks = cache.nblocks.area();
int blockHistogramSize = cache.blockHistogramSize;
size_t dsize = getDescriptorSize();
double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
std::vector<float> blockHist(blockHistogramSize);
const int MAX_SIM_WINDOWS = 64;
size_t size = blockHistogramSize*nblocks;
size_t vecSize = size*MAX_SIM_WINDOWS;
float result[nblocks*MAX_SIM_WINDOWS];
float vecData[vecSize];
float svmData[vecSize];
//Second phase init
if(mInitedBuffer == false)
{
std::cout << "Initing buffers..." << std::endl;
//Initialize host buffer. XXX: Memory leak
mResult = new float[nblocks*MAX_SIM_WINDOWS];
mVecData = new float[vecSize];
mSVMData = new float[vecSize];
std::cout << "Buffers allocated on host" << std::endl;
mResultBuffer = cl::Buffer(mContext, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, sizeof(result));
std::cout << "ResultBuffer created" << std::endl;
mVecBuffer = cl::Buffer(mContext, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, sizeof(vecData),mVecData);
std::cout << "SVMBUffer created" << std::endl;
mSVMBuffer = cl::Buffer(mContext, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, sizeof(svmData),mSVMData);
std::cout << "VecBuffer created" << std::endl;
mSVMKernel.setArg(0, blockHistogramSize);
std::cout << "hisgram size assigned" << std::endl;
mSVMKernel.setArg(3,mResultBuffer);
std::cout << "ResultBuffer assigned" << std::endl;
mSVMKernel.setArg(1,mVecBuffer);
std::cout << "VecBuffer assigned" << std::endl;
mSVMKernel.setArg(2,mSVMBuffer);
std::cout << "SVMBuffer assigned" << std::endl;
//Load target SVM Vector
for(int h=0;h<MAX_SIM_WINDOWS;h++)
{
float* svmVec = (float*)&svmDetector[0];
for(int j = 0; j < nblocks; j++, svmVec += blockHistogramSize )
memcpy(mSVMData+(h*size+j*blockHistogramSize), svmVec, sizeof(float)*blockHistogramSize);
}
mCommandQueue.enqueueWriteBuffer(mSVMBuffer, false, 0, sizeof(svmData),mSVMData);
std::cout << "SVMBuffer written" << std::endl;
int err;
//Map the buffer instead of writing it. This is faster on the C5SoC
mVecData = (float*)mCommandQueue.enqueueMapBuffer(mVecBuffer, false,CL_MAP_READ,0,sizeof(mVecData), NULL, NULL, &err);
std::cout << "MapBuffer done" << std::endl;
std::cout << "ptr = " << mVecData << ", err = " << err << std::endl;
mInitedBuffer = true;
}
size_t i = 0;
Point points[MAX_SIM_WINDOWS];
for(;;)
{
size_t end = std::min(i+MAX_SIM_WINDOWS,nwindows);
size_t computeNum = 0;
if(i>=nwindows)
break;
memset(mResult,0,sizeof(result));
//#pragma omp parallel for
for(size_t h=i;h<end;h++)
{
Point pt0;
if( !locations.empty() )
{
pt0 = locations[h];
if( pt0.x < -padding.width || pt0.x > img.cols + padding.width - winSize.width ||
pt0.y < -padding.height || pt0.y > img.rows + padding.height - winSize.height )
continue;
}
else
{
pt0 = cache.getWindow(paddedImgSize, winStride, (int)h).tl() - Point(padding);
CV_Assert(pt0.x % cacheStride.width == 0 && pt0.y % cacheStride.height == 0);
}
size_t index = 0;
//#pragma omp critical
{
index = computeNum;
points[computeNum] = pt0;
computeNum++;
}
//FIXME: This is VERY slow.
for(int j = 0; j < nblocks; j++)
{
const HOGCache::BlockData& bj = blockData[j];
Point pt = pt0 + bj.imgOffset;
float* vec = (float*)cache.getBlock(pt, &blockHist[0]);
memcpy(mVecData+(index*size+j*blockHistogramSize), vec, sizeof(float)*blockHistogramSize);
}
}
if(computeNum == 0)
continue;
//No need to write buffer now. :)
//mCommandQueue.enqueueWriteBuffer(mVecBuffer, false, 0, sizeof(vecData),mVecData);
mCommandQueue.enqueueNDRangeKernel(mSVMKernel,
cl::NullRange,cl::NDRange(nblocks*computeNum),cl::NDRange(computeNum));
mCommandQueue.enqueueReadBuffer(mResultBuffer, true,0
,sizeof(result) ,mResult);
//mCommandQueue.finish();
std::cout << i << "\t" << end << std::endl;
for(int h=0;h<computeNum;h++)
{
float s = rho;
for(int j=0;j<nblocks;j++)
s += mResult[h*nblocks+j];
if( s >= hitThreshold )
{
hits.push_back(points[h]);
weights.push_back(s);
}
}
i += MAX_SIM_WINDOWS;
}
//auto time2 = high_resolution_clock::now();
//auto time_span = duration_cast<duration<float>>(time2 - time1);
//printf("Detect takes %f s.\n",time_span.count());
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment