-
-
Save marty1885/75ca2e236cd5938146ec09f2a56eed0b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//NOTE: The following source code is only for educational purposes. DO NOT use it for commercial purposes.
//However, you can use it as a reference to implement commercial code.
//NOTE: Remember to serialize calls into detect() since it can and will be called in parallel
//otherwise. That is bad for FPGA code. OpenCL buffers are shared between threads. We don't
//want a race condition.
//Add these to the end of HOGDescriptor's declaration
/* | |
public: | |
bool initCL(); | |
cl::Platform mPlatform; | |
cl::Device mDevice; | |
cl::Context mContext; | |
cl::Program mProgram; | |
cl::CommandQueue mCommandQueue; | |
cl::Kernel mSVMKernel; | |
cl::Buffer mSVMBuffer; | |
cl::Buffer mVecBuffer; | |
cl::Buffer mResultBuffer; | |
bool mInitedBuffer; | |
float* mResult; | |
float* mVecData; | |
float* mSVMData; | |
*/ | |
#define ON_FPGA //Comment this line if running on GPU | |
//Call this function after creating the HOGDescriptor object
bool HOGDescriptor::initCL() | |
{ | |
std::vector<cl::Platform> platforms; | |
cl::Platform::get(&platforms); | |
if(platforms.size() == 0) | |
{ | |
std::cout << "Error: no OpenCL platform found." << std::endl; | |
return false; | |
} | |
std::cout << "Platform get!" << std::endl; | |
std::cout << platforms.size() << " platforms found." << std::endl; | |
mPlatform = platforms[0]; | |
std::vector<cl::Device> devices; | |
mPlatform.getDevices(CL_DEVICE_TYPE_ALL,&devices); | |
if(devices.size() == 0) | |
{ | |
std::cout << "Error: no OpenCL device found." << std::endl; | |
return false; | |
} | |
std::cout << devices.size() << " devieces found." << std::endl; | |
mDevice = devices[0]; | |
std::cout << "Device get!" << std::endl; | |
cl::Context context({mDevice}); | |
mContext = context; | |
std::cout << "Context created!" << std::endl; | |
#ifndef ON_FPGA | |
std::ifstream sourceFile("svm.cl"); | |
if(sourceFile.good() == false) | |
{ | |
std::cout << "Error: cannot load kernel source" << std::endl; | |
return false; | |
} | |
cl::Program::Sources sources; | |
std::string kernelSource((std::istreambuf_iterator<char>(sourceFile)), | |
std::istreambuf_iterator<char>()); | |
sources.push_back({kernelSource.c_str(),kernelSource.length()}); | |
cl::Program program(mContext,sources); | |
mProgram = program; | |
if(mProgram.build({mDevice})!=CL_SUCCESS) | |
{ | |
std::cout<<" Error building: "<<mProgram.getBuildInfo<CL_PROGRAM_BUILD_LOG>(mDevice)<<"\n"; | |
return false; | |
} | |
#else//On FPGA | |
std::ifstream in("svm.aocx",std::ios::binary);//NOTE: Load this kernel using the aoc command first | |
std::vector<unsigned char> binaryVal; | |
if(in.good() == false) | |
{ | |
std::cout << "Cannot open AOCX file" << std::endl; | |
return false; | |
} | |
unsigned char b; | |
while(!in.eof()) | |
{ | |
in.read((char*)&b,1); | |
binaryVal.push_back(b); | |
} | |
typedef std::pair<void*,size_t> Binary; | |
Binary binary; | |
binary.first = (void*)&binaryVal[0]; | |
binary.second = binaryVal.size(); | |
//binaryProgram.push_back(binary); | |
cl_int err; | |
cl::Program program(mContext,{mDevice},{binary},nullptr,&err); | |
if(err != 0) | |
{ | |
std::cout << "Error loading binary program!!!!" << std::endl; | |
return false; | |
} | |
mProgram = program; | |
#endif | |
std::cout << "Program loaded" << std::endl; | |
cl::CommandQueue commandQueue(mContext,mDevice); | |
mCommandQueue = commandQueue; | |
std::cout << "CommandQueue created" << std::endl; | |
cl::Kernel kernel(mProgram, "compute"); | |
mSVMKernel = kernel; | |
std::cout << "Kernel created" << std::endl; | |
mInitedBuffer = false; | |
return true; | |
} | |
//Where the computing happens | |
void HOGDescriptor::detect(const Mat& img, | |
std::vector<Point>& hits, std::vector<double>& weights, double hitThreshold, | |
Size winStride, Size padding, const std::vector<Point>& locations) | |
{ | |
//auto time1 = high_resolution_clock::now(); | |
CV_INSTRUMENT_REGION() | |
hits.clear(); | |
weights.clear(); | |
if( svmDetector.empty() ) | |
return; | |
if( winStride == Size() ) | |
winStride = cellSize; | |
Size cacheStride(gcd(winStride.width, blockStride.width), | |
gcd(winStride.height, blockStride.height)); | |
size_t nwindows = locations.size(); | |
padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width); | |
padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height); | |
Size paddedImgSize(img.cols + padding.width*2, img.rows + padding.height*2); | |
HOGCache cache(this, img, padding, padding, nwindows == 0, cacheStride); | |
if( !nwindows ) | |
nwindows = cache.windowsInImage(paddedImgSize, winStride).area(); | |
const HOGCache::BlockData* blockData = &cache.blockData[0]; | |
int nblocks = cache.nblocks.area(); | |
int blockHistogramSize = cache.blockHistogramSize; | |
size_t dsize = getDescriptorSize(); | |
double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0; | |
std::vector<float> blockHist(blockHistogramSize); | |
const int MAX_SIM_WINDOWS = 64; | |
size_t size = blockHistogramSize*nblocks; | |
size_t vecSize = size*MAX_SIM_WINDOWS; | |
float result[nblocks*MAX_SIM_WINDOWS]; | |
float vecData[vecSize]; | |
float svmData[vecSize]; | |
//Second phase init | |
if(mInitedBuffer == false) | |
{ | |
std::cout << "Initing buffers..." << std::endl; | |
//Initialize host buffer. XXX: Memory leak | |
mResult = new float[nblocks*MAX_SIM_WINDOWS]; | |
mVecData = new float[vecSize]; | |
mSVMData = new float[vecSize]; | |
std::cout << "Buffers allocated on host" << std::endl; | |
mResultBuffer = cl::Buffer(mContext, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, sizeof(result)); | |
std::cout << "ResultBuffer created" << std::endl; | |
mVecBuffer = cl::Buffer(mContext, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, sizeof(vecData),mVecData); | |
std::cout << "SVMBUffer created" << std::endl; | |
mSVMBuffer = cl::Buffer(mContext, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, sizeof(svmData),mSVMData); | |
std::cout << "VecBuffer created" << std::endl; | |
mSVMKernel.setArg(0, blockHistogramSize); | |
std::cout << "hisgram size assigned" << std::endl; | |
mSVMKernel.setArg(3,mResultBuffer); | |
std::cout << "ResultBuffer assigned" << std::endl; | |
mSVMKernel.setArg(1,mVecBuffer); | |
std::cout << "VecBuffer assigned" << std::endl; | |
mSVMKernel.setArg(2,mSVMBuffer); | |
std::cout << "SVMBuffer assigned" << std::endl; | |
//Load target SVM Vector | |
for(int h=0;h<MAX_SIM_WINDOWS;h++) | |
{ | |
float* svmVec = (float*)&svmDetector[0]; | |
for(int j = 0; j < nblocks; j++, svmVec += blockHistogramSize ) | |
memcpy(mSVMData+(h*size+j*blockHistogramSize), svmVec, sizeof(float)*blockHistogramSize); | |
} | |
mCommandQueue.enqueueWriteBuffer(mSVMBuffer, false, 0, sizeof(svmData),mSVMData); | |
std::cout << "SVMBuffer written" << std::endl; | |
int err; | |
//Map the buffer instead of writing it. This is faster on the C5SoC | |
mVecData = (float*)mCommandQueue.enqueueMapBuffer(mVecBuffer, false,CL_MAP_READ,0,sizeof(mVecData), NULL, NULL, &err); | |
std::cout << "MapBuffer done" << std::endl; | |
std::cout << "ptr = " << mVecData << ", err = " << err << std::endl; | |
mInitedBuffer = true; | |
} | |
size_t i = 0; | |
Point points[MAX_SIM_WINDOWS]; | |
for(;;) | |
{ | |
size_t end = std::min(i+MAX_SIM_WINDOWS,nwindows); | |
size_t computeNum = 0; | |
if(i>=nwindows) | |
break; | |
memset(mResult,0,sizeof(result)); | |
//#pragma omp parallel for | |
for(size_t h=i;h<end;h++) | |
{ | |
Point pt0; | |
if( !locations.empty() ) | |
{ | |
pt0 = locations[h]; | |
if( pt0.x < -padding.width || pt0.x > img.cols + padding.width - winSize.width || | |
pt0.y < -padding.height || pt0.y > img.rows + padding.height - winSize.height ) | |
continue; | |
} | |
else | |
{ | |
pt0 = cache.getWindow(paddedImgSize, winStride, (int)h).tl() - Point(padding); | |
CV_Assert(pt0.x % cacheStride.width == 0 && pt0.y % cacheStride.height == 0); | |
} | |
size_t index = 0; | |
//#pragma omp critical | |
{ | |
index = computeNum; | |
points[computeNum] = pt0; | |
computeNum++; | |
} | |
//FIXME: This is VERY slow. | |
for(int j = 0; j < nblocks; j++) | |
{ | |
const HOGCache::BlockData& bj = blockData[j]; | |
Point pt = pt0 + bj.imgOffset; | |
float* vec = (float*)cache.getBlock(pt, &blockHist[0]); | |
memcpy(mVecData+(index*size+j*blockHistogramSize), vec, sizeof(float)*blockHistogramSize); | |
} | |
} | |
if(computeNum == 0) | |
continue; | |
//No need to write buffer now. :) | |
//mCommandQueue.enqueueWriteBuffer(mVecBuffer, false, 0, sizeof(vecData),mVecData); | |
mCommandQueue.enqueueNDRangeKernel(mSVMKernel, | |
cl::NullRange,cl::NDRange(nblocks*computeNum),cl::NDRange(computeNum)); | |
mCommandQueue.enqueueReadBuffer(mResultBuffer, true,0 | |
,sizeof(result) ,mResult); | |
//mCommandQueue.finish(); | |
std::cout << i << "\t" << end << std::endl; | |
for(int h=0;h<computeNum;h++) | |
{ | |
float s = rho; | |
for(int j=0;j<nblocks;j++) | |
s += mResult[h*nblocks+j]; | |
if( s >= hitThreshold ) | |
{ | |
hits.push_back(points[h]); | |
weights.push_back(s); | |
} | |
} | |
i += MAX_SIM_WINDOWS; | |
} | |
//auto time2 = high_resolution_clock::now(); | |
//auto time_span = duration_cast<duration<float>>(time2 - time1); | |
//printf("Detect takes %f s.\n",time_span.count()); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment