Skip to content

Instantly share code, notes, and snippets.

@xiaoyongzhu
Last active July 4, 2018 05:06
Show Gist options
  • Save xiaoyongzhu/4ae8d9df580c84e33157ff8d68f9ce89 to your computer and use it in GitHub Desktop.
Save xiaoyongzhu/4ae8d9df580c84e33157ff8d68f9ce89 to your computer and use it in GitHub Desktop.
CPU implementation of multi_proposal for SNIPER (addresses issue https://github.com/mahyarnajibi/SNIPER/issues/28)
// For more details, please refer to this issue https://github.com/mahyarnajibi/SNIPER/issues/28
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* Copyright (c) 2018 University of Maryland, College Park
* Licensed under The Apache-2.0 License [see LICENSE for details]
* \file multi_proposal.cc
* \brief Proposal target layer
*/
#include "./multi_proposal-inl.h"
#include <set>
#include <math.h>
#include <unistd.h>
#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <mxnet/operator.h>
#include <mshadow/tensor.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include "./operator_common.h"
#include "./mshadow_op.h"
#include <time.h>
#include <stdlib.h>
#include <time.h>
//============================
// Bounding Box Transform Utils
//============================
namespace mxnet {
namespace op {
namespace utils {
// Apply predicted (dx, dy, dw, dh) regression deltas to each anchor box
// in place, then clip the result to the image bounds.
//
// boxes:   (num_images * anchors * heights * widths) rows of 5 floats
//          (x1, y1, x2, y2, score); corners are rewritten, score is kept.
// deltas:  per-image tensor, channel-major layout (anchor*4 + c, h, w).
// im_info: 3 floats per image; im_info[3*b] is used as the height bound
//          and im_info[3*b+1] as the width bound for clipping.
inline void BBoxTransformInv(float* boxes,
                             float* deltas,
                             float* im_info,
                             int num_images,
                             int anchors,
                             int heights,
                             int widths) {
  const int per_image = anchors * heights * widths;
  const int plane = heights * widths;
#pragma omp parallel for num_threads(8)
  for (int idx = 0; idx < num_images * per_image; ++idx) {
    const int img = idx / per_image;
    const int within = idx % per_image;
    const int anchor = within / plane;
    const int cell = within % plane;
    const int col = cell % widths;  // feature-map x
    const int row = cell / widths;  // feature-map y
    float* box = boxes + 5 * idx;
    // Decode the anchor into (center, size); +1.0 follows the
    // inclusive-pixel box convention used throughout this file.
    float width = box[2] - box[0] + 1.0;
    float height = box[3] - box[1] + 1.0;
    float ctr_x = box[0] + 0.5 * (width - 1.0);
    float ctr_y = box[1] + 0.5 * (height - 1.0);
    // The four delta channels for this anchor are plane-strided.
    const int base = img * per_image * 4 + anchor * 4 * plane + row * widths + col;
    float dx = deltas[base];
    float dy = deltas[base + plane];
    float dw = deltas[base + 2 * plane];
    float dh = deltas[base + 3 * plane];
    // Shift the center, scale the size (dw/dh are in log space).
    float pred_ctr_x = dx * width + ctr_x;
    float pred_ctr_y = dy * height + ctr_y;
    float pred_w = exp(dw) * width;
    float pred_h = exp(dh) * height;
    float x1 = pred_ctr_x - 0.5 * (pred_w - 1.0);
    float y1 = pred_ctr_y - 0.5 * (pred_h - 1.0);
    float x2 = pred_ctr_x + 0.5 * (pred_w - 1.0);
    float y2 = pred_ctr_y + 0.5 * (pred_h - 1.0);
    // Clip every corner to [0, side - 1].
    const float max_x = im_info[3 * img + 1] - 1.0f;
    const float max_y = im_info[3 * img] - 1.0f;
    box[0] = std::max(std::min(x1, max_x), 0.0f);
    box[1] = std::max(std::min(y1, max_y), 0.0f);
    box[2] = std::max(std::min(x2, max_x), 0.0f);
    box[3] = std::max(std::min(y2, max_y), 0.0f);
  }
}
// filter box by set confidence to zero
// * height or width < rpn_min_size
// Invalidate detections whose width or height is below min_size: the box is
// inflated by min_size/2 on every side and its score is set to -1 so that
// later stages (NMS) skip it.
inline void FilterBox(float *dets,
                      int num_dets, float min_size) {
#pragma omp parallel for num_threads(8)
  for (int box = 0; box < num_dets; ++box) {
    float *row = dets + 5 * box;
    const float box_w = row[2] - row[0] + 1.0f;
    const float box_h = row[3] - row[1] + 1.0f;
    if (box_w < min_size || box_h < min_size) {
      const float half = min_size / 2;
      row[0] -= half;
      row[1] -= half;
      row[2] += half;
      row[3] += half;
      row[4] = -1.0f;  // mark as suppressed
    }
  }
}
// Convert a (width, height, center) anchor description into inclusive corner
// coordinates and append them to *out_anchors as x1, y1, x2, y2.
inline void _MakeAnchor(float w,
                        float h,
                        float x_ctr,
                        float y_ctr,
                        std::vector<float> *out_anchors) {
  const float half_w = 0.5f * (w - 1.0f);
  const float half_h = 0.5f * (h - 1.0f);
  out_anchors->push_back(x_ctr - half_w);
  out_anchors->push_back(y_ctr - half_h);
  out_anchors->push_back(x_ctr + half_w);
  out_anchors->push_back(y_ctr + half_h);
}
// Produce one anchor centered on base_anchor whose area is redistributed to
// the requested aspect ratio (rounded to whole cells) and scaled by `scale`;
// the result is appended to *out_anchors via _MakeAnchor.
inline void _Transform(float scale,
                       float ratio,
                       const std::vector<float>& base_anchor,
                       std::vector<float> *out_anchors) {
  const float base_w = base_anchor[2] - base_anchor[0] + 1.0f;
  const float base_h = base_anchor[3] - base_anchor[1] + 1.0f;
  const float center_x = base_anchor[0] + 0.5 * (base_w - 1.0f);
  const float center_y = base_anchor[1] + 0.5 * (base_h - 1.0f);
  // floor(sqrt(area/ratio) + 0.5) rounds the new width to the nearest cell;
  // the height is derived from it so that w/h approximates 1/ratio.
  const float area = base_w * base_h;
  const float area_ratio = std::floor(area / ratio);
  const float scaled_w = std::floor(std::sqrt(area_ratio) + 0.5f) * scale;
  const float scaled_h = std::floor((scaled_w / scale * ratio) + 0.5f) * scale;
  _MakeAnchor(scaled_w, scaled_h, center_x, center_y, out_anchors);
}
// Enumerate every (ratio, scale) pair — ratios in the outer loop — and append
// one 4-float anchor (x1, y1, x2, y2) per pair to *out_anchors, i.e.
// out_anchors ends up with ratios.ndim() * scales.ndim() * 4 values.
inline void GenerateAnchors(const std::vector<float>& base_anchor,
                            const nnvm::Tuple<float>& ratios,
                            const nnvm::Tuple<float>& scales,
                            std::vector<float> *out_anchors) {
  const size_t num_ratios = ratios.ndim();
  const size_t num_scales = scales.ndim();
  for (size_t r = 0; r < num_ratios; ++r) {
    for (size_t s = 0; s < num_scales; ++s) {
      _Transform(scales[s], ratios[r], base_anchor, out_anchors);
    }
  }
}
// greedily keep the max detections (already sorted)
// Greedy per-image non-maximum suppression.
//
// dets holds 5-float rows (x1, y1, x2, y2, score) for every anchor of every
// image. For each image the candidates are sorted by descending score, then
// greedily kept; every lower-ranked candidate with IoU > 0.7 against a kept
// box is suppressed. Rows with score == -1 (e.g. marked by FilterBox) are
// skipped. At most post_nms_top_n global row indices are appended to
// final_keep_images[i] per image.
//
// Fixes vs. the original:
//  - indices were kept in a std::vector<float>, forcing float<->int
//    conversions in the sort comparator and losing exactness for indices
//    >= 2^24; they are now plain ints.
//  - raw new[]/delete[] buffers replaced with std::vector (RAII; no leak if
//    anything between allocation and delete throws).
inline void NonMaximumSuppression(float* dets,
                                  int post_nms_top_n,
                                  int num_images,
                                  int num_anchors,
                                  int width,
                                  int height,
                                  std::vector< std::vector<int> > & final_keep_images) {
  const int chip_anchors = num_anchors * width * height;
  const int total_anchors = num_images * chip_anchors;
  // SNIPER's RPN constants: hard IoU threshold (kept as a double so the
  // `ovr > 0.7` comparison is bit-identical to the original) and pre-NMS cap.
  const double kNmsThreshold = 0.7;
  const int kPreNmsLimit = 12000;
  // Precompute box areas (inclusive-pixel convention, hence the +1s).
  std::vector<float> area(total_anchors);
#pragma omp parallel for num_threads(8)
  for (int i = 0; i < total_anchors; ++i) {
    area[i] = (dets[5*i + 2] - dets[5*i + 0] + 1) * (dets[5*i + 3] - dets[5*i + 1] + 1);
  }
  const int max_nms = std::min(kPreNmsLimit, chip_anchors);
#pragma omp parallel for num_threads(8)
  for (int i = 0; i < num_images; i++) {
    // Rank this image's candidates by descending score.
    std::vector<int> sortids(chip_anchors);
    for (int j = 0; j < chip_anchors; j++) {
      sortids[j] = j;
    }
    const int chip_index = i * chip_anchors;
    std::sort(sortids.begin(), sortids.end(),
              [dets, chip_index](int i1, int i2) {
                return dets[5*(chip_index + i1) + 4] > dets[5*(chip_index + i2) + 4];
              });
    // Pack the top max_nms candidates as 6 floats each
    // (x1, y1, x2, y2, score, area) for cache-friendly access below.
    std::vector<float> dbuf(6 * max_nms);
    for (int j = 0; j < max_nms; j++) {
      const int index = chip_index + sortids[j];
      dbuf[6*j]   = dets[5*index];
      dbuf[6*j+1] = dets[5*index+1];
      dbuf[6*j+2] = dets[5*index+2];
      dbuf[6*j+3] = dets[5*index+3];
      dbuf[6*j+4] = dets[5*index+4];
      dbuf[6*j+5] = area[index];
    }
    int kept = 0;
    for (int j = 0; j < max_nms && kept < post_nms_top_n; j++) {
      if (dbuf[6*j+4] == -1) {  // suppressed earlier, or filtered out
        continue;
      }
      const int index = chip_index + sortids[j];
      const float ix1 = dbuf[6*j];
      const float iy1 = dbuf[6*j+1];
      const float ix2 = dbuf[6*j+2];
      const float iy2 = dbuf[6*j+3];
      const float iarea = dbuf[6*j+5];
      final_keep_images[i].push_back(index);
      kept = kept + 1;
      // Suppress every lower-ranked candidate overlapping this one too much.
      for (int pind = j + 1; pind < max_nms; pind++) {
        if (dbuf[6*pind + 4] == -1) {
          continue;
        }
        const float xx1 = std::max(ix1, dbuf[6*pind]);
        const float yy1 = std::max(iy1, dbuf[6*pind + 1]);
        const float xx2 = std::min(ix2, dbuf[6*pind + 2]);
        const float yy2 = std::min(iy2, dbuf[6*pind + 3]);
        const float w = std::max(0.0f, xx2 - xx1 + 1.0f);
        const float h = std::max(0.0f, yy2 - yy1 + 1.0f);
        const float inter = w * h;
        const float ovr = inter / (iarea + dbuf[6*pind+5] - inter);
        if (ovr > kNmsThreshold) {
          dbuf[6*pind + 4] = -1;  // mark suppressed
        }
      }
    }
  }
}
} // namespace utils
// CPU implementation of the MultiProposal (RPN proposal generation) operator
// for SNIPER. Despite the "GPUOp" name, this runs on the CPU (see the file
// header). Scratch buffers are allocated once for a worst case of 21 anchors
// on a 200x200 feature map and reused by every Forward() call.
template<typename xpu>
class MultiProposalGPUOp : public Operator {
 public:
  float *scores;       // copy of cls_prob input, [batch, 2*num_anchors, H, W]
  float *bbox_deltas;  // copy of bbox_pred input, [batch, 4*num_anchors, H, W]
  float *proposals;    // 5 floats per anchor: (x1, y1, x2, y2, score)
  float *im_info;      // 3 floats per image (used as height, width, scale)
  float *rois;         // output rows: (batch_idx, x1, y1, x2, y2)
  float *out_scores;   // one score per output RoI

  // Allocate worst-case scratch space once so Forward() performs no per-call
  // heap allocation for these buffers. Forward() CHECKs the actual input
  // against these limits instead of silently overflowing.
  explicit MultiProposalGPUOp(MultiProposalParam param) {
    this->param_ = param;
    int batch_size = param.batch_size;
    this->scores = new float[batch_size*21*2*200*200];
    this->bbox_deltas = new float[batch_size*21*4*200*200];
    this->proposals = new float[batch_size*21*5*200*200];
    this->im_info = new float[batch_size*3];
    this->rois = new float[param.rpn_post_nms_top_n * batch_size * 5];
    this->out_scores = new float[param.rpn_post_nms_top_n*batch_size];
  }

  // This class owns raw arrays freed in the destructor; an implicit copy
  // would double-free them (Rule of Three). Copying is never needed for an
  // operator instance, so forbid it.
  MultiProposalGPUOp(const MultiProposalGPUOp&) = delete;
  MultiProposalGPUOp& operator=(const MultiProposalGPUOp&) = delete;

  ~MultiProposalGPUOp() {
    delete [] this->scores;
    delete [] this->bbox_deltas;
    delete [] this->proposals;
    delete [] this->im_info;
    delete [] this->rois;
    delete [] this->out_scores;
  }

  // Generate RPN proposals.
  // in_data:  [cls_prob, bbox_pred, im_info]; out_data: [rois, scores].
  // Pipeline: shift base anchors over the feature map -> apply bbox deltas
  // and clip -> filter tiny boxes -> per-image NMS -> copy/pad the output.
  virtual void Forward(const OpContext &ctx,
                       const std::vector<TBlob> &in_data,
                       const std::vector<OpReqType> &req,
                       const std::vector<TBlob> &out_data,
                       const std::vector<TBlob> &aux_states) {
    CHECK_EQ(in_data.size(), 3);
    CHECK_EQ(out_data.size(), 2);
    using namespace mshadow;
    using namespace mshadow::expr;
    Stream<xpu> *s = ctx.get_stream<xpu>();
    Tensor<cpu, 4> tscores = in_data[proposal::kClsProb].get<cpu, 4, real_t>(s);
    Tensor<cpu, 4> tbbox_deltas = in_data[proposal::kBBoxPred].get<cpu, 4, real_t>(s);
    Tensor<cpu, 2> tim_info = in_data[proposal::kImInfo].get<cpu, 2, real_t>(s);
    int num_images = tbbox_deltas.size(0);
    int num_anchors = tbbox_deltas.size(1) / 4;
    int height = tbbox_deltas.size(2);
    int width = tbbox_deltas.size(3);
    // Anchors per image and in the whole batch.
    int count_anchors = num_anchors*height*width;
    int total_anchors = count_anchors * num_images;
    // Fail fast if the input exceeds the scratch space allocated in the
    // constructor — previously this overflowed the heap silently.
    CHECK_LE(num_images, param_.batch_size)
      << "batch is larger than the preallocated batch_size";
    CHECK_LE(count_anchors, 21 * 200 * 200)
      << "anchors-per-image exceeds the preallocated scratch size";
    memcpy(scores, tscores.dptr_, total_anchors*2*sizeof(float));
    memcpy(bbox_deltas, tbbox_deltas.dptr_, total_anchors*4*sizeof(float));
    memcpy(im_info, tim_info.dptr_, 3 * sizeof(float) * num_images);
    // Base anchor: the feature-stride-sized box at the origin.
    std::vector<float> base_anchor(4);
    base_anchor[0] = 0.0;
    base_anchor[1] = 0.0;
    base_anchor[2] = param_.feature_stride - 1.0;
    base_anchor[3] = param_.feature_stride - 1.0;
    std::vector<float> anchors;
    utils::GenerateAnchors(base_anchor,
                           param_.ratios,
                           param_.scales,
                           &anchors);
    // Shift each generated anchor across the feature map and attach its
    // foreground score (second half of the 2*num_anchors score channels).
#pragma omp parallel for num_threads(8)
    for (int t = 0; t < total_anchors; ++t) {
      int b = t / count_anchors;
      int index = t % count_anchors;
      int i = index / (height*width);
      // t % (h*w) == index % (h*w) since count_anchors is a multiple of h*w.
      int mat = t % (height*width);
      int k = mat % width;  // feature-map x
      int j = mat / width;  // feature-map y
      proposals[5*t] = anchors[4*i] + k * param_.feature_stride;
      proposals[5*t + 1] = anchors[4*i+1] + j * param_.feature_stride;
      proposals[5*t + 2] = anchors[4*i+2] + k * param_.feature_stride;
      proposals[5*t + 3] = anchors[4*i+3] + j * param_.feature_stride;
      proposals[5*t + 4] = scores[b*count_anchors*2 + ((num_anchors + i)*height + j)*width + k];
    }
    utils::BBoxTransformInv(proposals, bbox_deltas, im_info, num_images, num_anchors, height, width);
    utils::FilterBox(proposals, total_anchors, 3);
    std::vector< std::vector<int> > keep_images(num_images);
    int rpn_post_nms_top_n = param_.rpn_post_nms_top_n;
    utils::NonMaximumSuppression(proposals, rpn_post_nms_top_n, num_images, num_anchors, width, height, keep_images);
    // Copy kept proposals into the fixed-size output; pad the remainder with
    // random dummy boxes (score 0) so downstream shapes stay constant.
#pragma omp parallel for num_threads(8)
    for (int i = 0; i < num_images; i++) {
      int numpropsi = keep_images[i].size();
      for (int j = 0; j < numpropsi; j++) {
        int base = (i*rpn_post_nms_top_n + j);
        rois[5*base] = i;
        rois[5*base+1] = proposals[5*keep_images[i][j] + 0];
        rois[5*base+2] = proposals[5*keep_images[i][j] + 1];
        rois[5*base+3] = proposals[5*keep_images[i][j] + 2];
        rois[5*base+4] = proposals[5*keep_images[i][j] + 3];
        out_scores[base] = proposals[5*keep_images[i][j] + 4];
      }
      for (int j = numpropsi; j < rpn_post_nms_top_n; j++) {
        int base = (i*rpn_post_nms_top_n + j);
        rois[5*base+0] = i;
        // NOTE(review): rand() is not thread-safe and this loop runs under
        // OpenMP; the padding boxes are dummies so any value works, but a
        // per-thread RNG would be cleaner — confirm before tightening.
        rois[5*base+1] = rand() % 100;
        rois[5*base+2] = rand() % 100;
        rois[5*base+3] = 200 + rand() % 200;
        rois[5*base+4] = 200 + rand() % 200;
        out_scores[base] = 0.0;
      }
    }
    Tensor<cpu, 1> oscores = out_data[proposal::kScores].get<cpu, 1, real_t>(s);
    Tensor<cpu, 2> orois = out_data[proposal::kRoIs].get<cpu, 2, real_t>(s);
    memcpy(orois.dptr_, rois, 5*sizeof(float) * num_images * rpn_post_nms_top_n);
    memcpy(oscores.dptr_, out_scores, sizeof(float) * num_images * rpn_post_nms_top_n);
  }

  // Proposal generation is not differentiable; zero all input gradients
  // (cannot assume they are already zero).
  virtual void Backward(const OpContext &ctx,
                        const std::vector<TBlob> &out_grad,
                        const std::vector<TBlob> &in_data,
                        const std::vector<TBlob> &out_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &in_grad,
                        const std::vector<TBlob> &aux_states) {
    using namespace mshadow;
    using namespace mshadow::expr;
    CHECK_EQ(in_grad.size(), 4);
    Stream<xpu> *s = ctx.get_stream<xpu>();
    Tensor<xpu, 4> gscores = in_grad[proposal::kClsProb].get<xpu, 4, real_t>(s);
    Tensor<xpu, 4> gbbox = in_grad[proposal::kBBoxPred].get<xpu, 4, real_t>(s);
    Tensor<xpu, 2> ginfo = in_grad[proposal::kImInfo].get<xpu, 2, real_t>(s);
    Assign(gscores, req[proposal::kClsProb], 0);
    Assign(gbbox, req[proposal::kBBoxPred], 0);
    Assign(ginfo, req[proposal::kImInfo], 0);
  }

 private:
  MultiProposalParam param_;
};  // class MultiProposalGPUOp
template<>
Operator *CreateOp<cpu>(MultiProposalParam param) {
// Despite the class name "GPUOp", this instantiation is the CPU
// implementation (see the file header / SNIPER issue #28).
return new MultiProposalGPUOp<cpu>(param);
}
// Dispatch operator creation for the requested context via CreateOp
// (only the CPU specialization is defined in this file).
Operator* MultiProposalProp::CreateOperator(Context ctx) const {
DO_BIND_DISPATCH(CreateOp, param_);
}
// Register the parameter struct and expose the operator to MXNet under the
// name "MultiProposal" with its three inputs documented below.
DMLC_REGISTER_PARAMETER(MultiProposalParam);
MXNET_REGISTER_OP_PROPERTY(MultiProposal, MultiProposalProp)
.describe("Generate region proposals via RPN")
.add_argument("cls_prob", "NDArray-or-Symbol", "Score of how likely proposal is object.")
.add_argument("bbox_pred", "NDArray-or-Symbol", "BBox Predicted deltas from anchors for proposals")
.add_argument("im_info", "NDArray-or-Symbol", "Image size and scale.")
.add_arguments(MultiProposalParam::__FIELDS__());
} // namespace op
} // namespace mxnet
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment