shimat/AKAZEFeatures.cpp.diff

## AKAZEFeatures.cpp.diff
diff --git a/modules/features2d/src/kaze/AKAZEFeatures.cpp b/modules/features2d/src/kaze/AKAZEFeatures.cpp
index 77a68a5..024a5ca 100644
--- a/modules/features2d/src/kaze/AKAZEFeatures.cpp
+++ b/modules/features2d/src/kaze/AKAZEFeatures.cpp
@@ -11,9 +11,14 @@
 #include "fed.h"
 #include "nldiffusion_functions.h"
 #include "utils.h"
+#include "opencl_kernels_features2d.hpp"

 #include <iostream>

+#ifdef HAVE_OPENCL // OpenCL is not well supported
+#undef HAVE_OPENCL
+#endif
+
 // Namespaces
 namespace cv
 {
@@ -43,10 +48,20 @@ AKAZEFeatures::AKAZEFeatures(const AKAZEOptions& options) : options_(options) {
  * @brief This method allocates the memory for the nonlinear diffusion evolution
  */
 void AKAZEFeatures::Allocate_Memory_Evolution(void) {
+  CV_INSTRUMENT_REGION()

   float rfactor = 0.0f;
   int level_height = 0, level_width = 0;

+  // maximum size of the area for the descriptor computation
+  float smax = 0.0;
+  if (options_.descriptor == AKAZE::DESCRIPTOR_MLDB_UPRIGHT || options_.descriptor == AKAZE::DESCRIPTOR_MLDB) {
+    smax = 10.0f*sqrtf(2.0f);
+  }
+  else if (options_.descriptor == AKAZE::DESCRIPTOR_KAZE_UPRIGHT || options_.descriptor == AKAZE::DESCRIPTOR_KAZE) {
+    smax = 12.0f*sqrtf(2.0f);
+  }
+
   // Allocate the dimension of the matrices for the evolution
   for (int i = 0, power = 1; i <= options_.omax - 1; i++, power *= 2) {
     rfactor = 1.0f / power;
@@ -60,20 +75,16 @@ void AKAZEFeatures::Allocate_Memory_Evolution(void) {
     }

     for (int j = 0; j < options_.nsublevels; j++) {
-      TEvolution step;
-      step.Lx = Mat::zeros(level_height, level_width, CV_32F);
-      step.Ly = Mat::zeros(level_height, level_width, CV_32F);
-      step.Lxx = Mat::zeros(level_height, level_width, CV_32F);
-      step.Lxy = Mat::zeros(level_height, level_width, CV_32F);
-      step.Lyy = Mat::zeros(level_height, level_width, CV_32F);
-      step.Lt = Mat::zeros(level_height, level_width, CV_32F);
-      step.Ldet = Mat::zeros(level_height, level_width, CV_32F);
-      step.Lsmooth = Mat::zeros(level_height, level_width, CV_32F);
+      Evolution step;
+      step.size = Size(level_width, level_height);
       step.esigma = options_.soffset*pow(2.f, (float)(j) / (float)(options_.nsublevels) + i);
-      step.sigma_size = fRound(step.esigma);
-      step.etime = 0.5f*(step.esigma*step.esigma);
+      step.sigma_size = cvRound(step.esigma * options_.derivative_factor / power);  // In fact sigma_size only depends on j
+      step.etime = 0.5f * (step.esigma * step.esigma);
       step.octave = i;
       step.sublevel = j;
+      step.octave_ratio = (float)power;
+      step.border = cvRound(smax * step.sigma_size) + 1;
+
       evolution_.push_back(step);
     }
   }
@@ -93,363 +104,758 @@ void AKAZEFeatures::Allocate_Memory_Evolution(void) {

 /* ************************************************************************* */
 /**
+ * @brief Computes kernel size for Gaussian smoothing if the image
+ * @param sigma Kernel standard deviation
+ * @returns kernel size
+ */
+static inline int getGaussianKernelSize(float sigma) {
+  // Compute an appropriate kernel size according to the specified sigma
+  int ksize = (int)cvCeil(2.0f*(1.0f + (sigma - 0.8f) / (0.3f)));
+  ksize |= 1; // kernel should be odd
+  return ksize;
+}
+
+/* ************************************************************************* */
+/**
+* @brief This function computes a scalar non-linear diffusion step
+* @param Lt Base image in the evolution
+* @param Lf Conductivity image
+* @param Lstep Output image that gives the difference between the current
+* Ld and the next Ld being evolved
+* @param row_begin row where to start
+* @param row_end last row to fill exclusive. the range is [row_begin, row_end).
+* @note Forward Euler Scheme 3x3 stencil
+* The function c is a scalar value that depends on the gradient norm
+* dL_by_ds = d(c dL_by_dx)_by_dx + d(c dL_by_dy)_by_dy
+*/
+static inline void
+nld_step_scalar_one_lane(const Mat& Lt, const Mat& Lf, Mat& Lstep, float step_size, int row_begin, int row_end)
+{
+  CV_INSTRUMENT_REGION()
+  /* The labeling scheme for this five star stencil:
+   [    a    ]
+   [ -1 c +1 ]
+   [    b    ]
+   */
+
+  Lstep.create(Lt.size(), Lt.type());
+  const int cols = Lt.cols - 2;
+  int row = row_begin;
+
+  const float *lt_a, *lt_c, *lt_b;
+  const float *lf_a, *lf_c, *lf_b;
+  float *dst;
+  float step_r = 0.f;
+
+  // Process the top row
+  if (row == 0) {
+    lt_c = Lt.ptr<float>(0) + 1;  /* Skip the left-most column by +1 */
+    lf_c = Lf.ptr<float>(0) + 1;
+    lt_b = Lt.ptr<float>(1) + 1;
+    lf_b = Lf.ptr<float>(1) + 1;
+
+    // fill the corner to prevent uninitialized values
+    dst = Lstep.ptr<float>(0);
+    dst[0] = 0.0f;
+    ++dst;
+
+    for (int j = 0; j < cols; j++) {
+      step_r = (lf_c[j] + lf_c[j + 1])*(lt_c[j + 1] - lt_c[j]) +
+               (lf_c[j] + lf_c[j - 1])*(lt_c[j - 1] - lt_c[j]) +
+               (lf_c[j] + lf_b[j    ])*(lt_b[j    ] - lt_c[j]);
+      dst[j] = step_r * step_size;
+    }
+
+    // fill the corner to prevent uninitialized values
+    dst[cols] = 0.0f;
+    ++row;
+  }
+
+  // Process the middle rows
+  int middle_end = std::min(Lt.rows - 1, row_end);
+  for (; row < middle_end; ++row)
+  {
+    lt_a = Lt.ptr<float>(row - 1);
+    lf_a = Lf.ptr<float>(row - 1);
+    lt_c = Lt.ptr<float>(row    );
+    lf_c = Lf.ptr<float>(row    );
+    lt_b = Lt.ptr<float>(row + 1);
+    lf_b = Lf.ptr<float>(row + 1);
+    dst = Lstep.ptr<float>(row);
+
+    // The left-most column
+    step_r = (lf_c[0] + lf_c[1])*(lt_c[1] - lt_c[0]) +
+             (lf_c[0] + lf_b[0])*(lt_b[0] - lt_c[0]) +
+             (lf_c[0] + lf_a[0])*(lt_a[0] - lt_c[0]);
+    dst[0] = step_r * step_size;
+
+    lt_a++; lt_c++; lt_b++;
+    lf_a++; lf_c++; lf_b++;
+    dst++;
+
+    // The middle columns
+    for (int j = 0; j < cols; j++)
+    {
+      step_r = (lf_c[j] + lf_c[j + 1])*(lt_c[j + 1] - lt_c[j]) +
+               (lf_c[j] + lf_c[j - 1])*(lt_c[j - 1] - lt_c[j]) +
+               (lf_c[j] + lf_b[j    ])*(lt_b[j    ] - lt_c[j]) +
+               (lf_c[j] + lf_a[j    ])*(lt_a[j    ] - lt_c[j]);
+      dst[j] = step_r * step_size;
+    }
+
+    // The right-most column
+    step_r = (lf_c[cols] + lf_c[cols - 1])*(lt_c[cols - 1] - lt_c[cols]) +
+             (lf_c[cols] + lf_b[cols    ])*(lt_b[cols    ] - lt_c[cols]) +
+             (lf_c[cols] + lf_a[cols    ])*(lt_a[cols    ] - lt_c[cols]);
+    dst[cols] = step_r * step_size;
+  }
+
+  // Process the bottom row (row == Lt.rows - 1)
+  if (row_end == Lt.rows) {
+    lt_a = Lt.ptr<float>(row - 1) + 1;  /* Skip the left-most column by +1 */
+    lf_a = Lf.ptr<float>(row - 1) + 1;
+    lt_c = Lt.ptr<float>(row    ) + 1;
+    lf_c = Lf.ptr<float>(row    ) + 1;
+
+    // fill the corner to prevent uninitialized values
+    dst = Lstep.ptr<float>(row);
+    dst[0] = 0.0f;
+    ++dst;
+
+    for (int j = 0; j < cols; j++) {
+      step_r = (lf_c[j] + lf_c[j + 1])*(lt_c[j + 1] - lt_c[j]) +
+               (lf_c[j] + lf_c[j - 1])*(lt_c[j - 1] - lt_c[j]) +
+               (lf_c[j] + lf_a[j    ])*(lt_a[j    ] - lt_c[j]);
+      dst[j] = step_r * step_size;
+    }
+
+    // fill the corner to prevent uninitialized values
+    dst[cols] = 0.0f;
+  }
+}
+
+class NonLinearScalarDiffusionStep : public ParallelLoopBody
+{
+public:
+  NonLinearScalarDiffusionStep(const Mat& Lt, const Mat& Lf, Mat& Lstep, float step_size)
+    : Lt_(&Lt), Lf_(&Lf), Lstep_(&Lstep), step_size_(step_size)
+  {}
+
+  void operator()(const Range& range) const
+  {
+    nld_step_scalar_one_lane(*Lt_, *Lf_, *Lstep_, step_size_, range.start, range.end);
+  }
+
+private:
+  const Mat* Lt_;
+  const Mat* Lf_;
+  Mat* Lstep_;
+  float step_size_;
+};
+
+#ifdef HAVE_OPENCL
+static inline bool
+ocl_non_linear_diffusion_step(InputArray Lt_, InputArray Lf_, OutputArray Lstep_, float step_size)
+{
+    if (!Lt_.isContinuous())
+        return false;
+
+    UMat Lt = Lt_.getUMat(), Lf = Lf_.getUMat(), Lstep = Lstep_.getUMat();
+
+    size_t globalSize[] = {(size_t)Lt.cols, (size_t)Lt.rows};
+
+    ocl::Kernel ker("AKAZE_nld_step_scalar", ocl::features2d::akaze_oclsrc);
+    if (ker.empty())
+        return false;
+
+    return ker.args(
+            ocl::KernelArg::ReadOnly(Lt),
+            ocl::KernelArg::PtrReadOnly(Lf),
+            ocl::KernelArg::PtrWriteOnly(Lstep),
+            step_size)
+    .run(2, globalSize, 0, true);
+}
+#endif // HAVE_OPENCL
+
+static inline void
+non_linear_diffusion_step(InputArray Lt, InputArray Lf, OutputArray Lstep, float step_size)
+{
+  CV_INSTRUMENT_REGION()
+
+  Lstep.create(Lt.size(), Lt.type());
+
+#ifdef HAVE_OPENCL
+  CV_OCL_RUN(OCL_PERFORMANCE_CHECK(Lstep.isUMat()), ocl_non_linear_diffusion_step(Lt, Lf, Lstep, step_size));
+#endif
+
+  Mat Mstep = Lstep.getMat();
+  parallel_for_(Range(0, Lt.rows()), NonLinearScalarDiffusionStep(Lt.getMat(), Lf.getMat(), Mstep, step_size));
+}
+
+/**
+ * @brief This function computes a good empirical value for the k contrast factor
+ * given two gradient images, the percentile (0-1), the temporal storage to hold
+ * gradient norms and the histogram bins
+ * @param Lx Horizontal gradient of the input image
+ * @param Ly Vertical gradient of the input image
+ * @param nbins Number of histogram bins
+ * @return k contrast factor
+ */
+static inline float
+compute_kcontrast(const cv::Mat& Lx, const cv::Mat& Ly, float perc, int nbins)
+{
+  CV_INSTRUMENT_REGION()
+
+  CV_Assert(nbins > 2);
+  CV_Assert(!Lx.empty());
+
+  // temporary square roots of dot product
+  Mat modgs (Lx.rows - 2, Lx.cols - 2, CV_32F);
+  const int total = modgs.cols * modgs.rows;
+  float *modg = modgs.ptr<float>();
+  float hmax = 0.0f;
+
+  for (int i = 1; i < Lx.rows - 1; i++) {
+    const float *lx = Lx.ptr<float>(i) + 1;
+    const float *ly = Ly.ptr<float>(i) + 1;
+    const int cols = Lx.cols - 2;
+
+    for (int j = 0; j < cols; j++) {
+      float dist = sqrtf(lx[j] * lx[j] + ly[j] * ly[j]);
+      *modg++ = dist;
+      hmax = std::max(hmax, dist);
+    }
+  }
+  modg = modgs.ptr<float>();
+
+  if (hmax == 0.0f)
+    return 0.03f;  // e.g. a blank image
+
+  // Compute the bin numbers: the value range [0, hmax] -> [0, nbins-1]
+  modgs *= (nbins - 1) / hmax;
+
+  // Count up histogram
+  std::vector<int> hist(nbins, 0);
+  for (int i = 0; i < total; i++)
+    hist[(int)modg[i]]++;
+
+  // Now find the perc of the histogram percentile
+  const int nthreshold = (int)((total - hist[0]) * perc);  // Exclude hist[0] as background
+  int nelements = 0;
+  for (int k = 1; k < nbins; k++) {
+    if (nelements >= nthreshold)
+        return (float)hmax * k / nbins;
+
+    nelements += hist[k];
+  }
+
+  return 0.03f;
+}
+
+#ifdef HAVE_OPENCL
+static inline bool
+ocl_pm_g2(InputArray Lx_, InputArray Ly_, OutputArray Lflow_, float kcontrast)
+{
+    UMat Lx = Lx_.getUMat(), Ly = Ly_.getUMat(), Lflow = Lflow_.getUMat();
+
+    int total = Lx.rows * Lx.cols;
+    size_t globalSize[] = {(size_t)total};
+
+    ocl::Kernel ker("AKAZE_pm_g2", ocl::features2d::akaze_oclsrc);
+    if (ker.empty())
+        return false;
+
+    return ker.args(
+            ocl::KernelArg::PtrReadOnly(Lx),
+            ocl::KernelArg::PtrReadOnly(Ly),
+            ocl::KernelArg::PtrWriteOnly(Lflow),
+            kcontrast, total)
+    .run(1, globalSize, 0, true);
+}
+#endif // HAVE_OPENCL
+
+static inline void
+compute_diffusivity(InputArray Lx, InputArray Ly, OutputArray Lflow, float kcontrast, int diffusivity)
+{
+  CV_INSTRUMENT_REGION()
+
+  Lflow.create(Lx.size(), Lx.type());
+
+  switch (diffusivity) {
+    case KAZE::DIFF_PM_G1:
+      pm_g1(Lx, Ly, Lflow, kcontrast);
+    break;
+    case KAZE::DIFF_PM_G2:
+#ifdef HAVE_OPENCL
+      CV_OCL_RUN(OCL_PERFORMANCE_CHECK(Lflow.isUMat()), ocl_pm_g2(Lx, Ly, Lflow, kcontrast));
+#endif
+      pm_g2(Lx, Ly, Lflow, kcontrast);
+    break;
+    case KAZE::DIFF_WEICKERT:
+      weickert_diffusivity(Lx, Ly, Lflow, kcontrast);
+    break;
+    case KAZE::DIFF_CHARBONNIER:
+      charbonnier_diffusivity(Lx, Ly, Lflow, kcontrast);
+    break;
+    default:
+      CV_Error(diffusivity, "Diffusivity is not supported");
+    break;
+  }
+}
+
+/**
  * @brief This method creates the nonlinear scale space for a given image
  * @param img Input image for which the nonlinear scale space needs to be created
  * @return 0 if the nonlinear scale space was created successfully, -1 otherwise
  */
-int AKAZEFeatures::Create_Nonlinear_Scale_Space(const Mat& img)
+void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray img)
 {
+  CV_INSTRUMENT_REGION()
   CV_Assert(evolution_.size() > 0);

-  // Copy the original image to the first level of the evolution
-  img.copyTo(evolution_[0].Lt);
-  gaussian_2D_convolution(evolution_[0].Lt, evolution_[0].Lt, 0, 0, options_.soffset);
-  evolution_[0].Lt.copyTo(evolution_[0].Lsmooth);
+  // create first level of the evolution
+  int ksize = getGaussianKernelSize(options_.soffset);
+  GaussianBlur(img, evolution_[0].Lsmooth, Size(ksize, ksize), options_.soffset, options_.soffset, BORDER_REPLICATE);
+  evolution_[0].Lsmooth.copyTo(evolution_[0].Lt);
+
+  if (evolution_.size() == 1) {
+    // we don't need to compute kcontrast factor
+    Compute_Determinant_Hessian_Response();
+    return;
+  }

-  // Allocate memory for the flow and step images
-  Mat Lflow = Mat::zeros(evolution_[0].Lt.rows, evolution_[0].Lt.cols, CV_32F);
-  Mat Lstep = Mat::zeros(evolution_[0].Lt.rows, evolution_[0].Lt.cols, CV_32F);
+  // derivatives, flow and diffusion step
+  Mat Lx, Ly, Lsmooth, Lflow, Lstep;

-  // First compute the kcontrast factor
-  options_.kcontrast = compute_k_percentile(img, options_.kcontrast_percentile, 1.0f, options_.kcontrast_nbins, 0, 0);
+  // compute derivatives for computing k contrast
+  GaussianBlur(img, Lsmooth, Size(5, 5), 1.0f, 1.0f, BORDER_REPLICATE);
+  Scharr(Lsmooth, Lx, CV_32F, 1, 0, 1, 0, BORDER_DEFAULT);
+  Scharr(Lsmooth, Ly, CV_32F, 0, 1, 1, 0, BORDER_DEFAULT);
+  Lsmooth.release();
+  // compute the kcontrast factor
+  float kcontrast = compute_kcontrast(Lx, Ly, options_.kcontrast_percentile, options_.kcontrast_nbins);

   // Now generate the rest of evolution levels
   for (size_t i = 1; i < evolution_.size(); i++) {
+    Evolution &e = evolution_[i];

-    if (evolution_[i].octave > evolution_[i - 1].octave) {
-      halfsample_image(evolution_[i - 1].Lt, evolution_[i].Lt);
-      options_.kcontrast = options_.kcontrast*0.75f;
-
-      // Allocate memory for the resized flow and step images
-      Lflow = Mat::zeros(evolution_[i].Lt.rows, evolution_[i].Lt.cols, CV_32F);
-      Lstep = Mat::zeros(evolution_[i].Lt.rows, evolution_[i].Lt.cols, CV_32F);
+    if (e.octave > evolution_[i - 1].octave) {
+      // new octave will be half the size
+      resize(evolution_[i - 1].Lt, e.Lt, e.size, 0, 0, INTER_AREA);
+      kcontrast *= 0.75f;
     }
     else {
-      evolution_[i - 1].Lt.copyTo(evolution_[i].Lt);
+      evolution_[i - 1].Lt.copyTo(e.Lt);
     }

-    gaussian_2D_convolution(evolution_[i].Lt, evolution_[i].Lsmooth, 0, 0, 1.0f);
+    GaussianBlur(e.Lt, e.Lsmooth, Size(5, 5), 1.0f, 1.0f, BORDER_REPLICATE);

     // Compute the Gaussian derivatives Lx and Ly
-    image_derivatives_scharr(evolution_[i].Lsmooth, evolution_[i].Lx, 1, 0);
-    image_derivatives_scharr(evolution_[i].Lsmooth, evolution_[i].Ly, 0, 1);
+    Scharr(e.Lsmooth, Lx, CV_32F, 1, 0, 1.0, 0, BORDER_DEFAULT);
+    Scharr(e.Lsmooth, Ly, CV_32F, 0, 1, 1.0, 0, BORDER_DEFAULT);

     // Compute the conductivity equation
-    switch (options_.diffusivity) {
-      case KAZE::DIFF_PM_G1:
-        pm_g1(evolution_[i].Lx, evolution_[i].Ly, Lflow, options_.kcontrast);
-      break;
-      case KAZE::DIFF_PM_G2:
-        pm_g2(evolution_[i].Lx, evolution_[i].Ly, Lflow, options_.kcontrast);
-      break;
-      case KAZE::DIFF_WEICKERT:
-        weickert_diffusivity(evolution_[i].Lx, evolution_[i].Ly, Lflow, options_.kcontrast);
-      break;
-      case KAZE::DIFF_CHARBONNIER:
-        charbonnier_diffusivity(evolution_[i].Lx, evolution_[i].Ly, Lflow, options_.kcontrast);
-      break;
-      default:
-        CV_Error(options_.diffusivity, "Diffusivity is not supported");
-      break;
-    }
-
-    // Perform FED n inner steps
-    for (int j = 0; j < nsteps_[i - 1]; j++) {
-      nld_step_scalar(evolution_[i].Lt, Lflow, Lstep, tsteps_[i - 1][j]);
+    compute_diffusivity(Lx, Ly, Lflow, kcontrast, options_.diffusivity);
+
+    // Perform Fast Explicit Diffusion on Lt
+    std::vector<float> &tsteps = tsteps_[i - 1];
+    for (size_t j = 0; j < tsteps.size(); j++) {
+      const float step_size = tsteps[j] * 0.5f;
+      non_linear_diffusion_step(e.Lt, Lflow, Lstep, step_size);
+      add(e.Lt, Lstep, e.Lt);
     }
   }

-  return 0;
+  Compute_Determinant_Hessian_Response();
 }

 /* ************************************************************************* */
+
+#ifdef HAVE_OPENCL
+static inline bool
+ocl_compute_determinant(InputArray Lxx_, InputArray Lxy_, InputArray Lyy_, OutputArray Ldet_, float sigma)
+{
+    UMat Lxx = Lxx_.getUMat(), Lxy = Lxy_.getUMat(), Lyy = Lyy_.getUMat(), Ldet = Ldet_.getUMat();
+
+    const int total = Lxx.rows * Lxx.cols;
+    size_t globalSize[] = {(size_t)total};
+
+    ocl::Kernel ker("AKAZE_compute_determinant", ocl::features2d::akaze_oclsrc);
+    if (ker.empty())
+        return false;
+
+    return ker.args(
+            ocl::KernelArg::PtrReadOnly(Lxx),
+            ocl::KernelArg::PtrReadOnly(Lxy),
+            ocl::KernelArg::PtrReadOnly(Lyy),
+            ocl::KernelArg::PtrWriteOnly(Ldet),
+            sigma, total)
+    .run(1, globalSize, 0, true);
+}
+#endif // HAVE_OPENCL
+
 /**
- * @brief This method selects interesting keypoints through the nonlinear scale space
- * @param kpts Vector of detected keypoints
+ * @brief Compute determinant from hessians
+ * @details Compute Ldet by (Lxx.mul(Lyy) - Lxy.mul(Lxy)) * sigma
+ *
+ * @param Lxx spatial derivates
+ * @param Lxy spatial derivates
+ * @param Lyy spatial derivates
+ * @param Ldet output determinant
+ * @param sigma determinant will be scaled by this sigma
  */
-void AKAZEFeatures::Feature_Detection(std::vector<KeyPoint>& kpts)
+static inline void compute_determinant(InputArray Lxx, InputArray Lxy, InputArray Lyy, OutputArray Ldet, float sigma)
 {
-  kpts.clear();
-  Compute_Determinant_Hessian_Response();
-  Find_Scale_Space_Extrema(kpts);
-  Do_Subpixel_Refinement(kpts);
+    CV_INSTRUMENT_REGION()
+
+    Ldet.create(Lxx.size(), Lxx.type());
+
+#ifdef HAVE_OPENCL
+    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(Ldet.isUMat()), ocl_compute_determinant(Lxx, Lxy, Lyy, Ldet, sigma));
+#endif
+
+    // output determinant
+    Mat Mxx = Lxx.getMat(), Mxy = Lxy.getMat(), Myy = Lyy.getMat(), Mdet = Ldet.getMat();
+    const int W = Mxx.cols, H = Mxx.rows;
+    for (int y = 0; y < H; y++)
+    {
+        float *lxx = Mxx.ptr<float>(y);
+        float *lxy = Mxy.ptr<float>(y);
+        float *lyy = Myy.ptr<float>(y);
+        float *ldet = Mdet.ptr<float>(y);
+        for (int x = 0; x < W; x++)
+        {
+            ldet[x] = (lxx[x] * lyy[x] - lxy[x] * lxy[x]) * sigma;
+        }
+    }
 }

-/* ************************************************************************* */
-class MultiscaleDerivativesAKAZEInvoker : public ParallelLoopBody
+class DeterminantHessianResponse : public ParallelLoopBody
 {
 public:
-    explicit MultiscaleDerivativesAKAZEInvoker(std::vector<TEvolution>& ev, const AKAZEOptions& opt)
+    explicit DeterminantHessianResponse(std::vector<Evolution>& ev)
     : evolution_(&ev)
-    , options_(opt)
   {
   }

   void operator()(const Range& range) const
   {
-    std::vector<TEvolution>& evolution = *evolution_;
+    Mat Lxx, Lxy, Lyy;

     for (int i = range.start; i < range.end; i++)
     {
-      float ratio = (float)fastpow(2, evolution[i].octave);
-      int sigma_size_ = fRound(evolution[i].esigma * options_.derivative_factor / ratio);
-
-      compute_scharr_derivatives(evolution[i].Lsmooth, evolution[i].Lx, 1, 0, sigma_size_);
-      compute_scharr_derivatives(evolution[i].Lsmooth, evolution[i].Ly, 0, 1, sigma_size_);
-      compute_scharr_derivatives(evolution[i].Lx, evolution[i].Lxx, 1, 0, sigma_size_);
-      compute_scharr_derivatives(evolution[i].Ly, evolution[i].Lyy, 0, 1, sigma_size_);
-      compute_scharr_derivatives(evolution[i].Lx, evolution[i].Lxy, 0, 1, sigma_size_);
-
-      evolution[i].Lx = evolution[i].Lx*((sigma_size_));
-      evolution[i].Ly = evolution[i].Ly*((sigma_size_));
-      evolution[i].Lxx = evolution[i].Lxx*((sigma_size_)*(sigma_size_));
-      evolution[i].Lxy = evolution[i].Lxy*((sigma_size_)*(sigma_size_));
-      evolution[i].Lyy = evolution[i].Lyy*((sigma_size_)*(sigma_size_));
+      Evolution &e = (*evolution_)[i];
+
+      // we cannot use cv:Scharr here, because we need to handle also
+      // kernel sizes other than 3, by default we are using 9x9, 5x5 and 7x7
+
+      // compute kernels
+      Mat DxKx, DxKy, DyKx, DyKy;
+      compute_derivative_kernels(DxKx, DxKy, 1, 0, e.sigma_size);
+      compute_derivative_kernels(DyKx, DyKy, 0, 1, e.sigma_size);
+
+      // compute the multiscale derivatives
+      sepFilter2D(e.Lsmooth, e.Lx, CV_32F, DxKx, DxKy);
+      sepFilter2D(e.Lx, Lxx, CV_32F, DxKx, DxKy);
+      sepFilter2D(e.Lx, Lxy, CV_32F, DyKx, DyKy);
+      sepFilter2D(e.Lsmooth, e.Ly, CV_32F, DyKx, DyKy);
+      sepFilter2D(e.Ly, Lyy, CV_32F, DyKx, DyKy);
+
+      // free Lsmooth to same some space in the pyramid, it is not needed anymore
+      e.Lsmooth.release();
+
+      // compute determinant scaled by sigma
+      float sigma_size_quat = (float)(e.sigma_size * e.sigma_size * e.sigma_size * e.sigma_size);
+      compute_determinant(Lxx, Lxy, Lyy, e.Ldet, sigma_size_quat);
     }
   }

 private:
-  std::vector<TEvolution>*  evolution_;
-  AKAZEOptions              options_;
+  std::vector<Evolution>*  evolution_;
 };

-/* ************************************************************************* */
-/**
- * @brief This method computes the multiscale derivatives for the nonlinear scale space
- */
-void AKAZEFeatures::Compute_Multiscale_Derivatives(void)
-{
-  parallel_for_(Range(0, (int)evolution_.size()),
-                                        MultiscaleDerivativesAKAZEInvoker(evolution_, options_));
-}

-/* ************************************************************************* */
 /**
  * @brief This method computes the feature detector response for the nonlinear scale space
  * @note We use the Hessian determinant as the feature detector response
  */
 void AKAZEFeatures::Compute_Determinant_Hessian_Response(void) {
+  CV_INSTRUMENT_REGION()

-  // Firstly compute the multiscale derivatives
-  Compute_Multiscale_Derivatives();
-
-  for (size_t i = 0; i < evolution_.size(); i++)
-  {
-    for (int ix = 0; ix < evolution_[i].Ldet.rows; ix++)
-    {
-      for (int jx = 0; jx < evolution_[i].Ldet.cols; jx++)
-      {
-        float lxx = *(evolution_[i].Lxx.ptr<float>(ix)+jx);
-        float lxy = *(evolution_[i].Lxy.ptr<float>(ix)+jx);
-        float lyy = *(evolution_[i].Lyy.ptr<float>(ix)+jx);
-        *(evolution_[i].Ldet.ptr<float>(ix)+jx) = (lxx*lyy - lxy*lxy);
-      }
-    }
+  if (ocl::useOpenCL()) {
+    DeterminantHessianResponse body (evolution_);
+    body(Range(0, (int)evolution_.size()));
+  } else {
+    parallel_for_(Range(0, (int)evolution_.size()), DeterminantHessianResponse(evolution_));
   }
 }

 /* ************************************************************************* */
+
 /**
- * @brief This method finds extrema in the nonlinear scale space
+ * @brief This method selects interesting keypoints through the nonlinear scale space
  * @param kpts Vector of detected keypoints
  */
-void AKAZEFeatures::Find_Scale_Space_Extrema(std::vector<KeyPoint>& kpts)
+void AKAZEFeatures::Feature_Detection(std::vector<KeyPoint>& kpts)
 {
+  CV_INSTRUMENT_REGION()

-  float value = 0.0;
-  float dist = 0.0, ratio = 0.0, smax = 0.0;
-  int npoints = 0, id_repeated = 0;
-  int sigma_size_ = 0, left_x = 0, right_x = 0, up_y = 0, down_y = 0;
-  bool is_extremum = false, is_repeated = false, is_out = false;
-  KeyPoint point;
-  vector<KeyPoint> kpts_aux;
+  kpts.clear();
+  std::vector<Mat> keypoints_by_layers;
+  Find_Scale_Space_Extrema(keypoints_by_layers);
+  Do_Subpixel_Refinement(keypoints_by_layers, kpts);
+}

-  // Set maximum size
-  if (options_.descriptor == AKAZE::DESCRIPTOR_MLDB_UPRIGHT || options_.descriptor == AKAZE::DESCRIPTOR_MLDB) {
-    smax = 10.0f*sqrtf(2.0f);
-  }
-  else if (options_.descriptor == AKAZE::DESCRIPTOR_KAZE_UPRIGHT || options_.descriptor == AKAZE::DESCRIPTOR_KAZE) {
-    smax = 12.0f*sqrtf(2.0f);
+/**
+ * @brief This method searches v for a neighbor point of the point candidate p
+ * @param x Coordinates of the keypoint candidate to search a neighbor
+ * @param y Coordinates of the keypoint candidate to search a neighbor
+ * @param mask Matrix holding keypoints positions
+ * @param search_radius neighbour radius for searching keypoints
+ * @param idx The index to mask, pointing to keypoint found.
+ * @return true if a neighbor point is found; false otherwise
+ */
+static inline bool
+find_neighbor_point(const int x, const int y, const Mat &mask, const int search_radius, int &idx)
+{
+  // search neighborhood for keypoints
+  for (int i = y - search_radius; i < y + search_radius; ++i) {
+    const uchar *curr = mask.ptr<uchar>(i);
+    for (int j = x - search_radius; j < x + search_radius; ++j) {
+      if (curr[j] == 0) {
+        continue; // skip non-keypoint
+      }
+      // fine-compare with L2 metric (L2 is smaller than our search window)
+      int dx = j - x;
+      int dy = i - y;
+      if (dx * dx + dy * dy <= search_radius * search_radius) {
+        idx = i * mask.cols + j;
+        return true;
+      }
+    }
   }

-  for (size_t i = 0; i < evolution_.size(); i++) {
-    float* prev = evolution_[i].Ldet.ptr<float>(0);
-    float* curr = evolution_[i].Ldet.ptr<float>(1);
-    for (int ix = 1; ix < evolution_[i].Ldet.rows - 1; ix++) {
-      float* next = evolution_[i].Ldet.ptr<float>(ix + 1);
-
-      for (int jx = 1; jx < evolution_[i].Ldet.cols - 1; jx++) {
-        is_extremum = false;
-        is_repeated = false;
-        is_out = false;
-        value = *(evolution_[i].Ldet.ptr<float>(ix)+jx);
-
-        // Filter the points with the detector threshold
-        if (value > options_.dthreshold && value >= options_.min_dthreshold &&
-            value > curr[jx-1] &&
-            value > curr[jx+1] &&
-            value > prev[jx-1] &&
-            value > prev[jx] &&
-            value > prev[jx+1] &&
-            value > next[jx-1] &&
-            value > next[jx] &&
-            value > next[jx+1]) {
-
-          is_extremum = true;
-          point.response = fabs(value);
-          point.size = evolution_[i].esigma*options_.derivative_factor;
-          point.octave = (int)evolution_[i].octave;
-          point.class_id = (int)i;
-          ratio = (float)fastpow(2, point.octave);
-          sigma_size_ = fRound(point.size / ratio);
-          point.pt.x = static_cast<float>(jx);
-          point.pt.y = static_cast<float>(ix);
-
-          // Compare response with the same and lower scale
-          for (size_t ik = 0; ik < kpts_aux.size(); ik++) {
-
-            if ((point.class_id - 1) == kpts_aux[ik].class_id ||
-                point.class_id == kpts_aux[ik].class_id) {
-              float distx = point.pt.x*ratio - kpts_aux[ik].pt.x;
-              float disty = point.pt.y*ratio - kpts_aux[ik].pt.y;
-              dist = distx * distx + disty * disty;
-              if (dist <= point.size * point.size) {
-                if (point.response > kpts_aux[ik].response) {
-                  id_repeated = (int)ik;
-                  is_repeated = true;
-                }
-                else {
-                  is_extremum = false;
-                }
-                break;
-              }
+  return false;
+}
+
+/**
+ * @brief Find keypoints in parallel for each pyramid layer
+ */
+class FindKeypointsSameScale : public ParallelLoopBody
+{
+public:
+    explicit FindKeypointsSameScale(const std::vector<Evolution>& ev,
+      std::vector<Mat>& kpts, float dthreshold)
+    : evolution_(&ev), keypoints_by_layers_(&kpts), dthreshold_(dthreshold)
+  {}
+
+  void operator()(const Range& range) const
+  {
+    for (int i = range.start; i < range.end; i++)
+    {
+      const Evolution &e = (*evolution_)[i];
+      Mat &kpts = (*keypoints_by_layers_)[i];
+      // this mask will hold positions of keypoints in this level
+      kpts = Mat::zeros(e.Ldet.size(), CV_8UC1);
+
+      // if border is too big we shouldn't search any keypoints
+      if (e.border + 1 >= e.Ldet.rows)
+        continue;
+
+      const float * prev = e.Ldet.ptr<float>(e.border - 1);
+      const float * curr = e.Ldet.ptr<float>(e.border    );
+      const float * next = e.Ldet.ptr<float>(e.border + 1);
+      const float * ldet = e.Ldet.ptr<float>();
+      uchar *mask = kpts.ptr<uchar>();
+      const int search_radius = e.sigma_size; // size of keypoint in this level
+
+      for (int y = e.border; y < e.Ldet.rows - e.border; y++) {
+        for (int x = e.border; x < e.Ldet.cols - e.border; x++) {
+          const float value = curr[x];
+
+          // Filter the points with the detector threshold
+          if (value <= dthreshold_)
+            continue;
+          if (value <= curr[x-1] || value <= curr[x+1])
+            continue;
+          if (value <= prev[x-1] || value <= prev[x  ] || value <= prev[x+1])
+            continue;
+          if (value <= next[x-1] || value <= next[x  ] || value <= next[x+1])
+            continue;
+
+          int idx = 0;
+          // Compare response with the same scale
+          if (find_neighbor_point(x, y, kpts, search_radius, idx)) {
+            if (value > ldet[idx]) {
+              mask[idx] = 0;  // clear old point - we have better candidate now
+            } else {
+              continue; // there already is a better keypoint
             }
           }

-          // Check out of bounds
-          if (is_extremum == true) {
+          kpts.at<uchar>(y, x) = 1; // we have a new keypoint
+        }

-            // Check that the point is under the image limits for the descriptor computation
-            left_x = fRound(point.pt.x - smax*sigma_size_) - 1;
-            right_x = fRound(point.pt.x + smax*sigma_size_) + 1;
-            up_y = fRound(point.pt.y - smax*sigma_size_) - 1;
-            down_y = fRound(point.pt.y + smax*sigma_size_) + 1;
+        prev = curr;
+        curr = next;
+        next += e.Ldet.cols;
+      }
+    }
+  }

-            if (left_x < 0 || right_x >= evolution_[i].Ldet.cols ||
-                up_y < 0 || down_y >= evolution_[i].Ldet.rows) {
-              is_out = true;
-            }
+private:
+  const std::vector<Evolution>*  evolution_;
+  std::vector<Mat>* keypoints_by_layers_;
+  float dthreshold_; ///< Detector response threshold to accept point
+};

-            if (is_out == false) {
-              if (is_repeated == false) {
-                point.pt.x = (float)(point.pt.x*ratio + .5*(ratio-1.0));
-                point.pt.y = (float)(point.pt.y*ratio + .5*(ratio-1.0));
-                kpts_aux.push_back(point);
-                npoints++;
-              }
-              else {
-                point.pt.x = (float)(point.pt.x*ratio + .5*(ratio-1.0));
-                point.pt.y = (float)(point.pt.y*ratio + .5*(ratio-1.0));
-                kpts_aux[id_repeated] = point;
-              }
-            } // if is_out
-          } //if is_extremum
+/**
+ * @brief This method finds extrema in the nonlinear scale space
+ * @param keypoints_by_layers Output vectors of detected keypoints; one vector for each evolution level
+ */
+void AKAZEFeatures::Find_Scale_Space_Extrema(std::vector<Mat>& keypoints_by_layers)
+{
+  CV_INSTRUMENT_REGION()
+
+  keypoints_by_layers.resize(evolution_.size());
+
+  // find points in the same level
+  parallel_for_(Range(0, (int)evolution_.size()),
+    FindKeypointsSameScale(evolution_, keypoints_by_layers, options_.dthreshold));
+
+  // Filter points with the lower scale level
+  for (size_t i = 1; i < keypoints_by_layers.size(); i++) {
+    // constants for this level
+    const Mat &keypoints = keypoints_by_layers[i];
+    const uchar *const kpts = keypoints_by_layers[i].ptr<uchar>();
+    uchar *const kpts_prev = keypoints_by_layers[i-1].ptr<uchar>();
+    const float *const ldet = evolution_[i].Ldet.ptr<float>();
+    const float *const ldet_prev = evolution_[i-1].Ldet.ptr<float>();
+    // ratios are just powers of 2
+    const int diff_ratio = (int)evolution_[i].octave_ratio / (int)evolution_[i-1].octave_ratio;
+    const int search_radius = evolution_[i].sigma_size * diff_ratio; // size of keypoint in this level
+
+    size_t j = 0;
+    for (int y = 0; y < keypoints.rows; y++) {
+      for (int x = 0; x < keypoints.cols; x++, j++) {
+        if (kpts[j] == 0) {
+          continue; // skip non-keypoints
         }
-      } // for jx
-      prev = curr;
-      curr = next;
-    } // for ix
-  } // for i
-
-  // Now filter points with the upper scale level
-  for (size_t i = 0; i < kpts_aux.size(); i++) {
-
-    is_repeated = false;
-    const KeyPoint& pt = kpts_aux[i];
-    for (size_t j = i + 1; j < kpts_aux.size(); j++) {
-
-      // Compare response with the upper scale
-      if ((pt.class_id + 1) == kpts_aux[j].class_id) {
-        float distx = pt.pt.x - kpts_aux[j].pt.x;
-        float disty = pt.pt.y - kpts_aux[j].pt.y;
-        dist = distx * distx + disty * disty;
-        if (dist <= pt.size * pt.size) {
-          if (pt.response < kpts_aux[j].response) {
-            is_repeated = true;
-            break;
+        int idx = 0;
+        // project point to lower scale layer
+        const int p_x = x * diff_ratio;
+        const int p_y = y * diff_ratio;
+        if (find_neighbor_point(p_x, p_y, keypoints_by_layers[i-1], search_radius, idx)) {
+          if (ldet[j] > ldet_prev[idx]) {
+            kpts_prev[idx] = 0;  // clear keypoint in lower layer
           }
+          // else this pt may be pruned by the upper scale
         }
       }
     }
+  }

-    if (is_repeated == false)
-      kpts.push_back(pt);
+  // Now filter points with the upper scale level (the other direction)
+  for (int i = (int)keypoints_by_layers.size() - 2; i >= 0; i--) {
+    // constants for this level
+    const Mat &keypoints = keypoints_by_layers[i];
+    const uchar *const kpts = keypoints_by_layers[i].ptr<uchar>();
+    uchar *const kpts_next = keypoints_by_layers[i+1].ptr<uchar>();
+    const float *const ldet = evolution_[i].Ldet.ptr<float>();
+    const float *const ldet_next = evolution_[i+1].Ldet.ptr<float>();
+    // ratios are just powers of 2, i+1 ratio is always greater or equal to i
+    const int diff_ratio = (int)evolution_[i+1].octave_ratio / (int)evolution_[i].octave_ratio;
+    const int search_radius = evolution_[i+1].sigma_size; // size of keypoints in upper level
+
+    size_t j = 0;
+    for (int y = 0; y < keypoints.rows; y++) {
+      for (int x = 0; x < keypoints.cols; x++, j++) {
+        if (kpts[j] == 0) {
+          continue; // skip non-keypoints
+        }
+        int idx = 0;
+        // project point to upper scale layer
+        const int p_x = x / diff_ratio;
+        const int p_y = y / diff_ratio;
+        if (find_neighbor_point(p_x, p_y, keypoints_by_layers[i+1], search_radius, idx)) {
+          if (ldet[j] > ldet_next[idx]) {
+            kpts_next[idx] = 0;  // clear keypoint in upper layer
+          }
+        }
+      }
+    }
   }
 }

 /* ************************************************************************* */
 /**
  * @brief This method performs subpixel refinement of the detected keypoints
- * @param kpts Vector of detected keypoints
+ * @param keypoints_by_layers Input vectors of detected keypoints, sorted by evolution levels
+ * @param kpts Output vector of the final refined keypoints
  */
-void AKAZEFeatures::Do_Subpixel_Refinement(std::vector<KeyPoint>& kpts)
+void AKAZEFeatures::Do_Subpixel_Refinement(
+  std::vector<Mat>& keypoints_by_layers, std::vector<KeyPoint>& output_keypoints)
 {
-  float Dx = 0.0, Dy = 0.0, ratio = 0.0;
-  float Dxx = 0.0, Dyy = 0.0, Dxy = 0.0;
-  int x = 0, y = 0;
-  Matx22f A(0, 0, 0, 0);
-  Vec2f b(0, 0);
-  Vec2f dst(0, 0);
-
-  for (size_t i = 0; i < kpts.size(); i++) {
-    ratio = (float)fastpow(2, kpts[i].octave);
-    x = fRound(kpts[i].pt.x / ratio);
-    y = fRound(kpts[i].pt.y / ratio);
-
-    // Compute the gradient
-    Dx = (0.5f)*(*(evolution_[kpts[i].class_id].Ldet.ptr<float>(y)+x + 1)
-        - *(evolution_[kpts[i].class_id].Ldet.ptr<float>(y)+x - 1));
-    Dy = (0.5f)*(*(evolution_[kpts[i].class_id].Ldet.ptr<float>(y + 1) + x)
-        - *(evolution_[kpts[i].class_id].Ldet.ptr<float>(y - 1) + x));
-
-    // Compute the Hessian
-    Dxx = (*(evolution_[kpts[i].class_id].Ldet.ptr<float>(y)+x + 1)
-        + *(evolution_[kpts[i].class_id].Ldet.ptr<float>(y)+x - 1)
-        - 2.0f*(*(evolution_[kpts[i].class_id].Ldet.ptr<float>(y)+x)));
-
-    Dyy = (*(evolution_[kpts[i].class_id].Ldet.ptr<float>(y + 1) + x)
-        + *(evolution_[kpts[i].class_id].Ldet.ptr<float>(y - 1) + x)
-        - 2.0f*(*(evolution_[kpts[i].class_id].Ldet.ptr<float>(y)+x)));
-
-    Dxy = (0.25f)*(*(evolution_[kpts[i].class_id].Ldet.ptr<float>(y + 1) + x + 1)
-        + (*(evolution_[kpts[i].class_id].Ldet.ptr<float>(y - 1) + x - 1)))
-        - (0.25f)*(*(evolution_[kpts[i].class_id].Ldet.ptr<float>(y - 1) + x + 1)
-        + (*(evolution_[kpts[i].class_id].Ldet.ptr<float>(y + 1) + x - 1)));
-
-    // Solve the linear system
-    A(0, 0) = Dxx;
-    A(1, 1) = Dyy;
-    A(0, 1) = A(1, 0) = Dxy;
-    b(0) = -Dx;
-    b(1) = -Dy;
-
-    solve(A, b, dst, DECOMP_LU);
-
-    if (fabs(dst(0)) <= 1.0f && fabs(dst(1)) <= 1.0f) {
-        kpts[i].pt.x = x + dst(0);
-      kpts[i].pt.y = y + dst(1);
-      int power = fastpow(2, evolution_[kpts[i].class_id].octave);
-      kpts[i].pt.x = (float)(kpts[i].pt.x*power + .5*(power-1));
-      kpts[i].pt.y = (float)(kpts[i].pt.y*power + .5*(power-1));
-      kpts[i].angle = 0.0;
-
-      // In OpenCV the size of a keypoint its the diameter
-      kpts[i].size *= 2.0f;
-    }
-    // Delete the point since its not stable
-    else {
-      kpts.erase(kpts.begin() + i);
-      i--;
+  CV_INSTRUMENT_REGION()
+
+  for (size_t i = 0; i < keypoints_by_layers.size(); i++) {
+    const Evolution &e = evolution_[i];
+    const float * const ldet = e.Ldet.ptr<float>();
+    const float ratio = e.octave_ratio;
+    const int cols = e.Ldet.cols;
+    const Mat& keypoints = keypoints_by_layers[i];
+    const uchar *const kpts = keypoints.ptr<uchar>();
+
+    size_t j = 0;
+    for (int y = 0; y < keypoints.rows; y++) {
+      for (int x = 0; x < keypoints.cols; x++, j++) {
+        if (kpts[j] == 0) {
+          continue; // skip non-keypoints
+        }
+
+        // create a new keypoint
+        KeyPoint kp;
+        kp.pt.x = x * e.octave_ratio;
+        kp.pt.y = y * e.octave_ratio;
+        kp.size = e.esigma * options_.derivative_factor;
+        kp.angle = -1;
+        kp.response = ldet[j];
+        kp.octave = e.octave;
+        kp.class_id = static_cast<int>(i);
+
+        // Compute the gradient
+        float Dx = 0.5f * (ldet[ y     *cols + x + 1] - ldet[ y     *cols + x - 1]);
+        float Dy = 0.5f * (ldet[(y + 1)*cols + x    ] - ldet[(y - 1)*cols + x    ]);
+
+        // Compute the Hessian
+        float Dxx = ldet[ y     *cols + x + 1] + ldet[ y     *cols + x - 1] - 2.0f * ldet[y*cols + x];
+        float Dyy = ldet[(y + 1)*cols + x    ] + ldet[(y - 1)*cols + x    ] - 2.0f * ldet[y*cols + x];
+        float Dxy = 0.25f * (ldet[(y + 1)*cols + x + 1] + ldet[(y - 1)*cols + x - 1] -
+                            ldet[(y - 1)*cols + x + 1] - ldet[(y + 1)*cols + x - 1]);
+
+        // Solve the linear system
+        Matx22f A( Dxx, Dxy,
+                   Dxy, Dyy );
+        Vec2f   b( -Dx, -Dy );
+        Vec2f   dst( 0.0f, 0.0f );
+        solve(A, b, dst, DECOMP_LU);
+
+        float dx = dst(0);
+        float dy = dst(1);
+
+        if (fabs(dx) > 1.0f || fabs(dy) > 1.0f)
+          continue; // Ignore the point that is not stable
+
+        // Refine the coordinates
+        kp.pt.x += dx * ratio + .5f*(ratio-1.f);
+        kp.pt.y += dy * ratio + .5f*(ratio-1.f);
+
+        kp.angle = 0.0;
+        kp.size *= 2.0f; // In OpenCV the size of a keypoint is the diameter
+
+        // Push the refined keypoint to the final storage
+        output_keypoints.push_back(kp);
+      }
     }
   }
 }
@@ -459,7 +865,7 @@ void AKAZEFeatures::Do_Subpixel_Refinement(std::vector<KeyPoint>& kpts)
 class SURF_Descriptor_Upright_64_Invoker : public ParallelLoopBody
 {
 public:
-  SURF_Descriptor_Upright_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<TEvolution>& evolution)
+  SURF_Descriptor_Upright_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution)
     : keypoints_(&kpts)
     , descriptors_(&desc)
     , evolution_(&evolution)
@@ -470,22 +876,22 @@ public:
   {
     for (int i = range.start; i < range.end; i++)
     {
-      Get_SURF_Descriptor_Upright_64((*keypoints_)[i], descriptors_->ptr<float>(i));
+      Get_SURF_Descriptor_Upright_64((*keypoints_)[i], descriptors_->ptr<float>(i), descriptors_->cols);
     }
   }

-  void Get_SURF_Descriptor_Upright_64(const KeyPoint& kpt, float* desc) const;
+  void Get_SURF_Descriptor_Upright_64(const KeyPoint& kpt, float* desc, int desc_size) const;

 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<TEvolution>*   evolution_;
+  std::vector<Evolution>*   evolution_;
 };

 class SURF_Descriptor_64_Invoker : public ParallelLoopBody
 {
 public:
-  SURF_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<TEvolution>& evolution)
+  SURF_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution)
     : keypoints_(&kpts)
     , descriptors_(&desc)
     , evolution_(&evolution)
@@ -496,23 +902,22 @@ public:
   {
     for (int i = range.start; i < range.end; i++)
     {
-      AKAZEFeatures::Compute_Main_Orientation((*keypoints_)[i], *evolution_);
-      Get_SURF_Descriptor_64((*keypoints_)[i], descriptors_->ptr<float>(i));
+      Get_SURF_Descriptor_64((*keypoints_)[i], descriptors_->ptr<float>(i), descriptors_->cols);
     }
   }

-  void Get_SURF_Descriptor_64(const KeyPoint& kpt, float* desc) const;
+  void Get_SURF_Descriptor_64(const KeyPoint& kpt, float* desc, int desc_size) const;

 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<TEvolution>*   evolution_;
+  std::vector<Evolution>*   evolution_;
 };

 class MSURF_Upright_Descriptor_64_Invoker : public ParallelLoopBody
 {
 public:
-  MSURF_Upright_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<TEvolution>& evolution)
+  MSURF_Upright_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution)
     : keypoints_(&kpts)
     , descriptors_(&desc)
     , evolution_(&evolution)
@@ -523,22 +928,22 @@ public:
   {
     for (int i = range.start; i < range.end; i++)
     {
-      Get_MSURF_Upright_Descriptor_64((*keypoints_)[i], descriptors_->ptr<float>(i));
+      Get_MSURF_Upright_Descriptor_64((*keypoints_)[i], descriptors_->ptr<float>(i), descriptors_->cols);
     }
   }

-  void Get_MSURF_Upright_Descriptor_64(const KeyPoint& kpt, float* desc) const;
+  void Get_MSURF_Upright_Descriptor_64(const KeyPoint& kpt, float* desc, int desc_size) const;

 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<TEvolution>*   evolution_;
+  std::vector<Evolution>*   evolution_;
 };

 class MSURF_Descriptor_64_Invoker : public ParallelLoopBody
 {
 public:
-  MSURF_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<TEvolution>& evolution)
+  MSURF_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution)
     : keypoints_(&kpts)
     , descriptors_(&desc)
     , evolution_(&evolution)
@@ -549,23 +954,22 @@ public:
   {
     for (int i = range.start; i < range.end; i++)
     {
-      AKAZEFeatures::Compute_Main_Orientation((*keypoints_)[i], *evolution_);
-      Get_MSURF_Descriptor_64((*keypoints_)[i], descriptors_->ptr<float>(i));
+      Get_MSURF_Descriptor_64((*keypoints_)[i], descriptors_->ptr<float>(i), descriptors_->cols);
     }
   }

-  void Get_MSURF_Descriptor_64(const KeyPoint& kpt, float* desc) const;
+  void Get_MSURF_Descriptor_64(const KeyPoint& kpt, float* desc, int desc_size) const;

 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<TEvolution>*   evolution_;
+  std::vector<Evolution>*   evolution_;
 };

 class Upright_MLDB_Full_Descriptor_Invoker : public ParallelLoopBody
 {
 public:
-  Upright_MLDB_Full_Descriptor_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<TEvolution>& evolution, AKAZEOptions& options)
+  Upright_MLDB_Full_Descriptor_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution, AKAZEOptions& options)
     : keypoints_(&kpts)
     , descriptors_(&desc)
     , evolution_(&evolution)
@@ -577,16 +981,16 @@ public:
   {
     for (int i = range.start; i < range.end; i++)
     {
-      Get_Upright_MLDB_Full_Descriptor((*keypoints_)[i], descriptors_->ptr<unsigned char>(i));
+      Get_Upright_MLDB_Full_Descriptor((*keypoints_)[i], descriptors_->ptr<unsigned char>(i), descriptors_->cols);
     }
   }

-  void Get_Upright_MLDB_Full_Descriptor(const KeyPoint& kpt, unsigned char* desc) const;
+  void Get_Upright_MLDB_Full_Descriptor(const KeyPoint& kpt, unsigned char* desc, int desc_size) const;

 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<TEvolution>*   evolution_;
+  std::vector<Evolution>*   evolution_;
   AKAZEOptions*              options_;
 };

@@ -595,7 +999,7 @@ class Upright_MLDB_Descriptor_Subset_Invoker : public ParallelLoopBody
 public:
   Upright_MLDB_Descriptor_Subset_Invoker(std::vector<KeyPoint>& kpts,
                                          Mat& desc,
-                                         std::vector<TEvolution>& evolution,
+                                         std::vector<Evolution>& evolution,
                                          AKAZEOptions& options,
                                          Mat descriptorSamples,
                                          Mat descriptorBits)
@@ -612,16 +1016,16 @@ public:
   {
     for (int i = range.start; i < range.end; i++)
     {
-      Get_Upright_MLDB_Descriptor_Subset((*keypoints_)[i], descriptors_->ptr<unsigned char>(i));
+      Get_Upright_MLDB_Descriptor_Subset((*keypoints_)[i], descriptors_->ptr<unsigned char>(i), descriptors_->cols);
     }
   }

-  void Get_Upright_MLDB_Descriptor_Subset(const KeyPoint& kpt, unsigned char* desc) const;
+  void Get_Upright_MLDB_Descriptor_Subset(const KeyPoint& kpt, unsigned char* desc, int desc_size) const;

 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<TEvolution>*   evolution_;
+  std::vector<Evolution>*   evolution_;
   AKAZEOptions*              options_;

   Mat descriptorSamples_;  // List of positions in the grids to sample LDB bits from.
@@ -631,7 +1035,7 @@ private:
 class MLDB_Full_Descriptor_Invoker : public ParallelLoopBody
 {
 public:
-  MLDB_Full_Descriptor_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<TEvolution>& evolution, AKAZEOptions& options)
+  MLDB_Full_Descriptor_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution, AKAZEOptions& options)
     : keypoints_(&kpts)
     , descriptors_(&desc)
     , evolution_(&evolution)
@@ -643,12 +1047,11 @@ public:
   {
     for (int i = range.start; i < range.end; i++)
     {
-      AKAZEFeatures::Compute_Main_Orientation((*keypoints_)[i], *evolution_);
-      Get_MLDB_Full_Descriptor((*keypoints_)[i], descriptors_->ptr<unsigned char>(i));
+      Get_MLDB_Full_Descriptor((*keypoints_)[i], descriptors_->ptr<unsigned char>(i), descriptors_->cols);
     }
   }

-  void Get_MLDB_Full_Descriptor(const KeyPoint& kpt, unsigned char* desc) const;
+  void Get_MLDB_Full_Descriptor(const KeyPoint& kpt, unsigned char* desc, int desc_size) const;
   void MLDB_Fill_Values(float* values, int sample_step, int level,
                         float xf, float yf, float co, float si, float scale) const;
   void MLDB_Binary_Comparisons(float* values, unsigned char* desc,
@@ -657,7 +1060,7 @@ public:
 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<TEvolution>*   evolution_;
+  std::vector<Evolution>*   evolution_;
   AKAZEOptions*              options_;
 };

@@ -666,7 +1069,7 @@ class MLDB_Descriptor_Subset_Invoker : public ParallelLoopBody
 public:
   MLDB_Descriptor_Subset_Invoker(std::vector<KeyPoint>& kpts,
                                  Mat& desc,
-                                 std::vector<TEvolution>& evolution,
+                                 std::vector<Evolution>& evolution,
                                  AKAZEOptions& options,
                                  Mat descriptorSamples,
                                  Mat descriptorBits)
@@ -683,17 +1086,16 @@ public:
   {
     for (int i = range.start; i < range.end; i++)
     {
-      AKAZEFeatures::Compute_Main_Orientation((*keypoints_)[i], *evolution_);
-      Get_MLDB_Descriptor_Subset((*keypoints_)[i], descriptors_->ptr<unsigned char>(i));
+      Get_MLDB_Descriptor_Subset((*keypoints_)[i], descriptors_->ptr<unsigned char>(i), descriptors_->cols);
     }
   }

-  void Get_MLDB_Descriptor_Subset(const KeyPoint& kpt, unsigned char* desc) const;
+  void Get_MLDB_Descriptor_Subset(const KeyPoint& kpt, unsigned char* desc, int desc_size) const;

 private:
   std::vector<KeyPoint>* keypoints_;
   Mat*                   descriptors_;
-  std::vector<TEvolution>*   evolution_;
+  std::vector<Evolution>*   evolution_;
   AKAZEOptions*              options_;

   Mat descriptorSamples_;  // List of positions in the grids to sample LDB bits from.
@@ -705,28 +1107,29 @@ private:
  * @param kpts Vector of detected keypoints
  * @param desc Matrix to store the descriptors
  */
-void AKAZEFeatures::Compute_Descriptors(std::vector<KeyPoint>& kpts, Mat& desc)
+void AKAZEFeatures::Compute_Descriptors(std::vector<KeyPoint>& kpts, OutputArray descriptors)
 {
+  CV_INSTRUMENT_REGION()
+
   for(size_t i = 0; i < kpts.size(); i++)
   {
       CV_Assert(0 <= kpts[i].class_id && kpts[i].class_id < static_cast<int>(evolution_.size()));
   }

   // Allocate memory for the matrix with the descriptors
-  if (options_.descriptor < AKAZE::DESCRIPTOR_MLDB_UPRIGHT) {
-    desc = Mat::zeros((int)kpts.size(), 64, CV_32FC1);
-  }
-  else {
-    // We use the full length binary descriptor -> 486 bits
-    if (options_.descriptor_size == 0) {
-      int t = (6 + 36 + 120)*options_.descriptor_channels;
-      desc = Mat::zeros((int)kpts.size(), (int)ceil(t / 8.), CV_8UC1);
-    }
-    else {
-      // We use the random bit selection length binary descriptor
-      desc = Mat::zeros((int)kpts.size(), (int)ceil(options_.descriptor_size / 8.), CV_8UC1);
-    }
+  int descriptor_size = 64;
+  int descriptor_type = CV_32FC1;
+  if (options_.descriptor >= AKAZE::DESCRIPTOR_MLDB_UPRIGHT)
+  {
+    int descriptor_bits = (options_.descriptor_size == 0)
+          ? (6 + 36 + 120)*options_.descriptor_channels // the full length binary descriptor -> 486 bits
+          : options_.descriptor_size; // the random bit selection length binary descriptor
+    descriptor_size = divUp(descriptor_bits, 8);
+    descriptor_type = CV_8UC1;
   }
+  descriptors.create((int)kpts.size(), descriptor_size, descriptor_type);
+
+  Mat desc = descriptors.getMat();

   switch (options_.descriptor)
   {
@@ -761,12 +1164,20 @@ void AKAZEFeatures::Compute_Descriptors(std::vector<KeyPoint>& kpts, Mat& desc)

 /* ************************************************************************* */
 /**
- * @brief This method computes the main orientation for a given keypoint
- * @param kpt Input keypoint
- * @note The orientation is computed using a similar approach as described in the
- * original SURF method. See Bay et al., Speeded Up Robust Features, ECCV 2006
+ * @brief This function samples the derivative responses Lx and Ly for the points
+ * within the radius of 6*scale from (x0, y0), then multiply 2D Gaussian weight
+ * @param Lx Horizontal derivative
+ * @param Ly Vertical derivative
+ * @param x0 X-coordinate of the center point
+ * @param y0 Y-coordinate of the center point
+ * @param scale The sampling step
+ * @param resX Output array of the weighted horizontal derivative responses
+ * @param resY Output array of the weighted vertical derivative responses
  */
-void AKAZEFeatures::Compute_Main_Orientation(KeyPoint& kpt, const std::vector<TEvolution>& evolution_)
+static inline
+void Sample_Derivative_Response_Radius6(const Mat &Lx, const Mat &Ly,
+                                  const int x0, const int y0, const int scale,
+                                  float *resX, float *resY)
 {
     /* ************************************************************************* */
     /// Lookup table for 2d gaussian (sigma = 2.5) where (0,0) is top left and (6,6) is bottom right
@@ -780,79 +1191,208 @@ void AKAZEFeatures::Compute_Main_Orientation(KeyPoint& kpt, const std::vector<TE
         { 0.00344629f, 0.00318132f, 0.00250252f, 0.00167749f, 0.00095820f, 0.00046640f, 0.00019346f },
         { 0.00142946f, 0.00131956f, 0.00103800f, 0.00069579f, 0.00039744f, 0.00019346f, 0.00008024f }
     };
+    static const struct gtable
+    {
+      float weight[109];
+      int xidx[109];
+      int yidx[109];

-  int ix = 0, iy = 0, idx = 0, s = 0, level = 0;
-  float xf = 0.0, yf = 0.0, gweight = 0.0, ratio = 0.0;
-  const int ang_size = 109;
-  float resX[ang_size], resY[ang_size], Ang[ang_size];
-  const int id[] = { 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6 };
+      explicit gtable(void)
+      {
+        // Generate the weight and indices by one-time initialization
+        int k = 0;
+        for (int i = -6; i <= 6; ++i) {
+          for (int j = -6; j <= 6; ++j) {
+            if (i*i + j*j < 36) {
+              CV_Assert(k < 109);
+              weight[k] = gauss25[abs(i)][abs(j)];
+              yidx[k] = i;
+              xidx[k] = j;
+              ++k;
+            }
+          }
+        }
+      }
+    } g;

-  // Variables for computing the dominant direction
-  float sumX = 0.0, sumY = 0.0, max = 0.0, ang1 = 0.0, ang2 = 0.0;
+  CV_Assert(x0 - 6 * scale >= 0 && x0 + 6 * scale < Lx.cols);
+  CV_Assert(y0 - 6 * scale >= 0 && y0 + 6 * scale < Lx.rows);

+  for (int i = 0; i < 109; i++)
+  {
+    int y = y0 + g.yidx[i] * scale;
+    int x = x0 + g.xidx[i] * scale;
+
+    float w = g.weight[i];
+    resX[i] = w * Lx.at<float>(y, x);
+    resY[i] = w * Ly.at<float>(y, x);
+  }
+}
+
+/**
+ * @brief This function sorts a[] by quantized float values
+ * @param a[] Input floating point array to sort
+ * @param n The length of a[]
+ * @param quantum The interval to convert a[i]'s float values to integers
+ * @param nkeys a[i] < nkeys * quantum
+ * @param idx[] Output array of the indices: a[idx[i]] forms a sorted array
+ * @param cum[] Output array of the starting indices of quantized floats
+ * @note The values of a[] in [k*quantum, (k + 1)*quantum) is labeled by
+ * the integer k, which is calculated by floor(a[i]/quantum).  After sorting,
+ * the values from a[idx[cum[k]]] to a[idx[cum[k+1]-1]] are all labeled by k.
+ * This sorting is unstable to reduce the memory access.
+ */
+static inline
+void quantized_counting_sort(const float a[], const int n,
+                             const float quantum, const int nkeys,
+                             int idx[/*n*/], int cum[/*nkeys + 1*/])
+{
+  memset(cum, 0, sizeof(cum[0]) * (nkeys + 1));
+
+  // Count up the quantized values
+  for (int i = 0; i < n; i++)
+  {
+    int b = (int)(a[i] / quantum);
+    if (b < 0 || b >= nkeys)
+      b = 0;
+    cum[b]++;
+  }
+
+  // Compute the inclusive prefix sum i.e. the end indices; cum[nkeys] is the total
+  for (int i = 1; i <= nkeys; i++)
+  {
+    cum[i] += cum[i - 1];
+  }
+  CV_Assert(cum[nkeys] == n);
+
+  // Generate the sorted indices; cum[] becomes the exclusive prefix sum i.e. the start indices of keys
+  for (int i = 0; i < n; i++)
+  {
+    int b = (int)(a[i] / quantum);
+    if (b < 0 || b >= nkeys)
+      b = 0;
+    idx[--cum[b]] = i;
+  }
+}
+
+/**
+ * @brief This function computes the main orientation for a given keypoint
+ * @param kpt Input keypoint
+ * @note The orientation is computed using a similar approach as described in the
+ * original SURF method. See Bay et al., Speeded Up Robust Features, ECCV 2006
+ */
+static inline
+void Compute_Main_Orientation(KeyPoint& kpt, const std::vector<Evolution>& evolution)
+{
+  // get the right evolution level for this keypoint
+  const Evolution& e = evolution[kpt.class_id];
   // Get the information from the keypoint
-  level = kpt.class_id;
-  ratio = (float)(1 << evolution_[level].octave);
-  s = fRound(0.5f*kpt.size / ratio);
-  xf = kpt.pt.x / ratio;
-  yf = kpt.pt.y / ratio;
+  int scale = cvRound(0.5f * kpt.size / e.octave_ratio);
+  int x0 = cvRound(kpt.pt.x / e.octave_ratio);
+  int y0 = cvRound(kpt.pt.y / e.octave_ratio);
+
+  // Sample derivatives responses for the points within radius of 6*scale
+  const int ang_size = 109;
+  float resX[ang_size], resY[ang_size];
+  Sample_Derivative_Response_Radius6(e.Lx, e.Ly, x0, y0, scale, resX, resY);
+
+  // Compute the angle of each gradient vector
+  float Ang[ang_size];
+  hal::fastAtan2(resY, resX, Ang, ang_size, false);
+
+  // Sort by the angles; angles are labeled by slices of 0.15 radian
+  const int slices = 42;
+  const float ang_step = (float)(2.0 * CV_PI / slices);
+  int slice[slices + 1];
+  int sorted_idx[ang_size];
+  quantized_counting_sort(Ang, ang_size, ang_step, slices, sorted_idx, slice);
+
+  // Find the main angle by sliding a window of 7-slice size(=PI/3) around the keypoint
+  const int win = 7;
+
+  float maxX = 0.0f, maxY = 0.0f;
+  for (int i = slice[0]; i < slice[win]; i++) {
+    const int idx = sorted_idx[i];
+    maxX += resX[idx];
+    maxY += resY[idx];
+  }
+  float maxNorm = maxX * maxX + maxY * maxY;

-  // Calculate derivatives responses for points within radius of 6*scale
-  for (int i = -6; i <= 6; ++i) {
-    for (int j = -6; j <= 6; ++j) {
-      if (i*i + j*j < 36) {
-        iy = fRound(yf + j*s);
-        ix = fRound(xf + i*s);
+  for (int sn = 1; sn <= slices - win; sn++) {

-        gweight = gauss25[id[i + 6]][id[j + 6]];
-        resX[idx] = gweight*(*(evolution_[level].Lx.ptr<float>(iy)+ix));
-        resY[idx] = gweight*(*(evolution_[level].Ly.ptr<float>(iy)+ix));
+    if (slice[sn] == slice[sn - 1] && slice[sn + win] == slice[sn + win - 1])
+      continue;  // The contents of the window didn't change; don't repeat the computation

-        ++idx;
-      }
+    float sumX = 0.0f, sumY = 0.0f;
+    for (int i = slice[sn]; i < slice[sn + win]; i++) {
+      const int idx = sorted_idx[i];
+      sumX += resX[idx];
+      sumY += resY[idx];
     }
+
+    float norm = sumX * sumX + sumY * sumY;
+    if (norm > maxNorm)
+        maxNorm = norm, maxX = sumX, maxY = sumY;  // Found bigger one; update
   }
-  hal::fastAtan32f(resY, resX, Ang, ang_size, false);
-  // Loop slides pi/3 window around feature point
-  for (ang1 = 0; ang1 < (float)(2.0 * CV_PI); ang1 += 0.15f) {
-    ang2 = (ang1 + (float)(CV_PI / 3.0) >(float)(2.0*CV_PI) ? ang1 - (float)(5.0*CV_PI / 3.0) : ang1 + (float)(CV_PI / 3.0));
-    sumX = sumY = 0.f;
-
-    for (int k = 0; k < ang_size; ++k) {
-      // Get angle from the x-axis of the sample point
-      const float & ang = Ang[k];
-
-      // Determine whether the point is within the window
-      if (ang1 < ang2 && ang1 < ang && ang < ang2) {
-        sumX += resX[k];
-        sumY += resY[k];
-      }
-      else if (ang2 < ang1 &&
-               ((ang > 0 && ang < ang2) || (ang > ang1 && ang < 2.0f*CV_PI))) {
-        sumX += resX[k];
-        sumY += resY[k];
-      }
-    }

-    // if the vector produced from this window is longer than all
-    // previous vectors then this forms the new dominant direction
-    if (sumX*sumX + sumY*sumY > max) {
-      // store largest orientation
-      max = sumX*sumX + sumY*sumY;
-      kpt.angle = getAngle(sumX, sumY) * 180.f / static_cast<float>(CV_PI);
+  for (int sn = slices - win + 1; sn < slices; sn++) {
+    int remain = sn + win - slices;
+
+    if (slice[sn] == slice[sn - 1] && slice[remain] == slice[remain - 1])
+      continue;
+
+    float sumX = 0.0f, sumY = 0.0f;
+    for (int i = slice[sn]; i < slice[slices]; i++) {
+      const int idx = sorted_idx[i];
+      sumX += resX[idx];
+      sumY += resY[idx];
+    }
+    for (int i = slice[0]; i < slice[remain]; i++) {
+      const int idx = sorted_idx[i];
+      sumX += resX[idx];
+      sumY += resY[idx];
     }
+
+    float norm = sumX * sumX + sumY * sumY;
+    if (norm > maxNorm)
+        maxNorm = norm, maxX = sumX, maxY = sumY;
   }
+
+  // Store the final result
+  kpt.angle = fastAtan2(maxY, maxX);
 }

-/* ************************************************************************* */
+class ComputeKeypointOrientation : public ParallelLoopBody
+{
+public:
+  ComputeKeypointOrientation(std::vector<KeyPoint>& kpts,
+                             const std::vector<Evolution>& evolution)
+    : keypoints_(&kpts)
+    , evolution_(&evolution)
+  {
+  }
+
+  void operator() (const Range& range) const
+  {
+    for (int i = range.start; i < range.end; i++)
+    {
+      Compute_Main_Orientation((*keypoints_)[i], *evolution_);
+    }
+  }
+private:
+  std::vector<KeyPoint>* keypoints_;
+  const std::vector<Evolution>* evolution_;
+};
+
 /**
  * @brief This method computes the main orientation for a given keypoints
  * @param kpts Input keypoints
  */
 void AKAZEFeatures::Compute_Keypoints_Orientation(std::vector<KeyPoint>& kpts) const
 {
-     for(size_t i = 0; i < kpts.size(); i++)
-         Compute_Main_Orientation(kpts[i], evolution_);
+  CV_INSTRUMENT_REGION()
+
+  parallel_for_(Range(0, (int)kpts.size()), ComputeKeypointOrientation(kpts, evolution_));
 }

 /* ************************************************************************* */
@@ -865,7 +1405,10 @@ void AKAZEFeatures::Compute_Keypoints_Orientation(std::vector<KeyPoint>& kpts) c
  * from Agrawal et al., CenSurE: Center Surround Extremas for Realtime Feature Detection and Matching,
  * ECCV 2008
  */
-void MSURF_Upright_Descriptor_64_Invoker::Get_MSURF_Upright_Descriptor_64(const KeyPoint& kpt, float *desc) const {
+void MSURF_Upright_Descriptor_64_Invoker::Get_MSURF_Upright_Descriptor_64(const KeyPoint& kpt, float *desc, int desc_size) const {
+
+  const int dsize = 64;
+  CV_Assert(desc_size == dsize);

   float dx = 0.0, dy = 0.0, mdx = 0.0, mdy = 0.0, gauss_s1 = 0.0, gauss_s2 = 0.0;
   float rx = 0.0, ry = 0.0, len = 0.0, xf = 0.0, yf = 0.0, ys = 0.0, xs = 0.0;
@@ -873,22 +1416,23 @@ void MSURF_Upright_Descriptor_64_Invoker::Get_MSURF_Upright_Descriptor_64(const
   int x1 = 0, y1 = 0, sample_step = 0, pattern_size = 0;
   int x2 = 0, y2 = 0, kx = 0, ky = 0, i = 0, j = 0, dcount = 0;
   float fx = 0.0, fy = 0.0, ratio = 0.0, res1 = 0.0, res2 = 0.0, res3 = 0.0, res4 = 0.0;
-  int scale = 0, dsize = 0, level = 0;
+  int scale = 0;

   // Subregion centers for the 4x4 gaussian weighting
   float cx = -0.5f, cy = 0.5f;

-  const std::vector<TEvolution>& evolution = *evolution_;
+  const std::vector<Evolution>& evolution = *evolution_;

   // Set the descriptor size and the sample and pattern sizes
-  dsize = 64;
   sample_step = 5;
   pattern_size = 12;

   // Get the information from the keypoint
   ratio = (float)(1 << kpt.octave);
-  scale = fRound(0.5f*kpt.size / ratio);
-  level = kpt.class_id;
+  scale = cvRound(0.5f*kpt.size / ratio);
+  const int level = kpt.class_id;
+  const Mat Lx = evolution[level].Lx;
+  const Mat Ly = evolution[level].Ly;
   yf = kpt.pt.y / ratio;
   xf = kpt.pt.x / ratio;

@@ -922,25 +1466,28 @@ void MSURF_Upright_Descriptor_64_Invoker::Get_MSURF_Upright_Descriptor_64(const
           //Get the gaussian weighted x and y responses
           gauss_s1 = gaussian(xs - sample_x, ys - sample_y, 2.50f*scale);

-          y1 = (int)(sample_y - .5);
-          x1 = (int)(sample_x - .5);
+          y1 = cvFloor(sample_y);
+          x1 = cvFloor(sample_x);
+
+          y2 = y1 + 1;
+          x2 = x1 + 1;

-          y2 = (int)(sample_y + .5);
-          x2 = (int)(sample_x + .5);
+          if (x1 < 0 || y1 < 0 || x2 >= Lx.cols || y2 >= Lx.rows)
+              continue; // FIXIT Boundaries

           fx = sample_x - x1;
           fy = sample_y - y1;

-          res1 = *(evolution[level].Lx.ptr<float>(y1)+x1);
-          res2 = *(evolution[level].Lx.ptr<float>(y1)+x2);
-          res3 = *(evolution[level].Lx.ptr<float>(y2)+x1);
-          res4 = *(evolution[level].Lx.ptr<float>(y2)+x2);
+          res1 = Lx.at<float>(y1, x1);
+          res2 = Lx.at<float>(y1, x2);
+          res3 = Lx.at<float>(y2, x1);
+          res4 = Lx.at<float>(y2, x2);
           rx = (1.0f - fx)*(1.0f - fy)*res1 + fx*(1.0f - fy)*res2 + (1.0f - fx)*fy*res3 + fx*fy*res4;

-          res1 = *(evolution[level].Ly.ptr<float>(y1)+x1);
-          res2 = *(evolution[level].Ly.ptr<float>(y1)+x2);
-          res3 = *(evolution[level].Ly.ptr<float>(y2)+x1);
-          res4 = *(evolution[level].Ly.ptr<float>(y2)+x2);
+          res1 = Ly.at<float>(y1, x1);
+          res2 = Ly.at<float>(y1, x2);
+          res3 = Ly.at<float>(y2, x1);
+          res4 = Ly.at<float>(y2, x2);
           ry = (1.0f - fx)*(1.0f - fy)*res1 + fx*(1.0f - fy)*res2 + (1.0f - fx)*fy*res3 + fx*fy*res4;

           rx = gauss_s1*rx;
@@ -970,11 +1517,14 @@ void MSURF_Upright_Descriptor_64_Invoker::Get_MSURF_Upright_Descriptor_64(const
     i += 9;
   }

+  CV_Assert(dcount == desc_size);
+
   // convert to unit vector
   len = sqrt(len);

+  const float len_inv = 1.0f / len;
   for (i = 0; i < dsize; i++) {
-    desc[i] /= len;
+    desc[i] *= len_inv;
   }
 }

@@ -988,7 +1538,10 @@ void MSURF_Upright_Descriptor_64_Invoker::Get_MSURF_Upright_Descriptor_64(const
  * from Agrawal et al., CenSurE: Center Surround Extremas for Realtime Feature Detection and Matching,
  * ECCV 2008
  */
-void MSURF_Descriptor_64_Invoker::Get_MSURF_Descriptor_64(const KeyPoint& kpt, float *desc) const {
+void MSURF_Descriptor_64_Invoker::Get_MSURF_Descriptor_64(const KeyPoint& kpt, float *desc, int desc_size) const {
+
+  const int dsize = 64;
+  CV_Assert(desc_size == dsize);

   float dx = 0.0, dy = 0.0, mdx = 0.0, mdy = 0.0, gauss_s1 = 0.0, gauss_s2 = 0.0;
   float rx = 0.0, ry = 0.0, rrx = 0.0, rry = 0.0, len = 0.0, xf = 0.0, yf = 0.0, ys = 0.0, xs = 0.0;
@@ -996,23 +1549,24 @@ void MSURF_Descriptor_64_Invoker::Get_MSURF_Descriptor_64(const KeyPoint& kpt, f
   float fx = 0.0, fy = 0.0, ratio = 0.0, res1 = 0.0, res2 = 0.0, res3 = 0.0, res4 = 0.0;
   int x1 = 0, y1 = 0, x2 = 0, y2 = 0, sample_step = 0, pattern_size = 0;
   int kx = 0, ky = 0, i = 0, j = 0, dcount = 0;
-  int scale = 0, dsize = 0, level = 0;
+  int scale = 0;

   // Subregion centers for the 4x4 gaussian weighting
   float cx = -0.5f, cy = 0.5f;

-  const std::vector<TEvolution>& evolution = *evolution_;
+  const std::vector<Evolution>& evolution = *evolution_;

   // Set the descriptor size and the sample and pattern sizes
-  dsize = 64;
   sample_step = 5;
   pattern_size = 12;

   // Get the information from the keypoint
   ratio = (float)(1 << kpt.octave);
-  scale = fRound(0.5f*kpt.size / ratio);
-  angle = (kpt.angle * static_cast<float>(CV_PI)) / 180.f;
-  level = kpt.class_id;
+  scale = cvRound(0.5f*kpt.size / ratio);
+  angle = kpt.angle * static_cast<float>(CV_PI / 180.f);
+  const int level = kpt.class_id;
+  const Mat Lx = evolution[level].Lx;
+  const Mat Ly = evolution[level].Ly;
   yf = kpt.pt.y / ratio;
   xf = kpt.pt.x / ratio;
   co = cos(angle);
@@ -1049,25 +1603,28 @@ void MSURF_Descriptor_64_Invoker::Get_MSURF_Descriptor_64(const KeyPoint& kpt, f
           // Get the gaussian weighted x and y responses
           gauss_s1 = gaussian(xs - sample_x, ys - sample_y, 2.5f*scale);

-          y1 = fRound(sample_y - 0.5f);
-          x1 = fRound(sample_x - 0.5f);
+          y1 = cvFloor(sample_y);
+          x1 = cvFloor(sample_x);
+
+          y2 = y1 + 1;
+          x2 = x1 + 1;

-          y2 = fRound(sample_y + 0.5f);
-          x2 = fRound(sample_x + 0.5f);
+          if (x1 < 0 || y1 < 0 || x2 >= Lx.cols || y2 >= Lx.rows)
+              continue; // FIXIT Boundaries

           fx = sample_x - x1;
           fy = sample_y - y1;

-          res1 = *(evolution[level].Lx.ptr<float>(y1)+x1);
-          res2 = *(evolution[level].Lx.ptr<float>(y1)+x2);
-          res3 = *(evolution[level].Lx.ptr<float>(y2)+x1);
-          res4 = *(evolution[level].Lx.ptr<float>(y2)+x2);
+          res1 = Lx.at<float>(y1, x1);
+          res2 = Lx.at<float>(y1, x2);
+          res3 = Lx.at<float>(y2, x1);
+          res4 = Lx.at<float>(y2, x2);
           rx = (1.0f - fx)*(1.0f - fy)*res1 + fx*(1.0f - fy)*res2 + (1.0f - fx)*fy*res3 + fx*fy*res4;

-          res1 = *(evolution[level].Ly.ptr<float>(y1)+x1);
-          res2 = *(evolution[level].Ly.ptr<float>(y1)+x2);
-          res3 = *(evolution[level].Ly.ptr<float>(y2)+x1);
-          res4 = *(evolution[level].Ly.ptr<float>(y2)+x2);
+          res1 = Ly.at<float>(y1, x1);
+          res2 = Ly.at<float>(y1, x2);
+          res3 = Ly.at<float>(y2, x1);
+          res4 = Ly.at<float>(y2, x2);
           ry = (1.0f - fx)*(1.0f - fy)*res1 + fx*(1.0f - fy)*res2 + (1.0f - fx)*fy*res3 + fx*fy*res4;

           // Get the x and y derivatives on the rotated axis
@@ -1097,11 +1654,14 @@ void MSURF_Descriptor_64_Invoker::Get_MSURF_Descriptor_64(const KeyPoint& kpt, f
     i += 9;
   }

+  CV_Assert(dcount == desc_size);
+
   // convert to unit vector
   len = sqrt(len);

+  const float len_inv = 1.0f / len;
   for (i = 0; i < dsize; i++) {
-    desc[i] /= len;
+    desc[i] *= len_inv;
   }
 }

@@ -1112,244 +1672,145 @@ void MSURF_Descriptor_64_Invoker::Get_MSURF_Descriptor_64(const KeyPoint& kpt, f
  * @param kpt Input keypoint
  * @param desc Descriptor vector
  */
-void Upright_MLDB_Full_Descriptor_Invoker::Get_Upright_MLDB_Full_Descriptor(const KeyPoint& kpt, unsigned char *desc) const {
-
-  float di = 0.0, dx = 0.0, dy = 0.0;
-  float ri = 0.0, rx = 0.0, ry = 0.0, xf = 0.0, yf = 0.0;
-  float sample_x = 0.0, sample_y = 0.0, ratio = 0.0;
-  int x1 = 0, y1 = 0, sample_step = 0, pattern_size = 0;
-  int level = 0, nsamples = 0, scale = 0;
-  int dcount1 = 0, dcount2 = 0;
+void Upright_MLDB_Full_Descriptor_Invoker::Get_Upright_MLDB_Full_Descriptor(const KeyPoint& kpt, unsigned char *desc, int desc_size) const {

   const AKAZEOptions & options = *options_;
-  const std::vector<TEvolution>& evolution = *evolution_;
+  const std::vector<Evolution>& evolution = *evolution_;

-  // Matrices for the M-LDB descriptor
-  Mat values_1 = Mat::zeros(4, options.descriptor_channels, CV_32FC1);
-  Mat values_2 = Mat::zeros(9, options.descriptor_channels, CV_32FC1);
-  Mat values_3 = Mat::zeros(16, options.descriptor_channels, CV_32FC1);
+  // Buffer for the M-LDB descriptor
+  const int max_channels = 3;
+  CV_Assert(options.descriptor_channels <= max_channels);
+  float values[16*max_channels];

   // Get the information from the keypoint
-  ratio = (float)(1 << kpt.octave);
-  scale = fRound(0.5f*kpt.size / ratio);
-  level = kpt.class_id;
-  yf = kpt.pt.y / ratio;
-  xf = kpt.pt.x / ratio;
-
-  // First 2x2 grid
-  pattern_size = options_->descriptor_pattern_size;
-  sample_step = pattern_size;
-
-  for (int i = -pattern_size; i < pattern_size; i += sample_step) {
-    for (int j = -pattern_size; j < pattern_size; j += sample_step) {
-      di = dx = dy = 0.0;
-      nsamples = 0;
-
-      for (int k = i; k < i + sample_step; k++) {
-        for (int l = j; l < j + sample_step; l++) {
-
-          // Get the coordinates of the sample point
-          sample_y = yf + l*scale;
-          sample_x = xf + k*scale;
-
-          y1 = fRound(sample_y);
-          x1 = fRound(sample_x);
-
-          ri = *(evolution[level].Lt.ptr<float>(y1)+x1);
-          rx = *(evolution[level].Lx.ptr<float>(y1)+x1);
-          ry = *(evolution[level].Ly.ptr<float>(y1)+x1);
-
-          di += ri;
-          dx += rx;
-          dy += ry;
-          nsamples++;
+  const float ratio = (float)(1 << kpt.octave);
+  const int scale = cvRound(0.5f*kpt.size / ratio);
+  const int level = kpt.class_id;
+  const Mat Lx = evolution[level].Lx;
+  const Mat Ly = evolution[level].Ly;
+  const Mat Lt = evolution[level].Lt;
+  const float yf = kpt.pt.y / ratio;
+  const float xf = kpt.pt.x / ratio;
+
+  // For 2x2 grid, 3x3 grid and 4x4 grid
+  const int pattern_size = options_->descriptor_pattern_size;
+  CV_Assert((pattern_size & 1) == 0);
+  const int sample_step[3] = {
+    pattern_size,
+    divUp(pattern_size * 2, 3),
+    divUp(pattern_size, 2)
+  };
+
+  memset(desc, 0, desc_size);
+
+  // For the three grids
+  int dcount1 = 0;
+  for (int z = 0; z < 3; z++) {
+    int dcount2 = 0;
+    const int step = sample_step[z];
+    for (int i = -pattern_size; i < pattern_size; i += step) {
+      for (int j = -pattern_size; j < pattern_size; j += step) {
+        float di = 0.0, dx = 0.0, dy = 0.0;
+
+        int nsamples = 0;
+        for (int k = 0; k < step; k++) {
+          for (int l = 0; l < step; l++) {
+
+            // Get the coordinates of the sample point
+            const float sample_y = yf + (l+j)*scale;
+            const float sample_x = xf + (k+i)*scale;
+
+            const int y1 = cvRound(sample_y);
+            const int x1 = cvRound(sample_x);
+
+            if (y1 < 0 || y1 >= Lt.rows || x1 < 0 || x1 >= Lt.cols)
+                continue; // Boundaries
+
+            const float ri = Lt.at<float>(y1, x1);
+            const float rx = Lx.at<float>(y1, x1);
+            const float ry = Ly.at<float>(y1, x1);
+
+            di += ri;
+            dx += rx;
+            dy += ry;
+            nsamples++;
+          }
         }
-      }
-
-      di /= nsamples;
-      dx /= nsamples;
-      dy /= nsamples;
-
-      *(values_1.ptr<float>(dcount2)) = di;
-      *(values_1.ptr<float>(dcount2)+1) = dx;
-      *(values_1.ptr<float>(dcount2)+2) = dy;
-      dcount2++;
-    }
-  }
-
-  // Do binary comparison first level
-  for (int i = 0; i < 4; i++) {
-    for (int j = i + 1; j < 4; j++) {
-      if (*(values_1.ptr<float>(i)) > *(values_1.ptr<float>(j))) {
-        desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-      }
-      dcount1++;
-
-      if (*(values_1.ptr<float>(i)+1) > *(values_1.ptr<float>(j)+1)) {
-        desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-      }
-      dcount1++;
-
-      if (*(values_1.ptr<float>(i)+2) > *(values_1.ptr<float>(j)+2)) {
-        desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-      }
-      dcount1++;
-    }
-  }
-
-  // Second 3x3 grid
-  sample_step = static_cast<int>(ceil(pattern_size*2. / 3.));
-  dcount2 = 0;

-  for (int i = -pattern_size; i < pattern_size; i += sample_step) {
-    for (int j = -pattern_size; j < pattern_size; j += sample_step) {
-      di = dx = dy = 0.0;
-      nsamples = 0;
-
-      for (int k = i; k < i + sample_step; k++) {
-        for (int l = j; l < j + sample_step; l++) {
-
-          // Get the coordinates of the sample point
-          sample_y = yf + l*scale;
-          sample_x = xf + k*scale;
-
-          y1 = fRound(sample_y);
-          x1 = fRound(sample_x);
-
-          ri = *(evolution[level].Lt.ptr<float>(y1)+x1);
-          rx = *(evolution[level].Lx.ptr<float>(y1)+x1);
-          ry = *(evolution[level].Ly.ptr<float>(y1)+x1);
-
-          di += ri;
-          dx += rx;
-          dy += ry;
-          nsamples++;
+        if (nsamples > 0)
+        {
+            const float nsamples_inv = 1.0f / nsamples;
+            di *= nsamples_inv;
+            dx *= nsamples_inv;
+            dy *= nsamples_inv;
         }
-      }
-
-      di /= nsamples;
-      dx /= nsamples;
-      dy /= nsamples;
-
-      *(values_2.ptr<float>(dcount2)) = di;
-      *(values_2.ptr<float>(dcount2)+1) = dx;
-      *(values_2.ptr<float>(dcount2)+2) = dy;
-      dcount2++;
-    }
-  }

-  //Do binary comparison second level
-  dcount2 = 0;
-  for (int i = 0; i < 9; i++) {
-    for (int j = i + 1; j < 9; j++) {
-      if (*(values_2.ptr<float>(i)) > *(values_2.ptr<float>(j))) {
-        desc[dcount1 / 8] |= (1 << (dcount1 % 8));
+        float *val = &values[dcount2*max_channels];
+        *(val) = di;
+        *(val+1) = dx;
+        *(val+2) = dy;
+        dcount2++;
       }
-      dcount1++;
-
-      if (*(values_2.ptr<float>(i)+1) > *(values_2.ptr<float>(j)+1)) {
-        desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-      }
-      dcount1++;
-
-      if (*(values_2.ptr<float>(i)+2) > *(values_2.ptr<float>(j)+2)) {
-        desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-      }
-      dcount1++;
     }
-  }

-  // Third 4x4 grid
-  sample_step = pattern_size / 2;
-  dcount2 = 0;
-
-  for (int i = -pattern_size; i < pattern_size; i += sample_step) {
-    for (int j = -pattern_size; j < pattern_size; j += sample_step) {
-      di = dx = dy = 0.0;
-      nsamples = 0;
-
-      for (int k = i; k < i + sample_step; k++) {
-        for (int l = j; l < j + sample_step; l++) {
-
-          // Get the coordinates of the sample point
-          sample_y = yf + l*scale;
-          sample_x = xf + k*scale;
-
-          y1 = fRound(sample_y);
-          x1 = fRound(sample_x);
-
-          ri = *(evolution[level].Lt.ptr<float>(y1)+x1);
-          rx = *(evolution[level].Lx.ptr<float>(y1)+x1);
-          ry = *(evolution[level].Ly.ptr<float>(y1)+x1);
-
-          di += ri;
-          dx += rx;
-          dy += ry;
-          nsamples++;
+    // Do binary comparison
+    const int num = (z + 2) * (z + 2);
+    for (int i = 0; i < num; i++) {
+      for (int j = i + 1; j < num; j++) {
+        const float * valI = &values[i*max_channels];
+        const float * valJ = &values[j*max_channels];
+        for (int k = 0; k < 3; ++k) {
+          if (*(valI + k) > *(valJ + k)) {
+            desc[dcount1 / 8] |= (1 << (dcount1 % 8));
+          }
+          dcount1++;
         }
       }
-
-      di /= nsamples;
-      dx /= nsamples;
-      dy /= nsamples;
-
-      *(values_3.ptr<float>(dcount2)) = di;
-      *(values_3.ptr<float>(dcount2)+1) = dx;
-      *(values_3.ptr<float>(dcount2)+2) = dy;
-      dcount2++;
     }
-  }

-  //Do binary comparison third level
-  dcount2 = 0;
-  for (int i = 0; i < 16; i++) {
-    for (int j = i + 1; j < 16; j++) {
-      if (*(values_3.ptr<float>(i)) > *(values_3.ptr<float>(j))) {
-        desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-      }
-      dcount1++;
-
-      if (*(values_3.ptr<float>(i)+1) > *(values_3.ptr<float>(j)+1)) {
-        desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-      }
-      dcount1++;
+  } // for (int z = 0; z < 3; z++)

-      if (*(values_3.ptr<float>(i)+2) > *(values_3.ptr<float>(j)+2)) {
-        desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-      }
-      dcount1++;
-    }
-  }
+  CV_Assert(dcount1 <= desc_size*8);
+  CV_Assert(divUp(dcount1, 8) == desc_size);
 }

-void MLDB_Full_Descriptor_Invoker::MLDB_Fill_Values(float* values, int sample_step, int level,
+void MLDB_Full_Descriptor_Invoker::MLDB_Fill_Values(float* values, int sample_step, const int level,
                                                     float xf, float yf, float co, float si, float scale) const
 {
-    const std::vector<TEvolution>& evolution = *evolution_;
+    const std::vector<Evolution>& evolution = *evolution_;
     int pattern_size = options_->descriptor_pattern_size;
     int chan = options_->descriptor_channels;
-    int valpos = 0;
+    const Mat Lx = evolution[level].Lx;
+    const Mat Ly = evolution[level].Ly;
+    const Mat Lt = evolution[level].Lt;
+
+    const Size size = Lt.size();
+    CV_Assert(size == Lx.size());
+    CV_Assert(size == Ly.size());

+    int valpos = 0;
     for (int i = -pattern_size; i < pattern_size; i += sample_step) {
         for (int j = -pattern_size; j < pattern_size; j += sample_step) {
-            float di, dx, dy;
-            di = dx = dy = 0.0;
-            int nsamples = 0;
+            float di = 0.0f, dx = 0.0f, dy = 0.0f;

+            int nsamples = 0;
             for (int k = i; k < i + sample_step; k++) {
               for (int l = j; l < j + sample_step; l++) {
                 float sample_y = yf + (l*co * scale + k*si*scale);
                 float sample_x = xf + (-l*si * scale + k*co*scale);

-                int y1 = fRound(sample_y);
-                int x1 = fRound(sample_x);
+                int y1 = cvRound(sample_y);
+                int x1 = cvRound(sample_x);

-                float ri = *(evolution[level].Lt.ptr<float>(y1)+x1);
+                if (y1 < 0 || y1 >= Lt.rows || x1 < 0 || x1 >= Lt.cols)
+                    continue; // Boundaries
+
+                float ri = Lt.at<float>(y1, x1);
                 di += ri;

                 if(chan > 1) {
-                    float rx = *(evolution[level].Lx.ptr<float>(y1)+x1);
-                    float ry = *(evolution[level].Ly.ptr<float>(y1)+x1);
+                    float rx = Lx.at<float>(y1, x1);
+                    float ry = Ly.at<float>(y1, x1);
                     if (chan == 2) {
                       dx += sqrtf(rx*rx + ry*ry);
                     }
@@ -1363,20 +1824,25 @@ void MLDB_Full_Descriptor_Invoker::MLDB_Fill_Values(float* values, int sample_st
                 nsamples++;
               }
             }
-            di /= nsamples;
-            dx /= nsamples;
-            dy /= nsamples;
+
+            if (nsamples > 0)
+            {
+                const float nsamples_inv = 1.0f / nsamples;
+                di *= nsamples_inv;
+                dx *= nsamples_inv;
+                dy *= nsamples_inv;
+            }

             values[valpos] = di;
             if (chan > 1) {
                 values[valpos + 1] = dx;
             }
             if (chan > 2) {
-              values[valpos + 2] = dy;
+                values[valpos + 2] = dy;
             }
             valpos += chan;
-          }
         }
+    }
 }

 void MLDB_Full_Descriptor_Invoker::MLDB_Binary_Comparisons(float* values, unsigned char* desc,
@@ -1391,8 +1857,9 @@ void MLDB_Full_Descriptor_Invoker::MLDB_Binary_Comparisons(float* values, unsign
         for (int i = 0; i < count; i++) {
             int ival = ivalues[chan * i + pos];
             for (int j = i + 1; j < count; j++) {
-                int res = ival > ivalues[chan * j + pos];
-                desc[dpos >> 3] |= (res << (dpos & 7));
+                if (ival > ivalues[chan * j + pos]) {
+                  desc[dpos >> 3] |= (1 << (dpos & 7));
+                }
                 dpos++;
             }
         }
@@ -1406,30 +1873,41 @@ void MLDB_Full_Descriptor_Invoker::MLDB_Binary_Comparisons(float* values, unsign
  * @param kpt Input keypoint
  * @param desc Descriptor vector
  */
-void MLDB_Full_Descriptor_Invoker::Get_MLDB_Full_Descriptor(const KeyPoint& kpt, unsigned char *desc) const {
+void MLDB_Full_Descriptor_Invoker::Get_MLDB_Full_Descriptor(const KeyPoint& kpt, unsigned char *desc, int desc_size) const {

   const int max_channels = 3;
   CV_Assert(options_->descriptor_channels <= max_channels);
+  const int pattern_size = options_->descriptor_pattern_size;
+
   float values[16*max_channels];
-  const double size_mult[3] = {1, 2.0/3.0, 1.0/2.0};
+  CV_Assert((pattern_size & 1) == 0);
+  //const double size_mult[3] = {1, 2.0/3.0, 1.0/2.0};
+  const int sample_step[3] = { // static_cast<int>(ceil(pattern_size * size_mult[lvl]))
+    pattern_size,
+    divUp(pattern_size * 2, 3),
+    divUp(pattern_size, 2)
+  };

   float ratio = (float)(1 << kpt.octave);
-  float scale = (float)fRound(0.5f*kpt.size / ratio);
+  float scale = (float)cvRound(0.5f*kpt.size / ratio);
   float xf = kpt.pt.x / ratio;
   float yf = kpt.pt.y / ratio;
-  float angle = (kpt.angle * static_cast<float>(CV_PI)) / 180.f;
+  float angle = kpt.angle * static_cast<float>(CV_PI / 180.f);
   float co = cos(angle);
   float si = sin(angle);
-  int pattern_size = options_->descriptor_pattern_size;

-  int dpos = 0;
-  for(int lvl = 0; lvl < 3; lvl++) {
+  memset(desc, 0, desc_size);

+  int dpos = 0;
+  for(int lvl = 0; lvl < 3; lvl++)
+  {
       int val_count = (lvl + 2) * (lvl + 2);
-      int sample_step = static_cast<int>(ceil(pattern_size * size_mult[lvl]));
-      MLDB_Fill_Values(values, sample_step, kpt.class_id, xf, yf, co, si, scale);
+      MLDB_Fill_Values(values, sample_step[lvl], kpt.class_id, xf, yf, co, si, scale);
       MLDB_Binary_Comparisons(values, desc, val_count, dpos);
   }
+
+  CV_Assert(dpos == 486);
+  CV_Assert(divUp(dpos, 8) == desc_size);
 }

 /* ************************************************************************* */
@@ -1440,41 +1918,48 @@ void MLDB_Full_Descriptor_Invoker::Get_MLDB_Full_Descriptor(const KeyPoint& kpt,
  * @param kpt Input keypoint
  * @param desc Descriptor vector
  */
-void MLDB_Descriptor_Subset_Invoker::Get_MLDB_Descriptor_Subset(const KeyPoint& kpt, unsigned char *desc) const {
+void MLDB_Descriptor_Subset_Invoker::Get_MLDB_Descriptor_Subset(const KeyPoint& kpt, unsigned char *desc, int desc_size) const {

-  float di = 0.f, dx = 0.f, dy = 0.f;
   float rx = 0.f, ry = 0.f;
   float sample_x = 0.f, sample_y = 0.f;
-  int x1 = 0, y1 = 0;

   const AKAZEOptions & options = *options_;
-  const std::vector<TEvolution>& evolution = *evolution_;
+  const std::vector<Evolution>& evolution = *evolution_;

   // Get the information from the keypoint
   float ratio = (float)(1 << kpt.octave);
-  int scale = fRound(0.5f*kpt.size / ratio);
-  float angle = (kpt.angle * static_cast<float>(CV_PI)) / 180.f;
-  int level = kpt.class_id;
+  int scale = cvRound(0.5f*kpt.size / ratio);
+  float angle = kpt.angle * static_cast<float>(CV_PI / 180.f);
+  const int level = kpt.class_id;
+  const Mat Lx = evolution[level].Lx;
+  const Mat Ly = evolution[level].Ly;
+  const Mat Lt = evolution[level].Lt;
   float yf = kpt.pt.y / ratio;
   float xf = kpt.pt.x / ratio;
   float co = cos(angle);
   float si = sin(angle);

   // Allocate memory for the matrix of values
-  Mat values = Mat_<float>::zeros((4 + 9 + 16)*options.descriptor_channels, 1);
+  // Buffer for the M-LDB descriptor
+  const int max_channels = 3;
+  const int channels = options.descriptor_channels;
+  CV_Assert(channels <= max_channels);
+  float values[(4 + 9 + 16)*max_channels] = { 0 };

   // Sample everything, but only do the comparisons
-  vector<int> steps(3);
-  steps.at(0) = options.descriptor_pattern_size;
-  steps.at(1) = (int)ceil(2.f*options.descriptor_pattern_size / 3.f);
-  steps.at(2) = options.descriptor_pattern_size / 2;
+  const int pattern_size = options.descriptor_pattern_size;
+  CV_Assert((pattern_size & 1) == 0);
+  const int sample_steps[3] = {
+    pattern_size,
+    divUp(pattern_size * 2, 3),
+    divUp(pattern_size, 2)
+  };

   for (int i = 0; i < descriptorSamples_.rows; i++) {
     const int *coords = descriptorSamples_.ptr<int>(i);
-    int sample_step = steps.at(coords[0]);
-    di = 0.0f;
-    dx = 0.0f;
-    dy = 0.0f;
+    CV_Assert(coords[0] >= 0 && coords[0] < 3);
+    const int sample_step = sample_steps[coords[0]];
+    float di = 0.f, dx = 0.f, dy = 0.f;

     for (int k = coords[1]; k < coords[1] + sample_step; k++) {
       for (int l = coords[2]; l < coords[2] + sample_step; l++) {
@@ -1483,14 +1968,17 @@ void MLDB_Descriptor_Subset_Invoker::Get_MLDB_Descriptor_Subset(const KeyPoint&
         sample_y = yf + (l*scale*co + k*scale*si);
         sample_x = xf + (-l*scale*si + k*scale*co);

-        y1 = fRound(sample_y);
-        x1 = fRound(sample_x);
+        const int y1 = cvRound(sample_y);
+        const int x1 = cvRound(sample_x);
+
+        if (x1 < 0 || y1 < 0 || x1 >= Lt.cols || y1 >= Lt.rows)
+            continue; // Boundaries

-        di += *(evolution[level].Lt.ptr<float>(y1)+x1);
+        di += Lt.at<float>(y1, x1);

         if (options.descriptor_channels > 1) {
-          rx = *(evolution[level].Lx.ptr<float>(y1)+x1);
-          ry = *(evolution[level].Ly.ptr<float>(y1)+x1);
+          rx = Lx.at<float>(y1, x1);
+          ry = Ly.at<float>(y1, x1);

           if (options.descriptor_channels == 2) {
             dx += sqrtf(rx*rx + ry*ry);
@@ -1504,23 +1992,26 @@ void MLDB_Descriptor_Subset_Invoker::Get_MLDB_Descriptor_Subset(const KeyPoint&
       }
     }

-    *(values.ptr<float>(options.descriptor_channels*i)) = di;
+    float* pValues = &values[channels * i];
+    pValues[0] = di;

-    if (options.descriptor_channels == 2) {
-      *(values.ptr<float>(options.descriptor_channels*i + 1)) = dx;
+    if (channels == 2) {
+      pValues[1] = dx;
     }
-    else if (options.descriptor_channels == 3) {
-      *(values.ptr<float>(options.descriptor_channels*i + 1)) = dx;
-      *(values.ptr<float>(options.descriptor_channels*i + 2)) = dy;
+    else if (channels == 3) {
+      pValues[1] = dx;
+      pValues[2] = dy;
     }
   }

   // Do the comparisons
-  const float *vals = values.ptr<float>(0);
   const int *comps = descriptorBits_.ptr<int>(0);

+  CV_Assert(divUp(descriptorBits_.rows, 8) == desc_size);
+  memset(desc, 0, desc_size);
+
   for (int i = 0; i<descriptorBits_.rows; i++) {
-    if (vals[comps[2 * i]] > vals[comps[2 * i + 1]]) {
+    if (values[comps[2 * i]] > values[comps[2 * i + 1]]) {
       desc[i / 8] |= (1 << (i % 8));
     }
   }
@@ -1534,7 +2025,7 @@ void MLDB_Descriptor_Subset_Invoker::Get_MLDB_Descriptor_Subset(const KeyPoint&
  * @param kpt Input keypoint
  * @param desc Descriptor vector
  */
-void Upright_MLDB_Descriptor_Subset_Invoker::Get_Upright_MLDB_Descriptor_Subset(const KeyPoint& kpt, unsigned char *desc) const {
+void Upright_MLDB_Descriptor_Subset_Invoker::Get_Upright_MLDB_Descriptor_Subset(const KeyPoint& kpt, unsigned char *desc, int desc_size) const {

   float di = 0.0f, dx = 0.0f, dy = 0.0f;
   float rx = 0.0f, ry = 0.0f;
@@ -1542,26 +2033,36 @@ void Upright_MLDB_Descriptor_Subset_Invoker::Get_Upright_MLDB_Descriptor_Subset(
   int x1 = 0, y1 = 0;

   const AKAZEOptions & options = *options_;
-  const std::vector<TEvolution>& evolution = *evolution_;
+  const std::vector<Evolution>& evolution = *evolution_;

   // Get the information from the keypoint
   float ratio = (float)(1 << kpt.octave);
-  int scale = fRound(0.5f*kpt.size / ratio);
-  int level = kpt.class_id;
+  int scale = cvRound(0.5f*kpt.size / ratio);
+  const int level = kpt.class_id;
+  const Mat Lx = evolution[level].Lx;
+  const Mat Ly = evolution[level].Ly;
+  const Mat Lt = evolution[level].Lt;
   float yf = kpt.pt.y / ratio;
   float xf = kpt.pt.x / ratio;

   // Allocate memory for the matrix of values
-  Mat values = Mat_<float>::zeros((4 + 9 + 16)*options.descriptor_channels, 1);
-
-  vector<int> steps(3);
-  steps.at(0) = options.descriptor_pattern_size;
-  steps.at(1) = static_cast<int>(ceil(2.f*options.descriptor_pattern_size / 3.f));
-  steps.at(2) = options.descriptor_pattern_size / 2;
+  const int max_channels = 3;
+  const int channels = options.descriptor_channels;
+  CV_Assert(channels <= max_channels);
+  float values[(4 + 9 + 16)*max_channels] = { 0 };
+
+  const int pattern_size = options.descriptor_pattern_size;
+  CV_Assert((pattern_size & 1) == 0);
+  const int sample_steps[3] = {
+    pattern_size,
+    divUp(pattern_size * 2, 3),
+    divUp(pattern_size, 2)
+  };

   for (int i = 0; i < descriptorSamples_.rows; i++) {
     const int *coords = descriptorSamples_.ptr<int>(i);
-    int sample_step = steps.at(coords[0]);
+    CV_Assert(coords[0] >= 0 && coords[0] < 3);
+    int sample_step = sample_steps[coords[0]];
     di = 0.0f, dx = 0.0f, dy = 0.0f;

     for (int k = coords[1]; k < coords[1] + sample_step; k++) {
@@ -1571,13 +2072,17 @@ void Upright_MLDB_Descriptor_Subset_Invoker::Get_Upright_MLDB_Descriptor_Subset(
         sample_y = yf + l*scale;
         sample_x = xf + k*scale;

-        y1 = fRound(sample_y);
-        x1 = fRound(sample_x);
-        di += *(evolution[level].Lt.ptr<float>(y1)+x1);
+        y1 = cvRound(sample_y);
+        x1 = cvRound(sample_x);
+
+        if (x1 < 0 || y1 < 0 || x1 >= Lt.cols || y1 >= Lt.rows)
+            continue; // Boundaries
+
+        di += Lt.at<float>(y1, x1);

         if (options.descriptor_channels > 1) {
-          rx = *(evolution[level].Lx.ptr<float>(y1)+x1);
-          ry = *(evolution[level].Ly.ptr<float>(y1)+x1);
+          rx = Lx.at<float>(y1, x1);
+          ry = Ly.at<float>(y1, x1);

           if (options.descriptor_channels == 2) {
             dx += sqrtf(rx*rx + ry*ry);
@@ -1590,23 +2095,26 @@ void Upright_MLDB_Descriptor_Subset_Invoker::Get_Upright_MLDB_Descriptor_Subset(
       }
     }

-    *(values.ptr<float>(options.descriptor_channels*i)) = di;
+    float* pValues = &values[channels * i];
+    pValues[0] = di;

     if (options.descriptor_channels == 2) {
-      *(values.ptr<float>(options.descriptor_channels*i + 1)) = dx;
+      pValues[1] = dx;
     }
     else if (options.descriptor_channels == 3) {
-      *(values.ptr<float>(options.descriptor_channels*i + 1)) = dx;
-      *(values.ptr<float>(options.descriptor_channels*i + 2)) = dy;
+      pValues[1] = dx;
+      pValues[2] = dy;
     }
   }

   // Do the comparisons
-  const float *vals = values.ptr<float>(0);
   const int *comps = descriptorBits_.ptr<int>(0);

+  CV_Assert(divUp(descriptorBits_.rows, 8) == desc_size);
+  memset(desc, 0, desc_size);
+
   for (int i = 0; i<descriptorBits_.rows; i++) {
-    if (vals[comps[2 * i]] > vals[comps[2 * i + 1]]) {
+    if (values[comps[2 * i]] > values[comps[2 * i + 1]]) {
       desc[i / 8] |= (1 << (i % 8));
     }
   }
@@ -1636,7 +2144,8 @@ void generateDescriptorSubsample(Mat& sampleList, Mat& comparisons, int nbits,
   }
   ssz *= nchannels;

-  CV_Assert(nbits <= ssz); // Descriptor size can't be bigger than full descriptor
+  CV_Assert(ssz == 162*nchannels);
+  CV_Assert(nbits <= ssz && "Descriptor size can't be bigger than full descriptor (486 = 162*3 - 3 channels)");

   // Since the full descriptor is usually under 10k elements, we pick
   // the selection from the full matrix.  We take as many samples per
@@ -1647,7 +2156,7 @@ void generateDescriptorSubsample(Mat& sampleList, Mat& comparisons, int nbits,
   for (int i = 0, c = 0; i < 3; i++) {
     int gdiv = i + 2; //grid divisions, per row
     int gsz = gdiv*gdiv;
-    int psz = (int)ceil(2.f*pattern_size / (float)gdiv);
+    int psz = divUp(2*pattern_size, gdiv);

     for (int j = 0; j < gsz; j++) {
       for (int k = j + 1; k < gsz; k++, c++) {
@@ -1660,19 +2169,19 @@ void generateDescriptorSubsample(Mat& sampleList, Mat& comparisons, int nbits,
     }
   }

-  srand(1024);
-  Mat_<int> comps = Mat_<int>(nchannels * (int)ceil(nbits / (float)nchannels), 2);
+  RNG rng(1024);
+  const int npicks = divUp(nbits, nchannels);
+  Mat_<int> comps = Mat_<int>(nchannels * npicks, 2);
   comps = 1000;

   // Select some samples. A sample includes all channels
   int count = 0;
-  int npicks = (int)ceil(nbits / (float)nchannels);
   Mat_<int> samples(29, 3);
   Mat_<int> fullcopy = fullM.clone();
   samples = -1;

   for (int i = 0; i < npicks; i++) {
-    int k = rand() % (fullM.rows - i);
+    int k = rng(fullM.rows - i);
     if (i < 6) {
       // Force use of the coarser grid values and comparisons
       k = i;