Merge pull request #9330 from hrnr:akaze_ocl

[GSOC] Enable OCL for AKAZE (#9330)

* revert e0489cb - re-enable OCL for AKAZE

* deal with conversion internally in AKAZE

* pass InputArray directly to AKAZE to allow distinguishing input Mat/UMat; deal with conversion there
* ensure that keypoint orientations are always computed; prevents misuse of the internal AKAZE class.

* convert internal AKAZE functions to use InputArray/OutputArray

* make internal functions private in AKAZE

* split OCL and CPU paths in AKAZE

* create 2 separate pyramids, 1 for OCL and 1 for CPU
* template functions that use temporaries so they are always stored as the correct type (UMat/Mat); see the sketch after this list

* remove variable used only in the OCL path

it caused an unused variable warning

* update AKAZE documentation

* run ocl version only when ocl is enabled

* add tests for OCL path in AKAZE

* relax condition for keypoint angles
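
A minimal sketch of the templating idea from the bullets above (illustrative only: blur_twice and run_pipeline are made-up names; Mat, UMat, InputArray and ocl::useOpenCL are the real OpenCV APIs):

    #include <opencv2/core.hpp>
    #include <opencv2/core/ocl.hpp>
    #include <opencv2/imgproc.hpp>

    // Temporaries declared as MatType stay UMat on the OCL path (so data can
    // remain on the device between steps) and Mat on the CPU path.
    template <typename MatType>
    static void blur_twice(cv::InputArray src, cv::OutputArray dst)
    {
        MatType tmp;  // cv::UMat or cv::Mat, depending on the instantiation
        cv::GaussianBlur(src, tmp, cv::Size(5, 5), 1.0);
        cv::GaussianBlur(tmp, dst, cv::Size(5, 5), 1.0);
    }

    static void run_pipeline(cv::InputArray image, cv::OutputArray out)
    {
        if (cv::ocl::useOpenCL() && image.isUMat())
            blur_twice<cv::UMat>(image, out);  // temporaries live on the GPU
        else
            blur_twice<cv::Mat>(image, out);
    }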
Jiri Horner 2017-08-16 18:46:11 +02:00 committed by Alexander Alekhin
parent 3a8dbebd37
commit a5b5684670
5 changed files with 341 additions and 175 deletions

View File

@ -658,13 +658,22 @@ public:
CV_WRAP virtual int getDiffusivity() const = 0;
};
/** @brief Class implementing the AKAZE keypoint detector and descriptor extractor, described in @cite ANB13 . :
/** @brief Class implementing the AKAZE keypoint detector and descriptor extractor, described in @cite ANB13.
@note AKAZE descriptors can only be used with KAZE or AKAZE keypoints. Try to avoid using *extract*
and *detect* instead of *operator()* due to performance reasons. .. [ANB13] Fast Explicit Diffusion
for Accelerated Features in Nonlinear Scale Spaces. Pablo F. Alcantarilla, Jesús Nuevo and Adrien
Bartoli. In British Machine Vision Conference (BMVC), Bristol, UK, September 2013.
*/
@details AKAZE descriptors can only be used with KAZE or AKAZE keypoints. This class is thread-safe.
@note When you need descriptors, use Feature2D::detectAndCompute, which
provides better performance. When using Feature2D::detect followed by
Feature2D::compute, the scale space pyramid is computed twice.
@note AKAZE implements T-API. When an image is passed as a UMat, some parts of
the algorithm will use OpenCL.
@note [ANB13] Fast Explicit Diffusion for Accelerated Features in Nonlinear
Scale Spaces. Pablo F. Alcantarilla, Jesús Nuevo and Adrien Bartoli. In
British Machine Vision Conference (BMVC), Bristol, UK, September 2013.
*/
class CV_EXPORTS_W AKAZE : public Feature2D
{
public:
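
An illustrative use of the T-API path described in the notes above (the file name is a placeholder; copyTo mirrors the upload used by the new tests at the end of this commit):

    #include <opencv2/core.hpp>
    #include <opencv2/features2d.hpp>
    #include <opencv2/imgcodecs.hpp>
    #include <vector>

    int main()
    {
        cv::Mat img = cv::imread("scene.png", cv::IMREAD_GRAYSCALE);
        cv::UMat uimg;
        img.copyTo(uimg);  // uploading to a UMat engages the OpenCL stages

        cv::Ptr<cv::AKAZE> akaze = cv::AKAZE::create();
        std::vector<cv::KeyPoint> keypoints;
        cv::UMat descriptors;
        // Preferred entry point: the scale space pyramid is built only once.
        akaze->detectAndCompute(uimg, cv::noArray(), keypoints, descriptors);
        return 0;
    }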

View File

@ -169,38 +169,25 @@ namespace cv
{
CV_INSTRUMENT_REGION()
Mat img = image.getMat();
if (img.channels() > 1)
cvtColor(image, img, COLOR_BGR2GRAY);
Mat img1_32;
if ( img.depth() == CV_32F )
img1_32 = img;
else if ( img.depth() == CV_8U )
img.convertTo(img1_32, CV_32F, 1.0 / 255.0, 0);
else if ( img.depth() == CV_16U )
img.convertTo(img1_32, CV_32F, 1.0 / 65535.0, 0);
CV_Assert( ! img1_32.empty() );
CV_Assert( ! image.empty() );
AKAZEOptions options;
options.descriptor = descriptor;
options.descriptor_channels = descriptor_channels;
options.descriptor_size = descriptor_size;
options.img_width = img.cols;
options.img_height = img.rows;
options.img_width = image.cols();
options.img_height = image.rows();
options.dthreshold = threshold;
options.omax = octaves;
options.nsublevels = sublevels;
options.diffusivity = diffusivity;
AKAZEFeatures impl(options);
impl.Create_Nonlinear_Scale_Space(img1_32);
impl.Create_Nonlinear_Scale_Space(image);
if (!useProvidedKeypoints)
{
impl.Feature_Detection(keypoints);
impl.Compute_Keypoints_Orientation(keypoints);
}
if (!mask.empty())
@ -208,7 +195,7 @@ namespace cv
KeyPointsFilter::runByPixelsMask(keypoints, mask.getMat());
}
if( descriptors.needed() )
if(descriptors.needed())
{
impl.Compute_Descriptors(keypoints, descriptors);

View File

@ -15,10 +15,6 @@
#include <iostream>
#ifdef HAVE_OPENCL // OpenCL is not well supported
#undef HAVE_OPENCL
#endif
// Namespaces
namespace cv
{
@ -75,7 +71,7 @@ void AKAZEFeatures::Allocate_Memory_Evolution(void) {
}
for (int j = 0; j < options_.nsublevels; j++) {
Evolution step;
MEvolution step;
step.size = Size(level_width, level_height);
step.esigma = options_.soffset*pow(2.f, (float)(j) / (float)(options_.nsublevels) + i);
step.sigma_size = cvRound(step.esigma * options_.derivative_factor / power); // In fact sigma_size only depends on j
@ -257,39 +253,41 @@ private:
static inline bool
ocl_non_linear_diffusion_step(InputArray Lt_, InputArray Lf_, OutputArray Lstep_, float step_size)
{
if (!Lt_.isContinuous())
return false;
if(!Lt_.isContinuous())
return false;
UMat Lt = Lt_.getUMat(), Lf = Lf_.getUMat(), Lstep = Lstep_.getUMat();
UMat Lt = Lt_.getUMat();
UMat Lf = Lf_.getUMat();
UMat Lstep = Lstep_.getUMat();
size_t globalSize[] = {(size_t)Lt.cols, (size_t)Lt.rows};
size_t globalSize[] = {(size_t)Lt.cols, (size_t)Lt.rows};
ocl::Kernel ker("AKAZE_nld_step_scalar", ocl::features2d::akaze_oclsrc);
if (ker.empty())
return false;
ocl::Kernel ker("AKAZE_nld_step_scalar", ocl::features2d::akaze_oclsrc);
if( ker.empty() )
return false;
return ker.args(
ocl::KernelArg::ReadOnly(Lt),
ocl::KernelArg::PtrReadOnly(Lf),
ocl::KernelArg::PtrWriteOnly(Lstep),
step_size)
.run(2, globalSize, 0, true);
return ker.args(
ocl::KernelArg::ReadOnly(Lt),
ocl::KernelArg::PtrReadOnly(Lf),
ocl::KernelArg::PtrWriteOnly(Lstep),
step_size).run(2, globalSize, 0, true);
}
#endif // HAVE_OPENCL
static inline void
non_linear_diffusion_step(InputArray Lt, InputArray Lf, OutputArray Lstep, float step_size)
non_linear_diffusion_step(InputArray Lt_, InputArray Lf_, OutputArray Lstep_, float step_size)
{
CV_INSTRUMENT_REGION()
Lstep.create(Lt.size(), Lt.type());
Lstep_.create(Lt_.size(), Lt_.type());
#ifdef HAVE_OPENCL
CV_OCL_RUN(OCL_PERFORMANCE_CHECK(Lstep.isUMat()), ocl_non_linear_diffusion_step(Lt, Lf, Lstep, step_size));
#endif
CV_OCL_RUN(Lt_.isUMat() && Lf_.isUMat() && Lstep_.isUMat(),
ocl_non_linear_diffusion_step(Lt_, Lf_, Lstep_, step_size));
Mat Mstep = Lstep.getMat();
parallel_for_(Range(0, Lt.rows()), NonLinearScalarDiffusionStep(Lt.getMat(), Lf.getMat(), Mstep, step_size));
Mat Lt = Lt_.getMat();
Mat Lf = Lf_.getMat();
Mat Lstep = Lstep_.getMat();
parallel_for_(Range(0, Lt.rows), NonLinearScalarDiffusionStep(Lt, Lf, Lstep, step_size));
}
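
For orientation, each call of this function performs one explicit diffusion step, in the usual nonlinear scale space notation (a hedged summary: Lstep holds the increment that the caller, elided here, adds back into Lt; the caller further down passes tsteps[j] * 0.5f as step_size):

    L^{k+1} = L^{k} + \tau_k \, \operatorname{div}\bigl( g(|\nabla L_\sigma|)\, \nabla L^{k} \bigr)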
/**
@ -302,12 +300,15 @@ non_linear_diffusion_step(InputArray Lt, InputArray Lf, OutputArray Lstep, float
* @return k contrast factor
*/
static inline float
compute_kcontrast(const cv::Mat& Lx, const cv::Mat& Ly, float perc, int nbins)
compute_kcontrast(InputArray Lx_, InputArray Ly_, float perc, int nbins)
{
CV_INSTRUMENT_REGION()
CV_Assert(nbins > 2);
CV_Assert(!Lx.empty());
CV_Assert(!Lx_.empty());
Mat Lx = Lx_.getMat();
Mat Ly = Ly_.getMat();
// temporary square roots of dot product
Mat modgs (Lx.rows - 2, Lx.cols - 2, CV_32F);
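
A hedged note on the elided remainder of this function: per the doc comment above, k is estimated as the perc-th percentile of the gradient magnitude of the smoothed image, approximated with an nbins-bin histogram of

    \|\nabla L_\sigma\| = \sqrt{L_x^2 + L_y^2}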
@ -356,21 +357,22 @@ compute_kcontrast(const cv::Mat& Lx, const cv::Mat& Ly, float perc, int nbins)
static inline bool
ocl_pm_g2(InputArray Lx_, InputArray Ly_, OutputArray Lflow_, float kcontrast)
{
UMat Lx = Lx_.getUMat(), Ly = Ly_.getUMat(), Lflow = Lflow_.getUMat();
UMat Lx = Lx_.getUMat();
UMat Ly = Ly_.getUMat();
UMat Lflow = Lflow_.getUMat();
int total = Lx.rows * Lx.cols;
size_t globalSize[] = {(size_t)total};
int total = Lx.rows * Lx.cols;
size_t globalSize[] = {(size_t)total};
ocl::Kernel ker("AKAZE_pm_g2", ocl::features2d::akaze_oclsrc);
if (ker.empty())
return false;
ocl::Kernel ker("AKAZE_pm_g2", ocl::features2d::akaze_oclsrc);
if( ker.empty() )
return false;
return ker.args(
ocl::KernelArg::PtrReadOnly(Lx),
ocl::KernelArg::PtrReadOnly(Ly),
ocl::KernelArg::PtrWriteOnly(Lflow),
kcontrast, total)
.run(1, globalSize, 0, true);
return ker.args(
ocl::KernelArg::PtrReadOnly(Lx),
ocl::KernelArg::PtrReadOnly(Ly),
ocl::KernelArg::PtrWriteOnly(Lflow),
kcontrast, total).run(1, globalSize, 0, true);
}
#endif // HAVE_OPENCL
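
For reference, this kernel evaluates the Perona-Malik g2 diffusivity from the KAZE/AKAZE papers (@cite ANB13):

    g_2 = \frac{1}{1 + |\nabla L_\sigma|^2 / k^2}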
@ -386,9 +388,7 @@ compute_diffusivity(InputArray Lx, InputArray Ly, OutputArray Lflow, float kcont
pm_g1(Lx, Ly, Lflow, kcontrast);
break;
case KAZE::DIFF_PM_G2:
#ifdef HAVE_OPENCL
CV_OCL_RUN(OCL_PERFORMANCE_CHECK(Lflow.isUMat()), ocl_pm_g2(Lx, Ly, Lflow, kcontrast));
#endif
CV_OCL_RUN(Lx.isUMat() && Ly.isUMat() && Lflow.isUMat(), ocl_pm_g2(Lx, Ly, Lflow, kcontrast));
pm_g2(Lx, Ly, Lflow, kcontrast);
break;
case KAZE::DIFF_WEICKERT:
@ -404,28 +404,54 @@ compute_diffusivity(InputArray Lx, InputArray Ly, OutputArray Lflow, float kcont
}
/**
* @brief This method creates the nonlinear scale space for a given image
* @param img Input image for which the nonlinear scale space needs to be created
* @return 0 if the nonlinear scale space was created successfully, -1 otherwise
* @brief Converts input image to grayscale float image
*
* @param image any image
* @param dst grayscale float image
*/
void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray img)
static inline void prepareInputImage(InputArray image, OutputArray dst)
{
Mat img = image.getMat();
if (img.channels() > 1)
cvtColor(image, img, COLOR_BGR2GRAY);
if ( img.depth() == CV_32F )
dst.assign(img);
else if ( img.depth() == CV_8U )
img.convertTo(dst, CV_32F, 1.0 / 255.0, 0);
else if ( img.depth() == CV_16U )
img.convertTo(dst, CV_32F, 1.0 / 65535.0, 0);
}
/**
* @brief This method creates the nonlinear scale space for a given image
* @param image Input image for which the nonlinear scale space needs to be created
*/
template<typename MatType>
static inline void
create_nonlinear_scale_space(InputArray image, const AKAZEOptions &options,
const std::vector<std::vector<float > > &tsteps_evolution, std::vector<Evolution<MatType> > &evolution)
{
CV_INSTRUMENT_REGION()
CV_Assert(evolution_.size() > 0);
CV_Assert(evolution.size() > 0);
// convert input to grayscale float image if needed
MatType img;
prepareInputImage(image, img);
// create first level of the evolution
int ksize = getGaussianKernelSize(options_.soffset);
GaussianBlur(img, evolution_[0].Lsmooth, Size(ksize, ksize), options_.soffset, options_.soffset, BORDER_REPLICATE);
evolution_[0].Lsmooth.copyTo(evolution_[0].Lt);
int ksize = getGaussianKernelSize(options.soffset);
GaussianBlur(img, evolution[0].Lsmooth, Size(ksize, ksize), options.soffset, options.soffset, BORDER_REPLICATE);
evolution[0].Lsmooth.copyTo(evolution[0].Lt);
if (evolution_.size() == 1) {
if (evolution.size() == 1) {
// we don't need to compute kcontrast factor
Compute_Determinant_Hessian_Response();
Compute_Determinant_Hessian_Response(evolution);
return;
}
// derivatives, flow and diffusion step
Mat Lx, Ly, Lsmooth, Lflow, Lstep;
MatType Lx, Ly, Lsmooth, Lflow, Lstep;
// compute derivatives for computing k contrast
GaussianBlur(img, Lsmooth, Size(5, 5), 1.0f, 1.0f, BORDER_REPLICATE);
@ -433,19 +459,19 @@ void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray img)
Scharr(Lsmooth, Ly, CV_32F, 0, 1, 1, 0, BORDER_DEFAULT);
Lsmooth.release();
// compute the kcontrast factor
float kcontrast = compute_kcontrast(Lx, Ly, options_.kcontrast_percentile, options_.kcontrast_nbins);
float kcontrast = compute_kcontrast(Lx, Ly, options.kcontrast_percentile, options.kcontrast_nbins);
// Now generate the rest of evolution levels
for (size_t i = 1; i < evolution_.size(); i++) {
Evolution &e = evolution_[i];
for (size_t i = 1; i < evolution.size(); i++) {
Evolution<MatType> &e = evolution[i];
if (e.octave > evolution_[i - 1].octave) {
if (e.octave > evolution[i - 1].octave) {
// new octave will be half the size
resize(evolution_[i - 1].Lt, e.Lt, e.size, 0, 0, INTER_AREA);
resize(evolution[i - 1].Lt, e.Lt, e.size, 0, 0, INTER_AREA);
kcontrast *= 0.75f;
}
else {
evolution_[i - 1].Lt.copyTo(e.Lt);
evolution[i - 1].Lt.copyTo(e.Lt);
}
GaussianBlur(e.Lt, e.Lsmooth, Size(5, 5), 1.0f, 1.0f, BORDER_REPLICATE);
@ -455,10 +481,10 @@ void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray img)
Scharr(e.Lsmooth, Ly, CV_32F, 0, 1, 1.0, 0, BORDER_DEFAULT);
// Compute the conductivity equation
compute_diffusivity(Lx, Ly, Lflow, kcontrast, options_.diffusivity);
compute_diffusivity(Lx, Ly, Lflow, kcontrast, options.diffusivity);
// Perform Fast Explicit Diffusion on Lt
std::vector<float> &tsteps = tsteps_[i - 1];
const std::vector<float> &tsteps = tsteps_evolution[i - 1];
for (size_t j = 0; j < tsteps.size(); j++) {
const float step_size = tsteps[j] * 0.5f;
non_linear_diffusion_step(e.Lt, Lflow, Lstep, step_size);
@ -466,31 +492,73 @@ void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray img)
}
}
Compute_Determinant_Hessian_Response();
Compute_Determinant_Hessian_Response(evolution);
return;
}
/**
* @brief Converts between UMatPyramid and Pyramid and vice versa
* @details Matrices in evolution levels will be copied
*
* @param src source pyramid
* @param dst destination pyramid
*/
template<typename MatTypeSrc, typename MatTypeDst>
static inline void
convertScalePyramid(const std::vector<Evolution<MatTypeSrc> >& src, std::vector<Evolution<MatTypeDst> > &dst)
{
dst.resize(src.size());
for (size_t i = 0; i < src.size(); ++i) {
dst[i] = Evolution<MatTypeDst>(src[i]);
}
}
/**
* @brief This method creates the nonlinear scale space for a given image
* @param image Input image for which the nonlinear scale space needs to be created
*/
void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray image)
{
if (ocl::useOpenCL() && image.isUMat()) {
// will run OCL version of scale space pyramid
UMatPyramid uPyr;
// init UMat pyramid with sizes
convertScalePyramid(evolution_, uPyr);
create_nonlinear_scale_space(image, options_, tsteps_, uPyr);
// download pyramid from GPU
convertScalePyramid(uPyr, evolution_);
} else {
// CPU version
create_nonlinear_scale_space(image, options_, tsteps_, evolution_);
}
}
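
A usage aside (reusing akaze/uimg/keypoints/descriptors from the sketch near the top of this page): since the branch above checks both the global OpenCL switch and the input kind, callers can force the CPU pyramid even for UMat inputs. cv::ocl::setUseOpenCL is the standard switch, declared in <opencv2/core/ocl.hpp>:

    cv::ocl::setUseOpenCL(false);   // subsequent calls take the CPU branch
    akaze->detectAndCompute(uimg, cv::noArray(), keypoints, descriptors);
    cv::ocl::setUseOpenCL(true);    // restore the OCL branch for UMat inputs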
/* ************************************************************************* */
#ifdef HAVE_OPENCL
static inline bool
ocl_compute_determinant(InputArray Lxx_, InputArray Lxy_, InputArray Lyy_, OutputArray Ldet_, float sigma)
ocl_compute_determinant(InputArray Lxx_, InputArray Lxy_, InputArray Lyy_,
OutputArray Ldet_, float sigma)
{
UMat Lxx = Lxx_.getUMat(), Lxy = Lxy_.getUMat(), Lyy = Lyy_.getUMat(), Ldet = Ldet_.getUMat();
UMat Lxx = Lxx_.getUMat();
UMat Lxy = Lxy_.getUMat();
UMat Lyy = Lyy_.getUMat();
UMat Ldet = Ldet_.getUMat();
const int total = Lxx.rows * Lxx.cols;
size_t globalSize[] = {(size_t)total};
const int total = Lxx.rows * Lxx.cols;
size_t globalSize[] = {(size_t)total};
ocl::Kernel ker("AKAZE_compute_determinant", ocl::features2d::akaze_oclsrc);
if (ker.empty())
return false;
ocl::Kernel ker("AKAZE_compute_determinant", ocl::features2d::akaze_oclsrc);
if( ker.empty() )
return false;
return ker.args(
ocl::KernelArg::PtrReadOnly(Lxx),
ocl::KernelArg::PtrReadOnly(Lxy),
ocl::KernelArg::PtrReadOnly(Lyy),
ocl::KernelArg::PtrWriteOnly(Ldet),
sigma, total)
.run(1, globalSize, 0, true);
return ker.args(
ocl::KernelArg::PtrReadOnly(Lxx),
ocl::KernelArg::PtrReadOnly(Lxy),
ocl::KernelArg::PtrReadOnly(Lyy),
ocl::KernelArg::PtrWriteOnly(Ldet),
sigma, total).run(1, globalSize, 0, true);
}
#endif // HAVE_OPENCL
@ -504,47 +572,44 @@ ocl_compute_determinant(InputArray Lxx_, InputArray Lxy_, InputArray Lyy_, Outpu
* @param Ldet output determinant
* @param sigma determinant will be scaled by this sigma
*/
static inline void compute_determinant(InputArray Lxx, InputArray Lxy, InputArray Lyy, OutputArray Ldet, float sigma)
static inline void compute_determinant(InputArray Lxx_, InputArray Lxy_, InputArray Lyy_,
OutputArray Ldet_, float sigma)
{
CV_INSTRUMENT_REGION()
CV_INSTRUMENT_REGION()
Ldet.create(Lxx.size(), Lxx.type());
Ldet_.create(Lxx_.size(), Lxx_.type());
#ifdef HAVE_OPENCL
CV_OCL_RUN(OCL_PERFORMANCE_CHECK(Ldet.isUMat()), ocl_compute_determinant(Lxx, Lxy, Lyy, Ldet, sigma));
#endif
CV_OCL_RUN(Lxx_.isUMat() && Ldet_.isUMat(), ocl_compute_determinant(Lxx_, Lxy_, Lyy_, Ldet_, sigma));
// output determinant
Mat Lxx = Lxx_.getMat(), Lxy = Lxy_.getMat(), Lyy = Lyy_.getMat(), Ldet = Ldet_.getMat();
float *lxx = Lxx.ptr<float>();
float *lxy = Lxy.ptr<float>();
float *lyy = Lyy.ptr<float>();
float *ldet = Ldet.ptr<float>();
const int total = Lxx.cols * Lxx.rows;
for (int j = 0; j < total; j++) {
ldet[j] = (lxx[j] * lyy[j] - lxy[j] * lxy[j]) * sigma;
}
// output determinant
Mat Mxx = Lxx.getMat(), Mxy = Lxy.getMat(), Myy = Lyy.getMat(), Mdet = Ldet.getMat();
const int W = Mxx.cols, H = Mxx.rows;
for (int y = 0; y < H; y++)
{
float *lxx = Mxx.ptr<float>(y);
float *lxy = Mxy.ptr<float>(y);
float *lyy = Myy.ptr<float>(y);
float *ldet = Mdet.ptr<float>(y);
for (int x = 0; x < W; x++)
{
ldet[x] = (lxx[x] * lyy[x] - lxy[x] * lxy[x]) * sigma;
}
}
}
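
For reference, both the OCL and CPU paths compute the same scale-normalized Hessian determinant response, with sigma being the normalization factor supplied by the caller:

    L_{\mathrm{det}} = \sigma \left( L_{xx} L_{yy} - L_{xy}^2 \right)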
template <typename MatType>
class DeterminantHessianResponse : public ParallelLoopBody
{
public:
explicit DeterminantHessianResponse(std::vector<Evolution>& ev)
explicit DeterminantHessianResponse(std::vector<Evolution<MatType> >& ev)
: evolution_(&ev)
{
}
void operator()(const Range& range) const
{
Mat Lxx, Lxy, Lyy;
MatType Lxx, Lxy, Lyy;
for (int i = range.start; i < range.end; i++)
{
Evolution &e = (*evolution_)[i];
Evolution<MatType> &e = (*evolution_)[i];
// we cannot use cv::Scharr here, because we need to handle also
// kernel sizes other than 3, by default we are using 9x9, 5x5 and 7x7
@ -571,23 +636,33 @@ public:
}
private:
std::vector<Evolution>* evolution_;
std::vector<Evolution<MatType> >* evolution_;
};
/**
* @brief This method computes the feature detector response for the nonlinear scale space
* @details OCL version
* @note We use the Hessian determinant as the feature detector response
*/
void AKAZEFeatures::Compute_Determinant_Hessian_Response(void) {
static inline void
Compute_Determinant_Hessian_Response(UMatPyramid &evolution) {
CV_INSTRUMENT_REGION()
if (ocl::useOpenCL()) {
DeterminantHessianResponse body (evolution_);
body(Range(0, (int)evolution_.size()));
} else {
parallel_for_(Range(0, (int)evolution_.size()), DeterminantHessianResponse(evolution_));
}
DeterminantHessianResponse<UMat> body (evolution);
body(Range(0, (int)evolution.size()));
}
/**
* @brief This method computes the feature detector response for the nonlinear scale space
* @details CPU version
* @note We use the Hessian determinant as the feature detector response
*/
static inline void
Compute_Determinant_Hessian_Response(Pyramid &evolution) {
CV_INSTRUMENT_REGION()
parallel_for_(Range(0, (int)evolution.size()), DeterminantHessianResponse<Mat>(evolution));
}
/* ************************************************************************* */
@ -604,6 +679,7 @@ void AKAZEFeatures::Feature_Detection(std::vector<KeyPoint>& kpts)
std::vector<Mat> keypoints_by_layers;
Find_Scale_Space_Extrema(keypoints_by_layers);
Do_Subpixel_Refinement(keypoints_by_layers, kpts);
Compute_Keypoints_Orientation(kpts);
}
/**
@ -644,7 +720,7 @@ find_neighbor_point(const int x, const int y, const Mat &mask, const int search_
class FindKeypointsSameScale : public ParallelLoopBody
{
public:
explicit FindKeypointsSameScale(const std::vector<Evolution>& ev,
explicit FindKeypointsSameScale(const Pyramid& ev,
std::vector<Mat>& kpts, float dthreshold)
: evolution_(&ev), keypoints_by_layers_(&kpts), dthreshold_(dthreshold)
{}
@ -653,7 +729,7 @@ public:
{
for (int i = range.start; i < range.end; i++)
{
const Evolution &e = (*evolution_)[i];
const MEvolution &e = (*evolution_)[i];
Mat &kpts = (*keypoints_by_layers_)[i];
// this mask will hold positions of keypoints in this level
kpts = Mat::zeros(e.Ldet.size(), CV_8UC1);
@ -704,7 +780,7 @@ public:
}
private:
const std::vector<Evolution>* evolution_;
const Pyramid* evolution_;
std::vector<Mat>* keypoints_by_layers_;
float dthreshold_; ///< Detector response threshold to accept point
};
@ -799,7 +875,7 @@ void AKAZEFeatures::Do_Subpixel_Refinement(
CV_INSTRUMENT_REGION()
for (size_t i = 0; i < keypoints_by_layers.size(); i++) {
const Evolution &e = evolution_[i];
const MEvolution &e = evolution_[i];
const float * const ldet = e.Ldet.ptr<float>();
const float ratio = e.octave_ratio;
const int cols = e.Ldet.cols;
@ -865,7 +941,7 @@ void AKAZEFeatures::Do_Subpixel_Refinement(
class SURF_Descriptor_Upright_64_Invoker : public ParallelLoopBody
{
public:
SURF_Descriptor_Upright_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution)
SURF_Descriptor_Upright_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, const Pyramid& evolution)
: keypoints_(&kpts)
, descriptors_(&desc)
, evolution_(&evolution)
@ -885,13 +961,13 @@ public:
private:
std::vector<KeyPoint>* keypoints_;
Mat* descriptors_;
std::vector<Evolution>* evolution_;
const Pyramid* evolution_;
};
class SURF_Descriptor_64_Invoker : public ParallelLoopBody
{
public:
SURF_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution)
SURF_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, Pyramid& evolution)
: keypoints_(&kpts)
, descriptors_(&desc)
, evolution_(&evolution)
@ -911,13 +987,13 @@ public:
private:
std::vector<KeyPoint>* keypoints_;
Mat* descriptors_;
std::vector<Evolution>* evolution_;
Pyramid* evolution_;
};
class MSURF_Upright_Descriptor_64_Invoker : public ParallelLoopBody
{
public:
MSURF_Upright_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution)
MSURF_Upright_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, Pyramid& evolution)
: keypoints_(&kpts)
, descriptors_(&desc)
, evolution_(&evolution)
@ -937,13 +1013,13 @@ public:
private:
std::vector<KeyPoint>* keypoints_;
Mat* descriptors_;
std::vector<Evolution>* evolution_;
Pyramid* evolution_;
};
class MSURF_Descriptor_64_Invoker : public ParallelLoopBody
{
public:
MSURF_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution)
MSURF_Descriptor_64_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, Pyramid& evolution)
: keypoints_(&kpts)
, descriptors_(&desc)
, evolution_(&evolution)
@ -963,13 +1039,13 @@ public:
private:
std::vector<KeyPoint>* keypoints_;
Mat* descriptors_;
std::vector<Evolution>* evolution_;
Pyramid* evolution_;
};
class Upright_MLDB_Full_Descriptor_Invoker : public ParallelLoopBody
{
public:
Upright_MLDB_Full_Descriptor_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution, AKAZEOptions& options)
Upright_MLDB_Full_Descriptor_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, Pyramid& evolution, AKAZEOptions& options)
: keypoints_(&kpts)
, descriptors_(&desc)
, evolution_(&evolution)
@ -990,7 +1066,7 @@ public:
private:
std::vector<KeyPoint>* keypoints_;
Mat* descriptors_;
std::vector<Evolution>* evolution_;
Pyramid* evolution_;
AKAZEOptions* options_;
};
@ -999,7 +1075,7 @@ class Upright_MLDB_Descriptor_Subset_Invoker : public ParallelLoopBody
public:
Upright_MLDB_Descriptor_Subset_Invoker(std::vector<KeyPoint>& kpts,
Mat& desc,
std::vector<Evolution>& evolution,
Pyramid& evolution,
AKAZEOptions& options,
Mat descriptorSamples,
Mat descriptorBits)
@ -1025,7 +1101,7 @@ public:
private:
std::vector<KeyPoint>* keypoints_;
Mat* descriptors_;
std::vector<Evolution>* evolution_;
Pyramid* evolution_;
AKAZEOptions* options_;
Mat descriptorSamples_; // List of positions in the grids to sample LDB bits from.
@ -1035,7 +1111,7 @@ private:
class MLDB_Full_Descriptor_Invoker : public ParallelLoopBody
{
public:
MLDB_Full_Descriptor_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, std::vector<Evolution>& evolution, AKAZEOptions& options)
MLDB_Full_Descriptor_Invoker(std::vector<KeyPoint>& kpts, Mat& desc, Pyramid& evolution, AKAZEOptions& options)
: keypoints_(&kpts)
, descriptors_(&desc)
, evolution_(&evolution)
@ -1060,7 +1136,7 @@ public:
private:
std::vector<KeyPoint>* keypoints_;
Mat* descriptors_;
std::vector<Evolution>* evolution_;
Pyramid* evolution_;
AKAZEOptions* options_;
};
@ -1069,7 +1145,7 @@ class MLDB_Descriptor_Subset_Invoker : public ParallelLoopBody
public:
MLDB_Descriptor_Subset_Invoker(std::vector<KeyPoint>& kpts,
Mat& desc,
std::vector<Evolution>& evolution,
Pyramid& evolution,
AKAZEOptions& options,
Mat descriptorSamples,
Mat descriptorBits)
@ -1095,7 +1171,7 @@ public:
private:
std::vector<KeyPoint>* keypoints_;
Mat* descriptors_;
std::vector<Evolution>* evolution_;
Pyramid* evolution_;
AKAZEOptions* options_;
Mat descriptorSamples_; // List of positions in the grids to sample LDB bits from.
@ -1282,10 +1358,10 @@ void quantized_counting_sort(const float a[], const int n,
* original SURF method. See Bay et al., Speeded Up Robust Features, ECCV 2006
*/
static inline
void Compute_Main_Orientation(KeyPoint& kpt, const std::vector<Evolution>& evolution)
void Compute_Main_Orientation(KeyPoint& kpt, const Pyramid& evolution)
{
// get the right evolution level for this keypoint
const Evolution& e = evolution[kpt.class_id];
const MEvolution& e = evolution[kpt.class_id];
// Get the information from the keypoint
int scale = cvRound(0.5f * kpt.size / e.octave_ratio);
int x0 = cvRound(kpt.pt.x / e.octave_ratio);
@ -1366,7 +1442,7 @@ class ComputeKeypointOrientation : public ParallelLoopBody
{
public:
ComputeKeypointOrientation(std::vector<KeyPoint>& kpts,
const std::vector<Evolution>& evolution)
const Pyramid& evolution)
: keypoints_(&kpts)
, evolution_(&evolution)
{
@ -1381,7 +1457,7 @@ public:
}
private:
std::vector<KeyPoint>* keypoints_;
const std::vector<Evolution>* evolution_;
const Pyramid* evolution_;
};
/**
@ -1421,7 +1497,7 @@ void MSURF_Upright_Descriptor_64_Invoker::Get_MSURF_Upright_Descriptor_64(const
// Subregion centers for the 4x4 gaussian weighting
float cx = -0.5f, cy = 0.5f;
const std::vector<Evolution>& evolution = *evolution_;
const Pyramid& evolution = *evolution_;
// Set the descriptor size and the sample and pattern sizes
sample_step = 5;
@ -1554,7 +1630,7 @@ void MSURF_Descriptor_64_Invoker::Get_MSURF_Descriptor_64(const KeyPoint& kpt, f
// Subregion centers for the 4x4 gaussian weighting
float cx = -0.5f, cy = 0.5f;
const std::vector<Evolution>& evolution = *evolution_;
const Pyramid& evolution = *evolution_;
// Set the descriptor size and the sample and pattern sizes
sample_step = 5;
@ -1675,7 +1751,7 @@ void MSURF_Descriptor_64_Invoker::Get_MSURF_Descriptor_64(const KeyPoint& kpt, f
void Upright_MLDB_Full_Descriptor_Invoker::Get_Upright_MLDB_Full_Descriptor(const KeyPoint& kpt, unsigned char *desc, int desc_size) const {
const AKAZEOptions & options = *options_;
const std::vector<Evolution>& evolution = *evolution_;
const Pyramid& evolution = *evolution_;
// Buffer for the M-LDB descriptor
const int max_channels = 3;
@ -1777,7 +1853,7 @@ void Upright_MLDB_Full_Descriptor_Invoker::Get_Upright_MLDB_Full_Descriptor(cons
void MLDB_Full_Descriptor_Invoker::MLDB_Fill_Values(float* values, int sample_step, const int level,
float xf, float yf, float co, float si, float scale) const
{
const std::vector<Evolution>& evolution = *evolution_;
const Pyramid& evolution = *evolution_;
int pattern_size = options_->descriptor_pattern_size;
int chan = options_->descriptor_channels;
const Mat Lx = evolution[level].Lx;
@ -1924,7 +2000,7 @@ void MLDB_Descriptor_Subset_Invoker::Get_MLDB_Descriptor_Subset(const KeyPoint&
float sample_x = 0.f, sample_y = 0.f;
const AKAZEOptions & options = *options_;
const std::vector<Evolution>& evolution = *evolution_;
const Pyramid& evolution = *evolution_;
// Get the information from the keypoint
float ratio = (float)(1 << kpt.octave);
@ -2033,7 +2109,7 @@ void Upright_MLDB_Descriptor_Subset_Invoker::Get_Upright_MLDB_Descriptor_Subset(
int x1 = 0, y1 = 0;
const AKAZEOptions & options = *options_;
const std::vector<Evolution>& evolution = *evolution_;
const Pyramid& evolution = *evolution_;
// Get the information from the keypoint
float ratio = (float)(1 << kpt.octave);

View File

@ -17,6 +17,7 @@ namespace cv
{
/// A-KAZE nonlinear diffusion filtering evolution
template <typename MatType>
struct Evolution
{
Evolution() {
@ -29,10 +30,28 @@ struct Evolution
border = 0;
}
Mat Lx, Ly; ///< First order spatial derivatives
Mat Lt; ///< Evolution image
Mat Lsmooth; ///< Smoothed image, used only for computing determinant, released afterwards
Mat Ldet; ///< Detector response
template <typename T>
explicit Evolution(const Evolution<T> &other) {
size = other.size;
etime = other.etime;
esigma = other.esigma;
octave = other.octave;
sublevel = other.sublevel;
sigma_size = other.sigma_size;
octave_ratio = other.octave_ratio;
border = other.border;
other.Lx.copyTo(Lx);
other.Ly.copyTo(Ly);
other.Lt.copyTo(Lt);
other.Lsmooth.copyTo(Lsmooth);
other.Ldet.copyTo(Ldet);
}
MatType Lx, Ly; ///< First order spatial derivatives
MatType Lt; ///< Evolution image
MatType Lsmooth; ///< Smoothed image, used only for computing determinant, released afterwards
MatType Ldet; ///< Detector response
Size size; ///< Size of the layer
float etime; ///< Evolution time
@ -44,6 +63,11 @@ struct Evolution
int border; ///< Width of border where descriptors cannot be computed
};
typedef Evolution<Mat> MEvolution;
typedef Evolution<UMat> UEvolution;
typedef std::vector<MEvolution> Pyramid;
typedef std::vector<UEvolution> UMatPyramid;
/* ************************************************************************* */
// AKAZE Class Declaration
class AKAZEFeatures {
@ -51,7 +75,7 @@ class AKAZEFeatures {
private:
AKAZEOptions options_; ///< Configuration options for AKAZE
std::vector<Evolution> evolution_; ///< Vector of nonlinear diffusion evolution
Pyramid evolution_; ///< Vector of nonlinear diffusion evolution
/// FED parameters
int ncycles_; ///< Number of cycles
@ -64,23 +88,21 @@ private:
cv::Mat descriptorBits_;
cv::Mat bitMask_;
public:
/// Constructor with input arguments
AKAZEFeatures(const AKAZEOptions& options);
/// Scale Space methods
void Allocate_Memory_Evolution();
void Create_Nonlinear_Scale_Space(InputArray img);
void Feature_Detection(std::vector<cv::KeyPoint>& kpts);
void Compute_Determinant_Hessian_Response(void);
void Find_Scale_Space_Extrema(std::vector<Mat>& keypoints_by_layers);
void Do_Subpixel_Refinement(std::vector<Mat>& keypoints_by_layers,
std::vector<KeyPoint>& kpts);
/// Feature description methods
void Compute_Descriptors(std::vector<cv::KeyPoint>& kpts, OutputArray desc);
void Compute_Keypoints_Orientation(std::vector<cv::KeyPoint>& kpts) const;
public:
/// Constructor with input arguments
AKAZEFeatures(const AKAZEOptions& options);
void Create_Nonlinear_Scale_Space(InputArray img);
void Feature_Detection(std::vector<cv::KeyPoint>& kpts);
void Compute_Descriptors(std::vector<cv::KeyPoint>& kpts, OutputArray desc);
};
/* ************************************************************************* */

View File

@ -0,0 +1,72 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "../test_precomp.hpp"
#include "cvconfig.h"
#include "opencv2/ts/ocl_test.hpp"
#ifdef HAVE_OPENCL
namespace cvtest {
namespace ocl {
#define TEST_IMAGES testing::Values(\
"detectors_descriptors_evaluation/images_datasets/leuven/img1.png",\
"../stitching/a3.png", \
"../stitching/s2.jpg")
PARAM_TEST_CASE(Feature2DFixture, Ptr<Feature2D>, std::string)
{
std::string filename;
Mat image, descriptors;
vector<KeyPoint> keypoints;
UMat uimage, udescriptors;
vector<KeyPoint> ukeypoints;
Ptr<Feature2D> feature;
virtual void SetUp()
{
feature = GET_PARAM(0);
filename = GET_PARAM(1);
image = readImage(filename);
ASSERT_FALSE(image.empty());
image.copyTo(uimage);
OCL_OFF(feature->detect(image, keypoints));
OCL_ON(feature->detect(uimage, ukeypoints));
// note: we use keypoints from CPU for GPU too, to test descriptors separately
OCL_OFF(feature->compute(image, keypoints, descriptors));
OCL_ON(feature->compute(uimage, keypoints, udescriptors));
}
};
OCL_TEST_P(Feature2DFixture, KeypointsSame)
{
EXPECT_EQ(keypoints.size(), ukeypoints.size());
for (size_t i = 0; i < keypoints.size(); ++i)
{
EXPECT_GE(KeyPoint::overlap(keypoints[i], ukeypoints[i]), 0.95);
EXPECT_NEAR(keypoints[i].angle, ukeypoints[i].angle, 0.001);
}
}
OCL_TEST_P(Feature2DFixture, DescriptorsSame)
{
EXPECT_MAT_NEAR(descriptors, udescriptors, 0.001);
}
OCL_INSTANTIATE_TEST_CASE_P(AKAZE, Feature2DFixture,
testing::Combine(testing::Values(AKAZE::create()), TEST_IMAGES));
OCL_INSTANTIATE_TEST_CASE_P(AKAZE_DESCRIPTOR_KAZE, Feature2DFixture,
testing::Combine(testing::Values(AKAZE::create(AKAZE::DESCRIPTOR_KAZE)), TEST_IMAGES));
}//ocl
}//cvtest
#endif //HAVE_OPENCL
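
With OpenCV's standard gtest harness, these cases are built into the features2d test binary and can be selected with a filter such as --gtest_filter=*Feature2DFixture* (the exact binary name depends on the build configuration).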