\ifCpp


\section{Initialization and Info}


\cvCppFunc{gpu::hasNativeDoubleSupport}
Returns true if current GPU has native double support, false otherwise.

\cvdefCpp{bool hasNativeDoubleSupport(int device);}


\cvCppFunc{gpu::hasAtomicsSupport}
Returns true if current GPU has atomics support, false otherwise.

\cvdefCpp{bool hasAtomicsSupport(int device);}


\section{Data Structures}


\cvCppFunc{gpu::createContinuous}
Creates continuous matrix in GPU memory.

\cvdefCpp{void createContinuous(int rows, int cols, int type, GpuMat\& m);}
\begin{description}
\cvarg{rows}{Row count.}
\cvarg{cols}{Column count.}
\cvarg{type}{Type of the matrix.}
\cvarg{m}{Destination matrix. Will do only reshape if \texttt{m} has proper type and area ($rows \times cols$).}
\end{description}


\section{Per-element Operations}


%can't make cvCppFunc work with underscore (even as \_)
\cvfunc{cv::gpu::bitwise\_not}
Performs per-element bitwise inversion.

\cvdefCpp{void bitwise\_not(const GpuMat\& src, GpuMat\& dst,\par
  const GpuMat\& mask=GpuMat());\newline
void bitwise\_not(const GpuMat\& src, GpuMat\& dst,\par
  const GpuMat\& mask, const Stream\& stream);}

\begin{description}
\cvarg{src}{Source matrix.}
\cvarg{dst}{Destination matrix. Will have the same size and type as \texttt{src}.}
\cvarg{mask}{Optional operation mask. 8-bit single channel image.}
\cvarg{stream}{Stream for asynchronous version.}
\end{description}
See also: \hyperref[cppfunc.bitwise.not]{cv::bitwise\_not}.


\cvfunc{cv::gpu::bitwise\_or}
Performs per-element bitwise disjunction of two matrixes.

\cvdefCpp{void bitwise\_or(const GpuMat\& src1, const GpuMat\& src2, GpuMat\& dst,\par
  const GpuMat\& mask=GpuMat());\newline
void bitwise\_or(const GpuMat\& src1, const GpuMat\& src2, GpuMat\& dst,\par
  const GpuMat\& mask, const Stream\& stream);}

\begin{description}
\cvarg{src1}{The first source matrix.}
\cvarg{src2}{The second source matrix. It must have the same size and type as \texttt{src1}.}
\cvarg{dst}{Destination matrix. Will have the same size and type as \texttt{src1}.}
\cvarg{mask}{Optional operation mask. 8-bit single channel image.}
\cvarg{stream}{Stream for asynchronous version.}
\end{description}
See also: \hyperref[cppfunc.bitwise.or]{cv::bitwise\_or}.


\cvfunc{cv::gpu::bitwise\_and}
Performs per-element bitwise conjunction of two matrixes.

\cvdefCpp{void bitwise\_and(const GpuMat\& src1, const GpuMat\& src2, GpuMat\& dst,\par
  const GpuMat\& mask=GpuMat());\newline
void bitwise\_and(const GpuMat\& src1, const GpuMat\& src2, GpuMat\& dst,\par
  const GpuMat\& mask, const Stream\& stream);}

\begin{description}
\cvarg{src1}{The first source matrix.}
\cvarg{src2}{The second source matrix. It must have the same size and type as \texttt{src1}.}
\cvarg{dst}{Destination matrix. Will have the same size and type as \texttt{src1}.}
\cvarg{mask}{Optional operation mask. 8-bit single channel image.}
\cvarg{stream}{Stream for asynchronous version.}
\end{description}
See also: \hyperref[cppfunc.bitwise.and]{cv::bitwise\_and}.


\cvfunc{cv::gpu::bitwise\_xor}
Performs per-element bitwise ``exclusive or'' of two matrixes.

\cvdefCpp{void bitwise\_xor(const GpuMat\& src1, const GpuMat\& src2, GpuMat\& dst,\par
  const GpuMat\& mask=GpuMat());\newline
void bitwise\_xor(const GpuMat\& src1, const GpuMat\& src2, GpuMat\& dst,\par
  const GpuMat\& mask, const Stream\& stream);}

\begin{description}
\cvarg{src1}{The first source matrix.}
\cvarg{src2}{The second source matrix. It must have the same size and type as \texttt{src1}.}
\cvarg{dst}{Destination matrix. Will have the same size and type as \texttt{src1}.}
\cvarg{mask}{Optional operation mask. 8-bit single channel image.}
\cvarg{stream}{Stream for asynchronous version.}
\end{description}
See also: \hyperref[cppfunc.bitwise.xor]{cv::bitwise\_xor}.


\section{Image Processing}


\cvCppFunc{gpu::meanShiftFiltering}
Performs mean-shift filtering.

\cvdefCpp{void meanShiftFiltering(const GpuMat\& src, GpuMat\& dst,\par
  int sp, int sr,\par
  TermCriteria criteria = TermCriteria(TermCriteria::MAX\_ITER\par
  + TermCriteria::EPS, 5, 1));}

\begin{description}
\cvarg{src}{Source image. Only 8UC4 images are supported for now.}
\cvarg{dst}{Destination image. Will have the same size and type as \texttt{src}. Each pixel \texttt{(x,y)} of the destination image will contain color of converged point started from \texttt{(x,y)} pixel of the source image.}
\cvarg{sp}{Spatial window radius.}
\cvarg{sr}{Color window radius.}
\cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
\end{description}


\cvCppFunc{gpu::meanShiftProc}
Performs mean-shift procedure and stores information about converged points in two images.

\cvdefCpp{void meanShiftProc(const GpuMat\& src, GpuMat\& dstr, GpuMat\& dstsp,\par
  int sp, int sr,\par
  TermCriteria criteria = TermCriteria(TermCriteria::MAX\_ITER\par
  + TermCriteria::EPS, 5, 1));}

\begin{description}
\cvarg{src}{Source image. Only 8UC4 images are supported for now.}
\cvarg{dstr}{Destination image. Will have the same size and type as \texttt{src}. Each pixel \texttt{(x,y)} of the destination image will contain color of converged point started from \texttt{(x,y)} pixel of the source image.}
\cvarg{dstsp}{16SC2 matrix, which will contain coordinates of converged points and have the same size as \texttt{src}.}
\cvarg{sp}{Spatial window radius.}
\cvarg{sr}{Color window radius.}
\cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
\end{description}


\cvCppFunc{gpu::meanShiftSegmentation}
Performs mean-shift segmentation of the source image and eliminates small segments.

\cvdefCpp{void meanShiftSegmentation(const GpuMat\& src, Mat\& dst,\par
  int sp, int sr, int minsize,\par
  TermCriteria criteria = TermCriteria(TermCriteria::MAX\_ITER\par
  + TermCriteria::EPS, 5, 1));}

\begin{description}
\cvarg{src}{Source image. Only 8UC4 images are supported for now.}
\cvarg{dst}{Segmented image. Will have the same size and type as \texttt{src}.}
\cvarg{sp}{Spatial window radius.}
\cvarg{sr}{Color window radius.}
\cvarg{minsize}{Minimum segment size. Smaller segments will be merged.}
\cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
\end{description}


\cvCppFunc{gpu::integral}
Computes the integral image and squared integral image.

\cvdefCpp{void integral(const GpuMat\& src, GpuMat\& sum);\newline
void integral(const GpuMat\& src, GpuMat\& sum, GpuMat\& sqsum);}

\begin{description}
\cvarg{src}{Source image. Only 8UC1 images are supported for now.}
\cvarg{sum}{Integral image. Will contain 32-bit unsigned integer values packed into 32SC1.}
\cvarg{sqsum}{Squared integral image. Will have 32FC1 type.}
\end{description}
See also: \cvCppCross{integral}.


\cvCppFunc{gpu::sqrIntegral}
Computes squared integral image.

\cvdefCpp{void sqrIntegral(const GpuMat\& src, GpuMat\& sqsum);}
\begin{description}
\cvarg{src}{Source image. Only 8UC1 images are supported for now.}
\cvarg{sqsum}{Squared integral image. Will contain 64-bit floating point values packed into 64U.}
\end{description}


\cvCppFunc{gpu::columnSum}
Computes vertical (column) sum.

\cvdefCpp{void columnSum(const GpuMat\& src, GpuMat\& sum);}
\begin{description}
\cvarg{src}{Source image. Only 32FC1 images are supported for now.}
\cvarg{sum}{Destination image. Will have 32FC1 type.}
\end{description}


\cvCppFunc{gpu::cornerHarris}
Computes Harris cornerness criteria at each image pixel.

\cvdefCpp{void cornerHarris(const GpuMat\& src, GpuMat\& dst,\par
  int blockSize, int ksize, double k,\par
  int borderType=BORDER\_REFLECT101);}

\begin{description}
\cvarg{src}{Source image. Only 8UC1 and 32FC1 images are supported for now.}
\cvarg{dst}{Destination image. Will have the same size and 32FC1 type and contain cornerness values.}
\cvarg{blockSize}{Neighborhood size.}
\cvarg{ksize}{Aperture parameter for the Sobel operator.}
\cvarg{k}{Harris detector free parameter.}
\cvarg{borderType}{Pixel extrapolation method. Only \texttt{BORDER\_REFLECT101} and \texttt{BORDER\_REPLICATE} are supported for now.}
\end{description}
See also: \cvCppCross{cornerHarris}.


\cvCppFunc{gpu::cornerMinEigenVal}
Computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria.

\cvdefCpp{void cornerMinEigenVal(const GpuMat\& src, GpuMat\& dst,\par
  int blockSize, int ksize,\par
  int borderType=BORDER\_REFLECT101);}

\begin{description}
\cvarg{src}{Source image. Only 8UC1 and 32FC1 images are supported for now.}
\cvarg{dst}{Destination image. Will have the same size and 32FC1 type and contain cornerness values.}
\cvarg{blockSize}{Neighborhood size.}
\cvarg{ksize}{Aperture parameter for the Sobel operator.}
\cvarg{borderType}{Pixel extrapolation method. Only \texttt{BORDER\_REFLECT101} and \texttt{BORDER\_REPLICATE} are supported for now.}
\end{description}
See also: \cvCppCross{cornerMinEigenVal}.


\cvCppFunc{gpu::mulSpectrums}
Performs per-element multiplication of two Fourier spectrums.

\cvdefCpp{void mulSpectrums(const GpuMat\& a, const GpuMat\& b,\par
  GpuMat\& c, int flags, bool conjB=false);}

\begin{description}
\cvarg{a}{First spectrum.}
\cvarg{b}{Second spectrum. Must have the same size and type as \texttt{a}.}
\cvarg{c}{Destination spectrum.}
\cvarg{flags}{Mock parameter is kept for CPU/GPU interfaces similarity.}
\cvarg{conjB}{Optional flag indicates if the second spectrum must be conjugated before the multiplication.}
\end{description}

Only full (i.e. not packed) 32FC2 complex spectrums in the interleaved format are supported for now.

See also: \cvCppCross{mulSpectrums}.


\cvCppFunc{gpu::mulAndScaleSpectrums}
Performs per-element multiplication of two Fourier spectrums and scales the result.

\cvdefCpp{void mulAndScaleSpectrums(const GpuMat\& a, const GpuMat\& b,\par
  GpuMat\& c, int flags, float scale, bool conjB=false);}

\begin{description}
\cvarg{a}{First spectrum.}
\cvarg{b}{Second spectrum. Must have the same size and type as \texttt{a}.}
\cvarg{c}{Destination spectrum.}
\cvarg{flags}{Mock parameter is kept for CPU/GPU interfaces similarity.}
\cvarg{scale}{Scale constant.}
\cvarg{conjB}{Optional flag indicates if the second spectrum must be conjugated before the multiplication.}
\end{description}

Only full (i.e. not packed) 32FC2 complex spectrums in the interleaved format are supported for now.

See also: \cvCppCross{mulSpectrums}.


\cvCppFunc{gpu::dft}
Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.

\cvdefCpp{void dft(const GpuMat\& src, GpuMat\& dst, Size dft\_size, int flags=0);}

\begin{description}
\cvarg{src}{Real or complex source matrix.}
\cvarg{dst}{Real or complex destination matrix.}
\cvarg{dft\_size}{Size of discrete Fourier transform.}
\cvarg{flags}{Optional flags:
\begin{description}
  \cvarg{DFT\_ROWS}{Transform each individual row of the source matrix.}
  \cvarg{DFT\_SCALE}{Scale the result: divide it by the number of elements in the transform (it's obtained from \texttt{dft\_size}).}
  \cvarg{DFT\_INVERSE}{Inverse DFT must be performed for complex-complex case (real-complex and  complex-real cases are respectively forward and inverse always).}
  \cvarg{DFT\_REAL\_OUTPUT}{The source matrix is the result of real-complex transform and the destination matrix must be real.}
\end{description}}
\end{description}

The source matrix should be continuous, otherwise reallocation and data copying will be performed. Function chooses the operation mode depending on the flags, size and channel count of the source matrix:
\begin{itemize}
  \item If the source matrix is complex and the output isn't specified as real then the destination matrix will be complex, will have \texttt{dft\_size} size and 32FC2 type. It will contain full result of the DFT (forward or inverse).
  \item If the source matrix is complex and the output is specified as real then function assumes that its input is the result of the forward transform (see next item). The destination matrix will have \texttt{dft\_size} size and 32FC1 type. It will contain result of the inverse DFT.
  \item If the source matrix is real (i.e. its type is 32FC1) then forward DFT will be performed. The result of the DFT will be packed into complex (32FC2) matrix so its width will be \texttt{dft\_size.width / 2 + 1}, but if the source is a single column then height will be reduced.
\end{itemize}

See also: \cvCppCross{dft}.


\cvCppFunc{gpu::convolve}
Computes convolution (or cross-correlation) of two images.

\cvdefCpp{void convolve(const GpuMat\& image, const GpuMat\& templ, GpuMat\& result,\par
  bool ccorr=false);\newline
void convolve(const GpuMat\& image, const GpuMat\& templ, GpuMat\& result,\par
  bool ccorr, ConvolveBuf\& buf);}

\begin{description}
\cvarg{image}{Source image. Only 32FC1 images are supported for now.}
\cvarg{templ}{Template image. Must have size not greater than \texttt{image} size and be the same type as \texttt{image}.}
\cvarg{result}{Result image. Will have the same size and type as \texttt{image}.}
\cvarg{ccorr}{Indicates that cross-correlation must be evaluated instead of convolution.}
\cvarg{buf}{Optional buffer to decrease memory reallocation count (for many calls with the same sizes).}
\end{description}


\cvclass{gpu::ConvolveBuf}
Memory buffer for the \cvCppCross{gpu::convolve} function.

\begin{lstlisting}
struct CV_EXPORTS ConvolveBuf
{
    ConvolveBuf() {}
    ConvolveBuf(Size image_size, Size templ_size) 
        { create(image_size, templ_size); }
    void create(Size image_size, Size templ_size);

private:
    // Hidden
};
\end{lstlisting}


\cvCppFunc{gpu::ConvolveBuf::ConvolveBuf}

\cvdefCpp{ConvolveBuf();}
Construct empty buffer which will be properly resized after first call of the convolve function.

\cvdefCpp{ConvolveBuf(Size image\_size, Size templ\_size);}
Construct buffer for the convolve function with the respective arguments.


\cvCppFunc{gpu::matchTemplate}
Computes the proximity map for the raster template and the image where the template is searched for.

\cvdefCpp{void matchTemplate(const GpuMat\& image, const GpuMat\& templ,\par
  GpuMat\& result, int method);}

\begin{description}
\cvarg{image}{Source image. 32F and 8U images (1..4 channels) are supported for now.}
\cvarg{templ}{Template image. Must have the same type as \texttt{image} and a size not greater than the \texttt{image} size.}
\cvarg{result}{A map of comparison results (32FC1). If \texttt{image} is $W \times H$ and
\texttt{templ} is $w \times h$ then \texttt{result} must be $(W-w+1) \times (H-h+1)$.}
\cvarg{method}{Specifies the way the template must be compared with the image.}
\end{description}

Following methods are supported for 8U images for now:
\begin{itemize}
\item CV\_TM\_SQDIFF \item CV\_TM\_SQDIFF\_NORMED \item CV\_TM\_CCORR \item CV\_TM\_CCORR\_NORMED \item CV\_TM\_CCOEFF \item CV\_TM\_CCOEFF\_NORMED 
\end{itemize}\par
Following methods are supported for 32F images for now:
\begin{itemize}
\item CV\_TM\_SQDIFF \item CV\_TM\_CCORR
\end{itemize}

See also: \cvCppCross{matchTemplate}. 


\section{Matrix Reductions}


\cvCppFunc{gpu::sum}
Computes sum of array elements.

\cvdefCpp{Scalar sum(const GpuMat\& src);\newline
Scalar sum(const GpuMat\& src, GpuMat\& buf);}

\begin{description}
\cvarg{src}{Source image of any depth excepting 64F, single-channel.}
\cvarg{buf}{Optional buffer. It's resized automatically.}
\end{description}

See also: \cvCppCross{sum}.


\section{Object Detection}


\cvclass{gpu::HOGDescriptor}
Histogram of Oriented Gradients descriptor and detector.

\begin{lstlisting}
struct CV_EXPORTS HOGDescriptor
{
    enum { DEFAULT_WIN_SIGMA = -1 };
    enum { DEFAULT_NLEVELS = 64 };
    enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };

    HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
                  Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
                  int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
                  double threshold_L2hys=0.2, bool gamma_correction=true,
                  int nlevels=DEFAULT_NLEVELS);

    size_t getDescriptorSize() const;
    size_t getBlockHistogramSize() const;

    void setSVMDetector(const vector<float>& detector);

    static vector<float> getDefaultPeopleDetector();
    static vector<float> getPeopleDetector48x96();
    static vector<float> getPeopleDetector64x128();

    void detect(const GpuMat& img, vector<Point>& found_locations, 
                double hit_threshold=0, Size win_stride=Size(), 
                Size padding=Size());

    void detectMultiScale(const GpuMat& img, vector<Rect>& found_locations,
                          double hit_threshold=0, Size win_stride=Size(), 
                          Size padding=Size(), double scale0=1.05, 
                          int group_threshold=2);

    void getDescriptors(const GpuMat& img, Size win_stride, 
                        GpuMat& descriptors,
                        int descr_format=DESCR_FORMAT_COL_BY_COL);

    Size win_size;
    Size block_size;
    Size block_stride;
    Size cell_size;
    int nbins;
    double win_sigma;
    double threshold_L2hys;
    bool gamma_correction;
    int nlevels;

private:
    // Hidden
};
\end{lstlisting}

Interfaces of all methods are kept similar to CPU HOG descriptor and detector's analogues as much as possible.


\cvCppFunc{gpu::HOGDescriptor::HOGDescriptor}
Creates HOG descriptor and detector.

\cvdefCpp{HOGDescriptor(Size win\_size=Size(64, 128), Size block\_size=Size(16, 16),\par
          Size block\_stride=Size(8, 8), Size cell\_size=Size(8, 8),\par
          int nbins=9, double win\_sigma=DEFAULT\_WIN\_SIGMA,\par
          double threshold\_L2hys=0.2, bool gamma\_correction=true,\par
          int nlevels=DEFAULT\_NLEVELS);}

\begin{description}
\cvarg{win\_size}{Detection window size. Must be aligned to block size and block stride.}
\cvarg{block\_size}{Block size in cells. Only (2,2) is supported for now.}
\cvarg{block\_stride}{Block stride. Must be a multiple of cell size.}
\cvarg{cell\_size}{Cell size. Only (8, 8) is supported for now.}
\cvarg{nbins}{Number of bins. Only 9 bins per cell is supported for now.}
\cvarg{win\_sigma}{Gaussian smoothing window parameter.}
\cvarg{threshold\_L2hys}{L2-Hys normalization method shrinkage.}
\cvarg{gamma\_correction}{Do gamma correction preprocessing or not.}
\cvarg{nlevels}{Maximum number of detection window increases.}
\end{description}


\cvCppFunc{gpu::HOGDescriptor::getDescriptorSize}
Returns number of coefficients required for the classification.

\cvdefCpp{size\_t getDescriptorSize() const;}


\cvCppFunc{gpu::HOGDescriptor::getBlockHistogramSize}
Returns block histogram size.

\cvdefCpp{size\_t getBlockHistogramSize() const;}


\cvCppFunc{gpu::HOGDescriptor::setSVMDetector}
Sets coefficients for the linear SVM classifier. 

\cvdefCpp{void setSVMDetector(const vector<float>\& detector);}


\cvCppFunc{gpu::HOGDescriptor::getDefaultPeopleDetector}
Returns coefficients of the classifier trained for people detection (for default window size).

\cvdefCpp{static vector<float> getDefaultPeopleDetector();}


\cvCppFunc{gpu::HOGDescriptor::getPeopleDetector48x96}
Returns coefficients of the classifier trained for people detection (for 48x96 windows).

\cvdefCpp{static vector<float> getPeopleDetector48x96();}


\cvCppFunc{gpu::HOGDescriptor::getPeopleDetector64x128}
Returns coefficients of the classifier trained for people detection (for 64x128 windows).

\cvdefCpp{static vector<float> getPeopleDetector64x128();}


\cvCppFunc{gpu::HOGDescriptor::detect}
Performs object detection without increasing detection window.

\cvdefCpp{void detect(const GpuMat\& img, vector<Point>\& found\_locations,\par
             double hit\_threshold=0, Size win\_stride=Size(),\par
             Size padding=Size());}

\begin{description}
\cvarg{img}{Source image. 8UC1 and 8UC4 types are supported for now.}
\cvarg{found\_locations}{Will contain left-top corner points of detected objects boundaries.}
\cvarg{hit\_threshold}{The threshold for the distance between features and classifying plane. Usually it's 0, and should be specified in the detector coefficients (as the last free coefficient), but if the free coefficient is missed (it's allowed) you can specify it manually here.}
\cvarg{win\_stride}{Window stride. Must be a multiple of block stride.}
\cvarg{padding}{Mock parameter to keep CPU interface compatibility. Must be (0,0).}
\end{description}


\cvCppFunc{gpu::HOGDescriptor::detectMultiScale}
Performs object detection with increasing detection window.

\cvdefCpp{void detectMultiScale(const GpuMat\& img, vector<Rect>\& found\_locations,\par
                      double hit\_threshold=0, Size win\_stride=Size(),\par
                      Size padding=Size(), double scale0=1.05,\par
                      int group\_threshold=2);}

\begin{description}
\cvarg{img}{Source image. See \cvCppCross{gpu::HOGDescriptor::detect} for type limitations.}
\cvarg{found\_locations}{Will contain detected objects boundaries.}
\cvarg{hit\_threshold}{The threshold for the distance between features and classifying plane. See \cvCppCross{gpu::HOGDescriptor::detect} for details.}
\cvarg{win\_stride}{Window stride. Must be a multiple of block stride.}
\cvarg{padding}{Mock parameter to keep CPU interface compatibility. Must be (0,0).}
\cvarg{scale0}{Coefficient of the detection window increase.}
\cvarg{group\_threshold}{After detection some objects could be covered by many rectangles. This coefficient regulates similarity threshold. 0 means don't perform grouping.\newline
See \cvCppCross{groupRectangles}.}
\end{description}


\cvCppFunc{gpu::HOGDescriptor::getDescriptors}
Returns block descriptors computed for the whole image. 

\cvdefCpp{void getDescriptors(const GpuMat\& img, Size win\_stride,\par
                    GpuMat\& descriptors,\par
                    int descr\_format=DESCR\_FORMAT\_COL\_BY\_COL);}

\begin{description}
\cvarg{img}{Source image. See \cvCppCross{gpu::HOGDescriptor::detect} for type limitations.}
\cvarg{win\_stride}{Window stride. Must be a multiple of block stride.}
\cvarg{descriptors}{2D array of descriptors.}
\cvarg{descr\_format}{Descriptor storage format: 
\begin{description}
    \cvarg{DESCR\_FORMAT\_ROW\_BY\_ROW}{Row-major order.}
    \cvarg{DESCR\_FORMAT\_COL\_BY\_COL}{Column-major order.}
\end{description}}
\end{description}

\section{Feature detection and description}

\cvclass{gpu::SURFParams\_GPU}
Various SURF algorithm parameters. 

\begin{lstlisting}
struct SURFParams_GPU 
{
    SURFParams_GPU() :
        threshold(0.1f), 
        nOctaves(4),
        nIntervals(4),
        initialScale(2.f),

        l1(3.f/1.5f),
        l2(5.f/1.5f),
        l3(3.f/1.5f),
        l4(1.f/1.5f),
        edgeScale(0.81f),
        initialStep(1),

        extended(true),

        featuresRatio(0.01f)
    {
    }

    //! The interest operator threshold
    float threshold;
    //! The number of octaves to process
    int nOctaves;
    //! The number of intervals in each octave
    int nIntervals;
    //! The scale associated with the first interval of the first octave
    float initialScale;

    //! mask parameter l_1
    float l1;
    //! mask parameter l_2 
    float l2;
    //! mask parameter l_3
    float l3;
    //! mask parameter l_4
    float l4;
    //! The amount to scale the edge rejection mask
    float edgeScale;
    //! The initial sampling step in pixels.
    int initialStep;

    //! True, if generate 128-len descriptors, false - 64-len descriptors
    bool extended;

    //! max features = featuresRatio * img.size().area()
    float featuresRatio;
};
\end{lstlisting}

\cvclass{gpu::SURF\_GPU}
Class for extracting Speeded Up Robust Features from an image.

\begin{lstlisting}
class SURF_GPU : public SURFParams_GPU
{
public:
    //! returns the descriptor size in float's (64 or 128)
    int descriptorSize() const;

    //! upload host keypoints to device memory
    static void uploadKeypoints(const vector<KeyPoint>& keypoints, 
        GpuMat& keypointsGPU);
    //! download keypoints from device to host memory
    static void downloadKeypoints(const GpuMat& keypointsGPU, 
        vector<KeyPoint>& keypoints);

    //! download descriptors from device to host memory
    static void downloadDescriptors(const GpuMat& descriptorsGPU, 
        vector<float>& descriptors);
    
    void operator()(const GpuMat& img, const GpuMat& mask, 
        GpuMat& keypoints);
    
    void operator()(const GpuMat& img, const GpuMat& mask, 
        GpuMat& keypoints, GpuMat& descriptors, 
        bool useProvidedKeypoints = false, 
        bool calcOrientation = true);

    void operator()(const GpuMat& img, const GpuMat& mask, 
        std::vector<KeyPoint>& keypoints);

    void operator()(const GpuMat& img, const GpuMat& mask, 
        std::vector<KeyPoint>& keypoints, GpuMat& descriptors, 
        bool useProvidedKeypoints = false, 
        bool calcOrientation = true);
    
    void operator()(const GpuMat& img, const GpuMat& mask, 
        std::vector<KeyPoint>& keypoints, 
        std::vector<float>& descriptors, 
        bool useProvidedKeypoints = false, 
        bool calcOrientation = true);

    GpuMat sum;
    GpuMat sumf;

    GpuMat mask1;
    GpuMat maskSum;

    GpuMat hessianBuffer;
    GpuMat maxPosBuffer;
    GpuMat featuresBuffer;
};
\end{lstlisting}

The class \texttt{SURF\_GPU} implements Speeded Up Robust Features descriptor. There is fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option), but the descriptors can be also computed for the user-specified keypoints. Supports only 8 bit grayscale images.

The class \texttt{SURF\_GPU} can store results to GPU and CPU memory and provides static functions to convert results between CPU and GPU versions (\texttt{uploadKeypoints}, \texttt{downloadKeypoints}, \texttt{downloadDescriptors}). CPU results have the same format as \hyperref[cv.class.SURF]{cv::SURF} results. GPU results are stored in \texttt{GpuMat}. The \texttt{keypoints} matrix is a one-row matrix with \texttt{CV\_32FC6} type. It contains 6 float values per feature: \texttt{x, y, size, response, angle, octave}. The \texttt{descriptors} matrix is an \texttt{nFeatures} $\times$ \texttt{descriptorSize} matrix with \texttt{CV\_32FC1} type.

The class \texttt{SURF\_GPU} uses some buffers and provides access to them. All buffers can be safely released between function calls. 

\cvclass{gpu::BruteForceMatcher\_GPU}
Brute-force descriptor matcher. For each descriptor in the first set, this matcher finds the closest descriptor in the second set by trying each one. This descriptor matcher supports masking permissible matches between descriptor sets.

\begin{lstlisting}
template<class Distance>
class BruteForceMatcher_GPU
{
public:
    // Add descriptors to train descriptor collection.
    void add(const std::vector<GpuMat>& descCollection);

    // Get train descriptors collection.
    const std::vector<GpuMat>& getTrainDescriptors() const;

    // Clear train descriptors collection.
    void clear();

    // Return true if there are not train descriptors in collection.
    bool empty() const;

    // Return true if the matcher supports mask in match methods.
    bool isMaskSupported() const;

    void matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs,
        GpuMat& trainIdx, GpuMat& distance,
        const GpuMat& mask = GpuMat());

    static void matchDownload(const GpuMat& trainIdx, 
        const GpuMat& distance, std::vector<DMatch>& matches);

    void match(const GpuMat& queryDescs, const GpuMat& trainDescs, 
        std::vector<DMatch>& matches, const GpuMat& mask = GpuMat());

    void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection,
        const vector<GpuMat>& masks = std::vector<GpuMat>());

    void matchCollection(const GpuMat& queryDescs, 
        const GpuMat& trainCollection,
        GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
        const GpuMat& maskCollection);

    static void matchDownload(const GpuMat& trainIdx, GpuMat& imgIdx, 
        const GpuMat& distance, std::vector<DMatch>& matches);

    void match(const GpuMat& queryDescs, std::vector<DMatch>& matches,
        const std::vector<GpuMat>& masks = std::vector<GpuMat>());

    void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
        GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, 
        const GpuMat& mask = GpuMat());

    static void knnMatchDownload(const GpuMat& trainIdx, 
        const GpuMat& distance, std::vector< std::vector<DMatch> >& matches, 
        bool compactResult = false);

    void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
        std::vector< std::vector<DMatch> >& matches, int k, 
        const GpuMat& mask = GpuMat(), bool compactResult = false);
        
    void knnMatch(const GpuMat& queryDescs, 
        std::vector< std::vector<DMatch> >& matches, int knn,
        const std::vector<GpuMat>& masks = std::vector<GpuMat>(), 
        bool compactResult = false );

    void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
        GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, 
        float maxDistance, const GpuMat& mask = GpuMat());

    static void radiusMatchDownload(const GpuMat& trainIdx, 
        const GpuMat& nMatches, const GpuMat& distance, 
        std::vector< std::vector<DMatch> >& matches, 
        bool compactResult = false);

    void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
        std::vector< std::vector<DMatch> >& matches, float maxDistance,
        const GpuMat& mask = GpuMat(), bool compactResult = false);

    void radiusMatch(const GpuMat& queryDescs, 
        std::vector< std::vector<DMatch> >& matches, float maxDistance,
        const std::vector<GpuMat>& masks = std::vector<GpuMat>(), 
        bool compactResult = false);

private:
    std::vector<GpuMat> trainDescCollection;
};
\end{lstlisting}

The class \texttt{BruteForceMatcher\_GPU} has an interface similar to that of the class \hyperref[cv.class.DescriptorMatcher]{cv::DescriptorMatcher}. It has two groups of match methods: for matching descriptors of one image with another image or with an image set. Also, all functions have an alternative: save results to GPU memory or to CPU memory. \texttt{BruteForceMatcher\_GPU} is templated on the distance metric as \hyperref[cv.class.BruteForceMatcher]{cv::BruteForceMatcher}, but supports only \texttt{L1} and \texttt{L2} distance types.

\cvfunc{gpu::BruteForceMatcher\_GPU::match}\label{cppfunc.gpu.BruteForceMatcher.match}
Find the best match for each descriptor from a query set with train descriptors. This function is equivalent of \cvCppCross{DescriptorMatcher::match}.
\cvdefCpp{
void match(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par std::vector<DMatch>\& matches, \par const GpuMat\& mask = GpuMat());
}
\cvdefCpp{
void match(const GpuMat\& queryDescs, \par std::vector<DMatch>\& matches, \par const std::vector<GpuMat>\& masks = std::vector<GpuMat>());
}

\cvfunc{gpu::BruteForceMatcher\_GPU::matchSingle}\label{cppfunc.gpu.BruteForceMatcher.matchSingle}
Find one best match for each query descriptor. Results stored to GPU memory.
\cvdefCpp{
void matchSingle(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par GpuMat\& trainIdx, \par GpuMat\& distance, \par const GpuMat\& mask = GpuMat());
}
\begin{description}
\cvarg{queryDescs} {Query set of descriptors.}
\cvarg{trainDescs} {Train set of descriptors. This will not be added to train descriptors collection stored in class object.}
\cvarg{trainIdx} {One row \texttt{CV\_32SC1} matrix. Will contain best train index for each query. If some query descriptor masked out in \texttt{mask} it will contain -1.}
\cvarg{distance} {One row \texttt{CV\_32FC1} matrix. Will contain best distance for each query. If some query descriptor masked out in \texttt{mask} it will contain \texttt{FLT\_MAX}.}
\cvarg{mask}{Mask specifying permissible matches between input query and train matrices of descriptors.}
\end{description}

\cvfunc{gpu::BruteForceMatcher\_GPU::matchCollection}\label{cppfunc.gpu.BruteForceMatcher.matchCollection}
Find one best match for each query descriptor from train collection. Results stored to GPU memory.
\cvdefCpp{
void matchCollection(const GpuMat\& queryDescs, \par const GpuMat\& trainCollection, \par GpuMat\& trainIdx, \par GpuMat\& imgIdx, \par GpuMat\& distance, \par const GpuMat\& maskCollection);
}
\begin{description}
\cvarg{queryDescs} {Query set of descriptors.}
\cvarg{trainCollection} {\texttt{GpuMat} with train collection. It can be obtained from train descriptors collection that was set using \texttt{add} method by \hyperref[cppfunc.gpu.BruteForceMatcher.makeGpuCollection]{makeGpuCollection}. Or it can contain user defined collection. It must be one row matrix, each element is a \texttt{DevMem2D} that points to one train descriptors matrix (matrix must have \texttt{CV\_32FC1} type).}
\cvarg{trainIdx} {One row \texttt{CV\_32SC1} matrix. Will contain the best train index for each query. If a query descriptor is masked out in \texttt{maskCollection}, it will contain -1.}
\cvarg{imgIdx} {One row \texttt{CV\_32SC1} matrix. Will contain the train image index for each query. If a query descriptor is masked out in \texttt{maskCollection}, it will contain -1.}
\cvarg{distance} {One row \texttt{CV\_32FC1} matrix. Will contain the best distance for each query. If a query descriptor is masked out in \texttt{maskCollection}, it will contain \texttt{FLT\_MAX}.}
\cvarg{maskCollection}{\texttt{GpuMat} with set of masks. It can be obtained from \texttt{std::vector<GpuMat>} by \hyperref[cppfunc.gpu.BruteForceMatcher.makeGpuCollection]{makeGpuCollection}. Or it can contain user defined mask set. It must be empty matrix or one row matrix, each element is a \texttt{PtrStep} that points to one mask (must have \texttt{CV\_8UC1} type).}
\end{description}

\cvfunc{gpu::BruteForceMatcher\_GPU::makeGpuCollection}\label{cppfunc.gpu.BruteForceMatcher.makeGpuCollection}
Make gpu collection of train descriptors and masks in suitable format for \hyperref[cppfunc.gpu.BruteForceMatcher.matchCollection]{matchCollection} function.
\cvdefCpp{
void makeGpuCollection(GpuMat\& trainCollection, \par GpuMat\& maskCollection, \par const std::vector<GpuMat>\& masks = std::vector<GpuMat>());
}

\cvfunc{gpu::BruteForceMatcher\_GPU::matchDownload}\label{cppfunc.gpu.BruteForceMatcher.matchDownload}
Download \texttt{trainIdx}, \texttt{imgIdx} and \texttt{distance} matrices obtained by \hyperref[cppfunc.gpu.BruteForceMatcher.matchSingle]{matchSingle} or \hyperref[cppfunc.gpu.BruteForceMatcher.matchCollection]{matchCollection} to CPU vector with \hyperref[cv.class.DMatch]{cv::DMatch}.
\cvdefCpp{
static void matchDownload(const GpuMat\& trainIdx, \par const GpuMat\& distance, \par std::vector<DMatch>\& matches);
}
\cvdefCpp{
static void matchDownload(const GpuMat\& trainIdx, \par GpuMat\& imgIdx, \par const GpuMat\& distance, \par std::vector<DMatch>\& matches);
}

\cvfunc{gpu::BruteForceMatcher\_GPU::knnMatch}\label{cppfunc.gpu.BruteForceMatcher.knnMatch}
Find the k best matches for each descriptor from a query set with train descriptors. The k matches found (or fewer if k are not available) are returned in increasing order of distance. This function is equivalent to \cvCppCross{DescriptorMatcher::knnMatch}.
\cvdefCpp{
void knnMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par std::vector< std::vector<DMatch> >\& matches, \par int k, \par const GpuMat\& mask = GpuMat(), \par bool compactResult = false);
}
\cvdefCpp{
void knnMatch(const GpuMat\& queryDescs, \par std::vector< std::vector<DMatch> >\& matches, \par int k, \par const std::vector<GpuMat>\& masks = std::vector<GpuMat>(), \par bool compactResult = false );
}

\cvfunc{gpu::BruteForceMatcher\_GPU::knnMatch}\label{cppfunc.gpu.BruteForceMatcher.knnMatchSingle}
Find the k best matches for each descriptor from a query set with train descriptors. Found k (or less if not possible) matches are returned in distance increasing order. Results stored to GPU memory.
\cvdefCpp{
void knnMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par GpuMat\& trainIdx, \par GpuMat\& distance, \par GpuMat\& allDist, \par int k, \par const GpuMat\& mask = GpuMat());
}
\begin{description}
\cvarg{queryDescs} {Query set of descriptors.}
\cvarg{trainDescs} {Train set of descriptors. This will not be added to train descriptors collection stored in class object.}
\cvarg{trainIdx} {Matrix \texttt{nQuery} x \texttt{k} with type \texttt{CV\_32SC1}. \texttt{trainIdx.at<int>(queryIdx, i)} will contain the index of the i'th best train descriptor. If a query descriptor is masked out in \texttt{mask}, it will contain -1.}
\cvarg{distance} {Matrix \texttt{nQuery} x \texttt{k} with type \texttt{CV\_32FC1}. Will contain the distance between each query and its i'th best train descriptor. If a query descriptor is masked out in \texttt{mask}, it will contain \texttt{FLT\_MAX}.}
\cvarg{allDist} {Buffer to store all distances between query descriptors and train descriptors. It has size \texttt{nQuery} x \texttt{nTrain} and \texttt{CV\_32F} type. \texttt{allDist.at<float>(queryIdx, trainIdx)} will contain \texttt{FLT\_MAX} if \texttt{trainIdx} is one of the k best; otherwise it will contain the distance between the \texttt{queryIdx} and \texttt{trainIdx} descriptors.}
\cvarg{k}{Number of best matches to find for each query descriptor (or fewer if that is not possible).}
\cvarg{mask}{Mask specifying permissible matches between input query and train matrices of descriptors.}
\end{description}

\cvfunc{gpu::BruteForceMatcher\_GPU::knnMatchDownload}\label{cppfunc.gpu.BruteForceMatcher.knnMatchDownload}
Download \texttt{trainIdx} and \texttt{distance} matrices obtained by \hyperref[cppfunc.gpu.BruteForceMatcher.knnMatchSingle]{knnMatch} to CPU vector with \hyperref[cv.class.DMatch]{cv::DMatch}. If \texttt{compactResult} is true \texttt{matches} vector will not contain matches for fully masked out query descriptors.
\cvdefCpp{
static void knnMatchDownload(const GpuMat\& trainIdx, \par const GpuMat\& distance, \par std::vector< std::vector<DMatch> >\& matches, \par bool compactResult = false);
}

\cvfunc{gpu::BruteForceMatcher\_GPU::radiusMatch}\label{cppfunc.gpu.BruteForceMatcher.radiusMatch}
Find the best matches for each query descriptor which have distance less than given threshold. Found matches are returned in increasing order of distance. This function is equivalent to \cvCppCross{DescriptorMatcher::radiusMatch}. Works only on devices with Compute Capability \texttt{>=} 1.1.
\cvdefCpp{
void radiusMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par std::vector< std::vector<DMatch> >\& matches, \par float maxDistance, \par const GpuMat\& mask = GpuMat(), \par bool compactResult = false);
}
\cvdefCpp{
void radiusMatch(const GpuMat\& queryDescs, \par std::vector< std::vector<DMatch> >\& matches, \par float maxDistance, \par const std::vector<GpuMat>\& masks = std::vector<GpuMat>(), \par bool compactResult = false);
}

\cvfunc{gpu::BruteForceMatcher\_GPU::radiusMatch}\label{cppfunc.gpu.BruteForceMatcher.radiusMatchSingle}
Find the best matches for each query descriptor which have distance less than given threshold. Results are stored to GPU memory and are not sorted in increasing order of distance. Works only on devices with Compute Capability \texttt{>=} 1.1.
\cvdefCpp{
void radiusMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par GpuMat\& trainIdx, \par GpuMat\& nMatches, \par GpuMat\& distance, \par float maxDistance, \par const GpuMat\& mask = GpuMat());
}
\begin{description}
\cvarg{queryDescs} {Query set of descriptors.}
\cvarg{trainDescs} {Train set of descriptors. This will not be added to train descriptors collection stored in class object.}
\cvarg{trainIdx} {\texttt{trainIdx.at<int>(queryIdx, i)} will contain the i'th train index \newline\texttt{(i < min(nMatches.at<unsigned int>(0, queryIdx), trainIdx.cols))}. If \texttt{trainIdx} is empty, it will be created with size \texttt{nQuery} x \texttt{nTrain}. Otherwise it can be allocated by the user (it must have \texttt{nQuery} rows and \texttt{CV\_32SC1} type). The number of columns can be less than \texttt{nTrain}, but then the matcher may not find all matches, because it does not have enough memory to store the results.}
\cvarg{nMatches} {\texttt{nMatches.at<unsigned int>(0, queryIdx)} will contain the match count for \texttt{queryIdx}. Note that \texttt{nMatches} can be greater than \texttt{trainIdx.cols}; this means that the matcher did not find all matches, because it did not have enough memory.}
\cvarg{distance} {\texttt{distance.at<float>(queryIdx, i)} will contain the i'th distance \newline\texttt{(i < min(nMatches.at<unsigned int>(0, queryIdx), trainIdx.cols))}. If \texttt{trainIdx} is empty, \texttt{distance} will be created with size \texttt{nQuery} x \texttt{nTrain}. Otherwise it must also be allocated by the user (it must have the same size as \texttt{trainIdx} and \texttt{CV\_32FC1} type).}
\cvarg{maxDistance}{The distance threshold for found matches.}
\cvarg{mask}{Mask specifying permissible matches between input query and train matrices of descriptors.}
\end{description}

\cvfunc{gpu::BruteForceMatcher\_GPU::radiusMatchDownload}\label{cppfunc.gpu.BruteForceMatcher.radiusMatchDownload}
Download \texttt{trainIdx}, \texttt{nMatches} and \texttt{distance} matrices obtained by \hyperref[cppfunc.gpu.BruteForceMatcher.radiusMatchSingle]{radiusMatch} to CPU vector with \hyperref[cv.class.DMatch]{cv::DMatch}. If \texttt{compactResult} is true \texttt{matches} vector will not contain matches for fully masked out query descriptors.
\cvdefCpp{
static void radiusMatchDownload(const GpuMat\& trainIdx, \par const GpuMat\& nMatches, \par const GpuMat\& distance, \par std::vector< std::vector<DMatch> >\& matches, \par bool compactResult = false);
}

\fi