update docs

minor fixes and refactoring of GPU module
2024-11-28 13:10:12 +08:00 · 2011-02-16 08:31:45 +00:00 · 2011-02-16 08:31:45 +00:00 · 54fa600b9e
commit 54fa600b9e
parent 7d42dbdd71
16 changed files with 944 additions and 901 deletions
--- a/doc/gpu_features2d.tex
+++ b/doc/gpu_features2d.tex
@ -1,7 +1,55 @@
 \section{Feature Detection and Description}
-\cvclass{gpu::SURF\_GPU}
+\cvclass{gpu::SURFParams\_GPU}\label{class.gpu.SURFParams}
 Various SURF algorithm parameters.
 \begin{lstlisting}
 struct SURFParams_GPU 
 {
    //! Initializes every parameter with its default value
    SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4), 
        initialScale(2.f), l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f), 
        l4(1.f/1.5f), edgeScale(0.81f), initialStep(1), extended(true), 
        featuresRatio(0.01f) {}
    //! The interest operator threshold
    float threshold;
    //! The number of octaves to process
    int nOctaves;
    //! The number of intervals in each octave
    int nIntervals;
    //! The scale associated with the first interval of the first octave
    float initialScale;
    //! Mask parameter l_1
    float l1;
    //! Mask parameter l_2
    float l2;
    //! Mask parameter l_3
    float l3;
    //! Mask parameter l_4
    float l4;
    //! The amount to scale the edge rejection mask
    float edgeScale;
    //! The initial sampling step in pixels
    int initialStep;
    //! If true, 128-element descriptors are generated; if false, 64-element descriptors
    bool extended;
    //! Maximum number of features = featuresRatio * img.size().area()
    float featuresRatio;
 };
 \end{lstlisting}
 In contrast to \hyperref[cv.class.SURF]{cv::SURF}, \texttt{SURF\_GPU} works with float sources (with range [0..1]). It performs the conversion after the integral is calculated, by dividing the result by 255. Please take this into consideration when changing some parameters (such as the Hessian threshold).
 The current \texttt{SURF\_GPU} implementation supports a number of intervals per octave in the range [3..21].
 See also: \hyperref[class.gpu.SURF]{cv::gpu::SURF\_GPU}.
 \cvclass{gpu::SURF\_GPU}\label{class.gpu.SURF}
 Class for extracting Speeded Up Robust Features from an image.
 \begin{lstlisting}
@ -62,7 +110,7 @@ The class \texttt{SURF\_GPU} can store results to GPU and CPU memory and provide
 The class \texttt{SURF\_GPU} uses some buffers and provides access to them. All buffers can be safely released between function calls. 
-See also: \hyperref[cv.class.SURF]{cv::SURF}.
+See also: \hyperref[cv.class.SURF]{cv::SURF}, \hyperref[class.gpu.SURFParams]{cv::gpu::SURFParams\_GPU}.
 \cvclass{gpu::BruteForceMatcher\_GPU}
@ -269,7 +317,7 @@ void radiusMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par
 void radiusMatch(const GpuMat\& queryDescs, \par std::vector< std::vector<DMatch> >\& matches, \par float maxDistance, \par const std::vector<GpuMat>\& masks = std::vector<GpuMat>(), \par bool compactResult = false);
 }
-This function works only on devices with Compute Capability $>=$ 1.1.
+\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.1.
 See also: \cvCppCross{DescriptorMatcher::radiusMatch}.
@ -293,7 +341,8 @@ void radiusMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par
 In contrast to \hyperref[cppfunc.gpu.BruteForceMatcher.radiusMatch]{cv::gpu::BruteForceMatcher\_GPU::radiusMatch}, the results are not sorted in order of increasing distance.
-This function works only on devices with Compute Capability $>=$ 1.1.
+\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.1.
 \cvfunc{cv::gpu::BruteForceMatcher\_GPU::radiusMatchDownload}\label{cppfunc.gpu.BruteForceMatcher.radiusMatchDownload}
 Downloads the \texttt{trainIdx}, \texttt{nMatches} and \texttt{distance} matrices obtained via \hyperref[cppfunc.gpu.BruteForceMatcher.radiusMatchSingle]{radiusMatch} to a CPU vector of \hyperref[cv.class.DMatch]{cv::DMatch}. If \texttt{compactResult} is true, the \texttt{matches} vector will not contain matches for fully masked-out query descriptors.
--- a/doc/gpu_image_processing.tex
+++ b/doc/gpu_image_processing.tex
@ -17,6 +17,8 @@ Performs mean-shift filtering for each point of the source image. It maps each p
 \cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
 \end{description}
 \textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2.
 \cvCppFunc{gpu::meanShiftProc}
 Performs mean-shift procedure and stores information about processed points (i.e. their colors and positions) into two images. 
@ -35,6 +37,8 @@ Performs mean-shift procedure and stores information about processed points (i.e
 \cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
 \end{description}
 \textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2.
 See also: \cvCppCross{gpu::meanShiftFiltering}.
@ -55,6 +59,8 @@ Performs mean-shift segmentation of the source image and eleminates small segmen
 \cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
 \end{description}
 \textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2.
 \cvCppFunc{gpu::integral}
 Computes integral image and squared integral image.
@ -319,7 +325,7 @@ double threshold(const GpuMat\& src, GpuMat\& dst, double thresh, \par double ma
 }
 \begin{description}
-\cvarg{src}{Source array (single-channel, \texttt{CV\_64F} depth isn't supported).}
+\cvarg{src}{Source array (single-channel).}
 \cvarg{dst}{Destination array; will have the same size and the same type as \texttt{src}.}
 \cvarg{thresh}{Threshold value.}
 \cvarg{maxVal}{Maximum value to use with \texttt{THRESH\_BINARY} and \texttt{THRESH\_BINARY\_INV} thresholding types.}
--- a/doc/opencv.pdf
+++ b/doc/opencv.pdf
--- a/modules/gpu/src/cuda/brute_force_matcher.cu
+++ b/modules/gpu/src/cuda/brute_force_matcher.cu
@ -582,10 +582,10 @@ namespace cv { namespace gpu { namespace bfmatcher
    }
    ///////////////////////////////////////////////////////////////////////////////
-    // Match kernel chooser
+    // Match caller
    template <typename Dist, typename T, typename Train, typename Mask>
-    void match_chooser(const DevMem2D_<T>& queryDescs, const Train& train, 
+    void matchDispatcher(const DevMem2D_<T>& queryDescs, const Train& train, 
        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        bool cc_12)
    {
@ -616,11 +616,11 @@ namespace cv { namespace gpu { namespace bfmatcher
        if (mask.data)
        {
            SingleMask m(mask);
-            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
        }
        else
        {
-            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
        }
    }
@ -640,11 +640,11 @@ namespace cv { namespace gpu { namespace bfmatcher
        if (mask.data)
        {
            SingleMask m(mask);
-            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
        }
        else
        {
-            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
        }
    }
@ -664,11 +664,11 @@ namespace cv { namespace gpu { namespace bfmatcher
        if (maskCollection.data)
        {
            MaskCollection mask(maskCollection.data);
-            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
        }
        else
        {
-            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
        }
    }
@ -688,11 +688,11 @@ namespace cv { namespace gpu { namespace bfmatcher
        if (maskCollection.data)
        {
            MaskCollection mask(maskCollection.data);
-            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
        }
        else
        {
-            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
        }
    }
@ -942,22 +942,35 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // knn match caller
    // Single entry point used by the knn matchers to compute distances.
    // Forwards all arguments unchanged to calcDistance_caller, fixing its two
    // leading integer template arguments to 16 and 16 so that callers only
    // have to name the distance functor Dist.
    // NOTE(review): the <16, 16> pair presumably selects the CUDA block
    // dimensions - confirm against calcDistance_caller.
    //   queryDescs, trainDescs - descriptor matrices with element type T
    //   mask                   - mask object (callers pass SingleMask or WithOutMask)
    //   allDist                - float matrix handed to calcDistance_caller;
    //                            presumably receives the computed distances - confirm
    template <typename Dist, typename T, typename Mask>
    void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, 
        const Mask& mask, const DevMem2Df& allDist)
    {
        calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist);
    }
    // Single entry point used by the knn matchers for the match-selection step.
    // Forwards all arguments unchanged to findKnnMatch_caller, fixing its
    // integer template argument to 256.
    // NOTE(review): 256 is presumably the CUDA block size - confirm against
    // findKnnMatch_caller.
    //   knn      - value forwarded as the first argument; presumably the number
    //              of nearest neighbours to select - confirm
    //   trainIdx - int matrix forwarded unchanged
    //   distance - float matrix forwarded unchanged
    //   allDist  - float matrix forwarded unchanged (the knn matchers pass the
    //              same matrix they gave to calcDistanceDispatcher)
    void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, 
        const DevMem2Df& allDist)
    {
        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
    }
    template <typename T>
    void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
    {
        if (mask.data)
        {
-            calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            calcDistanceDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                SingleMask(mask), allDist);
        }
        else
        {
-            calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            calcDistanceDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                WithOutMask(), allDist);
        }
-        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
+        findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
    }
    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
@ -973,16 +986,16 @@ namespace cv { namespace gpu { namespace bfmatcher
    {
        if (mask.data)
        {
-            calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                SingleMask(mask), allDist);
        }
        else
        {
-            calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                WithOutMask(), allDist);
        }
-        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
+        findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
    }
    template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
@ -1061,7 +1074,16 @@ namespace cv { namespace gpu { namespace bfmatcher
    }
    ///////////////////////////////////////////////////////////////////////////////
-    // Radius Match kernel chooser
+    // Radius Match caller
    // Single entry point used by the radius matchers.
    // Forwards all arguments unchanged to radiusMatch_caller, fixing its two
    // leading integer template arguments to 16 and 16 so that callers only
    // have to name the distance functor Dist.
    // NOTE(review): the <16, 16> pair presumably selects the CUDA block
    // dimensions - confirm against radiusMatch_caller.
    //   queryDescs, trainDescs - descriptor matrices with element type T
    //   maxDistance            - distance limit forwarded to the caller
    //   mask                   - mask object (callers pass SingleMask or WithOutMask)
    //   trainIdx, nMatches,
    //   distance               - buffers forwarded unchanged; presumably the
    //                            outputs of the match - confirm
    template <typename Dist, typename T, typename Mask>
    void radiusMatchDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, 
        float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, 
        const DevMem2Df& distance)
    {
        radiusMatch_caller<16, 16, Dist>(queryDescs, trainDescs, maxDistance, mask, 
            trainIdx, nMatches, distance);
    }
    template <typename T>
    void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
@ -1069,12 +1091,12 @@ namespace cv { namespace gpu { namespace bfmatcher
    {
        if (mask.data)
        {
-            radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            radiusMatchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
        }
        else
        {
-            radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            radiusMatchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                maxDistance, WithOutMask(), trainIdx, nMatches, distance);
        }
    }
@ -1092,12 +1114,12 @@ namespace cv { namespace gpu { namespace bfmatcher
    {
        if (mask.data)
        {
-            radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
        }
        else
        {
-            radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                maxDistance, WithOutMask(), trainIdx, nMatches, distance);
        }
    }
--- a/modules/gpu/src/cuda/color.cu
+++ b/modules/gpu/src/cuda/color.cu
--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@ -190,6 +190,9 @@ void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src,
 void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
 {
    CV_Assert((src.depth() != CV_64F) || 
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
    typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);
    static const set_caller_t set_callers[] =
    {
@ -201,6 +204,11 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
 void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
 {
    CV_Assert((src.depth() != CV_64F) || 
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
    CV_Assert(mask.type() == CV_8UC1);
    typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream);
    static const set_caller_t set_callers[] =
    {
@ -212,6 +220,9 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
 void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
 {
    CV_Assert((src.depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) || 
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
    bool noScale = fabs(alpha-1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();
    if( rtype < 0 )
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@ -626,6 +626,10 @@ namespace
 void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) 
 {
    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
    CV_Assert((src1.depth() != CV_64F) || 
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -637,6 +641,10 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
 void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream) 
 { 
    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
    CV_Assert((src1.depth() != CV_64F) || 
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -648,6 +656,9 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str
 void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst) 
 {
    CV_Assert((src1.depth() != CV_64F) || 
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -659,6 +670,9 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
 void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream) 
 {
    CV_Assert((src1.depth() != CV_64F) || 
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -670,6 +684,10 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st
 void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) 
 { 
    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
    CV_Assert((src1.depth() != CV_64F) || 
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -681,6 +699,10 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
 void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream) 
 { 
    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
    CV_Assert((src1.depth() != CV_64F) || 
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -692,6 +714,9 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str
 void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst) 
 {
    CV_Assert((src1.depth() != CV_64F) || 
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -703,6 +728,9 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
 void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream) 
 {
    CV_Assert((src1.depth() != CV_64F) || 
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -749,6 +777,9 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
    }
    else
    {
        CV_Assert((src.depth() != CV_64F) || 
            (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
        typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, 
            cudaStream_t stream);
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@ -205,6 +205,9 @@ namespace
 void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double beta ) const
 {
    CV_Assert((depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) || 
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
    bool noScale = fabs(alpha-1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();
    if( rtype < 0 )
@ -428,6 +431,9 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
 {
    CV_Assert(mask.type() == CV_8UC1);
    CV_Assert((depth() != CV_64F) || 
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
    CV_DbgAssert(!this->empty());
    NppiSize sz;
--- a/modules/gpu/src/opencv2/gpu/device/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp
@ -393,11 +393,37 @@ namespace cv
            }
        };
        // Compile-time switch for unary transforms: UseSmartUn<T, D>::value is
        // passed as the bool template argument of TransformDispatcher.
        //
        // Primary template: for any channel-count combination not covered by
        // the specialization below, value is false.
        template <typename T, typename D, int scn, int dcn> struct UseSmartUn_
        {
            static const bool value = false;
        };
        // Specialization for a single-channel source and a single-channel
        // destination: value is true unless UnReadWriteTraits<T, D>::shift is 1.
        // NOTE(review): because shift is referenced only here,
        // UnReadWriteTraits<T, D> is presumably never instantiated for
        // multi-channel types - confirm that this is the intent.
        template <typename T, typename D> struct UseSmartUn_<T, D, 1, 1>
        {
            static const bool value = device::UnReadWriteTraits<T, D>::shift != 1;
        };
        // Public front end: reads the channel counts from VecTraits<T>::cn and
        // VecTraits<D>::cn and delegates to UseSmartUn_.
        template <typename T, typename D> struct UseSmartUn
        {
            static const bool value = UseSmartUn_<T, D, device::VecTraits<T>::cn, device::VecTraits<D>::cn>::value;
        };
        // Compile-time switch for binary transforms: UseSmartBin<T1, T2, D>::value
        // is passed as the bool template argument of TransformDispatcher.
        //
        // Primary template: for any channel-count combination not covered by
        // the specialization below, value is false.
        template <typename T1, typename T2, typename D, int src1cn, int src2cn, int dstcn> struct UseSmartBin_
        {
            static const bool value = false;
        };
        // Specialization for two single-channel sources and a single-channel
        // destination: value is true unless BinReadWriteTraits<T1, T2, D>::shift is 1.
        // NOTE(review): because shift is referenced only here,
        // BinReadWriteTraits<T1, T2, D> is presumably never instantiated for
        // multi-channel types - confirm that this is the intent.
        template <typename T1, typename T2, typename D> struct UseSmartBin_<T1, T2, D, 1, 1, 1>
        {
            static const bool value = device::BinReadWriteTraits<T1, T2, D>::shift != 1;
        };
        // Public front end: reads the channel counts from VecTraits<T1>::cn,
        // VecTraits<T2>::cn and VecTraits<D>::cn and delegates to UseSmartBin_.
        template <typename T1, typename T2, typename D> struct UseSmartBin
        {
            static const bool value = UseSmartBin_<T1, T2, D, device::VecTraits<T1>::cn, device::VecTraits<T2>::cn, device::VecTraits<D>::cn>::value;
        };
        template <typename T, typename D, typename UnOp, typename Mask>
        static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, 
            cudaStream_t stream = 0)
        {
-            TransformDispatcher<device::VecTraits<T>::cn == 1 && device::VecTraits<D>::cn == 1 && device::UnReadWriteTraits<T, D>::shift != 1>::call(src, dst, op, mask, stream);
+            TransformDispatcher< UseSmartUn<T, D>::value >::call(src, dst, op, mask, stream);
        }
        template <typename T, typename D, typename UnOp>
@ -416,7 +442,7 @@ namespace cv
        static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, 
            BinOp op, const Mask& mask, cudaStream_t stream = 0)
        {
-            TransformDispatcher<device::VecTraits<T1>::cn == 1 && device::VecTraits<T2>::cn == 1 && device::VecTraits<D>::cn == 1 && device::BinReadWriteTraits<T1, T2, D>::shift != 1>::call(src1, src2, dst, op, mask, stream);
+            TransformDispatcher< UseSmartBin<T1, T2, D>::value >::call(src1, src2, dst, op, mask, stream);
        }
        template <typename T1, typename T2, typename D, typename BinOp>
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@ -681,3 +681,66 @@ TEST(erode)
        GPU_OFF;
    }
 }
 TEST(threshold)
 {
    // One benchmark configuration: matrix depth, thresholding mode and the
    // suffix appended to the subtest title.
    struct ThresholdCase
    {
        int depth;
        int mode;
        const char* titleSuffix;
    };
    static const ThresholdCase cases[] =
    {
        { CV_8U,  THRESH_TRUNC,  ", 8U, THRESH_TRUNC"  },
        { CV_8U,  THRESH_BINARY, ", 8U, THRESH_BINARY" },
        { CV_32F, THRESH_TRUNC,  ", 32F, THRESH_TRUNC" }
    };
    const int caseCount = sizeof(cases) / sizeof(cases[0]);

    Mat hostSrc, hostDst;
    gpu::GpuMat devSrc, devDst;

    // Configurations run one after another; each one sweeps the square
    // image sizes 2000, 3000 and 4000 before the next one starts.
    for (int k = 0; k < caseCount; ++k)
    {
        const ThresholdCase& cur = cases[k];

        for (int size = 2000; size <= 4000; size += 1000)
        {
            SUBTEST << "size " << size << cur.titleSuffix;

            gen(hostSrc, size, size, cur.depth, 0, 100);
            hostDst.create(size, size, cur.depth);

            // CPU reference timing
            CPU_ON;
            threshold(hostSrc, hostDst, 50.0, 0.0, cur.mode);
            CPU_OFF;

            devSrc = hostSrc;
            devDst.create(size, size, cur.depth);

            // GPU timing on the same data
            GPU_ON;
            gpu::threshold(devSrc, devDst, 50.0, 0.0, cur.mode);
            GPU_OFF;
        }
    }
 }
--- a/tests/gpu/src/brute_force_matcher.cpp
+++ b/tests/gpu/src/brute_force_matcher.cpp
@ -384,7 +384,7 @@ void CV_GpuBruteForceMatcherTest::knnMatchTest( const GpuMat& query, const GpuMa
 void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const GpuMat& train )
 {
-    bool atomics_ok = TargetArchs::builtWith(ATOMICS) && DeviceInfo().supports(ATOMICS);
+    bool atomics_ok = TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS);
    if (!atomics_ok)
    {
        ts->printf(CvTS::CONSOLE, "\nCode and device atomics support is required for radiusMatch (CC >= 1.1)");
--- a/tests/gpu/src/meanshift.cpp
+++ b/tests/gpu/src/meanshift.cpp
@ -53,7 +53,7 @@ struct CV_GpuMeanShiftTest : public CvTest
    void run(int)
    {
-        bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+        bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
        if (!cc12_ok)
        {
            ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
@ -67,8 +67,8 @@ struct CV_GpuMeanShiftTest : public CvTest
        cv::Mat img = cv::imread(std::string(ts->get_data_path()) + "meanshift/cones.png");
        cv::Mat img_template;       
-        if (cv::gpu::TargetArchs::builtWith(cv::gpu::COMPUTE_20) &&
+        if (cv::gpu::TargetArchs::builtWith(cv::gpu::FEATURE_SET_COMPUTE_20) &&
-            cv::gpu::DeviceInfo().supports(cv::gpu::COMPUTE_20))
+            cv::gpu::DeviceInfo().supports(cv::gpu::FEATURE_SET_COMPUTE_20))
            img_template = cv::imread(std::string(ts->get_data_path()) + "meanshift/con_result.png");
        else
            img_template = cv::imread(std::string(ts->get_data_path()) + "meanshift/con_result_CC1X.png");
@ -145,7 +145,7 @@ struct CV_GpuMeanShiftProcTest : public CvTest
    void run(int)
    {
-        bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+        bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
        if (!cc12_ok)
        {
            ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
@ -219,8 +219,8 @@ struct CV_GpuMeanShiftProcTest : public CvTest
            cv::Mat spmap_template;
            cv::FileStorage fs;
-            if (cv::gpu::TargetArchs::builtWith(cv::gpu::COMPUTE_20) &&
+            if (cv::gpu::TargetArchs::builtWith(cv::gpu::FEATURE_SET_COMPUTE_20) &&
-                cv::gpu::DeviceInfo().supports(cv::gpu::COMPUTE_20))
+                cv::gpu::DeviceInfo().supports(cv::gpu::FEATURE_SET_COMPUTE_20))
                fs.open(std::string(ts->get_data_path()) + "meanshift/spmap.yaml", cv::FileStorage::READ);
            else
                fs.open(std::string(ts->get_data_path()) + "meanshift/spmap_CC1X.yaml", cv::FileStorage::READ);
--- a/tests/gpu/src/mssegmentation.cpp
+++ b/tests/gpu/src/mssegmentation.cpp
@ -54,7 +54,7 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest {
    {
        try 
        {
-            bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+            bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
            if (!cc12_ok)
            {
                ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
@ -77,7 +77,7 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest {
            {
                stringstream path;
                path << ts->get_data_path() << "meanshift/cones_segmented_sp10_sr10_minsize" << minsize;
-                if (TargetArchs::builtWith(COMPUTE_20) && DeviceInfo().supports(COMPUTE_20))
+                if (TargetArchs::builtWith(FEATURE_SET_COMPUTE_20) && DeviceInfo().supports(FEATURE_SET_COMPUTE_20))
                    path << ".png";
                else
                    path << "_CC1X.png";
--- a/tests/gpu/src/operator_convert_to.cpp
+++ b/tests/gpu/src/operator_convert_to.cpp
@ -66,21 +66,24 @@ void CV_GpuMatOpConvertToTest::run(int /* start_from */)
 {
    const Size img_size(67, 35);
    const int types[] = {CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F};
    const int types_num = sizeof(types) / sizeof(int);
    const char* types_str[] = {"CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F"};
    bool passed = true;
    try
    {
-        for (int i = 0; i < types_num && passed; ++i)
+        int lastType = CV_32F;
        if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))
            lastType = CV_64F;
        for (int i = 0; i <= lastType && passed; ++i)
        {
-            for (int j = 0; j < types_num && passed; ++j)
+            for (int j = 0; j <= lastType && passed; ++j)
            {
                for (int c = 1; c < 5 && passed; ++c)
                {
-                    const int src_type = CV_MAKETYPE(types[i], c);
+                    const int src_type = CV_MAKETYPE(i, c);
-                    const int dst_type = types[j];
+                    const int dst_type = j;
                    cv::RNG rng(*ts->get_rng());
--- a/tests/gpu/src/operator_copy_to.cpp
+++ b/tests/gpu/src/operator_copy_to.cpp
@ -126,7 +126,12 @@ void CV_GpuMatOpCopyToTest::run( int /* start_from */)
    try
    {
-        for (int i = 0 ; i < 7; i++)
+        int lastType = CV_32F;
        if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))
            lastType = CV_64F;
        for (int i = 0 ; i <= lastType; i++)
        {
            Mat cpumat(rows, cols, i);
            cpumat.setTo(Scalar::all(127));
--- a/tests/gpu/src/operator_set_to.cpp
+++ b/tests/gpu/src/operator_set_to.cpp
@ -101,7 +101,12 @@ void CV_GpuMatOpSetToTest::run( int /* start_from */)
        rng.fill(cpumask, RNG::UNIFORM, cv::Scalar::all(0.0), cv::Scalar(1.5));
        cv::gpu::GpuMat gpumask(cpumask);
-        for (int i = 0; i < 7; i++)
+        int lastType = CV_32F;
        if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))
            lastType = CV_64F;
        for (int i = 0; i <= lastType; i++)
        {
            for (int cn = 1; cn <= 4; ++cn)
            {