update docs

minor fixes and refactoring of GPU module
2024-11-24 19:20:28 +08:00 · 2011-02-16 08:31:45 +00:00 · 2011-02-16 08:31:45 +00:00 · 54fa600b9e
commit 54fa600b9e
parent 7d42dbdd71
16 changed files with 944 additions and 901 deletions
--- a/doc/gpu_features2d.tex
+++ b/doc/gpu_features2d.tex
@ -1,7 +1,55 @@
 \section{Feature Detection and Description}


-\cvclass{gpu::SURF\_GPU}
+\cvclass{gpu::SURFParams\_GPU}\label{class.gpu.SURFParams}
+Various SURF algorithm parameters.
+
+\begin{lstlisting}
+struct SURFParams_GPU 
+{
+    SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4), 
+        initialScale(2.f), l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f), 
+        l4(1.f/1.5f), edgeScale(0.81f), initialStep(1), extended(true), 
+        featuresRatio(0.01f) {}
+
+    //! The interest operator threshold
+    float threshold;
+    //! The number of octaves to process
+    int nOctaves;
+    //! The number of intervals in each octave
+    int nIntervals;
+    //! The scale associated with the first interval of the first octave
+    float initialScale;
+
+    //! mask parameter l_1
+    float l1;
+    //! mask parameter l_2 
+    float l2;
+    //! mask parameter l_3
+    float l3;
+    //! mask parameter l_4
+    float l4;
+    //! The amount to scale the edge rejection mask
+    float edgeScale;
+    //! The initial sampling step in pixels.
+    int initialStep;
+
+    //! If true, 128-element descriptors are generated; if false, 64-element descriptors
+    bool extended;
+
+    //! max features = featuresRatio * img.size().area()
+    float featuresRatio;
+};
+\end{lstlisting}
+
+In contrast to \hyperref[cv.class.SURF]{cv::SURF}, \texttt{SURF\_GPU} works with float sources (in the range [0..1]). It performs the conversion after calculating the integral image, by dividing the result by 255. Please take this into consideration when changing some parameters (such as the Hessian threshold).
+
+The current \texttt{SURF\_GPU} implementation supports a number of intervals per octave in the range [3..21].
+
+See also: \hyperref[class.gpu.SURF]{cv::gpu::SURF\_GPU}.
+
+
+\cvclass{gpu::SURF\_GPU}\label{class.gpu.SURF}
 Class for extracting Speeded Up Robust Features from an image.

 \begin{lstlisting}
@ -62,7 +110,7 @@ The class \texttt{SURF\_GPU} can store results to GPU and CPU memory and provide

 The class \texttt{SURF\_GPU} uses some buffers and provides access to it. All buffers can be safely released between function calls. 

-See also: \hyperref[cv.class.SURF]{cv::SURF}.
+See also: \hyperref[cv.class.SURF]{cv::SURF}, \hyperref[class.gpu.SURFParams]{cv::gpu::SURFParams\_GPU}.


 \cvclass{gpu::BruteForceMatcher\_GPU}
@ -269,7 +317,7 @@ void radiusMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par
 void radiusMatch(const GpuMat\& queryDescs, \par std::vector< std::vector<DMatch> >\& matches, \par float maxDistance, \par const std::vector<GpuMat>\& masks = std::vector<GpuMat>(), \par bool compactResult = false);
 }

-This function works only on devices with Compute Capability $>=$ 1.1.
+\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.1.

 See also: \cvCppCross{DescriptorMatcher::radiusMatch}.

@ -293,7 +341,8 @@ void radiusMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par

 In contrast to \hyperref[cppfunc.gpu.BruteForceMatcher.radiusMatch]{cv::gpu::BruteForceMather\_GPU::radiusMatch} results are not sorted by distance increasing order.

-This function works only on devices with Compute Capability $>=$ 1.1.
+\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.1.
+

 \cvfunc{cv::gpu::BruteForceMatcher\_GPU::radiusMatchDownload}\label{cppfunc.gpu.BruteForceMatcher.radiusMatchDownload}
 Downloads \texttt{trainIdx}, \texttt{nMatches} and \texttt{distance} matrices obtained via \hyperref[cppfunc.gpu.BruteForceMatcher.radiusMatchSingle]{radiusMatch} to CPU vector with \hyperref[cv.class.DMatch]{cv::DMatch}. If \texttt{compactResult} is true \texttt{matches} vector will not contain matches for fully masked out query descriptors.
--- a/doc/gpu_image_processing.tex
+++ b/doc/gpu_image_processing.tex
@ -17,6 +17,8 @@ Performs mean-shift filtering for each point of the source image. It maps each p
 \cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
 \end{description}

+\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2.
+

 \cvCppFunc{gpu::meanShiftProc}
 Performs mean-shift procedure and stores information about processed points (i.e. their colors and positions) into two images. 
@ -35,6 +37,8 @@ Performs mean-shift procedure and stores information about processed points (i.e
 \cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
 \end{description}

+\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2.
+
 See also: \cvCppCross{gpu::meanShiftFiltering}.


@ -55,6 +59,8 @@ Performs mean-shift segmentation of the source image and eleminates small segmen
 \cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
 \end{description}

+\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2.
+

 \cvCppFunc{gpu::integral}
 Computes integral image and squared integral image.
@ -319,7 +325,7 @@ double threshold(const GpuMat\& src, GpuMat\& dst, double thresh, \par double ma
 }

 \begin{description}
-\cvarg{src}{Source array (single-channel, \texttt{CV\_64F} depth isn't supported).}
+\cvarg{src}{Source array (single-channel).}
 \cvarg{dst}{Destination array; will have the same size and the same type as \texttt{src}.}
 \cvarg{thresh}{Threshold value.}
 \cvarg{maxVal}{Maximum value to use with \texttt{THRESH\_BINARY} and \texttt{THRESH\_BINARY\_INV} thresholding types.}
--- a/doc/opencv.pdf
+++ b/doc/opencv.pdf
--- a/modules/gpu/src/cuda/brute_force_matcher.cu
+++ b/modules/gpu/src/cuda/brute_force_matcher.cu
@ -582,10 +582,10 @@ namespace cv { namespace gpu { namespace bfmatcher
    }
    
    ///////////////////////////////////////////////////////////////////////////////
-    // Match kernel chooser
+    // Match caller

    template <typename Dist, typename T, typename Train, typename Mask>
-    void match_chooser(const DevMem2D_<T>& queryDescs, const Train& train, 
+    void matchDispatcher(const DevMem2D_<T>& queryDescs, const Train& train, 
        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        bool cc_12)
    {
@ -616,11 +616,11 @@ namespace cv { namespace gpu { namespace bfmatcher
        if (mask.data)
        {
            SingleMask m(mask);
-            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
        }
        else
        {
-            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
        }
    }

@ -640,11 +640,11 @@ namespace cv { namespace gpu { namespace bfmatcher
        if (mask.data)
        {
            SingleMask m(mask);
-            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
        }
        else
        {
-            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
        }
    }

@ -664,11 +664,11 @@ namespace cv { namespace gpu { namespace bfmatcher
        if (maskCollection.data)
        {
            MaskCollection mask(maskCollection.data);
-            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
        }
        else
        {
-            match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
        }
    }

@ -688,11 +688,11 @@ namespace cv { namespace gpu { namespace bfmatcher
        if (maskCollection.data)
        {
            MaskCollection mask(maskCollection.data);
-            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
        }
        else
        {
-            match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
        }
    }

@ -942,22 +942,35 @@ namespace cv { namespace gpu { namespace bfmatcher
    ///////////////////////////////////////////////////////////////////////////////
    // knn match caller

+    template <typename Dist, typename T, typename Mask>
+    void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, 
+        const Mask& mask, const DevMem2Df& allDist)
+    {
+        calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist);
+    }
+
+    void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, 
+        const DevMem2Df& allDist)
+    {
+        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
+    }
+
    template <typename T>
    void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
    {
        if (mask.data)
        {
-            calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            calcDistanceDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                SingleMask(mask), allDist);
        }
        else
        {
-            calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            calcDistanceDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                WithOutMask(), allDist);
        }

-        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
+        findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
    }

    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
@ -973,16 +986,16 @@ namespace cv { namespace gpu { namespace bfmatcher
    {
        if (mask.data)
        {
-            calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                SingleMask(mask), allDist);
        }
        else
        {
-            calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                WithOutMask(), allDist);
        }

-        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
+        findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
    }

    template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
@ -1061,7 +1074,16 @@ namespace cv { namespace gpu { namespace bfmatcher
    }
    
    ///////////////////////////////////////////////////////////////////////////////
-    // Radius Match kernel chooser
+    // Radius Match caller
+
+    template <typename Dist, typename T, typename Mask>
+    void radiusMatchDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, 
+        float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, 
+        const DevMem2Df& distance)
+    {
+        radiusMatch_caller<16, 16, Dist>(queryDescs, trainDescs, maxDistance, mask, 
+            trainIdx, nMatches, distance);
+    }

    template <typename T>
    void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
@ -1069,12 +1091,12 @@ namespace cv { namespace gpu { namespace bfmatcher
    {
        if (mask.data)
        {
-            radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            radiusMatchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
        }
        else
        {
-            radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            radiusMatchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                maxDistance, WithOutMask(), trainIdx, nMatches, distance);
        }
    }
@ -1092,12 +1114,12 @@ namespace cv { namespace gpu { namespace bfmatcher
    {
        if (mask.data)
        {
-            radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
        }
        else
        {
-            radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
+            radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
                maxDistance, WithOutMask(), trainIdx, nMatches, distance);
        }
    }
--- a/modules/gpu/src/cuda/color.cu
+++ b/modules/gpu/src/cuda/color.cu
--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@ -190,6 +190,9 @@ void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src,

 void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
 {
+    CV_Assert((src.depth() != CV_64F) || 
+        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
    typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);
    static const set_caller_t set_callers[] =
    {
@ -201,6 +204,11 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)

 void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
 {
+    CV_Assert((src.depth() != CV_64F) || 
+        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
+    CV_Assert(mask.type() == CV_8UC1);
+
    typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream);
    static const set_caller_t set_callers[] =
    {
@ -212,6 +220,9 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)

 void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
 {
+    CV_Assert((src.depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) || 
+        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
    bool noScale = fabs(alpha-1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();

    if( rtype < 0 )
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@ -626,6 +626,10 @@ namespace

 void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) 
 {
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+    CV_Assert((src1.depth() != CV_64F) || 
+        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -637,6 +641,10 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)

 void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream) 
 { 
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+    CV_Assert((src1.depth() != CV_64F) || 
+        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -648,6 +656,9 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str

 void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst) 
 {
+    CV_Assert((src1.depth() != CV_64F) || 
+        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -659,6 +670,9 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)

 void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream) 
 {
+    CV_Assert((src1.depth() != CV_64F) || 
+        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -670,6 +684,10 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st

 void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst) 
 { 
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+    CV_Assert((src1.depth() != CV_64F) || 
+        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -681,6 +699,10 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)

 void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream) 
 { 
+    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+    CV_Assert((src1.depth() != CV_64F) || 
+        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -692,6 +714,9 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str

 void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst) 
 {
+    CV_Assert((src1.depth() != CV_64F) || 
+        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -703,6 +728,9 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)

 void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream) 
 {
+    CV_Assert((src1.depth() != CV_64F) || 
+        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] = 
    {
@ -749,6 +777,9 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
    }
    else
    {
+        CV_Assert((src.depth() != CV_64F) || 
+            (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
        typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, 
            cudaStream_t stream);

--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@ -205,6 +205,9 @@ namespace

 void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double beta ) const
 {
+    CV_Assert((depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) || 
+        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
    bool noScale = fabs(alpha-1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();

    if( rtype < 0 )
@ -428,6 +431,9 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
 {
    CV_Assert(mask.type() == CV_8UC1);

+    CV_Assert((depth() != CV_64F) || 
+        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+
    CV_DbgAssert(!this->empty());

    NppiSize sz;
--- a/modules/gpu/src/opencv2/gpu/device/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp
@ -393,11 +393,37 @@ namespace cv
            }
        };

+        template <typename T, typename D, int scn, int dcn> struct UseSmartUn_
+        {
+            static const bool value = false;
+        };
+        template <typename T, typename D> struct UseSmartUn_<T, D, 1, 1>
+        {
+            static const bool value = device::UnReadWriteTraits<T, D>::shift != 1;
+        };
+        template <typename T, typename D> struct UseSmartUn
+        {
+            static const bool value = UseSmartUn_<T, D, device::VecTraits<T>::cn, device::VecTraits<D>::cn>::value;
+        };
+
+        template <typename T1, typename T2, typename D, int src1cn, int src2cn, int dstcn> struct UseSmartBin_
+        {
+            static const bool value = false;
+        };
+        template <typename T1, typename T2, typename D> struct UseSmartBin_<T1, T2, D, 1, 1, 1>
+        {
+            static const bool value = device::BinReadWriteTraits<T1, T2, D>::shift != 1;
+        };
+        template <typename T1, typename T2, typename D> struct UseSmartBin
+        {
+            static const bool value = UseSmartBin_<T1, T2, D, device::VecTraits<T1>::cn, device::VecTraits<T2>::cn, device::VecTraits<D>::cn>::value;
+        };
+
        template <typename T, typename D, typename UnOp, typename Mask>
        static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, 
            cudaStream_t stream = 0)
        {
-            TransformDispatcher<device::VecTraits<T>::cn == 1 && device::VecTraits<D>::cn == 1 && device::UnReadWriteTraits<T, D>::shift != 1>::call(src, dst, op, mask, stream);
+            TransformDispatcher< UseSmartUn<T, D>::value >::call(src, dst, op, mask, stream);
        }

        template <typename T, typename D, typename UnOp>
@ -416,7 +442,7 @@ namespace cv
        static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, 
            BinOp op, const Mask& mask, cudaStream_t stream = 0)
        {
-            TransformDispatcher<device::VecTraits<T1>::cn == 1 && device::VecTraits<T2>::cn == 1 && device::VecTraits<D>::cn == 1 && device::BinReadWriteTraits<T1, T2, D>::shift != 1>::call(src1, src2, dst, op, mask, stream);
+            TransformDispatcher< UseSmartBin<T1, T2, D>::value >::call(src1, src2, dst, op, mask, stream);
        }

        template <typename T1, typename T2, typename D, typename BinOp>
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@ -681,3 +681,66 @@ TEST(erode)
        GPU_OFF;
    }
 }
+
+TEST(threshold)
+{
+    Mat src, dst;
+    gpu::GpuMat d_src, d_dst;
+
+    for (int size = 2000; size <= 4000; size += 1000)
+    {
+        SUBTEST << "size " << size << ", 8U, THRESH_TRUNC";
+
+        gen(src, size, size, CV_8U, 0, 100);
+        dst.create(size, size, CV_8U);
+
+        CPU_ON; 
+        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
+        CPU_OFF;
+
+        d_src = src;
+        d_dst.create(size, size, CV_8U);
+
+        GPU_ON;
+        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
+        GPU_OFF;
+    }
+
+    for (int size = 2000; size <= 4000; size += 1000)
+    {
+        SUBTEST << "size " << size << ", 8U, THRESH_BINARY";
+
+        gen(src, size, size, CV_8U, 0, 100);
+        dst.create(size, size, CV_8U);
+
+        CPU_ON; 
+        threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
+        CPU_OFF;
+
+        d_src = src;
+        d_dst.create(size, size, CV_8U);
+
+        GPU_ON;
+        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
+        GPU_OFF;
+    }
+
+    for (int size = 2000; size <= 4000; size += 1000)
+    {
+        SUBTEST << "size " << size << ", 32F, THRESH_TRUNC";
+
+        gen(src, size, size, CV_32F, 0, 100);
+        dst.create(size, size, CV_32F);
+
+        CPU_ON; 
+        threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
+        CPU_OFF;
+
+        d_src = src;
+        d_dst.create(size, size, CV_32F);
+
+        GPU_ON;
+        gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
+        GPU_OFF;
+    }
+}
--- a/tests/gpu/src/brute_force_matcher.cpp
+++ b/tests/gpu/src/brute_force_matcher.cpp
@ -384,7 +384,7 @@ void CV_GpuBruteForceMatcherTest::knnMatchTest( const GpuMat& query, const GpuMa

 void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const GpuMat& train )
 {
-    bool atomics_ok = TargetArchs::builtWith(ATOMICS) && DeviceInfo().supports(ATOMICS);
+    bool atomics_ok = TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS);
    if (!atomics_ok)
    {
        ts->printf(CvTS::CONSOLE, "\nCode and device atomics support is required for radiusMatch (CC >= 1.1)");
--- a/tests/gpu/src/meanshift.cpp
+++ b/tests/gpu/src/meanshift.cpp
@ -53,7 +53,7 @@ struct CV_GpuMeanShiftTest : public CvTest

    void run(int)
    {
-        bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+        bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
        if (!cc12_ok)
        {
            ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
@ -67,8 +67,8 @@ struct CV_GpuMeanShiftTest : public CvTest
        cv::Mat img = cv::imread(std::string(ts->get_data_path()) + "meanshift/cones.png");
        cv::Mat img_template;       
        
-        if (cv::gpu::TargetArchs::builtWith(cv::gpu::COMPUTE_20) &&
-            cv::gpu::DeviceInfo().supports(cv::gpu::COMPUTE_20))
+        if (cv::gpu::TargetArchs::builtWith(cv::gpu::FEATURE_SET_COMPUTE_20) &&
+            cv::gpu::DeviceInfo().supports(cv::gpu::FEATURE_SET_COMPUTE_20))
            img_template = cv::imread(std::string(ts->get_data_path()) + "meanshift/con_result.png");
        else
            img_template = cv::imread(std::string(ts->get_data_path()) + "meanshift/con_result_CC1X.png");
@ -145,7 +145,7 @@ struct CV_GpuMeanShiftProcTest : public CvTest

    void run(int)
    {
-        bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+        bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
        if (!cc12_ok)
        {
            ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
@ -219,8 +219,8 @@ struct CV_GpuMeanShiftProcTest : public CvTest
            cv::Mat spmap_template;
            cv::FileStorage fs;

-            if (cv::gpu::TargetArchs::builtWith(cv::gpu::COMPUTE_20) &&
-                cv::gpu::DeviceInfo().supports(cv::gpu::COMPUTE_20))
+            if (cv::gpu::TargetArchs::builtWith(cv::gpu::FEATURE_SET_COMPUTE_20) &&
+                cv::gpu::DeviceInfo().supports(cv::gpu::FEATURE_SET_COMPUTE_20))
                fs.open(std::string(ts->get_data_path()) + "meanshift/spmap.yaml", cv::FileStorage::READ);
            else
                fs.open(std::string(ts->get_data_path()) + "meanshift/spmap_CC1X.yaml", cv::FileStorage::READ);
--- a/tests/gpu/src/mssegmentation.cpp
+++ b/tests/gpu/src/mssegmentation.cpp
@ -54,7 +54,7 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest {
    {
        try 
        {
-            bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+            bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
            if (!cc12_ok)
            {
                ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
@ -77,7 +77,7 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest {
            {
                stringstream path;
                path << ts->get_data_path() << "meanshift/cones_segmented_sp10_sr10_minsize" << minsize;
-                if (TargetArchs::builtWith(COMPUTE_20) && DeviceInfo().supports(COMPUTE_20))
+                if (TargetArchs::builtWith(FEATURE_SET_COMPUTE_20) && DeviceInfo().supports(FEATURE_SET_COMPUTE_20))
                    path << ".png";
                else
                    path << "_CC1X.png";
--- a/tests/gpu/src/operator_convert_to.cpp
+++ b/tests/gpu/src/operator_convert_to.cpp
@ -66,21 +66,24 @@ void CV_GpuMatOpConvertToTest::run(int /* start_from */)
 {
    const Size img_size(67, 35);

-    const int types[] = {CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F};
-    const int types_num = sizeof(types) / sizeof(int);
    const char* types_str[] = {"CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F"};

    bool passed = true;
    try
    {
-        for (int i = 0; i < types_num && passed; ++i)
+        int lastType = CV_32F;
+
+        if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))
+            lastType = CV_64F;
+
+        for (int i = 0; i <= lastType && passed; ++i)
        {
-            for (int j = 0; j < types_num && passed; ++j)
+            for (int j = 0; j <= lastType && passed; ++j)
            {
                for (int c = 1; c < 5 && passed; ++c)
                {
-                    const int src_type = CV_MAKETYPE(types[i], c);
-                    const int dst_type = types[j];
+                    const int src_type = CV_MAKETYPE(i, c);
+                    const int dst_type = j;

                    cv::RNG rng(*ts->get_rng());

--- a/tests/gpu/src/operator_copy_to.cpp
+++ b/tests/gpu/src/operator_copy_to.cpp
@ -126,7 +126,12 @@ void CV_GpuMatOpCopyToTest::run( int /* start_from */)

    try
    {
-        for (int i = 0 ; i < 7; i++)
+        int lastType = CV_32F;
+
+        if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))
+            lastType = CV_64F;
+
+        for (int i = 0 ; i <= lastType; i++)
        {
            Mat cpumat(rows, cols, i);
            cpumat.setTo(Scalar::all(127));
--- a/tests/gpu/src/operator_set_to.cpp
+++ b/tests/gpu/src/operator_set_to.cpp
@ -101,7 +101,12 @@ void CV_GpuMatOpSetToTest::run( int /* start_from */)
        rng.fill(cpumask, RNG::UNIFORM, cv::Scalar::all(0.0), cv::Scalar(1.5));
        cv::gpu::GpuMat gpumask(cpumask);

-        for (int i = 0; i < 7; i++)
+        int lastType = CV_32F;
+
+        if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))
+            lastType = CV_64F;
+
+        for (int i = 0; i <= lastType; i++)
        {
            for (int cn = 1; cn <= 4; ++cn)
            {