Refactored and fixed some GPU tests

Fixed some bugs in the GPU module (gemm output type now follows src1; pow now supports CV_64F)
2025-08-05 22:19:14 +08:00 · 2012-03-20 12:03:34 +00:00 · 2012-03-20 12:03:34 +00:00 · 509c910101
commit 509c910101
parent a659832df1
9 changed files with 1225 additions and 1263 deletions
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@ -91,6 +91,12 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
    bool tr2 = (flags & GEMM_2_T) != 0;
    bool tr3 = (flags & GEMM_3_T) != 0;

+    if (src1.type() == CV_64FC2)
+    {
+        if (tr1 || tr2 || tr3)
+            CV_Error(CV_StsNotImplemented, "transpose operation doesn't implemented for CV_64FC2 type");
+    }
+
    Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
    Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
    Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
@ -99,7 +105,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
    CV_Assert(src1Size.width == src2Size.height);
    CV_Assert(src3.empty() || src3Size == dstSize);

-    dst.create(dstSize, CV_32FC1);
+    dst.create(dstSize, src1.type());

    if (beta != 0)
    {
@ -149,7 +155,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
    {
    case CV_32FC1:
        cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alphaf, 
+            &alphaf,
            src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)),
            src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)),
            &betaf,
@ -158,7 +164,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G

    case CV_64FC1:
        cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alpha, 
+            &alpha,
            src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)),
            src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)),
            &beta,
@ -167,7 +173,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G

    case CV_32FC2:
        cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alphacf, 
+            &alphacf,
            src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)),
            src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)),
            &betacf,
@ -176,7 +182,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G

    case CV_64FC2:
        cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alphac, 
+            &alphac,
            src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)),
            src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)),
            &betac,
@ -208,8 +214,8 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
        sz.width  = src.cols;
        sz.height = src.rows;

-        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), 
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );		
+        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
    }
    else if (src.elemSize() == 4)
    {
@ -219,7 +225,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
        sz.width  = src.cols;
        sz.height = src.rows;

-        ncvSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step), 
+        ncvSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step),
            dst.ptr<Ncv32u>(), static_cast<int>(dst.step), sz) );
    }
    else // if (src.elemSize() == 8)
@ -230,8 +236,8 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
        sz.width  = src.cols;
        sz.height = src.rows;

-        ncvSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step), 
-            dst.ptr<Ncv64u>(), static_cast<int>(dst.step), sz) );		
+        ncvSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step),
+            dst.ptr<Ncv64u>(), static_cast<int>(dst.step), sz) );
    }

    if (stream == 0)
@ -285,7 +291,7 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
 {
    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);

-    static const func_t funcs[6][4] = 
+    static const func_t funcs[6][4] =
    {
        {NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
        {0,0,0,0},
@ -345,7 +351,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)

    if (src.type() == CV_8UC1)
    {
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), 
+        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
    }
    else
@ -361,7 +367,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
            pValues3[1] = nppLut3[1].ptr<Npp32s>();
            pValues3[2] = nppLut3[2].ptr<Npp32s>();
        }
-        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), 
+        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, lvls.pLevels3, lvls.nValues3) );
    }

@ -408,9 +414,9 @@ void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
 ////////////////////////////////////////////////////////////////////////
 // Polar <-> Cart

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace mathfunc 
+    namespace mathfunc
    {
        void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
        void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@ -1672,40 +1672,53 @@ namespace cv { namespace gpu { namespace device

    template<typename T, bool Signed = device::numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
    {
-        float power;
-        PowOp(float power_) : power(power_) {}
+        const float power;

-        __device__ __forceinline__ T operator()(const T& e) const
+        PowOp(double power_) : power(static_cast<float>(power_)) {}
+
+        __device__ __forceinline__ T operator()(T e) const
        {
            return saturate_cast<T>(__powf((float)e, power));
        }
    };
-
    template<typename T> struct PowOp<T, true> : unary_function<T, T>
    {
-        float power;
-        PowOp(float power_) : power(power_) {}
+        const float power;

-        __device__ __forceinline__ float operator()(const T& e) const
+        PowOp(double power_) : power(static_cast<float>(power_)) {}
+
+        __device__ __forceinline__ T operator()(T e) const
        {
            T res = saturate_cast<T>(__powf((float)e, power));

-            if ( (e < 0) && (1 & (int)power) )
-                    res *= -1;
+            if ((e < 0) && (1 & static_cast<int>(power)))
+                res *= -1;
+
            return res;
        }
    };
-
    template<> struct PowOp<float> : unary_function<float, float>
    {
-        float power;
-        PowOp(float power_) : power(power_) {}
+        const float power;

-        __device__ __forceinline__ float operator()(const float& e) const
+        PowOp(double power_) : power(static_cast<float>(power_)) {}
+
+        __device__ __forceinline__ float operator()(float e) const
        {
            return __powf(::fabs(e), power);
        }
    };
+    template<> struct PowOp<double> : unary_function<double, double>
+    {
+        const double power;
+
+        PowOp(double power_) : power(power_) {}
+
+        __device__ __forceinline__ double operator()(double e) const
+        {
+            return ::pow(::fabs(e), power);
+        }
+    };

    namespace detail
    {
@ -1733,17 +1746,18 @@ namespace cv { namespace gpu { namespace device
    };

    template<typename T>
-    void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream)
+    void pow_caller(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream)
    {
        cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, PowOp<T>(power), WithOutMask(), stream);
    }

-    template void pow_caller<uchar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
-    template void pow_caller<schar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
-    template void pow_caller<short>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
-    template void pow_caller<ushort>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
-    template void pow_caller<int>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
-    template void pow_caller<float>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<uchar>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<schar>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<short>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<ushort>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<int>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<float>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
+    template void pow_caller<double>(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);

    //////////////////////////////////////////////////////////////////////////
    // addWeighted
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@ -1301,50 +1301,26 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
    };

    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+    CV_Assert(cmpop >= CMP_EQ && cmpop <= CMP_NE);

-    int code;
-    const GpuMat* psrc1;
-    const GpuMat* psrc2;
-
-    switch (cmpop)
+    static const int codes[] =
    {
-    case CMP_EQ:
-        code = 0;
-        psrc1 = &src1;
-        psrc2 = &src2;
-        break;
-    case CMP_GE:
-        code = 3;
-        psrc1 = &src2;
-        psrc2 = &src1;
-        break;
-    case CMP_GT:
-        code = 2;
-        psrc1 = &src2;
-        psrc2 = &src1;
-        break;
-    case CMP_LE:
-        code = 3;
-        psrc1 = &src1;
-        psrc2 = &src2;
-        break;
-    case CMP_LT:
-        code = 2;
-        psrc1 = &src1;
-        psrc2 = &src2;
-        break;
-    case CMP_NE:
-        code = 1;
-        psrc1 = &src1;
-        psrc2 = &src2;
-        break;
-    default:
-        CV_Error(CV_StsBadFlag, "Incorrect compare operation");
+        0, 2, 3, 2, 3, 1
+    };
+
+    const GpuMat* psrc1[] =
+    {
+        &src1, &src2, &src2, &src1, &src1, &src1
+    };
+
+    const GpuMat* psrc2[] =
+    {
+        &src2, &src1, &src1, &src2, &src2, &src2
    };

    dst.create(src1.size(), CV_MAKE_TYPE(CV_8U, src1.channels()));

-    funcs[src1.depth()][code](psrc1->reshape(1), psrc2->reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+    funcs[src1.depth()][codes[cmpop]](psrc1[cmpop]->reshape(1), psrc2[cmpop]->reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
 }


@ -1944,26 +1920,25 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
 namespace cv { namespace gpu { namespace device
 {
    template<typename T>
-    void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
+    void pow_caller(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);
 }}}

 void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
 {
-    using namespace ::cv::gpu::device;
+    using namespace cv::gpu::device;

-    CV_Assert(src.depth() != CV_64F);
-    dst.create(src.size(), src.type());
+    typedef void (*func_t)(DevMem2Db src, double power, DevMem2Db dst, cudaStream_t stream);

-    typedef void (*caller_t)(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
-
-    static const caller_t callers[] =
+    static const func_t funcs[] =
    {
        pow_caller<unsigned char>,  pow_caller<signed char>,
        pow_caller<unsigned short>, pow_caller<short>,
-        pow_caller<int>, pow_caller<float>
+        pow_caller<int>, pow_caller<float>, pow_caller<double>
    };

-    callers[src.depth()](src.reshape(1), (float)power, dst.reshape(1), StreamAccessor::getStream(stream));
+    dst.create(src.size(), src.type());
+
+    funcs[src.depth()](src.reshape(1), power, dst.reshape(1), StreamAccessor::getStream(stream));
 }

 ////////////////////////////////////////////////////////////////////////
@ -2052,27 +2027,11 @@ namespace cv { namespace gpu { namespace device

 void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype, Stream& stream)
 {
-    using namespace ::cv::gpu::device;
+    using namespace cv::gpu::device;

-    CV_Assert(src1.size() == src2.size());
-    CV_Assert(src1.type() == src2.type() || (dtype >= 0 && src1.channels() == src2.channels()));
+    typedef void (*func_t)(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);

-    dtype = dtype >= 0 ? CV_MAKETYPE(dtype, src1.channels()) : src1.type();
-
-    dst.create(src1.size(), dtype);
-
-    const GpuMat* psrc1 = &src1;
-    const GpuMat* psrc2 = &src2;
-
-    if (src1.depth() > src2.depth())
-    {
-        std::swap(psrc1, psrc2);
-        std::swap(alpha, beta);
-    }
-
-    typedef void (*caller_t)(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);
-
-    static const caller_t callers[7][7][7] =
+    static const func_t funcs[7][7][7] =
    {
        {
            {
@ -2531,7 +2490,26 @@ void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2,
        }
    };

-    callers[psrc1->depth()][psrc2->depth()][dst.depth()](psrc1->reshape(1), alpha, psrc2->reshape(1), beta, gamma, dst.reshape(1), StreamAccessor::getStream(stream));
+    CV_Assert(src1.size() == src2.size());
+    CV_Assert(src1.type() == src2.type() || (dtype >= 0 && src1.channels() == src2.channels()));
+
+    dtype = dtype >= 0 ? CV_MAKETYPE(dtype, src1.channels()) : src1.type();
+
+    dst.create(src1.size(), dtype);
+
+    const GpuMat* psrc1 = &src1;
+    const GpuMat* psrc2 = &src2;
+
+    if (src1.depth() > src2.depth())
+    {
+        std::swap(psrc1, psrc2);
+        std::swap(alpha, beta);
+    }
+
+    const func_t func = funcs[psrc1->depth()][psrc2->depth()][dst.depth()];
+    CV_Assert(func != 0);
+
+    func(psrc1->reshape(1), alpha, psrc2->reshape(1), beta, gamma, dst.reshape(1), StreamAccessor::getStream(stream));
 }

 #endif
--- a/modules/gpu/test/main.cpp
+++ b/modules/gpu/test/main.cpp
@ -52,7 +52,7 @@ using namespace cvtest;
 using namespace testing;

 void print_info()
-{    
+{
    printf("\n");
 #if defined _WIN32
 #   if defined _WIN64
@ -78,9 +78,9 @@ void print_info()
    int driver;
    cudaDriverGetVersion(&driver);

-    printf("CUDA Driver  version: %d\n", driver);        
-    printf("CUDA Runtime version: %d\n", CUDART_VERSION);    
-    printf("CUDA device count: %d\n\n", deviceCount);    
+    printf("CUDA Driver  version: %d\n", driver);
+    printf("CUDA Runtime version: %d\n", CUDART_VERSION);
+    printf("CUDA device count: %d\n\n", deviceCount);

    for (int i = 0; i < deviceCount; ++i)
    {
@ -96,7 +96,7 @@ void print_info()
        else
            puts("    This device is NOT compatible with current GPU module build\n");
    }
-    
+
    puts("GPU module was compiled for the following GPU archs:");
    printf("    BIN: %s\n", CUDA_ARCH_BIN);
    printf("    PTX: %s\n\n", CUDA_ARCH_PTX);
--- a/modules/gpu/test/test_arithm.cpp
+++ b/modules/gpu/test/test_arithm.cpp
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@ -2362,53 +2362,6 @@ TEST_P(ColumnSum, Accuracy)

 INSTANTIATE_TEST_CASE_P(ImgProc, ColumnSum, ALL_DEVICES);

-////////////////////////////////////////////////////////////////////////
-// Norm
-
-PARAM_TEST_CASE(Norm, cv::gpu::DeviceInfo, MatType, NormCode, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    int type;
-    int normType;
-    bool useRoi;
-
-    cv::Size size;
-    cv::Mat src;
-
-    double gold;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        type = GET_PARAM(1);
-        normType = GET_PARAM(2);
-        useRoi = GET_PARAM(3);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-
-        cv::RNG& rng = TS::ptr()->get_rng();
-
-        size = cv::Size(rng.uniform(100, 400), rng.uniform(100, 400));
-
-        src = randomMat(rng, size, type, 0.0, 10.0, false);
-
-        gold = cv::norm(src, normType);
-    }
-};
-
-TEST_P(Norm, Accuracy)
-{
-    double res = cv::gpu::norm(loadMat(src, useRoi), normType);
-
-    ASSERT_NEAR(res, gold, 0.5);
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Norm, Combine(
-                        ALL_DEVICES,
-                        TYPES(CV_8U, CV_32F, 1, 1),
-                        Values((int) cv::NORM_INF, (int) cv::NORM_L1, (int) cv::NORM_L2),
-                        WHOLE_SUBMAT));
-
 ////////////////////////////////////////////////////////////////////////////////
 // reprojectImageTo3D

--- a/modules/gpu/test/test_threshold.cpp
+++ b/modules/gpu/test/test_threshold.cpp
@ -82,7 +82,7 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Threshold, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    testing::Values(MatType(CV_8UC1), MatType(CV_16SC1), MatType(CV_32FC1)),
-    testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV)),
+    ALL_THRESH_OPS,
    WHOLE_SUBMAT));

 #endif // HAVE_CUDA
--- a/modules/gpu/test/utility.cpp
+++ b/modules/gpu/test/utility.cpp
@ -45,6 +45,7 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 using namespace cvtest;
+using namespace testing;

 int randomInt(int minVal, int maxVal)
 {
@ -82,9 +83,9 @@ cv::gpu::GpuMat createMat(cv::Size size, int type, bool useRoi)
        size0.width += randomInt(5, 15);
        size0.height += randomInt(5, 15);
    }
-        
+
    GpuMat d_m(size0, type);
-    
+
    if (size0 != size)
        d_m = d_m(Rect((size0.width - size.width) / 2, (size0.height - size.height) / 2, size.width, size.height));

@ -98,35 +99,6 @@ GpuMat loadMat(const Mat& m, bool useRoi)
    return d_m;
 }

-void showDiff(InputArray gold_, InputArray actual_, double eps)
-{
-    Mat gold;
-    if (gold_.kind() == _InputArray::MAT)
-        gold = gold_.getMat();
-    else
-        gold_.getGpuMat().download(gold);
-
-    Mat actual;
-    if (actual_.kind() == _InputArray::MAT)
-        actual = actual_.getMat();
-    else
-        actual_.getGpuMat().download(actual);
-
-    Mat diff;
-    absdiff(gold, actual, diff);
-    threshold(diff, diff, eps, 255.0, cv::THRESH_BINARY);
-
-    namedWindow("gold", WINDOW_NORMAL);
-    namedWindow("actual", WINDOW_NORMAL);
-    namedWindow("diff", WINDOW_NORMAL);
-
-    imshow("gold", gold);
-    imshow("actual", actual);
-    imshow("diff", diff);
-
-    waitKey();
-}
-
 bool supportFeature(const DeviceInfo& info, FeatureSet feature)
 {
    return TargetArchs::builtWith(feature) && info.supports(feature);
@ -159,7 +131,7 @@ const vector<DeviceInfo>& devices()
 vector<DeviceInfo> devices(FeatureSet feature)
 {
    const vector<DeviceInfo>& d = devices();
-    
+
    vector<DeviceInfo> devs_filtered;

    if (TargetArchs::builtWith(feature))
@ -220,20 +192,50 @@ Mat readImageType(const string& fname, int type)
    return src;
 }

-double checkNorm(const Mat& m)
+namespace
 {
-    return norm(m, NORM_INF);
+    Mat getMat(InputArray arr)
+    {
+        if (arr.kind() == _InputArray::GPU_MAT)
+        {
+            Mat m;
+            arr.getGpuMat().download(m);
+            return m;
+        }
+
+        return arr.getMat();
+    }
 }

-double checkNorm(const Mat& m1, const Mat& m2)
+void showDiff(InputArray gold_, InputArray actual_, double eps)
 {
-    return norm(m1, m2, NORM_INF);
+    Mat gold = getMat(gold_);
+    Mat actual = getMat(actual_);
+
+    Mat diff;
+    absdiff(gold, actual, diff);
+    threshold(diff, diff, eps, 255.0, cv::THRESH_BINARY);
+
+    namedWindow("gold", WINDOW_NORMAL);
+    namedWindow("actual", WINDOW_NORMAL);
+    namedWindow("diff", WINDOW_NORMAL);
+
+    imshow("gold", gold);
+    imshow("actual", actual);
+    imshow("diff", diff);
+
+    waitKey();
 }

-double checkSimilarity(const Mat& m1, const Mat& m2)
+double checkNorm(InputArray m1, const InputArray m2)
+{
+    return norm(getMat(m1), getMat(m2), NORM_INF);
+}
+
+double checkSimilarity(InputArray m1, InputArray m2)
 {
    Mat diff;
-    matchTemplate(m1, m2, diff, CV_TM_CCORR_NORMED);
+    matchTemplate(getMat(m1), getMat(m2), diff, CV_TM_CCORR_NORMED);
    return std::abs(diff.at<float>(0, 0) - 1.f);
 }

--- a/modules/gpu/test/utility.hpp
+++ b/modules/gpu/test/utility.hpp
@ -65,27 +65,30 @@ std::vector<cv::gpu::DeviceInfo> devices(cv::gpu::FeatureSet feature);
 cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
 cv::Mat readImageType(const std::string& fname, int type);

-double checkNorm(const cv::Mat& m);
-double checkNorm(const cv::Mat& m1, const cv::Mat& m2);
-double checkSimilarity(const cv::Mat& m1, const cv::Mat& m2);
-
-#define EXPECT_MAT_NORM(mat, eps) \
-    { \
-        EXPECT_LE(checkNorm(cv::Mat(mat)), eps) \
-    }
+double checkNorm(cv::InputArray m1, cv::InputArray m2);

 #define EXPECT_MAT_NEAR(mat1, mat2, eps) \
    { \
        ASSERT_EQ(mat1.type(), mat2.type()); \
        ASSERT_EQ(mat1.size(), mat2.size()); \
-        EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps); \
+        EXPECT_LE(checkNorm(mat1, mat2), eps); \
    }

+#define EXPECT_SCALAR_NEAR(s1, s2, eps) \
+    { \
+        EXPECT_NEAR(s1[0], s2[0], eps); \
+        EXPECT_NEAR(s1[1], s2[1], eps); \
+        EXPECT_NEAR(s1[2], s2[2], eps); \
+        EXPECT_NEAR(s1[3], s2[3], eps); \
+    }
+
+double checkSimilarity(cv::InputArray m1, cv::InputArray m2);
+
 #define EXPECT_MAT_SIMILAR(mat1, mat2, eps) \
    { \
        ASSERT_EQ(mat1.type(), mat2.type()); \
        ASSERT_EQ(mat1.size(), mat2.size()); \
-        EXPECT_LE(checkSimilarity(cv::Mat(mat1), cv::Mat(mat2)), eps); \
+        EXPECT_LE(checkSimilarity(mat1, mat2), eps); \
    }

 namespace cv { namespace gpu
@ -112,8 +115,10 @@ public:
 private:
    bool val_;
 };
-
 void PrintTo(const UseRoi& useRoi, std::ostream* os);
+#define WHOLE testing::Values(UseRoi(false))
+#define SUBMAT testing::Values(UseRoi(true))
+#define WHOLE_SUBMAT testing::Values(UseRoi(false), UseRoi(true))

 class Inverse
 {
@ -125,25 +130,30 @@ public:
 private:
    bool val_;
 };
-
 void PrintTo(const Inverse& useRoi, std::ostream* os);
+#define DIRECT_INVERSE testing::Values(Inverse(false), Inverse(true))

 CV_ENUM(CmpCode, cv::CMP_EQ, cv::CMP_GT, cv::CMP_GE, cv::CMP_LT, cv::CMP_LE, cv::CMP_NE)
+#define ALL_CMP_CODES testing::Values(CmpCode(cv::CMP_EQ), CmpCode(cv::CMP_NE), CmpCode(cv::CMP_GT), CmpCode(cv::CMP_GE), CmpCode(cv::CMP_LT), CmpCode(cv::CMP_LE))

 CV_ENUM(NormCode, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_TYPE_MASK, cv::NORM_RELATIVE, cv::NORM_MINMAX)

 enum {FLIP_BOTH = 0, FLIP_X = 1, FLIP_Y = -1};
 CV_ENUM(FlipCode, FLIP_BOTH, FLIP_X, FLIP_Y)
+#define ALL_FLIP_CODES testing::Values(FlipCode(FLIP_BOTH), FlipCode(FLIP_X), FlipCode(FLIP_Y))

-CV_ENUM(ReduceOp, CV_REDUCE_SUM, CV_REDUCE_AVG, CV_REDUCE_MAX, CV_REDUCE_MIN)
+CV_ENUM(ReduceCode, CV_REDUCE_SUM, CV_REDUCE_AVG, CV_REDUCE_MAX, CV_REDUCE_MIN)
+#define ALL_REDUCE_CODES testing::Values(ReduceCode(CV_REDUCE_SUM), ReduceCode(CV_REDUCE_AVG), ReduceCode(CV_REDUCE_MAX), ReduceCode(CV_REDUCE_MIN))

-CV_FLAGS(GemmFlags, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_3_T);
+CV_FLAGS(GemmFlags, 0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_3_T);
+#define ALL_GEMM_FLAGS testing::Values(GemmFlags(0), GemmFlags(cv::GEMM_1_T), GemmFlags(cv::GEMM_2_T), GemmFlags(cv::GEMM_3_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_3_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T | cv::GEMM_3_T))

 CV_ENUM(DistType, cv::gpu::BruteForceMatcher_GPU_base::L1Dist, cv::gpu::BruteForceMatcher_GPU_base::L2Dist)

 CV_ENUM(MorphOp, cv::MORPH_OPEN, cv::MORPH_CLOSE, cv::MORPH_GRADIENT, cv::MORPH_TOPHAT, cv::MORPH_BLACKHAT)

 CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
+#define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))

 CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC)

@ -194,12 +204,4 @@ CV_FLAGS(DftFlags, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX
                                                                                        \
                                    std::make_pair(MatDepth(CV_64F), MatDepth(CV_64F)))

-#define WHOLE testing::Values(UseRoi(false))
-#define SUBMAT testing::Values(UseRoi(true))
-#define WHOLE_SUBMAT testing::Values(UseRoi(false), UseRoi(true))
-
-#define DIRECT_INVERSE testing::Values(Inverse(false), Inverse(true))
-
-#define ALL_CMP_CODES testing::Values(CmpCode(cv::CMP_EQ), CmpCode(cv::CMP_NE), CmpCode(cv::CMP_GT), CmpCode(cv::CMP_GE), CmpCode(cv::CMP_LT), CmpCode(cv::CMP_LE))
-
 #endif // __OPENCV_TEST_UTILITY_HPP__