Merge pull request #943 from jet47:cuda-5.5-support

Roman Donchenko 2013-06-03 16:08:23 +04:00 committed by OpenCV Buildbot
commit 75cf5cc4ee
10 changed files with 339 additions and 296 deletions

@ -26,6 +26,15 @@ if(CUDA_FOUND)
set(HAVE_CUBLAS 1)
endif()
if(${CUDA_VERSION} VERSION_LESS "5.5")
find_cuda_helper_libs(npp)
else()
find_cuda_helper_libs(nppc)
find_cuda_helper_libs(nppi)
find_cuda_helper_libs(npps)
set(CUDA_npp_LIBRARY ${CUDA_nppc_LIBRARY} ${CUDA_nppi_LIBRARY} ${CUDA_npps_LIBRARY})
endif()
if(WITH_NVCUVID)
find_cuda_helper_libs(nvcuvid)
set(HAVE_NVCUVID 1)
@ -136,8 +145,6 @@ if(CUDA_FOUND)
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR)
find_cuda_helper_libs(npp)
macro(ocv_cuda_compile VAR)
foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
set(${var}_backup_in_cuda_compile_ "${${var}}")

@ -120,11 +120,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ RGB2RGB()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ RGB2RGB(const RGB2RGB& other_)
:unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ RGB2RGB() {}
__host__ __device__ __forceinline__ RGB2RGB(const RGB2RGB&) {}
};
template <> struct RGB2RGB<uchar, 4, 4, 2> : unary_function<uint, uint>
@ -141,8 +138,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ RGB2RGB():unary_function<uint, uint>(){}
__device__ __forceinline__ RGB2RGB(const RGB2RGB& other_):unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ RGB2RGB() {}
__host__ __device__ __forceinline__ RGB2RGB(const RGB2RGB&) {}
};
}
@ -203,8 +200,8 @@ namespace cv { namespace gpu { namespace device
return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);
}
__device__ __forceinline__ RGB2RGB5x5():unary_function<uchar3, ushort>(){}
__device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5& other_):unary_function<uchar3, ushort>(){}
__host__ __device__ __forceinline__ RGB2RGB5x5() {}
__host__ __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5&) {}
};
template<int bidx, int green_bits> struct RGB2RGB5x5<4, bidx,green_bits> : unary_function<uint, ushort>
@ -214,8 +211,8 @@ namespace cv { namespace gpu { namespace device
return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);
}
__device__ __forceinline__ RGB2RGB5x5():unary_function<uint, ushort>(){}
__device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5& other_):unary_function<uint, ushort>(){}
__host__ __device__ __forceinline__ RGB2RGB5x5() {}
__host__ __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5&) {}
};
}
@ -282,8 +279,8 @@ namespace cv { namespace gpu { namespace device
RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);
return dst;
}
__device__ __forceinline__ RGB5x52RGB():unary_function<ushort, uchar3>(){}
__device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB& other_):unary_function<ushort, uchar3>(){}
__host__ __device__ __forceinline__ RGB5x52RGB() {}
__host__ __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB&) {}
};
@ -295,8 +292,8 @@ namespace cv { namespace gpu { namespace device
RGB5x52RGBConverter<green_bits, bidx>::cvt(src, dst);
return dst;
}
__device__ __forceinline__ RGB5x52RGB():unary_function<ushort, uint>(){}
__device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB& other_):unary_function<ushort, uint>(){}
__host__ __device__ __forceinline__ RGB5x52RGB() {}
__host__ __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB&) {}
};
}
@ -325,9 +322,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ Gray2RGB():unary_function<T, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ Gray2RGB(const Gray2RGB& other_)
: unary_function<T, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ Gray2RGB() {}
__host__ __device__ __forceinline__ Gray2RGB(const Gray2RGB&) {}
};
template <> struct Gray2RGB<uchar, 4> : unary_function<uchar, uint>
@ -342,8 +338,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ Gray2RGB():unary_function<uchar, uint>(){}
__device__ __forceinline__ Gray2RGB(const Gray2RGB& other_):unary_function<uchar, uint>(){}
__host__ __device__ __forceinline__ Gray2RGB() {}
__host__ __device__ __forceinline__ Gray2RGB(const Gray2RGB&) {}
};
}
@ -384,8 +380,8 @@ namespace cv { namespace gpu { namespace device
return Gray2RGB5x5Converter<green_bits>::cvt(src);
}
__device__ __forceinline__ Gray2RGB5x5():unary_function<uchar, ushort>(){}
__device__ __forceinline__ Gray2RGB5x5(const Gray2RGB5x5& other_):unary_function<uchar, ushort>(){}
__host__ __device__ __forceinline__ Gray2RGB5x5() {}
__host__ __device__ __forceinline__ Gray2RGB5x5(const Gray2RGB5x5&) {}
};
}
@ -426,8 +422,8 @@ namespace cv { namespace gpu { namespace device
{
return RGB5x52GrayConverter<green_bits>::cvt(src);
}
__device__ __forceinline__ RGB5x52Gray() : unary_function<ushort, uchar>(){}
__device__ __forceinline__ RGB5x52Gray(const RGB5x52Gray& other_) : unary_function<ushort, uchar>(){}
__host__ __device__ __forceinline__ RGB5x52Gray() {}
__host__ __device__ __forceinline__ RGB5x52Gray(const RGB5x52Gray&) {}
};
}
@ -467,9 +463,8 @@ namespace cv { namespace gpu { namespace device
{
return RGB2GrayConvert<bidx>(&src.x);
}
__device__ __forceinline__ RGB2Gray() : unary_function<typename TypeVec<T, scn>::vec_type, T>(){}
__device__ __forceinline__ RGB2Gray(const RGB2Gray& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, T>(){}
__host__ __device__ __forceinline__ RGB2Gray() {}
__host__ __device__ __forceinline__ RGB2Gray(const RGB2Gray&) {}
};
template <int bidx> struct RGB2Gray<uchar, 4, bidx> : unary_function<uint, uchar>
@ -478,8 +473,8 @@ namespace cv { namespace gpu { namespace device
{
return RGB2GrayConvert<bidx>(src);
}
__device__ __forceinline__ RGB2Gray() : unary_function<uint, uchar>(){}
__device__ __forceinline__ RGB2Gray(const RGB2Gray& other_) : unary_function<uint, uchar>(){}
__host__ __device__ __forceinline__ RGB2Gray() {}
__host__ __device__ __forceinline__ RGB2Gray(const RGB2Gray&) {}
};
}
@ -529,10 +524,8 @@ namespace cv { namespace gpu { namespace device
RGB2YUVConvert<bidx>(&src.x, dst);
return dst;
}
__device__ __forceinline__ RGB2YUV()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ RGB2YUV(const RGB2YUV& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ RGB2YUV() {}
__host__ __device__ __forceinline__ RGB2YUV(const RGB2YUV&) {}
};
}
@ -609,10 +602,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ YUV2RGB()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ YUV2RGB(const YUV2RGB& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ YUV2RGB() {}
__host__ __device__ __forceinline__ YUV2RGB(const YUV2RGB&) {}
};
template <int bidx> struct YUV2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
@ -621,8 +612,8 @@ namespace cv { namespace gpu { namespace device
{
return YUV2RGBConvert<bidx>(src);
}
__device__ __forceinline__ YUV2RGB() : unary_function<uint, uint>(){}
__device__ __forceinline__ YUV2RGB(const YUV2RGB& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ YUV2RGB() {}
__host__ __device__ __forceinline__ YUV2RGB(const YUV2RGB&) {}
};
}
@ -689,10 +680,8 @@ namespace cv { namespace gpu { namespace device
RGB2YCrCbConvert<bidx>(&src.x, dst);
return dst;
}
__device__ __forceinline__ RGB2YCrCb()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ RGB2YCrCb() {}
__host__ __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb&) {}
};
template <int bidx> struct RGB2YCrCb<uchar, 4, 4, bidx> : unary_function<uint, uint>
@ -702,8 +691,8 @@ namespace cv { namespace gpu { namespace device
return RGB2YCrCbConvert<bidx>(src);
}
__device__ __forceinline__ RGB2YCrCb() : unary_function<uint, uint>(){}
__device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ RGB2YCrCb() {}
__host__ __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb&) {}
};
}
@ -771,10 +760,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ YCrCb2RGB()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ YCrCb2RGB() {}
__host__ __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB&) {}
};
template <int bidx> struct YCrCb2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
@ -783,8 +770,8 @@ namespace cv { namespace gpu { namespace device
{
return YCrCb2RGBConvert<bidx>(src);
}
__device__ __forceinline__ YCrCb2RGB() : unary_function<uint, uint>(){}
__device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ YCrCb2RGB() {}
__host__ __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB&) {}
};
}
@ -849,10 +836,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ RGB2XYZ()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ RGB2XYZ(const RGB2XYZ& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ RGB2XYZ() {}
__host__ __device__ __forceinline__ RGB2XYZ(const RGB2XYZ&) {}
};
template <int bidx> struct RGB2XYZ<uchar, 4, 4, bidx> : unary_function<uint, uint>
@ -861,8 +846,8 @@ namespace cv { namespace gpu { namespace device
{
return RGB2XYZConvert<bidx>(src);
}
__device__ __forceinline__ RGB2XYZ() : unary_function<uint, uint>(){}
__device__ __forceinline__ RGB2XYZ(const RGB2XYZ& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ RGB2XYZ() {}
__host__ __device__ __forceinline__ RGB2XYZ(const RGB2XYZ&) {}
};
}
@ -926,10 +911,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ XYZ2RGB()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ XYZ2RGB(const XYZ2RGB& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ XYZ2RGB() {}
__host__ __device__ __forceinline__ XYZ2RGB(const XYZ2RGB&) {}
};
template <int bidx> struct XYZ2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
@ -938,8 +921,8 @@ namespace cv { namespace gpu { namespace device
{
return XYZ2RGBConvert<bidx>(src);
}
__device__ __forceinline__ XYZ2RGB() : unary_function<uint, uint>(){}
__device__ __forceinline__ XYZ2RGB(const XYZ2RGB& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ XYZ2RGB() {}
__host__ __device__ __forceinline__ XYZ2RGB(const XYZ2RGB&) {}
};
}
@ -1066,10 +1049,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ RGB2HSV()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ RGB2HSV(const RGB2HSV& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ RGB2HSV() {}
__host__ __device__ __forceinline__ RGB2HSV(const RGB2HSV&) {}
};
template <int bidx, int hr> struct RGB2HSV<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@ -1078,8 +1059,8 @@ namespace cv { namespace gpu { namespace device
{
return RGB2HSVConvert<bidx, hr>(src);
}
__device__ __forceinline__ RGB2HSV():unary_function<uint, uint>(){}
__device__ __forceinline__ RGB2HSV(const RGB2HSV& other_):unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ RGB2HSV() {}
__host__ __device__ __forceinline__ RGB2HSV(const RGB2HSV&) {}
};
}
@ -1208,10 +1189,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ HSV2RGB()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ HSV2RGB(const HSV2RGB& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ HSV2RGB() {}
__host__ __device__ __forceinline__ HSV2RGB(const HSV2RGB&) {}
};
template <int bidx, int hr> struct HSV2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@ -1220,8 +1199,8 @@ namespace cv { namespace gpu { namespace device
{
return HSV2RGBConvert<bidx, hr>(src);
}
__device__ __forceinline__ HSV2RGB():unary_function<uint, uint>(){}
__device__ __forceinline__ HSV2RGB(const HSV2RGB& other_):unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ HSV2RGB() {}
__host__ __device__ __forceinline__ HSV2RGB(const HSV2RGB&) {}
};
}
@ -1343,10 +1322,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ RGB2HLS()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ RGB2HLS(const RGB2HLS& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ RGB2HLS() {}
__host__ __device__ __forceinline__ RGB2HLS(const RGB2HLS&) {}
};
template <int bidx, int hr> struct RGB2HLS<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@ -1355,8 +1332,8 @@ namespace cv { namespace gpu { namespace device
{
return RGB2HLSConvert<bidx, hr>(src);
}
__device__ __forceinline__ RGB2HLS() : unary_function<uint, uint>(){}
__device__ __forceinline__ RGB2HLS(const RGB2HLS& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ RGB2HLS() {}
__host__ __device__ __forceinline__ RGB2HLS(const RGB2HLS&) {}
};
}
@ -1485,10 +1462,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ HLS2RGB()
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__device__ __forceinline__ HLS2RGB(const HLS2RGB& other_)
: unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>(){}
__host__ __device__ __forceinline__ HLS2RGB() {}
__host__ __device__ __forceinline__ HLS2RGB(const HLS2RGB&) {}
};
template <int bidx, int hr> struct HLS2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
@ -1497,8 +1472,8 @@ namespace cv { namespace gpu { namespace device
{
return HLS2RGBConvert<bidx, hr>(src);
}
__device__ __forceinline__ HLS2RGB() : unary_function<uint, uint>(){}
__device__ __forceinline__ HLS2RGB(const HLS2RGB& other_) : unary_function<uint, uint>(){}
__host__ __device__ __forceinline__ HLS2RGB() {}
__host__ __device__ __forceinline__ HLS2RGB(const HLS2RGB&) {}
};
}
@ -1651,8 +1626,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ RGB2Lab() {}
__device__ __forceinline__ RGB2Lab(const RGB2Lab& other_) {}
__host__ __device__ __forceinline__ RGB2Lab() {}
__host__ __device__ __forceinline__ RGB2Lab(const RGB2Lab&) {}
};
template <int scn, int dcn, bool srgb, int blueIdx>
struct RGB2Lab<float, scn, dcn, srgb, blueIdx>
@ -1666,8 +1641,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ RGB2Lab() {}
__device__ __forceinline__ RGB2Lab(const RGB2Lab& other_) {}
__host__ __device__ __forceinline__ RGB2Lab() {}
__host__ __device__ __forceinline__ RGB2Lab(const RGB2Lab&) {}
};
}
@ -1764,8 +1739,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ Lab2RGB() {}
__device__ __forceinline__ Lab2RGB(const Lab2RGB& other_) {}
__host__ __device__ __forceinline__ Lab2RGB() {}
__host__ __device__ __forceinline__ Lab2RGB(const Lab2RGB&) {}
};
template <int scn, int dcn, bool srgb, int blueIdx>
struct Lab2RGB<float, scn, dcn, srgb, blueIdx>
@ -1779,8 +1754,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ Lab2RGB() {}
__device__ __forceinline__ Lab2RGB(const Lab2RGB& other_) {}
__host__ __device__ __forceinline__ Lab2RGB() {}
__host__ __device__ __forceinline__ Lab2RGB(const Lab2RGB&) {}
};
}
@ -1863,8 +1838,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ RGB2Luv() {}
__device__ __forceinline__ RGB2Luv(const RGB2Luv& other_) {}
__host__ __device__ __forceinline__ RGB2Luv() {}
__host__ __device__ __forceinline__ RGB2Luv(const RGB2Luv&) {}
};
template <int scn, int dcn, bool srgb, int blueIdx>
struct RGB2Luv<float, scn, dcn, srgb, blueIdx>
@ -1878,8 +1853,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ RGB2Luv() {}
__device__ __forceinline__ RGB2Luv(const RGB2Luv& other_) {}
__host__ __device__ __forceinline__ RGB2Luv() {}
__host__ __device__ __forceinline__ RGB2Luv(const RGB2Luv&) {}
};
}
@ -1964,8 +1939,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ Luv2RGB() {}
__device__ __forceinline__ Luv2RGB(const Luv2RGB& other_) {}
__host__ __device__ __forceinline__ Luv2RGB() {}
__host__ __device__ __forceinline__ Luv2RGB(const Luv2RGB&) {}
};
template <int scn, int dcn, bool srgb, int blueIdx>
struct Luv2RGB<float, scn, dcn, srgb, blueIdx>
@ -1979,8 +1954,8 @@ namespace cv { namespace gpu { namespace device
return dst;
}
__device__ __forceinline__ Luv2RGB() {}
__device__ __forceinline__ Luv2RGB(const Luv2RGB& other_) {}
__host__ __device__ __forceinline__ Luv2RGB() {}
__host__ __device__ __forceinline__ Luv2RGB(const Luv2RGB&) {}
};
}
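
The change repeated throughout this file is uniform: each color-conversion functor keeps its unary_function base, but its default and copy constructors gain __host__ in addition to __device__, and the redundant base-class initializers and unused other_ parameters are dropped. The likely motivation is that the Thrust shipped with CUDA 5.5 constructs and copies such functors in host code before the kernel launch, so device-only constructors no longer compile cleanly. A minimal sketch of the pattern with a hypothetical Invert functor, driven by plain thrust::transform rather than OpenCV's internal launch helpers:

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

// Hypothetical functor following the same convention as the RGB2RGB family:
// device-side operator(), host- and device-callable constructors.
struct Invert : thrust::unary_function<unsigned char, unsigned char>
{
    __device__ __forceinline__ unsigned char operator ()(unsigned char v) const
    {
        return 255 - v;
    }

    __host__ __device__ __forceinline__ Invert() {}
    __host__ __device__ __forceinline__ Invert(const Invert&) {}
};

int main()
{
    thrust::device_vector<unsigned char> src(16, 100), dst(16);
    // thrust::transform copies the functor on the host, which is why the
    // constructors above must be __host__ __device__.
    thrust::transform(src.begin(), src.end(), dst.begin(), Invert());
    return 0;
}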

@ -63,8 +63,8 @@ namespace cv { namespace gpu { namespace device
{
return a + b;
}
__device__ __forceinline__ plus(const plus& other):binary_function<T,T,T>(){}
__device__ __forceinline__ plus():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ plus() {}
__host__ __device__ __forceinline__ plus(const plus&) {}
};
template <typename T> struct minus : binary_function<T, T, T>
@ -74,8 +74,8 @@ namespace cv { namespace gpu { namespace device
{
return a - b;
}
__device__ __forceinline__ minus(const minus& other):binary_function<T,T,T>(){}
__device__ __forceinline__ minus():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ minus() {}
__host__ __device__ __forceinline__ minus(const minus&) {}
};
template <typename T> struct multiplies : binary_function<T, T, T>
@ -85,8 +85,8 @@ namespace cv { namespace gpu { namespace device
{
return a * b;
}
__device__ __forceinline__ multiplies(const multiplies& other):binary_function<T,T,T>(){}
__device__ __forceinline__ multiplies():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ multiplies() {}
__host__ __device__ __forceinline__ multiplies(const multiplies&) {}
};
template <typename T> struct divides : binary_function<T, T, T>
@ -96,8 +96,8 @@ namespace cv { namespace gpu { namespace device
{
return a / b;
}
__device__ __forceinline__ divides(const divides& other):binary_function<T,T,T>(){}
__device__ __forceinline__ divides():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ divides() {}
__host__ __device__ __forceinline__ divides(const divides&) {}
};
template <typename T> struct modulus : binary_function<T, T, T>
@ -107,8 +107,8 @@ namespace cv { namespace gpu { namespace device
{
return a % b;
}
__device__ __forceinline__ modulus(const modulus& other):binary_function<T,T,T>(){}
__device__ __forceinline__ modulus():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ modulus() {}
__host__ __device__ __forceinline__ modulus(const modulus&) {}
};
template <typename T> struct negate : unary_function<T, T>
@ -117,8 +117,8 @@ namespace cv { namespace gpu { namespace device
{
return -a;
}
__device__ __forceinline__ negate(const negate& other):unary_function<T,T>(){}
__device__ __forceinline__ negate():unary_function<T,T>(){}
__host__ __device__ __forceinline__ negate() {}
__host__ __device__ __forceinline__ negate(const negate&) {}
};
// Comparison Operations
@ -129,8 +129,8 @@ namespace cv { namespace gpu { namespace device
{
return a == b;
}
__device__ __forceinline__ equal_to(const equal_to& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ equal_to():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ equal_to() {}
__host__ __device__ __forceinline__ equal_to(const equal_to&) {}
};
template <typename T> struct not_equal_to : binary_function<T, T, bool>
@ -140,8 +140,8 @@ namespace cv { namespace gpu { namespace device
{
return a != b;
}
__device__ __forceinline__ not_equal_to(const not_equal_to& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ not_equal_to():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ not_equal_to() {}
__host__ __device__ __forceinline__ not_equal_to(const not_equal_to&) {}
};
template <typename T> struct greater : binary_function<T, T, bool>
@ -151,8 +151,8 @@ namespace cv { namespace gpu { namespace device
{
return a > b;
}
__device__ __forceinline__ greater(const greater& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ greater():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ greater() {}
__host__ __device__ __forceinline__ greater(const greater&) {}
};
template <typename T> struct less : binary_function<T, T, bool>
@ -162,8 +162,8 @@ namespace cv { namespace gpu { namespace device
{
return a < b;
}
__device__ __forceinline__ less(const less& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ less():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ less() {}
__host__ __device__ __forceinline__ less(const less&) {}
};
template <typename T> struct greater_equal : binary_function<T, T, bool>
@ -173,8 +173,8 @@ namespace cv { namespace gpu { namespace device
{
return a >= b;
}
__device__ __forceinline__ greater_equal(const greater_equal& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ greater_equal():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ greater_equal() {}
__host__ __device__ __forceinline__ greater_equal(const greater_equal&) {}
};
template <typename T> struct less_equal : binary_function<T, T, bool>
@ -184,8 +184,8 @@ namespace cv { namespace gpu { namespace device
{
return a <= b;
}
__device__ __forceinline__ less_equal(const less_equal& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ less_equal():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ less_equal() {}
__host__ __device__ __forceinline__ less_equal(const less_equal&) {}
};
// Logical Operations
@ -196,8 +196,8 @@ namespace cv { namespace gpu { namespace device
{
return a && b;
}
__device__ __forceinline__ logical_and(const logical_and& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ logical_and():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ logical_and() {}
__host__ __device__ __forceinline__ logical_and(const logical_and&) {}
};
template <typename T> struct logical_or : binary_function<T, T, bool>
@ -207,8 +207,8 @@ namespace cv { namespace gpu { namespace device
{
return a || b;
}
__device__ __forceinline__ logical_or(const logical_or& other):binary_function<T,T,bool>(){}
__device__ __forceinline__ logical_or():binary_function<T,T,bool>(){}
__host__ __device__ __forceinline__ logical_or() {}
__host__ __device__ __forceinline__ logical_or(const logical_or&) {}
};
template <typename T> struct logical_not : unary_function<T, bool>
@ -217,8 +217,8 @@ namespace cv { namespace gpu { namespace device
{
return !a;
}
__device__ __forceinline__ logical_not(const logical_not& other):unary_function<T,bool>(){}
__device__ __forceinline__ logical_not():unary_function<T,bool>(){}
__host__ __device__ __forceinline__ logical_not() {}
__host__ __device__ __forceinline__ logical_not(const logical_not&) {}
};
// Bitwise Operations
@ -229,8 +229,8 @@ namespace cv { namespace gpu { namespace device
{
return a & b;
}
__device__ __forceinline__ bit_and(const bit_and& other):binary_function<T,T,T>(){}
__device__ __forceinline__ bit_and():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ bit_and() {}
__host__ __device__ __forceinline__ bit_and(const bit_and&) {}
};
template <typename T> struct bit_or : binary_function<T, T, T>
@ -240,8 +240,8 @@ namespace cv { namespace gpu { namespace device
{
return a | b;
}
__device__ __forceinline__ bit_or(const bit_or& other):binary_function<T,T,T>(){}
__device__ __forceinline__ bit_or():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ bit_or() {}
__host__ __device__ __forceinline__ bit_or(const bit_or&) {}
};
template <typename T> struct bit_xor : binary_function<T, T, T>
@ -251,8 +251,8 @@ namespace cv { namespace gpu { namespace device
{
return a ^ b;
}
__device__ __forceinline__ bit_xor(const bit_xor& other):binary_function<T,T,T>(){}
__device__ __forceinline__ bit_xor():binary_function<T,T,T>(){}
__host__ __device__ __forceinline__ bit_xor() {}
__host__ __device__ __forceinline__ bit_xor(const bit_xor&) {}
};
template <typename T> struct bit_not : unary_function<T, T>
@ -261,8 +261,8 @@ namespace cv { namespace gpu { namespace device
{
return ~v;
}
__device__ __forceinline__ bit_not(const bit_not& other):unary_function<T,T>(){}
__device__ __forceinline__ bit_not():unary_function<T,T>(){}
__host__ __device__ __forceinline__ bit_not() {}
__host__ __device__ __forceinline__ bit_not(const bit_not&) {}
};
// Generalized Identity Operations
@ -272,8 +272,8 @@ namespace cv { namespace gpu { namespace device
{
return x;
}
__device__ __forceinline__ identity(const identity& other):unary_function<T,T>(){}
__device__ __forceinline__ identity():unary_function<T,T>(){}
__host__ __device__ __forceinline__ identity() {}
__host__ __device__ __forceinline__ identity(const identity&) {}
};
template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
@ -282,8 +282,8 @@ namespace cv { namespace gpu { namespace device
{
return lhs;
}
__device__ __forceinline__ project1st(const project1st& other):binary_function<T1,T2,T1>(){}
__device__ __forceinline__ project1st():binary_function<T1,T2,T1>(){}
__host__ __device__ __forceinline__ project1st() {}
__host__ __device__ __forceinline__ project1st(const project1st&) {}
};
template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
@ -292,8 +292,8 @@ namespace cv { namespace gpu { namespace device
{
return rhs;
}
__device__ __forceinline__ project2nd(const project2nd& other):binary_function<T1,T2,T2>(){}
__device__ __forceinline__ project2nd():binary_function<T1,T2,T2>(){}
__host__ __device__ __forceinline__ project2nd() {}
__host__ __device__ __forceinline__ project2nd(const project2nd&) {}
};
// Min/Max Operations
@ -302,8 +302,8 @@ namespace cv { namespace gpu { namespace device
template <> struct name<type> : binary_function<type, type, type> \
{ \
__device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
__device__ __forceinline__ name() {}\
__device__ __forceinline__ name(const name&) {}\
__host__ __device__ __forceinline__ name() {}\
__host__ __device__ __forceinline__ name(const name&) {}\
};
template <typename T> struct maximum : binary_function<T, T, T>
@ -312,8 +312,8 @@ namespace cv { namespace gpu { namespace device
{
return max(lhs, rhs);
}
__device__ __forceinline__ maximum() {}
__device__ __forceinline__ maximum(const maximum&) {}
__host__ __device__ __forceinline__ maximum() {}
__host__ __device__ __forceinline__ maximum(const maximum&) {}
};
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
@ -332,8 +332,8 @@ namespace cv { namespace gpu { namespace device
{
return min(lhs, rhs);
}
__device__ __forceinline__ minimum() {}
__device__ __forceinline__ minimum(const minimum&) {}
__host__ __device__ __forceinline__ minimum() {}
__host__ __device__ __forceinline__ minimum(const minimum&) {}
};
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
@ -349,7 +349,6 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_MINMAX
// Math functions
///bound=========================================
template <typename T> struct abs_func : unary_function<T, T>
{
@ -358,8 +357,8 @@ namespace cv { namespace gpu { namespace device
return abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
{
@ -368,8 +367,8 @@ namespace cv { namespace gpu { namespace device
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<signed char> : unary_function<signed char, signed char>
{
@ -378,8 +377,8 @@ namespace cv { namespace gpu { namespace device
return ::abs((int)x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<char> : unary_function<char, char>
{
@ -388,8 +387,8 @@ namespace cv { namespace gpu { namespace device
return ::abs((int)x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
{
@ -398,8 +397,8 @@ namespace cv { namespace gpu { namespace device
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<short> : unary_function<short, short>
{
@ -408,8 +407,8 @@ namespace cv { namespace gpu { namespace device
return ::abs((int)x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
{
@ -418,8 +417,8 @@ namespace cv { namespace gpu { namespace device
return x;
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<int> : unary_function<int, int>
{
@ -428,8 +427,8 @@ namespace cv { namespace gpu { namespace device
return ::abs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<float> : unary_function<float, float>
{
@ -438,8 +437,8 @@ namespace cv { namespace gpu { namespace device
return ::fabsf(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
template <> struct abs_func<double> : unary_function<double, double>
{
@ -448,8 +447,8 @@ namespace cv { namespace gpu { namespace device
return ::fabs(x);
}
__device__ __forceinline__ abs_func() {}
__device__ __forceinline__ abs_func(const abs_func&) {}
__host__ __device__ __forceinline__ abs_func() {}
__host__ __device__ __forceinline__ abs_func(const abs_func&) {}
};
#define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
@ -459,8 +458,8 @@ namespace cv { namespace gpu { namespace device
{ \
return func ## f(v); \
} \
__device__ __forceinline__ name ## _func() {} \
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
__host__ __device__ __forceinline__ name ## _func() {} \
__host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
}; \
template <> struct name ## _func<double> : unary_function<double, double> \
{ \
@ -468,8 +467,8 @@ namespace cv { namespace gpu { namespace device
{ \
return func(v); \
} \
__device__ __forceinline__ name ## _func() {} \
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
__host__ __device__ __forceinline__ name ## _func() {} \
__host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
};
#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
@ -479,6 +478,8 @@ namespace cv { namespace gpu { namespace device
{ \
return func ## f(v1, v2); \
} \
__host__ __device__ __forceinline__ name ## _func() {} \
__host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
}; \
template <> struct name ## _func<double> : binary_function<double, double, double> \
{ \
@ -486,6 +487,8 @@ namespace cv { namespace gpu { namespace device
{ \
return func(v1, v2); \
} \
__host__ __device__ __forceinline__ name ## _func() {} \
__host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
};
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
@ -522,8 +525,8 @@ namespace cv { namespace gpu { namespace device
{
return src1 * src1 + src2 * src2;
}
__device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function<T, T, float>(){}
__device__ __forceinline__ hypot_sqr_func() : binary_function<T, T, float>(){}
__host__ __device__ __forceinline__ hypot_sqr_func() {}
__host__ __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func&) {}
};
// Saturate Cast Functor
@ -533,8 +536,8 @@ namespace cv { namespace gpu { namespace device
{
return saturate_cast<D>(v);
}
__device__ __forceinline__ saturate_cast_func(const saturate_cast_func& other):unary_function<T, D>(){}
__device__ __forceinline__ saturate_cast_func():unary_function<T, D>(){}
__host__ __device__ __forceinline__ saturate_cast_func() {}
__host__ __device__ __forceinline__ saturate_cast_func(const saturate_cast_func&) {}
};
// Threshold Functors
@ -547,10 +550,9 @@ namespace cv { namespace gpu { namespace device
return (src > thresh) * maxVal;
}
__device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
: unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
__device__ __forceinline__ thresh_binary_func():unary_function<T, T>(){}
__host__ __device__ __forceinline__ thresh_binary_func() {}
__host__ __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
: thresh(other.thresh), maxVal(other.maxVal) {}
const T thresh;
const T maxVal;
@ -565,10 +567,9 @@ namespace cv { namespace gpu { namespace device
return (src <= thresh) * maxVal;
}
__device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
: unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
__device__ __forceinline__ thresh_binary_inv_func():unary_function<T, T>(){}
__host__ __device__ __forceinline__ thresh_binary_inv_func() {}
__host__ __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
: thresh(other.thresh), maxVal(other.maxVal) {}
const T thresh;
const T maxVal;
@ -583,10 +584,9 @@ namespace cv { namespace gpu { namespace device
return minimum<T>()(src, thresh);
}
__device__ __forceinline__ thresh_trunc_func(const thresh_trunc_func& other)
: unary_function<T, T>(), thresh(other.thresh){}
__device__ __forceinline__ thresh_trunc_func():unary_function<T, T>(){}
__host__ __device__ __forceinline__ thresh_trunc_func() {}
__host__ __device__ __forceinline__ thresh_trunc_func(const thresh_trunc_func& other)
: thresh(other.thresh) {}
const T thresh;
};
@ -599,10 +599,10 @@ namespace cv { namespace gpu { namespace device
{
return (src > thresh) * src;
}
__device__ __forceinline__ thresh_to_zero_func(const thresh_to_zero_func& other)
: unary_function<T, T>(), thresh(other.thresh){}
__device__ __forceinline__ thresh_to_zero_func():unary_function<T, T>(){}
__host__ __device__ __forceinline__ thresh_to_zero_func() {}
__host__ __device__ __forceinline__ thresh_to_zero_func(const thresh_to_zero_func& other)
: thresh(other.thresh) {}
const T thresh;
};
@ -615,14 +615,14 @@ namespace cv { namespace gpu { namespace device
{
return (src <= thresh) * src;
}
__device__ __forceinline__ thresh_to_zero_inv_func(const thresh_to_zero_inv_func& other)
: unary_function<T, T>(), thresh(other.thresh){}
__device__ __forceinline__ thresh_to_zero_inv_func():unary_function<T, T>(){}
__host__ __device__ __forceinline__ thresh_to_zero_inv_func() {}
__host__ __device__ __forceinline__ thresh_to_zero_inv_func(const thresh_to_zero_inv_func& other)
: thresh(other.thresh) {}
const T thresh;
};
//bound!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ============>
// Function Object Adaptors
template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>
{
@ -633,8 +633,8 @@ namespace cv { namespace gpu { namespace device
return !pred(x);
}
__device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function<typename Predicate::argument_type, bool>(){}
__device__ __forceinline__ unary_negate() : unary_function<typename Predicate::argument_type, bool>(){}
__host__ __device__ __forceinline__ unary_negate() {}
__host__ __device__ __forceinline__ unary_negate(const unary_negate& other) : pred(other.pred) {}
const Predicate pred;
};
@ -653,11 +653,9 @@ namespace cv { namespace gpu { namespace device
{
return !pred(x,y);
}
__device__ __forceinline__ binary_negate(const binary_negate& other)
: binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
__device__ __forceinline__ binary_negate() :
binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
__host__ __device__ __forceinline__ binary_negate() {}
__host__ __device__ __forceinline__ binary_negate(const binary_negate& other) : pred(other.pred) {}
const Predicate pred;
};
@ -676,8 +674,8 @@ namespace cv { namespace gpu { namespace device
return op(arg1, a);
}
__device__ __forceinline__ binder1st(const binder1st& other) :
unary_function<typename Op::second_argument_type, typename Op::result_type>(){}
__host__ __device__ __forceinline__ binder1st() {}
__host__ __device__ __forceinline__ binder1st(const binder1st& other) : op(other.op), arg1(other.arg1) {}
const Op op;
const typename Op::first_argument_type arg1;
@ -697,8 +695,8 @@ namespace cv { namespace gpu { namespace device
return op(a, arg2);
}
__device__ __forceinline__ binder2nd(const binder2nd& other) :
unary_function<typename Op::first_argument_type, typename Op::result_type>(), op(other.op), arg2(other.arg2){}
__host__ __device__ __forceinline__ binder2nd() {}
__host__ __device__ __forceinline__ binder2nd(const binder2nd& other) : op(other.op), arg2(other.arg2) {}
const Op op;
const typename Op::second_argument_type arg2;
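
For the stateless functors above, the copy constructor can simply ignore its argument, but the stateful ones (thresh_binary_func, binder1st, binder2nd) now copy their members explicitly instead of only re-initializing the base class. A small sketch of that pattern, using a hypothetical scale_add functor (not part of OpenCV) laid out like binder2nd:

#include <thrust/functional.h>

// Hypothetical stateful functor: const members, __host__ __device__
// constructors, and a copy constructor that actually copies the state.
template <typename T> struct scale_add : thrust::unary_function<T, T>
{
    __host__ __device__ __forceinline__ scale_add(T scale_, T shift_)
        : scale(scale_), shift(shift_) {}

    __host__ __device__ __forceinline__ scale_add(const scale_add& other)
        : scale(other.scale), shift(other.shift) {}

    __device__ __forceinline__ T operator ()(T x) const
    {
        return scale * x + shift;
    }

    const T scale;
    const T shift;
};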

@ -124,8 +124,8 @@ namespace cv { namespace gpu { namespace device
struct WithOutMask
{
__device__ __forceinline__ WithOutMask(){}
__device__ __forceinline__ WithOutMask(const WithOutMask& mask){}
__host__ __device__ __forceinline__ WithOutMask(){}
__host__ __device__ __forceinline__ WithOutMask(const WithOutMask&){}
__device__ __forceinline__ void next() const
{

@ -67,8 +67,8 @@ namespace cv { namespace gpu { namespace device
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
}
__device__ __forceinline__ TransformOp() {}
__device__ __forceinline__ TransformOp(const TransformOp&) {}
__host__ __device__ __forceinline__ TransformOp() {}
__host__ __device__ __forceinline__ TransformOp(const TransformOp&) {}
};
void call(const PtrStepSz<float3> src, const float* rot,
@ -106,8 +106,8 @@ namespace cv { namespace gpu { namespace device
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
}
__device__ __forceinline__ ProjectOp() {}
__device__ __forceinline__ ProjectOp(const ProjectOp&) {}
__host__ __device__ __forceinline__ ProjectOp() {}
__host__ __device__ __forceinline__ ProjectOp(const ProjectOp&) {}
};
void call(const PtrStepSz<float3> src, const float* rot,

@ -62,8 +62,8 @@ namespace canny
return ::abs(x) + ::abs(y);
}
__device__ __forceinline__ L1() {}
__device__ __forceinline__ L1(const L1&) {}
__host__ __device__ __forceinline__ L1() {}
__host__ __device__ __forceinline__ L1(const L1&) {}
};
struct L2 : binary_function<int, int, float>
{
@ -72,8 +72,8 @@ namespace canny
return ::sqrtf(x * x + y * y);
}
__device__ __forceinline__ L2() {}
__device__ __forceinline__ L2(const L2&) {}
__host__ __device__ __forceinline__ L2() {}
__host__ __device__ __forceinline__ L2(const L2&) {}
};
}
@ -470,8 +470,8 @@ namespace canny
return (uchar)(-(e >> 1));
}
__device__ __forceinline__ GetEdges() {}
__device__ __forceinline__ GetEdges(const GetEdges&) {}
__host__ __device__ __forceinline__ GetEdges() {}
__host__ __device__ __forceinline__ GetEdges(const GetEdges&) {}
};
}

@ -162,8 +162,8 @@ namespace arithm
return vadd4(a, b);
}
__device__ __forceinline__ VAdd4() {}
__device__ __forceinline__ VAdd4(const VAdd4& other) {}
__host__ __device__ __forceinline__ VAdd4() {}
__host__ __device__ __forceinline__ VAdd4(const VAdd4&) {}
};
////////////////////////////////////
@ -175,8 +175,8 @@ namespace arithm
return vadd2(a, b);
}
__device__ __forceinline__ VAdd2() {}
__device__ __forceinline__ VAdd2(const VAdd2& other) {}
__host__ __device__ __forceinline__ VAdd2() {}
__host__ __device__ __forceinline__ VAdd2(const VAdd2&) {}
};
////////////////////////////////////
@ -188,8 +188,8 @@ namespace arithm
return saturate_cast<D>(a + b);
}
__device__ __forceinline__ AddMat() {}
__device__ __forceinline__ AddMat(const AddMat& other) {}
__host__ __device__ __forceinline__ AddMat() {}
__host__ __device__ __forceinline__ AddMat(const AddMat&) {}
};
}
@ -397,8 +397,8 @@ namespace arithm
return vsub4(a, b);
}
__device__ __forceinline__ VSub4() {}
__device__ __forceinline__ VSub4(const VSub4& other) {}
__host__ __device__ __forceinline__ VSub4() {}
__host__ __device__ __forceinline__ VSub4(const VSub4&) {}
};
////////////////////////////////////
@ -410,8 +410,8 @@ namespace arithm
return vsub2(a, b);
}
__device__ __forceinline__ VSub2() {}
__device__ __forceinline__ VSub2(const VSub2& other) {}
__host__ __device__ __forceinline__ VSub2() {}
__host__ __device__ __forceinline__ VSub2(const VSub2&) {}
};
////////////////////////////////////
@ -423,8 +423,8 @@ namespace arithm
return saturate_cast<D>(a - b);
}
__device__ __forceinline__ SubMat() {}
__device__ __forceinline__ SubMat(const SubMat& other) {}
__host__ __device__ __forceinline__ SubMat() {}
__host__ __device__ __forceinline__ SubMat(const SubMat&) {}
};
}
@ -617,8 +617,8 @@ namespace arithm
return res;
}
__device__ __forceinline__ Mul_8uc4_32f() {}
__device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {}
__host__ __device__ __forceinline__ Mul_8uc4_32f() {}
__host__ __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f&) {}
};
struct Mul_16sc4_32f : binary_function<short4, float, short4>
@ -629,8 +629,8 @@ namespace arithm
saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
}
__device__ __forceinline__ Mul_16sc4_32f() {}
__device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {}
__host__ __device__ __forceinline__ Mul_16sc4_32f() {}
__host__ __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f&) {}
};
template <typename T, typename D> struct Mul : binary_function<T, T, D>
@ -640,8 +640,8 @@ namespace arithm
return saturate_cast<D>(a * b);
}
__device__ __forceinline__ Mul() {}
__device__ __forceinline__ Mul(const Mul& other) {}
__host__ __device__ __forceinline__ Mul() {}
__host__ __device__ __forceinline__ Mul(const Mul&) {}
};
template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
@ -888,8 +888,8 @@ namespace arithm
return b != 0 ? saturate_cast<D>(a / b) : 0;
}
__device__ __forceinline__ Div() {}
__device__ __forceinline__ Div(const Div& other) {}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, float> : binary_function<T, T, float>
{
@ -898,8 +898,8 @@ namespace arithm
return b != 0 ? static_cast<float>(a) / b : 0;
}
__device__ __forceinline__ Div() {}
__device__ __forceinline__ Div(const Div& other) {}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, double> : binary_function<T, T, double>
{
@ -908,8 +908,8 @@ namespace arithm
return b != 0 ? static_cast<double>(a) / b : 0;
}
__device__ __forceinline__ Div() {}
__device__ __forceinline__ Div(const Div& other) {}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
@ -1196,8 +1196,8 @@ namespace arithm
return vabsdiff4(a, b);
}
__device__ __forceinline__ VAbsDiff4() {}
__device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
__host__ __device__ __forceinline__ VAbsDiff4() {}
__host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
};
////////////////////////////////////
@ -1209,8 +1209,8 @@ namespace arithm
return vabsdiff2(a, b);
}
__device__ __forceinline__ VAbsDiff2() {}
__device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
__host__ __device__ __forceinline__ VAbsDiff2() {}
__host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
};
////////////////////////////////////
@ -1235,8 +1235,8 @@ namespace arithm
return saturate_cast<T>(_abs(a - b));
}
__device__ __forceinline__ AbsDiffMat() {}
__device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {}
__host__ __device__ __forceinline__ AbsDiffMat() {}
__host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
};
}
@ -1370,8 +1370,8 @@ namespace arithm
return saturate_cast<T>(x * x);
}
__device__ __forceinline__ Sqr() {}
__device__ __forceinline__ Sqr(const Sqr& other) {}
__host__ __device__ __forceinline__ Sqr() {}
__host__ __device__ __forceinline__ Sqr(const Sqr&) {}
};
}
@ -1466,8 +1466,8 @@ namespace arithm
return saturate_cast<T>(f(x));
}
__device__ __forceinline__ Exp() {}
__device__ __forceinline__ Exp(const Exp& other) {}
__host__ __device__ __forceinline__ Exp() {}
__host__ __device__ __forceinline__ Exp(const Exp&) {}
};
}
@ -1507,8 +1507,8 @@ namespace arithm
return vcmpeq4(a, b);
}
__device__ __forceinline__ VCmpEq4() {}
__device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
__host__ __device__ __forceinline__ VCmpEq4() {}
__host__ __device__ __forceinline__ VCmpEq4(const VCmpEq4&) {}
};
struct VCmpNe4 : binary_function<uint, uint, uint>
{
@ -1517,8 +1517,8 @@ namespace arithm
return vcmpne4(a, b);
}
__device__ __forceinline__ VCmpNe4() {}
__device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
__host__ __device__ __forceinline__ VCmpNe4() {}
__host__ __device__ __forceinline__ VCmpNe4(const VCmpNe4&) {}
};
struct VCmpLt4 : binary_function<uint, uint, uint>
{
@ -1527,8 +1527,8 @@ namespace arithm
return vcmplt4(a, b);
}
__device__ __forceinline__ VCmpLt4() {}
__device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
__host__ __device__ __forceinline__ VCmpLt4() {}
__host__ __device__ __forceinline__ VCmpLt4(const VCmpLt4&) {}
};
struct VCmpLe4 : binary_function<uint, uint, uint>
{
@ -1537,8 +1537,8 @@ namespace arithm
return vcmple4(a, b);
}
__device__ __forceinline__ VCmpLe4() {}
__device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
__host__ __device__ __forceinline__ VCmpLe4() {}
__host__ __device__ __forceinline__ VCmpLe4(const VCmpLe4&) {}
};
////////////////////////////////////
@ -2008,8 +2008,8 @@ namespace arithm
return vmin4(a, b);
}
__device__ __forceinline__ VMin4() {}
__device__ __forceinline__ VMin4(const VMin4& other) {}
__host__ __device__ __forceinline__ VMin4() {}
__host__ __device__ __forceinline__ VMin4(const VMin4&) {}
};
////////////////////////////////////
@ -2021,8 +2021,8 @@ namespace arithm
return vmin2(a, b);
}
__device__ __forceinline__ VMin2() {}
__device__ __forceinline__ VMin2(const VMin2& other) {}
__host__ __device__ __forceinline__ VMin2() {}
__host__ __device__ __forceinline__ VMin2(const VMin2&) {}
};
}
@ -2100,8 +2100,8 @@ namespace arithm
return vmax4(a, b);
}
__device__ __forceinline__ VMax4() {}
__device__ __forceinline__ VMax4(const VMax4& other) {}
__host__ __device__ __forceinline__ VMax4() {}
__host__ __device__ __forceinline__ VMax4(const VMax4&) {}
};
////////////////////////////////////
@ -2113,8 +2113,8 @@ namespace arithm
return vmax2(a, b);
}
__device__ __forceinline__ VMax2() {}
__device__ __forceinline__ VMax2(const VMax2& other) {}
__host__ __device__ __forceinline__ VMax2() {}
__host__ __device__ __forceinline__ VMax2(const VMax2&) {}
};
}
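
All of the arithmetic functors in this file follow the same two conventions: saturating conversion of the result via saturate_cast<D>, and __host__ __device__ constructors so they can be created on the host. As a host-side illustration only (the real device code uses saturate_cast), the per-pixel behaviour of AddMat<uchar, uchar> amounts to a clamped rather than wrapping addition:

#include <algorithm>
#include <cstdio>

// Host-side sketch of what saturate_cast<uchar>(a + b) does in AddMat:
// the sum is clamped to the destination range instead of wrapping around.
static unsigned char add_saturate_u8(unsigned char a, unsigned char b)
{
    const int sum = int(a) + int(b);
    return (unsigned char)std::min(sum, 255);
}

int main()
{
    std::printf("%d\n", (int)add_saturate_u8(200, 100)); // prints 255, not 44
    return 0;
}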

@ -81,48 +81,90 @@ namespace
const ErrorEntry npp_errors [] =
{
error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ),
error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ),
error_entry( NPP_RESIZE_NO_OPERATION_ERROR ),
#if defined (_MSC_VER)
error_entry( NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY ),
#endif
#if NPP_VERSION < 5500
error_entry( NPP_BAD_ARG_ERROR ),
error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_TEXTURE_BIND_ERROR ),
error_entry( NPP_COEFF_ERROR ),
error_entry( NPP_RECT_ERROR ),
error_entry( NPP_QUAD_ERROR ),
error_entry( NPP_MEMFREE_ERR ),
error_entry( NPP_MEMSET_ERR ),
error_entry( NPP_MEM_ALLOC_ERR ),
error_entry( NPP_HISTO_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_MIRROR_FLIP_ERR ),
error_entry( NPP_INVALID_INPUT ),
error_entry( NPP_POINTER_ERROR ),
error_entry( NPP_WARNING ),
error_entry( NPP_ODD_ROI_WARNING ),
#else
error_entry( NPP_INVALID_HOST_POINTER_ERROR ),
error_entry( NPP_INVALID_DEVICE_POINTER_ERROR ),
error_entry( NPP_LUT_PALETTE_BITSIZE_ERROR ),
error_entry( NPP_ZC_MODE_NOT_SUPPORTED_ERROR ),
error_entry( NPP_MEMFREE_ERROR ),
error_entry( NPP_MEMSET_ERROR ),
error_entry( NPP_QUALITY_INDEX_ERROR ),
error_entry( NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_CHANNEL_ORDER_ERROR ),
error_entry( NPP_ZERO_MASK_VALUE_ERROR ),
error_entry( NPP_QUADRANGLE_ERROR ),
error_entry( NPP_RECTANGLE_ERROR ),
error_entry( NPP_COEFFICIENT_ERROR ),
error_entry( NPP_NUMBER_OF_CHANNELS_ERROR ),
error_entry( NPP_COI_ERROR ),
error_entry( NPP_DIVISOR_ERROR ),
error_entry( NPP_CHANNEL_ERROR ),
error_entry( NPP_STRIDE_ERROR ),
error_entry( NPP_ANCHOR_ERROR ),
error_entry( NPP_MASK_SIZE_ERROR ),
error_entry( NPP_MIRROR_FLIP_ERROR ),
error_entry( NPP_MOMENT_00_ZERO_ERROR ),
error_entry( NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR ),
error_entry( NPP_THRESHOLD_ERROR ),
error_entry( NPP_CONTEXT_MATCH_ERROR ),
error_entry( NPP_FFT_FLAG_ERROR ),
error_entry( NPP_FFT_ORDER_ERROR ),
error_entry( NPP_SCALE_RANGE_ERROR ),
error_entry( NPP_DATA_TYPE_ERROR ),
error_entry( NPP_OUT_OFF_RANGE_ERROR ),
error_entry( NPP_DIVIDE_BY_ZERO_ERROR ),
error_entry( NPP_MEMORY_ALLOCATION_ERR ),
error_entry( NPP_RANGE_ERROR ),
error_entry( NPP_BAD_ARGUMENT_ERROR ),
error_entry( NPP_NO_MEMORY_ERROR ),
error_entry( NPP_ERROR_RESERVED ),
error_entry( NPP_NO_OPERATION_WARNING ),
error_entry( NPP_DIVIDE_BY_ZERO_WARNING ),
error_entry( NPP_WRONG_INTERSECTION_ROI_WARNING ),
#endif
error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ),
error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ),
error_entry( NPP_RESIZE_NO_OPERATION_ERROR ),
error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_TEXTURE_BIND_ERROR ),
error_entry( NPP_WRONG_INTERSECTION_ROI_ERROR ),
error_entry( NPP_NOT_EVEN_STEP_ERROR ),
error_entry( NPP_INTERPOLATION_ERROR ),
error_entry( NPP_RESIZE_FACTOR_ERROR ),
error_entry( NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR ),
error_entry( NPP_MEMFREE_ERR ),
error_entry( NPP_MEMSET_ERR ),
error_entry( NPP_MEMCPY_ERROR ),
error_entry( NPP_MEM_ALLOC_ERR ),
error_entry( NPP_HISTO_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_MIRROR_FLIP_ERR ),
error_entry( NPP_INVALID_INPUT ),
error_entry( NPP_ALIGNMENT_ERROR ),
error_entry( NPP_STEP_ERROR ),
error_entry( NPP_SIZE_ERROR ),
error_entry( NPP_POINTER_ERROR ),
error_entry( NPP_NULL_POINTER_ERROR ),
error_entry( NPP_CUDA_KERNEL_EXECUTION_ERROR ),
error_entry( NPP_NOT_IMPLEMENTED_ERROR ),
error_entry( NPP_ERROR ),
error_entry( NPP_NO_ERROR ),
error_entry( NPP_SUCCESS ),
error_entry( NPP_WARNING ),
error_entry( NPP_WRONG_INTERSECTION_QUAD_WARNING ),
error_entry( NPP_MISALIGNED_DST_ROI_WARNING ),
error_entry( NPP_AFFINE_QUAD_INCORRECT_WARNING ),
error_entry( NPP_DOUBLE_SIZE_WARNING ),
error_entry( NPP_ODD_ROI_WARNING )
error_entry( NPP_DOUBLE_SIZE_WARNING )
};
const size_t npp_error_num = sizeof(npp_errors) / sizeof(npp_errors[0]);
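
The table maps NPP status codes to printable names; codes that were renamed or removed between NPP 5.0 and 5.5 are now selected with #if NPP_VERSION < 5500, while the codes common to both versions stay unconditional. Assuming error_entry expands to a { code, "code" } pair (its definition lies outside this hunk), a lookup over such a table might look like the following sketch:

#include <cstddef>

// Assumed entry shape; the real ErrorEntry / error_entry definitions are not
// part of this diff.
struct ErrorEntry
{
    int code;
    const char* str;
};

// Linear search mirroring how npp_errors / npp_error_num would be consumed
// when formatting an NPP error message.
static const char* findErrorName(int code, const ErrorEntry* table, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
        if (table[i].code == code)
            return table[i].str;
    return "unknown NPP status code";
}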

@ -187,10 +187,20 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
NppiSize oSizeROI, Npp64f* pRetVal);
#if CUDA_VERSION < 5050
typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, NppiSize oSizeROI, Npp64f* pRetVal);
static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
#else
typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
NppiSize oSizeROI, Npp64f* pRetVal, Npp8u * pDeviceBuffer);
typedef NppStatus (*buf_size_func_t)(NppiSize oSizeROI, int* hpBufferSize);
static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
static const buf_size_func_t buf_size_funcs[] = {nppiNormDiffInfGetBufferHostSize_8u_C1R, nppiNormDiffL1GetBufferHostSize_8u_C1R, nppiNormDiffL2GetBufferHostSize_8u_C1R};
#endif
NppiSize sz;
sz.width = src1.cols;
@ -202,7 +212,16 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
DeviceBuffer dbuf;
nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
#if CUDA_VERSION < 5050
nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
#else
int bufSize;
buf_size_funcs[funcIdx](sz, &bufSize);
GpuMat buf(1, bufSize, CV_8UC1);
nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf, buf.data) );
#endif
cudaSafeCall( cudaDeviceSynchronize() );
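
Starting with NPP 5.5 the nppiNormDiff_* primitives take an extra device scratch buffer, so the new code path first asks the matching nppiNormDiff*GetBufferHostSize_8u_C1R function for the required size and allocates that many bytes on the GPU. A condensed sketch of the 5.5 call sequence for the L2 case, with plain CUDA allocations in place of DeviceBuffer/GpuMat and error checking omitted:

#include <cuda_runtime.h>
#include <nppi.h>

// NPP 5.5 path: query scratch-buffer size, allocate it on the device, then
// call the primitive with the extra buffer argument.
Npp64f normDiffL2(const Npp8u* src1, int step1,
                  const Npp8u* src2, int step2, NppiSize sz)
{
    int bufSize = 0;
    nppiNormDiffL2GetBufferHostSize_8u_C1R(sz, &bufSize);

    Npp8u* buf = 0;
    cudaMalloc((void**)&buf, bufSize);

    Npp64f* dResult = 0;
    cudaMalloc((void**)&dResult, sizeof(Npp64f));

    nppiNormDiff_L2_8u_C1R(src1, step1, src2, step2, sz, dResult, buf);

    Npp64f result = 0;
    cudaMemcpy(&result, dResult, sizeof(Npp64f), cudaMemcpyDeviceToHost);

    cudaFree(dResult);
    cudaFree(buf);
    return result;
}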

@ -116,11 +116,13 @@
#define CUDART_MINIMUM_REQUIRED_VERSION 4010
#define NPP_MINIMUM_REQUIRED_VERSION 4100
#define NPP_VERSION (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD)
#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
#error "Insufficient Cuda Runtime library version, please update it."
#endif
#if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
#if (NPP_VERSION < NPP_MINIMUM_REQUIRED_VERSION)
#error "Insufficient NPP version, please update it."
#endif
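
The two constants used in the version guards come from different encodings: CUDART_VERSION (and CUDA_VERSION) pack 1000 * major + 10 * minor, so the CUDA 5.5 toolkit reports 5050, while the NPP_VERSION macro defined above packs 1000 * major + 100 * minor + build, so NPP 5.5.0 evaluates to 5500. That is why the runtime code in this patch checks CUDA_VERSION < 5050 while the NPP-specific code checks NPP_VERSION < 5500. A tiny standalone check (the printed values assume the CUDA 5.5 toolkit headers):

#include <cuda_runtime_api.h>   // CUDART_VERSION
#include <npp.h>                // NPP_VERSION_MAJOR / _MINOR / _BUILD
#include <cstdio>

#ifndef NPP_VERSION
#  define NPP_VERSION (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD)
#endif

int main()
{
    // Expected output with the CUDA 5.5 toolkit: 5050 and 5500.
    std::printf("CUDART_VERSION = %d\n", CUDART_VERSION);
    std::printf("NPP_VERSION    = %d\n", NPP_VERSION);
    return 0;
}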