Merge branch 'master' of code.opencv.org:opencv

2025-06-07 09:25:45 +08:00 · 2013-07-04 16:19:24 +02:00 · 2013-07-04 16:19:24 +02:00 · 188f889949
commit 188f889949
parent c39159069e 70deda354a
205 changed files with 11735 additions and 10382 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -103,6 +103,19 @@ if(UNIX AND NOT ANDROID)
  endif()
 endif()

+# Add these standard paths to the search paths for FIND_PATH
+# to find include files from these locations first
+if(MINGW)
+  # Probe the conventional MinGW root directories in a fixed order and
+  # append every one that exists to CMAKE_INCLUDE_PATH; roots that are
+  # missing on this machine are skipped.
+  foreach(__ocv_mingw_root /mingw /mingw32 /mingw64)
+    if(EXISTS ${__ocv_mingw_root})
+      list(APPEND CMAKE_INCLUDE_PATH ${__ocv_mingw_root})
+    endif()
+  endforeach()
+  unset(__ocv_mingw_root)
+endif()

 # ----------------------------------------------------------------------------
 # OpenCV cmake options
@ -110,7 +123,7 @@ endif()

 # Optional 3rd party components
 # ===================================================
-OCV_OPTION(WITH_1394           "Include IEEE1394 support"                    ON   IF (UNIX AND NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_1394           "Include IEEE1394 support"                    ON   IF (NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_AVFOUNDATION   "Use AVFoundation for Video I/O"              ON   IF IOS)
 OCV_OPTION(WITH_CARBON         "Use Carbon for UI instead of Cocoa"          OFF  IF APPLE )
 OCV_OPTION(WITH_CUDA           "Include NVidia Cuda Runtime support"         ON   IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
--- a/android/android.toolchain.cmake
+++ b/android/android.toolchain.cmake
@ -1,6 +1,7 @@
 message(STATUS "Android toolchain was moved to platforms/android!") # notice for users of this legacy copy
 message(STATUS "This file is deprecated and will be removed!")

+# Copyright (c) 2010-2011, Ethan Rublee
 # Copyright (c) 2011-2013, Andrey Kamaev
 # All rights reserved.
 #
@ -291,6 +292,9 @@ message(STATUS "This file is depricated and will be removed!")
 #   - March 2013
 #     [+] updated for NDK r8e (x86 version)
 #     [+] support x86_64 version of NDK
+#   - April 2013
+#     [+] support non-release NDK layouts (from Linaro git and Android git)
+#     [~] automatically detect if explicit link to crtbegin_*.o is needed
 # ------------------------------------------------------------------------------

 cmake_minimum_required( VERSION 2.6.3 )
@ -518,24 +522,19 @@ if( NOT ANDROID_NDK )
  endif( ANDROID_NDK )
 endif( NOT ANDROID_STANDALONE_TOOLCHAIN )
 endif( NOT ANDROID_NDK )
+
 # remember found paths
 if( ANDROID_NDK )
 get_filename_component( ANDROID_NDK "${ANDROID_NDK}" ABSOLUTE )
- # try to detect change
- if( CMAKE_AR )
-  string( LENGTH "${ANDROID_NDK}" __length )
-  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath )
-  if( NOT __androidNdkPreviousPath STREQUAL ANDROID_NDK )
-   message( FATAL_ERROR "It is not possible to change the path to the NDK on subsequent CMake run. You must remove all generated files from your build folder first.
-   " )
-  endif()
-  unset( __androidNdkPreviousPath )
-  unset( __length )
- endif()
 set( ANDROID_NDK "${ANDROID_NDK}" CACHE INTERNAL "Path of the Android NDK" FORCE )
 set( BUILD_WITH_ANDROID_NDK True )
- file( STRINGS "${ANDROID_NDK}/RELEASE.TXT" ANDROID_NDK_RELEASE_FULL LIMIT_COUNT 1 REGEX r[0-9]+[a-z]? )
- string( REGEX MATCH r[0-9]+[a-z]? ANDROID_NDK_RELEASE "${ANDROID_NDK_RELEASE_FULL}" )
+ if( EXISTS "${ANDROID_NDK}/RELEASE.TXT" )
+  file( STRINGS "${ANDROID_NDK}/RELEASE.TXT" ANDROID_NDK_RELEASE_FULL LIMIT_COUNT 1 REGEX r[0-9]+[a-z]? )
+  string( REGEX MATCH r[0-9]+[a-z]? ANDROID_NDK_RELEASE "${ANDROID_NDK_RELEASE_FULL}" )
+ else()
+  set( ANDROID_NDK_RELEASE "r1x" )
+  set( ANDROID_NDK_RELEASE_FULL "unreleased" )
+ endif()
 elseif( ANDROID_STANDALONE_TOOLCHAIN )
 get_filename_component( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" ABSOLUTE )
 # try to detect change
@ -562,6 +561,51 @@ else()
      sudo ln -s ~/my-android-toolchain ${ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH}" )
 endif()

+# android NDK layout: pick RELEASE / LINARO / ANDROID and derive the toolchain paths from it
+if( BUILD_WITH_ANDROID_NDK )
+ if( NOT DEFINED ANDROID_NDK_LAYOUT ) # keep any value that is already defined
+  # try to automatically detect the layout from marker paths (stays undefined if none of them exists)
+  if( EXISTS "${ANDROID_NDK}/RELEASE.TXT")
+   set( ANDROID_NDK_LAYOUT "RELEASE" )
+  elseif( EXISTS "${ANDROID_NDK}/../../linux-x86/toolchain/" )
+   set( ANDROID_NDK_LAYOUT "LINARO" )
+  elseif( EXISTS "${ANDROID_NDK}/../../gcc/" )
+   set( ANDROID_NDK_LAYOUT "ANDROID" )
+  endif()
+ endif()
+ set( ANDROID_NDK_LAYOUT "${ANDROID_NDK_LAYOUT}" CACHE STRING "The inner layout of NDK" ) # expose the layout as a cache entry
+ mark_as_advanced( ANDROID_NDK_LAYOUT )
+ if( ANDROID_NDK_LAYOUT STREQUAL "LINARO" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} ) # only 32-bit at the moment
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../${ANDROID_NDK_HOST_SYSTEM_NAME}/toolchain" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "" ) # no per-toolchain sub-path in this layout
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "" )
+ elseif( ANDROID_NDK_LAYOUT STREQUAL "ANDROID" )
+  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} ) # only 32-bit at the moment
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/../../gcc/${ANDROID_NDK_HOST_SYSTEM_NAME}/arm" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "" ) # no per-toolchain sub-path in this layout
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "" )
+ else() # "RELEASE", and also the fallback for an empty or unrecognized layout value
+  set( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK}/toolchains" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH  "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+  set( ANDROID_NDK_TOOLCHAINS_SUBPATH2 "/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME2}" )
+ endif()
+ get_filename_component( ANDROID_NDK_TOOLCHAINS_PATH "${ANDROID_NDK_TOOLCHAINS_PATH}" ABSOLUTE ) # store the path in absolute form
+
+ # try to detect change of NDK: a CMAKE_AR left from a previous run must start with the toolchains path
+ if( CMAKE_AR )
+  string( LENGTH "${ANDROID_NDK_TOOLCHAINS_PATH}" __length )
+  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath ) # leading part of CMAKE_AR, same length as the toolchains path
+  if( NOT __androidNdkPreviousPath STREQUAL ANDROID_NDK_TOOLCHAINS_PATH )
+   message( FATAL_ERROR "It is not possible to change the path to the NDK on subsequent CMake run. You must remove all generated files from your build folder first.
+   " )
+  endif()
+  unset( __androidNdkPreviousPath )
+  unset( __length )
+ endif()
+endif()
+
+
 # get all the details about standalone toolchain
 if( BUILD_WITH_STANDALONE_TOOLCHAIN )
 __DETECT_NATIVE_API_LEVEL( ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" )
@ -589,17 +633,23 @@ if( BUILD_WITH_STANDALONE_TOOLCHAIN )
 endif()
 endif()

-macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar __availableToolchainsLst __host_system_name )
+macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar __availableToolchainsLst __toolchain_subpath )
 foreach( __toolchain ${${__availableToolchainsLst}} )
-  if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK}/toolchains/${__toolchain}/prebuilt/" )
+  if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${__toolchain}${__toolchain_subpath}" )
   string( REGEX REPLACE "-clang3[.][0-9]$" "-4.6" __gcc_toolchain "${__toolchain}" )
  else()
   set( __gcc_toolchain "${__toolchain}" )
  endif()
-  __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK}/toolchains/${__gcc_toolchain}/prebuilt/${__host_system_name}" )
+  __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK_TOOLCHAINS_PATH}/${__gcc_toolchain}${__toolchain_subpath}" )
  if( __machine )
-   string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9]+)?$" __version "${__gcc_toolchain}" )
-   string( REGEX MATCH "^[^-]+" __arch "${__gcc_toolchain}" )
+   string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9x]+)?$" __version "${__gcc_toolchain}" )
+   if( __machine MATCHES i686 )
+    set( __arch "x86" )
+   elseif( __machine MATCHES arm )
+    set( __arch "arm" )
+   elseif( __machine MATCHES mipsel )
+    set( __arch "mipsel" )
+   endif()
   list( APPEND __availableToolchainMachines "${__machine}" )
   list( APPEND __availableToolchainArchs "${__arch}" )
   list( APPEND __availableToolchainCompilerVersions "${__version}" )
@ -617,29 +667,29 @@ if( BUILD_WITH_ANDROID_NDK )
 set( __availableToolchainMachines "" )
 set( __availableToolchainArchs "" )
 set( __availableToolchainCompilerVersions "" )
- if( ANDROID_TOOLCHAIN_NAME AND EXISTS "${ANDROID_NDK}/toolchains/${ANDROID_TOOLCHAIN_NAME}/" )
+ if( ANDROID_TOOLCHAIN_NAME AND EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_TOOLCHAIN_NAME}/" )
  # do not go through all toolchains if we know the name
  set( __availableToolchainsLst "${ANDROID_TOOLCHAIN_NAME}" )
-  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME} )
-  if( NOT __availableToolchains AND NOT ANDROID_NDK_HOST_SYSTEM_NAME STREQUAL ANDROID_NDK_HOST_SYSTEM_NAME2 )
-   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+  if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 )
+   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
   if( __availableToolchains )
-    set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+    set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} )
   endif()
  endif()
 endif()
 if( NOT __availableToolchains ) # no usable toolchain found so far: scan the whole toolchains directory
-  file( GLOB __availableToolchainsLst RELATIVE "${ANDROID_NDK}/toolchains" "${ANDROID_NDK}/toolchains/*" )
+  file( GLOB __availableToolchainsLst RELATIVE "${ANDROID_NDK_TOOLCHAINS_PATH}" "${ANDROID_NDK_TOOLCHAINS_PATH}/*" )
  if( __availableToolchainsLst ) # sort the glob result; __availableToolchains is still empty in this branch
   list(SORT __availableToolchainsLst) # we need clang to go after gcc
  endif()
  __LIST_FILTER( __availableToolchainsLst "^[.]" )
  __LIST_FILTER( __availableToolchainsLst "llvm" )
-  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME} )
-  if( NOT __availableToolchains AND NOT ANDROID_NDK_HOST_SYSTEM_NAME STREQUAL ANDROID_NDK_HOST_SYSTEM_NAME2 )
-   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+  __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
+  if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 )
+   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
   if( __availableToolchains )
-    set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
+    set( ANDROID_NDK_TOOLCHAINS_SUBPATH ${ANDROID_NDK_TOOLCHAINS_SUBPATH2} )
   endif()
  endif()
 endif()
@ -770,6 +820,7 @@ else()
  list( GET __availableToolchainArchs ${__idx} __toolchainArch )
  if( __toolchainArch STREQUAL ANDROID_ARCH_FULLNAME )
   list( GET __availableToolchainCompilerVersions ${__idx} __toolchainVersion )
+   string( REPLACE "x" "99" __toolchainVersion "${__toolchainVersion}")
   if( __toolchainVersion VERSION_GREATER __toolchainMaxVersion )
    set( __toolchainMaxVersion "${__toolchainVersion}" )
    set( __toolchainIdx ${__idx} )
@ -973,11 +1024,11 @@ if( "${ANDROID_TOOLCHAIN_NAME}" STREQUAL "standalone-clang" )
 elseif( "${ANDROID_TOOLCHAIN_NAME}" MATCHES "-clang3[.][0-9]?$" )
 string( REGEX MATCH "3[.][0-9]$" ANDROID_CLANG_VERSION "${ANDROID_TOOLCHAIN_NAME}")
 string( REGEX REPLACE "-clang${ANDROID_CLANG_VERSION}$" "-4.6" ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" )
- if( NOT EXISTS "${ANDROID_NDK}/toolchains/llvm-${ANDROID_CLANG_VERSION}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}/bin/clang${TOOL_OS_SUFFIX}" )
+ if( NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}/bin/clang${TOOL_OS_SUFFIX}" )
  message( FATAL_ERROR "Could not find the Clang compiler driver" )
 endif()
 set( ANDROID_COMPILER_IS_CLANG 1 )
- set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_NDK}/toolchains/llvm-${ANDROID_CLANG_VERSION}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+ set( ANDROID_CLANG_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/llvm-${ANDROID_CLANG_VERSION}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
 else()
 set( ANDROID_GCC_TOOLCHAIN_NAME "${ANDROID_TOOLCHAIN_NAME}" )
 unset( ANDROID_COMPILER_IS_CLANG CACHE )
@ -991,7 +1042,7 @@ endif()

 # setup paths and STL for NDK
 if( BUILD_WITH_ANDROID_NDK )
- set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
+ set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
 set( ANDROID_SYSROOT "${ANDROID_NDK}/platforms/android-${ANDROID_NATIVE_API_LEVEL}/arch-${ANDROID_ARCH_NAME}" )

 if( ANDROID_STL STREQUAL "none" )
@ -1050,11 +1101,11 @@ if( BUILD_WITH_ANDROID_NDK )
 endif()
 # find libsupc++.a - rtti & exceptions
 if( ANDROID_STL STREQUAL "system_re" OR ANDROID_STL MATCHES "gnustl" )
-  if( ANDROID_NDK_RELEASE STRGREATER "r8" ) # r8b
-   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
-  elseif( NOT ANDROID_NDK_RELEASE STRLESS "r7" AND ANDROID_NDK_RELEASE STRLESS "r8b")
-   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" )
-  else( ANDROID_NDK_RELEASE STRLESS "r7" )
+  set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r8b or newer
+  if( NOT EXISTS "${__libsupcxx}" )
+   set( __libsupcxx "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}/libsupc++.a" ) # r7-r8
+  endif()
+  if( NOT EXISTS "${__libsupcxx}" ) # before r7
   if( ARMEABI_V7A )
    if( ANDROID_FORCE_ARM_BUILD )
     set( __libsupcxx "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/libsupc++.a" )
@ -1104,7 +1155,7 @@ unset( _ndk_ccache )

 # setup the cross-compiler
 if( NOT CMAKE_C_COMPILER )
- if( NDK_CCACHE )
+ if( NDK_CCACHE AND NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
  set( CMAKE_C_COMPILER   "${NDK_CCACHE}" CACHE PATH "ccache as C compiler" )
  set( CMAKE_CXX_COMPILER "${NDK_CCACHE}" CACHE PATH "ccache as C++ compiler" )
  if( ANDROID_COMPILER_IS_CLANG )
@ -1176,11 +1227,25 @@ set( CMAKE_ASM_SOURCE_FILE_EXTENSIONS s S asm )
 remove_definitions( -DANDROID )
 add_definitions( -DANDROID )

-if(ANDROID_SYSROOT MATCHES "[ ;\"]")
- set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
+if( ANDROID_SYSROOT MATCHES "[ ;\"]" )
+ if( CMAKE_HOST_WIN32 )
+  # try to convert path to 8.3 form
+  file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "@echo %~s1" )
+  execute_process( COMMAND "$ENV{ComSpec}" /c "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/cvt83.cmd" "${ANDROID_SYSROOT}"
+                   OUTPUT_VARIABLE __path OUTPUT_STRIP_TRAILING_WHITESPACE
+                   RESULT_VARIABLE __result ERROR_QUIET )
+  if( __result EQUAL 0 )
+   file( TO_CMAKE_PATH "${__path}" ANDROID_SYSROOT )
+   set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
+  else()
+   set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
+  endif()
+ else()
+  set( ANDROID_CXX_FLAGS "'--sysroot=${ANDROID_SYSROOT}'" )
+ endif()
 if( NOT _CMAKE_IN_TRY_COMPILE )
-  # quotes will break try_compile and compiler identification
-  message(WARNING "Your Android system root has non-alphanumeric symbols. It can break compiler features detection and the whole build.")
+  # quotes can break try_compile and compiler identification
+  message(WARNING "Path to your Android NDK (or toolchain) has non-alphanumeric symbols.\nThe build might be broken.\n")
 endif()
 else()
 set( ANDROID_CXX_FLAGS "--sysroot=${ANDROID_SYSROOT}" )
@ -1251,22 +1316,18 @@ elseif( ARMEABI )
 set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv5te -mtune=xscale -msoft-float" )
 endif()

+if( ANDROID_STL MATCHES "gnustl" AND (EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}") )
+ set( __android_link_driver "<CMAKE_C_COMPILER>" ) # gnustl with an STL or supc++ file on disk: rules start with the C compiler placeholder
+else()
+ set( __android_link_driver "<CMAKE_CXX_COMPILER>" ) # every other case: rules start with the C++ compiler placeholder
+endif()
+set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${__android_link_driver} <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+set( CMAKE_CXX_CREATE_SHARED_MODULE  "${__android_link_driver} <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
+set( CMAKE_CXX_LINK_EXECUTABLE       "${__android_link_driver} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
+unset( __android_link_driver )
+
 # STL
 if( EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}" )
- if( ANDROID_STL MATCHES "gnustl" )
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_C_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_C_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
- else()
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "<CMAKE_CXX_COMPILER> <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>" )
- endif()
- if ( X86 AND ANDROID_STL MATCHES "gnustl" AND ANDROID_NDK_RELEASE STREQUAL "r6" )
-  # workaround "undefined reference to `__dso_handle'" problem
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
- endif()
 if( EXISTS "${__libstl}" )
  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${__libstl}\"" )
  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${__libstl}\"" )
@ -1285,9 +1346,12 @@ if( EXISTS "${__libstl}" OR EXISTS "${__libsupcxx}" )
  set( CMAKE_C_LINK_EXECUTABLE       "${CMAKE_C_LINK_EXECUTABLE} \"${__libsupcxx}\"" )
 endif()
 if( ANDROID_STL MATCHES "gnustl" )
-  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} -lm" )
-  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} -lm" )
-  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} -lm" )
+  if( NOT EXISTS "${ANDROID_LIBM_PATH}" )
+   set( ANDROID_LIBM_PATH -lm )
+  endif()
+  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} ${ANDROID_LIBM_PATH}" )
+  set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} ${ANDROID_LIBM_PATH}" )
+  set( CMAKE_CXX_LINK_EXECUTABLE       "${CMAKE_CXX_LINK_EXECUTABLE} ${ANDROID_LIBM_PATH}" )
 endif()
 endif()

@ -1323,7 +1387,14 @@ if( ARMEABI_V7A )
 endif()

 if( ANDROID_NO_UNDEFINED )
- set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined" )
+ if( MIPS )
+  # there is some sysroot-related problem in mips linker...
+  if( NOT ANDROID_SYSROOT MATCHES "[ ;\"]" )
+   set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined -Wl,-rpath-link,${ANDROID_SYSROOT}/usr/lib" )
+  endif()
+ else()
+  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--no-undefined" )
+ endif()
 endif()

 if( ANDROID_SO_UNDEFINED )
@ -1403,9 +1474,9 @@ set( CMAKE_MODULE_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FL
 set( CMAKE_EXE_LINKER_FLAGS    "${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" )

 if( MIPS AND BUILD_WITH_ANDROID_NDK AND ANDROID_NDK_RELEASE STREQUAL "r8" )
- set( CMAKE_SHARED_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_SHARED_LINKER_FLAGS}" )
- set( CMAKE_MODULE_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_MODULE_LINKER_FLAGS}" )
- set( CMAKE_EXE_LINKER_FLAGS    "-Wl,-T,${ANDROID_NDK}/toolchains/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.x ${CMAKE_EXE_LINKER_FLAGS}" )
+ set( CMAKE_SHARED_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_SHARED_LINKER_FLAGS}" )
+ set( CMAKE_MODULE_LINKER_FLAGS "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.xsc ${CMAKE_MODULE_LINKER_FLAGS}" )
+ set( CMAKE_EXE_LINKER_FLAGS    "-Wl,-T,${ANDROID_NDK_TOOLCHAINS_PATH}/${ANDROID_GCC_TOOLCHAIN_NAME}/mipself.x ${CMAKE_EXE_LINKER_FLAGS}" )
 endif()

 # configure rtti
@ -1432,6 +1503,43 @@ endif()
 include_directories( SYSTEM "${ANDROID_SYSROOT}/usr/include" ${ANDROID_STL_INCLUDE_DIRS} )
 link_directories( "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" )

+# detect whether crtbegin_so.o has to be linked explicitly, by trial-linking it as the only object of a shared library
+if( NOT DEFINED ANDROID_EXPLICIT_CRT_LINK ) # skip the probe when a value is already defined
+ set( __cmd "${CMAKE_CXX_CREATE_SHARED_LIBRARY}" ) # start from the shared-library link rule template
+ string( REPLACE "<CMAKE_CXX_COMPILER>" "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1}" __cmd "${__cmd}" ) # substitute each <PLACEHOLDER> of the rule with a concrete value
+ string( REPLACE "<CMAKE_C_COMPILER>"   "${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ARG1}"   __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_CXX_FLAGS>" "${CMAKE_CXX_FLAGS}" __cmd "${__cmd}" )
+ string( REPLACE "<LANGUAGE_COMPILE_FLAGS>" "" __cmd "${__cmd}" )
+ string( REPLACE "<LINK_FLAGS>" "${CMAKE_SHARED_LINKER_FLAGS}" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS>" "-shared" __cmd "${__cmd}" )
+ string( REPLACE "<CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG>" "" __cmd "${__cmd}" )
+ string( REPLACE "<TARGET_SONAME>" "" __cmd "${__cmd}" )
+ string( REPLACE "<TARGET>" "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/toolchain_crtlink_test.so" __cmd "${__cmd}" ) # throw-away output file inside the build tree
+ string( REPLACE "<OBJECTS>" "\"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" __cmd "${__cmd}" ) # the only input object is crtbegin_so.o itself
+ string( REPLACE "<LINK_LIBRARIES>" "" __cmd "${__cmd}" )
+ separate_arguments( __cmd ) # turn the command line into a list by splitting on spaces
+ foreach( __var ANDROID_NDK ANDROID_NDK_TOOLCHAINS_PATH ANDROID_STANDALONE_TOOLCHAIN ) # undo the split inside these known paths when they contain spaces
+  if( ${__var} )
+   set( __tmp "${${__var}}" )
+   separate_arguments( __tmp ) # the path as it now appears in __cmd after splitting
+   string( REPLACE "${__tmp}" "${${__var}}" __cmd "${__cmd}")
+  endif()
+ endforeach()
+ string( REPLACE "'" "" __cmd "${__cmd}" ) # strip quote characters; the list items are passed as separate arguments below
+ string( REPLACE "\"" "" __cmd "${__cmd}" )
+ execute_process( COMMAND ${__cmd} RESULT_VARIABLE __cmd_result OUTPUT_QUIET ERROR_QUIET ) # run the trial link; only the exit status is used
+ if( __cmd_result EQUAL 0 )
+  set( ANDROID_EXPLICIT_CRT_LINK ON )
+ else()
+  set( ANDROID_EXPLICIT_CRT_LINK OFF )
+ endif()
+endif()
+
+if( ANDROID_EXPLICIT_CRT_LINK ) # append crtbegin_so.o to the shared library and module link rules
+ set( CMAKE_CXX_CREATE_SHARED_LIBRARY "${CMAKE_CXX_CREATE_SHARED_LIBRARY} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
+ set( CMAKE_CXX_CREATE_SHARED_MODULE  "${CMAKE_CXX_CREATE_SHARED_MODULE} \"${ANDROID_SYSROOT}/usr/lib/crtbegin_so.o\"" )
+endif()
+
 # setup output directories
 set( LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_SOURCE_DIR} CACHE PATH "root for library output, set this to change where android libs are installed to" )
 set( CMAKE_INSTALL_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/user" CACHE STRING "path for installing" )
@ -1523,6 +1631,7 @@ if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
 foreach( __var NDK_CCACHE  LIBRARY_OUTPUT_PATH_ROOT  ANDROID_FORBID_SYGWIN  ANDROID_SET_OBSOLETE_VARIABLES
                ANDROID_NDK_HOST_X64
                ANDROID_NDK
+                ANDROID_NDK_LAYOUT
                ANDROID_STANDALONE_TOOLCHAIN
                ANDROID_TOOLCHAIN_NAME
                ANDROID_ABI
@ -1536,6 +1645,8 @@ if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
                ANDROID_GOLD_LINKER
                ANDROID_NOEXECSTACK
                ANDROID_RELRO
+                ANDROID_LIBM_PATH
+                ANDROID_EXPLICIT_CRT_LINK
                )
  if( DEFINED ${__var} )
   if( "${__var}" MATCHES " ")
@ -1579,6 +1690,7 @@ endif()
 #   ANDROID_STANDALONE_TOOLCHAIN
 #   ANDROID_TOOLCHAIN_NAME : the NDK name of compiler toolchain
 #   ANDROID_NDK_HOST_X64 : try to use x86_64 toolchain (default for x64 host systems)
+#   ANDROID_NDK_LAYOUT : the inner NDK structure (RELEASE, LINARO, ANDROID)
 #   LIBRARY_OUTPUT_PATH_ROOT : <any valid path>
 #   NDK_CCACHE : <path to your ccache executable>
 # Obsolete:
@ -1624,6 +1736,7 @@ endif()
 #   ANDROID_EXCEPTIONS : if exceptions are enabled by the runtime
 #   ANDROID_GCC_TOOLCHAIN_NAME : read-only, differs from ANDROID_TOOLCHAIN_NAME only if clang is used
 #   ANDROID_CLANG_VERSION : version of clang compiler if clang is used
+#   ANDROID_LIBM_PATH : path to libm.so (set to something like $(TOP)/out/target/product/<product_name>/obj/lib/libm.so) to workaround unresolved `sincos`
 #
 # Defaults:
 #   ANDROID_DEFAULT_NDK_API_LEVEL
--- a/cmake/OpenCVDetectOpenCL.cmake
+++ b/cmake/OpenCVDetectOpenCL.cmake
@ -44,12 +44,18 @@ if(OPENCL_FOUND)
  set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
  set(OPENCL_LIBRARIES    ${OPENCL_LIBRARY})

-  if (X86_64)
+  if(WIN32 AND X86_64)
    set(CLAMD_POSSIBLE_LIB_SUFFIXES lib64/import)
-  elseif (X86)
+  elseif(WIN32)
    set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import)
  endif()

+  if(X86_64 AND UNIX)
+    set(CLAMD_POSSIBLE_LIB_SUFFIXES lib64)
+  elseif(X86 AND UNIX)
+    set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32)
+  endif()
+
  if(WITH_OPENCLAMDFFT)
    find_path(CLAMDFFT_ROOT_DIR
              NAMES include/clAmdFft.h
@ -80,7 +86,7 @@ if(OPENCL_FOUND)
  if(WITH_OPENCLAMDBLAS)
    find_path(CLAMDBLAS_ROOT_DIR  # clAmdBlas root; searched only under the listed PATHS (NO_DEFAULT_PATH)
              NAMES include/clAmdBlas.h
-              PATHS ENV CLAMDFFT_PATH ENV ProgramFiles
+              PATHS ENV CLAMDBLAS_PATH ENV ProgramFiles
              PATH_SUFFIXES clAmdBlas AMD/clAmdBlas
              DOC "AMD BLAS root directory"
              NO_DEFAULT_PATH)
--- a/cmake/OpenCVDetectPython.cmake
+++ b/cmake/OpenCVDetectPython.cmake
@ -49,7 +49,7 @@ if(PYTHON_EXECUTABLE)

  if(NOT ANDROID AND NOT IOS)
    if(CMAKE_HOST_UNIX)
-      execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils.sysconfig import *; print get_python_lib()"
+      execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "from distutils.sysconfig import *; print(get_python_lib())"
                      RESULT_VARIABLE PYTHON_CVPY_PROCESS
                      OUTPUT_VARIABLE PYTHON_STD_PACKAGES_PATH
                      OUTPUT_STRIP_TRAILING_WHITESPACE)
@ -80,7 +80,7 @@ if(PYTHON_EXECUTABLE)

    if(NOT PYTHON_NUMPY_INCLUDE_DIR)
      # Attempt to discover the NumPy include directory. If this succeeds, then build python API with NumPy
-      execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import os; os.environ['DISTUTILS_USE_SDK']='1'; import numpy.distutils; print numpy.distutils.misc_util.get_numpy_include_dirs()[0]"
+      execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import os; os.environ['DISTUTILS_USE_SDK']='1'; import numpy.distutils; print(numpy.distutils.misc_util.get_numpy_include_dirs()[0])"
                      RESULT_VARIABLE PYTHON_NUMPY_PROCESS
                      OUTPUT_VARIABLE PYTHON_NUMPY_INCLUDE_DIR
                      OUTPUT_STRIP_TRAILING_WHITESPACE)
@ -92,7 +92,7 @@ if(PYTHON_EXECUTABLE)
    endif()

    if(PYTHON_NUMPY_INCLUDE_DIR)
-      execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import numpy; print numpy.version.version"
+      execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import numpy; print(numpy.version.version)"
                        RESULT_VARIABLE PYTHON_NUMPY_PROCESS
                        OUTPUT_VARIABLE PYTHON_NUMPY_VERSION
                        OUTPUT_STRIP_TRAILING_WHITESPACE)
--- a/cmake/OpenCVFindLibsVideo.cmake
+++ b/cmake/OpenCVFindLibsVideo.cmake
@ -110,10 +110,33 @@ endif(WITH_GIGEAPI)
 # --- Dc1394 ---
 ocv_clear_vars(HAVE_DC1394 HAVE_DC1394_2)
 if(WITH_1394)
-  CHECK_MODULE(libdc1394-2 HAVE_DC1394_2)
-  if(NOT HAVE_DC1394_2)
-    CHECK_MODULE(libdc1394 HAVE_DC1394)
-  endif()
+  if(WIN32 AND MINGW)
+    # MinGW on Windows: look up the CMU 1394 and DC1394 2.x headers directly.
+    find_path(CMU1394_INCLUDE_PATH NAMES "/1394common.h"
+              PATH_SUFFIXES include
+              DOC "The path to cmu1394 headers")
+    find_path(DC1394_2_INCLUDE_PATH NAMES "/dc1394/dc1394.h"
+              PATH_SUFFIXES include
+              DOC "The path to DC1394 2.x headers")
+    if(CMU1394_INCLUDE_PATH AND DC1394_2_INCLUDE_PATH)
+      set(CMU1394_LIB_DIR "${CMU1394_INCLUDE_PATH}/../lib" CACHE PATH "Full path of CMU1394 library directory")
+      set(DC1394_2_LIB_DIR "${DC1394_2_INCLUDE_PATH}/../lib" CACHE PATH "Full path of DC1394 2.x library directory")
+      if(EXISTS "${CMU1394_LIB_DIR}/lib1394camera.a" AND EXISTS "${DC1394_2_LIB_DIR}/libdc1394.a")
+        set(HAVE_DC1394_2 TRUE)
+      endif()
+    endif()
+    if(HAVE_DC1394_2)
+      ocv_parse_pkg("libdc1394-2" "${DC1394_2_LIB_DIR}/pkgconfig" "")
+      ocv_include_directories(${DC1394_2_INCLUDE_PATH})
+      list(APPEND HIGHGUI_LIBRARIES "${DC1394_2_LIB_DIR}/libdc1394.a" "${CMU1394_LIB_DIR}/lib1394camera.a")
+    endif()
+  else()
+    # Any other platform: try the libdc1394-2 module, then the plain libdc1394 one.
+    CHECK_MODULE(libdc1394-2 HAVE_DC1394_2)
+    if(NOT HAVE_DC1394_2)
+      CHECK_MODULE(libdc1394 HAVE_DC1394)
+    endif()
+  endif()
 endif(WITH_1394)

 # --- xine ---
@ -226,7 +249,7 @@ endif(WITH_MSMF)

 # --- Extra HighGUI libs on Windows ---
 if(WIN32)
-  list(APPEND HIGHGUI_LIBRARIES comctl32 gdi32 ole32 vfw32)
+  list(APPEND HIGHGUI_LIBRARIES comctl32 gdi32 ole32 setupapi ws2_32 vfw32)
  if(MINGW64)
    list(APPEND HIGHGUI_LIBRARIES avifil32 avicap32 winmm msvfw32)
    list(REMOVE_ITEM HIGHGUI_LIBRARIES vfw32)
--- a/cmake/OpenCVFindXimea.cmake
+++ b/cmake/OpenCVFindXimea.cmake
@ -9,6 +9,7 @@
 #
 # Created: 5 Aug 2011 by Marian Zajko (marian.zajko@ximea.com)
 # Updated: 25 June 2012 by Igor Kuzmin (parafin@ximea.com)
+# Updated: 22 October 2012 by Marian Zajko (marian.zajko@ximea.com)
 #

 set(XIMEA_FOUND)
@ -18,11 +19,15 @@ set(XIMEA_LIBRARY_DIR)
 if(WIN32)
  # Try to find the XIMEA API path in registry.
  GET_FILENAME_COMPONENT(XIMEA_PATH "[HKEY_CURRENT_USER\\Software\\XIMEA\\CamSupport\\API;Path]" ABSOLUTE)
-
-  if(EXISTS XIMEA_PATH)
+ 
+  if(EXISTS ${XIMEA_PATH})
    set(XIMEA_FOUND 1)
    # set LIB folders
-    set(XIMEA_LIBRARY_DIR "${XIMEA_PATH}/x86")
+    if(CMAKE_CL_64)
+      set(XIMEA_LIBRARY_DIR "${XIMEA_PATH}/x64")
+    else()
+      set(XIMEA_LIBRARY_DIR "${XIMEA_PATH}/x86")
+    endif()
  else()
    set(XIMEA_FOUND 0)
  endif()
@ -38,5 +43,4 @@ endif()

 mark_as_advanced(FORCE XIMEA_FOUND)
 mark_as_advanced(FORCE XIMEA_PATH)
-mark_as_advanced(FORCE XIMEA_LIBRARY_DIR)
-
+mark_as_advanced(FORCE XIMEA_LIBRARY_DIR)
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@ -470,7 +470,8 @@ endmacro()
 #   ocv_create_module(<extra link dependencies>)
 #   ocv_create_module(SKIP_LINK)
 macro(ocv_create_module)
-  add_library(${the_module} ${OPENCV_MODULE_TYPE} ${OPENCV_MODULE_${the_module}_HEADERS} ${OPENCV_MODULE_${the_module}_SOURCES})
+  add_library(${the_module} ${OPENCV_MODULE_TYPE} ${OPENCV_MODULE_${the_module}_HEADERS} ${OPENCV_MODULE_${the_module}_SOURCES}
+    "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/cvconfig.h" "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/opencv2/opencv_modules.hpp")
  if(NOT the_module STREQUAL opencv_ts)
    set_target_properties(${the_module} PROPERTIES COMPILE_DEFINITIONS OPENCV_NOSTL)
  endif()
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@ -501,6 +501,13 @@ macro(ocv_parse_header2 LIBNAME HDR_PATH VARNAME)
  endif()
 endmacro()

+# Read the version string of a library from its pkg-config (.pc) file.
+#
+#   LIBNAME  - pkg-config module name; the file "${PKG_PATH}/${LIBNAME}.pc" is parsed
+#   PKG_PATH - directory that contains the .pc file
+#   SCOPE    - accepted for call compatibility; not used by this macro
+#
+# If the .pc file exists, ALIASOF_${LIBNAME}_VERSION is set to the first token that
+# follows "Version:" on the first matching line (empty if no such line exists).
+# If the file does not exist, no variable is touched.
+# Being a macro, it also leaves the helper variable line_to_parse set in the caller.
+macro(ocv_parse_pkg LIBNAME PKG_PATH SCOPE)
+  if(EXISTS "${PKG_PATH}/${LIBNAME}.pc")
+    file(STRINGS "${PKG_PATH}/${LIBNAME}.pc" line_to_parse REGEX "^Version:[ \t]+[0-9.]*.*$" LIMIT_COUNT 1)
+    # Accept the same separator (any run of spaces/tabs) that the filter above accepts;
+    # otherwise a tab or double space after "Version:" would leave the whole line in the result.
+    string(REGEX REPLACE ".*Version:[ \t]+([^ \t]+).*" "\\1" ALIASOF_${LIBNAME}_VERSION "${line_to_parse}")
+  endif()
+endmacro()

 ################################################################################################
 # short command to setup source group
--- a/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.rst
+++ b/doc/tutorials/features2d/feature_flann_matcher/feature_flann_matcher.rst
@ -85,7 +85,7 @@ This tutorial code's is shown lines below. You can also download it from `here <
     std::vector< DMatch > good_matches;

     for( int i = 0; i < descriptors_1.rows; i++ )
-     { if( matches[i].distance < 2*min_dist )
+     { if( matches[i].distance <= 2*min_dist )
       { good_matches.push_back( matches[i]); }
     }

@ -127,6 +127,3 @@ Result
   .. image:: images/Feature_FlannMatcher_Keypoints_Result.jpg
      :align: center
      :height: 250pt
-
-
-
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@ -278,8 +278,8 @@ CV_EXPORTS int recoverPose( InputArray E, InputArray points1, InputArray points2


 //! finds coordinates of epipolar lines corresponding the specified points
-CV_EXPORTS void computeCorrespondEpilines( InputArray points, int whichImage,
-                                           InputArray F, OutputArray lines );
+CV_EXPORTS_W void computeCorrespondEpilines( InputArray points, int whichImage,
+                                             InputArray F, OutputArray lines );

 CV_EXPORTS_W void triangulatePoints( InputArray projMatr1, InputArray projMatr2,
                                     InputArray projPoints1, InputArray projPoints2,
--- a/modules/core/doc/basic_structures.rst
+++ b/modules/core/doc/basic_structures.rst
@ -1741,7 +1741,7 @@ Returns the depth of a matrix element.

 .. ocv:function:: int Mat::depth() const

-The method returns the identifier of the matrix element depth (the type of each individual channel). For example, for a 16-bit signed 3-channel array, the method returns ``CV_16S`` . A complete list of matrix types contains the following values:
+The method returns the identifier of the matrix element depth (the type of each individual channel). For example, for a 16-bit signed element array, the method returns ``CV_16S`` . A complete list of matrix types contains the following values:

 * ``CV_8U``     - 8-bit unsigned integers ( ``0..255``     )

--- a/modules/core/include/opencv2/core/cuda/limits.hpp
+++ b/modules/core/include/opencv2/core/cuda/limits.hpp
@ -43,193 +43,80 @@
 #ifndef __OPENCV_GPU_LIMITS_GPU_HPP__
 #define __OPENCV_GPU_LIMITS_GPU_HPP__

-#include <limits>
+#include <limits.h>
+#include <float.h>
 #include "common.hpp"

 namespace cv { namespace gpu { namespace cudev
 {
-    template<class T> struct numeric_limits
-    {
-        typedef T type;
-        __device__ __forceinline__ static type min()  { return type(); };
-        __device__ __forceinline__ static type max() { return type(); };
-        __device__ __forceinline__ static type epsilon() { return type(); }
-        __device__ __forceinline__ static type round_error() { return type(); }
-        __device__ __forceinline__ static type denorm_min()  { return type(); }
-        __device__ __forceinline__ static type infinity() { return type(); }
-        __device__ __forceinline__ static type quiet_NaN() { return type(); }
-        __device__ __forceinline__ static type signaling_NaN() { return T(); }
-        static const bool is_signed;
-    };

-    template<> struct numeric_limits<bool>
-    {
-        typedef bool type;
-        __device__ __forceinline__ static type min() { return false; };
-        __device__ __forceinline__ static type max() { return true;  };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
+template <class T> struct numeric_limits;

-    template<> struct numeric_limits<char>
-    {
-        typedef char type;
-        __device__ __forceinline__ static type min() { return CHAR_MIN; };
-        __device__ __forceinline__ static type max() { return CHAR_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = (char)-1 == -1;
-    };
+template <> struct numeric_limits<bool>
+{
+    __device__ __forceinline__ static bool min() { return false; }
+    __device__ __forceinline__ static bool max() { return true;  }
+    static const bool is_signed = false;
+};

-    template<> struct numeric_limits<signed char>
-    {
-        typedef char type;
-        __device__ __forceinline__ static type min() { return SCHAR_MIN; };
-        __device__ __forceinline__ static type max() { return SCHAR_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = (signed char)-1 == -1;
-    };
+template <> struct numeric_limits<signed char>
+{
+    __device__ __forceinline__ static signed char min() { return SCHAR_MIN; }
+    __device__ __forceinline__ static signed char max() { return SCHAR_MAX; }
+    static const bool is_signed = true;
+};

-    template<> struct numeric_limits<unsigned char>
-    {
-        typedef unsigned char type;
-        __device__ __forceinline__ static type min() { return 0; };
-        __device__ __forceinline__ static type max() { return UCHAR_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
+template <> struct numeric_limits<unsigned char>
+{
+    __device__ __forceinline__ static unsigned char min() { return 0; }
+    __device__ __forceinline__ static unsigned char max() { return UCHAR_MAX; }
+    static const bool is_signed = false;
+};

-    template<> struct numeric_limits<short>
-    {
-        typedef short type;
-        __device__ __forceinline__ static type min() { return SHRT_MIN; };
-        __device__ __forceinline__ static type max() { return SHRT_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
+template <> struct numeric_limits<short>
+{
+    __device__ __forceinline__ static short min() { return SHRT_MIN; }
+    __device__ __forceinline__ static short max() { return SHRT_MAX; }
+    static const bool is_signed = true;
+};

-    template<> struct numeric_limits<unsigned short>
-    {
-        typedef unsigned short type;
-        __device__ __forceinline__ static type min() { return 0; };
-        __device__ __forceinline__ static type max() { return USHRT_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
+template <> struct numeric_limits<unsigned short>
+{
+    __device__ __forceinline__ static unsigned short min() { return 0; }
+    __device__ __forceinline__ static unsigned short max() { return USHRT_MAX; }
+    static const bool is_signed = false;
+};

-    template<> struct numeric_limits<int>
-    {
-        typedef int type;
-        __device__ __forceinline__ static type min() { return INT_MIN; };
-        __device__ __forceinline__ static type max() { return INT_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
+template <> struct numeric_limits<int>
+{
+    __device__ __forceinline__ static int min() { return INT_MIN; }
+    __device__ __forceinline__ static int max() { return INT_MAX; }
+    static const bool is_signed = true;
+};

+template <> struct numeric_limits<unsigned int>
+{
+    __device__ __forceinline__ static unsigned int min() { return 0; }
+    __device__ __forceinline__ static unsigned int max() { return UINT_MAX; }
+    static const bool is_signed = false;
+};

-    template<> struct numeric_limits<unsigned int>
-    {
-        typedef unsigned int type;
-        __device__ __forceinline__ static type min() { return 0; };
-        __device__ __forceinline__ static type max() { return UINT_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
+// Device-side limits for float, taken from the <float.h> macros.
+// NOTE: min() returns FLT_MIN, i.e. the smallest positive normalized value,
+// not the most negative representable value.
+template <> struct numeric_limits<float>
+{
+    __device__ __forceinline__ static float min() { return FLT_MIN; }
+    __device__ __forceinline__ static float max() { return FLT_MAX; }
+    __device__ __forceinline__ static float epsilon() { return FLT_EPSILON; }
+    static const bool is_signed = true;
+};

-    template<> struct numeric_limits<long>
-    {
-        typedef long type;
-        __device__ __forceinline__ static type min() { return LONG_MIN; };
-        __device__ __forceinline__ static type max() { return LONG_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
+// Device-side limits for double, taken from the <float.h> macros.
+// NOTE: min() returns DBL_MIN, i.e. the smallest positive normalized value,
+// not the most negative representable value.
+template <> struct numeric_limits<double>
+{
+    __device__ __forceinline__ static double min() { return DBL_MIN; }
+    __device__ __forceinline__ static double max() { return DBL_MAX; }
+    __device__ __forceinline__ static double epsilon() { return DBL_EPSILON; }
+    static const bool is_signed = true;
+};

-    template<> struct numeric_limits<unsigned long>
-    {
-        typedef unsigned long type;
-        __device__ __forceinline__ static type min() { return 0; };
-        __device__ __forceinline__ static type max() { return ULONG_MAX; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = false;
-    };
-
-    template<> struct numeric_limits<float>
-    {
-        typedef float type;
-        __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };
-        __device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };
-        __device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
-
-    template<> struct numeric_limits<double>
-    {
-        typedef double type;
-        __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };
-        __device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };
-        __device__ __forceinline__ static type epsilon();
-        __device__ __forceinline__ static type round_error();
-        __device__ __forceinline__ static type denorm_min();
-        __device__ __forceinline__ static type infinity();
-        __device__ __forceinline__ static type quiet_NaN();
-        __device__ __forceinline__ static type signaling_NaN();
-        static const bool is_signed = true;
-    };
 }}} // namespace cv { namespace gpu { namespace cudev {

 #endif // __OPENCV_GPU_LIMITS_GPU_HPP__
--- a/modules/core/include/opencv2/core/gpu.hpp
+++ b/modules/core/include/opencv2/core/gpu.hpp
@ -375,19 +375,6 @@ public:
    //! returns true if stream object is not default (!= 0)
    operator bool_type() const;

-    // obsolete methods
-
-    void enqueueDownload(const GpuMat& src, OutputArray dst);
-
-    void enqueueUpload(InputArray src, GpuMat& dst);
-
-    void enqueueCopy(const GpuMat& src, OutputArray dst);
-
-    void enqueueMemSet(GpuMat& src, Scalar val);
-    void enqueueMemSet(GpuMat& src, Scalar val, InputArray mask);
-
-    void enqueueConvert(const GpuMat& src, OutputArray dst, int dtype, double alpha = 1.0, double beta = 0.0);
-
    class Impl;

 private:
@ -529,10 +516,10 @@ public:
    size_t totalConstMem() const;

    //! major compute capability
-    int major() const;
+    int majorVersion() const;

    //! minor compute capability
-    int minor() const;
+    int minorVersion() const;

    //! alignment requirement for textures
    size_t textureAlignment() const;
--- a/modules/core/include/opencv2/core/gpu.inl.hpp
+++ b/modules/core/include/opencv2/core/gpu.inl.hpp
@ -525,42 +525,6 @@ void swap(CudaMem& a, CudaMem& b)

 //////////////////////////////// Stream ///////////////////////////////

-inline
-void Stream::enqueueDownload(const GpuMat& src, OutputArray dst)
-{
-    src.download(dst, *this);
-}
-
-inline
-void Stream::enqueueUpload(InputArray src, GpuMat& dst)
-{
-    dst.upload(src, *this);
-}
-
-inline
-void Stream::enqueueCopy(const GpuMat& src, OutputArray dst)
-{
-    src.copyTo(dst, *this);
-}
-
-inline
-void Stream::enqueueMemSet(GpuMat& src, Scalar val)
-{
-    src.setTo(val, *this);
-}
-
-inline
-void Stream::enqueueMemSet(GpuMat& src, Scalar val, InputArray mask)
-{
-    src.setTo(val, mask, *this);
-}
-
-inline
-void Stream::enqueueConvert(const GpuMat& src, OutputArray dst, int dtype, double alpha, double beta)
-{
-    src.convertTo(dst, dtype, alpha, beta, *this);
-}
-
 inline
 Stream::Stream(const Ptr<Impl>& impl)
    : impl_(impl)
@ -619,7 +583,7 @@ size_t DeviceInfo::totalMemory() const
 inline
 bool DeviceInfo::supports(FeatureSet feature_set) const
 {
-    int version = major() * 10 + minor();
+    int version = majorVersion() * 10 + minorVersion();
    return version >= feature_set;
 }

--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@ -78,7 +78,8 @@ public:
        EXPR              = 6 << KIND_SHIFT,
        OPENGL_BUFFER     = 7 << KIND_SHIFT,
        CUDA_MEM          = 8 << KIND_SHIFT,
-        GPU_MAT           = 9 << KIND_SHIFT
+        GPU_MAT           = 9 << KIND_SHIFT,
+        OCL_MAT           =10 << KIND_SHIFT
    };

    _InputArray();
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@ -77,6 +77,7 @@ struct CV_EXPORTS Matx_AddOp {};
 struct CV_EXPORTS Matx_SubOp {};
 struct CV_EXPORTS Matx_ScaleOp {};
 struct CV_EXPORTS Matx_MulOp {};
+struct CV_EXPORTS Matx_DivOp {};
 struct CV_EXPORTS Matx_MatMulOp {};
 struct CV_EXPORTS Matx_TOp {};

@ -174,6 +175,7 @@ public:
    Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_SubOp);
    template<typename _T2> Matx(const Matx<_Tp, m, n>& a, _T2 alpha, Matx_ScaleOp);
    Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_MulOp);
+    Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_DivOp);
    template<int l> Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp);
    Matx(const Matx<_Tp, n, m>& a, Matx_TOp);

@ -746,6 +748,13 @@ Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_Mul
        val[i] = saturate_cast<_Tp>(a.val[i] * b.val[i]);
 }

+// Element-wise division constructor: every element becomes a.val[k] / b.val[k],
+// converted back to _Tp through saturate_cast.
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_DivOp)
+{
+    for( int k = 0; k != channels; ++k )
+    {
+        val[k] = saturate_cast<_Tp>(a.val[k] / b.val[k]);
+    }
+}
+
 template<typename _Tp, int m, int n> template<int l> inline
 Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp)
 {
@ -1162,6 +1171,12 @@ Vec<_Tp, m> operator * (const Matx<_Tp, m, n>& a, const Vec<_Tp, n>& b)
    return (const Vec<_Tp, m>&)(c);
 }

+// Element-wise quotient of two matrices of identical type and shape;
+// the work is done by the Matx_DivOp constructor.
+template<typename _Tp, int m, int n> static inline
+Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
+{
+    Matx<_Tp, m, n> quotient(a, b, Matx_DivOp());
+    return quotient;
+}
+
 template<typename _Tp, int m, int n> static inline
 bool operator == (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
 {
@ -1337,4 +1352,4 @@ template<typename _Tp> inline Vec<_Tp, 4>& operator *= (Vec<_Tp, 4>& v1, const V

 } // cv

-#endif // __OPENCV_CORE_MATX_HPP__
+#endif // __OPENCV_CORE_MATX_HPP__
--- a/modules/core/include/opencv2/core/private.hpp
+++ b/modules/core/include/opencv2/core/private.hpp
@ -71,6 +71,30 @@
 #  endif
 #endif

+// Detect parallel back-ends that are implied by the compiler/platform rather than
+// by an explicit HAVE_* definition.
+
+// _OPENMP is predefined by the compiler when OpenMP support is switched on.
+#ifdef _OPENMP
+#  define HAVE_OPENMP
+#endif
+
+// Apple platforms: enable the "gcd" back-end (presumably Grand Central Dispatch).
+#ifdef __APPLE__
+#  define HAVE_GCD
+#endif
+
+// _MSC_VER 1600 corresponds to Visual C++ 2010; enables the "ms-concurrency" back-end.
+#if defined _MSC_VER && _MSC_VER >= 1600
+#  define HAVE_CONCURRENCY
+#endif
+
+// Name of the selected parallel framework. The first HAVE_* macro that is defined,
+// in the order below, wins. If none is defined, CV_PARALLEL_FRAMEWORK stays
+// undefined, so it can also be tested with #ifdef.
+#if defined HAVE_TBB
+#  define CV_PARALLEL_FRAMEWORK "tbb"
+#elif defined HAVE_CSTRIPES
+#  define CV_PARALLEL_FRAMEWORK "cstripes"
+#elif defined HAVE_OPENMP
+#  define CV_PARALLEL_FRAMEWORK "openmp"
+#elif defined HAVE_GCD
+#  define CV_PARALLEL_FRAMEWORK "gcd"
+#elif defined HAVE_CONCURRENCY
+#  define CV_PARALLEL_FRAMEWORK "ms-concurrency"
+#endif
+
 namespace cv
 {
 #ifdef HAVE_TBB
--- a/modules/core/perf/perf_reduce.cpp
+++ b/modules/core/perf/perf_reduce.cpp
@ -34,7 +34,8 @@ PERF_TEST_P(Size_MatType_ROp, reduceR,
    declare.in(src, WARMUP_RNG).out(vec);
    declare.time(100);

-    TEST_CYCLE() reduce(src, vec, 0, reduceOp, ddepth);
+    int runs = 15;
+    TEST_CYCLE_MULTIRUN(runs) reduce(src, vec, 0, reduceOp, ddepth);

    SANITY_CHECK(vec, 1);
 }
@ -65,4 +66,3 @@ PERF_TEST_P(Size_MatType_ROp, reduceC,

    SANITY_CHECK(vec, 1);
 }
-
--- a/modules/core/src/gpu_info.cpp
+++ b/modules/core/src/gpu_info.cpp
@ -119,7 +119,7 @@ bool cv::gpu::deviceSupports(FeatureSet feature_set)
    else
    {
        DeviceInfo dev(devId);
-        version = dev.major() * 10 + dev.minor();
+        version = dev.majorVersion() * 10 + dev.minorVersion();
        if (devId < cache_size)
            versions[devId] = version;
    }
@ -455,7 +455,7 @@ size_t cv::gpu::DeviceInfo::totalConstMem() const
 #endif
 }

-int cv::gpu::DeviceInfo::major() const
+int cv::gpu::DeviceInfo::majorVersion() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -465,7 +465,7 @@ int cv::gpu::DeviceInfo::major() const
 #endif
 }

-int cv::gpu::DeviceInfo::minor() const
+int cv::gpu::DeviceInfo::minorVersion() const
 {
 #ifndef HAVE_CUDA
    throw_no_cuda();
@ -569,7 +569,12 @@ int cv::gpu::DeviceInfo::maxTexture1DMipmap() const
    throw_no_cuda();
    return 0;
 #else
-    return deviceProps().get(device_id_)->maxTexture1DMipmap;
+    #if CUDA_VERSION >= 5000
+        return deviceProps().get(device_id_)->maxTexture1DMipmap;
+    #else
+        CV_Error(Error::StsNotImplemented, "This function requires CUDA 5.0");
+        return 0;
+    #endif
 #endif
 }

@ -599,7 +604,12 @@ Vec2i cv::gpu::DeviceInfo::maxTexture2DMipmap() const
    throw_no_cuda();
    return Vec2i();
 #else
-    return Vec2i(deviceProps().get(device_id_)->maxTexture2DMipmap);
+    #if CUDA_VERSION >= 5000
+        return Vec2i(deviceProps().get(device_id_)->maxTexture2DMipmap);
+    #else
+        CV_Error(Error::StsNotImplemented, "This function requires CUDA 5.0");
+        return Vec2i();
+    #endif
 #endif
 }

@ -898,12 +908,12 @@ bool cv::gpu::DeviceInfo::isCompatible() const
    return false;
 #else
    // Check PTX compatibility
-    if (TargetArchs::hasEqualOrLessPtx(major(), minor()))
+    if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion()))
        return true;

    // Check BIN compatibility
-    for (int i = minor(); i >= 0; --i)
-        if (TargetArchs::hasBin(major(), i))
+    for (int i = minorVersion(); i >= 0; --i)
+        if (TargetArchs::hasBin(majorVersion(), i))
            return true;

    return false;
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@ -2850,9 +2850,9 @@ PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, int maxComp

    if( _mean.data )
    {
-        CV_Assert( _mean.size() == mean_sz );        
+        CV_Assert( _mean.size() == mean_sz );
        _mean.convertTo(mean, ctype);
-        covar_flags |= CV_COVAR_USE_AVG; 
+        covar_flags |= CV_COVAR_USE_AVG;
    }

    calcCovarMatrix( data, covar, mean, covar_flags, ctype );
@ -2896,6 +2896,36 @@ PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, int maxComp
    return *this;
 }

+// Chooses how many leading eigenvalues to keep for a requested retained variance.
+//
+//   eigenvalues      - single-column matrix of element type T (checked in debug builds only);
+//                      read in row order
+//   retainedVariance - threshold compared against the cumulative energy fraction
+//
+// Returns the index of the first row whose cumulative sum, divided by the total sum,
+// exceeds retainedVariance (or eigenvalues.rows if no row does), but never less than 2.
+// NOTE: because of that lower bound the result can be larger than eigenvalues.rows
+// when fewer than two eigenvalues are supplied.
+template <typename T>
+int computeCumulativeEnergy(const Mat& eigenvalues, double retainedVariance)
+{
+    CV_DbgAssert( eigenvalues.type() == DataType<T>::type );
+
+    // g(i) = eigenvalues(0) + ... + eigenvalues(i), built as a running sum in a single
+    // pass; the additions happen in the same order as summing each prefix from scratch.
+    Mat g(eigenvalues.size(), DataType<T>::type);
+
+    for(int ig = 0; ig < g.rows; ig++)
+    {
+        g.at<T>(ig, 0) = ig > 0 ? g.at<T>(ig - 1, 0) : T(0);
+        g.at<T>(ig, 0) += eigenvalues.at<T>(ig, 0);
+    }
+
+    int L;
+
+    for(L = 0; L < eigenvalues.rows; L++)
+    {
+        // fraction of the total energy covered by rows 0..L
+        double energy = g.at<T>(L, 0) / g.at<T>(g.rows - 1, 0);
+        if(energy > retainedVariance)
+            break;
+    }
+
+    // always keep at least two components
+    L = std::max(2, L);
+
+    return L;
+}
+
 PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, double retainedVariance)
 {
    Mat data = _data.getMat(), _mean = __mean.getMat();
@ -2972,26 +3002,11 @@ PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, double reta
    }

    // compute the cumulative energy content for each eigenvector
-    Mat g(eigenvalues.size(), ctype);
-
-    for(int ig = 0; ig < g.rows; ig++)
-    {
-        g.at<float>(ig,0) = 0;
-        for(int im = 0; im <= ig; im++)
-        {
-            g.at<float>(ig,0) += eigenvalues.at<float>(im,0);
-        }
-    }
-
    int L;
-    for(L = 0; L < eigenvalues.rows; L++)
-    {
-        double energy = g.at<float>(L, 0) / g.at<float>(g.rows - 1, 0);
-        if(energy > retainedVariance)
-            break;
-    }
-
-    L = std::max(2, L);
+    if (ctype == CV_32F)
+        L = computeCumulativeEnergy<float>(eigenvalues, retainedVariance);
+    else
+        L = computeCumulativeEnergy<double>(eigenvalues, retainedVariance);

    // use clone() to physically copy the data and thus deallocate the original matrices
    eigenvalues = eigenvalues.rowRange(0,L).clone();
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@ -995,6 +995,11 @@ Mat _InputArray::getMat(int i) const
        return !v.empty() ? Mat(size(i), t, (void*)&v[0]) : Mat();
    }

+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
    if( k == STD_VECTOR_MAT )
    {
        const std::vector<Mat>& v = *(const std::vector<Mat>*)obj;
@ -1100,6 +1105,11 @@ void _InputArray::getMatVector(std::vector<Mat>& mv) const
        return;
    }

+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
    CV_Assert( k == STD_VECTOR_MAT );
    //if( k == STD_VECTOR_MAT )
    {
@ -1224,6 +1234,11 @@ Size _InputArray::size(int i) const
        return d_mat->size();
    }

+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
    CV_Assert( k == CUDA_MEM );
    //if( k == CUDA_MEM )
    {
@ -1338,6 +1353,11 @@ bool _InputArray::empty() const
    if( k == OPENGL_BUFFER )
        return ((const ogl::Buffer*)obj)->empty();

+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
    if( k == GPU_MAT )
        return ((const gpu::GpuMat*)obj)->empty();

@ -1573,6 +1593,11 @@ void _OutputArray::create(int dims, const int* sizes, int mtype, int i, bool all
        return;
    }

+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
    if( k == NONE )
    {
        CV_Error(CV_StsNullPtr, "create() called for the missing output array" );
@ -1684,6 +1709,11 @@ void _OutputArray::release() const
        return;
    }

+    if( k == OCL_MAT )
+    {
+        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    }
+
    CV_Assert( k == STD_VECTOR_MAT );
    //if( k == STD_VECTOR_MAT )
    {
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@ -61,17 +61,6 @@
    #endif
 #endif

-#ifdef _OPENMP
-    #define HAVE_OPENMP
-#endif
-
-#ifdef __APPLE__
-    #define HAVE_GCD
-#endif
-
-#if defined _MSC_VER && _MSC_VER >= 1600
-    #define HAVE_CONCURRENCY
-#endif

 /* IMPORTANT: always use the same order of defines
   1. HAVE_TBB         - 3rdparty library, should be explicitly enabled
@ -110,10 +99,6 @@
    #endif
 #endif

-#if defined HAVE_TBB || defined HAVE_CSTRIPES || defined HAVE_OPENMP || defined HAVE_GCD || defined HAVE_CONCURRENCY
-   #define HAVE_PARALLEL_FRAMEWORK
-#endif
-
 namespace cv
 {
    ParallelLoopBody::~ParallelLoopBody() {}
@ -121,7 +106,7 @@ namespace cv

 namespace
 {
-#ifdef HAVE_PARALLEL_FRAMEWORK
+#ifdef CV_PARALLEL_FRAMEWORK
    class ParallelLoopBodyWrapper
    {
    public:
@ -218,7 +203,7 @@ public:
 static SchedPtr pplScheduler;
 #endif

-#endif // HAVE_PARALLEL_FRAMEWORK
+#endif // CV_PARALLEL_FRAMEWORK

 } //namespace

@ -226,7 +211,7 @@ static SchedPtr pplScheduler;

 void cv::parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body, double nstripes)
 {
-#ifdef HAVE_PARALLEL_FRAMEWORK
+#ifdef CV_PARALLEL_FRAMEWORK

    if(numThreads != 0)
    {
@ -281,7 +266,7 @@ void cv::parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body,
    }
    else

-#endif // HAVE_PARALLEL_FRAMEWORK
+#endif // CV_PARALLEL_FRAMEWORK
    {
        (void)nstripes;
        body(range);
@ -290,7 +275,7 @@ void cv::parallel_for_(const cv::Range& range, const cv::ParallelLoopBody& body,

 int cv::getNumThreads(void)
 {
-#ifdef HAVE_PARALLEL_FRAMEWORK
+#ifdef CV_PARALLEL_FRAMEWORK

    if(numThreads == 0)
        return 1;
@ -333,7 +318,7 @@ int cv::getNumThreads(void)
 void cv::setNumThreads( int threads )
 {
    (void)threads;
-#ifdef HAVE_PARALLEL_FRAMEWORK
+#ifdef CV_PARALLEL_FRAMEWORK
    numThreads = threads;
 #endif

--- a/modules/core/test/test_operations.cpp
+++ b/modules/core/test/test_operations.cpp
@ -75,6 +75,7 @@ protected:
    bool TestSparseMat();
    bool TestVec();
    bool TestMatxMultiplication();
+    bool TestMatxElementwiseDivison();
    bool TestSubMatAccess();
    bool TestExp();
    bool TestSVD();
@ -891,6 +892,28 @@ bool CV_OperationsTest::TestMatxMultiplication()
    return true;
 }

+// Checks operator/ on Matx: [2 4; 6 8] divided element-wise by [2 2; 2 2]
+// must give [1 2; 3 4]. Returns false and records FAIL_INVALID_OUTPUT on mismatch.
+bool CV_OperationsTest::TestMatxElementwiseDivison()
+{
+    try
+    {
+        const Matx22f numerator(2, 4, 6, 8);
+        const Matx22f denominator(2, 2, 2, 2);
+        const Matx22f quotient = numerator / denominator;
+
+        const float expected[2][2] = { { 1.0f, 2.0f }, { 3.0f, 4.0f } };
+
+        for (int row = 0; row < 2; row++)
+        {
+            for (int col = 0; col < 2; col++)
+            {
+                if (quotient(row, col) != expected[row][col])
+                    throw test_excep();
+            }
+        }
+    }
+    catch(const test_excep&)
+    {
+        ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
+        return false;
+    }
+    return true;
+}
+

 bool CV_OperationsTest::TestVec()
 {
@ -1109,6 +1132,9 @@ void CV_OperationsTest::run( int /* start_from */)
    if (!TestMatxMultiplication())
        return;

+    if (!TestMatxElementwiseDivison())
+        return;
+
    if (!TestSubMatAccess())
        return;

--- a/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
+++ b/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
@ -189,7 +189,7 @@ For each query descriptor, finds the training descriptors not farther than the s

    :param compactResult: Parameter used when the mask (or masks) is not empty. If  ``compactResult``  is false, the  ``matches``  vector has the same size as  ``queryDescriptors``  rows. If  ``compactResult``  is true, the  ``matches``  vector does not contain matches for fully masked-out query descriptors.

-    :param maxDistance: Threshold for the distance between matched descriptors.
+    :param maxDistance: Threshold for the distance between matched descriptors. Distance here means the metric distance between descriptors (e.g. Hamming distance), not the distance between coordinates (which is measured in pixels).

 For each query descriptor, the methods find such training descriptors that the distance between the query descriptor and the training descriptor is equal or smaller than ``maxDistance``. Found matches are returned in the distance increasing order.

--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@ -206,6 +206,8 @@ public:
                                     OutputArray descriptors,
                                     bool useProvidedKeypoints=false ) const = 0;

+    CV_WRAP void compute( const Mat& image, CV_OUT CV_IN_OUT std::vector<KeyPoint>& keypoints, CV_OUT Mat& descriptors ) const;
+
    // Create feature detector and descriptor extractor by name.
    CV_WRAP static Ptr<Feature2D> create( const String& name );
 };
--- a/modules/features2d/src/descriptors.cpp
+++ b/modules/features2d/src/descriptors.cpp
@ -105,6 +105,12 @@ Ptr<DescriptorExtractor> DescriptorExtractor::create(const String& descriptorExt
    return Algorithm::create<DescriptorExtractor>("Feature2D." + descriptorExtractorType);
 }

+
+CV_WRAP void Feature2D::compute( const Mat& image, CV_OUT CV_IN_OUT std::vector<KeyPoint>& keypoints, CV_OUT Mat& descriptors ) const  // Thin forwarder to DescriptorExtractor::compute; NOTE(review): presumably added so wrappers see compute() on Feature2D - confirm.
+{
+   DescriptorExtractor::compute(image, keypoints, descriptors);  // explicitly qualified call, arguments passed through unchanged
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////

 /****************************************************************************************\
--- a/modules/gpu/doc/initalization_and_information.rst
+++ b/modules/gpu/doc/initalization_and_information.rst
@ -147,10 +147,10 @@ Class providing functionality for querying the specified GPU properties. ::
        size_t totalConstMem() const;

        //! major compute capability
-        int major() const;
+        int majorVersion() const;

        //! minor compute capability
-        int minor() const;
+        int minorVersion() const;

        //! alignment requirement for textures
        size_t textureAlignment() const;
@ -313,19 +313,19 @@ Returns the device name.



-gpu::DeviceInfo::major
----------------------
+gpu::DeviceInfo::majorVersion
+-----------------------------
 Returns the major compute capability version.

-.. ocv:function:: int gpu::DeviceInfo::major()
+.. ocv:function:: int gpu::DeviceInfo::majorVersion()



-gpu::DeviceInfo::minor
----------------------
+gpu::DeviceInfo::minorVersion
+-----------------------------
 Returns the minor compute capability version.

-.. ocv:function:: int gpu::DeviceInfo::minor()
+.. ocv:function:: int gpu::DeviceInfo::minorVersion()



--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@ -458,7 +458,7 @@ public:

                // generate integral for scale
                gpu::resize(image, src, level.sFrame, 0, 0, cv::INTER_LINEAR);
-                gpu::integralBuffered(src, sint, buff);
+                gpu::integral(src, sint, buff);

                // calculate job
                int totalWidth = level.workArea.width / step;
--- a/modules/gpuarithm/doc/arithm.rst
+++ b/modules/gpuarithm/doc/arithm.rst
@ -6,10 +6,10 @@ Arithm Operations on Matrices


 gpu::gemm
------------------
+---------
 Performs generalized matrix multiplication.

-.. ocv:function:: void gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::gemm(InputArray src1, InputArray src2, double alpha, InputArray src3, double beta, OutputArray dst, int flags = 0, Stream& stream = Stream::Null())

    :param src1: First multiplied input matrix that should have  ``CV_32FC1`` , ``CV_64FC1`` , ``CV_32FC2`` , or  ``CV_64FC2``  type.

@ -44,38 +44,40 @@ The function performs generalized matrix multiplication similar to the ``gemm``


 gpu::mulSpectrums
---------------------
+-----------------
 Performs a per-element multiplication of two Fourier spectrums.

-.. ocv:function:: void gpu::mulSpectrums( const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::mulSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, bool conjB=false, Stream& stream = Stream::Null())

-    :param a: First spectrum.
+    :param src1: First spectrum.

-    :param b: Second spectrum with the same size and type as  ``a`` .
+    :param src2: Second spectrum with the same size and type as  ``src1`` .

-    :param c: Destination spectrum.
+    :param dst: Destination spectrum.

    :param flags: Mock parameter used for CPU/GPU interfaces similarity.

    :param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.

-    Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
+    :param stream: Stream for the asynchronous version.
+
+Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.

 .. seealso:: :ocv:func:`mulSpectrums`



 gpu::mulAndScaleSpectrums
-----------------------------
+-------------------------
 Performs a per-element multiplication of two Fourier spectrums and scales the result.

-.. ocv:function:: void gpu::mulAndScaleSpectrums( const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null())

-    :param a: First spectrum.
+    :param src1: First spectrum.

-    :param b: Second spectrum with the same size and type as  ``a`` .
+    :param src2: Second spectrum with the same size and type as  ``src1`` .

-    :param c: Destination spectrum.
+    :param dst: Destination spectrum.

    :param flags: Mock parameter used for CPU/GPU interfaces similarity.

@ -83,17 +85,17 @@ Performs a per-element multiplication of two Fourier spectrums and scales the re

    :param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.

-    Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
+Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.

 .. seealso:: :ocv:func:`mulSpectrums`



 gpu::dft
------------
+--------
 Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix.

-.. ocv:function:: void gpu::dft( const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null())

    :param src: Source matrix (real or complex).

@ -125,46 +127,25 @@ The source matrix should be continuous, otherwise reallocation and data copying



-gpu::ConvolveBuf
+gpu::Convolution
 ----------------
-.. ocv:struct:: gpu::ConvolveBuf
+.. ocv:class:: gpu::Convolution : public Algorithm

-Class providing a memory buffer for :ocv:func:`gpu::convolve` function, plus it allows to adjust some specific parameters. ::
+Base class for convolution (or cross-correlation) operator. ::

-    struct CV_EXPORTS ConvolveBuf
+    class CV_EXPORTS Convolution : public Algorithm
    {
-        Size result_size;
-        Size block_size;
-        Size user_block_size;
-        Size dft_size;
-        int spect_len;
-
-        GpuMat image_spect, templ_spect, result_spect;
-        GpuMat image_block, templ_block, result_data;
-
-        void create(Size image_size, Size templ_size);
-        static Size estimateBlockSize(Size result_size, Size templ_size);
+    public:
+        virtual void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) = 0;
    };

-You can use field `user_block_size` to set specific block size for :ocv:func:`gpu::convolve` function. If you leave its default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed.


-
-gpu::ConvolveBuf::create
------------------------
-.. ocv:function:: gpu::ConvolveBuf::create(Size image_size, Size templ_size)
-
-Constructs a buffer for :ocv:func:`gpu::convolve` function with respective arguments.
-
-
-
-gpu::convolve
-----------------
+gpu::Convolution::convolve
+---------------------------
 Computes a convolution (or cross-correlation) of two images.

-.. ocv:function:: void gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr=false)
-
-.. ocv:function:: void gpu::convolve( const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::Convolution::convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null())

    :param image: Source image. Only  ``CV_32FC1`` images are supported for now.

@ -174,38 +155,14 @@ Computes a convolution (or cross-correlation) of two images.

    :param ccorr: Flags to evaluate cross-correlation instead of convolution.

-    :param buf: Optional buffer to avoid extra memory allocations and to adjust some specific parameters. See :ocv:struct:`gpu::ConvolveBuf`.
-
    :param stream: Stream for the asynchronous version.

-.. seealso:: :ocv:func:`gpu::filter2D`


+gpu::createConvolution
+----------------------
+Creates implementation for :ocv:class:`gpu::Convolution` .

-gpu::integral
-----------------
-Computes an integral image.
+.. ocv:function:: Ptr<Convolution> gpu::createConvolution(Size user_block_size = Size())

-.. ocv:function:: void gpu::integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null())
-
-    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
-
-    :param sum: Integral image containing 32-bit unsigned integer values packed into  ``CV_32SC1`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`integral`
-
-
-
-gpu::sqrIntegral
--------------------
-Computes a squared integral image.
-
-.. ocv:function:: void gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null())
-
-    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
-
-    :param sqsum: Squared integral image containing 64-bit unsigned integer values packed into  ``CV_64FC1`` .
-
-    :param stream: Stream for the asynchronous version.
+    :param user_block_size: Block size. If you leave the default value ``Size(0,0)`` , automatic estimation of the block size is used (which is optimized for speed). By varying ``user_block_size`` you can reduce memory requirements at the cost of speed.
--- a/modules/gpuarithm/doc/core.rst
+++ b/modules/gpuarithm/doc/core.rst
@ -6,12 +6,12 @@ Core Operations on Matrices


 gpu::merge
--------------
+----------
 Makes a multi-channel matrix out of several single-channel matrices.

-.. ocv:function:: void gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream = Stream::Null())

    :param src: Array/vector of source matrices.

@ -26,12 +26,12 @@ Makes a multi-channel matrix out of several single-channel matrices.


 gpu::split
--------------
+----------
 Copies each plane of a multi-channel matrix into an array.

-.. ocv:function:: void gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::split(InputArray src, GpuMat* dst, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::split(InputArray src, vector<GpuMat>& dst, Stream& stream = Stream::Null())

    :param src: Source matrix.

@ -43,15 +43,95 @@ Copies each plane of a multi-channel matrix into an array.



+gpu::transpose
+--------------
+Transposes a matrix.
+
+.. ocv:function:: void gpu::transpose(InputArray src1, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src1: Source matrix. 1-, 4-, 8-byte element sizes are supported for now.
+
+    :param dst: Destination matrix.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`transpose`
+
+
+
+gpu::flip
+---------
+Flips a 2D matrix around vertical, horizontal, or both axes.
+
+.. ocv:function:: void gpu::flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null())
+
+    :param src: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U``, ``CV_16U``, ``CV_32S`` or ``CV_32F`` depth.
+
+    :param dst: Destination matrix.
+
+    :param flipCode: Flip mode for the source:
+
+        * ``0`` Flips around x-axis.
+
+        * ``> 0`` Flips around y-axis.
+
+        * ``< 0`` Flips around both axes.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`flip`
+
+
+
+gpu::LookUpTable
+----------------
+.. ocv:class:: gpu::LookUpTable : public Algorithm
+
+Base class for transform using lookup table. ::
+
+    class CV_EXPORTS LookUpTable : public Algorithm
+    {
+    public:
+        virtual void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
+    };
+
+.. seealso:: :ocv:func:`LUT`
+
+
+
+gpu::LookUpTable::transform
+---------------------------
+Transforms the source matrix into the destination matrix using the given look-up table: ``dst(I) = lut(src(I))`` .
+
+.. ocv:function:: void gpu::LookUpTable::transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.  ``CV_8UC1``  and  ``CV_8UC3``  matrices are supported for now.
+
+    :param dst: Destination matrix.
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::createLookUpTable
+----------------------
+Creates implementation for :ocv:class:`gpu::LookUpTable` .
+
+.. ocv:function:: Ptr<LookUpTable> gpu::createLookUpTable(InputArray lut)
+
+    :param lut: Look-up table of 256 elements. It is a continuous ``CV_8U`` matrix.
+
+
+
 gpu::copyMakeBorder
 -----------------------
 Forms a border around an image.

-.. ocv:function:: void gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value = Scalar(), Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::copyMakeBorder(InputArray src, OutputArray dst, int top, int bottom, int left, int right, int borderType, Scalar value = Scalar(), Stream& stream = Stream::Null())

-    :param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_32SC1`` , and  ``CV_32FC1`` types are supported.
+    :param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_32SC1`` , and ``CV_32FC1`` types are supported.

-    :param dst: Destination image with the same type as  ``src``. The size is  ``Size(src.cols+left+right, src.rows+top+bottom)`` .
+    :param dst: Destination image with the same type as  ``src``. The size is ``Size(src.cols+left+right, src.rows+top+bottom)`` .

    :param top:

@ -68,61 +148,3 @@ Forms a border around an image.
    :param stream: Stream for the asynchronous version.

 .. seealso:: :ocv:func:`copyMakeBorder`
-
-
-
-gpu::transpose
------------------
-Transposes a matrix.
-
-.. ocv:function:: void gpu::transpose( const GpuMat& src1, GpuMat& dst, Stream& stream=Stream::Null() )
-
-    :param src1: Source matrix. 1-, 4-, 8-byte element sizes are supported for now (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc).
-
-    :param dst: Destination matrix.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`transpose`
-
-
-
-gpu::flip
-------------
-Flips a 2D matrix around vertical, horizontal, or both axes.
-
-.. ocv:function:: void gpu::flip( const GpuMat& a, GpuMat& b, int flipCode, Stream& stream=Stream::Null() )
-
-    :param a: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U``, ``CV_16U``, ``CV_32S`` or ``CV_32F`` depth.
-
-    :param b: Destination matrix.
-
-    :param flipCode: Flip mode for the source:
-
-        * ``0`` Flips around x-axis.
-
-        * ``>0`` Flips around y-axis.
-
-        * ``<0`` Flips around both axes.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`flip`
-
-
-
-gpu::LUT
------------
-Transforms the source matrix into the destination matrix using the given look-up table: ``dst(I) = lut(src(I))``
-
-.. ocv:function:: void gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix.  ``CV_8UC1``  and  ``CV_8UC3``  matrices are supported for now.
-
-    :param lut: Look-up table of 256 elements. It is a continuous ``CV_8U`` matrix.
-
-    :param dst: Destination matrix with the same depth as  ``lut``  and the same number of channels as  ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`LUT`
--- a/modules/gpuarithm/doc/element_operations.rst
+++ b/modules/gpuarithm/doc/element_operations.rst
@ -6,20 +6,16 @@ Per-element Operations


 gpu::add
------------
+--------
 Computes a matrix-matrix or matrix-scalar sum.

-.. ocv:function:: void gpu::add( const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::add( const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
+    :param src1: First source matrix or scalar.

-    :param a: First source matrix.
+    :param src2: Second source matrix or scalar. Matrix should have the same size and type as ``src1`` .

-    :param b: Second source matrix to be added to ``a`` . Matrix should have the same size and type as ``a`` .
-
-    :param sc: A scalar to be added to ``a`` .
-
-    :param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
+    :param dst: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``src1`` depth.

    :param mask: Optional operation mask, 8-bit single channel array, that specifies elements of the destination array to be changed.

@ -32,20 +28,16 @@ Computes a matrix-matrix or matrix-scalar sum.


 gpu::subtract
-----------------
+-------------
 Computes a matrix-matrix or matrix-scalar difference.

-.. ocv:function:: void gpu::subtract( const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::subtract( const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
+    :param src1: First source matrix or scalar.

-    :param a: First source matrix.
+    :param src2: Second source matrix or scalar. Matrix should have the same size and type as ``src1`` .

-    :param b: Second source matrix to be added to ``a`` . Matrix should have the same size and type as ``a`` .
-
-    :param sc: A scalar to be added to ``a`` .
-
-    :param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
+    :param dst: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``src1`` depth.

    :param mask: Optional operation mask, 8-bit single channel array, that specifies elements of the destination array to be changed.

@ -58,20 +50,16 @@ Computes a matrix-matrix or matrix-scalar difference.


 gpu::multiply
-----------------
+-------------
 Computes a matrix-matrix or matrix-scalar per-element product.

-.. ocv:function:: void gpu::multiply( const GpuMat& a, const GpuMat& b, GpuMat& c, double scale=1, int dtype=-1, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::multiply(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::multiply( const GpuMat& a, const Scalar& sc, GpuMat& c, double scale=1, int dtype=-1, Stream& stream=Stream::Null() )
+    :param src1: First source matrix or scalar.

-    :param a: First source matrix.
+    :param src2: Second source matrix or scalar.

-    :param b: Second source matrix to be multiplied by ``a`` elements.
-
-    :param sc: A scalar to be multiplied by ``a`` elements.
-
-    :param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
+    :param dst: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``src1`` depth.

    :param scale: Optional scale factor.

@ -87,19 +75,15 @@ gpu::divide
 -----------
 Computes a matrix-matrix or matrix-scalar division.

-.. ocv:function:: void gpu::divide( const GpuMat& a, const GpuMat& b, GpuMat& c, double scale=1, int dtype=-1, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::divide(double src1, InputArray src2, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::divide( double scale, const GpuMat& b, GpuMat& c, int dtype=-1, Stream& stream=Stream::Null() )
+    :param src1: First source matrix or a scalar.

-    :param a: First source matrix or a scalar.
+    :param src2: Second source matrix or scalar.

-    :param b: Second source matrix. The ``a`` elements are divided by it.
-
-    :param sc: A scalar to be divided by the elements of ``a`` matrix.
-
-    :param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
+    :param dst: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``src1`` depth.

    :param scale: Optional scale factor.

@ -113,11 +97,296 @@ This function, in contrast to :ocv:func:`divide`, uses a round-down rounding mod



+gpu::absdiff
+------------
+Computes per-element absolute difference of two matrices (or of a matrix and scalar).
+
+.. ocv:function:: void gpu::absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`absdiff`
+
+
+
+gpu::abs
+--------
+Computes an absolute value of each matrix element.
+
+.. ocv:function:: void gpu::abs(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`abs`
+
+
+
+gpu::sqr
+--------
+Computes a square value of each matrix element.
+
+.. ocv:function:: void gpu::sqr(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::sqrt
+---------
+Computes a square root of each matrix element.
+
+.. ocv:function:: void gpu::sqrt(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`sqrt`
+
+
+
+gpu::exp
+--------
+Computes an exponent of each matrix element.
+
+.. ocv:function:: void gpu::exp(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`exp`
+
+
+
+gpu::log
+--------
+Computes a natural logarithm of absolute value of each matrix element.
+
+.. ocv:function:: void gpu::log(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`log`
+
+
+
+gpu::pow
+--------
+Raises every matrix element to a power.
+
+.. ocv:function:: void gpu::pow(InputArray src, double power, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param power: Exponent of power.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+The function ``pow`` raises every element of the input matrix to ``power`` :
+
+.. math::
+
+    \texttt{dst} (I) =  \fork{\texttt{src}(I)^{\texttt{power}}}{if \texttt{power} is integer}{|\texttt{src}(I)|^{\texttt{power}}}{otherwise}
+
+.. seealso:: :ocv:func:`pow`
+
+
+
+gpu::compare
+------------
+Compares elements of two matrices (or of a matrix and scalar).
+
+.. ocv:function:: void gpu::compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param cmpop: Flag specifying the relation between the elements to be checked:
+
+            * **CMP_EQ:** ``src1(.) == src2(.)``
+            * **CMP_GT:** ``src1(.) > src2(.)``
+            * **CMP_GE:** ``src1(.) >= src2(.)``
+            * **CMP_LT:** ``src1(.) < src2(.)``
+            * **CMP_LE:** ``src1(.) <= src2(.)``
+            * **CMP_NE:** ``src1(.) != src2(.)``
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`compare`
+
+
+
+gpu::bitwise_not
+----------------
+Performs a per-element bitwise inversion.
+
+.. ocv:function:: void gpu::bitwise_not(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
+
+    :param src: Source matrix.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param mask: Optional operation mask. 8-bit single channel image.
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::bitwise_or
+---------------
+Performs a per-element bitwise disjunction of two matrices (or of matrix and scalar).
+
+.. ocv:function:: void gpu::bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param mask: Optional operation mask. 8-bit single channel image.
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::bitwise_and
+----------------
+Performs a per-element bitwise conjunction of two matrices (or of matrix and scalar).
+
+.. ocv:function:: void gpu::bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param mask: Optional operation mask. 8-bit single channel image.
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::bitwise_xor
+----------------
+Performs a per-element bitwise ``exclusive or`` operation of two matrices (or of matrix and scalar).
+
+.. ocv:function:: void gpu::bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param mask: Optional operation mask. 8-bit single channel image.
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::rshift
+-----------
+Performs pixel by pixel right shift of an image by a constant value.
+
+.. ocv:function:: void gpu::rshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix. Supports 1, 3 and 4 channels images with integer elements.
+
+    :param val: Constant values, one per channel.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::lshift
+-----------
+Performs pixel by pixel left shift of an image by a constant value.
+
+.. ocv:function:: void gpu::lshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U`` , ``CV_16U`` or ``CV_32S`` depth.
+
+    :param val: Constant values, one per channel.
+
+    :param dst: Destination matrix with the same size and type as ``src`` .
+
+    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::min
+--------
+Computes the per-element minimum of two matrices (or a matrix and a scalar).
+
+.. ocv:function:: void gpu::min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`min`
+
+
+
+gpu::max
+--------
+Computes the per-element maximum of two matrices (or a matrix and a scalar).
+
+.. ocv:function:: void gpu::max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
+
+    :param src1: First source matrix or scalar.
+
+    :param src2: Second source matrix or scalar.
+
+    :param dst: Destination matrix that has the same size and type as the input array(s).
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`max`
+
+
+
 gpu::addWeighted
 ----------------
 Computes the weighted sum of two arrays.

-.. ocv:function:: void gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype = -1, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::addWeighted(InputArray src1, double alpha, InputArray src2, double beta, double gamma, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())

    :param src1: First source array.

@ -147,311 +416,11 @@ where ``I`` is a multi-dimensional index of array elements. In case of multi-cha



-gpu::abs
------------
-Computes an absolute value of each matrix element.
-
-.. ocv:function:: void gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix. Supports ``CV_16S`` and ``CV_32F`` depth.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`abs`
-
-
-
-gpu::sqr
------------
-Computes a square value of each matrix element.
-
-.. ocv:function:: void gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::sqrt
------------
-Computes a square root of each matrix element.
-
-.. ocv:function:: void gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`sqrt`
-
-
-
-gpu::exp
------------
-Computes an exponent of each matrix element.
-
-.. ocv:function:: void gpu::exp( const GpuMat& a, GpuMat& b, Stream& stream=Stream::Null() )
-
-    :param a: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
-
-    :param b: Destination matrix with the same size and type as ``a`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`exp`
-
-
-
-gpu::log
------------
-Computes a natural logarithm of absolute value of each matrix element.
-
-.. ocv:function:: void gpu::log( const GpuMat& a, GpuMat& b, Stream& stream=Stream::Null() )
-
-    :param a: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
-
-    :param b: Destination matrix with the same size and type as ``a`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`log`
-
-
-
-gpu::pow
------------
-Raises every matrix element to a power.
-
-.. ocv:function:: void gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix. Supports all type, except ``CV_64F`` depth.
-
-    :param power: Exponent of power.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-The function ``pow`` raises every element of the input matrix to ``p`` :
-
-.. math::
-
-    \texttt{dst} (I) =  \fork{\texttt{src}(I)^p}{if \texttt{p} is integer}{|\texttt{src}(I)|^p}{otherwise}
-
-.. seealso:: :ocv:func:`pow`
-
-
-
-gpu::absdiff
----------------
-Computes per-element absolute difference of two matrices (or of a matrix and scalar).
-
-.. ocv:function:: void gpu::absdiff( const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream=Stream::Null() )
-
-.. ocv:function:: void gpu::absdiff( const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream=Stream::Null() )
-
-    :param a: First source matrix.
-
-    :param b: Second source matrix to be added to ``a`` .
-
-    :param s: A scalar to be added to ``a`` .
-
-    :param c: Destination matrix with the same size and type as ``a`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`absdiff`
-
-
-
-gpu::compare
----------------
-Compares elements of two matrices.
-
-.. ocv:function:: void gpu::compare( const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream=Stream::Null() )
-
-.. ocv:function:: void gpu::compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null())
-
-    :param a: First source matrix.
-
-    :param b: Second source matrix with the same size and type as ``a`` .
-
-    :param sc: A scalar to be compared with ``a`` .
-
-    :param c: Destination matrix with the same size as ``a`` and the ``CV_8UC1`` type.
-
-    :param cmpop: Flag specifying the relation between the elements to be checked:
-
-            * **CMP_EQ:** ``a(.) == b(.)``
-            * **CMP_GT:** ``a(.) < b(.)``
-            * **CMP_GE:** ``a(.) <= b(.)``
-            * **CMP_LT:** ``a(.) < b(.)``
-            * **CMP_LE:** ``a(.) <= b(.)``
-            * **CMP_NE:** ``a(.) != b(.)``
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`compare`
-
-
-
-gpu::bitwise_not
--------------------
-Performs a per-element bitwise inversion.
-
-.. ocv:function:: void gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
-
-    :param src: Source matrix.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param mask: Optional operation mask. 8-bit single channel image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::bitwise_or
-------------------
-Performs a per-element bitwise disjunction of two matrices or of matrix and scalar.
-
-.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
-.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix with the same size and type as ``src1`` .
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param mask: Optional operation mask. 8-bit single channel image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::bitwise_and
--------------------
-Performs a per-element bitwise conjunction of two matrices or of matrix and scalar.
-
-.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
-.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix with the same size and type as ``src1`` .
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param mask: Optional operation mask. 8-bit single channel image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::bitwise_xor
--------------------
-Performs a per-element bitwise ``exclusive or`` operation of two matrices of matrix and scalar.
-
-.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
-.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix with the same size and type as ``src1`` .
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param mask: Optional operation mask. 8-bit single channel image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::rshift
--------------------
-Performs pixel by pixel right shift of an image by a constant value.
-
-.. ocv:function:: void gpu::rshift( const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream=Stream::Null() )
-
-    :param src: Source matrix. Supports 1, 3 and 4 channels images with integers elements.
-
-    :param sc: Constant values, one per channel.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::lshift
--------------------
-Performs pixel by pixel right left of an image by a constant value.
-
-.. ocv:function:: void gpu::lshift( const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream=Stream::Null() )
-
-    :param src: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U`` , ``CV_16U`` or ``CV_32S`` depth.
-
-    :param sc: Constant values, one per channel.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::min
------------
-Computes the per-element minimum of two matrices (or a matrix and a scalar).
-
-.. ocv:function:: void gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix or a scalar to compare ``src1`` elements with.
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`min`
-
-
-
-gpu::max
------------
-Computes the per-element maximum of two matrices (or a matrix and a scalar).
-
-.. ocv:function:: void gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix or a scalar to compare ``src1`` elements with.
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`max`
-
-
-
 gpu::threshold
------------------
+--------------
 Applies a fixed-level threshold to each array element.

-.. ocv:function:: double gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null())
+.. ocv:function:: double gpu::threshold(InputArray src, OutputArray dst, double thresh, double maxval, int type, Stream& stream = Stream::Null())

    :param src: Source array (single-channel).

@ -470,12 +439,12 @@ Applies a fixed-level threshold to each array element.


 gpu::magnitude
------------------
+--------------
 Computes magnitudes of complex matrix elements.

-.. ocv:function:: void gpu::magnitude( const GpuMat& xy, GpuMat& magnitude, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::magnitude(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::magnitude(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null())

    :param xy: Source complex matrix in the interleaved format ( ``CV_32FC2`` ).

@ -492,12 +461,12 @@ Computes magnitudes of complex matrix elements.


 gpu::magnitudeSqr
---------------------
+-----------------
 Computes squared magnitudes of complex matrix elements.

-.. ocv:function:: void gpu::magnitudeSqr( const GpuMat& xy, GpuMat& magnitude, Stream& stream=Stream::Null() )
+.. ocv:function:: void gpu::magnitudeSqr(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null())

-.. ocv:function:: void gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::magnitudeSqr(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null())

    :param xy: Source complex matrix in the interleaved format ( ``CV_32FC2`` ).

@ -512,10 +481,10 @@ Computes squared magnitudes of complex matrix elements.


 gpu::phase
--------------
+----------
 Computes polar angles of complex matrix elements.

-.. ocv:function:: void gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees=false, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::phase(InputArray x, InputArray y, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null())

    :param x: Source matrix containing real components ( ``CV_32FC1`` ).

@ -532,10 +501,10 @@ Computes polar angles of complex matrix elements.


 gpu::cartToPolar
--------------------
+----------------
 Converts Cartesian coordinates into polar.

-.. ocv:function:: void gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees=false, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::cartToPolar(InputArray x, InputArray y, OutputArray magnitude, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null())

    :param x: Source matrix containing real components ( ``CV_32FC1`` ).

@ -554,10 +523,10 @@ Converts Cartesian coordinates into polar.


 gpu::polarToCart
--------------------
+----------------
 Converts polar coordinates into Cartesian.

-.. ocv:function:: void gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees=false, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::polarToCart(InputArray magnitude, InputArray angle, OutputArray x, OutputArray y, bool angleInDegrees = false, Stream& stream = Stream::Null())

    :param magnitude: Source matrix containing magnitudes ( ``CV_32FC1`` ).

--- a/modules/gpuarithm/doc/reductions.rst
+++ b/modules/gpuarithm/doc/reductions.rst
@ -6,16 +6,16 @@ Matrix Reductions


 gpu::norm
-------------
+---------
 Returns the norm of a matrix (or difference of two matrices).

-.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType=NORM_L2)
+.. ocv:function:: double gpu::norm(InputArray src1, int normType)

-.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType, GpuMat& buf)
+.. ocv:function:: double gpu::norm(InputArray src1, int normType, GpuMat& buf)

-.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf)
+.. ocv:function:: double gpu::norm(InputArray src1, int normType, InputArray mask, GpuMat& buf)

-.. ocv:function:: double gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2)
+.. ocv:function:: double gpu::norm(InputArray src1, InputArray src2, int normType=NORM_L2)

    :param src1: Source matrix. Any matrices except 64F are supported.

@ -32,14 +32,14 @@ Returns the norm of a matrix (or difference of two matrices).


 gpu::sum
------------
+--------
 Returns the sum of matrix elements.

-.. ocv:function:: Scalar gpu::sum(const GpuMat& src)
+.. ocv:function:: Scalar gpu::sum(InputArray src)

-.. ocv:function:: Scalar gpu::sum(const GpuMat& src, GpuMat& buf)
+.. ocv:function:: Scalar gpu::sum(InputArray src, GpuMat& buf)

-.. ocv:function:: Scalar gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
+.. ocv:function:: Scalar gpu::sum(InputArray src, InputArray mask, GpuMat& buf)

    :param src: Source image of any depth except for ``CV_64F`` .

@ -52,14 +52,14 @@ Returns the sum of matrix elements.


 gpu::absSum
---------------
+-----------
 Returns the sum of absolute values for matrix elements.

-.. ocv:function:: Scalar gpu::absSum(const GpuMat& src)
+.. ocv:function:: Scalar gpu::absSum(InputArray src)

-.. ocv:function:: Scalar gpu::absSum(const GpuMat& src, GpuMat& buf)
+.. ocv:function:: Scalar gpu::absSum(InputArray src, GpuMat& buf)

-.. ocv:function:: Scalar gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
+.. ocv:function:: Scalar gpu::absSum(InputArray src, InputArray mask, GpuMat& buf)

    :param src: Source image of any depth except for ``CV_64F`` .

@ -70,14 +70,14 @@ Returns the sum of absolute values for matrix elements.


 gpu::sqrSum
---------------
+-----------
 Returns the squared sum of matrix elements.

-.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src)
+.. ocv:function:: Scalar gpu::sqrSum(InputArray src)

-.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src, GpuMat& buf)
+.. ocv:function:: Scalar gpu::sqrSum(InputArray src, GpuMat& buf)

-.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
+.. ocv:function:: Scalar gpu::sqrSum(InputArray src, InputArray mask, GpuMat& buf)

    :param src: Source image of any depth except for ``CV_64F`` .

@ -88,12 +88,12 @@ Returns the squared sum of matrix elements.


 gpu::minMax
---------------
+-----------
 Finds global minimum and maximum matrix elements and returns their values.

-.. ocv:function:: void gpu::minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat())
+.. ocv:function:: void gpu::minMax(InputArray src, double* minVal, double* maxVal=0, InputArray mask=noArray())

-.. ocv:function:: void gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
+.. ocv:function:: void gpu::minMax(InputArray src, double* minVal, double* maxVal, InputArray mask, GpuMat& buf)

    :param src: Single-channel source image.

@ -112,12 +112,12 @@ The function does not work with ``CV_64F`` images on GPUs with the compute capab


 gpu::minMaxLoc
------------------
+--------------
 Finds global minimum and maximum matrix elements and returns their values with locations.

-.. ocv:function:: void gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, const GpuMat& mask=GpuMat())
+.. ocv:function:: void gpu::minMaxLoc(InputArray src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, InputArray mask=noArray())

-.. ocv:function:: void gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf)
+.. ocv:function:: void gpu::minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray mask, GpuMat& valbuf, GpuMat& locbuf)

    :param src: Single-channel source image.

@ -142,12 +142,12 @@ Finds global minimum and maximum matrix elements and returns their values with l


 gpu::countNonZero
---------------------
+-----------------
 Counts non-zero matrix elements.

-.. ocv:function:: int gpu::countNonZero(const GpuMat& src)
+.. ocv:function:: int gpu::countNonZero(InputArray src)

-.. ocv:function:: int gpu::countNonZero(const GpuMat& src, GpuMat& buf)
+.. ocv:function:: int gpu::countNonZero(InputArray src, GpuMat& buf)

    :param src: Single-channel source image.

@ -163,7 +163,7 @@ gpu::reduce
 -----------
 Reduces a matrix to a vector.

-.. ocv:function:: void gpu::reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null())

    :param mtx: Source 2D matrix.

@ -183,48 +183,20 @@ Reduces a matrix to a vector.

    :param dtype: When it is negative, the destination vector will have the same type as the source matrix. Otherwise, its type will be  ``CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), mtx.channels())`` .

+    :param stream: Stream for the asynchronous version.
+
 The function ``reduce`` reduces the matrix to a vector by treating the matrix rows/columns as a set of 1D vectors and performing the specified operation on the vectors until a single row/column is obtained. For example, the function can be used to compute horizontal and vertical projections of a raster image. In case of ``CV_REDUCE_SUM`` and ``CV_REDUCE_AVG`` , the output may have a larger element bit-depth to preserve accuracy. And multi-channel arrays are also supported in these two reduction modes.

 .. seealso:: :ocv:func:`reduce`



-gpu::normalize
--------------
-Normalizes the norm or value range of an array.
-
-.. ocv:function:: void gpu::normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0, int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat())
-
-.. ocv:function:: void gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
-
-    :param src: input array.
-
-    :param dst: output array of the same size as  ``src`` .
-
-    :param alpha: norm value to normalize to or the lower range boundary in case of the range normalization.
-
-    :param beta: upper range boundary in case of the range normalization; it is not used for the norm normalization.
-
-    :param normType: normalization type (see the details below).
-
-    :param dtype: when negative, the output array has the same type as ``src``; otherwise, it has the same number of channels as  ``src`` and the depth ``=CV_MAT_DEPTH(dtype)``.
-
-    :param mask: optional operation mask.
-
-    :param norm_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
-
-    :param cvt_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
-
-.. seealso:: :ocv:func:`normalize`
-
-
-
 gpu::meanStdDev
-------------------
+---------------
 Computes a mean value and a standard deviation of matrix elements.

-.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev)
-.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf)
+.. ocv:function:: void gpu::meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev)
+.. ocv:function:: void gpu::meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev, GpuMat& buf)

    :param mtx: Source matrix.  ``CV_8UC1``  matrices are supported for now.

@ -239,10 +211,10 @@ Computes a mean value and a standard deviation of matrix elements.


 gpu::rectStdDev
-------------------
+---------------
 Computes a standard deviation of integral images.

-.. ocv:function:: void gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::rectStdDev(InputArray src, InputArray sqr, OutputArray dst, Rect rect, Stream& stream = Stream::Null())

    :param src: Source image. Only the ``CV_32SC1`` type is supported.

@ -253,3 +225,71 @@ Computes a standard deviation of integral images.
    :param rect: Rectangular window.

    :param stream: Stream for the asynchronous version.
+
+
+
+gpu::normalize
+--------------
+Normalizes the norm or value range of an array.
+
+.. ocv:function:: void gpu::normalize(InputArray src, OutputArray dst, double alpha = 1, double beta = 0, int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray())
+
+.. ocv:function:: void gpu::normalize(InputArray src, OutputArray dst, double alpha, double beta, int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf)
+
+    :param src: Input array.
+
+    :param dst: Output array of the same size as  ``src`` .
+
+    :param alpha: Norm value to normalize to or the lower range boundary in case of the range normalization.
+
+    :param beta: Upper range boundary in case of the range normalization; it is not used for the norm normalization.
+
+    :param norm_type: Normalization type ( ``NORM_MINMAX`` , ``NORM_L2`` , ``NORM_L1`` or ``NORM_INF`` ).
+
+    :param dtype: When negative, the output array has the same type as ``src``; otherwise, it has the same number of channels as  ``src`` and the depth ``=CV_MAT_DEPTH(dtype)``.
+
+    :param mask: Optional operation mask.
+
+    :param norm_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+    :param cvt_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+.. seealso:: :ocv:func:`normalize`
+
+
+
+gpu::integral
+-------------
+Computes an integral image.
+
+.. ocv:function:: void gpu::integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null())
+
+.. ocv:function:: void gpu::integral(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null())
+
+    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
+
+    :param sum: Integral image containing 32-bit unsigned integer values packed into  ``CV_32SC1`` .
+
+    :param buffer: Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`integral`
+
+
+
+gpu::sqrIntegral
+----------------
+Computes a squared integral image.
+
+.. ocv:function:: void gpu::sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null())
+
+.. ocv:function:: void gpu::sqrIntegral(InputArray src, OutputArray sqsum, GpuMat& buf, Stream& stream = Stream::Null())
+
+    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
+
+    :param sqsum: Squared integral image containing 64-bit unsigned integer values packed into  ``CV_64FC1`` .
+
+    :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
+
+    :param stream: Stream for the asynchronous version.
--- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
@ -49,263 +49,317 @@

 #include "opencv2/core/gpu.hpp"

+#if defined __GNUC__ // compilers defining __GNUC__: the deprecation attribute trails the declaration
+    #define __OPENCV_GPUARITHM_DEPR_BEFORE__
+    #define __OPENCV_GPUARITHM_DEPR_AFTER__ __attribute__ ((deprecated))
+#elif (defined WIN32 || defined _WIN32) // Windows builds without __GNUC__: __declspec leads the declaration
+    #define __OPENCV_GPUARITHM_DEPR_BEFORE__ __declspec(deprecated)
+    #define __OPENCV_GPUARITHM_DEPR_AFTER__
+#else // any other compiler: both markers expand to nothing, so no deprecation warning is emitted
+    #define __OPENCV_GPUARITHM_DEPR_BEFORE__
+    #define __OPENCV_GPUARITHM_DEPR_AFTER__
+#endif
+
 namespace cv { namespace gpu {

-//! adds one matrix to another (c = a + b)
-CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
-//! adds scalar to a matrix (c = a + s)
-CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
+//! adds one matrix to another (dst = src1 + src2)
+CV_EXPORTS void add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null());

-//! subtracts one matrix from another (c = a - b)
-CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
-//! subtracts scalar from a matrix (c = a - s)
-CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
+//! subtracts one matrix from another (dst = src1 - src2)
+CV_EXPORTS void subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null());

-//! computes element-wise weighted product of the two arrays (c = scale * a * b)
-CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
-//! weighted multiplies matrix to a scalar (c = scale * a * s)
-CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
+//! computes element-wise weighted product of the two arrays (dst = scale * src1 * src2)
+CV_EXPORTS void multiply(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
+
+//! computes element-wise weighted quotient of the two arrays (dst = scale * (src1 / src2))
+CV_EXPORTS void divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());

-//! computes element-wise weighted quotient of the two arrays (c = a / b)
-CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
-//! computes element-wise weighted quotient of matrix and scalar (c = a / s)
-CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
 //! computes element-wise weighted reciprocal of an array (dst = scale/src2)
-CV_EXPORTS void divide(double scale, const GpuMat& b, GpuMat& c, int dtype = -1, Stream& stream = Stream::Null());
+static inline void divide(double src1, InputArray src2, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())
+{
+    divide(src1, src2, dst, 1.0, dtype, stream); // forwards to the array overload with scale fixed at 1.0; the double src1 is passed as its InputArray numerator
+}
+
+//! computes element-wise absolute difference of two arrays (dst = abs(src1 - src2))
+CV_EXPORTS void absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes absolute value of each matrix element
+CV_EXPORTS void abs(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes square of each pixel in an image
+CV_EXPORTS void sqr(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes square root of each pixel in an image
+CV_EXPORTS void sqrt(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes exponent of each matrix element
+CV_EXPORTS void exp(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes natural logarithm of absolute value of each matrix element
+CV_EXPORTS void log(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes power of each matrix element:
+//!    dst(i,j) = pow(     src(i,j) , power), if src has an integer depth
+//!    dst(i,j) = pow(fabs(src(i,j)), power), otherwise
+CV_EXPORTS void pow(InputArray src, double power, OutputArray dst, Stream& stream = Stream::Null());
+
+//! compares elements of two arrays (dst = src1 <cmpop> src2)
+CV_EXPORTS void compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream = Stream::Null());
+
+//! performs per-element bit-wise inversion
+CV_EXPORTS void bitwise_not(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
+
+//! calculates per-element bit-wise disjunction of two arrays
+CV_EXPORTS void bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
+
+//! calculates per-element bit-wise conjunction of two arrays
+CV_EXPORTS void bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
+
+//! calculates per-element bit-wise "exclusive or" operation
+CV_EXPORTS void bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
+
+//! pixel by pixel right shift of an image by a constant value
+//! supports 1-, 3- and 4-channel images with integer elements
+CV_EXPORTS void rshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null());
+
+//! pixel by pixel left shift of an image by a constant value
+//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
+CV_EXPORTS void lshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes per-element minimum of two arrays (dst = min(src1, src2))
+CV_EXPORTS void min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());
+
+//! computes per-element maximum of two arrays (dst = max(src1, src2))
+CV_EXPORTS void max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());

 //! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma)
-CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst,
+CV_EXPORTS void addWeighted(InputArray src1, double alpha, InputArray src2, double beta, double gamma, OutputArray dst,
                            int dtype = -1, Stream& stream = Stream::Null());

 //! adds scaled array to another one (dst = alpha*src1 + src2)
-static inline void scaleAdd(const GpuMat& src1, double alpha, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())
+static inline void scaleAdd(InputArray src1, double alpha, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
 {
    addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream);
 }

-//! computes element-wise absolute difference of two arrays (c = abs(a - b))
-CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null());
-//! computes element-wise absolute difference of array and scalar (c = abs(a - s))
-CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null());
+//! applies fixed threshold to the image
+CV_EXPORTS double threshold(InputArray src, OutputArray dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());

-//! computes absolute value of each matrix element
-//! supports CV_16S and CV_32F depth
-CV_EXPORTS void abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+//! computes magnitude of complex (x(i).re, x(i).im) vector
+//! supports only CV_32FC2 type
+CV_EXPORTS void magnitude(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null());

-//! computes square of each pixel in an image
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+//! computes squared magnitude of complex (x(i).re, x(i).im) vector
+//! supports only CV_32FC2 type
+CV_EXPORTS void magnitudeSqr(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null());

-//! computes square root of each pixel in an image
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+//! computes magnitude of each (x(i), y(i)) vector
+//! supports only floating-point source
+CV_EXPORTS void magnitude(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null());

-//! computes exponent of each matrix element (b = e**a)
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
+//! computes squared magnitude of each (x(i), y(i)) vector
+//! supports only floating-point source
+CV_EXPORTS void magnitudeSqr(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null());

-//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
+//! computes angle of each (x(i), y(i)) vector
+//! supports only floating-point source
+CV_EXPORTS void phase(InputArray x, InputArray y, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null());

-//! computes power of each matrix element:
-//    (dst(i,j) = pow(     src(i,j) , power), if src.type() is integer
-//    (dst(i,j) = pow(fabs(src(i,j)), power), otherwise
-//! supports all, except depth == CV_64F
-CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null());
+//! converts Cartesian coordinates to polar
+//! supports only floating-point source
+CV_EXPORTS void cartToPolar(InputArray x, InputArray y, OutputArray magnitude, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null());

-//! compares elements of two arrays (c = a <cmpop> b)
-CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
-CV_EXPORTS void compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
+//! converts polar coordinates to Cartesian
+//! supports only floating-point source
+CV_EXPORTS void polarToCart(InputArray magnitude, InputArray angle, OutputArray x, OutputArray y, bool angleInDegrees = false, Stream& stream = Stream::Null());

-//! performs per-elements bit-wise inversion
-CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
+//! makes multi-channel array out of several single-channel arrays
+CV_EXPORTS void merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream = Stream::Null());
+CV_EXPORTS void merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream = Stream::Null());

-//! calculates per-element bit-wise disjunction of two arrays
-CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! calculates per-element bit-wise disjunction of array and scalar
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! calculates per-element bit-wise conjunction of two arrays
-CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! calculates per-element bit-wise conjunction of array and scalar
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! calculates per-element bit-wise "exclusive or" operation
-CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! calculates per-element bit-wise "exclusive or" of array and scalar
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! pixel by pixel right shift of an image by a constant value
-//! supports 1, 3 and 4 channels images with integers elements
-CV_EXPORTS void rshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! pixel by pixel left shift of an image by a constant value
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element minimum of two arrays (dst = min(src1, src2))
-CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element minimum of array and scalar (dst = min(src1, src2))
-CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element maximum of two arrays (dst = max(src1, src2))
-CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element maximum of array and scalar (dst = max(src1, src2))
-CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! implements generalized matrix product algorithm GEMM from BLAS
-CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
-    const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
+//! copies each plane of a multi-channel array to a dedicated array
+CV_EXPORTS void split(InputArray src, GpuMat* dst, Stream& stream = Stream::Null());
+CV_EXPORTS void split(InputArray src, std::vector<GpuMat>& dst, Stream& stream = Stream::Null());

 //! transposes the matrix
 //! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc)
-CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null());
+CV_EXPORTS void transpose(InputArray src1, OutputArray dst, Stream& stream = Stream::Null());

 //! reverses the order of the rows, columns or both in a matrix
 //! supports 1, 3 and 4 channels images with CV_8U, CV_16U, CV_32S or CV_32F depth
-CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null());
+CV_EXPORTS void flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null());

 //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
 //! destination array will have the depth type as lut and the same channels number as source
 //! supports CV_8UC1, CV_8UC3 types
-CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null());
+class CV_EXPORTS LookUpTable : public Algorithm
+{
+public:
+    virtual void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0; // applies the lookup table to src and writes the result to dst
+};
+CV_EXPORTS Ptr<LookUpTable> createLookUpTable(InputArray lut); // factory: returns a LookUpTable built from lut

-//! makes multi-channel array out of several single-channel arrays
-CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null());
+__OPENCV_GPUARITHM_DEPR_BEFORE__ void LUT(InputArray src, InputArray lut, OutputArray dst, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__;
+inline void LUT(InputArray src, InputArray lut, OutputArray dst, Stream& stream) // deprecated one-shot wrapper around LookUpTable
+{
+    createLookUpTable(lut)->transform(src, dst, stream); // a new LookUpTable is created on every call; keep one from createLookUpTable() to reuse it
+}

-//! makes multi-channel array out of several single-channel arrays
-CV_EXPORTS void merge(const std::vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! copies each plane of a multi-channel array to a dedicated array
-CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null());
-
-//! copies each plane of a multi-channel array to a dedicated array
-CV_EXPORTS void split(const GpuMat& src, std::vector<GpuMat>& dst, Stream& stream = Stream::Null());
-
-//! computes magnitude of complex (x(i).re, x(i).im) vector
-//! supports only CV_32FC2 type
-CV_EXPORTS void magnitude(const GpuMat& xy, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes squared magnitude of complex (x(i).re, x(i).im) vector
-//! supports only CV_32FC2 type
-CV_EXPORTS void magnitudeSqr(const GpuMat& xy, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes magnitude of each (x(i), y(i)) vector
-//! supports only floating-point source
-CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes squared magnitude of each (x(i), y(i)) vector
-//! supports only floating-point source
-CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes angle (angle(i)) of each (x(i), y(i)) vector
-//! supports only floating-point source
-CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
-
-//! converts Cartesian coordinates to polar
-//! supports only floating-point source
-CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
-
-//! converts polar coordinates to Cartesian
-//! supports only floating-point source
-CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false, Stream& stream = Stream::Null());
-
-//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
-CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0,
-                          int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat());
-CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double a, double b,
-                          int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf);
+//! copies 2D array to a larger destination array and pads borders with user-specifiable constant
+CV_EXPORTS void copyMakeBorder(InputArray src, OutputArray dst, int top, int bottom, int left, int right, int borderType,
+                               Scalar value = Scalar(), Stream& stream = Stream::Null());

 //! computes norm of array
 //! supports NORM_INF, NORM_L1, NORM_L2
 //! supports all matrices except 64F
-CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);
-CV_EXPORTS double norm(const GpuMat& src1, int normType, GpuMat& buf);
-CV_EXPORTS double norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS double norm(InputArray src1, int normType, InputArray mask, GpuMat& buf);
+static inline double norm(InputArray src, int normType=NORM_L2) // normType defaults to NORM_L2, matching the two-array overload
+{
+    GpuMat buf; // scratch storage for the masked, buffered overload; released on return
+    return norm(src, normType, GpuMat(), buf); // an empty GpuMat is passed as the mask
+}
+static inline double norm(InputArray src, int normType, GpuMat& buf)
+{
+    return norm(src, normType, GpuMat(), buf);
+}

 //! computes norm of the difference between two arrays
 //! supports NORM_INF, NORM_L1, NORM_L2
 //! supports only CV_8UC1 type
-CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);
+CV_EXPORTS double norm(InputArray src1, InputArray src2, GpuMat& buf, int normType=NORM_L2);
+static inline double norm(InputArray src1, InputArray src2, int normType=NORM_L2) // unbuffered form of the difference norm
+{
+    GpuMat scratch; // per-call buffer handed to the buffered overload
+    return norm(src1, src2, scratch, normType);
+}

 //! computes sum of array elements
 //! supports only single channel images
-CV_EXPORTS Scalar sum(const GpuMat& src);
-CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS Scalar sum(InputArray src, InputArray mask, GpuMat& buf);
+static inline Scalar sum(InputArray src) // convenience form: empty mask, throwaway buffer
+{
+    GpuMat scratch;
+    return sum(src, GpuMat(), scratch);
+}
+static inline Scalar sum(InputArray src, GpuMat& buf) // convenience form: empty mask, caller-supplied buffer
+{
+    return sum(src, GpuMat(), buf);
+}

 //! computes sum of array elements absolute values
 //! supports only single channel images
-CV_EXPORTS Scalar absSum(const GpuMat& src);
-CV_EXPORTS Scalar absSum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS Scalar absSum(InputArray src, InputArray mask, GpuMat& buf);
+static inline Scalar absSum(InputArray src) // convenience form: empty mask, throwaway buffer
+{
+    GpuMat scratch;
+    return absSum(src, GpuMat(), scratch);
+}
+static inline Scalar absSum(InputArray src, GpuMat& buf) // convenience form: empty mask, caller-supplied buffer
+{
+    return absSum(src, GpuMat(), buf);
+}

 //! computes squared sum of array elements
 //! supports only single channel images
-CV_EXPORTS Scalar sqrSum(const GpuMat& src);
-CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS Scalar sqrSum(InputArray src, InputArray mask, GpuMat& buf);
+static inline Scalar sqrSum(InputArray src) // convenience form: empty mask, throwaway buffer
+{
+    GpuMat scratch;
+    return sqrSum(src, GpuMat(), scratch);
+}
+static inline Scalar sqrSum(InputArray src, GpuMat& buf) // convenience form: empty mask, caller-supplied buffer
+{
+    return sqrSum(src, GpuMat(), buf);
+}

 //! finds global minimum and maximum array elements and returns their values
-CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
-CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);
+CV_EXPORTS void minMax(InputArray src, double* minVal, double* maxVal, InputArray mask, GpuMat& buf);
+static inline void minMax(InputArray src, double* minVal, double* maxVal=0, InputArray mask=noArray())
+{
+    GpuMat scratch; // per-call buffer handed to the buffered overload
+    minMax(src, minVal, maxVal, mask, scratch);
+}

 //! finds global minimum and maximum array elements and returns their values with locations
-CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
-                          const GpuMat& mask=GpuMat());
-CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
-                          const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);
+CV_EXPORTS void minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
+                          InputArray mask, GpuMat& valbuf, GpuMat& locbuf);
+static inline void minMaxLoc(InputArray src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
+                             InputArray mask=noArray())
+{
+    GpuMat valueScratch, locationScratch; // per-call buffers for the valbuf / locbuf parameters
+    minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valueScratch, locationScratch);
+}

 //! counts non-zero array elements
-CV_EXPORTS int countNonZero(const GpuMat& src);
-CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
+CV_EXPORTS int countNonZero(InputArray src, GpuMat& buf);
+static inline int countNonZero(InputArray src) // unbuffered form; takes InputArray like the buffered overload it forwards to
+{
+    GpuMat buf; // scratch storage for the buffered overload; released on return
+    return countNonZero(src, buf);
+}

 //! reduces a matrix to a vector
-CV_EXPORTS void reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());
+CV_EXPORTS void reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());

 //! computes mean value and standard deviation of all or selected array elements
 //! supports only CV_8UC1 type
-CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);
-//! buffered version
-CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
+CV_EXPORTS void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
+static inline void meanStdDev(InputArray src, Scalar& mean, Scalar& stddev) // unbuffered form
+{
+    GpuMat scratch; // per-call buffer handed to the buffered overload
+    meanStdDev(src, mean, stddev, scratch);
+}

 //! computes the standard deviation of integral images
 //! supports only CV_32SC1 source type and CV_32FC1 sqr type
 //! output will have CV_32FC1 type
-CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null());
+CV_EXPORTS void rectStdDev(InputArray src, InputArray sqr, OutputArray dst, Rect rect, Stream& stream = Stream::Null());

-//! copies 2D array to a larger destination array and pads borders with user-specifiable constant
-CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType,
-                               const Scalar& value = Scalar(), Stream& stream = Stream::Null());
-
-//! applies fixed threshold to the image
-CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
+//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
+CV_EXPORTS void normalize(InputArray src, OutputArray dst, double alpha, double beta,
+                          int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf);
+static inline void normalize(InputArray src, OutputArray dst, double alpha = 1, double beta = 0,
+                             int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray())
+{
+    // Unbuffered form: both scratch buffers exist only for the duration of this call.
+    GpuMat normScratch, cvtScratch;
+    normalize(src, dst, alpha, beta, norm_type, dtype, mask, normScratch, cvtScratch);
+}

 //! computes the integral image
 //! sum will have CV_32S type, but will contain unsigned int values
 //! supports only CV_8UC1 source type
-CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null());
-//! buffered version
-CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null());
+CV_EXPORTS void integral(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null());
+static inline void integralBuffered(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null()) // alternate name for the buffered integral() overload
+{
+    integral(src, sum, buffer, stream); // pure forwarder: same arguments, same order
+}
+static inline void integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null()) // unbuffered form
+{
+    GpuMat scratch; // per-call buffer handed to the buffered overload
+    integral(src, sum, scratch, stream);
+}

 //! computes squared integral image
 //! result matrix will have 64F type, but will contain 64U values
 //! supports source images of 8UC1 type only
-CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null());
+CV_EXPORTS void sqrIntegral(InputArray src, OutputArray sqsum, GpuMat& buf, Stream& stream = Stream::Null());
+static inline void sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null()) // unbuffered form
+{
+    GpuMat scratch; // per-call buffer handed to the buffered overload
+    sqrIntegral(src, sqsum, scratch, stream);
+}
+
+CV_EXPORTS void gemm(InputArray src1, InputArray src2, double alpha,
+                     InputArray src3, double beta, OutputArray dst, int flags = 0, Stream& stream = Stream::Null());

 //! performs per-element multiplication of two full (not packed) Fourier spectrums
 //! supports 32FC2 matrixes only (interleaved format)
-CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null());
+CV_EXPORTS void mulSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, bool conjB=false, Stream& stream = Stream::Null());

 //! performs per-element multiplication of two full (not packed) Fourier spectrums
 //! supports 32FC2 matrixes only (interleaved format)
-CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());
+CV_EXPORTS void mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());

 //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
 //! Param dft_size is the size of DFT transform.
@ -318,9 +372,25 @@ CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c
 //! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved.
 //!
 //! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format.
-CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());
+CV_EXPORTS void dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());

-struct CV_EXPORTS ConvolveBuf
+//! computes convolution (or cross-correlation) of two images using discrete Fourier transform
+//! supports source images of 32FC1 type only
+//! result matrix will have 32FC1 type
+class CV_EXPORTS Convolution : public Algorithm
+{
+public:
+    virtual void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) = 0; // ccorr selects cross-correlation instead of convolution
+};
+CV_EXPORTS Ptr<Convolution> createConvolution(Size user_block_size = Size()); // factory: returns a Convolution; user_block_size defaults to an empty Size
+
+__OPENCV_GPUARITHM_DEPR_BEFORE__ void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__;
+inline void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr , Stream& stream) // deprecated one-shot wrapper around Convolution
+{
+    createConvolution()->convolve(image, templ, result, ccorr, stream); // a new Convolution with the default block size is created on every call
+}
+
+struct ConvolveBuf
 {
    Size result_size;
    Size block_size;
@ -331,16 +401,19 @@ struct CV_EXPORTS ConvolveBuf
    GpuMat image_spect, templ_spect, result_spect;
    GpuMat image_block, templ_block, result_data;

-    void create(Size image_size, Size templ_size);
-    static Size estimateBlockSize(Size result_size, Size templ_size);
+    void create(Size, Size){}
+    static Size estimateBlockSize(Size, Size){ return Size(); }
 };

-//! computes convolution (or cross-correlation) of two images using discrete Fourier transform
-//! supports source images of 32FC1 type only
-//! result matrix will have 32FC1 type
-CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false);
-CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null());
+__OPENCV_GPUARITHM_DEPR_BEFORE__ void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null()) __OPENCV_GPUARITHM_DEPR_AFTER__;
+inline void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr, ConvolveBuf& buf, Stream& stream) // deprecated wrapper kept for ConvolveBuf-based callers
+{
+    createConvolution(buf.user_block_size)->convolve(image, templ, result, ccorr, stream); // only buf.user_block_size is read here; a new Convolution is created on every call
+}

 }} // namespace cv { namespace gpu {

+#undef __OPENCV_GPUARITHM_DEPR_BEFORE__
+#undef __OPENCV_GPUARITHM_DEPR_AFTER__
+
 #endif /* __OPENCV_GPUARITHM_HPP__ */
--- a/modules/gpuarithm/perf/perf_arithm.cpp
+++ b/modules/gpuarithm/perf/perf_arithm.cpp
@ -228,10 +228,11 @@ PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
        cv::gpu::GpuMat d_templ = cv::gpu::createContinuous(templ_size, templ_size, CV_32FC1);
        d_templ.upload(templ);

-        cv::gpu::GpuMat dst;
-        cv::gpu::ConvolveBuf d_buf;
+        cv::Ptr<cv::gpu::Convolution> convolution = cv::gpu::createConvolution();

-        TEST_CYCLE() cv::gpu::convolve(d_image, d_templ, dst, ccorr, d_buf);
+        cv::gpu::GpuMat dst;
+
+        TEST_CYCLE() convolution->convolve(d_image, d_templ, dst, ccorr);

        GPU_SANITY_CHECK(dst);
    }
@ -265,7 +266,7 @@ PERF_TEST_P(Sz, Integral,
        cv::gpu::GpuMat dst;
        cv::gpu::GpuMat d_buf;

-        TEST_CYCLE() cv::gpu::integralBuffered(d_src, dst, d_buf);
+        TEST_CYCLE() cv::gpu::integral(d_src, dst, d_buf);

        GPU_SANITY_CHECK(dst);
    }
@ -293,9 +294,9 @@ PERF_TEST_P(Sz, IntegralSqr,
    if (PERF_RUN_GPU())
    {
        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
+        cv::gpu::GpuMat dst, buf;

-        TEST_CYCLE() cv::gpu::sqrIntegral(d_src, dst);
+        TEST_CYCLE() cv::gpu::sqrIntegral(d_src, dst, buf);

        GPU_SANITY_CHECK(dst);
    }
--- a/modules/gpuarithm/perf/perf_core.cpp
+++ b/modules/gpuarithm/perf/perf_core.cpp
@ -224,10 +224,12 @@ PERF_TEST_P(Sz_Type, LutOneChannel,

    if (PERF_RUN_GPU())
    {
+        cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;

-        TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst);
+        TEST_CYCLE() lutAlg->transform(d_src, dst);

        GPU_SANITY_CHECK(dst);
    }
@ -259,10 +261,12 @@ PERF_TEST_P(Sz_Type, LutMultiChannel,

    if (PERF_RUN_GPU())
    {
+        cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;

-        TEST_CYCLE() cv::gpu::LUT(d_src, lut, dst);
+        TEST_CYCLE() lutAlg->transform(d_src, dst);

        GPU_SANITY_CHECK(dst);
    }
--- a/modules/gpuarithm/perf/perf_reductions.cpp
+++ b/modules/gpuarithm/perf/perf_reductions.cpp
@ -108,9 +108,10 @@ PERF_TEST_P(Sz_Norm, NormDiff,
    {
        const cv::gpu::GpuMat d_src1(src1);
        const cv::gpu::GpuMat d_src2(src2);
+        cv::gpu::GpuMat d_buf;
        double gpu_dst;

-        TEST_CYCLE() gpu_dst = cv::gpu::norm(d_src1, d_src2, normType);
+        TEST_CYCLE() gpu_dst = cv::gpu::norm(d_src1, d_src2, d_buf, normType);

        SANITY_CHECK(gpu_dst);

--- a/modules/gpuarithm/src/arithm.cpp
+++ b/modules/gpuarithm/src/arithm.cpp
@ -47,21 +47,14 @@ using namespace cv::gpu;

 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

-void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double, GpuMat&, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::gemm(InputArray, InputArray, double, InputArray, double, OutputArray, int, Stream&) { throw_no_cuda(); }

-void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::mulSpectrums(InputArray, InputArray, OutputArray, int, bool, Stream&) { throw_no_cuda(); }
+void cv::gpu::mulAndScaleSpectrums(InputArray, InputArray, OutputArray, int, float, bool, Stream&) { throw_no_cuda(); }

-void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::dft(InputArray, OutputArray, Size, int, Stream&) { throw_no_cuda(); }

-void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool, Stream&) { throw_no_cuda(); }
-void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::ConvolveBuf::create(Size, Size) { throw_no_cuda(); }
-void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_no_cuda(); }
-void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_no_cuda(); }
+Ptr<Convolution> cv::gpu::createConvolution(Size) { throw_no_cuda(); return Ptr<Convolution>(); }

 #else /* !defined (HAVE_CUDA) */

@ -169,23 +162,27 @@ namespace
 ////////////////////////////////////////////////////////////////////////
 // gemm

-void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
+void cv::gpu::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray _src3, double beta, OutputArray _dst, int flags, Stream& stream)
 {
 #ifndef HAVE_CUBLAS
-    (void)src1;
-    (void)src2;
-    (void)alpha;
-    (void)src3;
-    (void)beta;
-    (void)dst;
-    (void)flags;
-    (void)stream;
-    CV_Error(cv::Error::StsNotImplemented, "The library was build without CUBLAS");
+    (void) _src1;
+    (void) _src2;
+    (void) alpha;
+    (void) _src3;
+    (void) beta;
+    (void) _dst;
+    (void) flags;
+    (void) stream;
+    CV_Error(Error::StsNotImplemented, "The library was build without CUBLAS");
 #else
    // CUBLAS works with column-major matrices

-    CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
-    CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));
+    GpuMat src1 = _src1.getGpuMat();
+    GpuMat src2 = _src2.getGpuMat();
+    GpuMat src3 = _src3.getGpuMat();
+
+    CV_Assert( src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2 );
+    CV_Assert( src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()) );

    if (src1.depth() == CV_64F)
    {
@ -208,10 +205,11 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
    Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
    Size dstSize(src2Size.width, src1Size.height);

-    CV_Assert(src1Size.width == src2Size.height);
-    CV_Assert(src3.empty() || src3Size == dstSize);
+    CV_Assert( src1Size.width == src2Size.height );
+    CV_Assert( src3.empty() || src3Size == dstSize );

-    dst.create(dstSize, src1.type());
+    _dst.create(dstSize, src1.type());
+    GpuMat dst = _dst.getGpuMat();

    if (beta != 0)
    {
@ -294,116 +292,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
 #endif
 }

-////////////////////////////////////////////////////////////////////////
-// integral
-
-void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
-{
-    GpuMat buffer;
-    gpu::integralBuffered(src, sum, buffer, s);
-}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
-{
-    CV_Assert(src.type() == CV_8UC1);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    cv::Size whole;
-    cv::Point offset;
-
-    src.locateROI(whole, offset);
-
-    if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
-        && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
-    {
-        ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
-
-        cv::gpu::cudev::imgproc::shfl_integral_gpu(src, buffer, stream);
-
-        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-
-        sum.setTo(Scalar::all(0), s);
-
-        GpuMat inner = sum(Rect(1, 1, src.cols, src.rows));
-        GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
-
-        res.copyTo(inner, s);
-    }
-    else
-    {
-#ifndef HAVE_OPENCV_GPULEGACY
-    throw_no_cuda();
-#else
-        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
-
-        NcvSize32u roiSize;
-        roiSize.width = src.cols;
-        roiSize.height = src.rows;
-
-        cudaDeviceProp prop;
-        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
-
-        Ncv32u bufSize;
-        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
-        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
-
-        NppStStreamHandler h(stream);
-
-        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
-            sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-#endif
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// sqrIntegral
-
-void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
-{
-#ifndef HAVE_OPENCV_GPULEGACY
-    (void) src;
-    (void) sqsum;
-    (void) s;
-    throw_no_cuda();
-#else
-    CV_Assert(src.type() == CV_8U);
-
-    NcvSize32u roiSize;
-    roiSize.width = src.cols;
-    roiSize.height = src.rows;
-
-    cudaDeviceProp prop;
-    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
-
-    Ncv32u bufSize;
-    ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
-    GpuMat buf(1, bufSize, CV_8U);
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    NppStStreamHandler h(stream);
-
-    sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
-    ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
-            sqsum.ptr<Ncv64u>(0), static_cast<int>(sqsum.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-#endif
-}
-
 //////////////////////////////////////////////////////////////////////////////
 // mulSpectrums

@ -418,12 +306,12 @@ namespace cv { namespace gpu { namespace cudev

 #endif

-void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream)
+void cv::gpu::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
 {
 #ifndef HAVE_CUFFT
-    (void) a;
-    (void) b;
-    (void) c;
+    (void) _src1;
+    (void) _src2;
+    (void) _dst;
    (void) flags;
    (void) conjB;
    (void) stream;
@ -432,16 +320,19 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flag
    (void) flags;

    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, PtrStepSz<cufftComplex>, cudaStream_t stream);
-
    static Caller callers[] = { cudev::mulSpectrums, cudev::mulSpectrums_CONJ };

-    CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
-    CV_Assert(a.size() == b.size());
+    GpuMat src1 = _src1.getGpuMat();
+    GpuMat src2 = _src2.getGpuMat();

-    c.create(a.size(), CV_32FC2);
+    CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
+    CV_Assert( src1.size() == src2.size() );
+
+    _dst.create(src1.size(), CV_32FC2);
+    GpuMat dst = _dst.getGpuMat();

    Caller caller = callers[(int)conjB];
-    caller(a, b, c, StreamAccessor::getStream(stream));
+    caller(src1, src2, dst, StreamAccessor::getStream(stream));
 #endif
 }

@ -459,12 +350,12 @@ namespace cv { namespace gpu { namespace cudev

 #endif

-void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream)
+void cv::gpu::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
 {
 #ifndef HAVE_CUFFT
-    (void) a;
-    (void) b;
-    (void) c;
+    (void) _src1;
+    (void) _src2;
+    (void) _dst;
    (void) flags;
    (void) scale;
    (void) conjB;
@ -476,53 +367,57 @@ void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c,
    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream);
    static Caller callers[] = { cudev::mulAndScaleSpectrums, cudev::mulAndScaleSpectrums_CONJ };

-    CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
-    CV_Assert(a.size() == b.size());
+    GpuMat src1 = _src1.getGpuMat();
+    GpuMat src2 = _src2.getGpuMat();

-    c.create(a.size(), CV_32FC2);
+    CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2);
+    CV_Assert( src1.size() == src2.size() );
+
+    _dst.create(src1.size(), CV_32FC2);
+    GpuMat dst = _dst.getGpuMat();

    Caller caller = callers[(int)conjB];
-    caller(a, b, scale, c, StreamAccessor::getStream(stream));
+    caller(src1, src2, scale, dst, StreamAccessor::getStream(stream));
 #endif
 }

 //////////////////////////////////////////////////////////////////////////////
 // dft

-void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stream& stream)
+void cv::gpu::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream)
 {
 #ifndef HAVE_CUFFT
-    (void) src;
-    (void) dst;
+    (void) _src;
+    (void) _dst;
    (void) dft_size;
    (void) flags;
    (void) stream;
    throw_no_cuda();
 #else
+    GpuMat src = _src.getGpuMat();

-    CV_Assert(src.type() == CV_32F || src.type() == CV_32FC2);
+    CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );

    // We don't support unpacked output (in the case of real input)
-    CV_Assert(!(flags & DFT_COMPLEX_OUTPUT));
+    CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) );

-    bool is_1d_input = (dft_size.height == 1) || (dft_size.width == 1);
-    int is_row_dft = flags & DFT_ROWS;
-    int is_scaled_dft = flags & DFT_SCALE;
-    int is_inverse = flags & DFT_INVERSE;
-    bool is_complex_input = src.channels() == 2;
-    bool is_complex_output = !(flags & DFT_REAL_OUTPUT);
+    const bool is_1d_input       = (dft_size.height == 1) || (dft_size.width == 1);
+    const bool is_row_dft        = (flags & DFT_ROWS) != 0;
+    const bool is_scaled_dft     = (flags & DFT_SCALE) != 0;
+    const bool is_inverse        = (flags & DFT_INVERSE) != 0;
+    const bool is_complex_input  = src.channels() == 2;
+    const bool is_complex_output = !(flags & DFT_REAL_OUTPUT);

    // We don't support real-to-real transform
-    CV_Assert(is_complex_input || is_complex_output);
+    CV_Assert( is_complex_input || is_complex_output );

-    GpuMat src_data;
+    GpuMat src_cont = src;

    // Make sure here we work with the continuous input,
    // as CUFFT can't handle gaps
-    src_data = src;
-    createContinuous(src.rows, src.cols, src.type(), src_data);
-    if (src_data.data != src.data)
-        src.copyTo(src_data);
+    createContinuous(src.rows, src.cols, src.type(), src_cont);
+    if (src_cont.data != src.data)
+        src.copyTo(src_cont, stream);

    Size dft_size_opt = dft_size;
    if (is_1d_input && !is_row_dft)
@ -532,17 +427,17 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stre
        dft_size_opt.height = std::min(dft_size.width, dft_size.height);
    }

+    CV_Assert( dft_size_opt.width > 1 );
+
    cufftType dft_type = CUFFT_R2C;
    if (is_complex_input)
        dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;

-    CV_Assert(dft_size_opt.width > 1);
-
    cufftHandle plan;
    if (is_1d_input || is_row_dft)
-        cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height);
+        cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) );
    else
-        cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type);
+        cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) );

    cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );

@ -550,171 +445,191 @@ void cv::gpu::dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags, Stre
    {
        if (is_complex_output)
        {
-            createContinuous(dft_size, CV_32FC2, dst);
+            createContinuous(dft_size, CV_32FC2, _dst);
+            GpuMat dst = _dst.getGpuMat();
+
            cufftSafeCall(cufftExecC2C(
-                    plan, src_data.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
+                    plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
                    is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
        }
        else
        {
-            createContinuous(dft_size, CV_32F, dst);
+            createContinuous(dft_size, CV_32F, _dst);
+            GpuMat dst = _dst.getGpuMat();
+
            cufftSafeCall(cufftExecC2R(
-                    plan, src_data.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
+                    plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
        }
    }
    else
    {
        // We could swap dft_size for efficiency. Here we must reflect it
        if (dft_size == dft_size_opt)
-            createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, dst);
+            createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst);
        else
-            createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, dst);
+            createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst);
+
+        GpuMat dst = _dst.getGpuMat();

        cufftSafeCall(cufftExecR2C(
-                plan, src_data.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
+                plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
    }

-    cufftSafeCall(cufftDestroy(plan));
+    cufftSafeCall( cufftDestroy(plan) );

    if (is_scaled_dft)
-        multiply(dst, Scalar::all(1. / dft_size.area()), dst, 1, -1, stream);
+        gpu::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);

 #endif
 }

 //////////////////////////////////////////////////////////////////////////////
-// convolve
+// Convolution

-void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size)
+#ifdef HAVE_CUFFT
+
+namespace
 {
-    result_size = Size(image_size.width - templ_size.width + 1,
-                       image_size.height - templ_size.height + 1);
-
-    block_size = user_block_size;
-    if (user_block_size.width == 0 || user_block_size.height == 0)
-        block_size = estimateBlockSize(result_size, templ_size);
-
-    dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.)));
-    dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.)));
-
-    // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192),
-    // see CUDA Toolkit 4.1 CUFFT Library Programming Guide
-    if (dft_size.width > 8192)
-        dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1);
-    if (dft_size.height > 8192)
-        dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1);
-
-    // To avoid wasting time doing small DFTs
-    dft_size.width = std::max(dft_size.width, 512);
-    dft_size.height = std::max(dft_size.height, 512);
-
-    createContinuous(dft_size, CV_32F, image_block);
-    createContinuous(dft_size, CV_32F, templ_block);
-    createContinuous(dft_size, CV_32F, result_data);
-
-    spect_len = dft_size.height * (dft_size.width / 2 + 1);
-    createContinuous(1, spect_len, CV_32FC2, image_spect);
-    createContinuous(1, spect_len, CV_32FC2, templ_spect);
-    createContinuous(1, spect_len, CV_32FC2, result_spect);
-
-    // Use maximum result matrix block size for the estimated DFT block size
-    block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width);
-    block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height);
-}
-
-
-Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size*/)
-{
-    int width = (result_size.width + 2) / 3;
-    int height = (result_size.height + 2) / 3;
-    width = std::min(width, result_size.width);
-    height = std::min(height, result_size.height);
-    return Size(width, height);
-}
-
-
-void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr)
-{
-    ConvolveBuf buf;
-    gpu::convolve(image, templ, result, ccorr, buf);
-}
-
-void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)
-{
-#ifndef HAVE_CUFFT
-    (void) image;
-    (void) templ;
-    (void) result;
-    (void) ccorr;
-    (void) buf;
-    (void) stream;
-    throw_no_cuda();
-#else
-    using namespace cv::gpu::cudev::imgproc;
-
-    CV_Assert(image.type() == CV_32F);
-    CV_Assert(templ.type() == CV_32F);
-
-    buf.create(image.size(), templ.size());
-    result.create(buf.result_size, CV_32F);
-
-    Size& block_size = buf.block_size;
-    Size& dft_size = buf.dft_size;
-
-    GpuMat& image_block = buf.image_block;
-    GpuMat& templ_block = buf.templ_block;
-    GpuMat& result_data = buf.result_data;
-
-    GpuMat& image_spect = buf.image_spect;
-    GpuMat& templ_spect = buf.templ_spect;
-    GpuMat& result_spect = buf.result_spect;
-
-    cufftHandle planR2C, planC2R;
-    cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
-    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
-
-    cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
-    cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
-
-    GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);
-    gpu::copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
-                        templ_block.cols - templ_roi.cols, 0, Scalar(), stream);
-
-    cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(),
-                               templ_spect.ptr<cufftComplex>()));
-
-    // Process all blocks of the result matrix
-    for (int y = 0; y < result.rows; y += block_size.height)
+    class ConvolutionImpl : public Convolution
    {
-        for (int x = 0; x < result.cols; x += block_size.width)
-        {
-            Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
-                                std::min(y + dft_size.height, image.rows) - y);
-            GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
-                             image.step);
-            gpu::copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
-                                0, image_block.cols - image_roi.cols, 0, Scalar(), stream);
+    public:
+        explicit ConvolutionImpl(Size user_block_size_) : user_block_size(user_block_size_) {}

-            cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
-                                       image_spect.ptr<cufftComplex>()));
-            gpu::mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
-                                      1.f / dft_size.area(), ccorr, stream);
-            cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
-                                       result_data.ptr<cufftReal>()));
+        void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null());

-            Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
-                                 std::min(y + block_size.height, result.rows) - y);
-            GpuMat result_roi(result_roi_size, result.type(),
-                              (void*)(result.ptr<float>(y) + x), result.step);
-            GpuMat result_block(result_roi_size, result_data.type(),
-                                result_data.ptr(), result_data.step);
+    private:
+        void create(Size image_size, Size templ_size);
+        static Size estimateBlockSize(Size result_size);

-            result_block.copyTo(result_roi, stream);
-        }
+        Size result_size;
+        Size block_size;
+        Size user_block_size;
+        Size dft_size;
+        int spect_len;
+
+        GpuMat image_spect, templ_spect, result_spect;
+        GpuMat image_block, templ_block, result_data;
+    };
+
+    void ConvolutionImpl::create(Size image_size, Size templ_size)
+    {
+        result_size = Size(image_size.width - templ_size.width + 1,
+                           image_size.height - templ_size.height + 1);
+
+        block_size = user_block_size;
+        if (user_block_size.width == 0 || user_block_size.height == 0)
+            block_size = estimateBlockSize(result_size);
+
+        dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.)));
+        dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.)));
+
+        // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192),
+        // see CUDA Toolkit 4.1 CUFFT Library Programming Guide
+        if (dft_size.width > 8192)
+            dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1);
+        if (dft_size.height > 8192)
+            dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1);
+
+        // To avoid wasting time doing small DFTs
+        dft_size.width = std::max(dft_size.width, 512);
+        dft_size.height = std::max(dft_size.height, 512);
+
+        createContinuous(dft_size, CV_32F, image_block);
+        createContinuous(dft_size, CV_32F, templ_block);
+        createContinuous(dft_size, CV_32F, result_data);
+
+        spect_len = dft_size.height * (dft_size.width / 2 + 1);
+        createContinuous(1, spect_len, CV_32FC2, image_spect);
+        createContinuous(1, spect_len, CV_32FC2, templ_spect);
+        createContinuous(1, spect_len, CV_32FC2, result_spect);
+
+        // Use maximum result matrix block size for the estimated DFT block size
+        block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width);
+        block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height);
    }

-    cufftSafeCall(cufftDestroy(planR2C));
-    cufftSafeCall(cufftDestroy(planC2R));
+    Size ConvolutionImpl::estimateBlockSize(Size result_size)
+    {
+        int width = (result_size.width + 2) / 3;
+        int height = (result_size.height + 2) / 3;
+        width = std::min(width, result_size.width);
+        height = std::min(height, result_size.height);
+        return Size(width, height);
+    }
+
+    void ConvolutionImpl::convolve(InputArray _image, InputArray _templ, OutputArray _result, bool ccorr, Stream& _stream)
+    {
+        GpuMat image = _image.getGpuMat();
+        GpuMat templ = _templ.getGpuMat();
+
+        CV_Assert( image.type() == CV_32FC1 );
+        CV_Assert( templ.type() == CV_32FC1 );
+
+        create(image.size(), templ.size());
+
+        _result.create(result_size, CV_32FC1);
+        GpuMat result = _result.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        cufftHandle planR2C, planC2R;
+        cufftSafeCall( cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R) );
+        cufftSafeCall( cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C) );
+
+        cufftSafeCall( cufftSetStream(planR2C, stream) );
+        cufftSafeCall( cufftSetStream(planC2R, stream) );
+
+        GpuMat templ_roi(templ.size(), CV_32FC1, templ.data, templ.step);
+        gpu::copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
+                            templ_block.cols - templ_roi.cols, 0, Scalar(), _stream);
+
+        cufftSafeCall( cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(), templ_spect.ptr<cufftComplex>()) );
+
+        // Process all blocks of the result matrix
+        for (int y = 0; y < result.rows; y += block_size.height)
+        {
+            for (int x = 0; x < result.cols; x += block_size.width)
+            {
+                Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
+                                    std::min(y + dft_size.height, image.rows) - y);
+                GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
+                                 image.step);
+                gpu::copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
+                                    0, image_block.cols - image_roi.cols, 0, Scalar(), _stream);
+
+                cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
+                                           image_spect.ptr<cufftComplex>()));
+                gpu::mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
+                                          1.f / dft_size.area(), ccorr, _stream);
+                cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
+                                           result_data.ptr<cufftReal>()));
+
+                Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
+                                     std::min(y + block_size.height, result.rows) - y);
+                GpuMat result_roi(result_roi_size, result.type(),
+                                  (void*)(result.ptr<float>(y) + x), result.step);
+                GpuMat result_block(result_roi_size, result_data.type(),
+                                    result_data.ptr(), result_data.step);
+
+                result_block.copyTo(result_roi, _stream);
+            }
+        }
+
+        cufftSafeCall( cufftDestroy(planR2C) );
+        cufftSafeCall( cufftDestroy(planC2R) );
+    }
+}
+
+#endif
+
+Ptr<Convolution> cv::gpu::createConvolution(Size user_block_size)
+{
+#ifndef HAVE_CUFFT
+    (void) user_block_size;
+    CV_Error(Error::StsNotImplemented, "The library was build without CUFFT");
+    return Ptr<Convolution>();
+#else
+    return new ConvolutionImpl(user_block_size);
 #endif
 }

--- a/modules/gpuarithm/src/core.cpp
+++ b/modules/gpuarithm/src/core.cpp
@ -47,19 +47,19 @@ using namespace cv::gpu;

 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

-void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
-void cv::gpu::merge(const std::vector<GpuMat>& /*src*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
+void cv::gpu::merge(const GpuMat*, size_t, OutputArray, Stream&) { throw_no_cuda(); }
+void cv::gpu::merge(const std::vector<GpuMat>&, OutputArray, Stream&) { throw_no_cuda(); }

-void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
-void cv::gpu::split(const GpuMat& /*src*/, std::vector<GpuMat>& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
+void cv::gpu::split(InputArray, GpuMat*, Stream&) { throw_no_cuda(); }
+void cv::gpu::split(InputArray, std::vector<GpuMat>&, Stream&) { throw_no_cuda(); }

-void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::transpose(InputArray, OutputArray, Stream&) { throw_no_cuda(); }

-void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::flip(InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }

-void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<LookUpTable> cv::gpu::createLookUpTable(InputArray) { throw_no_cuda(); return Ptr<LookUpTable>(); }

-void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_no_cuda(); }
+void cv::gpu::copyMakeBorder(InputArray, OutputArray, int, int, int, int, int, Scalar, Stream&) { throw_no_cuda(); }

 #else /* !defined (HAVE_CUDA) */

@ -70,22 +70,27 @@ namespace cv { namespace gpu { namespace cudev
 {
    namespace split_merge
    {
-        void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
-        void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
+        void merge(const PtrStepSzb* src, PtrStepSzb& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
+        void split(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
    }
 }}}

 namespace
 {
-    void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream)
+    void merge_caller(const GpuMat* src, size_t n, OutputArray _dst, Stream& stream)
    {
-        using namespace ::cv::gpu::cudev::split_merge;
+        CV_Assert( src != 0 );
+        CV_Assert( n > 0 && n <= 4 );

-        CV_Assert(src);
-        CV_Assert(n > 0);
+        const int depth = src[0].depth();
+        const Size size = src[0].size();

-        int depth = src[0].depth();
-        Size size = src[0].size();
+        for (size_t i = 0; i < n; ++i)
+        {
+            CV_Assert( src[i].size() == size );
+            CV_Assert( src[i].depth() == depth );
+            CV_Assert( src[i].channels() == 1 );
+        }

        if (depth == CV_64F)
        {
@ -93,43 +98,32 @@ namespace
                CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
        }

-        bool single_channel_only = true;
-        int total_channels = 0;
-
-        for (size_t i = 0; i < n; ++i)
+        if (n == 1)
        {
-            CV_Assert(src[i].size() == size);
-            CV_Assert(src[i].depth() == depth);
-            single_channel_only = single_channel_only && src[i].channels() == 1;
-            total_channels += src[i].channels();
+            src[0].copyTo(_dst, stream);
        }
-
-        CV_Assert(single_channel_only);
-        CV_Assert(total_channels <= 4);
-
-        if (total_channels == 1)
-            src[0].copyTo(dst);
        else
        {
-            dst.create(size, CV_MAKETYPE(depth, total_channels));
+            _dst.create(size, CV_MAKE_TYPE(depth, (int)n));
+            GpuMat dst = _dst.getGpuMat();

            PtrStepSzb src_as_devmem[4];
            for(size_t i = 0; i < n; ++i)
                src_as_devmem[i] = src[i];

            PtrStepSzb dst_as_devmem(dst);
-            merge_caller(src_as_devmem, dst_as_devmem, total_channels, CV_ELEM_SIZE(depth), stream);
+            cv::gpu::cudev::split_merge::merge(src_as_devmem, dst_as_devmem, (int)n, CV_ELEM_SIZE(depth), StreamAccessor::getStream(stream));
        }
    }

-    void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream)
+    void split_caller(const GpuMat& src, GpuMat* dst, Stream& stream)
    {
-        using namespace ::cv::gpu::cudev::split_merge;
+        CV_Assert( dst != 0 );

-        CV_Assert(dst);
+        const int depth = src.depth();
+        const int num_channels = src.channels();

-        int depth = src.depth();
-        int num_channels = src.channels();
+        CV_Assert( num_channels <= 4 );

        if (depth == CV_64F)
        {
@ -139,45 +133,45 @@ namespace

        if (num_channels == 1)
        {
-            src.copyTo(dst[0]);
+            src.copyTo(dst[0], stream);
            return;
        }

        for (int i = 0; i < num_channels; ++i)
            dst[i].create(src.size(), depth);

-        CV_Assert(num_channels <= 4);
-
        PtrStepSzb dst_as_devmem[4];
        for (int i = 0; i < num_channels; ++i)
            dst_as_devmem[i] = dst[i];

        PtrStepSzb src_as_devmem(src);
-        split_caller(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), stream);
+        cv::gpu::cudev::split_merge::split(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), StreamAccessor::getStream(stream));
    }
 }

-void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream)
+// Merges n GpuMat planes, given as a pointer plus count, into dst.
+// Pure forwarding wrapper: argument checks and the kernel call are done by
+// merge_caller from the anonymous namespace in this file.
+void cv::gpu::merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream)
 {
-    ::merge(src, n, dst, StreamAccessor::getStream(stream));
+    merge_caller(src, n, dst, stream);
 }


-void cv::gpu::merge(const std::vector<GpuMat>& src, GpuMat& dst, Stream& stream)
+// Vector overload of merge: forwards the vector's storage and size to
+// merge_caller, which validates the planes and launches the merge.
+void cv::gpu::merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream)
 {
-    ::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));
+    // Indexing an empty vector is undefined behaviour, so reject it here
+    // instead of relying on the checks performed after &src[0] was evaluated.
+    CV_Assert( !src.empty() );
+    merge_caller(&src[0], src.size(), dst, stream);
 }

-void cv::gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream)
+// Splits a multi-channel image into the caller-provided array of planes.
+// The input is unwrapped to a GpuMat and handed to split_caller, which writes
+// one plane per source channel into dst[0 .. channels-1].
+void cv::gpu::split(InputArray _src, GpuMat* dst, Stream& stream)
 {
-    ::split(src, dst, StreamAccessor::getStream(stream));
+    GpuMat src = _src.getGpuMat();
+    split_caller(src, dst, stream);
 }

-void cv::gpu::split(const GpuMat& src, std::vector<GpuMat>& dst, Stream& stream)
+// Vector overload of split: dst is resized to the source channel count and
+// then filled by split_caller. The call is skipped when the source reports
+// no channels, so &dst[0] is never taken on an empty vector.
+void cv::gpu::split(InputArray _src, std::vector<GpuMat>& dst, Stream& stream)
 {
+    GpuMat src = _src.getGpuMat();
    dst.resize(src.channels());
    if(src.channels() > 0)
-        ::split(src, &dst[0], StreamAccessor::getStream(stream));
+        split_caller(src, &dst[0], stream);
 }

 ////////////////////////////////////////////////////////////////////////
@ -188,13 +182,16 @@ namespace arithm
    template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream);
 }

-void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
+void cv::gpu::transpose(InputArray _src, OutputArray _dst, Stream& _stream)
 {
+    GpuMat src = _src.getGpuMat();
+
    CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 );

-    dst.create( src.cols, src.rows, src.type() );
+    _dst.create( src.cols, src.rows, src.type() );
+    GpuMat dst = _dst.getGpuMat();

-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);

    if (src.elemSize() == 1)
    {
@ -266,7 +263,7 @@ namespace
    };
 }

-void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
+void cv::gpu::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& stream)
 {
    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
    static const func_t funcs[6][4] =
@ -279,10 +276,13 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
        {NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
    };

+    GpuMat src = _src.getGpuMat();
+
    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
    CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);

-    dst.create(src.size(), src.type());
+    _dst.create(src.size(), src.type());
+    GpuMat dst = _dst.getGpuMat();

    funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
 }
@ -290,93 +290,214 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
 ////////////////////////////////////////////////////////////////////////
 // LUT

-void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
+#if (CUDA_VERSION >= 5000)
+
+namespace
 {
-    const int cn = src.channels();
-
-    CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
-    CV_Assert( lut.depth() == CV_8U );
-    CV_Assert( lut.channels() == 1 || lut.channels() == cn );
-    CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );
-
-    dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn));
-
-    NppiSize sz;
-    sz.height = src.rows;
-    sz.width = src.cols;
-
-    Mat nppLut;
-    lut.convertTo(nppLut, CV_32S);
-
-    int nValues3[] = {256, 256, 256};
-
-    Npp32s pLevels[256];
-    for (int i = 0; i < 256; ++i)
-        pLevels[i] = i;
-
-    const Npp32s* pLevels3[3];
-
-#if (CUDA_VERSION <= 4020)
-    pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
-#else
-    GpuMat d_pLevels;
-    d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
-    pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
-#endif
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    NppStreamHandler h(stream);
-
-    if (src.type() == CV_8UC1)
-    {
-#if (CUDA_VERSION <= 4020)
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
-#else
-        GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
-#endif
-    }
-    else
+    // LookUpTable implementation for CUDA >= 5000: the table and the level
+    // values are kept in GpuMat members, and the pointer arrays below refer
+    // to that storage.
+    class LookUpTableImpl : public LookUpTable
    {
+    public:
+        LookUpTableImpl(InputArray lut);
+
+        void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        // Channel count of the table passed to the constructor.
+        int lut_cn;
+
+        // Per-channel argument arrays for the three-channel NPP call.
+        // nValues3 holds 256 for every channel (set in the constructor).
+        int nValues3[3];
        const Npp32s* pValues3[3];
+        const Npp32s* pLevels3[3];

-        Mat nppLut3[3];
-        if (nppLut.channels() == 1)
+        // d_pLevels: the level values 0..255 uploaded by the constructor.
+        // d_nppLut: the table converted to CV_32S.
+        // d_nppLut3: planes of d_nppLut, filled only for a multi-channel table.
+        GpuMat d_pLevels;
+        GpuMat d_nppLut;
+        GpuMat d_nppLut3[3];
+    };
+
+    // Prepares the device-side tables used by transform().
+    //   _lut : 256-entry CV_8U table with 1 or 3 channels, either a GpuMat
+    //          (must already be 1x256) or a continuous host array with 256 elements.
+    // Raises through CV_Assert when the table does not match these constraints.
+    LookUpTableImpl::LookUpTableImpl(InputArray _lut)
+    {
+        nValues3[0] = nValues3[1] = nValues3[2] = 256;
+
+        // Identity level table 0..255, staged on the host and uploaded once.
+        Npp32s pLevels[256];
+        for (int i = 0; i < 256; ++i)
+            pLevels[i] = i;
+
+        d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
+        pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
+
+        GpuMat lut;
+        if (_lut.kind() == _InputArray::GPU_MAT)
+        {
+            lut = _lut.getGpuMat();
+        }
+        else
+        {
+            // Host tables may have any continuous 256-element layout; they are
+            // viewed as a single 1x256 row for the upload.
+            Mat hLut = _lut.getMat();
+            CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
+            lut.upload(Mat(1, 256, hLut.type(), hLut.data));
+        }
+
+        lut_cn = lut.channels();
+
+        CV_Assert( lut.depth() == CV_8U );
+        CV_Assert( lut.rows == 1 && lut.cols == 256 );
+        // Only 1- and 3-channel tables are usable: the split below writes one
+        // plane per channel into the 3-element d_nppLut3 array, so a 4-channel
+        // table would write past its end and a 2-channel one would leave
+        // pValues3[2] taken from an empty GpuMat.
+        CV_Assert( lut_cn == 1 || lut_cn == 3 );
+
+        lut.convertTo(d_nppLut, CV_32S);
+
+        if (lut_cn == 1)
        {
-#if (CUDA_VERSION <= 4020)
-            pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
-#else
-            GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
            pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
-#endif
+        }
+        else
+        {
+            // One plane per channel so each channel gets its own value table.
+            gpu::split(d_nppLut, d_nppLut3);
+
+            pValues3[0] = d_nppLut3[0].ptr<Npp32s>();
+            pValues3[1] = d_nppLut3[1].ptr<Npp32s>();
+            pValues3[2] = d_nppLut3[2].ptr<Npp32s>();
+        }
+    }
+
+    // Applies the prepared table to an 8-bit image through NPP.
+    //   _src    : CV_8UC1 or CV_8UC3 image (asserted); the table must have one
+    //             channel or as many channels as the image.
+    //   _dst    : recreated with the size and type of the source.
+    //   _stream : stream whose native handle is passed to NppStreamHandler.
+    void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+
+        const int cn = src.channels();
+
+        CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
+        CV_Assert( lut_cn == 1 || lut_cn == cn );
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        // NOTE(review): presumably selects this stream for the NPP calls made
+        // while h is in scope; confirm against NppStreamHandler.
+        NppStreamHandler h(stream);
+
+        NppiSize sz;
+        sz.height = src.rows;
+        sz.width = src.cols;
+
+        if (src.type() == CV_8UC1)
+        {
+            // Single channel: the converted table and the level values are used directly.
+            nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
+        }
+        else
+        {
+            // Three channels: per-channel pointer arrays prepared by the constructor.
+            nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
+        }
+
+        // On the default (null) stream the call waits for the device to finish.
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+#else //  (CUDA_VERSION >= 5000)
+
+namespace
+{
+    // LookUpTable implementation for CUDA versions below 5000: the table and
+    // the level values are kept in host-side members (Mat and a plain array).
+    class LookUpTableImpl : public LookUpTable
+    {
+    public:
+        LookUpTableImpl(InputArray lut);
+
+        void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        // Channel count of the table passed to the constructor.
+        int lut_cn;
+
+        // Level values 0..255, filled by the constructor; every pLevels3 entry
+        // points at this array.
+        Npp32s pLevels[256];
+        // Per-channel argument arrays for the three-channel NPP call.
+        int nValues3[3];
+        const Npp32s* pValues3[3];
+        const Npp32s* pLevels3[3];
+
+        // nppLut: the table converted to CV_32S.
+        // nppLut3: planes of nppLut, filled only for a multi-channel table.
+        Mat nppLut;
+        Mat nppLut3[3];
+    };
+
+    // Prepares the host-side tables used by transform().
+    //   _lut : 256-entry CV_8U table with 1 or 3 channels, either a GpuMat
+    //          (must already be 1x256) or a continuous host array with 256 elements.
+    // Raises through CV_Assert when the table does not match these constraints.
+    LookUpTableImpl::LookUpTableImpl(InputArray _lut)
+    {
+        nValues3[0] = nValues3[1] = nValues3[2] = 256;
+
+        // Identity level table 0..255 shared by all three channels.
+        for (int i = 0; i < 256; ++i)
+            pLevels[i] = i;
+        pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
+
+        Mat lut;
+        if (_lut.kind() == _InputArray::GPU_MAT)
+        {
+            lut = Mat(_lut.getGpuMat());
+        }
+        else
+        {
+            Mat hLut = _lut.getMat();
+            CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
+            // View any continuous 256-element layout (256x1, 16x16, ...) as a
+            // single row, matching what the CUDA >= 5000 path accepts. reshape
+            // keeps the reference count, so the data outlives hLut.
+            lut = hLut.reshape(0, 1);
+        }
+
+        lut_cn = lut.channels();
+
+        CV_Assert( lut.depth() == CV_8U );
+        CV_Assert( lut.rows == 1 && lut.cols == 256 );
+        // Only 1- and 3-channel tables are usable: the split below writes one
+        // plane per channel into the 3-element nppLut3 array, so a 4-channel
+        // table would write past its end and a 2-channel one would leave
+        // pValues3[2] taken from an empty Mat.
+        CV_Assert( lut_cn == 1 || lut_cn == 3 );
+
+        lut.convertTo(nppLut, CV_32S);
+
+        if (lut_cn == 1)
+        {
+            pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
        }
        else
        {
            cv::split(nppLut, nppLut3);

-#if (CUDA_VERSION <= 4020)
            pValues3[0] = nppLut3[0].ptr<Npp32s>();
            pValues3[1] = nppLut3[1].ptr<Npp32s>();
            pValues3[2] = nppLut3[2].ptr<Npp32s>();
-#else
-            GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data));
-            GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data));
-            GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data));
-
-            pValues3[0] = d_nppLut0.ptr<Npp32s>();
-            pValues3[1] = d_nppLut1.ptr<Npp32s>();
-            pValues3[2] = d_nppLut2.ptr<Npp32s>();
-#endif
        }
-
-        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
    }

-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
+    // Applies the prepared table to an 8-bit image through NPP.
+    //   _src    : CV_8UC1 or CV_8UC3 image (asserted); the table must have one
+    //             channel or as many channels as the image.
+    //   _dst    : recreated with the size and type of the source.
+    //   _stream : stream whose native handle is passed to NppStreamHandler.
+    // In this variant the value and level tables passed to NPP are the
+    // host-side members (nppLut / pLevels and the pointer arrays built from them).
+    void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+
+        const int cn = src.channels();
+
+        CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
+        CV_Assert( lut_cn == 1 || lut_cn == cn );
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        // NOTE(review): presumably selects this stream for the NPP calls made
+        // while h is in scope; confirm against NppStreamHandler.
+        NppStreamHandler h(stream);
+
+        NppiSize sz;
+        sz.height = src.rows;
+        sz.width = src.cols;
+
+        if (src.type() == CV_8UC1)
+        {
+            // Single channel: the converted table and the level array are used directly.
+            nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
+        }
+        else
+        {
+            // Three channels: per-channel pointer arrays prepared by the constructor.
+            nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
+                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
+        }
+
+        // On the default (null) stream the call waits for the device to finish.
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+}
+
+#endif //  (CUDA_VERSION >= 5000)
+
+// Factory for the LookUpTable interface: wraps the given table in whichever
+// LookUpTableImpl variant was selected above by the CUDA_VERSION check.
+// Table validation happens in the LookUpTableImpl constructor.
+Ptr<LookUpTable> cv::gpu::createLookUpTable(InputArray lut)
+{
+    return new LookUpTableImpl(lut);
 }

 ////////////////////////////////////////////////////////////////////////
@ -408,14 +529,17 @@ typedef Npp32s __attribute__((__may_alias__)) Npp32s_a;
 typedef Npp32s Npp32s_a;
 #endif

-void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value, Stream& s)
+void cv::gpu::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bottom, int left, int right, int borderType, Scalar value, Stream& _stream)
 {
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-    CV_Assert(borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
+    GpuMat src = _src.getGpuMat();

-    dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
+    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
+    CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );

-    cudaStream_t stream = StreamAccessor::getStream(s);
+    _dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
+    GpuMat dst = _dst.getGpuMat();
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream);

    if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1))
    {
--- a/modules/gpuarithm/src/cuda/div_inv.cu
+++ b/modules/gpuarithm/src/cuda/div_inv.cu
@ -1,144 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
-
-#include "arithm_func_traits.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace arithm
-{
-    template <typename T, typename S, typename D> struct DivInv : unary_function<T, D>
-    {
-        S val;
-
-        __host__ explicit DivInv(S val_) : val(val_) {}
-
-        __device__ __forceinline__ D operator ()(T a) const
-        {
-            return a != 0 ? saturate_cast<D>(val / a) : 0;
-        }
-    };
-}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivInv<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
-    {
-    };
-}}}
-
-namespace arithm
-{
-    template <typename T, typename S, typename D>
-    void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
-    {
-        DivInv<T, S, D> op(static_cast<S>(val));
-        cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
-    }
-
-    template void divInv<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    template void divInv<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-}
-
-#endif // CUDA_DISABLER
--- a/modules/gpuarithm/src/cuda/div_scalar.cu
+++ b/modules/gpuarithm/src/cuda/div_scalar.cu
@ -66,6 +66,18 @@ namespace arithm
            return saturate_cast<D>(a / val);
        }
    };
+
+    // Unary functor for the inverted scalar division: maps an element a to
+    // val / a, saturated to D. A zero element produces zero instead of dividing.
+    template <typename T, typename S, typename D> struct DivScalarInv : unary_function<T, D>
+    {
+        S val;
+
+        explicit DivScalarInv(S scalar) : val(scalar) {}
+
+        __device__ __forceinline__ D operator ()(T a) const
+        {
+            // Guard first so the division below never sees a zero divisor.
+            if (a == 0)
+                return 0;
+
+            return saturate_cast<D>(val / a);
+        }
+    };
 }

 namespace cv { namespace gpu { namespace cudev
@ -73,72 +85,84 @@ namespace cv { namespace gpu { namespace cudev
    template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
    {
    };
+
+    template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalarInv<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    {
+    };
 }}}

 namespace arithm
 {
    template <typename T, typename S, typename D>
-    void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
+    void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream)
    {
-        DivScalar<T, S, D> op(static_cast<S>(val));
-        cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+        if (inv)
+        {
+            DivScalarInv<T, S, D> op(static_cast<S>(val));
+            cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+        }
+        else
+        {
+            DivScalar<T, S, D> op(static_cast<S>(val));
+            cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+        }
    }

-    template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);

-    template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);

-    //template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);

-    //template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<short, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);

-    //template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);

-    //template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<float, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<float, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);

-    //template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<double, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 }

 #endif // CUDA_DISABLER
--- a/modules/gpuarithm/src/cuda/split_merge.cu
+++ b/modules/gpuarithm/src/cuda/split_merge.cu
@ -278,7 +278,7 @@ namespace cv { namespace gpu { namespace cudev
        }


-        void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst,
+        void merge(const PtrStepSzb* src, PtrStepSzb& dst,
                                     int total_channels, size_t elem_size,
                                     const cudaStream_t& stream)
        {
@ -487,7 +487,7 @@ namespace cv { namespace gpu { namespace cudev
        }


-        void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
+        void split(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
        {
            static SplitFunction split_func_tbl[] =
            {
--- a/modules/gpuarithm/src/cuda/sub_scalar.cu
+++ b/modules/gpuarithm/src/cuda/sub_scalar.cu
@ -58,12 +58,13 @@ namespace arithm
    template <typename T, typename S, typename D> struct SubScalar : unary_function<T, D>
    {
        S val;
+        int scale;

-        __host__ explicit SubScalar(S val_) : val(val_) {}
+        __host__ SubScalar(S val_, int scale_) : val(val_), scale(scale_) {}

        __device__ __forceinline__ D operator ()(T a) const
        {
-            return saturate_cast<D>(a - val);
+            return saturate_cast<D>(scale * (a - val));
        }
    };
 }
@ -78,9 +79,9 @@ namespace cv { namespace gpu { namespace cudev
 namespace arithm
 {
    template <typename T, typename S, typename D>
-    void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
    {
-        SubScalar<T, S, D> op(static_cast<S>(val));
+        SubScalar<T, S, D> op(static_cast<S>(val), inv ? -1 : 1);

        if (mask.data)
            cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
@ -88,61 +89,61 @@ namespace arithm
            cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
    }

-    template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

-    template void subScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

-    //template void subScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<ushort, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<ushort, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

-    //template void subScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<short, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<short, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

-    //template void subScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<int, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<int, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<int, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

-    //template void subScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<float, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<float, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);

-    //template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    //template void subScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-    template void subScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<double, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 }

 #endif // CUDA_DISABLER
--- a/modules/gpuarithm/src/element_operations.cpp
+++ b/modules/gpuarithm/src/element_operations.cpp
--- a/modules/gpuarithm/src/reductions.cpp
+++ b/modules/gpuarithm/src/reductions.cpp
@ -47,41 +47,28 @@ using namespace cv::gpu;

 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

-double cv::gpu::norm(const GpuMat&, int) { throw_no_cuda(); return 0.0; }
-double cv::gpu::norm(const GpuMat&, int, GpuMat&) { throw_no_cuda(); return 0.0; }
-double cv::gpu::norm(const GpuMat&, int, const GpuMat&, GpuMat&) { throw_no_cuda(); return 0.0; }
-double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_no_cuda(); return 0.0; }
+double cv::gpu::norm(InputArray, int, InputArray, GpuMat&) { throw_no_cuda(); return 0.0; }
+double cv::gpu::norm(InputArray, InputArray, GpuMat&, int) { throw_no_cuda(); return 0.0; }

-Scalar cv::gpu::sum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
+Scalar cv::gpu::sum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
+Scalar cv::gpu::absSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
+Scalar cv::gpu::sqrSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }

-Scalar cv::gpu::absSum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::absSum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::absSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
+void cv::gpu::minMax(InputArray, double*, double*, InputArray, GpuMat&) { throw_no_cuda(); }
+void cv::gpu::minMaxLoc(InputArray, double*, double*, Point*, Point*, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); }

-Scalar cv::gpu::sqrSum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sqrSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
+int cv::gpu::countNonZero(InputArray, GpuMat&) { throw_no_cuda(); return 0; }

-void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_no_cuda(); }
+void cv::gpu::reduce(InputArray, OutputArray, int, int, int, Stream&) { throw_no_cuda(); }

-void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
+void cv::gpu::meanStdDev(InputArray, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); }

-int cv::gpu::countNonZero(const GpuMat&) { throw_no_cuda(); return 0; }
-int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_no_cuda(); return 0; }
+void cv::gpu::rectStdDev(InputArray, InputArray, OutputArray, Rect, Stream&) { throw_no_cuda(); }

-void cv::gpu::reduce(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::normalize(InputArray, OutputArray, double, double, int, int, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); }

-void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_no_cuda(); }
-void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); }
-
-void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
+void cv::gpu::integral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); }
+void cv::gpu::sqrIntegral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); }

 #else

@ -124,21 +111,13 @@ namespace
 ////////////////////////////////////////////////////////////////////////
 // norm

-double cv::gpu::norm(const GpuMat& src, int normType)
+double cv::gpu::norm(InputArray _src, int normType, InputArray _mask, GpuMat& buf)
 {
-    GpuMat buf;
-    return gpu::norm(src, normType, GpuMat(), buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();

-double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
-{
-    return gpu::norm(src, normType, GpuMat(), buf);
-}
-
-double cv::gpu::norm(const GpuMat& src, int normType, const GpuMat& mask, GpuMat& buf)
-{
-    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
-    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1));
+    CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
+    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1) );

    GpuMat src_single_channel = src.reshape(1);

@ -154,13 +133,11 @@ double cv::gpu::norm(const GpuMat& src, int normType, const GpuMat& mask, GpuMat
    return std::max(std::abs(min_val), std::abs(max_val));
 }

-double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
+double cv::gpu::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normType)
 {
-    CV_Assert(src1.type() == CV_8UC1);
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
-
 #if CUDA_VERSION < 5050
+    (void) buf;
+
    typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, NppiSize oSizeROI, Npp64f* pRetVal);

    static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
@ -175,13 +152,18 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
    static const buf_size_func_t buf_size_funcs[] = {nppiNormDiffInfGetBufferHostSize_8u_C1R, nppiNormDiffL1GetBufferHostSize_8u_C1R, nppiNormDiffL2GetBufferHostSize_8u_C1R};
 #endif

+    GpuMat src1 = _src1.getGpuMat();
+    GpuMat src2 = _src2.getGpuMat();
+
+    CV_Assert( src1.type() == CV_8UC1 );
+    CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() );
+    CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
+
    NppiSize sz;
    sz.width  = src1.cols;
    sz.height = src1.rows;

-    int funcIdx = normType >> 1;
-
-    double retVal;
+    const int funcIdx = normType >> 1;

    DeviceBuffer dbuf;

@ -191,13 +173,14 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
    int bufSize;
    buf_size_funcs[funcIdx](sz, &bufSize);

-    GpuMat buf(1, bufSize, CV_8UC1);
+    ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);

    nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf, buf.data) );
 #endif

    cudaSafeCall( cudaDeviceSynchronize() );

+    double retVal;
    dbuf.download(&retVal);

    return retVal;
@ -220,19 +203,11 @@ namespace sum
    void runSqr(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
 }

-Scalar cv::gpu::sum(const GpuMat& src)
+Scalar cv::gpu::sum(InputArray _src, InputArray _mask, GpuMat& buf)
 {
-    GpuMat buf;
-    return gpu::sum(src, GpuMat(), buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();

-Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
-{
-    return gpu::sum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-{
    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
    static const func_t funcs[7][5] =
    {
@ -266,19 +241,11 @@ Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
    return Scalar(result[0], result[1], result[2], result[3]);
 }

-Scalar cv::gpu::absSum(const GpuMat& src)
+Scalar cv::gpu::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
 {
-    GpuMat buf;
-    return gpu::absSum(src, GpuMat(), buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();

-Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
-{
-    return gpu::absSum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-{
    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
    static const func_t funcs[7][5] =
    {
@ -312,19 +279,11 @@ Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
    return Scalar(result[0], result[1], result[2], result[3]);
 }

-Scalar cv::gpu::sqrSum(const GpuMat& src)
+Scalar cv::gpu::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
 {
-    GpuMat buf;
-    return gpu::sqrSum(src, GpuMat(), buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();

-Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
-{
-    return gpu::sqrSum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-{
    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
    static const func_t funcs[7][5] =
    {
@ -369,14 +328,11 @@ namespace minMax
    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
 }

-void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
+void cv::gpu::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf)
 {
-    GpuMat buf;
-    gpu::minMax(src, minVal, maxVal, mask, buf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();

-void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
-{
    typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    static const func_t funcs[] =
    {
@ -419,15 +375,12 @@ namespace minMaxLoc
    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
 }

-void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
+void cv::gpu::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
+                        InputArray _mask, GpuMat& valBuf, GpuMat& locBuf)
 {
-    GpuMat valBuf, locBuf;
-    gpu::minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
-}
+    GpuMat src = _src.getGpuMat();
+    GpuMat mask = _mask.getGpuMat();

-void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
-                        const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
-{
    typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    static const func_t funcs[] =
    {
@ -472,14 +425,10 @@ namespace countNonZero
    int run(const PtrStepSzb src, PtrStep<unsigned int> buf);
 }

-int cv::gpu::countNonZero(const GpuMat& src)
+int cv::gpu::countNonZero(InputArray _src, GpuMat& buf)
 {
-    GpuMat buf;
-    return countNonZero(src, buf);
-}
+    GpuMat src = _src.getGpuMat();

-int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
-{
    typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
    static const func_t funcs[] =
    {
@ -521,8 +470,10 @@ namespace reduce
    void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
 }

-void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
+void cv::gpu::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
 {
+    GpuMat src = _src.getGpuMat();
+
    CV_Assert( src.channels() <= 4 );
    CV_Assert( dim == 0 || dim == 1 );
    CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );
@ -530,7 +481,8 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
    if (dtype < 0)
        dtype = src.depth();

-    dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+    _dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+    GpuMat dst = _dst.getGpuMat();

    if (dim == 0)
    {
@ -691,15 +643,11 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
 ////////////////////////////////////////////////////////////////////////
 // meanStdDev

-void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
+void cv::gpu::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, GpuMat& buf)
 {
-    GpuMat buf;
-    meanStdDev(src, mean, stddev, buf);
-}
+    GpuMat src = _src.getGpuMat();

-void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat& buf)
-{
-    CV_Assert(src.type() == CV_8UC1);
+    CV_Assert( src.type() == CV_8UC1 );

    if (!deviceSupports(FEATURE_SET_COMPUTE_13))
        CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");
@ -730,11 +678,15 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat
 //////////////////////////////////////////////////////////////////////////////
 // rectStdDev

-void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)
+void cv::gpu::rectStdDev(InputArray _src, InputArray _sqr, OutputArray _dst, Rect rect, Stream& _stream)
 {
-    CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_64FC1);
+    GpuMat src = _src.getGpuMat();
+    GpuMat sqr = _sqr.getGpuMat();

-    dst.create(src.size(), CV_32FC1);
+    CV_Assert( src.type() == CV_32SC1 && sqr.type() == CV_64FC1 );
+
+    _dst.create(src.size(), CV_32FC1);
+    GpuMat dst = _dst.getGpuMat();

    NppiSize sz;
    sz.width = src.cols;
@ -746,7 +698,7 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
    nppRect.x = rect.x;
    nppRect.y = rect.y;

-    cudaStream_t stream = StreamAccessor::getStream(s);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);

    NppStreamHandler h(stream);

@ -760,16 +712,12 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
 ////////////////////////////////////////////////////////////////////////
 // normalize

-void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask)
+void cv::gpu::normalize(InputArray _src, OutputArray dst, double a, double b, int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf)
 {
-    GpuMat norm_buf;
-    GpuMat cvt_buf;
-    normalize(src, dst, a, b, norm_type, dtype, mask, norm_buf, cvt_buf);
-}
+    GpuMat src = _src.getGpuMat();

-void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
-{
    double scale = 1, shift = 0;
+
    if (norm_type == NORM_MINMAX)
    {
        double smin = 0, smax = 0;
@ -800,4 +748,116 @@ void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int
    }
 }

+////////////////////////////////////////////////////////////////////////
+// integral
+
+// Forward declaration of the integral routine called on the warp-shuffle path of
+// cv::gpu::integral below (its definition is not shown here).
+namespace cv { namespace gpu { namespace cudev
+{
+    namespace imgproc
+    {
+        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
+    }
+}}}
+
+// Computes the integral image of a CV_8UC1 source.
+//
+// _src    - input; must be of type CV_8UC1 (asserted).
+// _dst    - output; created as (src.rows + 1) x (src.cols + 1), type CV_32SC1.
+// buffer  - caller-owned scratch matrix; resized here via ensureSizeIsEnough.
+// _stream - stream used for the work issued by this function.
+//
+// Two implementations are selected at run time: a warp-shuffle path and a fallback
+// that is only compiled when HAVE_OPENCV_GPULEGACY is defined.
+void cv::gpu::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& _stream)
+{
+    GpuMat src = _src.getGpuMat();
+
+    CV_Assert( src.type() == CV_8UC1 );
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+    // The ROI offset inside the parent matrix is needed for the path-selection test below.
+    cv::Size whole;
+    cv::Point offset;
+    src.locateROI(whole, offset);
+
+    // Shuffle path is taken only when all of these hold: the device reports
+    // WARP_SHUFFLE_FUNCTIONS, the source has at most 2048 columns, the ROI x-offset is a
+    // multiple of 16, and the width rounded up to a multiple of 64 does not exceed
+    // (step - offset.x).
+    if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
+        && offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
+    {
+        // Scratch buffer: rows rounded up to a multiple of 8, cols rounded up to a multiple of 64.
+        ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
+
+        cv::gpu::cudev::imgproc::shfl_integral_gpu(src, buffer, stream);
+
+        _dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
+        GpuMat dst = _dst.getGpuMat();
+
+        // Zero the whole destination, then copy the src-sized part of the buffer into the
+        // sub-rectangle starting at (1, 1); row 0 and column 0 of dst therefore stay zero.
+        dst.setTo(Scalar::all(0), _stream);
+
+        GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
+        GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
+
+        res.copyTo(inner, _stream);
+    }
+    else
+    {
+    #ifndef HAVE_OPENCV_GPULEGACY
+        // No fallback implementation is compiled in this configuration.
+        throw_no_cuda();
+    #else
+        _dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
+        GpuMat dst = _dst.getGpuMat();
+
+        NcvSize32u roiSize;
+        roiSize.width = src.cols;
+        roiSize.height = src.rows;
+
+        cudaDeviceProp prop;
+        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+        // Ask the NPP_staging routine how many scratch bytes it needs and size the buffer
+        // as a single-row CV_8UC1 matrix of that length.
+        Ncv32u bufSize;
+        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
+        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
+
+        NppStStreamHandler h(stream);
+
+        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
+            dst.ptr<Ncv32u>(), static_cast<int>(dst.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
+
+        // Only the default (0) stream is synchronized here; for any other stream the call
+        // returns without waiting.
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    #endif
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// sqrIntegral
+
+// Computes the squared integral image of a CV_8UC1 source.
+//
+// _src    - input; must be of type CV_8UC1 (asserted).
+// _dst    - output; created as (src.rows + 1) x (src.cols + 1), type CV_64F.
+// buf     - caller-owned scratch matrix; resized here via ensureSizeIsEnough.
+// _stream - stream the NPP_staging call is issued on.
+//
+// The implementation is only available when HAVE_OPENCV_GPULEGACY is defined; otherwise
+// the function calls throw_no_cuda().
+void cv::gpu::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& _stream)
+{
+#ifndef HAVE_OPENCV_GPULEGACY
+    // Every parameter is unused in this configuration; reference each one so that no
+    // unused-parameter warning is emitted (buf was previously missing from this list).
+    (void) _src;
+    (void) _dst;
+    (void) buf;
+    (void) _stream;
+    throw_no_cuda();
+#else
+    GpuMat src = _src.getGpuMat();
+
+    // Same single-channel 8-bit requirement as cv::gpu::integral (CV_8UC1 == CV_8U).
+    CV_Assert( src.type() == CV_8UC1 );
+
+    NcvSize32u roiSize;
+    roiSize.width = src.cols;
+    roiSize.height = src.rows;
+
+    cudaDeviceProp prop;
+    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+    // Query the scratch size and make sure buf can hold it as a single-row 8-bit matrix.
+    Ncv32u bufSize;
+    ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
+
+    ensureSizeIsEnough(1, bufSize, CV_8U, buf);
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+    NppStStreamHandler h(stream);
+
+    _dst.create(src.rows + 1, src.cols + 1, CV_64F);
+    GpuMat dst = _dst.getGpuMat();
+
+    ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
+            dst.ptr<Ncv64u>(0), static_cast<int>(dst.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
+
+    // Only the default (0) stream is synchronized before returning.
+    if (stream == 0)
+        cudaSafeCall( cudaDeviceSynchronize() );
+#endif
+}
+
 #endif
--- a/modules/gpuarithm/test/test_arithm.cpp
+++ b/modules/gpuarithm/test/test_arithm.cpp
@ -419,8 +419,10 @@ GPU_TEST_P(Convolve, Accuracy)
    cv::Mat src = randomMat(size, CV_32FC1, 0.0, 100.0);
    cv::Mat kernel = randomMat(cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0);

+    cv::Ptr<cv::gpu::Convolution> conv = cv::gpu::createConvolution();
+
    cv::gpu::GpuMat dst;
-    cv::gpu::convolve(loadMat(src), loadMat(kernel), dst, ccorr);
+    conv->convolve(loadMat(src), loadMat(kernel), dst, ccorr);

    cv::Mat dst_gold;
    convolveDFT(src, kernel, dst_gold, ccorr);
--- a/modules/gpuarithm/test/test_core.cpp
+++ b/modules/gpuarithm/test/test_core.cpp
@ -323,8 +323,10 @@ GPU_TEST_P(LUT, OneChannel)
    cv::Mat src = randomMat(size, type);
    cv::Mat lut = randomMat(cv::Size(256, 1), CV_8UC1);

+    cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
    cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()));
-    cv::gpu::LUT(loadMat(src, useRoi), lut, dst);
+    lutAlg->transform(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
    cv::LUT(src, lut, dst_gold);
@ -337,8 +339,10 @@ GPU_TEST_P(LUT, MultiChannel)
    cv::Mat src = randomMat(size, type);
    cv::Mat lut = randomMat(cv::Size(256, 1), CV_MAKE_TYPE(CV_8U, src.channels()));

+    cv::Ptr<cv::gpu::LookUpTable> lutAlg = cv::gpu::createLookUpTable(lut);
+
    cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()), useRoi);
-    cv::gpu::LUT(loadMat(src, useRoi), lut, dst);
+    lutAlg->transform(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
    cv::LUT(src, lut, dst_gold);
--- a/modules/gpuarithm/test/test_element_operations.cpp
+++ b/modules/gpuarithm/test/test_element_operations.cpp
@ -261,6 +261,94 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, Add_Scalar, testing::Combine(
    DEPTH_PAIRS,
    WHOLE_SUBMAT));

+////////////////////////////////////////////////////////////////////////////////
+// Add_Scalar_First
+
+// Fixture for cv::gpu::add with the scalar as the FIRST operand (scalar + matrix).
+// Parameters: device, matrix size, (source depth, destination depth) pair, ROI flag.
+PARAM_TEST_CASE(Add_Scalar_First, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    std::pair<MatDepth, MatDepth> depth;  // first = source depth, second = destination depth
+    bool useRoi;
+
+    // Unpacks the test parameters and selects the device under test.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+// scalar + matrix with an empty mask, compared against cv::add.
+GPU_TEST_P(Add_Scalar_First, WithOutMask)
+{
+    cv::Mat mat = randomMat(size, depth.first);
+    cv::Scalar val = randomScalar(0, 255);
+
+    // When either depth is CV_64F and the device lacks NATIVE_DOUBLE, only the error code of a
+    // thrown cv::Exception is checked; nothing is asserted if the call does not throw.
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::add(val, loadMat(mat), dst, cv::gpu::GpuMat(), depth.second);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
+        }
+    }
+    else
+    {
+        // Destination and reference are both zero-initialized before the operation.
+        cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi);
+        dst.setTo(cv::Scalar::all(0));
+        cv::gpu::add(val, loadMat(mat, useRoi), dst, cv::gpu::GpuMat(), depth.second);
+
+        cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
+        cv::add(val, mat, dst_gold, cv::noArray(), depth.second);
+
+        // Tolerance is 1e-4 when either depth is >= CV_32F, otherwise an exact match is required.
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+    }
+}
+
+// scalar + matrix with a random CV_8UC1 mask, compared against cv::add.
+GPU_TEST_P(Add_Scalar_First, WithMask)
+{
+    cv::Mat mat = randomMat(size, depth.first);
+    cv::Scalar val = randomScalar(0, 255);
+    cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);
+
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
+    {
+        try
+        {
+            // NOTE(review): this branch passes an empty mask, exactly like WithOutMask -
+            // confirm whether the masked call was intended here.
+            cv::gpu::GpuMat dst;
+            cv::gpu::add(val, loadMat(mat), dst, cv::gpu::GpuMat(), depth.second);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
+        }
+    }
+    else
+    {
+        // Both outputs start at zero and receive the same mask, then are compared directly.
+        cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi);
+        dst.setTo(cv::Scalar::all(0));
+        cv::gpu::add(val, loadMat(mat, useRoi), dst, loadMat(mask, useRoi), depth.second);
+
+        cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
+        cv::add(val, mat, dst_gold, mask, depth.second);
+
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+    }
+}
+
+// Runs the suite over every combination of device, size, depth pair and ROI mode.
+INSTANTIATE_TEST_CASE_P(GPU_Arithm, Add_Scalar_First, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    DEPTH_PAIRS,
+    WHOLE_SUBMAT));
+
 ////////////////////////////////////////////////////////////////////////////////
 // Subtract_Array

@ -476,6 +564,94 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, Subtract_Scalar, testing::Combine(
    DEPTH_PAIRS,
    WHOLE_SUBMAT));

+////////////////////////////////////////////////////////////////////////////////
+// Subtract_Scalar_First
+
+// Fixture for cv::gpu::subtract with the scalar as the FIRST operand (scalar - matrix).
+// Parameters: device, matrix size, (source depth, destination depth) pair, ROI flag.
+PARAM_TEST_CASE(Subtract_Scalar_First, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    std::pair<MatDepth, MatDepth> depth;  // first = source depth, second = destination depth
+    bool useRoi;
+
+    // Unpacks the test parameters and selects the device under test.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+// scalar - matrix with an empty mask, compared against cv::subtract.
+GPU_TEST_P(Subtract_Scalar_First, WithOutMask)
+{
+    cv::Mat mat = randomMat(size, depth.first);
+    cv::Scalar val = randomScalar(0, 255);
+
+    // When either depth is CV_64F and the device lacks NATIVE_DOUBLE, only the error code of a
+    // thrown cv::Exception is checked; nothing is asserted if the call does not throw.
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::subtract(val, loadMat(mat), dst, cv::gpu::GpuMat(), depth.second);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
+        }
+    }
+    else
+    {
+        // Destination and reference are both zero-initialized before the operation.
+        cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi);
+        dst.setTo(cv::Scalar::all(0));
+        cv::gpu::subtract(val, loadMat(mat, useRoi), dst, cv::gpu::GpuMat(), depth.second);
+
+        cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
+        cv::subtract(val, mat, dst_gold, cv::noArray(), depth.second);
+
+        // Tolerance is 1e-4 when either depth is >= CV_32F, otherwise an exact match is required.
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+    }
+}
+
+// scalar - matrix with a random CV_8UC1 mask, compared against cv::subtract.
+GPU_TEST_P(Subtract_Scalar_First, WithMask)
+{
+    cv::Mat mat = randomMat(size, depth.first);
+    cv::Scalar val = randomScalar(0, 255);
+    cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);
+
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
+    {
+        try
+        {
+            // NOTE(review): this branch passes an empty mask, exactly like WithOutMask -
+            // confirm whether the masked call was intended here.
+            cv::gpu::GpuMat dst;
+            cv::gpu::subtract(val, loadMat(mat), dst, cv::gpu::GpuMat(), depth.second);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
+        }
+    }
+    else
+    {
+        // Both outputs start at zero and receive the same mask, then are compared directly.
+        cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi);
+        dst.setTo(cv::Scalar::all(0));
+        cv::gpu::subtract(val, loadMat(mat, useRoi), dst, loadMat(mask, useRoi), depth.second);
+
+        cv::Mat dst_gold(size, depth.second, cv::Scalar::all(0));
+        cv::subtract(val, mat, dst_gold, mask, depth.second);
+
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+    }
+}
+
+// Runs the suite over every combination of device, size, depth pair and ROI mode.
+INSTANTIATE_TEST_CASE_P(GPU_Arithm, Subtract_Scalar_First, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    DEPTH_PAIRS,
+    WHOLE_SUBMAT));
+
 ////////////////////////////////////////////////////////////////////////////////
 // Multiply_Array

@ -756,6 +932,93 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, Multiply_Scalar, testing::Combine(
    DEPTH_PAIRS,
    WHOLE_SUBMAT));

+////////////////////////////////////////////////////////////////////////////////
+// Multiply_Scalar_First
+
+// Fixture for cv::gpu::multiply with the scalar as the FIRST operand (scalar * matrix).
+// Parameters: device, matrix size, (source depth, destination depth) pair, ROI flag.
+PARAM_TEST_CASE(Multiply_Scalar_First, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    std::pair<MatDepth, MatDepth> depth;  // first = source depth, second = destination depth
+    bool useRoi;
+
+    // Unpacks the test parameters and selects the device under test.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+// scalar * matrix with a scale factor of 1, compared against cv::multiply.
+GPU_TEST_P(Multiply_Scalar_First, WithOutScale)
+{
+    cv::Mat mat = randomMat(size, depth.first);
+    cv::Scalar val = randomScalar(0, 255);
+
+    // When either depth is CV_64F and the device lacks NATIVE_DOUBLE, only the error code of a
+    // thrown cv::Exception is checked; nothing is asserted if the call does not throw.
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::multiply(val, loadMat(mat), dst, 1, depth.second);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi);
+        cv::gpu::multiply(val, loadMat(mat, useRoi), dst, 1, depth.second);
+
+        cv::Mat dst_gold;
+        cv::multiply(val, mat, dst_gold, 1, depth.second);
+
+        // A fixed tolerance of 1.0 is used for every depth combination.
+        EXPECT_MAT_NEAR(dst_gold, dst, 1.0);
+    }
+}
+
+
+// scalar * matrix with a random scale factor, compared against cv::multiply.
+GPU_TEST_P(Multiply_Scalar_First, WithScale)
+{
+    cv::Mat mat = randomMat(size, depth.first);
+    cv::Scalar val = randomScalar(0, 255);
+    double scale = randomDouble(0.0, 255.0);
+
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::multiply(val, loadMat(mat), dst, scale, depth.second);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, depth.second, useRoi);
+        cv::gpu::multiply(val, loadMat(mat, useRoi), dst, scale, depth.second);
+
+        cv::Mat dst_gold;
+        cv::multiply(val, mat, dst_gold, scale, depth.second);
+
+        // A fixed tolerance of 1.0 is used for every depth combination.
+        EXPECT_MAT_NEAR(dst_gold, dst, 1.0);
+    }
+}
+
+// Runs the suite over every combination of device, size, depth pair and ROI mode.
+INSTANTIATE_TEST_CASE_P(GPU_Arithm, Multiply_Scalar_First, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    DEPTH_PAIRS,
+    WHOLE_SUBMAT));
+
 ////////////////////////////////////////////////////////////////////////////////
 // Divide_Array

@ -1036,9 +1299,9 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, Divide_Scalar, testing::Combine(
    WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
-// Divide_Scalar_Inv
+// Divide_Scalar_First

-PARAM_TEST_CASE(Divide_Scalar_Inv, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi)
+PARAM_TEST_CASE(Divide_Scalar_First, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi)
 {
    cv::gpu::DeviceInfo devInfo;
    cv::Size size;
@ -1056,7 +1319,7 @@ PARAM_TEST_CASE(Divide_Scalar_Inv, cv::gpu::DeviceInfo, cv::Size, std::pair<MatD
    }
 };

-GPU_TEST_P(Divide_Scalar_Inv, Accuracy)
+GPU_TEST_P(Divide_Scalar_First, Accuracy)
 {
    double scale = randomDouble(0.0, 255.0);
    cv::Mat mat = randomMat(size, depth.first, 1.0, 255.0);
@ -1085,7 +1348,7 @@ GPU_TEST_P(Divide_Scalar_Inv, Accuracy)
    }
 }

-INSTANTIATE_TEST_CASE_P(GPU_Arithm, Divide_Scalar_Inv, testing::Combine(
+INSTANTIATE_TEST_CASE_P(GPU_Arithm, Divide_Scalar_First, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    DEPTH_PAIRS,
@ -1170,6 +1433,35 @@ GPU_TEST_P(AbsDiff, Scalar)
    }
 }

+// absdiff with the scalar as the first operand, checked against cv::absdiff.
+GPU_TEST_P(AbsDiff, Scalar_First)
+{
+    // Draw order (matrix, then scalar) is kept so the generated data is unchanged.
+    cv::Mat input = randomMat(size, depth);
+    cv::Scalar scalar = randomScalar(0.0, 255.0);
+
+    const bool doubleUnsupported = depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE);
+
+    if (!doubleUnsupported)
+    {
+        // Regular path: run on the device and compare with the CPU reference.
+        cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
+        cv::gpu::absdiff(scalar, loadMat(input, useRoi), dst);
+
+        cv::Mat dst_gold;
+        cv::absdiff(scalar, input, dst_gold);
+
+        EXPECT_MAT_NEAR(dst_gold, dst, depth <= CV_32F ? 1.0 : 1e-5);
+    }
+    else
+    {
+        // CV_64F without NATIVE_DOUBLE: only the error code of a thrown exception is checked.
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::absdiff(scalar, loadMat(input), dst);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
+        }
+    }
+}
+
 INSTANTIATE_TEST_CASE_P(GPU_Arithm, AbsDiff, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
@ -1478,6 +1770,65 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, Exp, testing::Combine(
                    MatDepth(CV_32F)),
    WHOLE_SUBMAT));

+////////////////////////////////////////////////////////////////////////////////
+// Pow
+
+// Fixture for cv::gpu::pow; parameters are device, matrix size, depth and ROI flag.
+PARAM_TEST_CASE(Pow, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int depth;
+    bool useRoi;
+
+    // Reads the four test parameters, then makes the chosen device current.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+// Raises a random matrix to a random power and compares the result with cv::pow.
+GPU_TEST_P(Pow, Accuracy)
+{
+    cv::Mat src = randomMat(size, depth, 0.0, 10.0);
+
+    // Depths below CV_32F use the drawn exponent truncated to an integer;
+    // floating-point depths use it as drawn.
+    const double drawnPower = randomDouble(2.0, 4.0);
+    const double power = src.depth() < CV_32F ? static_cast<int>(drawnPower) : drawnPower;
+
+    const bool doubleUnsupported = depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE);
+
+    if (!doubleUnsupported)
+    {
+        cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
+        cv::gpu::pow(loadMat(src, useRoi), power, dst);
+
+        cv::Mat dst_gold;
+        cv::pow(src, power, dst_gold);
+
+        EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 0.0 : 1e-1);
+    }
+    else
+    {
+        // CV_64F without NATIVE_DOUBLE: only the error code of a thrown exception is checked.
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::pow(loadMat(src), power, dst);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
+        }
+    }
+}
+
+// Runs the suite over every combination of device, size, depth and ROI mode.
+INSTANTIATE_TEST_CASE_P(GPU_Arithm, Pow, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    ALL_DEPTH,
+    WHOLE_SUBMAT));
+
 ////////////////////////////////////////////////////////////////////////////////
 // Compare_Array

@ -2110,65 +2461,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, Max, testing::Combine(
    ALL_DEPTH,
    WHOLE_SUBMAT));

-////////////////////////////////////////////////////////////////////////////////
-// Pow
-
-PARAM_TEST_CASE(Pow, cv::gpu::DeviceInfo, cv::Size, MatDepth, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int depth;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        depth = GET_PARAM(2);
-        useRoi = GET_PARAM(3);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Pow, Accuracy)
-{
-    cv::Mat src = randomMat(size, depth, 0.0, 10.0);
-    double power = randomDouble(2.0, 4.0);
-
-    if (src.depth() < CV_32F)
-        power = static_cast<int>(power);
-
-    if (depth == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
-    {
-        try
-        {
-            cv::gpu::GpuMat dst;
-            cv::gpu::pow(loadMat(src), power, dst);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
-        }
-    }
-    else
-    {
-        cv::gpu::GpuMat dst = createMat(size, depth, useRoi);
-        cv::gpu::pow(loadMat(src, useRoi), power, dst);
-
-        cv::Mat dst_gold;
-        cv::pow(src, power, dst_gold);
-
-        EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 0.0 : 1e-1);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Arithm, Pow, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    ALL_DEPTH,
-    WHOLE_SUBMAT));
-
 //////////////////////////////////////////////////////////////////////////////
 // AddWeighted

@ -2234,6 +2526,54 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, AddWeighted, testing::Combine(
    ALL_DEPTH,
    WHOLE_SUBMAT));

+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Threshold
+
+// Test-parameter type wrapping the five threshold operations exercised below.
+CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
+// Value generator listing every ThreshOp for use in INSTANTIATE_TEST_CASE_P.
+#define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))
+
+// Fixture for cv::gpu::threshold; parameters are device, matrix size, matrix type,
+// threshold operation and ROI flag.
+PARAM_TEST_CASE(Threshold, cv::gpu::DeviceInfo, cv::Size, MatType, ThreshOp, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    int threshOp;
+    bool useRoi;
+
+    // Unpacks the test parameters and selects the device under test.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        threshOp = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+// Thresholds a random matrix on the device and requires an exact match with cv::threshold.
+GPU_TEST_P(Threshold, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    // maxVal is drawn first because it is passed as the upper argument when drawing thresh.
+    double maxVal = randomDouble(20.0, 127.0);
+    double thresh = randomDouble(0.0, maxVal);
+
+    cv::gpu::GpuMat dst = createMat(src.size(), src.type(), useRoi);
+    cv::gpu::threshold(loadMat(src, useRoi), dst, thresh, maxVal, threshOp);
+
+    cv::Mat dst_gold;
+    cv::threshold(src, dst_gold, thresh, maxVal, threshOp);
+
+    // Zero tolerance: device and reference results must be identical.
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+// Runs the suite for CV_8UC1, CV_16SC1 and CV_32FC1 over all devices, sizes,
+// threshold operations and ROI modes.
+INSTANTIATE_TEST_CASE_P(GPU_Arithm, Threshold, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_16SC1), MatType(CV_32FC1)),
+    ALL_THRESH_OPS,
+    WHOLE_SUBMAT));
+
 ////////////////////////////////////////////////////////////////////////////////
 // Magnitude

@ -2452,52 +2792,4 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, PolarToCart, testing::Combine(
    testing::Values(AngleInDegrees(false), AngleInDegrees(true)),
    WHOLE_SUBMAT));

-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// Threshold
-
-CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
-#define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))
-
-PARAM_TEST_CASE(Threshold, cv::gpu::DeviceInfo, cv::Size, MatType, ThreshOp, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    int threshOp;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        threshOp = GET_PARAM(3);
-        useRoi = GET_PARAM(4);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Threshold, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    double maxVal = randomDouble(20.0, 127.0);
-    double thresh = randomDouble(0.0, maxVal);
-
-    cv::gpu::GpuMat dst = createMat(src.size(), src.type(), useRoi);
-    cv::gpu::threshold(loadMat(src, useRoi), dst, thresh, maxVal, threshOp);
-
-    cv::Mat dst_gold;
-    cv::threshold(src, dst_gold, thresh, maxVal, threshOp);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Arithm, Threshold, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_16SC1), MatType(CV_32FC1)),
-    ALL_THRESH_OPS,
-    WHOLE_SUBMAT));
-
 #endif // HAVE_CUDA
--- a/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
+++ b/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
@ -321,7 +321,7 @@ private:
    GpuMat colors_;
    GpuMat weights_;

-    Ptr<FilterEngine_GPU> boxFilter_;
+    Ptr<gpu::Filter> boxFilter_;
    GpuMat buf_;
 };

--- a/modules/gpubgsegm/src/fgd.cpp
+++ b/modules/gpubgsegm/src/fgd.cpp
@ -228,11 +228,10 @@ private:
    cv::gpu::GpuMat countBuf_;

    cv::gpu::GpuMat buf_;
-    cv::gpu::GpuMat filterBuf_;
    cv::gpu::GpuMat filterBrd_;

-    cv::Ptr<cv::gpu::FilterEngine_GPU> dilateFilter_;
-    cv::Ptr<cv::gpu::FilterEngine_GPU> erodeFilter_;
+    cv::Ptr<cv::gpu::Filter> dilateFilter_;
+    cv::Ptr<cv::gpu::Filter> erodeFilter_;

    CvMemStorage* storage_;
 };
@ -305,8 +304,8 @@ void cv::gpu::FGDStatModel::Impl::create(const cv::gpu::GpuMat& firstFrame, cons
        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(1 + params_.perform_morphing * 2, 1 + params_.perform_morphing * 2));
        cv::Point anchor(params_.perform_morphing, params_.perform_morphing);

-        dilateFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_DILATE, CV_8UC1, kernel, filterBuf_, anchor);
-        erodeFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_ERODE, CV_8UC1, kernel, filterBuf_, anchor);
+        dilateFilter_ = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, CV_8UC1, kernel, anchor);
+        erodeFilter_ = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, CV_8UC1, kernel, anchor);
    }
 }

@ -326,7 +325,6 @@ void cv::gpu::FGDStatModel::Impl::release()
    countBuf_.release();

    buf_.release();
-    filterBuf_.release();
    filterBrd_.release();
 }

@ -488,14 +486,14 @@ namespace

 namespace
 {
-    void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::FilterEngine_GPU>& filter, cv::Scalar brdVal)
+    void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::Filter>& filter, cv::Scalar brdVal)
    {
        cv::gpu::copyMakeBorder(src, filterBrd, brd, brd, brd, brd, cv::BORDER_CONSTANT, brdVal);
-        filter->apply(filterBrd(cv::Rect(brd, brd, src.cols, src.rows)), dst, cv::Rect(0, 0, src.cols, src.rows));
+        filter->apply(filterBrd(cv::Rect(brd, brd, src.cols, src.rows)), dst);
    }

    void smoothForeground(cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& filterBrd, cv::gpu::GpuMat& buf,
-                          cv::Ptr<cv::gpu::FilterEngine_GPU>& erodeFilter, cv::Ptr<cv::gpu::FilterEngine_GPU>& dilateFilter,
+                          cv::Ptr<cv::gpu::Filter>& erodeFilter, cv::Ptr<cv::gpu::Filter>& dilateFilter,
                          const cv::gpu::FGDStatModel::Params& params)
    {
        const int brd = params.perform_morphing;
--- a/modules/gpubgsegm/src/gmg.cpp
+++ b/modules/gpubgsegm/src/gmg.cpp
@ -100,7 +100,7 @@ void cv::gpu::GMG_GPU::initialize(cv::Size frameSize, float min, float max)
    nfeatures_.setTo(cv::Scalar::all(0));

    if (smoothingRadius > 0)
-        boxFilter_ = cv::gpu::createBoxFilter_GPU(CV_8UC1, CV_8UC1, cv::Size(smoothingRadius, smoothingRadius));
+        boxFilter_ = cv::gpu::createBoxFilter(CV_8UC1, -1, cv::Size(smoothingRadius, smoothingRadius));

    loadConstants(frameSize_.width, frameSize_.height, minVal_, maxVal_, quantizationLevels, backgroundPrior, decisionThreshold, maxFeatures, numInitializationFrames);
 }
@ -141,7 +141,7 @@ void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat
    // medianBlur
    if (smoothingRadius > 0)
    {
-        boxFilter_->apply(fgmask, buf_, cv::Rect(0,0,-1,-1), stream);
+        boxFilter_->apply(fgmask, buf_, stream);
        int minCount = (smoothingRadius * smoothingRadius + 1) / 2;
        double thresh = 255.0 * minCount / (smoothingRadius * smoothingRadius);
        cv::gpu::threshold(buf_, fgmask, thresh, 255.0, cv::THRESH_BINARY, stream);
--- a/modules/gpucodec/doc/videodec.rst
+++ b/modules/gpucodec/doc/videodec.rst
@ -5,20 +5,37 @@ Video Decoding



-gpu::VideoReader_GPU
--------------------
-Video reader class.
+gpucodec::VideoReader
+---------------------
+Video reader interface.

-.. ocv:class:: gpu::VideoReader_GPU
+.. ocv:class:: gpucodec::VideoReader



-gpu::VideoReader_GPU::Codec
---------------------------
+gpucodec::VideoReader::nextFrame
+--------------------------------
+Grabs, decodes and returns the next video frame.

-Video codecs supported by :ocv:class:`gpu::VideoReader_GPU` .
+.. ocv:function:: bool gpucodec::VideoReader::nextFrame(OutputArray frame)

-.. ocv:enum:: gpu::VideoReader_GPU::Codec
+If no frame has been grabbed (there are no more frames in the video file), the method returns ``false`` . The method throws :ocv:class:`Exception` if an error occurs.
+
+
+
+gpucodec::VideoReader::format
+-----------------------------
+Returns information about video file format.
+
+.. ocv:function:: FormatInfo gpucodec::VideoReader::format() const
+
+
+
+gpucodec::Codec
+---------------
+Video codecs supported by :ocv:class:`gpucodec::VideoReader` .
+
+.. ocv:enum:: gpucodec::Codec

  .. ocv:emember:: MPEG1 = 0
  .. ocv:emember:: MPEG2
@ -50,12 +67,12 @@ Video codecs supported by :ocv:class:`gpu::VideoReader_GPU` .
        UYVY (4:2:2)


-gpu::VideoReader_GPU::ChromaFormat
----------------------------------

-Chroma formats supported by :ocv:class:`gpu::VideoReader_GPU` .
+gpucodec::ChromaFormat
+----------------------
+Chroma formats supported by :ocv:class:`gpucodec::VideoReader` .

-.. ocv:enum:: gpu::VideoReader_GPU::ChromaFormat
+.. ocv:enum:: gpucodec::ChromaFormat

  .. ocv:emember:: Monochrome = 0
  .. ocv:emember:: YUV420
@ -63,9 +80,10 @@ Chroma formats supported by :ocv:class:`gpu::VideoReader_GPU` .
  .. ocv:emember:: YUV444


-gpu::VideoReader_GPU::FormatInfo
--------------------------------
-.. ocv:struct:: gpu::VideoReader_GPU::FormatInfo
+
+gpucodec::FormatInfo
+--------------------
+.. ocv:struct:: gpucodec::FormatInfo

 Struct providing information about video file format. ::

@ -78,157 +96,58 @@ Struct providing information about video file format. ::
    };


-gpu::VideoReader_GPU::VideoReader_GPU
-------------------------------------
-Constructors.

-.. ocv:function:: gpu::VideoReader_GPU::VideoReader_GPU()
-.. ocv:function:: gpu::VideoReader_GPU::VideoReader_GPU(const String& filename)
-.. ocv:function:: gpu::VideoReader_GPU::VideoReader_GPU(const cv::Ptr<VideoSource>& source)
+gpucodec::createVideoReader
+---------------------------
+Creates video reader.
+
+.. ocv:function:: Ptr<VideoReader> gpucodec::createVideoReader(const String& filename)
+.. ocv:function:: Ptr<VideoReader> gpucodec::createVideoReader(const Ptr<RawVideoSource>& source)

    :param filename: Name of the input video file.

-    :param source: Video file parser implemented by user.
+    :param source: RAW video source implemented by user.

-The constructors initialize video reader. FFMPEG is used to read videos. User can implement own demultiplexing with :ocv:class:`gpu::VideoReader_GPU::VideoSource` .
+FFMPEG is used to read videos. User can implement own demultiplexing with :ocv:class:`gpucodec::RawVideoSource` .



-gpu::VideoReader_GPU::open
--------------------------
-Initializes or reinitializes video reader.
-
-.. ocv:function:: void gpu::VideoReader_GPU::open(const String& filename)
-.. ocv:function:: void gpu::VideoReader_GPU::open(const cv::Ptr<VideoSource>& source)
-
-The method opens video reader. Parameters are the same as in the constructor :ocv:func:`gpu::VideoReader_GPU::VideoReader_GPU` . The method throws :ocv:class:`Exception` if error occurs.
-
-
-
-gpu::VideoReader_GPU::isOpened
------------------------------
-Returns true if video reader has been successfully initialized.
-
-.. ocv:function:: bool gpu::VideoReader_GPU::isOpened() const
-
-
-
-gpu::VideoReader_GPU::close
---------------------------
-Releases the video reader.
-
-.. ocv:function:: void gpu::VideoReader_GPU::close()
-
-
-
-gpu::VideoReader_GPU::read
--------------------------
-Grabs, decodes and returns the next video frame.
-
-.. ocv:function:: bool gpu::VideoReader_GPU::read(GpuMat& image)
-
-If no frames has been grabbed (there are no more frames in video file), the methods return ``false`` . The method throws :ocv:class:`Exception` if error occurs.
-
-
-
-gpu::VideoReader_GPU::format
----------------------------
-Returns information about video file format.
-
-.. ocv:function:: FormatInfo gpu::VideoReader_GPU::format() const
-
-The method throws :ocv:class:`Exception` if video reader wasn't initialized.
-
-
-
-gpu::VideoReader_GPU::dumpFormat
--------------------------------
-Dump information about video file format to specified stream.
-
-.. ocv:function:: void gpu::VideoReader_GPU::dumpFormat(std::ostream& st)
-
-    :param st: Output stream.
-
-The method throws :ocv:class:`Exception` if video reader wasn't initialized.
-
-
-
-gpu::VideoReader_GPU::VideoSource
-----------------------------------
-.. ocv:class:: gpu::VideoReader_GPU::VideoSource
+gpucodec::RawVideoSource
+------------------------
+.. ocv:class:: gpucodec::RawVideoSource

 Interface for video demultiplexing. ::

-    class VideoSource
+    class RawVideoSource
    {
    public:
-        VideoSource();
-        virtual ~VideoSource() {}
+        virtual ~RawVideoSource() {}
+
+        virtual bool getNextPacket(unsigned char** data, int* size, bool* endOfFile) = 0;

        virtual FormatInfo format() const = 0;
-        virtual void start() = 0;
-        virtual void stop() = 0;
-        virtual bool isStarted() const = 0;
-        virtual bool hasError() const = 0;
-
-    protected:
-        bool parseVideoData(const unsigned char* data, size_t size, bool endOfStream = false);
    };

 User can implement own demultiplexing by implementing this interface.



-gpu::VideoReader_GPU::VideoSource::format
-----------------------------------------
-Returns information about video file format.
-
-.. ocv:function:: virtual FormatInfo gpu::VideoReader_GPU::VideoSource::format() const = 0
-
-
-
-gpu::VideoReader_GPU::VideoSource::start
----------------------------------------
-Starts processing.
-
-.. ocv:function:: virtual void gpu::VideoReader_GPU::VideoSource::start() = 0
-
-Implementation must create own thread with video processing and call periodic :ocv:func:`gpu::VideoReader_GPU::VideoSource::parseVideoData` .
-
-
-
-gpu::VideoReader_GPU::VideoSource::stop
+gpucodec::RawVideoSource::getNextPacket
 ---------------------------------------
-Stops processing.
+Returns next packet with RAW video frame.

-.. ocv:function:: virtual void gpu::VideoReader_GPU::VideoSource::stop() = 0
+.. ocv:function:: virtual bool gpucodec::RawVideoSource::getNextPacket(unsigned char** data, int* size, bool* endOfFile) = 0

-
-
-gpu::VideoReader_GPU::VideoSource::isStarted
--------------------------------------------
-Returns ``true`` if processing was successfully started.
-
-.. ocv:function:: virtual bool gpu::VideoReader_GPU::VideoSource::isStarted() const = 0
-
-
-
-gpu::VideoReader_GPU::VideoSource::hasError
-------------------------------------------
-Returns ``true`` if error occured during processing.
-
-.. ocv:function:: virtual bool gpu::VideoReader_GPU::VideoSource::hasError() const = 0
-
-
-
-gpu::VideoReader_GPU::VideoSource::parseVideoData
-------------------------------------------------
-Parse next video frame. Implementation must call this method after new frame was grabbed.
-
-.. ocv:function:: bool gpu::VideoReader_GPU::VideoSource::parseVideoData(const uchar* data, size_t size, bool endOfStream = false)
-
-    :param data: Pointer to frame data. Can be ``NULL`` if ``endOfStream`` if ``true`` .
+    :param data: Pointer to frame data.

    :param size: Size in bytes of current frame.

    :param endOfStream: Indicates that it is end of stream.
+
+
+
+gpucodec::RawVideoSource::format
+--------------------------------
+Returns information about video file format.
+
+.. ocv:function:: virtual FormatInfo gpucodec::RawVideoSource::format() const = 0
--- a/modules/gpucodec/doc/videoenc.rst
+++ b/modules/gpucodec/doc/videoenc.rst
@ -5,80 +5,25 @@ Video Encoding



-gpu::VideoWriter_GPU
+gpucodec::VideoWriter
 ---------------------
-Video writer class.
+Video writer interface.

-.. ocv:class:: gpu::VideoWriter_GPU
+.. ocv:class:: gpucodec::VideoWriter

-The class uses H264 video codec.
+The implementation uses H264 video codec.

 .. note:: Currently only Windows platform is supported.



-gpu::VideoWriter_GPU::VideoWriter_GPU
-------------------------------------
-Constructors.
-
-.. ocv:function:: gpu::VideoWriter_GPU::VideoWriter_GPU()
-.. ocv:function:: gpu::VideoWriter_GPU::VideoWriter_GPU(const String& fileName, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR)
-.. ocv:function:: gpu::VideoWriter_GPU::VideoWriter_GPU(const String& fileName, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR)
-.. ocv:function:: gpu::VideoWriter_GPU::VideoWriter_GPU(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR)
-.. ocv:function:: gpu::VideoWriter_GPU::VideoWriter_GPU(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR)
-
-    :param fileName: Name of the output video file. Only AVI file format is supported.
-
-    :param frameSize: Size of the input video frames.
-
-    :param fps: Framerate of the created video stream.
-
-    :param params: Encoder parameters. See :ocv:struct:`gpu::VideoWriter_GPU::EncoderParams` .
-
-    :param format: Surface format of input frames ( ``SF_UYVY`` , ``SF_YUY2`` , ``SF_YV12`` , ``SF_NV12`` , ``SF_IYUV`` , ``SF_BGR`` or ``SF_GRAY``). BGR or gray frames will be converted to YV12 format before encoding, frames with other formats will be used as is.
-
-    :param encoderCallback: Callbacks for video encoder. See :ocv:class:`gpu::VideoWriter_GPU::EncoderCallBack` . Use it if you want to work with raw video stream.
-
-The constructors initialize video writer. FFMPEG is used to write videos. User can implement own multiplexing with :ocv:class:`gpu::VideoWriter_GPU::EncoderCallBack` .
-
-
-
-gpu::VideoWriter_GPU::open
--------------------------
-Initializes or reinitializes video writer.
-
-.. ocv:function:: void gpu::VideoWriter_GPU::open(const String& fileName, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR)
-.. ocv:function:: void gpu::VideoWriter_GPU::open(const String& fileName, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR)
-.. ocv:function:: void gpu::VideoWriter_GPU::open(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR)
-.. ocv:function:: void gpu::VideoWriter_GPU::open(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR)
-
-The method opens video writer. Parameters are the same as in the constructor :ocv:func:`gpu::VideoWriter_GPU::VideoWriter_GPU` . The method throws :ocv:class:`Exception` if error occurs.
-
-
-
-gpu::VideoWriter_GPU::isOpened
------------------------------
-Returns true if video writer has been successfully initialized.
-
-.. ocv:function:: bool gpu::VideoWriter_GPU::isOpened() const
-
-
-
-gpu::VideoWriter_GPU::close
---------------------------
-Releases the video writer.
-
-.. ocv:function:: void gpu::VideoWriter_GPU::close()
-
-
-
-gpu::VideoWriter_GPU::write
---------------------------
+gpucodec::VideoWriter::write
+----------------------------
 Writes the next video frame.

-.. ocv:function:: void gpu::VideoWriter_GPU::write(const cv::gpu::GpuMat& image, bool lastFrame = false)
+.. ocv:function:: virtual void gpucodec::VideoWriter::write(InputArray frame, bool lastFrame = false) = 0

-    :param image: The written frame.
+    :param frame: The written frame.

    :param lastFrame: Indicates that it is end of stream. The parameter can be ignored.

@ -86,9 +31,34 @@ The method write the specified image to video file. The image must have the same



-gpu::VideoWriter_GPU::EncoderParams
-----------------------------------
-.. ocv:struct:: gpu::VideoWriter_GPU::EncoderParams
+gpucodec::createVideoWriter
+---------------------------
+Creates video writer.
+
+.. ocv:function:: Ptr<gpucodec::VideoWriter> gpucodec::createVideoWriter(const String& fileName, Size frameSize, double fps, SurfaceFormat format = SF_BGR)
+.. ocv:function:: Ptr<gpucodec::VideoWriter> gpucodec::createVideoWriter(const String& fileName, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR)
+.. ocv:function:: Ptr<gpucodec::VideoWriter> gpucodec::createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, SurfaceFormat format = SF_BGR)
+.. ocv:function:: Ptr<gpucodec::VideoWriter> gpucodec::createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR)
+
+    :param fileName: Name of the output video file. Only AVI file format is supported.
+
+    :param frameSize: Size of the input video frames.
+
+    :param fps: Framerate of the created video stream.
+
+    :param params: Encoder parameters. See :ocv:struct:`gpucodec::EncoderParams` .
+
+    :param format: Surface format of input frames ( ``SF_UYVY`` , ``SF_YUY2`` , ``SF_YV12`` , ``SF_NV12`` , ``SF_IYUV`` , ``SF_BGR`` or ``SF_GRAY``). BGR or gray frames will be converted to YV12 format before encoding, frames with other formats will be used as is.
+
+    :param encoderCallback: Callbacks for video encoder. See :ocv:class:`gpucodec::EncoderCallBack` . Use it if you want to work with raw video stream.
+
+The functions create a video writer. FFMPEG is used to write videos. Users can implement their own multiplexing with :ocv:class:`gpucodec::EncoderCallBack` .
+
+
+
+gpucodec::EncoderParams
+-----------------------
+.. ocv:struct:: gpucodec::EncoderParams

 Different parameters for CUDA video encoder. ::

@ -123,12 +93,12 @@ Different parameters for CUDA video encoder. ::



-gpu::VideoWriter_GPU::EncoderParams::EncoderParams
--------------------------------------------------
+gpucodec::EncoderParams::EncoderParams
+--------------------------------------
 Constructors.

-.. ocv:function:: gpu::VideoWriter_GPU::EncoderParams::EncoderParams()
-.. ocv:function:: gpu::VideoWriter_GPU::EncoderParams::EncoderParams(const String& configFile)
+.. ocv:function:: gpucodec::EncoderParams::EncoderParams()
+.. ocv:function:: gpucodec::EncoderParams::EncoderParams(const String& configFile)

    :param configFile: Config file name.

@ -136,29 +106,29 @@ Creates default parameters or reads parameters from config file.



-gpu::VideoWriter_GPU::EncoderParams::load
-----------------------------------------
+gpucodec::EncoderParams::load
+-----------------------------
 Reads parameters from config file.

-.. ocv:function:: void gpu::VideoWriter_GPU::EncoderParams::load(const String& configFile)
+.. ocv:function:: void gpucodec::EncoderParams::load(const String& configFile)

    :param configFile: Config file name.



-gpu::VideoWriter_GPU::EncoderParams::save
-----------------------------------------
+gpucodec::EncoderParams::save
+-----------------------------
 Saves parameters to config file.

-.. ocv:function:: void gpu::VideoWriter_GPU::EncoderParams::save(const String& configFile) const
+.. ocv:function:: void gpucodec::EncoderParams::save(const String& configFile) const

    :param configFile: Config file name.



-gpu::VideoWriter_GPU::EncoderCallBack
-------------------------------------
-.. ocv:class:: gpu::VideoWriter_GPU::EncoderCallBack
+gpucodec::EncoderCallBack
+-------------------------
+.. ocv:class:: gpucodec::EncoderCallBack

 Callbacks for CUDA video encoder. ::

@ -182,38 +152,38 @@ Callbacks for CUDA video encoder. ::



-gpu::VideoWriter_GPU::EncoderCallBack::acquireBitStream
-------------------------------------------------------
+gpucodec::EncoderCallBack::acquireBitStream
+-------------------------------------------
 Callback function to signal the start of bitstream that is to be encoded.

-.. ocv:function:: virtual uchar* gpu::VideoWriter_GPU::EncoderCallBack::acquireBitStream(int* bufferSize) = 0
+.. ocv:function:: virtual uchar* gpucodec::EncoderCallBack::acquireBitStream(int* bufferSize) = 0

 Callback must allocate buffer for CUDA encoder and return pointer to it and it's size.



-gpu::VideoWriter_GPU::EncoderCallBack::releaseBitStream
-------------------------------------------------------
+gpucodec::EncoderCallBack::releaseBitStream
+-------------------------------------------
 Callback function to signal that the encoded bitstream is ready to be written to file.

-.. ocv:function:: virtual void gpu::VideoWriter_GPU::EncoderCallBack::releaseBitStream(unsigned char* data, int size) = 0
+.. ocv:function:: virtual void gpucodec::EncoderCallBack::releaseBitStream(unsigned char* data, int size) = 0



-gpu::VideoWriter_GPU::EncoderCallBack::onBeginFrame
---------------------------------------------------
+gpucodec::EncoderCallBack::onBeginFrame
+---------------------------------------
 Callback function to signal that the encoding operation on the frame has started.

-.. ocv:function:: virtual void gpu::VideoWriter_GPU::EncoderCallBack::onBeginFrame(int frameNumber, PicType picType) = 0
+.. ocv:function:: virtual void gpucodec::EncoderCallBack::onBeginFrame(int frameNumber, PicType picType) = 0

    :param picType: Specify frame type (I-Frame, P-Frame or B-Frame).



-gpu::VideoWriter_GPU::EncoderCallBack::onEndFrame
-------------------------------------------------
+gpucodec::EncoderCallBack::onEndFrame
+-------------------------------------
 Callback function signals that the encoding operation on the frame has finished.

-.. ocv:function:: virtual void gpu::VideoWriter_GPU::EncoderCallBack::onEndFrame(int frameNumber, PicType picType) = 0
+.. ocv:function:: virtual void gpucodec::EncoderCallBack::onEndFrame(int frameNumber, PicType picType) = 0

    :param picType: Specify frame type (I-Frame, P-Frame or B-Frame).
--- a/modules/gpucodec/include/opencv2/gpucodec.hpp
+++ b/modules/gpucodec/include/opencv2/gpucodec.hpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -47,219 +48,159 @@
 #  error gpucodec.hpp header must be compiled as C++
 #endif

-#include <iosfwd>
-
 #include "opencv2/core/gpu.hpp"

-namespace cv { namespace gpu {
+namespace cv { namespace gpucodec {

 ////////////////////////////////// Video Encoding //////////////////////////////////

-// Works only under Windows
-// Supports olny H264 video codec and AVI files
-class CV_EXPORTS VideoWriter_GPU
+// Works only under Windows.
+// Supports only H264 video codec and AVI files.
+
+enum SurfaceFormat
+{
+    SF_UYVY = 0,
+    SF_YUY2,
+    SF_YV12,
+    SF_NV12,
+    SF_IYUV,
+    SF_BGR,
+    SF_GRAY = SF_BGR
+};
+
+struct CV_EXPORTS EncoderParams
+{
+    int P_Interval;      // NVVE_P_INTERVAL,
+    int IDR_Period;      // NVVE_IDR_PERIOD,
+    int DynamicGOP;      // NVVE_DYNAMIC_GOP,
+    int RCType;          // NVVE_RC_TYPE,
+    int AvgBitrate;      // NVVE_AVG_BITRATE,
+    int PeakBitrate;     // NVVE_PEAK_BITRATE,
+    int QP_Level_Intra;  // NVVE_QP_LEVEL_INTRA,
+    int QP_Level_InterP; // NVVE_QP_LEVEL_INTER_P,
+    int QP_Level_InterB; // NVVE_QP_LEVEL_INTER_B,
+    int DeblockMode;     // NVVE_DEBLOCK_MODE,
+    int ProfileLevel;    // NVVE_PROFILE_LEVEL,
+    int ForceIntra;      // NVVE_FORCE_INTRA,
+    int ForceIDR;        // NVVE_FORCE_IDR,
+    int ClearStat;       // NVVE_CLEAR_STAT,
+    int DIMode;          // NVVE_SET_DEINTERLACE,
+    int Presets;         // NVVE_PRESETS,
+    int DisableCabac;    // NVVE_DISABLE_CABAC,
+    int NaluFramingType; // NVVE_CONFIGURE_NALU_FRAMING_TYPE
+    int DisableSPSPPS;   // NVVE_DISABLE_SPS_PPS
+
+    EncoderParams();
+    explicit EncoderParams(const String& configFile);
+
+    void load(const String& configFile);
+    void save(const String& configFile) const;
+};
+
+class CV_EXPORTS EncoderCallBack
 {
 public:
-    struct EncoderParams;
-
-    // Callbacks for video encoder, use it if you want to work with raw video stream
-    class EncoderCallBack;
-
-    enum SurfaceFormat
+    enum PicType
    {
-        SF_UYVY = 0,
-        SF_YUY2,
-        SF_YV12,
-        SF_NV12,
-        SF_IYUV,
-        SF_BGR,
-        SF_GRAY = SF_BGR
+        IFRAME = 1,
+        PFRAME = 2,
+        BFRAME = 3
    };

-    VideoWriter_GPU();
-    VideoWriter_GPU(const String& fileName, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR);
-    VideoWriter_GPU(const String& fileName, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
-    VideoWriter_GPU(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR);
-    VideoWriter_GPU(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
-    ~VideoWriter_GPU();
+    virtual ~EncoderCallBack() {}

-    // all methods throws cv::Exception if error occurs
-    void open(const String& fileName, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR);
-    void open(const String& fileName, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
-    void open(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, SurfaceFormat format = SF_BGR);
-    void open(const cv::Ptr<EncoderCallBack>& encoderCallback, cv::Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
+    //! callback function to signal the start of bitstream that is to be encoded
+    //! callback must allocate host buffer for CUDA encoder and return pointer to it and its size
+    virtual uchar* acquireBitStream(int* bufferSize) = 0;

-    bool isOpened() const;
-    void close();
+    //! callback function to signal that the encoded bitstream is ready to be written to file
+    virtual void releaseBitStream(unsigned char* data, int size) = 0;

-    void write(const cv::gpu::GpuMat& image, bool lastFrame = false);
+    //! callback function to signal that the encoding operation on the frame has started
+    virtual void onBeginFrame(int frameNumber, PicType picType) = 0;

-    struct CV_EXPORTS EncoderParams
-    {
-        int       P_Interval;      //    NVVE_P_INTERVAL,
-        int       IDR_Period;      //    NVVE_IDR_PERIOD,
-        int       DynamicGOP;      //    NVVE_DYNAMIC_GOP,
-        int       RCType;          //    NVVE_RC_TYPE,
-        int       AvgBitrate;      //    NVVE_AVG_BITRATE,
-        int       PeakBitrate;     //    NVVE_PEAK_BITRATE,
-        int       QP_Level_Intra;  //    NVVE_QP_LEVEL_INTRA,
-        int       QP_Level_InterP; //    NVVE_QP_LEVEL_INTER_P,
-        int       QP_Level_InterB; //    NVVE_QP_LEVEL_INTER_B,
-        int       DeblockMode;     //    NVVE_DEBLOCK_MODE,
-        int       ProfileLevel;    //    NVVE_PROFILE_LEVEL,
-        int       ForceIntra;      //    NVVE_FORCE_INTRA,
-        int       ForceIDR;        //    NVVE_FORCE_IDR,
-        int       ClearStat;       //    NVVE_CLEAR_STAT,
-        int       DIMode;          //    NVVE_SET_DEINTERLACE,
-        int       Presets;         //    NVVE_PRESETS,
-        int       DisableCabac;    //    NVVE_DISABLE_CABAC,
-        int       NaluFramingType; //    NVVE_CONFIGURE_NALU_FRAMING_TYPE
-        int       DisableSPSPPS;   //    NVVE_DISABLE_SPS_PPS
-
-        EncoderParams();
-        explicit EncoderParams(const String& configFile);
-
-        void load(const String& configFile);
-        void save(const String& configFile) const;
-    };
-
-    EncoderParams getParams() const;
-
-    class CV_EXPORTS EncoderCallBack
-    {
-    public:
-        enum PicType
-        {
-            IFRAME = 1,
-            PFRAME = 2,
-            BFRAME = 3
-        };
-
-        virtual ~EncoderCallBack() {}
-
-        // callback function to signal the start of bitstream that is to be encoded
-        // must return pointer to buffer
-        virtual uchar* acquireBitStream(int* bufferSize) = 0;
-
-        // callback function to signal that the encoded bitstream is ready to be written to file
-        virtual void releaseBitStream(unsigned char* data, int size) = 0;
-
-        // callback function to signal that the encoding operation on the frame has started
-        virtual void onBeginFrame(int frameNumber, PicType picType) = 0;
-
-        // callback function signals that the encoding operation on the frame has finished
-        virtual void onEndFrame(int frameNumber, PicType picType) = 0;
-    };
-
-    class Impl;
-
-private:
-    cv::Ptr<Impl> impl_;
+    //! callback function signals that the encoding operation on the frame has finished
+    virtual void onEndFrame(int frameNumber, PicType picType) = 0;
 };

+class CV_EXPORTS VideoWriter
+{
+public:
+    virtual ~VideoWriter() {}
+
+    //! writes the next frame from GPU memory
+    virtual void write(InputArray frame, bool lastFrame = false) = 0;
+
+    virtual EncoderParams getEncoderParams() const = 0;
+};
+
+//! create VideoWriter for specified output file (only AVI file format is supported)
+CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const String& fileName, Size frameSize, double fps, SurfaceFormat format = SF_BGR);
+CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const String& fileName, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
+
+//! create VideoWriter for user-defined callbacks
+CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, SurfaceFormat format = SF_BGR);
+CV_EXPORTS Ptr<VideoWriter> createVideoWriter(const Ptr<EncoderCallBack>& encoderCallback, Size frameSize, double fps, const EncoderParams& params, SurfaceFormat format = SF_BGR);
+
 ////////////////////////////////// Video Decoding //////////////////////////////////////////

-namespace detail
+enum Codec
 {
-    class FrameQueue;
-    class VideoParser;
-}
+    MPEG1 = 0,
+    MPEG2,
+    MPEG4,
+    VC1,
+    H264,
+    JPEG,
+    H264_SVC,
+    H264_MVC,

-class CV_EXPORTS VideoReader_GPU
-{
-public:
-    enum Codec
-    {
-        MPEG1 = 0,
-        MPEG2,
-        MPEG4,
-        VC1,
-        H264,
-        JPEG,
-        H264_SVC,
-        H264_MVC,
-
-        Uncompressed_YUV420 = (('I'<<24)|('Y'<<16)|('U'<<8)|('V')),   // Y,U,V (4:2:0)
-        Uncompressed_YV12   = (('Y'<<24)|('V'<<16)|('1'<<8)|('2')),   // Y,V,U (4:2:0)
-        Uncompressed_NV12   = (('N'<<24)|('V'<<16)|('1'<<8)|('2')),   // Y,UV  (4:2:0)
-        Uncompressed_YUYV   = (('Y'<<24)|('U'<<16)|('Y'<<8)|('V')),   // YUYV/YUY2 (4:2:2)
-        Uncompressed_UYVY   = (('U'<<24)|('Y'<<16)|('V'<<8)|('Y')),   // UYVY (4:2:2)
-    };
-
-    enum ChromaFormat
-    {
-        Monochrome=0,
-        YUV420,
-        YUV422,
-        YUV444,
-    };
-
-    struct FormatInfo
-    {
-        Codec codec;
-        ChromaFormat chromaFormat;
-        int width;
-        int height;
-    };
-
-    class VideoSource;
-
-    VideoReader_GPU();
-    explicit VideoReader_GPU(const String& filename);
-    explicit VideoReader_GPU(const cv::Ptr<VideoSource>& source);
-
-    ~VideoReader_GPU();
-
-    void open(const String& filename);
-    void open(const cv::Ptr<VideoSource>& source);
-    bool isOpened() const;
-
-    void close();
-
-    bool read(GpuMat& image);
-
-    FormatInfo format() const;
-    void dumpFormat(std::ostream& st);
-
-    class CV_EXPORTS VideoSource
-    {
-    public:
-        VideoSource() : frameQueue_(0), videoParser_(0) {}
-        virtual ~VideoSource() {}
-
-        virtual FormatInfo format() const = 0;
-        virtual void start() = 0;
-        virtual void stop() = 0;
-        virtual bool isStarted() const = 0;
-        virtual bool hasError() const = 0;
-
-        void setFrameQueue(detail::FrameQueue* frameQueue) { frameQueue_ = frameQueue; }
-        void setVideoParser(detail::VideoParser* videoParser) { videoParser_ = videoParser; }
-
-    protected:
-        bool parseVideoData(const uchar* data, size_t size, bool endOfStream = false);
-
-    private:
-        VideoSource(const VideoSource&);
-        VideoSource& operator =(const VideoSource&);
-
-        detail::FrameQueue* frameQueue_;
-        detail::VideoParser* videoParser_;
-    };
-
-    class Impl;
-
-private:
-    cv::Ptr<Impl> impl_;
+    Uncompressed_YUV420 = (('I'<<24)|('Y'<<16)|('U'<<8)|('V')),   // Y,U,V (4:2:0)
+    Uncompressed_YV12   = (('Y'<<24)|('V'<<16)|('1'<<8)|('2')),   // Y,V,U (4:2:0)
+    Uncompressed_NV12   = (('N'<<24)|('V'<<16)|('1'<<8)|('2')),   // Y,UV  (4:2:0)
+    Uncompressed_YUYV   = (('Y'<<24)|('U'<<16)|('Y'<<8)|('V')),   // YUYV/YUY2 (4:2:2)
+    Uncompressed_UYVY   = (('U'<<24)|('Y'<<16)|('V'<<8)|('Y'))    // UYVY (4:2:2)
 };

-}} // namespace cv { namespace gpu {
+//! Chroma format of a video stream.
+//! The video sources fill this by static_cast from the integer chroma format
+//! reported by CUVID / FFmpeg, so the numeric values of the enumerators matter.
+enum ChromaFormat
+{
+    Monochrome = 0,
+    YUV420,
+    YUV422,
+    YUV444
+};

-namespace cv {
+//! Basic description of a video stream: codec, chroma format and frame size.
+struct FormatInfo
+{
+    Codec codec;
+    ChromaFormat chromaFormat;
+    int width;   // the CUVID-based source fills this from coded_width
+    int height;  // the CUVID-based source fills this from coded_height
+};

-template <> CV_EXPORTS void Ptr<cv::gpu::VideoWriter_GPU::Impl>::delete_obj();
-template <> CV_EXPORTS void Ptr<cv::gpu::VideoReader_GPU::Impl>::delete_obj();
+class CV_EXPORTS VideoReader
+{
+public:
+    virtual ~VideoReader() {}

-}
+    virtual bool nextFrame(OutputArray frame) = 0;
+
+    virtual FormatInfo format() const = 0;
+};
+
+//! Interface for a user-supplied source of raw video packets; an instance can be
+//! passed to createVideoReader().
+class CV_EXPORTS RawVideoSource
+{
+public:
+    virtual ~RawVideoSource() {}
+
+    //! fetches the next packet through the data/size output pointers and reports
+    //! end of stream through endOfFile
+    //! NOTE(review): the FFmpeg-based source returns false when its reader fails - confirm this is the contract expected from every implementation
+    virtual bool getNextPacket(unsigned char** data, int* size, bool* endOfFile) = 0;
+
+    //! returns the format of the video stream
+    virtual FormatInfo format() const = 0;
+};
+
+//! create VideoReader for specified input file
+CV_EXPORTS Ptr<VideoReader> createVideoReader(const String& filename);
+//! create VideoReader for user-defined raw video source
+CV_EXPORTS Ptr<VideoReader> createVideoReader(const Ptr<RawVideoSource>& source);
+
+}} // namespace cv { namespace gpucodec {

 #endif /* __OPENCV_GPUCODEC_HPP__ */
--- a/modules/gpucodec/perf/perf_video.cpp
+++ b/modules/gpucodec/perf/perf_video.cpp
@ -74,12 +74,11 @@ PERF_TEST_P(FileName, VideoReader, Values("gpu/video/768x576.avi", "gpu/video/19

    if (PERF_RUN_GPU())
    {
-        cv::gpu::VideoReader_GPU d_reader(inputFile);
-        ASSERT_TRUE( d_reader.isOpened() );
+        cv::Ptr<cv::gpucodec::VideoReader> d_reader = cv::gpucodec::createVideoReader(inputFile);

        cv::gpu::GpuMat frame;

-        TEST_CYCLE_N(10) d_reader.read(frame);
+        TEST_CYCLE_N(10) d_reader->nextFrame(frame);

        GPU_SANITY_CHECK(frame);
    }
@ -119,7 +118,7 @@ PERF_TEST_P(FileName, VideoWriter, Values("gpu/video/768x576.avi", "gpu/video/19

    if (PERF_RUN_GPU())
    {
-        cv::gpu::VideoWriter_GPU d_writer;
+        cv::Ptr<cv::gpucodec::VideoWriter> d_writer;

        cv::gpu::GpuMat d_frame;

@ -130,11 +129,11 @@ PERF_TEST_P(FileName, VideoWriter, Values("gpu/video/768x576.avi", "gpu/video/19

            d_frame.upload(frame);

-            if (!d_writer.isOpened())
-                d_writer.open(outputFile, frame.size(), FPS);
+            if (d_writer.empty())
+                d_writer = cv::gpucodec::createVideoWriter(outputFile, frame.size(), FPS);

            startTimer(); next();
-            d_writer.write(d_frame);
+            d_writer->write(d_frame);
            stopTimer();
        }
    }
--- a/modules/gpucodec/src/cuda/nv12_to_rgb.cu
+++ b/modules/gpucodec/src/cuda/nv12_to_rgb.cu
@ -51,12 +51,7 @@

 namespace cv { namespace gpu { namespace cudev
 {
-    __constant__ float constHueColorSpaceMat[9];
-
-    void loadHueCSC(float hueCSC[9])
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(constHueColorSpaceMat, hueCSC, 9 * sizeof(float)) );
-    }
+    __constant__ float constHueColorSpaceMat[9] = {1.1644f, 0.0f, 1.596f, 1.1644f, -0.3918f, -0.813f, 1.1644f, 2.0172f, 0.0f};

    __device__ void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue)
    {
--- a/modules/gpucodec/src/cuvid_video_source.cpp
+++ b/modules/gpucodec/src/cuvid_video_source.cpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -44,7 +45,11 @@

 #ifdef HAVE_NVCUVID

-cv::gpu::detail::CuvidVideoSource::CuvidVideoSource(const String& fname)
+using namespace cv;
+using namespace cv::gpucodec;
+using namespace cv::gpucodec::detail;
+
+cv::gpucodec::detail::CuvidVideoSource::CuvidVideoSource(const String& fname)
 {
    CUVIDSOURCEPARAMS params;
    std::memset(&params, 0, sizeof(CUVIDSOURCEPARAMS));
@ -55,51 +60,51 @@ cv::gpu::detail::CuvidVideoSource::CuvidVideoSource(const String& fname)
    params.pfnAudioDataHandler = 0;

    // now create the actual source
-    CUresult res = cuvidCreateVideoSource(&videoSource_, fname.c_str(), &params);
-    if (res == CUDA_ERROR_INVALID_SOURCE)
-        throw std::runtime_error("Unsupported video source");
-    cuSafeCall( res );
+    CUresult cuRes = cuvidCreateVideoSource(&videoSource_, fname.c_str(), &params);
+    // An invalid source is reported with a descriptive std::runtime_error (same
+    // wording as the FFmpeg-based source) instead of an empty message; every
+    // other failure code goes through cuSafeCall.
+    if (cuRes == CUDA_ERROR_INVALID_SOURCE)
+        throw std::runtime_error("Unsupported video source");
+    cuSafeCall( cuRes );

    CUVIDEOFORMAT vidfmt;
    cuSafeCall( cuvidGetSourceVideoFormat(videoSource_, &vidfmt, 0) );

-    format_.codec = static_cast<VideoReader_GPU::Codec>(vidfmt.codec);
-    format_.chromaFormat = static_cast<VideoReader_GPU::ChromaFormat>(vidfmt.chroma_format);
+    format_.codec = static_cast<Codec>(vidfmt.codec);
+    format_.chromaFormat = static_cast<ChromaFormat>(vidfmt.chroma_format);
    format_.width = vidfmt.coded_width;
    format_.height = vidfmt.coded_height;
 }

-cv::gpu::detail::CuvidVideoSource::~CuvidVideoSource()
+cv::gpucodec::detail::CuvidVideoSource::~CuvidVideoSource()
 {
    cuvidDestroyVideoSource(videoSource_);
 }

-cv::gpu::VideoReader_GPU::FormatInfo cv::gpu::detail::CuvidVideoSource::format() const
+FormatInfo cv::gpucodec::detail::CuvidVideoSource::format() const
 {
    return format_;
 }

-void cv::gpu::detail::CuvidVideoSource::start()
+void cv::gpucodec::detail::CuvidVideoSource::start()
 {
    cuSafeCall( cuvidSetVideoSourceState(videoSource_, cudaVideoState_Started) );
 }

-void cv::gpu::detail::CuvidVideoSource::stop()
+void cv::gpucodec::detail::CuvidVideoSource::stop()
 {
    cuSafeCall( cuvidSetVideoSourceState(videoSource_, cudaVideoState_Stopped) );
 }

-bool cv::gpu::detail::CuvidVideoSource::isStarted() const
+bool cv::gpucodec::detail::CuvidVideoSource::isStarted() const
 {
    return (cuvidGetVideoSourceState(videoSource_) == cudaVideoState_Started);
 }

-bool cv::gpu::detail::CuvidVideoSource::hasError() const
+bool cv::gpucodec::detail::CuvidVideoSource::hasError() const
 {
    return (cuvidGetVideoSourceState(videoSource_) == cudaVideoState_Error);
 }

-int CUDAAPI cv::gpu::detail::CuvidVideoSource::HandleVideoData(void* userData, CUVIDSOURCEDATAPACKET* packet)
+int CUDAAPI cv::gpucodec::detail::CuvidVideoSource::HandleVideoData(void* userData, CUVIDSOURCEDATAPACKET* packet)
 {
    CuvidVideoSource* thiz = static_cast<CuvidVideoSource*>(userData);

--- a/modules/gpucodec/src/cuvid_video_source.hpp
+++ b/modules/gpucodec/src/cuvid_video_source.hpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -40,25 +41,25 @@
 //
 //M*/

-#ifndef __CUVUD_VIDEO_SOURCE_H__
-#define __CUVUD_VIDEO_SOURCE_H__
-
-#include "opencv2/core/private.gpu.hpp"
-#include "opencv2/gpucodec.hpp"
-#include "thread.h"
+#ifndef __CUVID_VIDEO_SOURCE_HPP__
+#define __CUVID_VIDEO_SOURCE_HPP__

 #include <nvcuvid.h>

-namespace cv { namespace gpu { namespace detail
+#include "opencv2/core/private.gpu.hpp"
+#include "opencv2/gpucodec.hpp"
+#include "video_source.hpp"
+
+namespace cv { namespace gpucodec { namespace detail
 {

-class CuvidVideoSource : public VideoReader_GPU::VideoSource
+class CuvidVideoSource : public VideoSource
 {
 public:
    explicit CuvidVideoSource(const String& fname);
    ~CuvidVideoSource();

-    VideoReader_GPU::FormatInfo format() const;
+    FormatInfo format() const;
    void start();
    void stop();
    bool isStarted() const;
@ -78,9 +79,9 @@ private:
    static int CUDAAPI HandleVideoData(void* pUserData, CUVIDSOURCEDATAPACKET* pPacket);

    CUvideosource videoSource_;
-    VideoReader_GPU::FormatInfo format_;
+    FormatInfo format_;
 };

 }}}

-#endif // __CUVUD_VIDEO_SOURCE_H__
+#endif // __CUVID_VIDEO_SOURCE_HPP__
--- a/modules/gpucodec/src/ffmpeg_video_source.cpp
+++ b/modules/gpucodec/src/ffmpeg_video_source.cpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -48,6 +49,10 @@
    #include "../src/cap_ffmpeg_impl.hpp"
 #endif

+using namespace cv;
+using namespace cv::gpucodec;
+using namespace cv::gpucodec::detail;
+
 namespace
 {
    Create_InputMediaStream_FFMPEG_Plugin create_InputMediaStream_FFMPEG_p = 0;
@ -94,7 +99,7 @@ namespace
    }
 }

-cv::gpu::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname) :
+cv::gpucodec::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname) :
    stream_(0)
 {
    CV_Assert( init_MediaStream_FFMPEG() );
@ -106,75 +111,33 @@ cv::gpu::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname) :

    stream_ = create_InputMediaStream_FFMPEG_p(fname.c_str(), &codec, &chroma_format, &width, &height);
    if (!stream_)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported video source");
+        CV_Error(Error::StsUnsupportedFormat, "Unsupported video source");

-    format_.codec = static_cast<VideoReader_GPU::Codec>(codec);
-    format_.chromaFormat = static_cast<VideoReader_GPU::ChromaFormat>(chroma_format);
+    format_.codec = static_cast<Codec>(codec);
+    format_.chromaFormat = static_cast<ChromaFormat>(chroma_format);
    format_.width = width;
    format_.height = height;
 }

-cv::gpu::VideoReader_GPU::FormatInfo cv::gpu::detail::FFmpegVideoSource::format() const
+cv::gpucodec::detail::FFmpegVideoSource::~FFmpegVideoSource()
+{
+    // Nothing to release when no stream is held.
+    if (!stream_)
+        return;
+
+    release_InputMediaStream_FFMPEG_p(stream_);
+}
+
+FormatInfo cv::gpucodec::detail::FFmpegVideoSource::format() const
 {
    return format_;
 }

-void cv::gpu::detail::FFmpegVideoSource::start()
+bool cv::gpucodec::detail::FFmpegVideoSource::getNextPacket(unsigned char** data, int* size, bool* bEndOfFile)
 {
-    stop_ = false;
-    hasError_ = false;
-    thread_ = new Thread(readLoop, this);
-}
+    // Start from "not at end of file" so the flag copied to the caller is well
+    // defined even if the FFmpeg reader returns without writing to it.
+    int endOfFile = 0;

-void cv::gpu::detail::FFmpegVideoSource::stop()
-{
-    stop_ = true;
-    thread_->wait();
-    thread_.release();
-}
+    int res = read_InputMediaStream_FFMPEG_p(stream_, data, size, &endOfFile);

-bool cv::gpu::detail::FFmpegVideoSource::isStarted() const
-{
-    return !stop_;
-}
-
-bool cv::gpu::detail::FFmpegVideoSource::hasError() const
-{
-    return hasError_;
-}
-
-void cv::gpu::detail::FFmpegVideoSource::readLoop(void* userData)
-{
-    FFmpegVideoSource* thiz = static_cast<FFmpegVideoSource*>(userData);
-
-    for (;;)
-    {
-        unsigned char* data;
-        int size;
-        int endOfFile;
-
-        if (!read_InputMediaStream_FFMPEG_p(thiz->stream_, &data, &size, &endOfFile))
-        {
-            thiz->hasError_ = !endOfFile;
-            break;
-        }
-
-        if (!thiz->parseVideoData(data, size))
-        {
-            thiz->hasError_ = true;
-            break;
-        }
-
-        if (thiz->stop_)
-            break;
-    }
-
-    thiz->parseVideoData(0, 0, true);
-}
-
-template <> void cv::Ptr<InputMediaStream_FFMPEG>::delete_obj()
-{
-    if (obj) release_InputMediaStream_FFMPEG_p(obj);
+    *bEndOfFile = (endOfFile != 0);
+    return res != 0;
 }

 #endif // HAVE_CUDA
--- a/modules/gpucodec/src/ffmpeg_video_source.hpp
+++ b/modules/gpucodec/src/ffmpeg_video_source.hpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -40,43 +41,31 @@
 //
 //M*/

-#ifndef __FFMPEG_VIDEO_SOURCE_H__
-#define __FFMPEG_VIDEO_SOURCE_H__
+#ifndef __FFMPEG_VIDEO_SOURCE_HPP__
+#define __FFMPEG_VIDEO_SOURCE_HPP__

 #include "opencv2/gpucodec.hpp"
-#include "thread.h"

 struct InputMediaStream_FFMPEG;

-namespace cv { namespace gpu { namespace detail {
+namespace cv { namespace gpucodec { namespace detail {

-class FFmpegVideoSource : public VideoReader_GPU::VideoSource
+class FFmpegVideoSource : public RawVideoSource
 {
 public:
    FFmpegVideoSource(const String& fname);
+    ~FFmpegVideoSource();

-    VideoReader_GPU::FormatInfo format() const;
-    void start();
-    void stop();
-    bool isStarted() const;
-    bool hasError() const;
+    bool getNextPacket(unsigned char** data, int* size, bool* endOfFile);
+
+    FormatInfo format() const;

 private:
-    VideoReader_GPU::FormatInfo format_;
+    FormatInfo format_;

-    cv::Ptr<InputMediaStream_FFMPEG> stream_;
-
-    cv::Ptr<Thread> thread_;
-    volatile bool stop_;
-    volatile bool hasError_;
-
-    static void readLoop(void* userData);
+    InputMediaStream_FFMPEG* stream_;
 };

 }}}

-namespace cv {
-    template <> void Ptr<InputMediaStream_FFMPEG>::delete_obj();
-}
-
-#endif // __FFMPEG_VIDEO_SOURCE_H__
+#endif // __FFMPEG_VIDEO_SOURCE_HPP__
--- a/modules/gpucodec/src/frame_queue.cpp
+++ b/modules/gpucodec/src/frame_queue.cpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -44,16 +45,16 @@

 #ifdef HAVE_NVCUVID

-cv::gpu::detail::FrameQueue::FrameQueue() :
+cv::gpucodec::detail::FrameQueue::FrameQueue() :
    endOfDecode_(0),
    framesInQueue_(0),
    readPosition_(0)
 {
    std::memset(displayQueue_, 0, sizeof(displayQueue_));
-    std::memset((void*)isFrameInUse_, 0, sizeof(isFrameInUse_));
+    std::memset((void*) isFrameInUse_, 0, sizeof(isFrameInUse_));
 }

-bool cv::gpu::detail::FrameQueue::waitUntilFrameAvailable(int pictureIndex)
+bool cv::gpucodec::detail::FrameQueue::waitUntilFrameAvailable(int pictureIndex)
 {
    while (isInUse(pictureIndex))
    {
@ -67,7 +68,7 @@ bool cv::gpu::detail::FrameQueue::waitUntilFrameAvailable(int pictureIndex)
    return true;
 }

-void cv::gpu::detail::FrameQueue::enqueue(const CUVIDPARSERDISPINFO* picParams)
+void cv::gpucodec::detail::FrameQueue::enqueue(const CUVIDPARSERDISPINFO* picParams)
 {
    // Mark the frame as 'in-use' so we don't re-use it for decoding until it is no longer needed
    // for display
@ -98,7 +99,7 @@ void cv::gpu::detail::FrameQueue::enqueue(const CUVIDPARSERDISPINFO* picParams)
    } while (!isEndOfDecode());
 }

-bool cv::gpu::detail::FrameQueue::dequeue(CUVIDPARSERDISPINFO& displayInfo)
+bool cv::gpucodec::detail::FrameQueue::dequeue(CUVIDPARSERDISPINFO& displayInfo)
 {
    AutoLock autoLock(mtx_);

--- a/modules/gpucodec/src/frame_queue.hpp
+++ b/modules/gpucodec/src/frame_queue.hpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -40,15 +41,15 @@
 //
 //M*/

-#ifndef __FRAME_QUEUE_H__
-#define __FRAME_QUEUE_H__
+#ifndef __FRAME_QUEUE_HPP__
+#define __FRAME_QUEUE_HPP__

 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/private.gpu.hpp"

 #include <nvcuvid.h>

-namespace cv { namespace gpu { namespace detail
+namespace cv { namespace gpucodec { namespace detail
 {

 class FrameQueue
@ -94,4 +95,4 @@ private:

 }}}

-#endif // __FRAME_QUEUE_H__
+#endif // __FRAME_QUEUE_HPP__
--- a/modules/gpucodec/src/precomp.hpp
+++ b/modules/gpucodec/src/precomp.hpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -66,12 +67,13 @@
        #include <unistd.h>
    #endif

-    #include "thread.h"
-    #include "ffmpeg_video_source.h"
-    #include "cuvid_video_source.h"
-    #include "frame_queue.h"
-    #include "video_decoder.h"
-    #include "video_parser.h"
+    #include "thread.hpp"
+    #include "video_source.hpp"
+    #include "ffmpeg_video_source.hpp"
+    #include "cuvid_video_source.hpp"
+    #include "frame_queue.hpp"
+    #include "video_decoder.hpp"
+    #include "video_parser.hpp"

    #include "../src/cap_ffmpeg_api.hpp"
 #endif
--- a/modules/gpucodec/src/thread.cpp
+++ b/modules/gpucodec/src/thread.cpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -44,7 +45,7 @@

 #ifdef HAVE_NVCUVID

-using namespace cv::gpu::detail;
+using namespace cv::gpucodec::detail;

 #ifdef WIN32

@ -66,7 +67,7 @@ namespace
    }
 }

-class cv::gpu::detail::Thread::Impl
+class cv::gpucodec::detail::Thread::Impl
 {
 public:
    Impl(Thread::Func func, void* userData)
@ -119,7 +120,7 @@ namespace
    }
 }

-class cv::gpu::detail::Thread::Impl
+class cv::gpucodec::detail::Thread::Impl
 {
 public:
    Impl(Thread::Func func, void* userData)
@ -147,17 +148,17 @@ private:

 #endif

-cv::gpu::detail::Thread::Thread(Func func, void* userData) :
+cv::gpucodec::detail::Thread::Thread(Func func, void* userData) :
    impl_(new Impl(func, userData))
 {
 }

-void cv::gpu::detail::Thread::wait()
+void cv::gpucodec::detail::Thread::wait()
 {
    impl_->wait();
 }

-void cv::gpu::detail::Thread::sleep(int ms)
+void cv::gpucodec::detail::Thread::sleep(int ms)
 {
 #ifdef WIN32
    ::Sleep(ms);
@ -166,7 +167,7 @@ void cv::gpu::detail::Thread::sleep(int ms)
 #endif
 }

-template <> void cv::Ptr<cv::gpu::detail::Thread::Impl>::delete_obj()
+template <> void cv::Ptr<cv::gpucodec::detail::Thread::Impl>::delete_obj()
 {
    if (obj) delete obj;
 }
--- a/modules/gpucodec/src/thread.hpp
+++ b/modules/gpucodec/src/thread.hpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -40,12 +41,12 @@
 //
 //M*/

-#ifndef __THREAD_WRAPPERS_H__
-#define __THREAD_WRAPPERS_H__
+#ifndef __THREAD_WRAPPERS_HPP__
+#define __THREAD_WRAPPERS_HPP__

 #include "opencv2/core.hpp"

-namespace cv { namespace gpu { namespace detail {
+namespace cv { namespace gpucodec { namespace detail {

 class Thread
 {
@ -67,7 +68,7 @@ private:
 }}}

 namespace cv {
-    template <> void Ptr<cv::gpu::detail::Thread::Impl>::delete_obj();
+    template <> void Ptr<cv::gpucodec::detail::Thread::Impl>::delete_obj();
 }

-#endif // __THREAD_WRAPPERS_H__
+#endif // __THREAD_WRAPPERS_HPP__
--- a/modules/gpucodec/src/video_decoder.cpp
+++ b/modules/gpucodec/src/video_decoder.cpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -44,7 +45,7 @@

 #ifdef HAVE_NVCUVID

-void cv::gpu::detail::VideoDecoder::create(const VideoReader_GPU::FormatInfo& videoFormat)
+void cv::gpucodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
 {
    release();

@ -103,7 +104,7 @@ void cv::gpu::detail::VideoDecoder::create(const VideoReader_GPU::FormatInfo& vi
    cuSafeCall( cuvidCreateDecoder(&decoder_, &createInfo_) );
 }

-void cv::gpu::detail::VideoDecoder::release()
+void cv::gpucodec::detail::VideoDecoder::release()
 {
    if (decoder_)
    {
--- a/modules/gpucodec/src/video_decoder.hpp
+++ b/modules/gpucodec/src/video_decoder.hpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -40,21 +41,21 @@
 //
 //M*/

-#ifndef __VIDEO_DECODER_H__
-#define __VIDEO_DECODER_H__
+#ifndef __VIDEO_DECODER_HPP__
+#define __VIDEO_DECODER_HPP__
+
+#include <nvcuvid.h>

 #include "opencv2/core/private.gpu.hpp"
 #include "opencv2/gpucodec.hpp"

-#include <nvcuvid.h>
-
-namespace cv { namespace gpu { namespace detail
+namespace cv { namespace gpucodec { namespace detail
 {

 class VideoDecoder
 {
 public:
-    VideoDecoder(const VideoReader_GPU::FormatInfo& videoFormat, CUvideoctxlock lock) : lock_(lock), decoder_(0)
+    VideoDecoder(const FormatInfo& videoFormat, CUvideoctxlock lock) : lock_(lock), decoder_(0)
    {
        create(videoFormat);
    }
@ -64,7 +65,7 @@ public:
        release();
    }

-    void create(const VideoReader_GPU::FormatInfo& videoFormat);
+    void create(const FormatInfo& videoFormat);
    void release();

    // Get the code-type currently used.
@ -84,17 +85,17 @@ public:
        return cuvidDecodePicture(decoder_, picParams) == CUDA_SUCCESS;
    }

-    cv::gpu::GpuMat mapFrame(int picIdx, CUVIDPROCPARAMS& videoProcParams)
+    gpu::GpuMat mapFrame(int picIdx, CUVIDPROCPARAMS& videoProcParams)
    {
        CUdeviceptr ptr;
        unsigned int pitch;

        cuSafeCall( cuvidMapVideoFrame(decoder_, picIdx, &ptr, &pitch, &videoProcParams) );

-        return GpuMat(targetHeight() * 3 / 2, targetWidth(), CV_8UC1, (void*) ptr, pitch);
+        return gpu::GpuMat(targetHeight() * 3 / 2, targetWidth(), CV_8UC1, (void*) ptr, pitch);
    }

-    void unmapFrame(cv::gpu::GpuMat& frame)
+    void unmapFrame(gpu::GpuMat& frame)
    {
        cuSafeCall( cuvidUnmapVideoFrame(decoder_, (CUdeviceptr) frame.data) );
        frame.release();
@ -108,4 +109,4 @@ private:

 }}}

-#endif // __VIDEO_DECODER_H__
+#endif // __VIDEO_DECODER_HPP__
--- a/modules/gpucodec/src/video_parser.cpp
+++ b/modules/gpucodec/src/video_parser.cpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -44,11 +45,11 @@

 #ifdef HAVE_NVCUVID

-cv::gpu::detail::VideoParser::VideoParser(VideoDecoder* videoDecoder, FrameQueue* frameQueue) :
+cv::gpucodec::detail::VideoParser::VideoParser(VideoDecoder* videoDecoder, FrameQueue* frameQueue) :
    videoDecoder_(videoDecoder), frameQueue_(frameQueue), unparsedPackets_(0), hasError_(false)
 {
    CUVIDPARSERPARAMS params;
-    memset(&params, 0, sizeof(CUVIDPARSERPARAMS));
+    std::memset(&params, 0, sizeof(CUVIDPARSERPARAMS));

    params.CodecType              = videoDecoder->codec();
    params.ulMaxNumDecodeSurfaces = videoDecoder->maxDecodeSurfaces();
@ -61,7 +62,7 @@ cv::gpu::detail::VideoParser::VideoParser(VideoDecoder* videoDecoder, FrameQueue
    cuSafeCall( cuvidCreateVideoParser(&parser_, &params) );
 }

-bool cv::gpu::detail::VideoParser::parseVideoData(const unsigned char* data, size_t size, bool endOfStream)
+bool cv::gpucodec::detail::VideoParser::parseVideoData(const unsigned char* data, size_t size, bool endOfStream)
 {
    CUVIDSOURCEDATAPACKET packet;
    std::memset(&packet, 0, sizeof(CUVIDSOURCEDATAPACKET));
@ -95,7 +96,7 @@ bool cv::gpu::detail::VideoParser::parseVideoData(const unsigned char* data, siz
    return !frameQueue_->isEndOfDecode();
 }

-int CUDAAPI cv::gpu::detail::VideoParser::HandleVideoSequence(void* userData, CUVIDEOFORMAT* format)
+int CUDAAPI cv::gpucodec::detail::VideoParser::HandleVideoSequence(void* userData, CUVIDEOFORMAT* format)
 {
    VideoParser* thiz = static_cast<VideoParser*>(userData);

@ -106,10 +107,10 @@ int CUDAAPI cv::gpu::detail::VideoParser::HandleVideoSequence(void* userData, CU
        format->coded_height  != thiz->videoDecoder_->frameHeight() ||
        format->chroma_format != thiz->videoDecoder_->chromaFormat())
    {
-        VideoReader_GPU::FormatInfo newFormat;
+        FormatInfo newFormat;

-        newFormat.codec = static_cast<VideoReader_GPU::Codec>(format->codec);
-        newFormat.chromaFormat = static_cast<VideoReader_GPU::ChromaFormat>(format->chroma_format);
+        newFormat.codec = static_cast<Codec>(format->codec);
+        newFormat.chromaFormat = static_cast<ChromaFormat>(format->chroma_format);
        newFormat.width = format->coded_width;
        newFormat.height = format->coded_height;

@ -127,7 +128,7 @@ int CUDAAPI cv::gpu::detail::VideoParser::HandleVideoSequence(void* userData, CU
    return true;
 }

-int CUDAAPI cv::gpu::detail::VideoParser::HandlePictureDecode(void* userData, CUVIDPICPARAMS* picParams)
+int CUDAAPI cv::gpucodec::detail::VideoParser::HandlePictureDecode(void* userData, CUVIDPICPARAMS* picParams)
 {
    VideoParser* thiz = static_cast<VideoParser*>(userData);

@ -147,7 +148,7 @@ int CUDAAPI cv::gpu::detail::VideoParser::HandlePictureDecode(void* userData, CU
    return true;
 }

-int CUDAAPI cv::gpu::detail::VideoParser::HandlePictureDisplay(void* userData, CUVIDPARSERDISPINFO* picParams)
+int CUDAAPI cv::gpucodec::detail::VideoParser::HandlePictureDisplay(void* userData, CUVIDPARSERDISPINFO* picParams)
 {
    VideoParser* thiz = static_cast<VideoParser*>(userData);

--- a/modules/gpucodec/src/video_parser.hpp
+++ b/modules/gpucodec/src/video_parser.hpp
@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -40,17 +41,17 @@
 //
 //M*/

-#ifndef __VIDEO_PARSER_H__
-#define __VIDEO_PARSER_H__
-
-#include "opencv2/core/private.gpu.hpp"
-#include "opencv2/gpucodec.hpp"
-#include "frame_queue.h"
-#include "video_decoder.h"
+#ifndef __VIDEO_PARSER_HPP__
+#define __VIDEO_PARSER_HPP__

 #include <nvcuvid.h>

-namespace cv { namespace gpu { namespace detail
+#include "opencv2/core/private.gpu.hpp"
+#include "opencv2/gpucodec.hpp"
+#include "frame_queue.hpp"
+#include "video_decoder.hpp"
+
+namespace cv { namespace gpucodec { namespace detail
 {

 class VideoParser
@ -91,4 +92,4 @@ private:

 }}}

-#endif // __VIDEO_PARSER_H__
+#endif // __VIDEO_PARSER_HPP__
--- a/modules/gpucodec/src/video_reader.cpp
+++ b/modules/gpucodec/src/video_reader.cpp
@ -42,88 +42,77 @@

 #include "precomp.hpp"

+using namespace cv;
+using namespace cv::gpu;
+using namespace cv::gpucodec;
+
 #ifndef HAVE_NVCUVID

-class cv::gpu::VideoReader_GPU::Impl
-{
-};
-
-cv::gpu::VideoReader_GPU::VideoReader_GPU() { throw_no_cuda(); }
-cv::gpu::VideoReader_GPU::VideoReader_GPU(const String&) { throw_no_cuda(); }
-cv::gpu::VideoReader_GPU::VideoReader_GPU(const cv::Ptr<VideoSource>&) { throw_no_cuda(); }
-cv::gpu::VideoReader_GPU::~VideoReader_GPU() { }
-void cv::gpu::VideoReader_GPU::open(const String&) { throw_no_cuda(); }
-void cv::gpu::VideoReader_GPU::open(const cv::Ptr<VideoSource>&) { throw_no_cuda(); }
-bool cv::gpu::VideoReader_GPU::isOpened() const { return false; }
-void cv::gpu::VideoReader_GPU::close() { }
-bool cv::gpu::VideoReader_GPU::read(GpuMat&) { throw_no_cuda(); return false; }
-cv::gpu::VideoReader_GPU::FormatInfo cv::gpu::VideoReader_GPU::format() const { throw_no_cuda(); FormatInfo format_ = {MPEG1,Monochrome,0,0}; return format_; }
-bool cv::gpu::VideoReader_GPU::VideoSource::parseVideoData(const unsigned char*, size_t, bool) { throw_no_cuda(); return false; }
-void cv::gpu::VideoReader_GPU::dumpFormat(std::ostream&) { throw_no_cuda(); }
+// Stub used when the library is built without NVCUVID: it only calls throw_no_cuda().
+// The trailing return presumably exists just to give the function a return value
+// on every path (throw_no_cuda() is expected to throw — confirm in private.gpu.hpp).
+Ptr<VideoReader> cv::gpucodec::createVideoReader(const String&) { throw_no_cuda(); return Ptr<VideoReader>(); }
+// Stub for the raw-source overload in builds without NVCUVID: calls throw_no_cuda()
+// and otherwise yields an empty reader pointer.
+Ptr<VideoReader> cv::gpucodec::createVideoReader(const Ptr<RawVideoSource>&) { throw_no_cuda(); return Ptr<VideoReader>(); }

 #else // HAVE_NVCUVID

-class cv::gpu::VideoReader_GPU::Impl
-{
-public:
-    explicit Impl(const cv::Ptr<cv::gpu::VideoReader_GPU::VideoSource>& source);
-    ~Impl();
-
-    bool grab(cv::gpu::GpuMat& frame);
-
-    cv::gpu::VideoReader_GPU::FormatInfo format() const { return videoSource_->format(); }
-
-private:
-    cv::Ptr<cv::gpu::VideoReader_GPU::VideoSource> videoSource_;
-
-    cv::Ptr<cv::gpu::detail::FrameQueue> frameQueue_;
-    cv::Ptr<cv::gpu::detail::VideoDecoder> videoDecoder_;
-    cv::Ptr<cv::gpu::detail::VideoParser> videoParser_;
-
-    CUvideoctxlock lock_;
-
-    std::deque< std::pair<CUVIDPARSERDISPINFO, CUVIDPROCPARAMS> > frames_;
-};
-
-cv::gpu::VideoReader_GPU::Impl::Impl(const cv::Ptr<VideoSource>& source) :
-    videoSource_(source),
-    lock_(0)
-{
-    // init context
-    GpuMat temp(1, 1, CV_8UC1);
-    temp.release();
-
-    DeviceInfo devInfo;
-    CV_Assert( devInfo.supports(FEATURE_SET_COMPUTE_11) );
-
-    CUcontext ctx;
-    cuSafeCall( cuCtxGetCurrent(&ctx) );
-    cuSafeCall( cuvidCtxLockCreate(&lock_, ctx) );
-
-    frameQueue_ = new detail::FrameQueue;
-    videoDecoder_ = new detail::VideoDecoder(videoSource_->format(), lock_);
-    videoParser_ = new detail::VideoParser(videoDecoder_, frameQueue_);
-
-    videoSource_->setFrameQueue(frameQueue_);
-    videoSource_->setVideoParser(videoParser_);
-
-    videoSource_->start();
-}
-
-cv::gpu::VideoReader_GPU::Impl::~Impl()
-{
-    frameQueue_->endDecode();
-    videoSource_->stop();
-}
-
 namespace cv { namespace gpu { namespace cudev
 {
-    void loadHueCSC(float hueCSC[9]);
    void NV12_to_RGB(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream = 0);
 }}}

 namespace
 {
+    // Concrete VideoReader returned by createVideoReader().
+    // It ties together a video source, a frame queue, a decoder and a parser,
+    // and hands out decoded frames one at a time through nextFrame().
+    class VideoReaderImpl : public VideoReader
+    {
+    public:
+        // Builds the decoding pipeline around `source` and starts the source.
+        explicit VideoReaderImpl(const Ptr<detail::VideoSource>& source);
+        // Signals end of decoding to the frame queue and stops the source.
+        ~VideoReaderImpl();
+
+        // Writes the next decoded frame into `frame`; returns false when no
+        // further frame is available.
+        bool nextFrame(OutputArray frame);
+
+        // Format description forwarded from the video source.
+        FormatInfo format() const;
+
+    private:
+        // Producer of encoded data; feeds the parser once started.
+        Ptr<detail::VideoSource> videoSource_;
+
+        // Pipeline stages created in the constructor.
+        Ptr<detail::FrameQueue> frameQueue_;
+        Ptr<detail::VideoDecoder> videoDecoder_;
+        Ptr<detail::VideoParser> videoParser_;
+
+        // Context lock created with cuvidCtxLockCreate() and passed to the decoder.
+        CUvideoctxlock lock_;
+
+        // Fields of the most recently dequeued picture that still have to be
+        // returned by nextFrame(), paired with their processing parameters.
+        std::deque< std::pair<CUVIDPARSERDISPINFO, CUVIDPROCPARAMS> > frames_;
+    };
+
+    // Reports the stream format exactly as the underlying video source describes it.
+    FormatInfo VideoReaderImpl::format() const
+    {
+        const FormatInfo sourceFormat = videoSource_->format();
+        return sourceFormat;
+    }
+
+    // Sets up the whole decoding pipeline for `source` and starts reading.
+    // The statement order matters: the context lock must exist before the decoder
+    // is created, the parser needs both the decoder and the frame queue, and the
+    // source may only be started once the parser has been attached to it.
+    VideoReaderImpl::VideoReaderImpl(const Ptr<detail::VideoSource>& source) :
+        videoSource_(source),
+        lock_(0)
+    {
+        // init context
+        // (allocating and releasing a tiny GpuMat presumably forces the CUDA runtime
+        // to create a context so that cuCtxGetCurrent() below returns it — confirm)
+        GpuMat temp(1, 1, CV_8UC1);
+        temp.release();
+
+        // Create the lock that is handed to the decoder below.
+        CUcontext ctx;
+        cuSafeCall( cuCtxGetCurrent(&ctx) );
+        cuSafeCall( cuvidCtxLockCreate(&lock_, ctx) );
+
+        frameQueue_ = new detail::FrameQueue;
+        videoDecoder_ = new detail::VideoDecoder(videoSource_->format(), lock_);
+        videoParser_ = new detail::VideoParser(videoDecoder_, frameQueue_);
+
+        // Attach the parser first, then let the source begin producing data.
+        videoSource_->setVideoParser(videoParser_);
+        videoSource_->start();
+    }
+
+    // Shuts the pipeline down: marks the frame queue as finished first, then stops
+    // the video source.
+    // NOTE(review): lock_ is created with cuvidCtxLockCreate() in the constructor, but
+    // no matching cuvidCtxLockDestroy() call is visible here — confirm whether the lock
+    // is released elsewhere or leaks.
+    VideoReaderImpl::~VideoReaderImpl()
+    {
+        frameQueue_->endDecode();
+        videoSource_->stop();
+    }
+
    class VideoCtxAutoLock
    {
    public:
@ -134,259 +123,114 @@ namespace
        CUvideoctxlock m_lock;
    };

-    enum ColorSpace
-    {
-        ITU601 = 1,
-        ITU709 = 2
-    };
-
-    void setColorSpaceMatrix(ColorSpace CSC, float hueCSC[9], float hue)
-    {
-        float hueSin = std::sin(hue);
-        float hueCos = std::cos(hue);
-
-        if (CSC == ITU601)
-        {
-            //CCIR 601
-            hueCSC[0] = 1.1644f;
-            hueCSC[1] = hueSin * 1.5960f;
-            hueCSC[2] = hueCos * 1.5960f;
-            hueCSC[3] = 1.1644f;
-            hueCSC[4] = (hueCos * -0.3918f) - (hueSin * 0.8130f);
-            hueCSC[5] = (hueSin *  0.3918f) - (hueCos * 0.8130f);
-            hueCSC[6] = 1.1644f;
-            hueCSC[7] = hueCos *  2.0172f;
-            hueCSC[8] = hueSin * -2.0172f;
-        }
-        else if (CSC == ITU709)
-        {
-            //CCIR 709
-            hueCSC[0] = 1.0f;
-            hueCSC[1] = hueSin * 1.57480f;
-            hueCSC[2] = hueCos * 1.57480f;
-            hueCSC[3] = 1.0;
-            hueCSC[4] = (hueCos * -0.18732f) - (hueSin * 0.46812f);
-            hueCSC[5] = (hueSin *  0.18732f) - (hueCos * 0.46812f);
-            hueCSC[6] = 1.0f;
-            hueCSC[7] = hueCos *  1.85560f;
-            hueCSC[8] = hueSin * -1.85560f;
-        }
-    }
-
-    void cudaPostProcessFrame(const cv::gpu::GpuMat& decodedFrame, cv::gpu::GpuMat& interopFrame, int width, int height)
+    void cudaPostProcessFrame(const GpuMat& decodedFrame, OutputArray _outFrame, int width, int height)
    {
        using namespace cv::gpu::cudev;

-        static bool updateCSC = true;
-        static float hueColorSpaceMat[9];
-
-        // Upload the Color Space Conversion Matrices
-        if (updateCSC)
-        {
-            const ColorSpace colorSpace = ITU601;
-            const float hue = 0.0f;
-
-            // CCIR 601/709
-            setColorSpaceMatrix(colorSpace, hueColorSpaceMat, hue);
-
-            updateCSC = false;
-        }
-
        // Final Stage: NV12toARGB color space conversion

-        interopFrame.create(height, width, CV_8UC4);
+        _outFrame.create(height, width, CV_8UC4);
+        GpuMat outFrame = _outFrame.getGpuMat();

-        loadHueCSC(hueColorSpaceMat);
-
-        NV12_to_RGB(decodedFrame, interopFrame);
+        NV12_to_RGB(decodedFrame, outFrame);
    }
-}

-bool cv::gpu::VideoReader_GPU::Impl::grab(GpuMat& frame)
-{
-    if (videoSource_->hasError() || videoParser_->hasError())
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported video source");
-
-    if (!videoSource_->isStarted() || frameQueue_->isEndOfDecode())
-        return false;
-
-    if (frames_.empty())
+    bool VideoReaderImpl::nextFrame(OutputArray frame)
    {
-        CUVIDPARSERDISPINFO displayInfo;
+        if (videoSource_->hasError() || videoParser_->hasError())
+            CV_Error(Error::StsUnsupportedFormat, "Unsupported video source");

-        for (;;)
+        if (!videoSource_->isStarted() || frameQueue_->isEndOfDecode())
+            return false;
+
+        if (frames_.empty())
        {
-            if (frameQueue_->dequeue(displayInfo))
-                break;
+            CUVIDPARSERDISPINFO displayInfo;

-            if (videoSource_->hasError() || videoParser_->hasError())
-                CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported video source");
+            for (;;)
+            {
+                if (frameQueue_->dequeue(displayInfo))
+                    break;

-            if (frameQueue_->isEndOfDecode())
-                return false;
+                if (videoSource_->hasError() || videoParser_->hasError())
+                    CV_Error(Error::StsUnsupportedFormat, "Unsupported video source");

-            // Wait a bit
-            detail::Thread::sleep(1);
+                if (frameQueue_->isEndOfDecode())
+                    return false;
+
+                // Wait a bit
+                detail::Thread::sleep(1);
+            }
+
+            bool isProgressive = displayInfo.progressive_frame != 0;
+            const int num_fields = isProgressive ? 1 : 2 + displayInfo.repeat_first_field;
+
+            for (int active_field = 0; active_field < num_fields; ++active_field)
+            {
+                CUVIDPROCPARAMS videoProcParams;
+                std::memset(&videoProcParams, 0, sizeof(CUVIDPROCPARAMS));
+
+                videoProcParams.progressive_frame = displayInfo.progressive_frame;
+                videoProcParams.second_field      = active_field;
+                videoProcParams.top_field_first   = displayInfo.top_field_first;
+                videoProcParams.unpaired_field    = (num_fields == 1);
+
+                frames_.push_back(std::make_pair(displayInfo, videoProcParams));
+            }
        }

-        bool isProgressive = displayInfo.progressive_frame != 0;
-        const int num_fields = isProgressive ? 1 : 2 + displayInfo.repeat_first_field;
+        if (frames_.empty())
+            return false;
+
+        std::pair<CUVIDPARSERDISPINFO, CUVIDPROCPARAMS> frameInfo = frames_.front();
+        frames_.pop_front();

-        for (int active_field = 0; active_field < num_fields; ++active_field)
        {
-            CUVIDPROCPARAMS videoProcParams;
-            std::memset(&videoProcParams, 0, sizeof(CUVIDPROCPARAMS));
+            VideoCtxAutoLock autoLock(lock_);

-            videoProcParams.progressive_frame = displayInfo.progressive_frame;
-            videoProcParams.second_field      = active_field;
-            videoProcParams.top_field_first   = displayInfo.top_field_first;
-            videoProcParams.unpaired_field    = (num_fields == 1);
+            // map decoded video frame to CUDA surface
+            GpuMat decodedFrame = videoDecoder_->mapFrame(frameInfo.first.picture_index, frameInfo.second);

-            frames_.push_back(std::make_pair(displayInfo, videoProcParams));
+            // perform post processing on the CUDA surface (performs colors space conversion and post processing)
+            // comment this out if we inclue the line of code seen above
+            cudaPostProcessFrame(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight());
+
+            // unmap video frame
+            // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
+            videoDecoder_->unmapFrame(decodedFrame);
        }
+
+        // release the frame, so it can be re-used in decoder
+        if (frames_.empty())
+            frameQueue_->releaseFrame(frameInfo.first);
+
+        return true;
    }
-
-    if (frames_.empty())
-        return false;
-
-    std::pair<CUVIDPARSERDISPINFO, CUVIDPROCPARAMS> frameInfo = frames_.front();
-    frames_.pop_front();
-
-    {
-        VideoCtxAutoLock autoLock(lock_);
-
-        // map decoded video frame to CUDA surface
-        cv::gpu::GpuMat decodedFrame = videoDecoder_->mapFrame(frameInfo.first.picture_index, frameInfo.second);
-
-        // perform post processing on the CUDA surface (performs colors space conversion and post processing)
-        // comment this out if we inclue the line of code seen above
-        cudaPostProcessFrame(decodedFrame, frame, videoDecoder_->targetWidth(), videoDecoder_->targetHeight());
-
-        // unmap video frame
-        // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
-        videoDecoder_->unmapFrame(decodedFrame);
-    }
-
-    // release the frame, so it can be re-used in decoder
-    if (frames_.empty())
-        frameQueue_->releaseFrame(frameInfo.first);
-
-    return true;
 }

-////////////////////////////////////////////////////////////////////////////
-
-cv::gpu::VideoReader_GPU::VideoReader_GPU()
-{
-}
-
-cv::gpu::VideoReader_GPU::VideoReader_GPU(const String& filename)
-{
-    open(filename);
-}
-
-cv::gpu::VideoReader_GPU::VideoReader_GPU(const cv::Ptr<VideoSource>& source)
-{
-    open(source);
-}
-
-cv::gpu::VideoReader_GPU::~VideoReader_GPU()
-{
-    close();
-}
-
-void cv::gpu::VideoReader_GPU::open(const String& filename)
+Ptr<VideoReader> cv::gpucodec::createVideoReader(const String& filename)
 {
    CV_Assert( !filename.empty() );

-#ifndef __APPLE__
+    Ptr<detail::VideoSource> videoSource;
+
    try
    {
-        cv::Ptr<VideoSource> source(new detail::CuvidVideoSource(filename));
-        open(source);
+        videoSource = new detail::CuvidVideoSource(filename);
    }
-    catch (const std::runtime_error&)
-#endif
+    catch (...)
    {
-        cv::Ptr<VideoSource> source(new cv::gpu::detail::FFmpegVideoSource(filename));
-        open(source);
-    }
-}
-
-void cv::gpu::VideoReader_GPU::open(const cv::Ptr<VideoSource>& source)
-{
-    CV_Assert( !source.empty() );
-    close();
-    impl_ = new Impl(source);
-}
-
-bool cv::gpu::VideoReader_GPU::isOpened() const
-{
-    return !impl_.empty();
-}
-
-void cv::gpu::VideoReader_GPU::close()
-{
-    impl_.release();
-}
-
-bool cv::gpu::VideoReader_GPU::read(GpuMat& image)
-{
-    if (!isOpened())
-        return false;
-
-    if (!impl_->grab(image))
-    {
-        close();
-        return false;
+        Ptr<RawVideoSource> source(new detail::FFmpegVideoSource(filename));
+        videoSource = new detail::RawVideoSourceWrapper(source);
    }

-    return true;
+    return new VideoReaderImpl(videoSource);
 }

-cv::gpu::VideoReader_GPU::FormatInfo cv::gpu::VideoReader_GPU::format() const
+Ptr<VideoReader> cv::gpucodec::createVideoReader(const Ptr<RawVideoSource>& source)
 {
-    CV_Assert( isOpened() );
-    return impl_->format();
-}
-
-bool cv::gpu::VideoReader_GPU::VideoSource::parseVideoData(const unsigned char* data, size_t size, bool endOfStream)
-{
-    return videoParser_->parseVideoData(data, size, endOfStream);
-}
-
-void cv::gpu::VideoReader_GPU::dumpFormat(std::ostream& st)
-{
-    static const char* codecs[] =
-    {
-        "MPEG1",
-        "MPEG2",
-        "MPEG4",
-        "VC1",
-        "H264",
-        "JPEG",
-        "H264_SVC",
-        "H264_MVC"
-    };
-
-    static const char* chromas[] =
-    {
-        "Monochrome",
-        "YUV420",
-        "YUV422",
-        "YUV444"
-    };
-
-    FormatInfo _format = this->format();
-
-    st << "Frame Size    : " << _format.width << "x" << _format.height << std::endl;
-    st << "Codec         : " << (_format.codec <= H264_MVC ? codecs[_format.codec] : "Uncompressed YUV") << std::endl;
-    st << "Chroma Format : " << chromas[_format.chromaFormat] << std::endl;
+    Ptr<detail::VideoSource> videoSource(new detail::RawVideoSourceWrapper(source));
+    return new VideoReaderImpl(videoSource);
 }

 #endif // HAVE_NVCUVID
-
-template <> void cv::Ptr<cv::gpu::VideoReader_GPU::Impl>::delete_obj()
-{
-    if (obj) delete obj;
-}
--- a/modules/gpucodec/src/video_source.cpp
+++ b/modules/gpucodec/src/video_source.cpp
@ -0,0 +1,121 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+#ifdef HAVE_NVCUVID
+
+using namespace cv;
+using namespace cv::gpucodec;
+using namespace cv::gpucodec::detail;
+
+// Hands a chunk of the encoded stream to the parser attached via setVideoParser()
+// and passes the parser's verdict back to the caller unchanged.
+bool cv::gpucodec::detail::VideoSource::parseVideoData(const unsigned char* data, size_t size, bool endOfStream)
+{
+    const bool parsed = videoParser_->parseVideoData(data, size, endOfStream);
+    return parsed;
+}
+
+// Wraps a user-supplied raw packet source, which must not be empty.
+// The wrapper starts out idle: stop_ is set so that isStarted() reports false, and
+// hasError_ is cleared, so both queries return well-defined values even before
+// start() is called.
+cv::gpucodec::detail::RawVideoSourceWrapper::RawVideoSourceWrapper(const Ptr<RawVideoSource>& source) :
+    source_(source),
+    stop_(true),
+    hasError_(false)
+{
+    CV_Assert( !source_.empty() );
+}
+
+// Forwards the format query to the wrapped raw source.
+cv::gpucodec::FormatInfo cv::gpucodec::detail::RawVideoSourceWrapper::format() const
+{
+    const cv::gpucodec::FormatInfo wrappedFormat = source_->format();
+    return wrappedFormat;
+}
+
+// Clears the stop and error flags, then creates the Thread object that runs
+// readLoop() with this wrapper as its argument (the thread is presumably started
+// on construction — see thread.hpp).
+// The flags are reset before the Thread is created so the loop never sees a
+// stale stop_ value.
+void cv::gpucodec::detail::RawVideoSourceWrapper::start()
+{
+    stop_ = false;
+    hasError_ = false;
+    thread_ = new Thread(readLoop, this);
+}
+
+// Asks the reader loop to finish and waits for its thread to end.
+// Safe to call when no thread exists (stop() without a prior start(), or a second
+// stop()): in that case only the stop flag is set.
+void cv::gpucodec::detail::RawVideoSourceWrapper::stop()
+{
+    // readLoop() polls this flag after every packet.
+    stop_ = true;
+
+    if (!thread_.empty())
+    {
+        thread_->wait();
+        thread_.release();
+    }
+}
+
+// Reports the inverse of the stop_ flag (cleared by start(), set by stop()).
+bool cv::gpucodec::detail::RawVideoSourceWrapper::isStarted() const
+{
+    if (stop_)
+        return false;
+
+    return true;
+}
+
+bool cv::gpucodec::detail::RawVideoSourceWrapper::hasError() const
+{
+    return hasError_;
+}
+
+// Thread entry point: pulls packets from the wrapped RawVideoSource and feeds them
+// to the parser until the source has no more packets, an error occurs, or stop()
+// has been requested.
+// userData is the RawVideoSourceWrapper that created the thread (see start()).
+void cv::gpucodec::detail::RawVideoSourceWrapper::readLoop(void* userData)
+{
+    RawVideoSourceWrapper* thiz = static_cast<RawVideoSourceWrapper*>(userData);
+
+    for (;;)
+    {
+        // Start from well-defined values so that a source which fails without
+        // writing its out-parameters cannot make us read indeterminate memory.
+        unsigned char* data = 0;
+        int size = 0;
+        bool endOfFile = false;
+
+        if (!thiz->source_->getNextPacket(&data, &size, &endOfFile))
+        {
+            // A failed read counts as an error only if the source did not report end of file.
+            thiz->hasError_ = !endOfFile;
+            break;
+        }
+
+        if (!thiz->parseVideoData(data, size))
+        {
+            thiz->hasError_ = true;
+            break;
+        }
+
+        if (thiz->stop_)
+            break;
+    }
+
+    // Always signal end-of-stream to the parser, whatever the reason for leaving the loop.
+    thiz->parseVideoData(0, 0, true);
+}
+
+#endif // HAVE_NVCUVID
--- a/modules/gpucodec/src/video_source.hpp
+++ b/modules/gpucodec/src/video_source.hpp
@ -0,0 +1,99 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __GPUCODEC_VIDEO_SOURCE_H__
+#define __GPUCODEC_VIDEO_SOURCE_H__
+
+#include "opencv2/core/private.gpu.hpp"
+#include "opencv2/gpucodec.hpp"
+#include "thread.hpp"
+
+namespace cv { namespace gpucodec { namespace detail
+{
+
+class VideoParser;
+
+// Internal interface for objects that produce encoded video data and push it
+// into a VideoParser.
+class VideoSource
+{
+public:
+    // No parser is attached until setVideoParser() is called; the pointer starts
+    // out null instead of holding an indeterminate value.
+    VideoSource() : videoParser_(0) {}
+    virtual ~VideoSource() {}
+
+    // Description of the stream this source delivers.
+    virtual FormatInfo format() const = 0;
+    // Begin / end producing data.
+    virtual void start() = 0;
+    virtual void stop() = 0;
+    // State queries used by the reader while it waits for frames.
+    virtual bool isStarted() const = 0;
+    virtual bool hasError() const = 0;
+
+    // Attaches the parser that parseVideoData() forwards to. The pointer is stored
+    // as-is; this class does not take ownership of it.
+    void setVideoParser(detail::VideoParser* videoParser) { videoParser_ = videoParser; }
+
+protected:
+    // Forwards a chunk of encoded data to the attached parser and returns its result.
+    // A parser must have been attached with setVideoParser() beforehand.
+    bool parseVideoData(const uchar* data, size_t size, bool endOfStream = false);
+
+private:
+    detail::VideoParser* videoParser_;
+};
+
+// Adapts a public RawVideoSource to the internal VideoSource interface by reading
+// its packets on a separate Thread and passing them to the parser.
+class RawVideoSourceWrapper : public VideoSource
+{
+public:
+    // `source` is the raw packet provider to wrap.
+    RawVideoSourceWrapper(const Ptr<RawVideoSource>& source);
+
+    FormatInfo format() const;
+    void start();
+    void stop();
+    bool isStarted() const;
+    bool hasError() const;
+
+private:
+    // Wrapped packet provider.
+    Ptr<RawVideoSource> source_;
+
+    // Thread object running readLoop(); created by start().
+    Ptr<Thread> thread_;
+    // Flags accessed from both the reader thread and the caller's thread:
+    // stop_ is set by stop() and cleared by start(); hasError_ is raised by readLoop().
+    volatile bool stop_;
+    volatile bool hasError_;
+
+    // Thread entry point; userData is the RawVideoSourceWrapper instance.
+    static void readLoop(void* userData);
+};
+
+}}}
+
+#endif // __GPUCODEC_VIDEO_SOURCE_H__
--- a/modules/gpucodec/src/video_writer.cpp
+++ b/modules/gpucodec/src/video_writer.cpp
--- a/modules/gpucodec/test/test_video.cpp
+++ b/modules/gpucodec/test/test_video.cpp
@ -57,19 +57,15 @@ GPU_TEST_P(Video, Reader)

    const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "video/" + GET_PARAM(1);

-    cv::gpu::VideoReader_GPU reader(inputFile);
-    ASSERT_TRUE(reader.isOpened());
+    cv::Ptr<cv::gpucodec::VideoReader> reader = cv::gpucodec::createVideoReader(inputFile);

    cv::gpu::GpuMat frame;

    for (int i = 0; i < 10; ++i)
    {
-        ASSERT_TRUE(reader.read(frame));
+        ASSERT_TRUE(reader->nextFrame(frame));
        ASSERT_FALSE(frame.empty());
    }
-
-    reader.close();
-    ASSERT_FALSE(reader.isOpened());
 }

 //////////////////////////////////////////////////////
@ -89,7 +85,7 @@ GPU_TEST_P(Video, Writer)
    cv::VideoCapture reader(inputFile);
    ASSERT_TRUE(reader.isOpened());

-    cv::gpu::VideoWriter_GPU d_writer;
+    cv::Ptr<cv::gpucodec::VideoWriter> d_writer;

    cv::Mat frame;
    cv::gpu::GpuMat d_frame;
@ -101,14 +97,14 @@ GPU_TEST_P(Video, Writer)

        d_frame.upload(frame);

-        if (!d_writer.isOpened())
-            d_writer.open(outputFile, frame.size(), FPS);
+        if (d_writer.empty())
+            d_writer = cv::gpucodec::createVideoWriter(outputFile, frame.size(), FPS);

-        d_writer.write(d_frame);
+        d_writer->write(d_frame);
    }

    reader.release();
-    d_writer.close();
+    d_writer.release();

    reader.open(outputFile);
    ASSERT_TRUE(reader.isOpened());
--- a/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
+++ b/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
@ -351,7 +351,7 @@ private:

    FAST_GPU fastDetector_;

-    Ptr<FilterEngine_GPU> blurFilter;
+    Ptr<gpu::Filter> blurFilter;

    GpuMat d_keypoints_;
 };
--- a/modules/gpufeatures2d/src/orb.cpp
+++ b/modules/gpufeatures2d/src/orb.cpp
@ -468,7 +468,7 @@ cv::gpu::ORB_GPU::ORB_GPU(int nFeatures, float scaleFactor, int nLevels, int edg

    pattern_.upload(h_pattern);

-    blurFilter = createGaussianFilter_GPU(CV_8UC1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
+    blurFilter = gpu::createGaussianFilter(CV_8UC1, -1, Size(7, 7), 2, 2, BORDER_REFLECT_101);

    blurForDescriptor = false;
 }
@ -632,7 +632,7 @@ void cv::gpu::ORB_GPU::computeDescriptors(GpuMat& descriptors)
        {
            // preprocess the resized image
            ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
-            blurFilter->apply(imagePyr_[level], buf_, Rect(0, 0, imagePyr_[level].cols, imagePyr_[level].rows));
+            blurFilter->apply(imagePyr_[level], buf_);
        }

        computeOrbDescriptor_gpu(blurForDescriptor ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
--- a/modules/gpufilters/CMakeLists.txt
+++ b/modules/gpufilters/CMakeLists.txt
@ -6,4 +6,4 @@ set(the_description "GPU-accelerated Image Filtering")

 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)

-ocv_define_module(gpufilters opencv_imgproc OPTIONAL opencv_gpuarithm)
+ocv_define_module(gpufilters opencv_imgproc opencv_gpuarithm)
--- a/modules/gpufilters/doc/filtering.rst
+++ b/modules/gpufilters/doc/filtering.rst
@ -7,346 +7,236 @@ Functions and classes described in this section are used to perform various line



-gpu::BaseRowFilter_GPU
----------------------
-.. ocv:class:: gpu::BaseRowFilter_GPU
+gpu::Filter
+-----------
+.. ocv:class:: gpu::Filter

-Base class for linear or non-linear filters that processes rows of 2D arrays. Such filters are used for the "horizontal" filtering passes in separable filters. ::
+Common interface for all GPU filters ::

-    class BaseRowFilter_GPU
+    class CV_EXPORTS Filter : public Algorithm
    {
    public:
-        BaseRowFilter_GPU(int ksize_, int anchor_);
-        virtual ~BaseRowFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        int ksize, anchor;
+        virtual void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
    };


-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.

-
-
-gpu::BaseColumnFilter_GPU
-------------------------
-.. ocv:class:: gpu::BaseColumnFilter_GPU
-
-Base class for linear or non-linear filters that processes columns of 2D arrays. Such filters are used for the "vertical" filtering passes in separable filters. ::
-
-    class BaseColumnFilter_GPU
-    {
-    public:
-        BaseColumnFilter_GPU(int ksize_, int anchor_);
-        virtual ~BaseColumnFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        int ksize, anchor;
-    };
-
-
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
-
-
-
-gpu::BaseFilter_GPU
-------------------
-.. ocv:class:: gpu::BaseFilter_GPU
-
-Base class for non-separable 2D filters. ::
-
-    class CV_EXPORTS BaseFilter_GPU
-    {
-    public:
-        BaseFilter_GPU(const Size& ksize_, const Point& anchor_);
-        virtual ~BaseFilter_GPU() {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-        Size ksize;
-        Point anchor;
-    };
-
-
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
-
-
-
-gpu::FilterEngine_GPU
---------------------
-.. ocv:class:: gpu::FilterEngine_GPU
-
-Base class for the Filter Engine. ::
-
-    class CV_EXPORTS FilterEngine_GPU
-    {
-    public:
-        virtual ~FilterEngine_GPU() {}
-
-        virtual void apply(const GpuMat& src, GpuMat& dst,
-                           Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
-    };
-
-
-The class can be used to apply an arbitrary filtering operation to an image. It contains all the necessary intermediate buffers. Pointers to the initialized ``FilterEngine_GPU`` instances are returned by various ``create*Filter_GPU`` functions (see below), and they are used inside high-level functions such as :ocv:func:`gpu::filter2D`, :ocv:func:`gpu::erode`, :ocv:func:`gpu::Sobel` , and others.
-
-By using ``FilterEngine_GPU`` instead of functions you can avoid unnecessary memory allocation for intermediate buffers and get better performance: ::
-
-    while (...)
-    {
-        gpu::GpuMat src = getImg();
-        gpu::GpuMat dst;
-        // Allocate and release buffers at each iterations
-        gpu::GaussianBlur(src, dst, ksize, sigma1);
-    }
-
-    // Allocate buffers only once
-    cv::Ptr<gpu::FilterEngine_GPU> filter =
-        gpu::createGaussianFilter_GPU(CV_8UC4, ksize, sigma1);
-    while (...)
-    {
-        gpu::GpuMat src = getImg();
-        gpu::GpuMat dst;
-        filter->apply(src, dst, cv::Rect(0, 0, src.cols, src.rows));
-    }
-    // Release buffers only once
-    filter.release();
-
-
-``FilterEngine_GPU`` can process a rectangular sub-region of an image. By default, if ``roi == Rect(0,0,-1,-1)`` , ``FilterEngine_GPU`` processes the inner region of an image ( ``Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height)`` ) because some filters do not check whether indices are outside the image for better performance. See below to understand which filters support processing the whole image and which do not and identify image type limitations.
-
-.. note:: The GPU filters do not support the in-place mode.
-
-.. seealso:: :ocv:class:`gpu::BaseRowFilter_GPU`, :ocv:class:`gpu::BaseColumnFilter_GPU`, :ocv:class:`gpu::BaseFilter_GPU`, :ocv:func:`gpu::createFilter2D_GPU`, :ocv:func:`gpu::createSeparableFilter_GPU`, :ocv:func:`gpu::createBoxFilter_GPU`, :ocv:func:`gpu::createMorphologyFilter_GPU`, :ocv:func:`gpu::createLinearFilter_GPU`, :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`gpu::createDerivFilter_GPU`, :ocv:func:`gpu::createGaussianFilter_GPU`
-
-
-
-gpu::createFilter2D_GPU
---------------------------
-Creates a non-separable filter engine with the specified filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createFilter2D_GPU( const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType)
-
-    :param filter2D: Non-separable 2D filter.
-
-    :param srcType: Input image type. It must be supported by  ``filter2D`` .
-
-    :param dstType: Output image type. It must be supported by  ``filter2D`` .
-
-Usually this function is used inside such high-level functions as :ocv:func:`gpu::createLinearFilter_GPU`, :ocv:func:`gpu::createBoxFilter_GPU`.
-
-
-
-gpu::createSeparableFilter_GPU
----------------------------------
-Creates a separable filter engine with the specified filters.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createSeparableFilter_GPU( const Ptr<BaseRowFilter_GPU>& rowFilter, const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
-
-    :param rowFilter: "Horizontal" 1D filter.
-
-    :param columnFilter: "Vertical" 1D filter.
-
-    :param srcType: Input image type. It must be supported by  ``rowFilter`` .
-
-    :param bufType: Buffer image type. It must be supported by  ``rowFilter``  and  ``columnFilter`` .
-
-    :param dstType: Output image type. It must be supported by  ``columnFilter`` .
-
-Usually this function is used inside such high-level functions as :ocv:func:`gpu::createSeparableLinearFilter_GPU`.
-
-
-
-gpu::getRowSumFilter_GPU
----------------------------
-Creates a horizontal 1D box filter.
-
-.. ocv:function:: Ptr<BaseRowFilter_GPU> gpu::getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1)
-
-    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
-
-    :param sumType: Output image type. Only ``CV_32FC1`` type is supported for now.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-
-
-gpu::getColumnSumFilter_GPU
-------------------------------
-Creates a vertical 1D box filter.
-
-.. ocv:function:: Ptr<BaseColumnFilter_GPU> gpu::getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1)
-
-    :param sumType: Input image type. Only ``CV_8UC1`` type is supported for now.
-
-    :param dstType: Output image type. Only ``CV_32FC1`` type is supported for now.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-
-
-gpu::createBoxFilter_GPU
----------------------------
-Creates a normalized 2D box filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor = Point(-1,-1))
-
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1))
-
-    :param srcType: Input image type supporting ``CV_8UC1`` and ``CV_8UC4`` .
-
-    :param dstType: Output image type.  It supports only the same values as the source type.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`boxFilter`
-
-
-
-gpu::boxFilter
+gpu::Filter::apply
 ------------------
-Smooths the image using the normalized box filter.
+Applies the specified filter to the image.

-.. ocv:function:: void gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::Filter::apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0

-    :param src: Input image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
+    :param src: Input image.

-    :param dst: Output image type. The size and type is the same as ``src`` .
-
-    :param ddepth: Output image depth. If -1, the output image has the same depth as the input one. The only values allowed here are ``CV_8U`` and -1.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
+    :param dst: Output image.

    :param stream: Stream for the asynchronous version.

-.. note::    This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+
+
+gpu::createBoxFilter
+--------------------
+Creates a normalized 2D box filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Only ``CV_8UC1`` and ``CV_8UC4`` are supported for now.
+
+    :param dstType: Output image type. Only the same type as ``srcType`` is supported for now.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.

 .. seealso:: :ocv:func:`boxFilter`



-gpu::blur
-------------
-Acts as a synonym for the normalized box filter.
+gpu::createLinearFilter
+-----------------------
+Creates a non-separable linear 2D filter.

-.. ocv:function:: void gpu::blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
+.. ocv:function:: Ptr<Filter> gpu::createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))

-    :param src: Input image.  ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
+    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one- and four-channel images.

-    :param dst: Output image type with the same size and type as  ``src`` .
+    :param dstType: Output image type. Only the same type as ``srcType`` is supported for now.

-    :param ksize: Kernel size.
+    :param kernel: 2D array of filter coefficients.

    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.

-    :param stream: Stream for the asynchronous version.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .

-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderVal: Default border value.

-.. seealso:: :ocv:func:`blur`, :ocv:func:`gpu::boxFilter`
+.. seealso:: :ocv:func:`filter2D`



-gpu::createMorphologyFilter_GPU
-----------------------------------
+gpu::createLaplacianFilter
+--------------------------
+Creates a Laplacian operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one- and four-channel images.
+
+    :param dstType: Output image type. Only the same type as ``srcType`` is supported for now.
+
+    :param ksize: Aperture size used to compute the second-derivative filters (see :ocv:func:`getDerivKernels`). It must be positive and odd. Only  ``ksize``  = 1 and  ``ksize``  = 3 are supported.
+
+    :param scale: Optional scale factor for the computed Laplacian values. By default, no scaling is applied (see  :ocv:func:`getDerivKernels` ).
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
+
+.. seealso:: :ocv:func:`Laplacian`
+
+
+
+gpu::createSeparableLinearFilter
+--------------------------------
+Creates a separable linear filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel, Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source array type.
+
+    :param dstType: Destination array type.
+
+    :param rowKernel: Horizontal filter coefficients. Supports kernels with ``size <= 32`` .
+
+    :param columnKernel: Vertical filter coefficients. Supports kernels with ``size <= 32`` .
+
+    :param anchor: Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center.
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`sepFilter2D`
+
+
+
+gpu::createDerivFilter
+----------------------
+Creates a generalized Deriv operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createDerivFilter(int srcType, int dstType, int dx, int dy, int ksize, bool normalize = false, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param dx: Derivative order with respect to x.
+
+    :param dy: Derivative order with respect to y.
+
+    :param ksize: Aperture size. See  :ocv:func:`getDerivKernels` for details.
+
+    :param normalize: Flag indicating whether to normalize (scale down) the filter coefficients or not. See  :ocv:func:`getDerivKernels` for details.
+
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+
+
+gpu::createSobelFilter
+----------------------
+Creates a Sobel operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param dx: Derivative order with respect to x.
+
+    :param dy: Derivative order with respect to y.
+
+    :param ksize: Size of the extended Sobel kernel. Possible values are 1, 3, 5 or 7.
+
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`Sobel`
+
+
+
+gpu::createScharrFilter
+-----------------------
+Creates a vertical or horizontal Scharr operator.
+
+.. ocv:function:: Ptr<Filter> gpu::createScharrFilter(int srcType, int dstType, int dx, int dy, double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param dx: Order of the derivative x.
+
+    :param dy: Order of the derivative y.
+
+    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. See  :ocv:func:`getDerivKernels`  for details.
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`Scharr`
+
+
+
+gpu::createGaussianFilter
+-------------------------
+Creates a Gaussian filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createGaussianFilter(int srcType, int dstType, Size ksize, double sigma1, double sigma2 = 0, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1)
+
+    :param srcType: Source image type.
+
+    :param dstType: Destination array type.
+
+    :param ksize: Aperture size. See  :ocv:func:`getGaussianKernel` for details.
+
+    :param sigma1: Gaussian sigma in the horizontal direction. See  :ocv:func:`getGaussianKernel` for details.
+
+    :param sigma2: Gaussian sigma in the vertical direction. If 0, then  :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .
+
+    :param rowBorderMode: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
+
+    :param columnBorderMode: Pixel extrapolation method in the horizontal direction.
+
+.. seealso:: :ocv:func:`GaussianBlur`
+
+
+
+gpu::createMorphologyFilter
+---------------------------
 Creates a 2D morphological filter.

-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1)
-
-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor=Point(-1,-1))
-
-    :param op: Morphology operation id. Only ``MORPH_ERODE`` and ``MORPH_DILATE`` are supported.
-
-    :param type: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4``  are supported.
-
-    :param kernel: 2D 8-bit structuring element for the morphological operation.
-
-    :param ksize: Size of a horizontal or vertical structuring element used for separable morphological operations.
-
-    :param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`createMorphologyFilter`
-
-
-
-gpu::erode
--------------
-Erodes an image by using a specific structuring element.
-
-.. ocv:function:: void gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-
-.. ocv:function:: void gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-
-    :param src: Source image. Only  ``CV_8UC1``  and  ``CV_8UC4``  types are supported.
-
-    :param dst: Destination image with the same size and type as  ``src`` .
-
-    :param kernel: Structuring element used for erosion. If  ``kernel=Mat()``, a  3x3 rectangular structuring element is used.
-
-    :param anchor: Position of an anchor within the element. The default value  ``(-1, -1)``  means that the anchor is at the element center.
-
-    :param iterations: Number of times erosion to be applied.
-
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`erode`
-
-
-
-gpu::dilate
---------------
-Dilates an image by using a specific structuring element.
-
-.. ocv:function:: void gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-
-.. ocv:function:: void gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-
-    :param src: Source image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
-
-    :param dst: Destination image with the same size and type as ``src``.
-
-    :param kernel: Structuring element used for dilation. If  ``kernel=Mat()``, a  3x3 rectangular structuring element is used.
-
-    :param anchor: Position of an anchor within the element. The default value  ``(-1, -1)``  means that the anchor is at the element center.
-
-    :param iterations: Number of times dilation to be applied.
-
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`dilate`
-
-
-
-gpu::morphologyEx
---------------------
-Applies an advanced morphological operation to an image.
-
-.. ocv:function::  void gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
-
-.. ocv:function:: void gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
-
-    :param dst: Destination image with the same size and type as  ``src`` .
+.. ocv:function:: Ptr<Filter> gpu::createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1)

    :param op: Type of morphological operation. The following types are possible:

+        * **MORPH_ERODE** erode
+
+        * **MORPH_DILATE** dilate
+
        * **MORPH_OPEN** opening

        * **MORPH_CLOSE** closing
@ -357,363 +247,88 @@ Applies an advanced morphological operation to an image.

        * **MORPH_BLACKHAT** "black hat"

-    :param kernel: Structuring element.
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4``  are supported.

-    :param anchor: Position of an anchor within the element. The default value ``Point(-1, -1)`` means that the anchor is at the element center.
+    :param kernel: 2D 8-bit structuring element for the morphological operation.
+
+    :param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.

    :param iterations: Number of times erosion and dilation to be applied.

-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
 .. seealso:: :ocv:func:`morphologyEx`



-gpu::createLinearFilter_GPU
-------------------------------
-Creates a non-separable linear filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT)
-
-    :param srcType: Input image type. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
-
-    :param dstType: Output image type. The same type as ``src`` is supported.
-
-    :param kernel: 2D array of filter coefficients. Floating-point coefficients will be converted to fixed-point representation before the actual processing. Supports size up to 16. For larger kernels use :ocv:func:`gpu::convolve`.
-
-    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-.. seealso:: :ocv:func:`createLinearFilter`
-
-
-
-gpu::filter2D
-----------------
-Applies the non-separable 2D linear filter to an image.
-
-.. ocv:function:: void gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
-
-    :param src: Source image. Supports  ``CV_8U``  ,  ``CV_16U``  and  ``CV_32F``  one and four channel image.
-
-    :param dst: Destination image. The size and the number of channels is the same as  ``src`` .
-
-    :param ddepth: Desired depth of the destination image. If it is negative, it is the same as  ``src.depth()`` . It supports only the same depth as the source image depth.
-
-    :param kernel: 2D array of filter coefficients.
-
-    :param anchor: Anchor of the kernel that indicates the relative position of a filtered point within the kernel. The anchor resides within the kernel. The special default value (-1,-1) means that the anchor is at the kernel center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`filter2D`, :ocv:func:`gpu::convolve`
-
-
-
-gpu::Laplacian
------------------
-Applies the Laplacian operator to an image.
-
-.. ocv:function:: void gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
-
-    :param src: Source image. ``CV_8UC1``  and  ``CV_8UC4``  source types are supported.
-
-    :param dst: Destination image. The size and number of channels is the same as  ``src`` .
-
-    :param ddepth: Desired depth of the destination image. It supports only the same depth as the source image depth.
-
-    :param ksize: Aperture size used to compute the second-derivative filters (see :ocv:func:`getDerivKernels`). It must be positive and odd. Only  ``ksize``  = 1 and  ``ksize``  = 3 are supported.
-
-    :param scale: Optional scale factor for the computed Laplacian values. By default, no scaling is applied (see  :ocv:func:`getDerivKernels` ).
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`Laplacian`, :ocv:func:`gpu::filter2D`
-
-
-
-gpu::getLinearRowFilter_GPU
-------------------------------
-Creates a primitive row filter with the specified kernel.
-
-.. ocv:function:: Ptr<BaseRowFilter_GPU> gpu::getLinearRowFilter_GPU( int srcType, int bufType, const Mat& rowKernel, int anchor=-1, int borderType=BORDER_DEFAULT )
-
-    :param srcType: Source array type. Only  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param bufType: Intermediate buffer type with as many channels as  ``srcType`` .
-
-    :param rowKernel: Filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. For details on limitations, see below.
-
-There are two versions of the algorithm: NPP and OpenCV.
-
-    * NPP version is called when ``srcType == CV_8UC1`` or ``srcType == CV_8UC4`` and ``bufType == srcType`` . Otherwise, the OpenCV version is called. NPP supports only ``BORDER_CONSTANT`` border type and does not check indices outside the image.
-
-    * OpenCV version supports only ``CV_32F`` buffer depth and ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , and ``BORDER_CONSTANT`` border types. It checks indices outside the image.
-
-.. seealso:: :ocv:func:`createSeparableLinearFilter` .
-
-
-
-gpu::getLinearColumnFilter_GPU
----------------------------------
-Creates a primitive column filter with the specified kernel.
-
-.. ocv:function:: Ptr<BaseColumnFilter_GPU> gpu::getLinearColumnFilter_GPU( int bufType, int dstType, const Mat& columnKernel, int anchor=-1, int borderType=BORDER_DEFAULT )
-
-    :param bufType: Intermediate buffer type with as many channels as  ``dstType`` .
-
-    :param dstType: Destination array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` destination types are supported.
-
-    :param columnKernel: Filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
-
-    :param borderType: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate` . For details on limitations, see below.
-
-There are two versions of the algorithm: NPP and OpenCV.
-
-    * NPP version is called when ``dstType == CV_8UC1`` or ``dstType == CV_8UC4`` and ``bufType == dstType`` . Otherwise, the OpenCV version is called. NPP supports only ``BORDER_CONSTANT`` border type and does not check indices outside the image.
-
-    * OpenCV version supports only ``CV_32F`` buffer depth and ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , and ``BORDER_CONSTANT`` border types. It checks indices outside image.
-
-.. seealso:: :ocv:func:`gpu::getLinearRowFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
-
-
-
-gpu::createSeparableLinearFilter_GPU
----------------------------------------
-Creates a separable linear filter engine.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1)
-
-    :param srcType: Source array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dstType: Destination array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  destination types are supported.
-
-    :param rowKernel: Horizontal filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param columnKernel: Vertical filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction For details, see  :ocv:func:`borderInterpolate`. For details on limitations, see :ocv:func:`gpu::getLinearRowFilter_GPU`, cpp:ocv:func:`gpu::getLinearColumnFilter_GPU`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-.. seealso:: :ocv:func:`gpu::getLinearRowFilter_GPU`, :ocv:func:`gpu::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
-
-
-
-gpu::sepFilter2D
--------------------
-Applies a separable 2D linear filter to an image.
-
-.. ocv:function:: void gpu::sepFilter2D( const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor=Point(-1,-1), int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::sepFilter2D( const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf, Point anchor=Point(-1,-1), int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and number of channels as  ``src`` .
-
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-
-    :param kernelX: Horizontal filter coefficients.
-
-    :param kernelY: Vertical filter coefficients.
-
-    :param anchor: Anchor position within the kernel. The default value ``(-1, 1)`` means that the anchor is at the kernel center.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`sepFilter2D`
-
-
-
-gpu::createDerivFilter_GPU
------------------------------
-Creates a filter engine for the generalized Sobel operator.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1)
-
-    :param srcType: Source image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dstType: Destination image type with as many channels as  ``srcType`` ,  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F``  depths are supported.
-
-    :param dx: Derivative order in respect of x.
-
-    :param dy: Derivative order in respect of y.
-
-    :param ksize: Aperture size. See  :ocv:func:`getDerivKernels` for details.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter`
-
-
-
-gpu::Sobel
--------------
-Applies the generalized Sobel operator to an image.
-
-.. ocv:function:: void gpu::Sobel( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize=3, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::Sobel( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize=3, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and number of channels as source image.
-
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-
-    :param dx: Derivative order in respect of x.
-
-    :param dy: Derivative order in respect of y.
-
-    :param ksize: Size of the extended Sobel kernel. Possible values are 1, 3, 5 or 7.
-
-    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see  :ocv:func:`getDerivKernels` .
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`Sobel`
-
-
-
-gpu::Scharr
---------------
-Calculates the first x- or y- image derivative using the Scharr operator.
-
-.. ocv:function:: void gpu::Scharr( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::Scharr( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and number of channels as  ``src`` has.
-
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-
-    :param dx: Order of the derivative x.
-
-    :param dy: Order of the derivative y.
-
-    :param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. See  :ocv:func:`getDerivKernels`  for details.
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`Scharr`
-
-
-
-gpu::createGaussianFilter_GPU
---------------------------------
-Creates a Gaussian filter engine.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createGaussianFilter_GPU( int type, Size ksize, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-    :param type: Source and destination image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported.
-
-    :param ksize: Aperture size. See  :ocv:func:`getGaussianKernel` for details.
-
-    :param sigma1: Gaussian sigma in the horizontal direction. See  :ocv:func:`getGaussianKernel` for details.
-
-    :param sigma2: Gaussian sigma in the vertical direction. If 0, then  :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter`
-
-
-
-gpu::GaussianBlur
---------------------
-Smooths an image using the Gaussian filter.
-
-.. ocv:function:: void gpu::GaussianBlur( const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
-
-.. ocv:function:: void gpu::GaussianBlur( const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and type as  ``src`` .
-
-    :param ksize: Gaussian kernel size.  ``ksize.width``  and  ``ksize.height``  can differ but they both must be positive and odd. If they are zeros, they are computed from  ``sigma1``  and  ``sigma2`` .
-
-    :param sigma1: Gaussian kernel standard deviation in X direction.
-
-    :param sigma2: Gaussian kernel standard deviation in Y direction. If  ``sigma2``  is zero, it is set to be equal to  ``sigma1`` . If they are both zeros, they are computed from  ``ksize.width``  and  ``ksize.height``, respectively. See  :ocv:func:`getGaussianKernel` for details. To fully control the result regardless of possible future modification of all this semantics, you are recommended to specify all of  ``ksize`` , ``sigma1`` , and  ``sigma2`` .
-
-    :param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see  :ocv:func:`borderInterpolate`.
-
-    :param columnBorderType: Pixel extrapolation method in the horizontal direction.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`gpu::createGaussianFilter_GPU`, :ocv:func:`GaussianBlur`
-
-
-
-gpu::getMaxFilter_GPU
-------------------------
+gpu::createBoxMaxFilter
+-----------------------
 Creates the maximum filter.

-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1))
+.. ocv:function:: Ptr<Filter> gpu::createBoxMaxFilter(int srcType, Size ksize, Point anchor = Point(-1, -1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))

-    :param srcType: Input image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
-
-    :param dstType: Output image type. It supports only the same type as the source type.
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.

    :param ksize: Kernel size.

    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.

-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.



-gpu::getMinFilter_GPU
-------------------------
+gpu::createBoxMinFilter
+-----------------------
 Creates the minimum filter.

-.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1))
+.. ocv:function:: Ptr<Filter> gpu::createBoxMinFilter(int srcType, Size ksize, Point anchor = Point(-1, -1), int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))

-    :param srcType: Input image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.
-
-    :param dstType: Output image type. It supports only the same type as the source type.
+    :param srcType: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4`` are supported.

    :param ksize: Kernel size.

    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.

-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
+
+
+
+gpu::createRowSumFilter
+-----------------------
+Creates a horizontal 1D box filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
+
+    :param dstType: Output image type. Only ``CV_32FC1`` type is supported for now.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
+
+
+
+gpu::createColumnSumFilter
+--------------------------
+Creates a vertical 1D box filter.
+
+.. ocv:function:: Ptr<Filter> gpu::createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0))
+
+    :param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
+
+    :param dstType: Output image type. Only ``CV_32FC1`` type is supported for now.
+
+    :param ksize: Kernel size.
+
+    :param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
+
+    :param borderMode: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
+
+    :param borderVal: Default border value.
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@ -48,221 +48,101 @@
 #endif

 #include "opencv2/core/gpu.hpp"
-#include "opencv2/core/base.hpp"
+#include "opencv2/imgproc.hpp"

 namespace cv { namespace gpu {

-/*!
-The Base Class for 1D or Row-wise Filters
-
-This is the base class for linear or non-linear filters that process 1D data.
-In particular, such filters are used for the "horizontal" filtering parts in separable filters.
-*/
-class CV_EXPORTS BaseRowFilter_GPU
+class CV_EXPORTS Filter : public Algorithm
 {
 public:
-    BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseRowFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    int ksize, anchor;
+    virtual void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
 };

-/*!
-The Base Class for Column-wise Filters
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Box Filter

-This is the base class for linear or non-linear filters that process columns of 2D arrays.
-Such filters are used for the "vertical" filtering parts in separable filters.
-*/
-class CV_EXPORTS BaseColumnFilter_GPU
-{
-public:
-    BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseColumnFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    int ksize, anchor;
-};
-
-/*!
-The Base Class for Non-Separable 2D Filters.
-
-This is the base class for linear or non-linear 2D filters.
-*/
-class CV_EXPORTS BaseFilter_GPU
-{
-public:
-    BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {}
-    virtual ~BaseFilter_GPU() {}
-    virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
-    Size ksize;
-    Point anchor;
-};
-
-/*!
-The Base Class for Filter Engine.
-
-The class can be used to apply an arbitrary filtering operation to an image.
-It contains all the necessary intermediate buffers.
-*/
-class CV_EXPORTS FilterEngine_GPU
-{
-public:
-    virtual ~FilterEngine_GPU() {}
-
-    virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
-};
-
-//! returns the non-separable filter engine with the specified filter
-CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType);
-
-//! returns the separable filter engine with the specified filters
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf);
-
-//! returns horizontal 1D box filter
-//! supports only CV_8UC1 source type and CV_32FC1 sum type
-CV_EXPORTS Ptr<BaseRowFilter_GPU> getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1);
-
-//! returns vertical 1D box filter
-//! supports only CV_8UC1 sum type and CV_32FC1 dst type
-CV_EXPORTS Ptr<BaseColumnFilter_GPU> getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1);
-
-//! returns 2D box filter
-//! supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type
-CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1));
-
-//! returns box filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size& ksize,
-    const Point& anchor = Point(-1,-1));
-
-//! returns 2D morphological filter
-//! only MORPH_ERODE and MORPH_DILATE are supported
-//! supports CV_8UC1 and CV_8UC4 types
-//! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
-CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,
-    Point anchor=Point(-1,-1));
-
-//! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
-CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel,
-    const Point& anchor = Point(-1,-1), int iterations = 1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf,
-    const Point& anchor = Point(-1,-1), int iterations = 1);
-
-//! returns 2D filter with the specified kernel
-//! supports CV_8U, CV_16U and CV_32F one and four channel image
-CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
-
-//! returns the non-separable linear filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
-    Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT);
-
-//! returns the primitive row filter with the specified kernel.
-//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
-//! there are two version of algorithm: NPP and OpenCV.
-//! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,
-//! otherwise calls OpenCV version.
-//! NPP supports only BORDER_CONSTANT border type.
-//! OpenCV version supports only CV_32F as buffer depth and
-//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
-    int anchor = -1, int borderType = BORDER_DEFAULT);
-
-//! returns the primitive column filter with the specified kernel.
-//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.
-//! there are two version of algorithm: NPP and OpenCV.
-//! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,
-//! otherwise calls OpenCV version.
-//! NPP supports only BORDER_CONSTANT border type.
-//! OpenCV version supports only CV_32F as buffer depth and
-//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
-    int anchor = -1, int borderType = BORDER_DEFAULT);
-
-//! returns the separable linear filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-    const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
-    int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-    const Mat& columnKernel, GpuMat& buf, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
-    int columnBorderType = -1);
-
-//! returns filter engine for the generalized Sobel operator
-CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,
-                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf,
-                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-
-//! returns the Gaussian filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,
-                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
-                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-
-//! returns maximum filter
-CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
-
-//! returns minimum filter
-CV_EXPORTS Ptr<BaseFilter_GPU> getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
-
-//! smooths the image using the normalized box filter
+//! creates a normalized 2D box filter
 //! supports CV_8UC1, CV_8UC4 types
-CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null());
+CV_EXPORTS Ptr<Filter> createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1,-1),
+                                       int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));

-//! a synonym for normalized box filter
-static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
-{
-    boxFilter(src, dst, -1, ksize, anchor, stream);
-}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Linear Filter

-//! erodes the image (applies the local minimum operator)
-CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf,
-                      Point anchor = Point(-1, -1), int iterations = 1,
-                      Stream& stream = Stream::Null());
+//! Creates a non-separable linear 2D filter
+//! supports 1 and 4 channel CV_8U, CV_16U and CV_32F input
+CV_EXPORTS Ptr<Filter> createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1),
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));

-//! dilates the image (applies the local maximum operator)
-CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf,
-                       Point anchor = Point(-1, -1), int iterations = 1,
-                       Stream& stream = Stream::Null());
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Laplacian Filter

-//! applies an advanced morphological operation to the image
-CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
-CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2,
-                             Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
-
-//! applies non-separable 2D linear filter to the image
-CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
-
-//! applies separable 2D linear filter to the image
-CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
-                            Stream& stream = Stream::Null());
-
-//! applies generalized Sobel operator to the image
-CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
-                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize = 3, double scale = 1,
-                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-
-//! applies the vertical or horizontal Scharr operator to the image
-CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
-                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale = 1,
-                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-
-//! smooths the image using Gaussian filter.
-CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
-                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
-                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-
-//! applies Laplacian operator to the image
+//! creates a Laplacian operator
 //! supports only ksize = 1 and ksize = 3
-CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
+CV_EXPORTS Ptr<Filter> createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1,
+                                             int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Separable Linear Filter
+
+//! creates a separable linear filter
+CV_EXPORTS Ptr<Filter> createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel,
+                                                   Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Deriv Filter
+
+//! creates a generalized Deriv operator
+CV_EXPORTS Ptr<Filter> createDerivFilter(int srcType, int dstType, int dx, int dy,
+                                         int ksize, bool normalize = false, double scale = 1,
+                                         int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+//! creates a Sobel operator
+CV_EXPORTS Ptr<Filter> createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3,
+                                         double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+//! creates a vertical or horizontal Scharr operator
+CV_EXPORTS Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, int dy,
+                                          double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Gaussian Filter
+
+//! creates a Gaussian filter
+CV_EXPORTS Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize,
+                                            double sigma1, double sigma2 = 0,
+                                            int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Morphology Filter
+
+//! creates a 2D morphological filter
+//! supports CV_8UC1 and CV_8UC4 types
+CV_EXPORTS Ptr<Filter> createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Image Rank Filter
+
+//! result pixel value is the maximum of pixel values under the rectangular mask region
+CV_EXPORTS Ptr<Filter> createBoxMaxFilter(int srcType, Size ksize,
+                                          Point anchor = Point(-1, -1),
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+//! result pixel value is the minimum of pixel values under the rectangular mask region
+CV_EXPORTS Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
+                                          Point anchor = Point(-1, -1),
+                                          int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// 1D Sum Filter
+
+//! creates a horizontal 1D box filter
+//! supports only CV_8UC1 source type and CV_32FC1 sum type
+CV_EXPORTS Ptr<Filter> createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+//! creates a vertical 1D box filter
+//! supports only CV_8UC1 source type and CV_32FC1 dst type
+CV_EXPORTS Ptr<Filter> createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));

 }} // namespace cv { namespace gpu {

--- a/modules/gpufilters/perf/perf_filters.cpp
+++ b/modules/gpufilters/perf/perf_filters.cpp
@ -70,7 +70,9 @@ PERF_TEST_P(Sz_Type_KernelSz, Blur,
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;

-        TEST_CYCLE() cv::gpu::blur(d_src, dst, cv::Size(ksize, ksize));
+        cv::Ptr<cv::gpu::Filter> blurFilter = cv::gpu::createBoxFilter(d_src.type(), -1, cv::Size(ksize, ksize));
+
+        TEST_CYCLE() blurFilter->apply(d_src, dst);

        GPU_SANITY_CHECK(dst, 1);
    }
@ -84,6 +86,79 @@ PERF_TEST_P(Sz_Type_KernelSz, Blur,
    }
 }

+//////////////////////////////////////////////////////////////////////
+// Filter2D
+
+PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
+{
+    declare.time(20.0);
+
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);
+
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+
+    cv::Mat kernel(ksize, ksize, CV_32FC1);
+    declare.in(kernel, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        cv::Ptr<cv::gpu::Filter> filter2D = cv::gpu::createLinearFilter(d_src.type(), -1, kernel);
+
+        TEST_CYCLE() filter2D->apply(d_src, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// Laplacian
+
+PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
+{
+    declare.time(20.0);
+
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);
+
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        cv::Ptr<cv::gpu::Filter> laplacian = cv::gpu::createLaplacianFilter(d_src.type(), -1, ksize);
+
+        TEST_CYCLE() laplacian->apply(d_src, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
 //////////////////////////////////////////////////////////////////////
 // Sobel

@ -102,9 +177,10 @@ PERF_TEST_P(Sz_Type_KernelSz, Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;

-        TEST_CYCLE() cv::gpu::Sobel(d_src, dst, -1, 1, 1, d_buf, ksize);
+        cv::Ptr<cv::gpu::Filter> sobel = cv::gpu::createSobelFilter(d_src.type(), -1, 1, 1, ksize);
+
+        TEST_CYCLE() sobel->apply(d_src, dst);

        GPU_SANITY_CHECK(dst);
    }
@ -135,9 +211,10 @@ PERF_TEST_P(Sz_Type, Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;

-        TEST_CYCLE() cv::gpu::Scharr(d_src, dst, -1, 1, 0, d_buf);
+        cv::Ptr<cv::gpu::Filter> scharr = cv::gpu::createScharrFilter(d_src.type(), -1, 1, 0);
+
+        TEST_CYCLE() scharr->apply(d_src, dst);

        GPU_SANITY_CHECK(dst);
    }
@ -169,9 +246,10 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;

-        TEST_CYCLE() cv::gpu::GaussianBlur(d_src, dst, cv::Size(ksize, ksize), d_buf, 0.5);
+        cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(d_src.type(), -1, cv::Size(ksize, ksize), 0.5);
+
+        TEST_CYCLE() gauss->apply(d_src, dst);

        GPU_SANITY_CHECK(dst);
    }
@ -185,39 +263,6 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
    }
 }

-//////////////////////////////////////////////////////////////////////
-// Laplacian
-
-PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
-{
-    declare.time(20.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-    const int ksize = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::Laplacian(d_src, dst, -1, ksize);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
-
 //////////////////////////////////////////////////////////////////////
 // Erode

@ -237,9 +282,10 @@ PERF_TEST_P(Sz_Type, Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8U
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;

-        TEST_CYCLE() cv::gpu::erode(d_src, dst, ker, d_buf);
+        cv::Ptr<cv::gpu::Filter> erode = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, src.type(), ker);
+
+        TEST_CYCLE() erode->apply(d_src, dst);

        GPU_SANITY_CHECK(dst);
    }
@ -272,9 +318,10 @@ PERF_TEST_P(Sz_Type, Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;

-        TEST_CYCLE() cv::gpu::dilate(d_src, dst, ker, d_buf);
+        cv::Ptr<cv::gpu::Filter> dilate = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, src.type(), ker);
+
+        TEST_CYCLE() dilate->apply(d_src, dst);

        GPU_SANITY_CHECK(dst);
    }
@ -312,10 +359,10 @@ PERF_TEST_P(Sz_Type_Op, MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8
    {
        const cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf1;
-        cv::gpu::GpuMat d_buf2;

-        TEST_CYCLE() cv::gpu::morphologyEx(d_src, dst, morphOp, ker, d_buf1, d_buf2);
+        cv::Ptr<cv::gpu::Filter> morph = cv::gpu::createMorphologyFilter(morphOp, src.type(), ker);
+
+        TEST_CYCLE() morph->apply(d_src, dst);

        GPU_SANITY_CHECK(dst);
    }
@ -328,39 +375,3 @@ PERF_TEST_P(Sz_Type_Op, MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8
        CPU_SANITY_CHECK(dst);
    }
 }
-
-//////////////////////////////////////////////////////////////////////
-// Filter2D
-
-PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
-{
-    declare.time(20.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-    const int ksize = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    cv::Mat kernel(ksize, ksize, CV_32FC1);
-    declare.in(kernel, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::filter2D(d_src, dst, -1, kernel);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
--- a/modules/gpufilters/src/cuda/filter2d.cu
+++ b/modules/gpufilters/src/cuda/filter2d.cu
@ -48,111 +48,104 @@

 namespace cv { namespace gpu { namespace cudev
 {
-    namespace imgproc
+    template <class SrcPtr, typename D>
+    __global__ void filter2D(const SrcPtr src, PtrStepSz<D> dst, // one thread computes one dst pixel
+                             const float* __restrict__ kernel, // kWidth * kHeight coefficients, read in row order
+                             const int kWidth, const int kHeight,
+                             const int anchorX, const int anchorY) // kernel tap that is aligned with the output pixel
    {
-        #define FILTER2D_MAX_KERNEL_SIZE 16
+        typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t; // float accumulator with D's channel count

-        __constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];
+        const int x = blockIdx.x * blockDim.x + threadIdx.x; // destination column for this thread
+        const int y = blockIdx.y * blockDim.y + threadIdx.y; // destination row for this thread

-        template <class SrcT, typename D>
-        __global__ void filter2D(const SrcT src, PtrStepSz<D> dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
+        if (x >= dst.cols || y >= dst.rows) // thread lies outside dst: nothing to write
+            return;
+
+        sum_t res = VecTraits<sum_t>::all(0);
+        int kInd = 0; // linear index into kernel, advanced once per tap
+
+        for (int i = 0; i < kHeight; ++i)
        {
-            typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= dst.cols || y >= dst.rows)
-                return;
-
-            sum_t res = VecTraits<sum_t>::all(0);
-            int kInd = 0;
-
-            for (int i = 0; i < kHeight; ++i)
-            {
-                for (int j = 0; j < kWidth; ++j)
-                    res = res + src(y - anchorY + i, x - anchorX + j) * c_filter2DKernel[kInd++];
-            }
-
-            dst(y, x) = saturate_cast<D>(res);
+            for (int j = 0; j < kWidth; ++j)
+                res = res + src(y - anchorY + i, x - anchorX + j) * kernel[kInd++]; // tap (i, j), addressed relative to the anchor
        }

-        template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
-
-        #define IMPLEMENT_FILTER2D_TEX_READER(type) \
-            texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_filter2D_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                const int xoff; \
-                const int yoff; \
-                tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
-                } \
-            }; \
-            template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
-            { \
-                static void call(const PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz<D> dst, \
-                    int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
-                { \
-                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-                    dim3 block(16, 16); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_filter2D_ ## type , srcWhole); \
-                    tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
-                    Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
-                    BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
-                    filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kWidth, kHeight, anchorX, anchorY); \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    if (stream == 0) \
-                        cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        IMPLEMENT_FILTER2D_TEX_READER(uchar);
-        IMPLEMENT_FILTER2D_TEX_READER(uchar4);
-
-        IMPLEMENT_FILTER2D_TEX_READER(ushort);
-        IMPLEMENT_FILTER2D_TEX_READER(ushort4);
-
-        IMPLEMENT_FILTER2D_TEX_READER(float);
-        IMPLEMENT_FILTER2D_TEX_READER(float4);
-
-        #undef IMPLEMENT_FILTER2D_TEX_READER
-
-        template <typename T, typename D>
-        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
-                          int borderMode, const float* borderValue, cudaStream_t stream)
-        {
-            typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
-            static const func_t funcs[] =
-            {
-                Filter2DCaller<T, D, BrdConstant>::call,
-                Filter2DCaller<T, D, BrdReplicate>::call,
-                Filter2DCaller<T, D, BrdReflect>::call,
-                Filter2DCaller<T, D, BrdWrap>::call,
-                Filter2DCaller<T, D, BrdReflect101>::call
-            };
-
-            if (stream == 0)
-                cudaSafeCall( cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-            else
-                cudaSafeCall( cudaMemcpyToSymbolAsync(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-            funcs[borderMode](static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
-        }
-
-        template void filter2D_gpu<uchar, uchar>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<uchar4, uchar4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort, ushort>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float, float>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float4, float4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        dst(y, x) = saturate_cast<D>(res); // float sum converted to the destination element type
    }
+
+    template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller; // declared only; the macro below specializes it per source type
+
+    #define IMPLEMENT_FILTER2D_TEX_READER(type) \
+        texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+        struct tex_filter2D_ ## type ## _reader \
+        { \
+            typedef type elem_type; \
+            typedef int index_type; \
+            const int xoff; \
+            const int yoff; \
+            tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+            __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+            { \
+                return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
+            } \
+        }; \
+        template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
+        { \
+            static void call(const PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz<D> dst, const float* kernel, \
+                int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
+            { \
+                typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                dim3 block(16, 16); \
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                bindTexture(&tex_filter2D_ ## type , srcWhole); \
+                tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
+                Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
+                BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
+                filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kernel, kWidth, kHeight, anchorX, anchorY); \
+                cudaSafeCall( cudaGetLastError() ); \
+                if (stream == 0) \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+            } \
+        };
+
+    IMPLEMENT_FILTER2D_TEX_READER(uchar); // each use emits a texture reference, its reader struct and a Filter2DCaller specialization
+    IMPLEMENT_FILTER2D_TEX_READER(uchar4);
+
+    IMPLEMENT_FILTER2D_TEX_READER(ushort);
+    IMPLEMENT_FILTER2D_TEX_READER(ushort4);
+
+    IMPLEMENT_FILTER2D_TEX_READER(float);
+    IMPLEMENT_FILTER2D_TEX_READER(float4);
+
+    #undef IMPLEMENT_FILTER2D_TEX_READER // the macro is not needed past the six uses above
+
+    template <typename T, typename D> // host-side entry point: selects a Filter2DCaller by borderMode and forwards every argument
+    void filter2D(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                  int kWidth, int kHeight, int anchorX, int anchorY,
+                  int borderMode, const float* borderValue, cudaStream_t stream)
+    {
+        typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, const float* kernel,
+                               int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
+        static const func_t funcs[] = // indexed by borderMode: 0 constant, 1 replicate, 2 reflect, 3 wrap, 4 reflect101
+        {
+            Filter2DCaller<T, D, BrdConstant>::call,
+            Filter2DCaller<T, D, BrdReplicate>::call,
+            Filter2DCaller<T, D, BrdReflect>::call,
+            Filter2DCaller<T, D, BrdWrap>::call,
+            Filter2DCaller<T, D, BrdReflect101>::call
+        };
+
+        funcs[borderMode]((PtrStepSz<T>) srcWhole, ofsX, ofsY, (PtrStepSz<D>) dst, kernel,
+                          kWidth, kHeight, anchorX, anchorY, borderValue, stream); // NOTE(review): borderMode indexes funcs with no range check here; presumably validated by the caller
+    }
+
+    template void filter2D<uchar  , uchar  >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream); // explicit instantiations: T and D are the same type in all six
+    template void filter2D<uchar4 , uchar4 >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<ushort , ushort >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<float  , float  >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<float4 , float4 >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
 }}}

 #endif // CUDA_DISABLER
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
--- a/modules/gpufilters/src/precomp.hpp
+++ b/modules/gpufilters/src/precomp.hpp
@ -46,14 +46,9 @@
 #include <limits>

 #include "opencv2/gpufilters.hpp"
+#include "opencv2/gpuarithm.hpp"
 #include "opencv2/imgproc.hpp"

 #include "opencv2/core/private.gpu.hpp"

-#include "opencv2/opencv_modules.hpp"
-
-#ifdef HAVE_OPENCV_GPUARITHM
-#  include "opencv2/gpuarithm.hpp"
-#endif
-
 #endif /* __OPENCV_PRECOMP_H__ */
--- a/modules/gpufilters/test/test_filters.cpp
+++ b/modules/gpufilters/test/test_filters.cpp
@ -70,13 +70,14 @@ namespace
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Blur

-PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, UseRoi)
+PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
 {
    cv::gpu::DeviceInfo devInfo;
    cv::Size size;
    int type;
    cv::Size ksize;
    cv::Point anchor;
+    int borderType;
    bool useRoi;

    virtual void SetUp()
@ -86,7 +87,8 @@ PARAM_TEST_CASE(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, Use
        type = GET_PARAM(2);
        ksize = GET_PARAM(3);
        anchor = GET_PARAM(4);
-        useRoi = GET_PARAM(5);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);

        cv::gpu::setDevice(devInfo.deviceID());
    }
@ -96,13 +98,15 @@ GPU_TEST_P(Blur, Accuracy)
 {
    cv::Mat src = randomMat(size, type);

+    cv::Ptr<cv::gpu::Filter> blurFilter = cv::gpu::createBoxFilter(src.type(), -1, ksize, anchor, borderType);
+
    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::blur(loadMat(src, useRoi), dst, ksize, anchor);
+    blurFilter->apply(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
-    cv::blur(src, dst_gold, ksize, anchor);
+    cv::blur(src, dst_gold, ksize, anchor, borderType);

-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), 1.0);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1.0);
 }

 INSTANTIATE_TEST_CASE_P(GPU_Filters, Blur, testing::Combine(
@ -111,6 +115,173 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, Blur, testing::Combine(
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7))),
    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Filter2D: output of a cv::gpu::createLinearFilter() filter is compared with cv::filter2D
+
+PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    cv::Size ksize;
+    cv::Point anchor;
+    int borderType;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0); // parameter order matches the PARAM_TEST_CASE type list above
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        ksize = GET_PARAM(3);
+        anchor = GET_PARAM(4);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Filter2D, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0); // float kernel sized from the KSize parameter
+
+    cv::Ptr<cv::gpu::Filter> filter2D = cv::gpu::createLinearFilter(src.type(), -1, kernel, anchor, borderType);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    filter2D->apply(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType); // CPU reference with the same kernel, anchor and border type
+
+    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0); // tolerance argument: 1e-1 for CV_32F depth, 1.0 otherwise
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, Filter2D, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
+    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))), // square kernels only; 9x9 is not in the list
+    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Laplacian: output of a cv::gpu::createLaplacianFilter() filter is compared with cv::Laplacian
+
+PARAM_TEST_CASE(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    cv::Size ksize;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0); // parameter order matches the PARAM_TEST_CASE type list above
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        ksize = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Laplacian, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+
+    cv::Ptr<cv::gpu::Filter> laplacian = cv::gpu::createLaplacianFilter(src.type(), -1, ksize.width); // only ksize.width is passed; the instantiated KSize values are square
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    laplacian->apply(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::Laplacian(src, dst_gold, -1, ksize.width); // CPU reference with the same aperture
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 1e-3); // tolerance argument: 0.0 for depths below CV_32F, 1e-3 otherwise
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
+    testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))),
+    WHOLE_SUBMAT));
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// SeparableLinearFilter: output of a cv::gpu::createSeparableLinearFilter() filter is compared with cv::sepFilter2D
+
+PARAM_TEST_CASE(SeparableLinearFilter, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels, KSize, Anchor, BorderType, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int depth;
+    int cn;
+    cv::Size ksize;
+    cv::Point anchor;
+    int borderType;
+    bool useRoi;
+
+    int type; // not a test parameter: derived from depth and cn in SetUp()
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0); // parameter order matches the PARAM_TEST_CASE type list above
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        cn = GET_PARAM(3);
+        ksize = GET_PARAM(4);
+        anchor = GET_PARAM(5);
+        borderType = GET_PARAM(6);
+        useRoi = GET_PARAM(7);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        type = CV_MAKE_TYPE(depth, cn);
+    }
+};
+
+GPU_TEST_P(SeparableLinearFilter, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat rowKernel = randomMat(Size(ksize.width, 1), CV_32FC1, 0.0, 1.0); // one row of ksize.width float coefficients
+    cv::Mat columnKernel = randomMat(Size(ksize.height, 1), CV_32FC1, 0.0, 1.0); // also laid out as one row, ksize.height coefficients long
+
+    cv::Ptr<cv::gpu::Filter> filter = cv::gpu::createSeparableLinearFilter(src.type(), -1, rowKernel, columnKernel, anchor, borderType);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    filter->apply(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::sepFilter2D(src, dst_gold, -1, rowKernel, columnKernel, anchor, 0, borderType); // CPU reference with the same kernels, anchor and border type
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 1.0 : 1e-2); // tolerance argument: 1.0 for depths below CV_32F, 1e-2 otherwise
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, SeparableLinearFilter, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32F)),
+    IMAGE_CHANNELS,
+    testing::Values(KSize(cv::Size(3, 3)),
+                    KSize(cv::Size(7, 7)),
+                    KSize(cv::Size(13, 13)),
+                    KSize(cv::Size(15, 15)),
+                    KSize(cv::Size(17, 17)),
+                    KSize(cv::Size(23, 15)), // non-square: row and column kernels get different lengths
+                    KSize(cv::Size(31, 3))),
+    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101),
+                    BorderType(cv::BORDER_REPLICATE),
+                    BorderType(cv::BORDER_CONSTANT),
+                    BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));

 /////////////////////////////////////////////////////////////////////////////////////////////////
@ -155,13 +326,15 @@ GPU_TEST_P(Sobel, Accuracy)

    cv::Mat src = randomMat(size, type);

+    cv::Ptr<cv::gpu::Filter> sobel = cv::gpu::createSobelFilter(src.type(), -1, dx, dy, ksize.width, 1.0, borderType);
+
    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Sobel(loadMat(src, useRoi), dst, -1, dx, dy, ksize.width, 1.0, borderType);
+    sobel->apply(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
    cv::Sobel(src, dst_gold, -1, dx, dy, ksize.width, 1.0, 0.0, borderType);

-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 0.1);
 }

 INSTANTIATE_TEST_CASE_P(GPU_Filters, Sobel, testing::Combine(
@ -218,13 +391,15 @@ GPU_TEST_P(Scharr, Accuracy)

    cv::Mat src = randomMat(size, type);

+    cv::Ptr<cv::gpu::Filter> scharr = cv::gpu::createScharrFilter(src.type(), -1, dx, dy, 1.0, borderType);
+
    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Scharr(loadMat(src, useRoi), dst, -1, dx, dy, 1.0, borderType);
+    scharr->apply(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
    cv::Scharr(src, dst_gold, -1, dx, dy, 1.0, 0.0, borderType);

-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, cv::Size(3, 3)), getInnerROI(dst, cv::Size(3, 3)), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 0.1);
 }

 INSTANTIATE_TEST_CASE_P(GPU_Filters, Scharr, testing::Combine(
@ -277,28 +452,15 @@ GPU_TEST_P(GaussianBlur, Accuracy)
    double sigma1 = randomDouble(0.1, 1.0);
    double sigma2 = randomDouble(0.1, 1.0);

-    if (ksize.height > 16 && !supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20))
-    {
-        try
-        {
-            cv::gpu::GpuMat dst;
-            cv::gpu::GaussianBlur(loadMat(src), dst, ksize, sigma1, sigma2, borderType);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
-        }
-    }
-    else
-    {
-        cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-        cv::gpu::GaussianBlur(loadMat(src, useRoi), dst, ksize, sigma1, sigma2, borderType);
+    cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(src.type(), -1, ksize, sigma1, sigma2, borderType);

-        cv::Mat dst_gold;
-        cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    gauss->apply(loadMat(src, useRoi), dst);

-        EXPECT_MAT_NEAR(dst_gold, dst, 4.0);
-    }
+    cv::Mat dst_gold;
+    cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 4.0 : 1e-4);
 }

 INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
@ -327,49 +489,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
                    BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));

-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Laplacian
-
-PARAM_TEST_CASE(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    cv::Size ksize;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        useRoi = GET_PARAM(4);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Laplacian, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Laplacian(loadMat(src, useRoi), dst, -1, ksize.width);
-
-    cv::Mat dst_gold;
-    cv::Laplacian(src, dst_gold, -1, ksize.width);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 1e-3);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))),
-    WHOLE_SUBMAT));
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Erode

@ -400,8 +519,10 @@ GPU_TEST_P(Erode, Accuracy)
    cv::Mat src = randomMat(size, type);
    cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);

+    cv::Ptr<cv::gpu::Filter> erode = cv::gpu::createMorphologyFilter(cv::MORPH_ERODE, src.type(), kernel, anchor, iterations);
+
    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::erode(loadMat(src, useRoi), dst, kernel, anchor, iterations);
+    erode->apply(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
    cv::erode(src, dst_gold, kernel, anchor, iterations);
@ -449,8 +570,10 @@ GPU_TEST_P(Dilate, Accuracy)
    cv::Mat src = randomMat(size, type);
    cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);

+    cv::Ptr<cv::gpu::Filter> dilate = cv::gpu::createMorphologyFilter(cv::MORPH_DILATE, src.type(), kernel, anchor, iterations);
+
    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::dilate(loadMat(src, useRoi), dst, kernel, anchor, iterations);
+    dilate->apply(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
    cv::dilate(src, dst_gold, kernel, anchor, iterations);
@ -502,8 +625,10 @@ GPU_TEST_P(MorphEx, Accuracy)
    cv::Mat src = randomMat(size, type);
    cv::Mat kernel = cv::Mat::ones(3, 3, CV_8U);

+    cv::Ptr<cv::gpu::Filter> morph = cv::gpu::createMorphologyFilter(morphOp, src.type(), kernel, anchor, iterations);
+
    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::morphologyEx(loadMat(src, useRoi), dst, morphOp, kernel, anchor, iterations);
+    morph->apply(loadMat(src, useRoi), dst);

    cv::Mat dst_gold;
    cv::morphologyEx(src, dst_gold, morphOp, kernel, anchor, iterations);
@ -522,56 +647,4 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, MorphEx, testing::Combine(
    testing::Values(Iterations(1), Iterations(2), Iterations(3)),
    WHOLE_SUBMAT));

-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Filter2D
-
-PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    cv::Size ksize;
-    cv::Point anchor;
-    int borderType;
-    bool useRoi;
-
-    cv::Mat img;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        anchor = GET_PARAM(4);
-        borderType = GET_PARAM(5);
-        useRoi = GET_PARAM(6);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Filter2D, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
-
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::filter2D(loadMat(src, useRoi), dst, -1, kernel, anchor, borderType);
-
-    cv::Mat dst_gold;
-    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Filters, Filter2D, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
-    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
-    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
-    WHOLE_SUBMAT));
-
 #endif // HAVE_CUDA
--- a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
+++ b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
@ -158,7 +158,7 @@ struct CV_EXPORTS CannyBuf
    GpuMat mag;
    GpuMat map;
    GpuMat st1, st2;
-    Ptr<FilterEngine_GPU> filterDX, filterDY;
+    Ptr<Filter> filterDX, filterDY;
 };

 CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
--- a/modules/gpuimgproc/src/canny.cpp
+++ b/modules/gpuimgproc/src/canny.cpp
@ -65,8 +65,8 @@ void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)

        if (apperture_size != 3)
        {
-            filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
-            filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
+            filterDX = createDerivFilter(CV_8UC1, CV_32S, 1, 0, apperture_size, false, 1, BORDER_REPLICATE);
+            filterDY = createDerivFilter(CV_8UC1, CV_32S, 0, 1, apperture_size, false, 1, BORDER_REPLICATE);
        }
    }

@ -150,8 +150,8 @@ void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_th
    }
    else
    {
-        buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
-        buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
+        buf.filterDX->apply(src, buf.dx);
+        buf.filterDY->apply(src, buf.dy);

        calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
    }
--- a/modules/gpuimgproc/src/corners.cpp
+++ b/modules/gpuimgproc/src/corners.cpp
@ -70,6 +70,8 @@ namespace
 {
    void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
    {
+        (void) buf;
+
        double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;

        if (ksize < 0)
@ -83,16 +85,21 @@ namespace
        Dx.create(src.size(), CV_32F);
        Dy.create(src.size(), CV_32F);

+        Ptr<gpu::Filter> filterDx, filterDy;
+
        if (ksize > 0)
        {
-            Sobel(src, Dx, CV_32F, 1, 0, buf, ksize, scale, borderType, -1, stream);
-            Sobel(src, Dy, CV_32F, 0, 1, buf, ksize, scale, borderType, -1, stream);
+            filterDx = gpu::createSobelFilter(src.type(), CV_32F, 1, 0, ksize, scale, borderType);
+            filterDy = gpu::createSobelFilter(src.type(), CV_32F, 0, 1, ksize, scale, borderType);
        }
        else
        {
-            Scharr(src, Dx, CV_32F, 1, 0, buf, scale, borderType, -1, stream);
-            Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream);
+            filterDx = gpu::createScharrFilter(src.type(), CV_32F, 1, 0, scale, borderType);
+            filterDy = gpu::createScharrFilter(src.type(), CV_32F, 0, 1, scale, borderType);
        }
+
+        filterDx->apply(src, Dx);
+        filterDy->apply(src, Dy);
    }
 }

--- a/modules/gpuimgproc/src/hough.cpp
+++ b/modules/gpuimgproc/src/hough.cpp
@ -761,7 +761,7 @@ namespace
        {
            buildRTable_gpu(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
                            r_table, r_sizes.ptr<int>(), make_short2(templCenter.x, templCenter.y), levels);
-            min(r_sizes, maxSize, r_sizes);
+            gpu::min(r_sizes, maxSize, r_sizes);
        }
    }

--- a/modules/gpuimgproc/src/match_template.cpp
+++ b/modules/gpuimgproc/src/match_template.cpp
@ -172,15 +172,16 @@ namespace
            return;
        }

-        gpu::ConvolveBuf convolve_buf;
-        convolve_buf.user_block_size = buf.user_block_size;
+        Ptr<gpu::Convolution> conv = gpu::createConvolution(buf.user_block_size);

        if (image.channels() == 1)
-            gpu::convolve(image.reshape(1), templ.reshape(1), result, true, convolve_buf, stream);
+        {
+            conv->convolve(image.reshape(1), templ.reshape(1), result, true, stream);
+        }
        else
        {
            GpuMat result_;
-            gpu::convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf, stream);
+            conv->convolve(image.reshape(1), templ.reshape(1), result_, true, stream);
            extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
        }
    }
@ -268,7 +269,7 @@ namespace
            buf.image_sums.resize(1);
            gpu::integral(image, buf.image_sums[0], stream);

-            unsigned int templ_sum = (unsigned int)sum(templ)[0];
+            unsigned int templ_sum = (unsigned int)gpu::sum(templ)[0];
            matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, buf.image_sums[0], templ_sum, result, StreamAccessor::getStream(stream));
        }
        else
--- a/modules/gpulegacy/include/opencv2/gpulegacy/NCV.hpp
+++ b/modules/gpulegacy/include/opencv2/gpulegacy/NCV.hpp
@ -126,7 +126,7 @@ typedef                int Ncv32s;
 typedef       unsigned int Ncv32u;
 typedef              short Ncv16s;
 typedef     unsigned short Ncv16u;
-typedef               char Ncv8s;
+typedef        signed char Ncv8s;
 typedef      unsigned char Ncv8u;
 typedef              float Ncv32f;
 typedef             double Ncv64f;
--- a/modules/gpulegacy/src/cuda/NCVPixelOperations.hpp
+++ b/modules/gpulegacy/src/cuda/NCVPixelOperations.hpp
@ -51,7 +51,7 @@ template<typename TBase> inline __host__ __device__ TBase _pixMaxVal();
 template<> static inline __host__ __device__ Ncv8u  _pixMaxVal<Ncv8u>()  {return UCHAR_MAX;}
 template<> static inline __host__ __device__ Ncv16u _pixMaxVal<Ncv16u>() {return USHRT_MAX;}
 template<> static inline __host__ __device__ Ncv32u _pixMaxVal<Ncv32u>() {return  UINT_MAX;}
-template<> static inline __host__ __device__ Ncv8s  _pixMaxVal<Ncv8s>()  {return  CHAR_MAX;}
+template<> static inline __host__ __device__ Ncv8s  _pixMaxVal<Ncv8s>()  {return  SCHAR_MAX;}
 template<> static inline __host__ __device__ Ncv16s _pixMaxVal<Ncv16s>() {return  SHRT_MAX;}
 template<> static inline __host__ __device__ Ncv32s _pixMaxVal<Ncv32s>() {return   INT_MAX;}
 template<> static inline __host__ __device__ Ncv32f _pixMaxVal<Ncv32f>() {return   FLT_MAX;}
@ -61,7 +61,7 @@ template<typename TBase> inline __host__ __device__ TBase _pixMinVal();
 template<> static inline __host__ __device__ Ncv8u  _pixMinVal<Ncv8u>()  {return 0;}
 template<> static inline __host__ __device__ Ncv16u _pixMinVal<Ncv16u>() {return 0;}
 template<> static inline __host__ __device__ Ncv32u _pixMinVal<Ncv32u>() {return 0;}
-template<> static inline __host__ __device__ Ncv8s  _pixMinVal<Ncv8s>()  {return CHAR_MIN;}
+template<> static inline __host__ __device__ Ncv8s  _pixMinVal<Ncv8s>()  {return SCHAR_MIN;}
 template<> static inline __host__ __device__ Ncv16s _pixMinVal<Ncv16s>() {return SHRT_MIN;}
 template<> static inline __host__ __device__ Ncv32s _pixMinVal<Ncv32s>() {return INT_MIN;}
 template<> static inline __host__ __device__ Ncv32f _pixMinVal<Ncv32f>() {return FLT_MIN;}
--- a/modules/gpuoptflow/test/test_optflow.cpp
+++ b/modules/gpuoptflow/test/test_optflow.cpp
@ -80,7 +80,7 @@ GPU_TEST_P(BroxOpticalFlow, Regression)
    brox(loadMat(frame0), loadMat(frame1), u, v);

    std::string fname(cvtest::TS::ptr()->get_data_path());
-    if (devInfo.major() >= 2)
+    if (devInfo.majorVersion() >= 2)
        fname += "opticalflow/brox_optical_flow_cc20.bin";
    else
        fname += "opticalflow/brox_optical_flow.bin";
--- a/modules/gpustereo/doc/stereo.rst
+++ b/modules/gpustereo/doc/stereo.rst
@ -5,135 +5,75 @@ Stereo Correspondence



-gpu::StereoBM_GPU
-----------------
-.. ocv:class:: gpu::StereoBM_GPU
+gpu::StereoBM
+-------------
+.. ocv:class:: gpu::StereoBM : public cv::StereoBM

 Class computing stereo correspondence (disparity map) using the block matching algorithm. ::

-    class StereoBM_GPU
-    {
-    public:
-        enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };
-
-        enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
-
-        StereoBM_GPU();
-        StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP,
-                     int winSize = DEFAULT_WINSZ);
-
-        void operator() (const GpuMat& left, const GpuMat& right,
-                         GpuMat& disparity, Stream& stream = Stream::Null());
-
-        static bool checkIfGpuCallReasonable();
-
-        int preset;
-        int ndisp;
-        int winSize;
-
-        float avergeTexThreshold;
-
-        ...
-    };
-
-
-The class also performs pre- and post-filtering steps: Sobel pre-filtering (if ``PREFILTER_XSOBEL`` flag is set) and low textureness filtering (if ``averageTexThreshols > 0`` ). If ``avergeTexThreshold = 0`` , low textureness filtering is disabled. Otherwise, the disparity is set to 0 in each point ``(x, y)`` , where for the left image
-
-.. math::
-    \sum HorizontalGradiensInWindow(x, y, winSize) < (winSize \cdot winSize) \cdot avergeTexThreshold
-
-This means that the input left image is low textured.
+.. seealso:: :ocv:class:`StereoBM`



-gpu::StereoBM_GPU::StereoBM_GPU
-----------------------------------
-Enables :ocv:class:`gpu::StereoBM_GPU` constructors.
+gpu::createStereoBM
+-------------------
+Creates a StereoBM object.

-.. ocv:function:: gpu::StereoBM_GPU::StereoBM_GPU()
+.. ocv:function:: Ptr<gpu::StereoBM> gpu::createStereoBM(int numDisparities = 64, int blockSize = 19)

-.. ocv:function:: gpu::StereoBM_GPU::StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ)
+    :param numDisparities: the disparity search range. For each pixel, the algorithm finds the best disparity from 0 (the default minimum disparity) to ``numDisparities``. The search range can then be shifted by changing the minimum disparity.

-    :param preset: Parameter presetting:
-
-        * **BASIC_PRESET** Basic mode without pre-processing.
-
-        * **PREFILTER_XSOBEL** Sobel pre-filtering mode.
-
-    :param ndisparities: Number of disparities. It must be a multiple of 8 and less or equal to 256.
-
-    :param winSize: Block size.
-
-
-
-gpu::StereoBM_GPU::operator ()
----------------------------------
-Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
-
-.. ocv:function:: void gpu::StereoBM_GPU::operator ()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null())
-
-    :param left: Left image. Only  ``CV_8UC1``  type is supported.
-
-    :param right: Right image with the same size and the same type as the left one.
-
-    :param disparity: Output disparity map. It is a  ``CV_8UC1``  image with the same size as the input images.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::StereoBM_GPU::checkIfGpuCallReasonable
-----------------------------------------------
-Uses a heuristic method to estimate whether the current GPU is faster than the CPU in this algorithm. It queries the currently active device.
-
-.. ocv:function:: bool gpu::StereoBM_GPU::checkIfGpuCallReasonable()
+    :param blockSize: the linear size of the blocks compared by the algorithm. The size should be odd (as the block is centered at the current pixel). A larger block size implies a smoother, though less accurate, disparity map. A smaller block size gives a more detailed disparity map, but there is a higher chance for the algorithm to find a wrong correspondence.



 gpu::StereoBeliefPropagation
 ----------------------------
-.. ocv:class:: gpu::StereoBeliefPropagation
+.. ocv:class:: gpu::StereoBeliefPropagation : public cv::StereoMatcher

 Class computing stereo correspondence using the belief propagation algorithm. ::

-    class StereoBeliefPropagation
+    class CV_EXPORTS StereoBeliefPropagation : public cv::StereoMatcher
    {
    public:
-        enum { DEFAULT_NDISP  = 64 };
-        enum { DEFAULT_ITERS  = 5  };
-        enum { DEFAULT_LEVELS = 5  };
+        using cv::StereoMatcher::compute;

-        static void estimateRecommendedParams(int width, int height,
-            int& ndisp, int& iters, int& levels);
+        virtual void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream) = 0;

-        explicit StereoBeliefPropagation(int ndisp = DEFAULT_NDISP,
-            int iters  = DEFAULT_ITERS,
-            int levels = DEFAULT_LEVELS,
-            int msg_type = CV_32F);
-        StereoBeliefPropagation(int ndisp, int iters, int levels,
-            float max_data_term, float data_weight,
-            float max_disc_term, float disc_single_jump,
-            int msg_type = CV_32F);
+        //! version for user specified data term
+        virtual void compute(InputArray data, OutputArray disparity, Stream& stream = Stream::Null()) = 0;

-        void operator()(const GpuMat& left, const GpuMat& right,
-                        GpuMat& disparity, Stream& stream = Stream::Null());
-        void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null());
+        //! number of BP iterations on each level
+        virtual int getNumIters() const = 0;
+        virtual void setNumIters(int iters) = 0;

-        int ndisp;
+        //! number of levels
+        virtual int getNumLevels() const = 0;
+        virtual void setNumLevels(int levels) = 0;

-        int iters;
-        int levels;
+        //! truncation of data cost
+        virtual double getMaxDataTerm() const = 0;
+        virtual void setMaxDataTerm(double max_data_term) = 0;

-        float max_data_term;
-        float data_weight;
-        float max_disc_term;
-        float disc_single_jump;
+        //! data weight
+        virtual double getDataWeight() const = 0;
+        virtual void setDataWeight(double data_weight) = 0;

-        int msg_type;
+        //! truncation of discontinuity cost
+        virtual double getMaxDiscTerm() const = 0;
+        virtual void setMaxDiscTerm(double max_disc_term) = 0;

-        ...
+        //! discontinuity single jump
+        virtual double getDiscSingleJump() const = 0;
+        virtual void setDiscSingleJump(double disc_single_jump) = 0;
+
+        virtual int getMsgType() const = 0;
+        virtual void setMsgType(int msg_type) = 0;
+
+        static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels);
    };

+
 The class implements algorithm described in [Felzenszwalb2006]_ . It can compute own data cost (using a truncated linear model) or use a user-provided data cost.

 .. note::
@ -152,32 +92,6 @@ The class implements algorithm described in [Felzenszwalb2006]_ . It can compute

    ``width_step`` is the number of bytes in a line including padding.

-
-
-gpu::StereoBeliefPropagation::StereoBeliefPropagation
---------------------------------------------------------
-Enables the :ocv:class:`gpu::StereoBeliefPropagation` constructors.
-
-.. ocv:function:: gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int msg_type = CV_32F)
-
-.. ocv:function:: gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp, int iters, int levels, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int msg_type = CV_32F)
-
-    :param ndisp: Number of disparities.
-
-    :param iters: Number of BP iterations on each level.
-
-    :param levels: Number of levels.
-
-    :param max_data_term: Threshold for data cost truncation.
-
-    :param data_weight: Data weight.
-
-    :param max_disc_term: Threshold for discontinuity truncation.
-
-    :param disc_single_jump: Discontinuity single jump.
-
-    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
-
 ``StereoBeliefPropagation`` uses a truncated linear model for the data cost and discontinuity terms:

 .. math::
@ -190,33 +104,45 @@ Enables the :ocv:class:`gpu::StereoBeliefPropagation` constructors.

 For more details, see [Felzenszwalb2006]_.

-By default, :ocv:class:`gpu::StereoBeliefPropagation` uses floating-point arithmetics and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetics and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:
+By default, ``StereoBeliefPropagation`` uses floating-point arithmetic and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetic and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:

 .. math::

    10  \cdot 2^{levels-1}  \cdot max \_ data \_ term < SHRT \_ MAX

+.. seealso:: :ocv:class:`StereoMatcher`
+
+
+
+gpu::createStereoBeliefPropagation
+----------------------------------
+Creates a StereoBeliefPropagation object.
+
+.. ocv:function:: Ptr<gpu::StereoBeliefPropagation> gpu::createStereoBeliefPropagation(int ndisp = 64, int iters = 5, int levels = 5, int msg_type = CV_32F)
+
+    :param ndisp: Number of disparities.
+
+    :param iters: Number of BP iterations on each level.
+
+    :param levels: Number of levels.
+
+    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
+


 gpu::StereoBeliefPropagation::estimateRecommendedParams
-----------------------------------------------------------
+-------------------------------------------------------
 Uses a heuristic method to compute the recommended parameters ( ``ndisp``, ``iters`` and ``levels`` ) for the specified image size ( ``width`` and ``height`` ).

 .. ocv:function:: void gpu::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)



-gpu::StereoBeliefPropagation::operator ()
---------------------------------------------
-Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair or data cost.
+gpu::StereoBeliefPropagation::compute
+-------------------------------------
+Enables the stereo correspondence operator that finds the disparity for the specified data cost.

-.. ocv:function:: void gpu::StereoBeliefPropagation::operator ()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::StereoBeliefPropagation::operator ()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null())
-
-    :param left: Left image. ``CV_8UC1`` , ``CV_8UC3``  and  ``CV_8UC4``  types are supported.
-
-    :param right: Right image with the same size and the same type as the left one.
+.. ocv:function:: void gpu::StereoBeliefPropagation::compute(InputArray data, OutputArray disparity, Stream& stream = Stream::Null())

    :param data: User-specified data cost, a matrix of ``msg_type`` type and ``Size(<image columns>*ndisp, <image rows>)`` size.

@ -228,89 +154,26 @@ Enables the stereo correspondence operator that finds the disparity for the spec

 gpu::StereoConstantSpaceBP
 --------------------------
-.. ocv:class:: gpu::StereoConstantSpaceBP
+.. ocv:class:: gpu::StereoConstantSpaceBP : public gpu::StereoBeliefPropagation

 Class computing stereo correspondence using the constant space belief propagation algorithm. ::

-    class StereoConstantSpaceBP
+    class CV_EXPORTS StereoConstantSpaceBP : public gpu::StereoBeliefPropagation
    {
    public:
-        enum { DEFAULT_NDISP    = 128 };
-        enum { DEFAULT_ITERS    = 8   };
-        enum { DEFAULT_LEVELS   = 4   };
-        enum { DEFAULT_NR_PLANE = 4   };
+        //! number of active disparity on the first level
+        virtual int getNrPlane() const = 0;
+        virtual void setNrPlane(int nr_plane) = 0;

-        static void estimateRecommendedParams(int width, int height,
-            int& ndisp, int& iters, int& levels, int& nr_plane);
+        virtual bool getUseLocalInitDataCost() const = 0;
+        virtual void setUseLocalInitDataCost(bool use_local_init_data_cost) = 0;

-        explicit StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP,
-            int iters    = DEFAULT_ITERS,
-            int levels   = DEFAULT_LEVELS,
-            int nr_plane = DEFAULT_NR_PLANE,
-            int msg_type = CV_32F);
-        StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
-            float max_data_term, float data_weight,
-            float max_disc_term, float disc_single_jump,
-            int min_disp_th = 0,
-            int msg_type = CV_32F);
-
-        void operator()(const GpuMat& left, const GpuMat& right,
-                        GpuMat& disparity, Stream& stream = Stream::Null());
-
-        int ndisp;
-
-        int iters;
-        int levels;
-
-        int nr_plane;
-
-        float max_data_term;
-        float data_weight;
-        float max_disc_term;
-        float disc_single_jump;
-
-        int min_disp_th;
-
-        int msg_type;
-
-        bool use_local_init_data_cost;
-
-        ...
+        static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane);
    };


 The class implements algorithm described in [Yang2010]_. ``StereoConstantSpaceBP`` supports both local minimum and global minimum data cost initialization algorithms. For more details, see the paper mentioned above. By default, a local algorithm is used. To enable a global algorithm, set ``use_local_init_data_cost`` to ``false`` .

-
-
-gpu::StereoConstantSpaceBP::StereoConstantSpaceBP
-----------------------------------------------------
-Enables the :ocv:class:`gpu::StereoConstantSpaceBP` constructors.
-
-.. ocv:function:: gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int nr_plane = DEFAULT_NR_PLANE, int msg_type = CV_32F)
-
-.. ocv:function:: gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th = 0, int msg_type = CV_32F)
-
-    :param ndisp: Number of disparities.
-
-    :param iters: Number of BP iterations on each level.
-
-    :param levels: Number of levels.
-
-    :param nr_plane: Number of disparity levels on the first level.
-
-    :param max_data_term: Truncation of data cost.
-
-    :param data_weight: Data weight.
-
-    :param max_disc_term: Truncation of discontinuity.
-
-    :param disc_single_jump: Discontinuity single jump.
-
-    :param min_disp_th: Minimal disparity threshold.
-
-    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
-
 ``StereoConstantSpaceBP`` uses a truncated linear model for the data cost and discontinuity terms:

 .. math::
@ -331,54 +194,65 @@ By default, ``StereoConstantSpaceBP`` uses floating-point arithmetics and the ``



+gpu::createStereoConstantSpaceBP
+--------------------------------
+Creates a StereoConstantSpaceBP object.
+
+.. ocv:function:: Ptr<gpu::StereoConstantSpaceBP> gpu::createStereoConstantSpaceBP(int ndisp = 128, int iters = 8, int levels = 4, int nr_plane = 4, int msg_type = CV_32F)
+
+    :param ndisp: Number of disparities.
+
+    :param iters: Number of BP iterations on each level.
+
+    :param levels: Number of levels.
+
+    :param nr_plane: Number of disparity levels on the first level.
+
+    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
+
+
+
 gpu::StereoConstantSpaceBP::estimateRecommendedParams
---------------------------------------------------------
+-----------------------------------------------------
 Uses a heuristic method to compute parameters (ndisp, iters, levelsand nrplane) for the specified image size (widthand height).

 .. ocv:function:: void gpu::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane)



-gpu::StereoConstantSpaceBP::operator ()
-------------------------------------------
-Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
-
-.. ocv:function:: void gpu::StereoConstantSpaceBP::operator ()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null())
-
-    :param left: Left image. ``CV_8UC1`` , ``CV_8UC3``  and  ``CV_8UC4``  types are supported.
-
-    :param right: Right image with the same size and the same type as the left one.
-
-    :param disparity: Output disparity map. If  ``disparity``  is empty, the output type is  ``CV_16SC1`` . Otherwise, the output type is  ``disparity.type()`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
 gpu::DisparityBilateralFilter
 -----------------------------
-.. ocv:class:: gpu::DisparityBilateralFilter
+.. ocv:class:: gpu::DisparityBilateralFilter : public cv::Algorithm

 Class refining a disparity map using joint bilateral filtering. ::

-    class CV_EXPORTS DisparityBilateralFilter
+    class CV_EXPORTS DisparityBilateralFilter : public cv::Algorithm
    {
    public:
-        enum { DEFAULT_NDISP  = 64 };
-        enum { DEFAULT_RADIUS = 3 };
-        enum { DEFAULT_ITERS  = 1 };
+        //! the disparity map refinement operator. Refine disparity map using joint bilateral filtering given a single color image.
+        //! disparity must have CV_8U or CV_16S type, image must have CV_8UC1 or CV_8UC3 type.
+        virtual void apply(InputArray disparity, InputArray image, OutputArray dst, Stream& stream = Stream::Null()) = 0;

-        explicit DisparityBilateralFilter(int ndisp = DEFAULT_NDISP,
-            int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS);
+        virtual int getNumDisparities() const = 0;
+        virtual void setNumDisparities(int numDisparities) = 0;

-        DisparityBilateralFilter(int ndisp, int radius, int iters,
-            float edge_threshold, float max_disc_threshold,
-            float sigma_range);
+        virtual int getRadius() const = 0;
+        virtual void setRadius(int radius) = 0;

-        void operator()(const GpuMat& disparity, const GpuMat& image,
-                        GpuMat& dst, Stream& stream = Stream::Null());
+        virtual int getNumIters() const = 0;
+        virtual void setNumIters(int iters) = 0;

-        ...
+        //! truncation of data continuity
+        virtual double getEdgeThreshold() const = 0;
+        virtual void setEdgeThreshold(double edge_threshold) = 0;
+
+        //! truncation of disparity continuity
+        virtual double getMaxDiscThreshold() const = 0;
+        virtual void setMaxDiscThreshold(double max_disc_threshold) = 0;
+
+        //! filter range sigma
+        virtual double getSigmaRange() const = 0;
+        virtual void setSigmaRange(double sigma_range) = 0;
    };


@ -386,13 +260,11 @@ The class implements [Yang2010]_ algorithm.



-gpu::DisparityBilateralFilter::DisparityBilateralFilter
-----------------------------------------------------------
-Enables the :ocv:class:`gpu::DisparityBilateralFilter` constructors.
+gpu::createDisparityBilateralFilter
+-----------------------------------
+Creates a DisparityBilateralFilter object.

-.. ocv:function:: gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp = DEFAULT_NDISP, int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS)
-
-.. ocv:function:: gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, float sigma_range)
+.. ocv:function:: Ptr<gpu::DisparityBilateralFilter> gpu::createDisparityBilateralFilter(int ndisp = 64, int radius = 3, int iters = 1)

    :param ndisp: Number of disparities.

@ -400,19 +272,13 @@ Enables the :ocv:class:`gpu::DisparityBilateralFilter` constructors.

    :param iters: Number of iterations.

-    :param edge_threshold: Threshold for edges.
-
-    :param max_disc_threshold: Constant to reject outliers.
-
-    :param sigma_range: Filter range.


-
-gpu::DisparityBilateralFilter::operator ()
----------------------------------------------
+gpu::DisparityBilateralFilter::apply
+------------------------------------
 Refines a disparity map using joint bilateral filtering.

-.. ocv:function:: void gpu::DisparityBilateralFilter::operator ()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::DisparityBilateralFilter::apply(InputArray disparity, InputArray image, OutputArray dst, Stream& stream = Stream::Null())

    :param disparity: Input disparity map.  ``CV_8UC1``  and  ``CV_16SC1``  types are supported.

@ -424,29 +290,11 @@ Refines a disparity map using joint bilateral filtering.



-gpu::drawColorDisp
----------------------
-Colors a disparity image.
-
-.. ocv:function:: void gpu::drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null())
-
-    :param src_disp: Source disparity image.  ``CV_8UC1``  and  ``CV_16SC1``  types are supported.
-
-    :param dst_disp: Output disparity image. It has the same size as  ``src_disp`` . The  type is ``CV_8UC4``  in  ``BGRA``  format (alpha = 255).
-
-    :param ndisp: Number of disparities.
-
-    :param stream: Stream for the asynchronous version.
-
-This function draws a colored disparity map by converting disparity values from ``[0..ndisp)`` interval first to ``HSV`` color space (where different disparity values correspond to different hues) and then converting the pixels to ``RGB`` for visualization.
-
-
-
 gpu::reprojectImageTo3D
---------------------------
+-----------------------
 Reprojects a disparity image to 3D space.

-.. ocv:function:: void gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, int dst_cn = 4, Stream& stream = Stream::Null())
+.. ocv:function:: void gpu::reprojectImageTo3D(InputArray disp, OutputArray xyzw, InputArray Q, int dst_cn = 4, Stream& stream = Stream::Null())

    :param disp: Input disparity image.  ``CV_8U``  and  ``CV_16S``  types are supported.

@ -462,6 +310,23 @@ Reprojects a disparity image to 3D space.



-.. [Felzenszwalb2006] Pedro F. Felzenszwalb algorithm [Pedro F. Felzenszwalb and Daniel P. Huttenlocher. *Efficient belief propagation for early vision*. International Journal of Computer Vision, 70(1), October 2006
+gpu::drawColorDisp
+------------------
+Colors a disparity image.

+.. ocv:function:: void gpu::drawColorDisp(InputArray src_disp, OutputArray dst_disp, int ndisp, Stream& stream = Stream::Null())
+
+    :param src_disp: Source disparity image.  ``CV_8UC1``  and  ``CV_16SC1``  types are supported.
+
+    :param dst_disp: Output disparity image. It has the same size as  ``src_disp`` . The  type is ``CV_8UC4``  in  ``BGRA``  format (alpha = 255).
+
+    :param ndisp: Number of disparities.
+
+    :param stream: Stream for the asynchronous version.
+
+This function draws a colored disparity map by converting disparity values from ``[0..ndisp)`` interval first to ``HSV`` color space (where different disparity values correspond to different hues) and then converting the pixels to ``RGB`` for visualization.
+
+
+
+.. [Felzenszwalb2006] Pedro F. Felzenszwalb and Daniel P. Huttenlocher. *Efficient belief propagation for early vision*. International Journal of Computer Vision, 70(1), October 2006.
 .. [Yang2010] Q. Yang, L. Wang, and N. Ahuja. *A constant-space belief propagation algorithm for stereo matching*. In CVPR, 2010.
--- a/modules/gpustereo/include/opencv2/gpustereo.hpp
+++ b/modules/gpustereo/include/opencv2/gpustereo.hpp
@ -48,199 +48,145 @@
 #endif

 #include "opencv2/core/gpu.hpp"
+#include "opencv2/calib3d.hpp"

 namespace cv { namespace gpu {

-class CV_EXPORTS StereoBM_GPU
+/////////////////////////////////////////
+// StereoBM
+
+class CV_EXPORTS StereoBM : public cv::StereoBM
 {
 public:
-    enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };
+    using cv::StereoBM::compute;

-    enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
-
-    //! the default constructor
-    StereoBM_GPU();
-    //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be multiple of 8.
-    StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);
-
-    //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair
-    //! Output disparity has CV_8U type.
-    void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());
-
-    //! Some heuristics that tries to estmate
-    // if current GPU will be faster than CPU in this algorithm.
-    // It queries current active device.
-    static bool checkIfGpuCallReasonable();
-
-    int preset;
-    int ndisp;
-    int winSize;
-
-    // If avergeTexThreshold  == 0 => post procesing is disabled
-    // If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image
-    // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold
-    // i.e. input left image is low textured.
-    float avergeTexThreshold;
-
-private:
-    GpuMat minSSD, leBuf, riBuf;
+    virtual void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream) = 0;
 };

-// "Efficient Belief Propagation for Early Vision"
-// P.Felzenszwalb
-class CV_EXPORTS StereoBeliefPropagation
+CV_EXPORTS Ptr<gpu::StereoBM> createStereoBM(int numDisparities = 64, int blockSize = 19);
+
+/////////////////////////////////////////
+// StereoBeliefPropagation
+
+//! "Efficient Belief Propagation for Early Vision" P.Felzenszwalb
+class CV_EXPORTS StereoBeliefPropagation : public cv::StereoMatcher
 {
 public:
-    enum { DEFAULT_NDISP  = 64 };
-    enum { DEFAULT_ITERS  = 5  };
-    enum { DEFAULT_LEVELS = 5  };
-
-    static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels);
-
-    //! the default constructor
-    explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,
-                                     int iters  = DEFAULT_ITERS,
-                                     int levels = DEFAULT_LEVELS,
-                                     int msg_type = CV_32F);
-
-    //! the full constructor taking the number of disparities, number of BP iterations on each level,
-    //! number of levels, truncation of data cost, data weight,
-    //! truncation of discontinuity cost and discontinuity single jump
-    //! DataTerm = data_weight * min(fabs(I2-I1), max_data_term)
-    //! DiscTerm = min(disc_single_jump * fabs(f1-f2), max_disc_term)
-    //! please see paper for more details
-    StereoBeliefPropagation(int ndisp, int iters, int levels,
-        float max_data_term, float data_weight,
-        float max_disc_term, float disc_single_jump,
-        int msg_type = CV_32F);
-
-    //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,
-    //! if disparity is empty output type will be CV_16S else output type will be disparity.type().
-    void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());
+    using cv::StereoMatcher::compute;

+    virtual void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream) = 0;

    //! version for user specified data term
-    void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null());
+    virtual void compute(InputArray data, OutputArray disparity, Stream& stream = Stream::Null()) = 0;

-    int ndisp;
+    //! number of BP iterations on each level
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int iters) = 0;

-    int iters;
-    int levels;
+    //! number of levels
+    virtual int getNumLevels() const = 0;
+    virtual void setNumLevels(int levels) = 0;

-    float max_data_term;
-    float data_weight;
-    float max_disc_term;
-    float disc_single_jump;
+    //! truncation of data cost
+    virtual double getMaxDataTerm() const = 0;
+    virtual void setMaxDataTerm(double max_data_term) = 0;

-    int msg_type;
-private:
-    GpuMat u, d, l, r, u2, d2, l2, r2;
-    std::vector<GpuMat> datas;
-    GpuMat out;
+    //! data weight
+    virtual double getDataWeight() const = 0;
+    virtual void setDataWeight(double data_weight) = 0;
+
+    //! truncation of discontinuity cost
+    virtual double getMaxDiscTerm() const = 0;
+    virtual void setMaxDiscTerm(double max_disc_term) = 0;
+
+    //! discontinuity single jump
+    virtual double getDiscSingleJump() const = 0;
+    virtual void setDiscSingleJump(double disc_single_jump) = 0;
+
+    //! type for messages (CV_16SC1 or CV_32FC1)
+    virtual int getMsgType() const = 0;
+    virtual void setMsgType(int msg_type) = 0;
+
+    static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels);
 };

-// "A Constant-Space Belief Propagation Algorithm for Stereo Matching"
-// Qingxiong Yang, Liang Wang, Narendra Ahuja
-// http://vision.ai.uiuc.edu/~qyang6/
-class CV_EXPORTS StereoConstantSpaceBP
+CV_EXPORTS Ptr<gpu::StereoBeliefPropagation>
+    createStereoBeliefPropagation(int ndisp = 64, int iters = 5, int levels = 5, int msg_type = CV_32F);
+
+/////////////////////////////////////////
+// StereoConstantSpaceBP
+
+//! "A Constant-Space Belief Propagation Algorithm for Stereo Matching"
+//! Qingxiong Yang, Liang Wang, Narendra Ahuja
+//! http://vision.ai.uiuc.edu/~qyang6/
+class CV_EXPORTS StereoConstantSpaceBP : public gpu::StereoBeliefPropagation
 {
 public:
-    enum { DEFAULT_NDISP    = 128 };
-    enum { DEFAULT_ITERS    = 8   };
-    enum { DEFAULT_LEVELS   = 4   };
-    enum { DEFAULT_NR_PLANE = 4   };
+    //! number of active disparities on the first level
+    virtual int getNrPlane() const = 0;
+    virtual void setNrPlane(int nr_plane) = 0;
+
+    virtual bool getUseLocalInitDataCost() const = 0;
+    virtual void setUseLocalInitDataCost(bool use_local_init_data_cost) = 0;

    static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane);
-
-    //! the default constructor
-    explicit StereoConstantSpaceBP(int ndisp    = DEFAULT_NDISP,
-                                   int iters    = DEFAULT_ITERS,
-                                   int levels   = DEFAULT_LEVELS,
-                                   int nr_plane = DEFAULT_NR_PLANE,
-                                   int msg_type = CV_32F);
-
-    //! the full constructor taking the number of disparities, number of BP iterations on each level,
-    //! number of levels, number of active disparity on the first level, truncation of data cost, data weight,
-    //! truncation of discontinuity cost, discontinuity single jump and minimum disparity threshold
-    StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
-        float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
-        int min_disp_th = 0,
-        int msg_type = CV_32F);
-
-    //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair,
-    //! if disparity is empty output type will be CV_16S else output type will be disparity.type().
-    void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());
-
-    int ndisp;
-
-    int iters;
-    int levels;
-
-    int nr_plane;
-
-    float max_data_term;
-    float data_weight;
-    float max_disc_term;
-    float disc_single_jump;
-
-    int min_disp_th;
-
-    int msg_type;
-
-    bool use_local_init_data_cost;
-private:
-    GpuMat messages_buffers;
-
-    GpuMat temp;
-    GpuMat out;
 };

-// Disparity map refinement using joint bilateral filtering given a single color image.
-// Qingxiong Yang, Liang Wang, Narendra Ahuja
-// http://vision.ai.uiuc.edu/~qyang6/
-class CV_EXPORTS DisparityBilateralFilter
+CV_EXPORTS Ptr<gpu::StereoConstantSpaceBP>
+    createStereoConstantSpaceBP(int ndisp = 128, int iters = 8, int levels = 4, int nr_plane = 4, int msg_type = CV_32F);
+
+/////////////////////////////////////////
+// DisparityBilateralFilter
+
+//! Disparity map refinement using joint bilateral filtering given a single color image.
+//! Qingxiong Yang, Liang Wang, Narendra Ahuja
+//! http://vision.ai.uiuc.edu/~qyang6/
+class CV_EXPORTS DisparityBilateralFilter : public cv::Algorithm
 {
 public:
-    enum { DEFAULT_NDISP  = 64 };
-    enum { DEFAULT_RADIUS = 3 };
-    enum { DEFAULT_ITERS  = 1 };
-
-    //! the default constructor
-    explicit DisparityBilateralFilter(int ndisp = DEFAULT_NDISP, int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS);
-
-    //! the full constructor taking the number of disparities, filter radius,
-    //! number of iterations, truncation of data continuity, truncation of disparity continuity
-    //! and filter range sigma
-    DisparityBilateralFilter(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, float sigma_range);
-
    //! the disparity map refinement operator. Refine disparity map using joint bilateral filtering given a single color image.
    //! disparity must have CV_8U or CV_16S type, image must have CV_8UC1 or CV_8UC3 type.
-    void operator()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream = Stream::Null());
+    virtual void apply(InputArray disparity, InputArray image, OutputArray dst, Stream& stream = Stream::Null()) = 0;

-private:
-    int ndisp;
-    int radius;
-    int iters;
+    virtual int getNumDisparities() const = 0;
+    virtual void setNumDisparities(int numDisparities) = 0;

-    float edge_threshold;
-    float max_disc_threshold;
-    float sigma_range;
+    virtual int getRadius() const = 0;
+    virtual void setRadius(int radius) = 0;

-    GpuMat table_color;
-    GpuMat table_space;
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int iters) = 0;
+
+    //! truncation of data continuity
+    virtual double getEdgeThreshold() const = 0;
+    virtual void setEdgeThreshold(double edge_threshold) = 0;
+
+    //! truncation of disparity continuity
+    virtual double getMaxDiscThreshold() const = 0;
+    virtual void setMaxDiscThreshold(double max_disc_threshold) = 0;
+
+    //! filter range sigma
+    virtual double getSigmaRange() const = 0;
+    virtual void setSigmaRange(double sigma_range) = 0;
 };

+CV_EXPORTS Ptr<gpu::DisparityBilateralFilter>
+    createDisparityBilateralFilter(int ndisp = 64, int radius = 3, int iters = 1);
+
+/////////////////////////////////////////
+// Utility
+
 //! Reprojects disparity image to 3D space.
 //! Supports CV_8U and CV_16S types of input disparity.
 //! The output is a 3- or 4-channel floating-point matrix.
 //! Each element of this matrix will contain the 3D coordinates of the point (x,y,z,1), computed from the disparity map.
 //! Q is the 4x4 perspective transformation matrix that can be obtained with cvStereoRectify.
-CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, int dst_cn = 4, Stream& stream = Stream::Null());
+CV_EXPORTS void reprojectImageTo3D(InputArray disp, OutputArray xyzw, InputArray Q, int dst_cn = 4, Stream& stream = Stream::Null());

 //! Does coloring of disparity image: [0..ndisp) -> [0..240, 1, 1] in HSV.
 //! Supported types of input disparity: CV_8U, CV_16S.
 //! Output disparity has CV_8UC4 type in BGRA format (alpha = 255).
-CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null());
+CV_EXPORTS void drawColorDisp(InputArray src_disp, OutputArray dst_disp, int ndisp, Stream& stream = Stream::Null());

 }} // namespace cv { namespace gpu {

--- a/modules/gpustereo/perf/perf_stereo.cpp
+++ b/modules/gpustereo/perf/perf_stereo.cpp
@ -63,18 +63,17 @@ PERF_TEST_P(ImagePair, StereoBM,
    const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(imgRight.empty());

-    const int preset = 0;
    const int ndisp = 256;

    if (PERF_RUN_GPU())
    {
-        cv::gpu::StereoBM_GPU d_bm(preset, ndisp);
+        cv::Ptr<cv::StereoBM> d_bm = cv::gpu::createStereoBM(ndisp);

        const cv::gpu::GpuMat d_imgLeft(imgLeft);
        const cv::gpu::GpuMat d_imgRight(imgRight);
        cv::gpu::GpuMat dst;

-        TEST_CYCLE() d_bm(d_imgLeft, d_imgRight, dst);
+        TEST_CYCLE() d_bm->compute(d_imgLeft, d_imgRight, dst);

        GPU_SANITY_CHECK(dst);
    }
@ -108,13 +107,13 @@ PERF_TEST_P(ImagePair, StereoBeliefPropagation,

    if (PERF_RUN_GPU())
    {
-        cv::gpu::StereoBeliefPropagation d_bp(ndisp);
+        cv::Ptr<cv::gpu::StereoBeliefPropagation> d_bp = cv::gpu::createStereoBeliefPropagation(ndisp);

        const cv::gpu::GpuMat d_imgLeft(imgLeft);
        const cv::gpu::GpuMat d_imgRight(imgRight);
        cv::gpu::GpuMat dst;

-        TEST_CYCLE() d_bp(d_imgLeft, d_imgRight, dst);
+        TEST_CYCLE() d_bp->compute(d_imgLeft, d_imgRight, dst);

        GPU_SANITY_CHECK(dst);
    }
@ -142,13 +141,13 @@ PERF_TEST_P(ImagePair, StereoConstantSpaceBP,

    if (PERF_RUN_GPU())
    {
-        cv::gpu::StereoConstantSpaceBP d_csbp(ndisp);
+        cv::Ptr<cv::gpu::StereoConstantSpaceBP> d_csbp = cv::gpu::createStereoConstantSpaceBP(ndisp);

        const cv::gpu::GpuMat d_imgLeft(imgLeft);
        const cv::gpu::GpuMat d_imgRight(imgRight);
        cv::gpu::GpuMat dst;

-        TEST_CYCLE() d_csbp(d_imgLeft, d_imgRight, dst);
+        TEST_CYCLE() d_csbp->compute(d_imgLeft, d_imgRight, dst);

        GPU_SANITY_CHECK(dst);
    }
@ -174,13 +173,13 @@ PERF_TEST_P(ImagePair, DisparityBilateralFilter,

    if (PERF_RUN_GPU())
    {
-        cv::gpu::DisparityBilateralFilter d_filter(ndisp);
+        cv::Ptr<cv::gpu::DisparityBilateralFilter> d_filter = cv::gpu::createDisparityBilateralFilter(ndisp);

        const cv::gpu::GpuMat d_img(img);
        const cv::gpu::GpuMat d_disp(disp);
        cv::gpu::GpuMat dst;

-        TEST_CYCLE() d_filter(d_disp, d_img, dst);
+        TEST_CYCLE() d_filter->apply(d_disp, d_img, dst);

        GPU_SANITY_CHECK(dst);
    }
--- a/modules/gpustereo/src/disparity_bilateral_filter.cpp
+++ b/modules/gpustereo/src/disparity_bilateral_filter.cpp
@ -47,10 +47,7 @@ using namespace cv::gpu;

 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int) { throw_no_cuda(); }
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float) { throw_no_cuda(); }
-
-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<gpu::DisparityBilateralFilter> cv::gpu::createDisparityBilateralFilter(int, int, int) { throw_no_cuda(); return Ptr<gpu::DisparityBilateralFilter>(); }

 #else /* !defined (HAVE_CUDA) */

@ -65,15 +62,46 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-using namespace ::cv::gpu::cudev::disp_bilateral_filter;
-
 namespace
 {
-    const float DEFAULT_EDGE_THRESHOLD = 0.1f;
-    const float DEFAULT_MAX_DISC_THRESHOLD = 0.2f;
-    const float DEFAULT_SIGMA_RANGE = 10.0f;
+    class DispBilateralFilterImpl : public gpu::DisparityBilateralFilter
+    {
+    public:
+        DispBilateralFilterImpl(int ndisp, int radius, int iters);

-    inline void calc_color_weighted_table(GpuMat& table_color, float sigma_range, int len)
+        void apply(InputArray disparity, InputArray image, OutputArray dst, Stream& stream);
+
+        int getNumDisparities() const { return ndisp_; }
+        void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; }
+
+        int getRadius() const { return radius_; }
+        void setRadius(int radius);
+
+        int getNumIters() const { return iters_; }
+        void setNumIters(int iters) { iters_ = iters; }
+
+        double getEdgeThreshold() const { return edge_threshold_; }
+        void setEdgeThreshold(double edge_threshold) { edge_threshold_ = (float) edge_threshold; }
+
+        double getMaxDiscThreshold() const { return max_disc_threshold_; }
+        void setMaxDiscThreshold(double max_disc_threshold) { max_disc_threshold_ = (float) max_disc_threshold; }
+
+        double getSigmaRange() const { return sigma_range_; }
+        void setSigmaRange(double sigma_range);
+
+    private:
+        int ndisp_;
+        int radius_;
+        int iters_;
+        float edge_threshold_;
+        float max_disc_threshold_;
+        float sigma_range_;
+
+        GpuMat table_color_;
+        GpuMat table_space_;
+    };
+
+    void calc_color_weighted_table(GpuMat& table_color, float sigma_range, int len)
    {
        Mat cpu_table_color(1, len, CV_32F);

@ -85,7 +113,7 @@ namespace
        table_color.upload(cpu_table_color);
    }

-    inline void calc_space_weighted_filter(GpuMat& table_space, int win_size, float dist_space)
+    void calc_space_weighted_filter(GpuMat& table_space, int win_size, float dist_space)
    {
        int half = (win_size >> 1);

@ -101,54 +129,78 @@ namespace
        table_space.upload(cpu_table_space);
    }

-    template <typename T>
-    void disp_bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold,float max_disc_threshold,
-                                   GpuMat& table_color, GpuMat& table_space,
-                                   const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
+    const float DEFAULT_EDGE_THRESHOLD = 0.1f;
+    const float DEFAULT_MAX_DISC_THRESHOLD = 0.2f;
+    const float DEFAULT_SIGMA_RANGE = 10.0f;
+
+    DispBilateralFilterImpl::DispBilateralFilterImpl(int ndisp, int radius, int iters) :
+        ndisp_(ndisp), radius_(radius), iters_(iters),
+        edge_threshold_(DEFAULT_EDGE_THRESHOLD), max_disc_threshold_(DEFAULT_MAX_DISC_THRESHOLD),
+        sigma_range_(DEFAULT_SIGMA_RANGE)
    {
-        short edge_disc = std::max<short>(short(1), short(ndisp * edge_threshold + 0.5));
-        short max_disc = short(ndisp * max_disc_threshold + 0.5);
+        calc_color_weighted_table(table_color_, sigma_range_, 255);
+        calc_space_weighted_filter(table_space_, radius_ * 2 + 1, radius_ + 1.0f);
+    }
+
+    void DispBilateralFilterImpl::setRadius(int radius)
+    {
+        radius_ = radius;
+        calc_space_weighted_filter(table_space_, radius_ * 2 + 1, radius_ + 1.0f);
+    }
+
+    void DispBilateralFilterImpl::setSigmaRange(double sigma_range)
+    {
+        sigma_range_ = (float) sigma_range;
+        calc_color_weighted_table(table_color_, sigma_range_, 255);
+    }
+
+    template <typename T>
+    void disp_bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
+                                        GpuMat& table_color, GpuMat& table_space,
+                                        const GpuMat& disp, const GpuMat& img,
+                                        OutputArray _dst, Stream& stream)
+    {
+        using namespace cv::gpu::cudev::disp_bilateral_filter;
+
+        const short edge_disc = std::max<short>(short(1), short(ndisp * edge_threshold + 0.5));
+        const short max_disc = short(ndisp * max_disc_threshold + 0.5);

        disp_load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);

-        if (&dst != &disp)
-        {
+        _dst.create(disp.size(), disp.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        if (dst.data != disp.data)
            disp.copyTo(dst, stream);
-        }

        disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
    }

-    typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
-                                                GpuMat& table_color, GpuMat& table_space,
-                                                const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream);
+    void DispBilateralFilterImpl::apply(InputArray _disp, InputArray _image, OutputArray dst, Stream& stream)
+    {
+        typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
+                                                    GpuMat& table_color, GpuMat& table_space,
+                                                    const GpuMat& disp, const GpuMat& img, OutputArray dst, Stream& stream);
+        const bilateral_filter_operator_t operators[] =
+            {disp_bilateral_filter_operator<unsigned char>, 0, 0, disp_bilateral_filter_operator<short>, 0, 0, 0, 0};

-    const bilateral_filter_operator_t operators[] =
-        {disp_bilateral_filter_operator<unsigned char>, 0, 0, disp_bilateral_filter_operator<short>, 0, 0, 0, 0};
+        CV_Assert( 0 < ndisp_ && 0 < radius_ && 0 < iters_ );
+
+        GpuMat disp = _disp.getGpuMat();
+        GpuMat img = _image.getGpuMat();
+
+        CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S );
+        CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC3 );
+        CV_Assert( disp.size() == img.size() );
+
+        operators[disp.type()](ndisp_, radius_, iters_, edge_threshold_, max_disc_threshold_,
+                               table_color_, table_space_, disp, img, dst, stream);
+    }
 }

-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_)
-    : ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(DEFAULT_EDGE_THRESHOLD), max_disc_threshold(DEFAULT_MAX_DISC_THRESHOLD),
-      sigma_range(DEFAULT_SIGMA_RANGE)
+Ptr<gpu::DisparityBilateralFilter> cv::gpu::createDisparityBilateralFilter(int ndisp, int radius, int iters)
 {
-    calc_color_weighted_table(table_color, sigma_range, 255);
-    calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
-}
-
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_, float edge_threshold_,
-                                                     float max_disc_threshold_, float sigma_range_)
-    : ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(edge_threshold_), max_disc_threshold(max_disc_threshold_),
-      sigma_range(sigma_range_)
-{
-    calc_color_weighted_table(table_color, sigma_range, 255);
-    calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
-}
-
-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
-{
-    CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
-    CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
-    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, stream);
+    return new DispBilateralFilterImpl(ndisp, radius, iters);
 }

 #endif /* !defined (HAVE_CUDA) */
--- a/modules/gpustereo/src/precomp.hpp
+++ b/modules/gpustereo/src/precomp.hpp
@ -48,5 +48,6 @@
 #include "opencv2/gpustereo.hpp"

 #include "opencv2/core/private.gpu.hpp"
+#include "opencv2/core/utility.hpp"

 #endif /* __OPENCV_PRECOMP_H__ */
--- a/modules/gpustereo/src/stereobm.cpp
+++ b/modules/gpustereo/src/stereobm.cpp
@ -47,11 +47,7 @@ using namespace cv::gpu;

 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

-cv::gpu::StereoBM_GPU::StereoBM_GPU() { throw_no_cuda(); }
-cv::gpu::StereoBM_GPU::StereoBM_GPU(int, int, int) { throw_no_cuda(); }
-
-bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable() { throw_no_cuda(); return false; }
-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<gpu::StereoBM> cv::gpu::createStereoBM(int, int) { throw_no_cuda(); return Ptr<gpu::StereoBM>(); }

 #else /* !defined (HAVE_CUDA) */

@ -67,74 +63,123 @@ namespace cv { namespace gpu { namespace cudev

 namespace
 {
-    const float defaultAvgTexThreshold = 3;
-}
+    class StereoBMImpl : public gpu::StereoBM
+    {
+    public:
+        StereoBMImpl(int numDisparities, int blockSize);

-cv::gpu::StereoBM_GPU::StereoBM_GPU()
-    : preset(BASIC_PRESET), ndisp(DEFAULT_NDISP), winSize(DEFAULT_WINSZ), avergeTexThreshold(defaultAvgTexThreshold)
-{
-}
+        void compute(InputArray left, InputArray right, OutputArray disparity);
+        void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream);

-cv::gpu::StereoBM_GPU::StereoBM_GPU(int preset_, int ndisparities_, int winSize_)
-    : preset(preset_), ndisp(ndisparities_), winSize(winSize_), avergeTexThreshold(defaultAvgTexThreshold)
-{
-    const int max_supported_ndisp = 1 << (sizeof(unsigned char) * 8);
-    CV_Assert(0 < ndisp && ndisp <= max_supported_ndisp);
-    CV_Assert(ndisp % 8 == 0);
-    CV_Assert(winSize % 2 == 1);
-}
+        int getMinDisparity() const { return 0; }
+        void setMinDisparity(int /*minDisparity*/) {}

-bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
-{
-    if (0 == getCudaEnabledDeviceCount())
-        return false;
+        int getNumDisparities() const { return ndisp_; }
+        void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; }

-    DeviceInfo device_info;
+        int getBlockSize() const { return winSize_; }
+        void setBlockSize(int blockSize) { winSize_ = blockSize; }

-    if (device_info.major() > 1 || device_info.multiProcessorCount() > 16)
-        return true;
+        int getSpeckleWindowSize() const { return 0; }
+        void setSpeckleWindowSize(int /*speckleWindowSize*/) {}

-    return false;
-}
+        int getSpeckleRange() const { return 0; }
+        void setSpeckleRange(int /*speckleRange*/) {}

-namespace
-{
-    void stereo_bm_gpu_operator( GpuMat& minSSD,  GpuMat& leBuf, GpuMat&  riBuf,  int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, cudaStream_t stream)
+        int getDisp12MaxDiff() const { return 0; }
+        void setDisp12MaxDiff(int /*disp12MaxDiff*/) {}
+
+        int getPreFilterType() const { return preset_; }
+        void setPreFilterType(int preFilterType) { preset_ = preFilterType; }
+
+        int getPreFilterSize() const { return 0; }
+        void setPreFilterSize(int /*preFilterSize*/) {}
+
+        int getPreFilterCap() const { return preFilterCap_; }
+        void setPreFilterCap(int preFilterCap) { preFilterCap_ = preFilterCap; }
+
+        int getTextureThreshold() const { return static_cast<int>(avergeTexThreshold_); } // stored as float; explicit casts avoid narrowing warnings
+        void setTextureThreshold(int textureThreshold) { avergeTexThreshold_ = static_cast<float>(textureThreshold); }
+
+        int getUniquenessRatio() const { return 0; }
+        void setUniquenessRatio(int /*uniquenessRatio*/) {}
+
+        int getSmallerBlockSize() const { return 0; }
+        void setSmallerBlockSize(int /*blockSize*/){}
+
+        Rect getROI1() const { return Rect(); }
+        void setROI1(Rect /*roi1*/) {}
+
+        Rect getROI2() const { return Rect(); }
+        void setROI2(Rect /*roi2*/) {}
+
+    private:
+        int preset_;
+        int ndisp_;
+        int winSize_;
+        int preFilterCap_;
+        float avergeTexThreshold_;
+
+        GpuMat minSSD_, leBuf_, riBuf_;
+    };
+
+    StereoBMImpl::StereoBMImpl(int numDisparities, int blockSize)
+        : preset_(0), ndisp_(numDisparities), winSize_(blockSize), preFilterCap_(31), avergeTexThreshold_(3)
+    {
+    }
+
+    void StereoBMImpl::compute(InputArray left, InputArray right, OutputArray disparity)
+    {
+        compute(left, right, disparity, Stream::Null());
+    }
+
+    void StereoBMImpl::compute(InputArray _left, InputArray _right, OutputArray _disparity, Stream& _stream)
    {
        using namespace ::cv::gpu::cudev::stereobm;

-        CV_Assert(left.rows == right.rows && left.cols == right.cols);
-        CV_Assert(left.type() == CV_8UC1);
-        CV_Assert(right.type() == CV_8UC1);
+        const int max_supported_ndisp = 1 << (sizeof(unsigned char) * 8);
+        CV_Assert( 0 < ndisp_ && ndisp_ <= max_supported_ndisp );
+        CV_Assert( ndisp_ % 8 == 0 );
+        CV_Assert( winSize_ % 2 == 1 );

-        disparity.create(left.size(), CV_8U);
-        minSSD.create(left.size(), CV_32S);
+        GpuMat left = _left.getGpuMat();
+        GpuMat right = _right.getGpuMat();

-        GpuMat le_for_bm =  left;
-        GpuMat ri_for_bm = right;
+        CV_Assert( left.type() == CV_8UC1 );
+        CV_Assert( left.size() == right.size() && left.type() == right.type() );

-        if (preset == StereoBM_GPU::PREFILTER_XSOBEL)
+        _disparity.create(left.size(), CV_8UC1);
+        GpuMat disparity = _disparity.getGpuMat();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        gpu::ensureSizeIsEnough(left.size(), CV_32SC1, minSSD_);
+
+        PtrStepSzb le_for_bm =  left;
+        PtrStepSzb ri_for_bm = right;
+
+        if (preset_ == cv::StereoBM::PREFILTER_XSOBEL)
        {
-            leBuf.create( left.size(),  left.type());
-            riBuf.create(right.size(), right.type());
+            gpu::ensureSizeIsEnough(left.size(), left.type(), leBuf_);
+            gpu::ensureSizeIsEnough(right.size(), right.type(), riBuf_);

-            prefilter_xsobel( left, leBuf, 31, stream);
-            prefilter_xsobel(right, riBuf, 31, stream);
+            prefilter_xsobel( left, leBuf_, preFilterCap_, stream);
+            prefilter_xsobel(right, riBuf_, preFilterCap_, stream);

-            le_for_bm = leBuf;
-            ri_for_bm = riBuf;
+            le_for_bm = leBuf_;
+            ri_for_bm = riBuf_;
        }

-        stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp, winSize, minSSD, stream);
+        stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp_, winSize_, minSSD_, stream);

-        if (avergeTexThreshold)
-            postfilter_textureness(le_for_bm, winSize, avergeTexThreshold, disparity, stream);
+        if (avergeTexThreshold_ > 0)
+            postfilter_textureness(le_for_bm, winSize_, avergeTexThreshold_, disparity, stream);
    }
 }

-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream)
+Ptr<gpu::StereoBM> cv::gpu::createStereoBM(int numDisparities, int blockSize)
 {
-    stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, StreamAccessor::getStream(stream));
+    return new StereoBMImpl(numDisparities, blockSize);
 }

 #endif /* !defined (HAVE_CUDA) */
--- a/modules/gpustereo/src/stereobp.cpp
+++ b/modules/gpustereo/src/stereobp.cpp
@ -49,12 +49,7 @@ using namespace cv::gpu;

 void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int, int, int&, int&, int&) { throw_no_cuda(); }

-cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int, int, int, int) { throw_no_cuda(); }
-cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int, int, int, float, float, float, float, int) { throw_no_cuda(); }
-
-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<gpu::StereoBeliefPropagation> cv::gpu::createStereoBeliefPropagation(int, int, int, int) { throw_no_cuda(); return Ptr<gpu::StereoBeliefPropagation>(); } // Stub in the branch before #else: ignores all arguments and calls throw_no_cuda(); the empty Ptr after it only satisfies the return type (presumably never reached - confirm throw_no_cuda() always throws).

 #else /* !defined (HAVE_CUDA) */

@ -78,14 +73,295 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-using namespace ::cv::gpu::cudev::stereobp;
-
 namespace
 {
+    class StereoBPImpl : public gpu::StereoBeliefPropagation // Concrete implementation handed out by cv::gpu::createStereoBeliefPropagation(); keeps the parameters and the GpuMat work buffers.
+    {
+    public:
+        StereoBPImpl(int ndisp, int iters, int levels, int msg_type); // stores the four arguments; the cost terms start at the DEFAULT_* constants
+
+        void compute(InputArray left, InputArray right, OutputArray disparity); // forwards to the Stream overload with Stream::Null()
+        void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream); // computes the data cost from an image pair, then runs calcBP()
+        void compute(InputArray data, OutputArray disparity, Stream& stream); // runs calcBP() on a caller-supplied data cost (data.rows must be a multiple of ndisp_)
+
+        int getMinDisparity() const { return 0; } // this and the other getters below that return 0 are fixed values; their setters ignore the argument
+        void setMinDisparity(int /*minDisparity*/) {}
+
+        int getNumDisparities() const { return ndisp_; }
+        void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; } // not validated here; compute() asserts ndisp_ > 0
+
+        int getBlockSize() const { return 0; }
+        void setBlockSize(int /*blockSize*/) {}
+
+        int getSpeckleWindowSize() const { return 0; }
+        void setSpeckleWindowSize(int /*speckleWindowSize*/) {}
+
+        int getSpeckleRange() const { return 0; }
+        void setSpeckleRange(int /*speckleRange*/) {}
+
+        int getDisp12MaxDiff() const { return 0; }
+        void setDisp12MaxDiff(int /*disp12MaxDiff*/) {}
+
+        int getNumIters() const { return iters_; }
+        void setNumIters(int iters) { iters_ = iters; } // not validated here; compute() asserts iters_ > 0
+
+        int getNumLevels() const { return levels_; }
+        void setNumLevels(int levels) { levels_ = levels; } // not validated here; compute() asserts levels_ > 0
+
+        double getMaxDataTerm() const { return max_data_term_; }
+        void setMaxDataTerm(double max_data_term) { max_data_term_ = (float) max_data_term; } // the four cost terms are stored as float, so the double argument is narrowed
+
+        double getDataWeight() const { return data_weight_; }
+        void setDataWeight(double data_weight) { data_weight_ = (float) data_weight; }
+
+        double getMaxDiscTerm() const { return max_disc_term_; }
+        void setMaxDiscTerm(double max_disc_term) { max_disc_term_ = (float) max_disc_term; }
+
+        double getDiscSingleJump() const { return disc_single_jump_; }
+        void setDiscSingleJump(double disc_single_jump) { disc_single_jump_ = (float) disc_single_jump; }
+
+        int getMsgType() const { return msg_type_; }
+        void setMsgType(int msg_type) { msg_type_ = msg_type; } // compute() asserts the value is CV_32F or CV_16S
+
+    private:
+        void init(Stream& stream); // allocates and zeroes message buffers, uploads constants, sizes the per-level vectors
+        void calcBP(OutputArray disp, Stream& stream); // builds the data-cost pyramid, iterates coarse-to-fine, writes the disparity
+
+        int ndisp_;
+        int iters_;
+        int levels_;
+        float max_data_term_;
+        float data_weight_;
+        float max_disc_term_;
+        float disc_single_jump_;
+        int msg_type_;
+
+        float scale_; // assigned at the start of each Stream compute(): 1.0f for CV_32F messages, 10.0f otherwise
+        int rows_, cols_; // image size of the current compute() call
+        std::vector<int> cols_all_, rows_all_; // per-level sizes; index 0 is the full resolution
+        GpuMat u_, d_, l_, r_, u2_, d2_, l2_, r2_; // two sets of four message buffers: full size, and (rows+1)/2 x (cols+1)/2 for the *2_ set
+        std::vector<GpuMat> datas_; // data cost per level; resized to levels_ in init()
+        GpuMat outBuf_; // CV_16SC1 staging buffer, used only when the output has a different fixed type
+    };
+
    const float DEFAULT_MAX_DATA_TERM = 10.0f;
    const float DEFAULT_DATA_WEIGHT = 0.07f;
    const float DEFAULT_MAX_DISC_TERM = 1.7f;
    const float DEFAULT_DISC_SINGLE_JUMP = 1.0f;
+
+    StereoBPImpl::StereoBPImpl(int ndisp, int iters, int levels, int msg_type) : // Stores the parameters only; nothing is validated or allocated in the constructor.
+        ndisp_(ndisp), iters_(iters), levels_(levels),
+        max_data_term_(DEFAULT_MAX_DATA_TERM), data_weight_(DEFAULT_DATA_WEIGHT),
+        max_disc_term_(DEFAULT_MAX_DISC_TERM), disc_single_jump_(DEFAULT_DISC_SINGLE_JUMP),
+        msg_type_(msg_type) // scale_, rows_ and cols_ are not initialised here; the Stream compute() overloads assign them
+    {
+    }
+
+    void StereoBPImpl::compute(InputArray left, InputArray right, OutputArray disparity) // Convenience overload without a stream argument.
+    {
+        compute(left, right, disparity, Stream::Null()); // delegates to the Stream overload using Stream::Null()
+    }
+
+    void StereoBPImpl::compute(InputArray _left, InputArray _right, OutputArray disparity, Stream& stream) // Validates parameters and inputs, computes the level-0 data cost from the image pair, then runs calcBP().
+    {
+        using namespace cv::gpu::cudev::stereobp;
+
+        typedef void (*comp_data_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
+        static const comp_data_t comp_data_callers[2][5] = // indexed as [msg_type_ == CV_32F][channel count]; the slots for 0 and 2 channels are null
+        {
+            {0, comp_data_gpu<unsigned char, short>, 0, comp_data_gpu<uchar3, short>, comp_data_gpu<uchar4, short>},
+            {0, comp_data_gpu<unsigned char, float>, 0, comp_data_gpu<uchar3, float>, comp_data_gpu<uchar4, float>}
+        };
+
+        scale_ = msg_type_ == CV_32F ? 1.0f : 10.0f; // assigned before the asserts because the third assert reads it
+
+        CV_Assert( 0 < ndisp_ && 0 < iters_ && 0 < levels_ );
+        CV_Assert( msg_type_ == CV_32F || msg_type_ == CV_16S );
+        CV_Assert( msg_type_ == CV_32F || (1 << (levels_ - 1)) * scale_ * max_data_term_ < std::numeric_limits<short>::max() ); // for CV_16S messages, 2^(levels-1) * scale * max_data_term must stay below the short maximum
+
+        GpuMat left = _left.getGpuMat();
+        GpuMat right = _right.getGpuMat();
+
+        CV_Assert( left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4 ); // these are exactly the channel counts with non-null entries in comp_data_callers
+        CV_Assert( left.size() == right.size() && left.type() == right.type() );
+
+        rows_ = left.rows;
+        cols_ = left.cols;
+
+        const int divisor = (int) pow(2.f, levels_ - 1.0f); // 2^(levels_ - 1)
+        const int lowest_cols = cols_ / divisor;
+        const int lowest_rows = rows_ / divisor;
+        const int min_image_dim_size = 2;
+        CV_Assert( std::min(lowest_cols, lowest_rows) > min_image_dim_size ); // both dimensions divided by 2^(levels_ - 1) must exceed 2
+
+        init(stream); // also resizes datas_ to levels_, so datas_[0] below is valid
+
+        datas_[0].create(rows_ * ndisp_, cols_, msg_type_); // data cost: ndisp_ blocks of rows_ rows each
+
+        comp_data_callers[msg_type_ == CV_32F][left.channels()](left, right, datas_[0], StreamAccessor::getStream(stream));
+
+        calcBP(disparity, stream);
+    }
+
+    void StereoBPImpl::compute(InputArray _data, OutputArray disparity, Stream& stream) // Runs belief propagation on a caller-supplied data cost instead of an image pair.
+    {
+        scale_ = msg_type_ == CV_32F ? 1.0f : 10.0f; // assigned before the asserts because the third assert reads it
+
+        CV_Assert( 0 < ndisp_ && 0 < iters_ && 0 < levels_ );
+        CV_Assert( msg_type_ == CV_32F || msg_type_ == CV_16S );
+        CV_Assert( msg_type_ == CV_32F || (1 << (levels_ - 1)) * scale_ * max_data_term_ < std::numeric_limits<short>::max() ); // for CV_16S messages, 2^(levels-1) * scale * max_data_term must stay below the short maximum
+
+        GpuMat data = _data.getGpuMat();
+
+        CV_Assert( (data.type() == msg_type_) && (data.rows % ndisp_ == 0) ); // data must use the message type and hold ndisp_ equally sized row blocks
+
+        rows_ = data.rows / ndisp_; // image height is recovered from the stacked layout
+        cols_ = data.cols;
+
+        const int divisor = (int) pow(2.f, levels_ - 1.0f); // 2^(levels_ - 1)
+        const int lowest_cols = cols_ / divisor;
+        const int lowest_rows = rows_ / divisor;
+        const int min_image_dim_size = 2;
+        CV_Assert( std::min(lowest_cols, lowest_rows) > min_image_dim_size ); // both dimensions divided by 2^(levels_ - 1) must exceed 2
+
+        init(stream); // also resizes datas_ to levels_, so datas_[0] below is valid
+
+        data.copyTo(datas_[0], stream); // copies into the internal level-0 buffer (the removed implementation assigned the header: datas[0] = data)
+
+        calcBP(disparity, stream);
+    }
+
+    void StereoBPImpl::init(Stream& stream) // Allocates the message buffers for the current rows_/cols_, zeroes the set used first by calcBP(), uploads constants and sizes the per-level vectors.
+    {
+        using namespace cv::gpu::cudev::stereobp;
+
+        u_.create(rows_ * ndisp_, cols_, msg_type_); // full-size set: ndisp_ blocks of rows_ rows each
+        d_.create(rows_ * ndisp_, cols_, msg_type_);
+        l_.create(rows_ * ndisp_, cols_, msg_type_);
+        r_.create(rows_ * ndisp_, cols_, msg_type_);
+
+        if (levels_ & 1) // odd level count: calcBP() starts its coarse-to-fine loop on the full-size set (mem_idx 0), so that set must start at zero
+        {
+            //can clear less area
+            u_.setTo(0, stream);
+            d_.setTo(0, stream);
+            l_.setTo(0, stream);
+            r_.setTo(0, stream);
+        }
+
+        if (levels_ > 1) // the second set is only created when there is more than one level
+        {
+            int less_rows = (rows_ + 1) / 2; // half size, rounded up
+            int less_cols = (cols_ + 1) / 2;
+
+            u2_.create(less_rows * ndisp_, less_cols, msg_type_);
+            d2_.create(less_rows * ndisp_, less_cols, msg_type_);
+            l2_.create(less_rows * ndisp_, less_cols, msg_type_);
+            r2_.create(less_rows * ndisp_, less_cols, msg_type_);
+
+            if ((levels_ & 1) == 0) // even level count: calcBP() starts on the half-size set (mem_idx 1)
+            {
+                u2_.setTo(0, stream);
+                d2_.setTo(0, stream);
+                l2_.setTo(0, stream);
+                r2_.setTo(0, stream);
+            }
+        }
+
+        load_constants(ndisp_, max_data_term_, scale_ * data_weight_, scale_ * max_disc_term_, scale_ * disc_single_jump_); // max_data_term_ is passed unscaled; the other three terms are multiplied by scale_
+
+        datas_.resize(levels_);
+
+        cols_all_.resize(levels_);
+        rows_all_.resize(levels_);
+
+        cols_all_[0] = cols_; // level 0 is the full resolution; calcBP() fills the remaining levels
+        rows_all_[0] = rows_;
+    }
+
+    void StereoBPImpl::calcBP(OutputArray disp, Stream& _stream) // Builds the data-cost pyramid from datas_[0], runs the message iterations from the coarsest level down to level 0, then writes the disparity into disp.
+    {
+        using namespace cv::gpu::cudev::stereobp;
+
+        typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
+        static const data_step_down_t data_step_down_callers[2] = // every *_callers table in this function is indexed by funcIdx: 0 = short messages, 1 = float messages
+        {
+            data_step_down_gpu<short>, data_step_down_gpu<float>
+        };
+
+        typedef void (*level_up_messages_t)(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
+        static const level_up_messages_t level_up_messages_callers[2] =
+        {
+            level_up_messages_gpu<short>, level_up_messages_gpu<float>
+        };
+
+        typedef void (*calc_all_iterations_t)(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
+        static const calc_all_iterations_t calc_all_iterations_callers[2] =
+        {
+            calc_all_iterations_gpu<short>, calc_all_iterations_gpu<float>
+        };
+
+        typedef void (*output_t)(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
+        static const output_t output_callers[2] =
+        {
+            output_gpu<short>, output_gpu<float>
+        };
+
+        const int funcIdx = msg_type_ == CV_32F; // 1 for float messages, 0 otherwise (compute() has asserted CV_16S in that case)
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        for (int i = 1; i < levels_; ++i) // data-cost pyramid: each level is half the previous one, rounded up
+        {
+            cols_all_[i] = (cols_all_[i-1] + 1) / 2;
+            rows_all_[i] = (rows_all_[i-1] + 1) / 2;
+
+            datas_[i].create(rows_all_[i] * ndisp_, cols_all_[i], msg_type_);
+
+            data_step_down_callers[funcIdx](cols_all_[i], rows_all_[i], rows_all_[i-1], datas_[i-1], datas_[i], stream);
+        }
+
+        PtrStepSzb mus[] = {u_, u2_}; // index 0 = full-size set, index 1 = half-size set; levels alternate between the two
+        PtrStepSzb mds[] = {d_, d2_};
+        PtrStepSzb mrs[] = {r_, r2_};
+        PtrStepSzb mls[] = {l_, l2_};
+
+        int mem_idx = (levels_ & 1) ? 0 : 1; // start set chosen so that, after toggling once per level, level 0 lands on index 0 (u_, d_, l_, r_)
+
+        for (int i = levels_ - 1; i >= 0; --i) // coarse to fine
+        {
+            // for lower level we have already computed messages by setting to zero
+            if (i != levels_ - 1)
+                level_up_messages_callers[funcIdx](mem_idx, cols_all_[i], rows_all_[i], rows_all_[i+1], mus, mds, mls, mrs, stream);
+
+            calc_all_iterations_callers[funcIdx](cols_all_[i], rows_all_[i], iters_, mus[mem_idx], mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas_[i], stream);
+
+            mem_idx = (mem_idx + 1) & 1; // toggle between the two buffer sets
+        }
+
+        const int dtype = disp.fixedType() ? disp.type() : CV_16SC1; // honour a fixed output type, otherwise produce CV_16SC1
+
+        disp.create(rows_, cols_, dtype);
+        GpuMat out = disp.getGpuMat();
+
+        if (dtype != CV_16SC1) // output_callers write short disparities, so go through the CV_16SC1 staging buffer
+        {
+            outBuf_.create(rows_, cols_, CV_16SC1);
+            out = outBuf_;
+        }
+
+        out.setTo(0, _stream);
+
+        output_callers[funcIdx](u_, d_, l_, r_, datas_.front(), out, stream); // reads the full-size message set and the level-0 data cost
+
+        if (dtype != CV_16SC1)
+            out.convertTo(disp, dtype, _stream); // convert the staged result to the requested type
+    }
+}
+
+Ptr<gpu::StereoBeliefPropagation> cv::gpu::createStereoBeliefPropagation(int ndisp, int iters, int levels, int msg_type) // Factory: constructs a StereoBPImpl from the four arguments and returns it through the gpu::StereoBeliefPropagation interface.
+{
+    return new StereoBPImpl(ndisp, iters, levels, msg_type); // arguments are stored as given; they are validated later, in compute()
 }

 void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)
@ -101,240 +377,4 @@ void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int width, int
    if (levels == 0) levels++;
 }

-cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp_, int iters_, int levels_, int msg_type_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_),
-      max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
-      max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP),
-      msg_type(msg_type_), datas(levels_)
-{
-}
-
-cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp_, int iters_, int levels_, float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_, int msg_type_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_),
-      max_data_term(max_data_term_), data_weight(data_weight_),
-      max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_),
-      msg_type(msg_type_), datas(levels_)
-{
-}
-
-namespace
-{
-    class StereoBeliefPropagationImpl
-    {
-    public:
-        StereoBeliefPropagationImpl(StereoBeliefPropagation& rthis_,
-                                    GpuMat& u_, GpuMat& d_, GpuMat& l_, GpuMat& r_,
-                                    GpuMat& u2_, GpuMat& d2_, GpuMat& l2_, GpuMat& r2_,
-                                    std::vector<GpuMat>& datas_, GpuMat& out_)
-            : rthis(rthis_), u(u_), d(d_), l(l_), r(r_), u2(u2_), d2(d2_), l2(l2_), r2(r2_), datas(datas_), out(out_),
-              zero(Scalar::all(0)), scale(rthis_.msg_type == CV_32F ? 1.0f : 10.0f)
-        {
-            CV_Assert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels);
-            CV_Assert(rthis.msg_type == CV_32F || rthis.msg_type == CV_16S);
-            CV_Assert(rthis.msg_type == CV_32F || (1 << (rthis.levels - 1)) * scale * rthis.max_data_term < std::numeric_limits<short>::max());
-        }
-
-        void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
-        {
-            typedef void (*comp_data_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
-            static const comp_data_t comp_data_callers[2][5] =
-            {
-                {0, comp_data_gpu<unsigned char, short>, 0, comp_data_gpu<uchar3, short>, comp_data_gpu<uchar4, short>},
-                {0, comp_data_gpu<unsigned char, float>, 0, comp_data_gpu<uchar3, float>, comp_data_gpu<uchar4, float>}
-            };
-
-            CV_Assert(left.size() == right.size() && left.type() == right.type());
-            CV_Assert(left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4);
-
-            rows = left.rows;
-            cols = left.cols;
-
-            int divisor = (int)pow(2.f, rthis.levels - 1.0f);
-            int lowest_cols = cols / divisor;
-            int lowest_rows = rows / divisor;
-            const int min_image_dim_size = 2;
-            CV_Assert(std::min(lowest_cols, lowest_rows) > min_image_dim_size);
-
-            init(stream);
-
-            datas[0].create(rows * rthis.ndisp, cols, rthis.msg_type);
-
-            comp_data_callers[rthis.msg_type == CV_32F][left.channels()](left, right, datas[0], StreamAccessor::getStream(stream));
-
-            calcBP(disp, stream);
-        }
-
-        void operator()(const GpuMat& data, GpuMat& disp, Stream& stream)
-        {
-            CV_Assert((data.type() == rthis.msg_type) && (data.rows % rthis.ndisp == 0));
-
-            rows = data.rows / rthis.ndisp;
-            cols = data.cols;
-
-            int divisor = (int)pow(2.f, rthis.levels - 1.0f);
-            int lowest_cols = cols / divisor;
-            int lowest_rows = rows / divisor;
-            const int min_image_dim_size = 2;
-            CV_Assert(std::min(lowest_cols, lowest_rows) > min_image_dim_size);
-
-            init(stream);
-
-            datas[0] = data;
-
-            calcBP(disp, stream);
-        }
-    private:
-        void init(Stream& stream)
-        {
-            u.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            d.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            l.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            r.create(rows * rthis.ndisp, cols, rthis.msg_type);
-
-            if (rthis.levels & 1)
-            {
-                //can clear less area
-                u.setTo(zero, stream);
-                d.setTo(zero, stream);
-                l.setTo(zero, stream);
-                r.setTo(zero, stream);
-            }
-
-            if (rthis.levels > 1)
-            {
-                int less_rows = (rows + 1) / 2;
-                int less_cols = (cols + 1) / 2;
-
-                u2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                d2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                l2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                r2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-
-                if ((rthis.levels & 1) == 0)
-                {
-                    u2.setTo(zero, stream);
-                    d2.setTo(zero, stream);
-                    l2.setTo(zero, stream);
-                    r2.setTo(zero, stream);
-                }
-            }
-
-            load_constants(rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight, scale * rthis.max_disc_term, scale * rthis.disc_single_jump);
-
-            datas.resize(rthis.levels);
-
-            cols_all.resize(rthis.levels);
-            rows_all.resize(rthis.levels);
-
-            cols_all[0] = cols;
-            rows_all[0] = rows;
-        }
-
-        void calcBP(GpuMat& disp, Stream& stream)
-        {
-            typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
-            static const data_step_down_t data_step_down_callers[2] =
-            {
-                data_step_down_gpu<short>, data_step_down_gpu<float>
-            };
-
-            typedef void (*level_up_messages_t)(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
-            static const level_up_messages_t level_up_messages_callers[2] =
-            {
-                level_up_messages_gpu<short>, level_up_messages_gpu<float>
-            };
-
-            typedef void (*calc_all_iterations_t)(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
-            static const calc_all_iterations_t calc_all_iterations_callers[2] =
-            {
-                calc_all_iterations_gpu<short>, calc_all_iterations_gpu<float>
-            };
-
-            typedef void (*output_t)(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
-            static const output_t output_callers[2] =
-            {
-                output_gpu<short>, output_gpu<float>
-            };
-
-            const int funcIdx = rthis.msg_type == CV_32F;
-
-            cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-
-            for (int i = 1; i < rthis.levels; ++i)
-            {
-                cols_all[i] = (cols_all[i-1] + 1) / 2;
-                rows_all[i] = (rows_all[i-1] + 1) / 2;
-
-                datas[i].create(rows_all[i] * rthis.ndisp, cols_all[i], rthis.msg_type);
-
-                data_step_down_callers[funcIdx](cols_all[i], rows_all[i], rows_all[i-1], datas[i-1], datas[i], cudaStream);
-            }
-
-            PtrStepSzb mus[] = {u, u2};
-            PtrStepSzb mds[] = {d, d2};
-            PtrStepSzb mrs[] = {r, r2};
-            PtrStepSzb mls[] = {l, l2};
-
-            int mem_idx = (rthis.levels & 1) ? 0 : 1;
-
-            for (int i = rthis.levels - 1; i >= 0; --i)
-            {
-                // for lower level we have already computed messages by setting to zero
-                if (i != rthis.levels - 1)
-                    level_up_messages_callers[funcIdx](mem_idx, cols_all[i], rows_all[i], rows_all[i+1], mus, mds, mls, mrs, cudaStream);
-
-                calc_all_iterations_callers[funcIdx](cols_all[i], rows_all[i], rthis.iters, mus[mem_idx], mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas[i], cudaStream);
-
-                mem_idx = (mem_idx + 1) & 1;
-            }
-
-            if (disp.empty())
-                disp.create(rows, cols, CV_16S);
-
-            out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
-
-            out.setTo(zero, stream);
-
-            output_callers[funcIdx](u, d, l, r, datas.front(), out, cudaStream);
-
-            if (disp.type() != CV_16S)
-                out.convertTo(disp, disp.type(), stream);
-        }
-
-        StereoBeliefPropagation& rthis;
-
-        GpuMat& u;
-        GpuMat& d;
-        GpuMat& l;
-        GpuMat& r;
-
-        GpuMat& u2;
-        GpuMat& d2;
-        GpuMat& l2;
-        GpuMat& r2;
-
-        std::vector<GpuMat>& datas;
-        GpuMat& out;
-
-        const Scalar zero;
-        const float scale;
-
-        int rows, cols;
-
-        std::vector<int> cols_all, rows_all;
-    };
-}
-
-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
-{
-    StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
-    impl(left, right, disp, stream);
-}
-
-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& data, GpuMat& disp, Stream& stream)
-{
-    StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
-    impl(data, disp, stream);
-}
-
 #endif /* !defined (HAVE_CUDA) */
--- a/modules/gpustereo/src/stereocsbp.cpp
+++ b/modules/gpustereo/src/stereocsbp.cpp
@ -49,13 +49,9 @@ using namespace cv::gpu;

 void cv::gpu::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int&, int&, int&, int&) { throw_no_cuda(); }

-cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, int) { throw_no_cuda(); }
-cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, float, float, float, float, int, int) { throw_no_cuda(); }
-
-void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<gpu::StereoConstantSpaceBP> cv::gpu::createStereoConstantSpaceBP(int, int, int, int, int) { throw_no_cuda(); return Ptr<gpu::StereoConstantSpaceBP>(); } // Stub in the branch before #else: ignores all arguments and calls throw_no_cuda(); the empty Ptr after it only satisfies the return type (presumably never reached - confirm throw_no_cuda() always throws).

 #else /* !defined (HAVE_CUDA) */
-#include "opencv2/core/utility.hpp"

 namespace cv { namespace gpu { namespace cudev
 {
@ -89,14 +85,288 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}

-using namespace ::cv::gpu::cudev::stereocsbp;
-
 namespace
 {
+    class StereoCSBPImpl : public gpu::StereoConstantSpaceBP // Concrete implementation handed out by cv::gpu::createStereoConstantSpaceBP(); keeps the parameters and the GpuMat work buffers.
+    {
+    public:
+        StereoCSBPImpl(int ndisp, int iters, int levels, int nr_plane, int msg_type); // stores the five arguments; the cost terms start at the DEFAULT_* constants
+
+        void compute(InputArray left, InputArray right, OutputArray disparity); // forwards to the Stream overload with Stream::Null()
+        void compute(InputArray left, InputArray right, OutputArray disparity, Stream& stream); // full computation on an image pair
+        void compute(InputArray data, OutputArray disparity, Stream& stream); // not implemented for this class: raises Error::StsNotImplemented
+
+        int getMinDisparity() const { return min_disp_th_; } // the min-disparity accessors are backed by min_disp_th_
+        void setMinDisparity(int minDisparity) { min_disp_th_ = minDisparity; }
+
+        int getNumDisparities() const { return ndisp_; }
+        void setNumDisparities(int numDisparities) { ndisp_ = numDisparities; } // not validated here; compute() asserts ndisp_ > 0
+
+        int getBlockSize() const { return 0; } // this and the other getters below that return 0 are fixed values; their setters ignore the argument
+        void setBlockSize(int /*blockSize*/) {}
+
+        int getSpeckleWindowSize() const { return 0; }
+        void setSpeckleWindowSize(int /*speckleWindowSize*/) {}
+
+        int getSpeckleRange() const { return 0; }
+        void setSpeckleRange(int /*speckleRange*/) {}
+
+        int getDisp12MaxDiff() const { return 0; }
+        void setDisp12MaxDiff(int /*disp12MaxDiff*/) {}
+
+        int getNumIters() const { return iters_; }
+        void setNumIters(int iters) { iters_ = iters; } // not validated here; compute() asserts iters_ > 0
+
+        int getNumLevels() const { return levels_; } // note: the Stream compute() overwrites levels_ with a clamped value, so this can differ from what was set
+        void setNumLevels(int levels) { levels_ = levels; } // not validated here; compute() asserts 0 < levels_ <= 8
+
+        double getMaxDataTerm() const { return max_data_term_; }
+        void setMaxDataTerm(double max_data_term) { max_data_term_ = (float) max_data_term; } // the four cost terms are stored as float, so the double argument is narrowed
+
+        double getDataWeight() const { return data_weight_; }
+        void setDataWeight(double data_weight) { data_weight_ = (float) data_weight; }
+
+        double getMaxDiscTerm() const { return max_disc_term_; }
+        void setMaxDiscTerm(double max_disc_term) { max_disc_term_ = (float) max_disc_term; }
+
+        double getDiscSingleJump() const { return disc_single_jump_; }
+        void setDiscSingleJump(double disc_single_jump) { disc_single_jump_ = (float) disc_single_jump; }
+
+        int getMsgType() const { return msg_type_; }
+        void setMsgType(int msg_type) { msg_type_ = msg_type; } // compute() asserts the value is CV_32F or CV_16S
+
+        int getNrPlane() const { return nr_plane_; }
+        void setNrPlane(int nr_plane) { nr_plane_ = nr_plane; } // not validated here; compute() asserts nr_plane_ > 0
+
+        bool getUseLocalInitDataCost() const { return use_local_init_data_cost_; }
+        void setUseLocalInitDataCost(bool use_local_init_data_cost) { use_local_init_data_cost_ = use_local_init_data_cost; } // forwarded to init_data_cost() in compute()
+
+    private:
+        int min_disp_th_;
+        int ndisp_;
+        int iters_;
+        int levels_;
+        float max_data_term_;
+        float data_weight_;
+        float max_disc_term_;
+        float disc_single_jump_;
+        int msg_type_;
+        int nr_plane_;
+        bool use_local_init_data_cost_;
+
+        GpuMat mbuf_; // one allocation that compute() slices into the message, data-cost and selected-disparity buffers
+        GpuMat temp_; // scratch buffer passed to load_constants()
+        GpuMat outBuf_; // CV_16SC1 staging buffer, used only when the output has a different fixed type
+    };
+
    const float DEFAULT_MAX_DATA_TERM = 30.0f;
    const float DEFAULT_DATA_WEIGHT = 1.0f;
    const float DEFAULT_MAX_DISC_TERM = 160.0f;
    const float DEFAULT_DISC_SINGLE_JUMP = 10.0f;
+
+    StereoCSBPImpl::StereoCSBPImpl(int ndisp, int iters, int levels, int nr_plane, int msg_type) : // Stores the parameters only; nothing is validated or allocated in the constructor (the removed constructors asserted msg_type, that check now lives in compute()).
+        min_disp_th_(0), ndisp_(ndisp), iters_(iters), levels_(levels),
+        max_data_term_(DEFAULT_MAX_DATA_TERM), data_weight_(DEFAULT_DATA_WEIGHT),
+        max_disc_term_(DEFAULT_MAX_DISC_TERM), disc_single_jump_(DEFAULT_DISC_SINGLE_JUMP),
+        msg_type_(msg_type), nr_plane_(nr_plane), use_local_init_data_cost_(true) // local init data cost is enabled by default
+    {
+    }
+
+    void StereoCSBPImpl::compute(InputArray left, InputArray right, OutputArray disparity) // Convenience overload without a stream argument.
+    {
+        compute(left, right, disparity, Stream::Null()); // delegates to the Stream overload using Stream::Null()
+    }
+
+    void StereoCSBPImpl::compute(InputArray _left, InputArray _right, OutputArray disp, Stream& _stream) // Validates parameters and inputs, slices the work buffers out of mbuf_, runs the coarse-to-fine passes and writes the disparity into disp.
+    {
+        using namespace cv::gpu::cudev::stereocsbp;
+
+        CV_Assert( msg_type_ == CV_32F || msg_type_ == CV_16S );
+        CV_Assert( 0 < ndisp_ && 0 < iters_ && 0 < levels_ && 0 < nr_plane_ && levels_ <= 8 );
+
+        GpuMat left = _left.getGpuMat();
+        GpuMat right = _right.getGpuMat();
+
+        CV_Assert( left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4 );
+        CV_Assert( left.size() == right.size() && left.type() == right.type() );
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        ////////////////////////////////////////////////////////////////////////////////////////////
+        // Init
+
+        int rows = left.rows;
+        int cols = left.cols;
+
+        levels_ = std::min(levels_, int(log((double)ndisp_) / log(2.0))); // clamps to the truncated log2(ndisp_) and writes the result back to the member. NOTE(review): ndisp_ == 1 gives levels_ == 0, and rows_pyr[levels_ - 1] below then indexes before the buffer - confirm callers never pass ndisp < 2
+
+        // compute sizes
+        AutoBuffer<int> buf(levels_ * 3); // one allocation holding three per-level arrays
+        int* cols_pyr = buf;
+        int* rows_pyr = cols_pyr + levels_;
+        int* nr_plane_pyr = rows_pyr + levels_;
+
+        cols_pyr[0]     = cols;
+        rows_pyr[0]     = rows;
+        nr_plane_pyr[0] = nr_plane_;
+
+        for (int i = 1; i < levels_; i++) // each level halves the image size (truncating) and doubles the plane count
+        {
+            cols_pyr[i]     = cols_pyr[i-1] / 2;
+            rows_pyr[i]     = rows_pyr[i-1] / 2;
+            nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;
+        }
+
+        GpuMat u[2], d[2], l[2], r[2], disp_selected_pyr[2], data_cost, data_cost_selected; // headers only; all of them become row ranges of mbuf_ below
+
+        //allocate buffers
+        int buffers_count = 10; // (up + down + left + right + disp_selected_pyr) * 2
+        buffers_count += 2; //  data_cost has twice as many rows as the other buffers, hence +2 rather than +1
+        buffers_count += 1; //  data_cost_selected
+        mbuf_.create(rows * nr_plane_ * buffers_count, cols, msg_type_); // 13 units of (rows * nr_plane_) rows each
+
+        data_cost          = mbuf_.rowRange(0, rows * nr_plane_ * 2); // units 0-1
+        data_cost_selected = mbuf_.rowRange(data_cost.rows, data_cost.rows + rows * nr_plane_); // unit 2
+
+        for(int k = 0; k < 2; ++k) // in/out
+        {
+            GpuMat sub1 = mbuf_.rowRange(data_cost.rows + data_cost_selected.rows, mbuf_.rows); // the remaining 10 units
+            GpuMat sub2 = sub1.rowRange((k+0)*sub1.rows/2, (k+1)*sub1.rows/2); // half k of those: 5 units
+
+            GpuMat *buf_ptrs[] = { &u[k], &d[k], &l[k], &r[k], &disp_selected_pyr[k] };
+            for(int _r = 0; _r < 5; ++_r)
+            {
+                *buf_ptrs[_r] = sub2.rowRange(_r * sub2.rows/5, (_r+1) * sub2.rows/5); // one unit per buffer
+                CV_DbgAssert( buf_ptrs[_r]->cols == cols && buf_ptrs[_r]->rows == rows * nr_plane_ );
+            }
+        }; // the trailing ';' is an empty statement
+
+        size_t elem_step = mbuf_.step / mbuf_.elemSize(); // row pitch of mbuf_ in elements; passed to every kernel wrapper below
+
+        Size temp_size = data_cost.size();
+        if ((size_t)temp_size.area() < elem_step * rows_pyr[levels_ - 1] * ndisp_) // enlarge temp_ when data_cost has fewer elements than elem_step * coarsest rows * ndisp_
+            temp_size = Size(static_cast<int>(elem_step), rows_pyr[levels_ - 1] * ndisp_);
+
+        temp_.create(temp_size, msg_type_);
+
+        ////////////////////////////////////////////////////////////////////////////
+        // Compute
+
+        load_constants(ndisp_, max_data_term_, data_weight_, max_disc_term_, disc_single_jump_, min_disp_th_, left, right, temp_);
+
+        l[0].setTo(0, _stream); // both message sets and both data-cost buffers start at zero
+        d[0].setTo(0, _stream);
+        r[0].setTo(0, _stream);
+        u[0].setTo(0, _stream);
+
+        l[1].setTo(0, _stream);
+        d[1].setTo(0, _stream);
+        r[1].setTo(0, _stream);
+        u[1].setTo(0, _stream);
+
+        data_cost.setTo(0, _stream);
+        data_cost_selected.setTo(0, _stream);
+
+        int cur_idx = 0; // which of the two buffer sets holds the current level
+
+        if (msg_type_ == CV_32F) // the two branches are the same sequence of calls, differing only in the element type (float vs short)
+        {
+            for (int i = levels_ - 1; i >= 0; i--) // coarse to fine
+            {
+                if (i == levels_ - 1) // coarsest level: initialise data cost and selected disparities
+                {
+                    init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<float>(), data_cost_selected.ptr<float>(),
+                        elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], ndisp_, left.channels(), use_local_init_data_cost_, stream);
+                }
+                else // finer level: compute its data cost, then initialise the other buffer set from the current one
+                {
+                    compute_data_cost(disp_selected_pyr[cur_idx].ptr<float>(), data_cost.ptr<float>(), elem_step,
+                        left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), stream);
+
+                    int new_idx = (cur_idx + 1) & 1;
+
+                    init_message(u[new_idx].ptr<float>(), d[new_idx].ptr<float>(), l[new_idx].ptr<float>(), r[new_idx].ptr<float>(),
+                                 u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
+                                 disp_selected_pyr[new_idx].ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(),
+                                 data_cost_selected.ptr<float>(), data_cost.ptr<float>(), elem_step, rows_pyr[i],
+                                 cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], stream);
+
+                    cur_idx = new_idx;
+                }
+
+                calc_all_iterations(u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
+                                    data_cost_selected.ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(), elem_step,
+                                    rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], iters_, stream);
+            }
+        }
+        else
+        {
+            for (int i = levels_ - 1; i >= 0; i--) // coarse to fine
+            {
+                if (i == levels_ - 1) // coarsest level: initialise data cost and selected disparities
+                {
+                    init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<short>(), data_cost_selected.ptr<short>(),
+                        elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], ndisp_, left.channels(), use_local_init_data_cost_, stream);
+                }
+                else // finer level: compute its data cost, then initialise the other buffer set from the current one
+                {
+                    compute_data_cost(disp_selected_pyr[cur_idx].ptr<short>(), data_cost.ptr<short>(), elem_step,
+                        left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), stream);
+
+                    int new_idx = (cur_idx + 1) & 1;
+
+                    init_message(u[new_idx].ptr<short>(), d[new_idx].ptr<short>(), l[new_idx].ptr<short>(), r[new_idx].ptr<short>(),
+                                 u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
+                                 disp_selected_pyr[new_idx].ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(),
+                                 data_cost_selected.ptr<short>(), data_cost.ptr<short>(), elem_step, rows_pyr[i],
+                                 cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], stream);
+
+                    cur_idx = new_idx;
+                }
+
+                calc_all_iterations(u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
+                                    data_cost_selected.ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(), elem_step,
+                                    rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], iters_, stream);
+            }
+        }
+
+        const int dtype = disp.fixedType() ? disp.type() : CV_16SC1; // honour a fixed output type, otherwise produce CV_16SC1
+
+        disp.create(rows, cols, dtype);
+        GpuMat out = disp.getGpuMat();
+
+        if (dtype != CV_16SC1) // compute into the CV_16SC1 staging buffer and convert at the end
+        {
+            outBuf_.create(rows, cols, CV_16SC1);
+            out = outBuf_;
+        }
+
+        out.setTo(0, _stream);
+
+        if (msg_type_ == CV_32F)
+        {
+            compute_disp(u[cur_idx].ptr<float>(), d[cur_idx].ptr<float>(), l[cur_idx].ptr<float>(), r[cur_idx].ptr<float>(),
+                         data_cost_selected.ptr<float>(), disp_selected_pyr[cur_idx].ptr<float>(), elem_step, out, nr_plane_pyr[0], stream);
+        }
+        else
+        {
+            compute_disp(u[cur_idx].ptr<short>(), d[cur_idx].ptr<short>(), l[cur_idx].ptr<short>(), r[cur_idx].ptr<short>(),
+                         data_cost_selected.ptr<short>(), disp_selected_pyr[cur_idx].ptr<short>(), elem_step, out, nr_plane_pyr[0], stream);
+        }
+
+        if (dtype != CV_16SC1)
+            out.convertTo(disp, dtype, _stream); // convert the staged result to the requested type
+    }
+
+    void StereoCSBPImpl::compute(InputArray /*data*/, OutputArray /*disparity*/, Stream& /*stream*/) // Data-cost overload: not supported by this class; all arguments are ignored.
+    {
+        CV_Error(Error::StsNotImplemented, "Not implemented"); // always raises Error::StsNotImplemented
+    }
+}
+
+Ptr<gpu::StereoConstantSpaceBP> cv::gpu::createStereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, int msg_type) // Factory: constructs a StereoCSBPImpl from the five arguments and returns it through the gpu::StereoConstantSpaceBP interface.
+{
+    return new StereoCSBPImpl(ndisp, iters, levels, nr_plane, msg_type); // arguments are stored as given; they are validated later, in compute()
 }

 void cv::gpu::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane)
@ -114,174 +384,4 @@ void cv::gpu::StereoConstantSpaceBP::estimateRecommendedParams(int width, int he
    nr_plane = (int) ((float) ndisp / std::pow(2.0, levels + 1));
 }

-cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
-                                                      int msg_type_)
-
-    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
-      max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
-      max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP), min_disp_th(0),
-      msg_type(msg_type_), use_local_init_data_cost(true)
-{
-    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
-}
-
-cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
-                                                      float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_,
-                                                      int min_disp_th_, int msg_type_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
-      max_data_term(max_data_term_), data_weight(data_weight_),
-      max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_), min_disp_th(min_disp_th_),
-      msg_type(msg_type_), use_local_init_data_cost(true)
-{
-    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
-}
-
-template<class T>
-static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat& mbuf, GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
-{
-    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
-        && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());
-
-    CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4));
-
-    const Scalar zero = Scalar::all(0);
-
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-
-    ////////////////////////////////////////////////////////////////////////////////////////////
-    // Init
-
-    int rows = left.rows;
-    int cols = left.cols;
-
-    rthis.levels = std::min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
-    int levels = rthis.levels;
-
-    // compute sizes
-    AutoBuffer<int> buf(levels * 3);
-    int* cols_pyr = buf;
-    int* rows_pyr = cols_pyr + levels;
-    int* nr_plane_pyr = rows_pyr + levels;
-
-    cols_pyr[0]     = cols;
-    rows_pyr[0]     = rows;
-    nr_plane_pyr[0] = rthis.nr_plane;
-
-    for (int i = 1; i < levels; i++)
-    {
-        cols_pyr[i]     = cols_pyr[i-1] / 2;
-        rows_pyr[i]     = rows_pyr[i-1] / 2;
-        nr_plane_pyr[i] = nr_plane_pyr[i-1] * 2;
-    }
-
-
-    GpuMat u[2], d[2], l[2], r[2], disp_selected_pyr[2], data_cost, data_cost_selected;
-
-
-    //allocate buffers
-    int buffers_count = 10; // (up + down + left + right + disp_selected_pyr) * 2
-    buffers_count += 2; //  data_cost has twice more rows than other buffers, what's why +2, not +1;
-    buffers_count += 1; //  data_cost_selected
-    mbuf.create(rows * rthis.nr_plane * buffers_count, cols, DataType<T>::type);
-
-    data_cost          = mbuf.rowRange(0, rows * rthis.nr_plane * 2);
-    data_cost_selected = mbuf.rowRange(data_cost.rows, data_cost.rows + rows * rthis.nr_plane);
-
-    for(int k = 0; k < 2; ++k) // in/out
-    {
-        GpuMat sub1 = mbuf.rowRange(data_cost.rows + data_cost_selected.rows, mbuf.rows);
-        GpuMat sub2 = sub1.rowRange((k+0)*sub1.rows/2, (k+1)*sub1.rows/2);
-
-        GpuMat *buf_ptrs[] = { &u[k], &d[k], &l[k], &r[k], &disp_selected_pyr[k] };
-        for(int _r = 0; _r < 5; ++_r)
-        {
-            *buf_ptrs[_r] = sub2.rowRange(_r * sub2.rows/5, (_r+1) * sub2.rows/5);
-            CV_DbgAssert(buf_ptrs[_r]->cols == cols && buf_ptrs[_r]->rows == rows * rthis.nr_plane);
-        }
-    };
-
-    size_t elem_step = mbuf.step / sizeof(T);
-
-    Size temp_size = data_cost.size();
-    if ((size_t)temp_size.area() < elem_step * rows_pyr[levels - 1] * rthis.ndisp)
-        temp_size = Size(static_cast<int>(elem_step), rows_pyr[levels - 1] * rthis.ndisp);
-
-    temp.create(temp_size, DataType<T>::type);
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Compute
-
-    load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);
-
-    l[0].setTo(zero, stream);
-    d[0].setTo(zero, stream);
-    r[0].setTo(zero, stream);
-    u[0].setTo(zero, stream);
-
-    l[1].setTo(zero, stream);
-    d[1].setTo(zero, stream);
-    r[1].setTo(zero, stream);
-    u[1].setTo(zero, stream);
-
-    data_cost.setTo(zero, stream);
-    data_cost_selected.setTo(zero, stream);
-
-    int cur_idx = 0;
-
-    for (int i = levels - 1; i >= 0; i--)
-    {
-        if (i == levels - 1)
-        {
-            init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
-                elem_step, rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream);
-        }
-        else
-        {
-            compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), elem_step,
-                left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream);
-
-            int new_idx = (cur_idx + 1) & 1;
-
-            init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),
-                         u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
-                         disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),
-                         data_cost_selected.ptr<T>(), data_cost.ptr<T>(), elem_step, rows_pyr[i],
-                         cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream);
-
-            cur_idx = new_idx;
-        }
-
-        calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
-                            data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step,
-                            rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);
-    }
-
-    if (disp.empty())
-        disp.create(rows, cols, CV_16S);
-
-    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
-
-    out.setTo(zero, stream);
-
-    compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
-                 data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), elem_step, out, nr_plane_pyr[0], cudaStream);
-
-    if (disp.type() != CV_16S)
-    {
-        out.convertTo(disp, disp.type(), stream);
-    }
-}
-
-
-typedef void (*csbp_operator_t)(StereoConstantSpaceBP& rthis, GpuMat& mbuf,
-                                     GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream);
-
-const static csbp_operator_t operators[] = {0, 0, 0, csbp_operator<short>, 0, csbp_operator<float>, 0, 0};
-
-void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
-{
-    CV_Assert(msg_type == CV_32F || msg_type == CV_16S);
-    operators[msg_type](*this, messages_buffers, temp, out, left, right, disp, stream);
-}
-
 #endif /* !defined (HAVE_CUDA) */
--- a/modules/gpustereo/src/util.cpp
+++ b/modules/gpustereo/src/util.cpp
@ -47,8 +47,8 @@ using namespace cv::gpu;

 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)

-void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::reprojectImageTo3D(InputArray, OutputArray, InputArray, int, Stream&) { throw_no_cuda(); }  // stub for builds without HAVE_CUDA (or with CUDA_DISABLER): ignores its arguments and calls throw_no_cuda()
+void cv::gpu::drawColorDisp(InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }  // same stub pattern: arguments are unused, only throw_no_cuda() runs

 #else

@ -61,7 +61,7 @@ namespace cv { namespace gpu { namespace cudev
    void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
 }}}

-void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyz, const Mat& Q, int dst_cn, Stream& stream)
+void cv::gpu::reprojectImageTo3D(InputArray _disp, OutputArray _xyz, InputArray _Q, int dst_cn, Stream& stream)
 {
    using namespace cv::gpu::cudev;

@ -72,11 +72,15 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyz, const Mat& Q,
        {reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>}
    };

-    CV_Assert(disp.type() == CV_8U || disp.type() == CV_16S);
-    CV_Assert(Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4 && Q.isContinuous());
-    CV_Assert(dst_cn == 3 || dst_cn == 4);
+    GpuMat disp = _disp.getGpuMat();  // obtain the disparity input as a GpuMat
+    Mat Q = _Q.getMat();  // obtain Q as a cv::Mat; its float data pointer is passed to the dispatched function below

-    xyz.create(disp.size(), CV_MAKE_TYPE(CV_32F, dst_cn));
+    CV_Assert( disp.type() == CV_8U || disp.type() == CV_16S );  // disparity type must be CV_8U or CV_16S
+    CV_Assert( Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4 && Q.isContinuous() );  // Q must be a continuous 4x4 CV_32F matrix
+    CV_Assert( dst_cn == 3 || dst_cn == 4 );  // output must have 3 or 4 channels
+
+    _xyz.create(disp.size(), CV_MAKE_TYPE(CV_32F, dst_cn));  // allocate the output through the OutputArray first: same size as disp, CV_32F with dst_cn channels
+    GpuMat xyz = _xyz.getGpuMat();  // then fetch the GpuMat that was just created

    funcs[dst_cn == 4][disp.type()](disp, xyz, Q.ptr<float>(), StreamAccessor::getStream(stream));
 }
@ -93,23 +97,25 @@ namespace cv { namespace gpu { namespace cudev
 namespace
 {
    template <typename T>
-    void drawColorDisp_caller(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream)
+    void drawColorDisp_caller(const GpuMat& src, OutputArray _dst, int ndisp, const cudaStream_t& stream)
    {
        using namespace ::cv::gpu::cudev;

-        dst.create(src.size(), CV_8UC4);
+        _dst.create(src.size(), CV_8UC4);  // allocate the output through the OutputArray: same size as src, type CV_8UC4
+        GpuMat dst = _dst.getGpuMat();  // fetch the GpuMat created above so it can be passed to drawColorDisp_gpu

        drawColorDisp_gpu((PtrStepSz<T>)src, dst, ndisp, stream);
    }
-
-    typedef void (*drawColorDisp_caller_t)(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream);
-
-    const drawColorDisp_caller_t drawColorDisp_callers[] = {drawColorDisp_caller<unsigned char>, 0, 0, drawColorDisp_caller<short>, 0, 0, 0, 0};
 }

-void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& stream)
+void cv::gpu::drawColorDisp(InputArray _src, OutputArray dst, int ndisp, Stream& stream)
 {
-    CV_Assert(src.type() == CV_8U || src.type() == CV_16S);
+    typedef void (*drawColorDisp_caller_t)(const GpuMat& src, OutputArray dst, int ndisp, const cudaStream_t& stream);  // signature shared by the drawColorDisp_caller<T> instantiations
+    const drawColorDisp_caller_t drawColorDisp_callers[] = {drawColorDisp_caller<unsigned char>, 0, 0, drawColorDisp_caller<short>, 0, 0, 0, 0};  // lookup table indexed by src.type() below; only slots 0 (unsigned char) and 3 (short) are non-null
+
+    GpuMat src = _src.getGpuMat();  // obtain the input as a GpuMat
+
+    CV_Assert( src.type() == CV_8U || src.type() == CV_16S );  // guards the table lookup below; NOTE(review): presumably CV_8U == 0 and CV_16S == 3, matching the populated slots - confirm

    drawColorDisp_callers[src.type()](src, dst, ndisp, StreamAccessor::getStream(stream));
 }
--- a/Show More
+++ b/Show More