Merge remote-tracking branch 'origin/master'

2025-07-25 14:47:07 +08:00 · 2012-08-23 14:58:41 +04:00 · 2012-08-23 14:58:41 +04:00 · 5648e49d59
commit 5648e49d59
parent fc307c87dc c8a54f67d4
169 changed files with 14121 additions and 9349 deletions
--- a/3rdparty/libtiff/CMakeLists.txt
+++ b/3rdparty/libtiff/CMakeLists.txt
@ -89,7 +89,7 @@ endif(WIN32)

 ocv_warnings_disable(CMAKE_C_FLAGS -Wno-unused-but-set-variable -Wmissing-prototypes -Wmissing-declarations -Wundef -Wunused -Wsign-compare
                                   -Wcast-align -Wshadow -Wno-maybe-uninitialized -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast)
-ocv_warnings_disable(CMAKE_CXX_FLAGS -Wmissing-declarations -Wunused-parameter /wd4100 /wd4244 /wd4706 /wd4127 /wd4701 /wd4018 /wd4267 /wd4306 /wd4305 /wd4312 /wd4311)
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wmissing-declarations -Wunused-parameter /wd4100 /wd4244 /wd4706 /wd4127 /wd4701 /wd4018 /wd4267 /wd4306 /wd4305 /wd4312 /wd4311 /wd4703)

 if(UNIX AND (CMAKE_COMPILER_IS_GNUCXX OR CV_ICC))
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -189,11 +189,11 @@ OCV_OPTION(ENABLE_PROFILING           "Enable profiling in the GCC compiler (Add
 OCV_OPTION(ENABLE_OMIT_FRAME_POINTER  "Enable -fomit-frame-pointer for GCC"                      ON   IF CMAKE_COMPILER_IS_GNUCXX )
 OCV_OPTION(ENABLE_POWERPC             "Enable PowerPC for GCC"                                   ON   IF (CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
 OCV_OPTION(ENABLE_FAST_MATH           "Enable -ffast-math (not recommended for GCC 4.6.x)"       OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE                 "Enable SSE instructions"                                  ON   IF (MSVC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE2                "Enable SSE2 instructions"                                 ON   IF (MSVC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE3                "Enable SSE3 instructions"                                 OFF  IF (CV_ICC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE                 "Enable SSE instructions"                                  ON   IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE2                "Enable SSE2 instructions"                                 ON   IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE3                "Enable SSE3 instructions"                                 ON   IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSSE3               "Enable SSSE3 instructions"                                OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE41               "Enable SSE4.1 instructions"                               OFF  IF (CV_ICC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE41               "Enable SSE4.1 instructions"                               OFF  IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE42               "Enable SSE4.2 instructions"                               OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_NOISY_WARNINGS      "Show all warnings even if they are too noisy"             OFF )
 OCV_OPTION(OPENCV_WARNINGS_ARE_ERRORS "Treat warnings as errors"                                 OFF )
@ -336,6 +336,7 @@ include(cmake/OpenCVCompilerOptions.cmake REQUIRED)
 # ----------------------------------------------------------------------------
 if(MSVC)
  include(cmake/OpenCVCRTLinkage.cmake REQUIRED)
+  add_definitions(-D_VARIADIC_MAX=10)
 endif(MSVC)


--- a/android/android.toolchain.cmake
+++ b/android/android.toolchain.cmake
@ -1,6 +1,6 @@
 # ------------------------------------------------------------------------------
 #  Android CMake toolchain file, for use with the Android NDK r5-r8
-#  Requires cmake 2.6.3 or newer (2.8.3 or newer is recommended).
+#  Requires cmake 2.6.3 or newer (2.8.5 or newer is recommended).
 #  See home page: http://code.google.com/p/android-cmake/
 #
 #  The file is mantained by the OpenCV project. And also can be found at
@ -44,7 +44,8 @@
 #    ANDROID_ABI=armeabi-v7a -  specifies the target Application Binary
 #      Interface (ABI). This option nearly matches to the APP_ABI variable
 #      used by ndk-build tool from Android NDK.
-#      Possible values are:
+#
+#      Possible targets are:
 #        "armeabi" - matches to the NDK ABI with the same name.
 #           See ${ANDROID_NDK}/docs/CPU-ARCH-ABIS.html for the documentation.
 #        "armeabi-v7a" - matches to the NDK ABI with the same name.
@ -55,7 +56,9 @@
 #            sets VFPV3 as floating-point unit (has 32 registers instead of 16).
 #        "armeabi-v6 with VFP" - tuned for ARMv6 processors having VFP.
 #        "x86" - matches to the NDK ABI with the same name.
-#           See ${ANDROID_NDK}/docs/CPU-ARCH-ABIS.html for the documentation.
+#            See ${ANDROID_NDK}/docs/CPU-ARCH-ABIS.html for the documentation.
+#        "mips" - matches to the NDK ABI with the same name
+#            (not tested on real devices)
 #
 #    ANDROID_NATIVE_API_LEVEL=android-8 - level of Android API compile for.
 #      Option is read-only when standalone toolchain used.
@ -183,12 +186,13 @@
 #   - modified August 2012
 #     [+] updated for NDK r8b
 #     [~] all intermediate files generated by toolchain are moved into CMakeFiles
+#     [~] libstdc++ and libsupc++ are removed from explicit link libraries
 # ------------------------------------------------------------------------------

 cmake_minimum_required( VERSION 2.6.3 )

 if( DEFINED CMAKE_CROSSCOMPILING )
- #subsequent toolchain loading is not really needed
+ # subsequent toolchain loading is not really needed
 return()
 endif()

@ -199,7 +203,7 @@ endif()

 # this one is important
 set( CMAKE_SYSTEM_NAME Linux )
-#this one not so much
+# this one not so much
 set( CMAKE_SYSTEM_VERSION 1 )

 set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r8b -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" )
@ -331,11 +335,11 @@ macro( __COPY_IF_DIFFERENT _source _destination )
 endmacro()


-#stl version: by default gnustl_static will be used
+# stl version: by default gnustl_static will be used
 set( ANDROID_USE_STLPORT FALSE CACHE BOOL "Experimental: use stlport_static instead of gnustl_static")
 mark_as_advanced( ANDROID_USE_STLPORT )

-#fight against cygwin
+# fight against cygwin
 set( ANDROID_FORBID_SYGWIN TRUE CACHE BOOL "Prevent cmake from working under cygwin and using cygwin tools")
 mark_as_advanced( ANDROID_FORBID_SYGWIN )
 if( ANDROID_FORBID_SYGWIN )
@ -344,7 +348,7 @@ if( ANDROID_FORBID_SYGWIN )
 endif()

 if( CMAKE_HOST_WIN32 )
-  #remove cygwin from PATH
+  # remove cygwin from PATH
  set( __new_path "$ENV{PATH}")
  __LIST_FILTER( __new_path "cygwin" )
  set(ENV{PATH} "${__new_path}")
@ -352,7 +356,7 @@ if( ANDROID_FORBID_SYGWIN )
 endif()
 endif()

-#detect current host platform
+# detect current host platform
 set( TOOL_OS_SUFFIX "" )
 if( CMAKE_HOST_APPLE )
 set( ANDROID_NDK_HOST_SYSTEM_NAME "darwin-x86" )
@ -365,10 +369,10 @@ else()
 message( FATAL_ERROR "Cross-compilation on your platform is not supported by this cmake toolchain" )
 endif()

-#see if we have path to Android NDK
+# see if we have path to Android NDK
 __INIT_VARIABLE( ANDROID_NDK PATH ENV_ANDROID_NDK )
 if( NOT ANDROID_NDK )
- #see if we have path to Android standalone toolchain
+ # see if we have path to Android standalone toolchain
 __INIT_VARIABLE( ANDROID_STANDALONE_TOOLCHAIN PATH ENV_ANDROID_STANDALONE_TOOLCHAIN OBSOLETE_ANDROID_NDK_TOOLCHAIN_ROOT OBSOLETE_ENV_ANDROID_NDK_TOOLCHAIN_ROOT )

 if( NOT ANDROID_STANDALONE_TOOLCHAIN )
@ -397,10 +401,10 @@ if( NOT ANDROID_NDK )
 endif( NOT ANDROID_STANDALONE_TOOLCHAIN )
 endif( NOT ANDROID_NDK )

-#remember found paths
+# remember found paths
 if( ANDROID_NDK )
 get_filename_component( ANDROID_NDK "${ANDROID_NDK}" ABSOLUTE )
- #try to detect change
+ # try to detect change
 if( CMAKE_AR )
  string( LENGTH "${ANDROID_NDK}" __length )
  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath )
@ -414,7 +418,7 @@ if( ANDROID_NDK )
 set( BUILD_WITH_ANDROID_NDK True )
 elseif( ANDROID_STANDALONE_TOOLCHAIN )
 get_filename_component( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" ABSOLUTE )
- #try to detect change
+ # try to detect change
 if( CMAKE_AR )
  string( LENGTH "${ANDROID_STANDALONE_TOOLCHAIN}" __length )
  string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidStandaloneToolchainPreviousPath )
@ -438,7 +442,7 @@ else()
      sudo ln -s ~/my-android-toolchain ${ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH}" )
 endif()

-#get all the details about standalone toolchain
+# get all the details about standalone toolchain
 if( BUILD_WITH_STANDALONE_TOOLCHAIN )
 __DETECT_NATIVE_API_LEVEL( ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" )
 set( ANDROID_STANDALONE_TOOLCHAIN_API_LEVEL ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
@ -455,7 +459,7 @@ if( BUILD_WITH_STANDALONE_TOOLCHAIN )
  set( __availableToolchainArchs "mipsel" )
 endif()
 if( ANDROID_COMPILER_VERSION )
-  #do not run gcc every time because it is relatevely expencive
+  # do not run gcc every time because it is relatively expensive
  set( __availableToolchainCompilerVersions "${ANDROID_COMPILER_VERSION}" )
 else()
  execute_process( COMMAND "${ANDROID_STANDALONE_TOOLCHAIN}/bin/${__availableToolchainMachines}-gcc${TOOL_OS_SUFFIX}" --version
@ -464,7 +468,7 @@ if( BUILD_WITH_STANDALONE_TOOLCHAIN )
 endif()
 endif()

-#get all the details about NDK
+# get all the details about NDK
 if( BUILD_WITH_ANDROID_NDK )
 file( GLOB ANDROID_SUPPORTED_NATIVE_API_LEVELS RELATIVE "${ANDROID_NDK}/platforms" "${ANDROID_NDK}/platforms/android-*" )
 string( REPLACE "android-" "" ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_SUPPORTED_NATIVE_API_LEVELS}" )
@ -490,7 +494,7 @@ if( BUILD_WITH_ANDROID_NDK )
 endif()
 endif()

-#build list of available ABIs
+# build list of available ABIs
 if( NOT ANDROID_SUPPORTED_ABIS )
 set( ANDROID_SUPPORTED_ABIS "" )
 set( __uniqToolchainArchNames ${__availableToolchainArchs} )
@ -505,9 +509,9 @@ if( NOT ANDROID_SUPPORTED_ABIS )
 endif()
 endif()

-#choose target ABI
+# choose target ABI
 __INIT_VARIABLE( ANDROID_ABI OBSOLETE_ARM_TARGET OBSOLETE_ARM_TARGETS VALUES ${ANDROID_SUPPORTED_ABIS} )
-#verify that target ABI is supported
+# verify that target ABI is supported
 list( FIND ANDROID_SUPPORTED_ABIS "${ANDROID_ABI}" __androidAbiIdx )
 if( __androidAbiIdx EQUAL -1 )
 string( REPLACE ";" "\", \"", PRINTABLE_ANDROID_SUPPORTED_ABIS  "${ANDROID_SUPPORTED_ABIS}" )
@ -517,10 +521,10 @@ if( __androidAbiIdx EQUAL -1 )
 endif()
 unset( __androidAbiIdx )

-#remember target ABI
+# remember target ABI
 set( ANDROID_ABI "${ANDROID_ABI}" CACHE STRING "The target ABI for Android. If arm, then armeabi-v7a is recommended for hardware floating point." FORCE )

-#set target ABI options
+# set target ABI options
 if( ANDROID_ABI STREQUAL "x86" )
 set( X86 true )
 set( ANDROID_NDK_ABI_NAME "x86" )
@ -545,7 +549,7 @@ elseif( ANDROID_ABI STREQUAL "armeabi-v6 with VFP" )
 set( ANDROID_ARCH_NAME "arm" )
 set( ANDROID_ARCH_FULLNAME "arm" )
 set( CMAKE_SYSTEM_PROCESSOR "armv6" )
- #need always fallback to older platform
+ # need always fallback to older platform
 set( ARMEABI true )
 elseif( ANDROID_ABI STREQUAL "armeabi-v7a")
 set( ARMEABI_V7A true )
@ -573,8 +577,8 @@ else()
 endif()

 if( CMAKE_BINARY_DIR AND EXISTS "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeSystem.cmake" )
- #really dirty hack
- #it is not possible to change CMAKE_SYSTEM_PROCESSOR after the first run...
+ # really dirty hack
+ # it is not possible to change CMAKE_SYSTEM_PROCESSOR after the first run...
 file( APPEND "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeSystem.cmake" "SET(CMAKE_SYSTEM_PROCESSOR \"${CMAKE_SYSTEM_PROCESSOR}\")\n" )
 endif()

@ -592,7 +596,7 @@ else()
 unset( ANDROID_FORCE_ARM_BUILD CACHE )
 endif()

-#choose toolchain
+# choose toolchain
 if( ANDROID_TOOLCHAIN_NAME )
 list( FIND __availableToolchains "${ANDROID_TOOLCHAIN_NAME}" __toolchainIdx )
 if( __toolchainIdx EQUAL -1 )
@ -637,10 +641,10 @@ unset( __availableToolchainMachines )
 unset( __availableToolchainArchs )
 unset( __availableToolchainCompilerVersions )

-#choose native API level
+# choose native API level
 __INIT_VARIABLE( ANDROID_NATIVE_API_LEVEL ENV_ANDROID_NATIVE_API_LEVEL ANDROID_API_LEVEL ENV_ANDROID_API_LEVEL ANDROID_STANDALONE_TOOLCHAIN_API_LEVEL ANDROID_DEFAULT_NDK_API_LEVEL_${ANDROID_ARCH_NAME} ANDROID_DEFAULT_NDK_API_LEVEL )
 string( REGEX MATCH "[0-9]+" ANDROID_NATIVE_API_LEVEL "${ANDROID_NATIVE_API_LEVEL}" )
-#validate
+# validate
 list( FIND ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_NATIVE_API_LEVEL}" __levelIdx )
 if( __levelIdx EQUAL -1 )
 message( SEND_ERROR "Specified Android native API level (${ANDROID_NATIVE_API_LEVEL}) is not supported by your NDK/toolchain." )
@ -659,7 +663,7 @@ if( CMAKE_VERSION VERSION_GREATER "2.8" )
 set_property( CACHE ANDROID_NATIVE_API_LEVEL PROPERTY STRINGS ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
 endif()

-#setup paths
+# setup paths
 if( BUILD_WITH_STANDALONE_TOOLCHAIN )
 set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_STANDALONE_TOOLCHAIN}" )
 set( ANDROID_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot" )
@ -689,7 +693,7 @@ set( CMAKE_ASM_COMPILER "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHI
 if( CMAKE_VERSION VERSION_LESS 2.8.5 )
 set( CMAKE_ASM_COMPILER_ARG1 "-c" )
 endif()
-#there may be a way to make cmake deduce these TODO deduce the rest of the tools
+# there may be a way to make cmake deduce these; TODO: deduce the rest of the tools
 set( CMAKE_STRIP        "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-strip${TOOL_OS_SUFFIX}"   CACHE PATH "strip" )
 set( CMAKE_AR           "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-ar${TOOL_OS_SUFFIX}"      CACHE PATH "archive" )
 set( CMAKE_LINKER       "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-ld${TOOL_OS_SUFFIX}"      CACHE PATH "linker" )
@ -705,11 +709,12 @@ if( APPLE )
 endif()
 mark_as_advanced( CMAKE_INSTALL_NAME_TOOL )
 endif()
-#export directories
+
+# export directories
 set( ANDROID_SYSTEM_INCLUDE_DIRS "" )
 set( ANDROID_SYSTEM_LIB_DIRS "" )

-#setup output directories
+# setup output directories
 set( LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_SOURCE_DIR} CACHE PATH "root for library output, set this to change where android libs are installed to" )
 set( CMAKE_INSTALL_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/user" CACHE STRING "path for installing" )

@ -722,13 +727,13 @@ if(NOT _CMAKE_IN_TRY_COMPILE)
 set( LIBRARY_OUTPUT_PATH "${LIBRARY_OUTPUT_PATH_ROOT}/libs/${ANDROID_NDK_ABI_NAME}" CACHE PATH "path for android libs" )
 endif()

-#includes
+# includes
 list( APPEND ANDROID_SYSTEM_INCLUDE_DIRS "${ANDROID_SYSROOT}/usr/include" )
 if( __stlIncludePath AND EXISTS "${__stlIncludePath}" )
 list( APPEND ANDROID_SYSTEM_INCLUDE_DIRS "${__stlIncludePath}" )
 endif()

-#STL bits includes
+# c++ bits includes
 if( __stlLibPath AND EXISTS "${__stlLibPath}/include" )
 list( APPEND ANDROID_SYSTEM_INCLUDE_DIRS "${__stlLibPath}/include" )
 endif()
@ -742,7 +747,7 @@ elseif( EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/incl
 list( APPEND ANDROID_SYSTEM_INCLUDE_DIRS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/include/c++/${ANDROID_COMPILER_VERSION}/${ANDROID_TOOLCHAIN_MACHINE_NAME}" )
 endif()

-#flags and definitions
+# flags and definitions
 if(ANDROID_SYSROOT MATCHES "[ ;\"]")
 set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
 # quotes will break try_compile and compiler identification
@ -766,7 +771,7 @@ set( CMAKE_CXX_PLATFORM_ID Linux )
 set( CMAKE_CXX_SIZEOF_DATA_PTR 4 )
 set( CMAKE_CXX_HAS_ISYSROOT 1 )
 set( CMAKE_CXX_COMPILER_ABI ELF )
-#force ASM compiler (required for CMake < 2.8.5)
+# force ASM compiler (required for CMake < 2.8.5)
 set( CMAKE_ASM_COMPILER_ID_RUN TRUE )
 set( CMAKE_ASM_COMPILER_ID GNU )
 set( CMAKE_ASM_COMPILER_WORKS TRUE )
@ -796,17 +801,17 @@ endif()

 if( ANDROID_USE_STLPORT )
 set( _CMAKE_CXX_FLAGS "${_CMAKE_CXX_FLAGS} -fno-rtti -fno-exceptions" )
- set( _CMAKE_C_FLAGS "${_CMAKE_C_FLAGS} -fno-rtti -fno-exceptions" )
+ set( _CMAKE_C_FLAGS "${_CMAKE_C_FLAGS} -fno-exceptions" )
 else()
 set( _CMAKE_CXX_FLAGS "${_CMAKE_CXX_FLAGS} -frtti -fexceptions" )
 set( _CMAKE_C_FLAGS "${_CMAKE_C_FLAGS} -fexceptions" )
 endif()

-#release and debug flags
+# release and debug flags
 if( ARMEABI OR ARMEABI_V7A )
 if( NOT ANDROID_FORCE_ARM_BUILD AND NOT ARMEABI_V6 )
-  #It is recommended to use the -mthumb compiler flag to force the generation
-  #of 16-bit Thumb-1 instructions (the default being 32-bit ARM ones).
+  # It is recommended to use the -mthumb compiler flag to force the generation
+  # of 16-bit Thumb-1 instructions (the default being 32-bit ARM ones).
  # O3 instead of O2/Os in release mode - like cmake sets for desktop gcc
  set( _CMAKE_CXX_FLAGS_RELEASE "-mthumb -O3" )
  set( _CMAKE_C_FLAGS_RELEASE   "-mthumb -O3" )
@ -836,7 +841,7 @@ set( _CMAKE_C_FLAGS_RELEASE   "${_CMAKE_C_FLAGS_RELEASE}   -fomit-frame-pointer
 set( _CMAKE_CXX_FLAGS_DEBUG "${_CMAKE_CXX_FLAGS_DEBUG} -fno-strict-aliasing -fno-omit-frame-pointer -DDEBUG -D_DEBUG" )
 set( _CMAKE_C_FLAGS_DEBUG   "${_CMAKE_C_FLAGS_DEBUG}   -fno-strict-aliasing -fno-omit-frame-pointer -DDEBUG -D_DEBUG" )

-#ABI-specific flags
+# ABI-specific flags
 if( ARMEABI_V7A )
 set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv7-a -mfloat-abi=softfp" )
 if( NEON )
@ -854,19 +859,18 @@ elseif( X86 )
 set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS}" )#sse?
 endif()

-#linker flags
+# linker flags
 if( NOT DEFINED __ndklibspath )
 set( __ndklibspath "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/ndklibs/${ANDROID_NDK_ABI_NAME}" )
 endif()
-list( APPEND ANDROID_SYSTEM_LIB_DIRS "${__ndklibspath}" "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" )
+list( APPEND ANDROID_SYSTEM_LIB_DIRS "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" )
 set( ANDROID_LINKER_FLAGS "" )
-#STL
+
+# STL
 if( ANDROID_USE_STLPORT )
 if( EXISTS "${__stlLibPath}/libstlport_static.a" )
-  __COPY_IF_DIFFERENT( "${__stlLibPath}/libstlport_static.a" "${__ndklibspath}/libstlport_static.a" )
- endif()
- if( EXISTS "${__ndklibspath}/libstlport_static.a" )
-  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--start-group -lstlport_static" )
+  set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES> \"${__stlLibPath}/libstlport_static.a\"")
+  set( CMAKE_CXX_CREATE_SHARED_MODULE  "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES> \"${__stlLibPath}/libstlport_static.a\"")
 endif()
 else( ANDROID_USE_STLPORT )
 if( EXISTS "${__stlLibPath}/libgnustl_static.a" )
@ -880,11 +884,6 @@ else( ANDROID_USE_STLPORT )
 elseif( EXISTS "${__stlLibPath}/libstdc++.a" )
  __COPY_IF_DIFFERENT( "${__stlLibPath}/libstdc++.a" "${__ndklibspath}/libstdc++.a" )
 endif()
- if( EXISTS "${__ndklibspath}/libstdc++.a" )
-  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -lstdc++" )
- endif()
-
- #gcc exception & rtti support
 if( EXISTS "${__stlLibPath}/libsupc++.a" )
  __COPY_IF_DIFFERENT( "${__stlLibPath}/libsupc++.a" "${__ndklibspath}/libsupc++.a" )
 elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libsupc++.a" )
@ -896,16 +895,14 @@ else( ANDROID_USE_STLPORT )
 elseif( EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" )
  __COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" "${__ndklibspath}/libsupc++.a" )
 endif()
- if( EXISTS "${__ndklibspath}/libsupc++.a" )
-  set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -lsupc++" )
- endif()
+ list( APPEND ANDROID_SYSTEM_LIB_DIRS "${__ndklibspath}" )
 endif( ANDROID_USE_STLPORT )

-#cleanup for STL search
+# cleanup for STL search
 unset( __stlIncludePath )
 unset( __stlLibPath )

-#other linker flags
+# other linker flags
 __INIT_VARIABLE( ANDROID_NO_UNDEFINED OBSOLETE_NO_UNDEFINED VALUES ON )
 set( ANDROID_NO_UNDEFINED ${ANDROID_NO_UNDEFINED} CACHE BOOL "Show all undefined symbols as linker errors" FORCE )
 mark_as_advanced( ANDROID_NO_UNDEFINED )
@ -914,7 +911,7 @@ if( ANDROID_NO_UNDEFINED )
 endif()

 if (ANDROID_NDK MATCHES "-r[56].?$")
- #libGLESv2.so in NDK's prior to r7 refers to exteranal symbols. So this flag option is required for all projects using OpenGL from native.
+ # libGLESv2.so in NDKs prior to r7 refers to external symbols, so this option is required for all projects using OpenGL from native code.
 __INIT_VARIABLE( ANDROID_SO_UNDEFINED VALUES ON )
 else()
 __INIT_VARIABLE( ANDROID_SO_UNDEFINED VALUES OFF )
@ -940,7 +937,7 @@ if( ARMEABI_V7A )
 set( ANDROID_LINKER_FLAGS "-Wl,--fix-cortex-a8 ${ANDROID_LINKER_FLAGS}" )
 endif()

-#cache flags
+# cache flags
 set( CMAKE_CXX_FLAGS "${_CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags" )
 set( CMAKE_C_FLAGS "${_CMAKE_C_FLAGS}" CACHE STRING "c flags" )
 set( CMAKE_CXX_FLAGS_RELEASE "${_CMAKE_CXX_FLAGS_RELEASE}" CACHE STRING "c++ Release flags" )
@ -954,7 +951,7 @@ set( CMAKE_EXE_LINKER_FLAGS "-Wl,-z,nocopyreloc" CACHE STRING "linker flags" )
 include_directories( SYSTEM ${ANDROID_SYSTEM_INCLUDE_DIRS} )
 link_directories( ${ANDROID_SYSTEM_LIB_DIRS} )

-#finish flags
+# finish flags
 set( ANDROID_CXX_FLAGS    "${ANDROID_CXX_FLAGS}"    CACHE INTERNAL "Extra Android compiler flags")
 set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}" CACHE INTERNAL "Extra Android linker flags")
 set( CMAKE_CXX_FLAGS           "${ANDROID_CXX_FLAGS} ${CMAKE_CXX_FLAGS}" )
@ -969,7 +966,7 @@ else()
 set( CMAKE_EXE_LINKER_FLAGS    "${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" )
 endif()

-#set these global flags for cmake client scripts to change behavior
+# set these global flags for cmake client scripts to change behavior
 set( ANDROID True )
 set( BUILD_ANDROID True )

@ -982,7 +979,7 @@ set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY )
 set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY )


-#macro to find packages on the host OS
+# macro to find packages on the host OS
 macro( find_host_package )
 set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
 set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER )
@ -1004,7 +1001,7 @@ macro( find_host_package )
 endmacro()


-#macro to find programs on the host OS
+# macro to find programs on the host OS
 macro( find_host_program )
 set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
 set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER )
@ -1044,7 +1041,11 @@ if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
 set( __toolchain_config "")
 foreach( __var ANDROID_ABI ANDROID_FORCE_ARM_BUILD ANDROID_NATIVE_API_LEVEL ANDROID_NO_UNDEFINED ANDROID_SO_UNDEFINED ANDROID_SET_OBSOLETE_VARIABLES LIBRARY_OUTPUT_PATH_ROOT ANDROID_USE_STLPORT ANDROID_FORBID_SYGWIN ANDROID_NDK ANDROID_STANDALONE_TOOLCHAIN ANDROID_FUNCTION_LEVEL_LINKING __ndklibspath )
  if( DEFINED ${__var} )
-   set( __toolchain_config "${__toolchain_config}set( ${__var} \"${${__var}}\" )\n" )
+   if( "${${__var}}" MATCHES " ") # inspect the variable's value (not its name): values containing spaces must be written quoted
+    set( __toolchain_config "${__toolchain_config}set( ${__var} \"${${__var}}\" CACHE INTERNAL \"\" )\n" )
+   else() # no space in the value: it is written unquoted
+    set( __toolchain_config "${__toolchain_config}set( ${__var} ${${__var}} CACHE INTERNAL \"\" )\n" )
+   endif()
  endif()
 endforeach()
 file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/android.toolchain.config.cmake" "${__toolchain_config}" )
--- a/cmake/CMakeParseArguments.cmake
+++ b/cmake/CMakeParseArguments.cmake
@ -0,0 +1,138 @@
+# CMAKE_PARSE_ARGUMENTS(<prefix> <options> <one_value_keywords> <multi_value_keywords> args...)
+#
+# CMAKE_PARSE_ARGUMENTS() is intended to be used in macros or functions for
+# parsing the arguments given to that macro or function.
+# It processes the arguments and defines a set of variables which hold the
+# values of the respective options.
+#
+# The <options> argument contains all options for the respective macro,
+# i.e. keywords which can be used when calling the macro without any value
+# following, like e.g. the OPTIONAL keyword of the install() command.
+#
+# The <one_value_keywords> argument contains all keywords for this macro
+# which are followed by one value, like e.g. DESTINATION keyword of the
+# install() command.
+#
+# The <multi_value_keywords> argument contains all keywords for this macro
+# which can be followed by more than one value, like e.g. the TARGETS or
+# FILES keywords of the install() command.
+#
+# When done, CMAKE_PARSE_ARGUMENTS() will have defined for each of the
+# keywords listed in <options>, <one_value_keywords> and
+# <multi_value_keywords> a variable composed of the given <prefix>
+# followed by "_" and the name of the respective keyword.
+# These variables will then hold the respective value from the argument list.
+# For the <options> keywords this will be TRUE or FALSE.
+#
+# All remaining arguments are collected in a variable
+# <prefix>_UNPARSED_ARGUMENTS, this can be checked afterwards to see whether
+# your macro was called with unrecognized parameters.
+#
+# As an example, here is a my_install() function which takes similar arguments to the
+# real install() command:
+#
+#   function(MY_INSTALL)
+#     set(options OPTIONAL FAST)
+#     set(oneValueArgs DESTINATION RENAME)
+#     set(multiValueArgs TARGETS CONFIGURATIONS)
+#     cmake_parse_arguments(MY_INSTALL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} )
+#     ...
+#
+# Assume my_install() has been called like this:
+#   my_install(TARGETS foo bar DESTINATION bin OPTIONAL blub)
+#
+# After the cmake_parse_arguments() call the macro will have set the following
+# variables:
+#   MY_INSTALL_OPTIONAL = TRUE
+#   MY_INSTALL_FAST = FALSE (this option was not used when calling my_install())
+#   MY_INSTALL_DESTINATION = "bin"
+#   MY_INSTALL_RENAME = "" (was not used)
+#   MY_INSTALL_TARGETS = "foo;bar"
+#   MY_INSTALL_CONFIGURATIONS = "" (was not used)
+#   MY_INSTALL_UNPARSED_ARGUMENTS = "blub" (no value expected after "OPTIONAL")
+#
+# You can then continue and process these variables.
+#
+# Keywords terminate lists of values, e.g. if directly after a one_value_keyword
+# another recognized keyword follows, this is interpreted as the beginning of
+# the new option.
+# E.g. my_install(TARGETS foo DESTINATION OPTIONAL) would not result in
+# MY_INSTALL_DESTINATION set to "OPTIONAL"; instead MY_INSTALL_DESTINATION
+# would be empty and MY_INSTALL_OPTIONAL would therefore be set to TRUE.
+
+#=============================================================================
+# Copyright 2010 Alexander Neundorf <neundorf@kde.org>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+
+if(__CMAKE_PARSE_ARGUMENTS_INCLUDED)  # include guard: skip the rest of the file if it was already processed
+  return()
+endif()
+set(__CMAKE_PARSE_ARGUMENTS_INCLUDED TRUE)  # remember that this module has been processed
+
+
+function(CMAKE_PARSE_ARGUMENTS prefix _optionNames _singleArgNames _multiArgNames)  # sorts ${ARGN} into <prefix>_<keyword> variables in the caller's scope
+  # Reset every result variable first: one-value and multi-value keywords start out without a value ...
+  foreach(arg_name ${_singleArgNames} ${_multiArgNames})
+    set(${prefix}_${arg_name})
+  endforeach(arg_name)

+  foreach(option ${_optionNames})  # ... and every option keyword starts out as FALSE
+    set(${prefix}_${option} FALSE)
+  endforeach(option)

+  set(${prefix}_UNPARSED_ARGUMENTS)  # collects arguments that do not belong to any keyword

+  set(insideValues FALSE)  # parser state: FALSE, "SINGLE" or "MULTI" (kind of keyword currently collecting values)
+  set(currentArgName)  # keyword that receives the next value(s)

+  # Walk over all remaining arguments once, classify each one and fill the result variables.
+  foreach(currentArg ${ARGN})
+    list(FIND _optionNames "${currentArg}" optionIndex)  # position of currentArg among the option keywords, -1 if it is not one
+    list(FIND _singleArgNames "${currentArg}" singleArgIndex)  # position among the one-value keywords, -1 if it is not one
+    list(FIND _multiArgNames "${currentArg}" multiArgIndex)  # position among the multi-value keywords, -1 if it is not one

+    if(${optionIndex} EQUAL -1  AND  ${singleArgIndex} EQUAL -1  AND  ${multiArgIndex} EQUAL -1)  # currentArg is a plain value, not a keyword
+      if(insideValues)
+        if("${insideValues}" STREQUAL "SINGLE")
+          set(${prefix}_${currentArgName} ${currentArg})  # a one-value keyword takes exactly this value ...
+          set(insideValues FALSE)  # ... so any further plain value is no longer attributed to it
+        elseif("${insideValues}" STREQUAL "MULTI")
+          list(APPEND ${prefix}_${currentArgName} ${currentArg})  # multi-value keywords keep collecting until the next keyword
+        endif()
+      else(insideValues)
+        list(APPEND ${prefix}_UNPARSED_ARGUMENTS ${currentArg})  # no keyword is collecting values: report the argument as unparsed
+      endif(insideValues)
+    else()  # currentArg is a keyword; this also ends value collection for the previous keyword
+      if(NOT ${optionIndex} EQUAL -1)
+        set(${prefix}_${currentArg} TRUE)  # option keyword present
+        set(insideValues FALSE)  # options take no values
+      elseif(NOT ${singleArgIndex} EQUAL -1)
+        set(currentArgName ${currentArg})
+        set(${prefix}_${currentArgName})  # drop any value stored by an earlier occurrence of this keyword
+        set(insideValues "SINGLE")
+      elseif(NOT ${multiArgIndex} EQUAL -1)
+        set(currentArgName ${currentArg})
+        set(${prefix}_${currentArgName})  # drop any values stored by an earlier occurrence of this keyword
+        set(insideValues "MULTI")
+      endif()
+    endif()

+  endforeach(currentArg)

+  # Copy the results from this function's scope into the caller's scope.
+  foreach(arg_name ${_singleArgNames} ${_multiArgNames} ${_optionNames})
+    set(${prefix}_${arg_name}  ${${prefix}_${arg_name}} PARENT_SCOPE)
+  endforeach(arg_name)
+  set(${prefix}_UNPARSED_ARGUMENTS ${${prefix}_UNPARSED_ARGUMENTS} PARENT_SCOPE)

+endfunction(CMAKE_PARSE_ARGUMENTS _options _singleArgs _multiArgs)
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@ -88,7 +88,11 @@ if(CUDA_FOUND)
    if(APPLE)
      set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
    endif()
-    string(REPLACE "-Wsign-promo" "" CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+
+    # disabled because of multiple warnings during building nvcc auto generated files
+    if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_GCC_REGEX_VERSION VERSION_GREATER "4.6.0")
+      ocv_warnings_disable(CMAKE_CXX_FLAGS -Wunused-but-set-variable)
+    endif()

    # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
    set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})
--- a/cmake/OpenCVDetectTBB.cmake
+++ b/cmake/OpenCVDetectTBB.cmake
@ -21,7 +21,12 @@ elseif(UNIX AND NOT APPLE)
 endif()

 if(NOT HAVE_TBB)
-  set(TBB_DEFAULT_INCLUDE_DIRS "/opt/intel/tbb" "/usr/local/include" "/usr/include" "C:/Program Files/Intel/TBB" "C:/Program Files (x86)/Intel/TBB" "C:/Program Files (x86)/TBB" "${CMAKE_INSTALL_PREFIX}/include")
+  set(TBB_DEFAULT_INCLUDE_DIRS  # fallback locations searched for tbb/tbb.h
+    "/opt/intel/tbb" "/usr/local/include" "/usr/include"
+    "C:/Program Files/Intel/TBB" "C:/Program Files (x86)/Intel/TBB"
+    "C:/Program Files/tbb/include"
+    "C:/Program Files (x86)/tbb/include"
+    "${CMAKE_INSTALL_PREFIX}/include")

  find_path(TBB_INCLUDE_DIRS "tbb/tbb.h" PATHS ${TBB_INCLUDE_DIR} ${TBB_DEFAULT_INCLUDE_DIRS} DOC "The path to TBB headers")
  if(TBB_INCLUDE_DIRS)
--- a/cmake/OpenCVGenConfig.cmake
+++ b/cmake/OpenCVGenConfig.cmake
@ -64,9 +64,14 @@ macro(ocv_generate_dependencies_map_configcmake suffix configuration)
      string(REGEX REPLACE "${CMAKE_SHARED_LIBRARY_SUFFIX}$" "${OPENCV_LINK_LIBRARY_SUFFIX}" __libname "${__libname}")
    endif()

+    string(REPLACE " " "\\ " __mod_deps "${${__ocv_lib}_MODULE_DEPS_${suffix}}")
+    string(REPLACE " " "\\ " __ext_deps "${${__ocv_lib}_EXTRA_DEPS_${suffix}}")
+    string(REPLACE "\"" "\\\"" __mod_deps "${__mod_deps}")
+    string(REPLACE "\"" "\\\"" __ext_deps "${__ext_deps}")
+
    set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_LIBNAME_${suffix} \"${__libname}\")\n")
-    set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_DEPS_${suffix} ${${__ocv_lib}_MODULE_DEPS_${suffix}})\n")
-    set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_EXTRA_DEPS_${suffix} ${${__ocv_lib}_EXTRA_DEPS_${suffix}})\n")
+    set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_DEPS_${suffix} ${__mod_deps})\n")
+    set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_EXTRA_DEPS_${suffix} ${__ext_deps})\n")

    list(APPEND OPENCV_PROCESSED_LIBS ${__ocv_lib})
    list(APPEND OPENCV_LIBS_TO_PROCESS ${${__ocv_lib}_MODULE_DEPS_${suffix}})
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@ -509,8 +509,6 @@ endmacro()
 macro(ocv_add_precompiled_headers the_target)
  if("${the_target}" MATCHES "^opencv_test_.*$")
    SET(pch_path "test/test_")
-  elseif("${the_target}" MATCHES "opencv_perf_gpu_cpu")
-    SET(pch_path "perf_cpu/perf_cpu_")
  elseif("${the_target}" MATCHES "^opencv_perf_.*$")
    SET(pch_path "perf/perf_")
  else()
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@ -501,3 +501,12 @@ macro(ocv_parse_header2 LIBNAME HDR_PATH VARNAME)
    endif()
  endif()
 endmacro()
+
+
+################################################################################################
+# Assign every file matched by the given globbing expressions to a source group.
+# Usage:
+#   ocv_source_group(<group> GLOB <pattern> [<pattern> ...])
+# <group> is forwarded to source_group() as the group name; the expressions that
+# follow the GLOB keyword are forwarded to file(GLOB).
+function(ocv_source_group group)
+  cmake_parse_arguments(SRC_GROUP "" "" "GLOB" ${ARGN})
+  file(GLOB group_files ${SRC_GROUP_GLOB})
+  source_group(${group} FILES ${group_files})
+endfunction()
--- a/data/lbpcascades/lbpcascade_profileface.xml
+++ b/data/lbpcascades/lbpcascade_profileface.xml
--- a/data/lbpcascades/lbpcascade_silverware.xml
+++ b/data/lbpcascades/lbpcascade_silverware.xml
--- a/doc/_themes/blue/static/default.css_t
+++ b/doc/_themes/blue/static/default.css_t
@ -175,6 +175,8 @@ a:hover {
 div.body p, div.body dd, div.body li {
    text-align: justify;
    line-height: 130%;
+    margin-top: 1em;
+    margin-bottom: 1em;
 }

 div.body h1,
@ -327,16 +329,16 @@ table.field-list {
    margin-top: 20px;
 }

-ul.simple {
+/*ul.simple {
    list-style: none;
-}
+}*/

 em.menuselection, em.guilabel {
    font-family: {{ theme_guifont }};
 }

 .enumeratevisibleitemswithsquare ul {
-list-style: square; 
+list-style: square;
 margin-bottom: 0px;
 margin-left: 0px;
 margin-right: 0px;
@ -349,25 +351,25 @@ margin-left: 0px;
 margin-right: 0px;
 margin-top: 0.2em;
 }
- 
+
 .enumeratevisibleitemswithsquare p {
 margin-bottom: 0pt;
 margin-top: 1pt;
 }
- 
+
 .enumeratevisibleitemswithsquare dl{
 margin-bottom: 0px;
 margin-left: 0px;
 margin-right: 0px;
 margin-top: 0px;
 }
- 
+
 .toctableopencv
 {
-   width: 100% ; 
+   width: 100% ;
   table-layout: fixed;
 }
- 
+

  .toctableopencv colgroup col:first-child
  {
@ -375,12 +377,17 @@ margin-top: 0px;
    max-width: 100pt !important;
    min-width: 100pt !important;
  }
-  
-  .toctableopencv colgroup col:nth-child(2) 
+
+  .toctableopencv colgroup col:nth-child(2)
  {
    width: 100% !important;
  }
-  
+
 div.body ul.search li {
    text-align: left;
 }
+
+/* NOTE(review): presumably styles the line-number column of code listings (e.g. blocks using :linenos:) - confirm */
+div.linenodiv {
+    min-width: 1em;
+    text-align: right;
+}
--- a/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst
+++ b/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst
@ -36,21 +36,21 @@ The structure of package contents looks as follows:

    OpenCV-2.4.2-android-sdk
    |_ apk
-    |   |_ OpenCV_2.4.2_binary_pack_XXX.apk 
-    |   |_ OpenCV_2.4.2_Manager.apk 
+    |   |_ OpenCV_2.4.2_binary_pack_XXX.apk
+    |   |_ OpenCV_2.4.2_Manager.apk
    |
    |_ doc
    |_ samples
    |_ sdk
-    |    |_ etc 
-    |    |_ java 
-    |    |_ native 
-    |          |_ 3rdparty 
-    |          |_ jni 
-    |          |_ libs 
-    |               |_ armeabi 
-    |               |_ armeabi-v7a 
-    |               |_ x86 
+    |    |_ etc
+    |    |_ java
+    |    |_ native
+    |          |_ 3rdparty
+    |          |_ jni
+    |          |_ libs
+    |               |_ armeabi
+    |               |_ armeabi-v7a
+    |               |_ x86
    |
    |_ license.txt
    |_ README.android
@ -64,11 +64,11 @@ The structure of package contents looks as follows:
 * :file:`sdk/etc` folder contains Haar and LBP cascades distributed with OpenCV.

 * :file:`apk` folder contains Android packages that should be installed on the target Android device to enable OpenCV library access via OpenCV Manager API (see details below).
-   
+
  On production devices that have access to Google Play Market (and internet) these packages will be installed from Market on the first start of an application using OpenCV Manager API.
  But dev kits without Market or internet require these packages to be installed manually.
  (Install the `Manager.apk` and the corresponding `binary_pack.apk` depending on the device CPU, the Manager GUI provides this info).
-  
+
  **Note**: installation from internet is the preferable way since we may publish updated versions of this packages on the Market.

 * :file:`samples` folder contains sample applications projects and their prebuilt packages (APK).
@ -76,7 +76,7 @@ The structure of package contents looks as follows:

 * :file:`doc` folder contains various OpenCV documentation in PDF format.
  It's also available online at http://docs.opencv.org.
-  
+
  **Note**: the most recent docs (nightly build) are at http://docs.opencv.org/trunk/.
  Generally, it's more up-to-date, but can refer to not-yet-released functionality.

@ -94,10 +94,10 @@ Starting version 2.4.2 `OpenCV4Android SDK` uses `OpenCV Manager` API for librar


 For additional information on OpenCV Manager see the:
- 
-* |OpenCV4Android_Slides|_ 
-  
-* |OpenCV4Android_Reference|_ 
+
+* |OpenCV4Android_Slides|_
+
+* |OpenCV4Android_Reference|_

     ..

@ -196,15 +196,15 @@ Open OpenCV library and samples in Eclipse
   However, **all these errors are only false-alarms**!

   Just give a minute to Eclipse to complete initialization.
-   
+
   In some cases these errors disappear after :menuselection:`Project --> Clean... --> Clean all --> OK`
   or after pressing :kbd:`F5` (for Refresh action) when selecting error-label-marked projects in :guilabel:`Package Explorer`.

   Sometimes more advanced manipulations are required:

-   * The provided projects are configured for ``API 11`` target (and ``API 9`` for the library) that can be missing platform in your Android SDK.
-     After right click on any project select  :guilabel:`Properties` and then :guilabel:`Android` on the left pane.
-     Click some target with `API Level` 11 or higher:
+   The provided projects are configured for the ``API 11`` target (and ``API 9`` for the library), a platform that may be missing from your Android SDK.
+   Right-click any project, select :guilabel:`Properties` and then :guilabel:`Android` in the left pane.
+   Click a target with `API Level` 11 or higher:

      .. image:: images/eclipse_8a_target.png
         :alt: Updating target
@ -239,10 +239,10 @@ Well, running samples from Eclipse is very simple:

 * Connect your device with :command:`adb` tool from Android SDK or create an emulator with camera support.

-   * See `Managing Virtual Devices
-     <http://developer.android.com/guide/developing/devices/index.html>`_ document for help with Android Emulator.
-   * See `Using Hardware Devices
-     <http://developer.android.com/guide/developing/device.html>`_ for help with real devices (not emulators).
+  * See `Managing Virtual Devices
+    <http://developer.android.com/guide/developing/devices/index.html>`_ document for help with Android Emulator.
+  * See `Using Hardware Devices
+    <http://developer.android.com/guide/developing/device.html>`_ for help with real devices (not emulators).


 * Select project you want to start in :guilabel:`Package Explorer` and just press :kbd:`Ctrl + F11` or select option :menuselection:`Run --> Run` from the main menu, or click :guilabel:`Run` button on the toolbar.
@ -263,33 +263,33 @@ Well, running samples from Eclipse is very simple:
  .. image:: images/android_emulator_opencv_manager_fail.png
     :alt: You will see this message if you have no OpenCV Manager installed
     :align: center
-     
+
  To get rid of the message you will need to install `OpenCV Manager` and the appropriate `OpenCV binary pack`.
  Simply tap :menuselection:`Yes` if you have *Google Play Market* installed on your device/emulator. It will redirect you to the corresponding page on *Google Play Market*.
-  
+
  If you have no access to the *Market*, which is often the case with emulators - you will need to install the packages from OpenCV4Android SDK folder manually. Open the console/terminal and type in the following two commands:
-  
+
  .. code-block:: sh
    :linenos:

    <Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.2_Manager.apk
    <Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.2_binary_pack_armv7a.apk
-    
+
  If you're running Windows, that will probably look like this:
-    
+
  .. image:: images/install_opencv_manager_with_adb.png
     :alt: Run these commands in the console to install OpenCV Manager
     :align: center
-     
+
  When done, you will be able to run OpenCV samples on your device/emulator seamlessly.
-  
+
 * Here is ``Tutorial 2 - Use OpenCV Camera`` sample, running on top of stock camera-preview of the emulator.

  .. image:: images/emulator_canny.png
     :height: 600px
     :alt: Tutorial 1 Basic - 1. Add OpenCV - running Canny
     :align: center
-  
+
 What's next
 ===========

--- a/doc/tutorials/introduction/android_binary_package/android_dev_intro.rst
+++ b/doc/tutorials/introduction/android_binary_package/android_dev_intro.rst
@ -75,7 +75,7 @@ You need the following software to be installed in order to develop for Android

           sudo update-java-alternatives --set java-6-sun

-   **TODO:** add a note on Sun/Oracle Java installation on Ubuntu 12.
+..   **TODO:** add a note on Sun/Oracle Java installation on Ubuntu 12.

 #. **Android SDK**

@ -241,27 +241,29 @@ where:
 The script :file:`Android.mk` usually has the following structure:

 .. code-block:: make
+   :linenos:

-        LOCAL_PATH := $(call my-dir)
+   LOCAL_PATH := $(call my-dir)

-        include $(CLEAR_VARS)
-        LOCAL_MODULE    := <module_name>
-        LOCAL_SRC_FILES := <list of .c and .cpp project files>
-        <some variable name> := <some variable value>
-        ...
-        <some variable name> := <some variable value>
+   include $(CLEAR_VARS)
+   LOCAL_MODULE    := <module_name>
+   LOCAL_SRC_FILES := <list of .c and .cpp project files>
+   <some variable name> := <some variable value>
+   ...
+   <some variable name> := <some variable value>

-        include $(BUILD_SHARED_LIBRARY)
+   include $(BUILD_SHARED_LIBRARY)

 This is the minimal file :file:`Android.mk`, which builds C++ source code of an Android application. Note that the first two lines and the last line are mandatory for any :file:`Android.mk`.

 Usually the file :file:`Application.mk` is optional, but in case of project using OpenCV, when STL and exceptions are used in C++, it also should be created. Example of the file :file:`Application.mk`:

 .. code-block:: make
+   :linenos:

-        APP_STL := gnustl_static
-        APP_CPPFLAGS := -frtti -fexceptions
-        APP_ABI := armeabi-v7a
+   APP_STL := gnustl_static
+   APP_CPPFLAGS := -frtti -fexceptions
+   APP_ABI := armeabi-v7a


 .. _NDK_build_cli:
@ -332,75 +334,76 @@ We recommend the approach based on Eclipse :abbr:`CDT(C/C++ Development Tooling)
 #. Open Eclipse and load the Android app project to configure.

 #. Add C/C++ Nature to the project via Eclipse menu :guilabel:`New -> Other -> C/C++ -> Convert to a C/C++ Project`.
-   
-     .. image:: images/eclipse_cdt_cfg1.png
-        :alt: Configure CDT
-        :align: center

-    ` `
+   .. image:: images/eclipse_cdt_cfg1.png
+      :alt: Configure CDT
+      :align: center

-     .. image:: images/eclipse_cdt_cfg2.png
-        :alt: Configure CDT
-        :align: center
+   And:
+
+   .. image:: images/eclipse_cdt_cfg2.png
+      :alt: Configure CDT
+      :align: center

 #. Select the project(s) to convert. Specify "Project type" = ``Makefile project``, "Toolchains" = ``Other Toolchain``.
-   
+
     .. image:: images/eclipse_cdt_cfg3.png
        :alt: Configure CDT
        :align: center

-#. Open :guilabel:`Project Properties -> C/C++ Build`, unckeck ``Use default build command``, replace "Build command" text from ``"make"`` to 
-     ``"${NDKROOT}/ndk-build.cmd"`` on Windows,
+#. Open :guilabel:`Project Properties -> C/C++ Build`, uncheck ``Use default build command``, replace "Build command" text from ``"make"`` to

-     ``"${NDKROOT}/ndk-build"`` on Linux and MacOS.
-   
-     .. image:: images/eclipse_cdt_cfg4.png
-        :alt: Configure CDT
-        :align: center
+   ``"${NDKROOT}/ndk-build.cmd"`` on Windows,
+
+   ``"${NDKROOT}/ndk-build"`` on Linux and MacOS.
+
+   .. image:: images/eclipse_cdt_cfg4.png
+      :alt: Configure CDT
+      :align: center

 #. Go to :guilabel:`Behaviour`  tab and change "Workbench build type" section like shown below:
-   
-     .. image:: images/eclipse_cdt_cfg5.png
-        :alt: Configure CDT
-        :align: center
+
+   .. image:: images/eclipse_cdt_cfg5.png
+      :alt: Configure CDT
+      :align: center

 #. Press :guilabel:`OK`  and make sure the ``ndk-build`` is successfully invoked when building the project.
-   
-     .. image:: images/eclipse_cdt_cfg6.png
-        :alt: Configure CDT
-        :align: center
+
+   .. image:: images/eclipse_cdt_cfg6.png
+      :alt: Configure CDT
+      :align: center

 #. If you open your C++ source file in Eclipse editor, you'll see syntax error notifications. They are not real errors, but additional CDT configuring is required.
-   
-     .. image:: images/eclipse_cdt_cfg7.png
-        :alt: Configure CDT
-        :align: center
+
+   .. image:: images/eclipse_cdt_cfg7.png
+      :alt: Configure CDT
+      :align: center

 #. Open :guilabel:`Project Properties -> C/C++ General -> Paths and Symbols` and add the following **Include** paths for **C++**:

-     ::
+   ::

        ${NDKROOT}/platforms/android-9/arch-arm/usr/include
        ${NDKROOT}/sources/cxx-stl/gnu-libstdc++/include
        ${NDKROOT}/sources/cxx-stl/gnu-libstdc++/libs/armeabi-v7a/include
        ${ProjDirPath}/../../sdk/native/jni/include

-     The last path should be changed to the correct absolute or relative path to OpenCV4Android SDK location.
-     
-     This should clear the syntax error notifications in Eclipse C++ editor.
-   
-     .. image:: images/eclipse_cdt_cfg8.png
-        :alt: Configure CDT
-        :align: center
+   The last path should be changed to the correct absolute or relative path to OpenCV4Android SDK location.

-     .. note:: The latest Android NDK **r8b** has a bit different STL headers path. So if you use this NDK version please use the following modified **Include** paths list:
+   This should clear the syntax error notifications in Eclipse C++ editor.

-       ::
+   .. image:: images/eclipse_cdt_cfg8.png
+      :alt: Configure CDT
+      :align: center

-          ${NDKROOT}/platforms/android-9/arch-arm/usr/include
-          ${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/include
-          ${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/libs/armeabi-v7a/include
-          ${ProjDirPath}/../../sdk/native/jni/include
+   .. note:: The latest Android NDK **r8b** uses a different STL headers path. So if you use this NDK release, add the following **Include** paths list instead:
+
+   ::
+
+        ${NDKROOT}/platforms/android-9/arch-arm/usr/include
+        ${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/include
+        ${NDKROOT}/sources/cxx-stl/gnu-libstdc++/4.6/libs/armeabi-v7a/include
+        ${ProjDirPath}/../../sdk/native/jni/include


 Debugging and Testing
@ -412,12 +415,16 @@ AVD
 AVD (*Android Virtual Device*) is probably not the most convenient way to test an OpenCV-dependent application, but it is surely the simplest one to configure.

 #. Assuming you already have *Android SDK* and *Eclipse IDE* installed, in Eclipse go :guilabel:`Window -> AVD Manager`.
-     **TBD:** how to start AVD Manager without Eclipse...
+
+   ..     **TBD:** how to start AVD Manager without Eclipse...
+
 #. Press the :guilabel:`New` button in :guilabel:`AVD Manager` window.
 #. :guilabel:`Create new Android Virtual Device` window will let you select some properties for your new device, like target API level, size of SD-card and other.
-    .. image:: images/AVD_create.png
-     :alt: Configure builders
-     :align: center
+
+   .. image:: images/AVD_create.png
+      :alt: Configure builders
+      :align: center
+
 #. When you click the :guilabel:`Create AVD` button, your new AVD will be available in :guilabel:`AVD Manager`.
 #. Press :guilabel:`Start` to launch the device. Be aware that any AVD (a.k.a. Emulator) is usually much slower than a hardware Android device, so it may take up to several minutes to start.
 #. Go :guilabel:`Run -> Run/Debug`  in Eclipse IDE to run your application in regular or debugging mode. :guilabel:`Device Chooser` will let you choose among the running devices or to start a new one.
@ -435,81 +442,106 @@ Windows host computer
 #. Attach the Android device to your PC with a USB cable.
 #. Go to :guilabel:`Start Menu` and **right-click** on :guilabel:`Computer`. Select :guilabel:`Manage` in the context menu. You may be asked for Administrative permissions.
 #. Select :guilabel:`Device Manager` in the left pane and find an unknown device in the list. You may try unplugging it and then plugging back in order to check whether it's your exact equipment appears in the list.
-    .. image:: images/usb_device_connect_01.png
-     :alt: Unknown device
-     :align: center
+
+   .. image:: images/usb_device_connect_01.png
+      :alt: Unknown device
+      :align: center
+
 #. Try your luck installing `Google USB drivers` without any modifications: **right-click** on the unknown device, select :guilabel:`Properties` menu item --> :guilabel:`Details` tab --> :guilabel:`Update Driver` button.
-    .. image:: images/usb_device_connect_05.png
-     :alt: Device properties
-     :align: center
+
+   .. image:: images/usb_device_connect_05.png
+      :alt: Device properties
+      :align: center
+
 #. Select :guilabel:`Browse computer for driver software`.
-    .. image:: images/usb_device_connect_06.png
-     :alt: Browse for driver
-     :align: center
+
+   .. image:: images/usb_device_connect_06.png
+      :alt: Browse for driver
+      :align: center
+
 #. Specify the path to :file:`<Android SDK folder>/extras/google/usb_driver/` folder.
-    .. image:: images/usb_device_connect_07.png
-     :alt: Browse for driver
-     :align: center
+
+   .. image:: images/usb_device_connect_07.png
+      :alt: Browse for driver
+      :align: center
+
 #. If you get the prompt to install unverified drivers and report about success - you've finished with USB driver installation.
-    .. image:: images/usb_device_connect_08.png
-     :alt: Install prompt
-     :align: center

-    ` `
+   .. image:: images/usb_device_connect_08.png
+      :alt: Install prompt
+      :align: center
+
+   ` `
+
+   .. image:: images/usb_device_connect_09.png
+      :alt: Installed OK
+      :align: center

-    .. image:: images/usb_device_connect_09.png
-     :alt: Installed OK
-     :align: center
 #. Otherwise (getting the failure like shown below) follow the next steps.
-    .. image:: images/usb_device_connect_12.png
-     :alt: No driver
-     :align: center
+
+   .. image:: images/usb_device_connect_12.png
+      :alt: No driver
+      :align: center
+
 #. Again **right-click** on the unknown device, select :guilabel:`Properties --> Details --> Hardware Ids` and copy the line like ``USB\VID_XXXX&PID_XXXX&MI_XX``.
-    .. image:: images/usb_device_connect_02.png
-     :alt: Device properties details
-     :align: center
+
+   .. image:: images/usb_device_connect_02.png
+      :alt: Device properties details
+      :align: center
+
 #. Now open file :file:`<Android SDK folder>/extras/google/usb_driver/android_winusb.inf`. Select either ``Google.NTx86`` or ``Google.NTamd64`` section depending on your host system architecture.
-    .. image:: images/usb_device_connect_03.png
-     :alt: "android_winusb.inf"
-     :align: center
+
+   .. image:: images/usb_device_connect_03.png
+      :alt: "android_winusb.inf"
+      :align: center
+
 #. There should be a record like existing ones for your device and you need to add one manually.
-    .. image:: images/usb_device_connect_04.png
-     :alt: "android_winusb.inf"
-     :align: center
+
+   .. image:: images/usb_device_connect_04.png
+      :alt: "android_winusb.inf"
+      :align: center
+
 #. Save the :file:`android_winusb.inf` file and try to install the USB driver again.
-    .. image:: images/usb_device_connect_05.png
-     :alt: Device properties
-     :align: center

-    ` `
+   .. image:: images/usb_device_connect_05.png
+      :alt: Device properties
+      :align: center

-    .. image:: images/usb_device_connect_06.png
-     :alt: Browse for driver
-     :align: center
+   ` `

-    ` `
+   .. image:: images/usb_device_connect_06.png
+      :alt: Browse for driver
+      :align: center
+
+   ` `
+
+   .. image:: images/usb_device_connect_07.png
+      :alt: Browse for driver
+      :align: center

-    .. image:: images/usb_device_connect_07.png
-     :alt: Browse for driver
-     :align: center
 #. This time installation should go successfully.
-    .. image:: images/usb_device_connect_08.png
-     :alt: Install prompt
-     :align: center

-    ` `
+   .. image:: images/usb_device_connect_08.png
+      :alt: Install prompt
+      :align: center
+
+   ` `
+
+   .. image:: images/usb_device_connect_09.png
+      :alt: Installed OK
+      :align: center

-    .. image:: images/usb_device_connect_09.png
-     :alt: Installed OK
-     :align: center
 #. And an unknown device is now recognized as an Android phone.
-    .. image:: images/usb_device_connect_10.png
-     :alt: "Known" device
-     :align: center
+
+   .. image:: images/usb_device_connect_10.png
+      :alt: "Known" device
+      :align: center
+
 #. Successful device USB connection can be verified in console via ``adb devices`` command.
-    .. image:: images/usb_device_connect_11.png
-     :alt: "adb devices"
-     :align: center
+
+   .. image:: images/usb_device_connect_11.png
+      :alt: "adb devices"
+      :align: center

 #. Now, in Eclipse go :guilabel:`Run -> Run/Debug` to run your application in regular or debugging mode. :guilabel:`Device Chooser` will let you choose among the devices.

@ -519,13 +551,13 @@ By default Linux doesn't recognize Android devices, but it's easy to fix this is

 .. code-block:: guess

-  SUBSYSTEM=="usb", ATTR{idVendor}=="1004",  MODE="0666", GROUP="plugdev"
+   SUBSYSTEM=="usb", ATTR{idVendor}=="1004",  MODE="0666", GROUP="plugdev"

 Then restart your adb server (even better to restart the system), plug in your Android device and execute :command:`adb devices` command. You will see the list of attached devices:

-  .. image:: images/usb_device_connect_ubuntu.png
-    :alt: List of attached devices
-    :align: center
+.. image:: images/usb_device_connect_ubuntu.png
+   :alt: List of attached devices
+   :align: center

 MacOS host computer
 ^^^^^^^^^^^^^^^^^^^
--- a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst
+++ b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst
@ -57,12 +57,12 @@ Using async initialization is a **recommended** way for application development.
 To run an OpenCV Manager-based application for the first time, you need to install the packages with the `OpenCV Manager` and `OpenCV binary pack` for your platform.
 You can do it using Google Play Market or manually with ``adb`` tool:

-  .. code-block:: sh
+.. code-block:: sh
    :linenos:

    <Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.2_Manager.apk
    <Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.2_binary_pack_armv7a.apk
-	
+
 There is a very base code snippet implementing the async initialization. It shows basic principles. See the "15-puzzle" OpenCV sample for details.

 .. code-block:: java
@ -107,7 +107,7 @@ There is a very base code snippet implementing the async initialization. It show
    }

 In this case the application works with OpenCV Manager in an asynchronous fashion. The ``OnManagerConnected`` callback will be called in the UI thread when initialization finishes.
-Please note, that it is not allowed to use OpenCV calls or load OpenCV-dependent native libs before invoking this callback. 
+Please note, that it is not allowed to use OpenCV calls or load OpenCV-dependent native libs before invoking this callback.
 Load your own native libraries that depend on OpenCV after the successful OpenCV initialization.

 Application development with static initialization
@ -130,27 +130,27 @@ This approach is deprecated for the production code, release package is recommen
       :align: center

 #. If your application project **doesn't have a JNI part**, just copy the corresponding OpenCV native libs from :file:`<OpenCV-2.4.2-android-sdk>/sdk/native/libs/<target_arch>` to your project directory to folder :file:`libs/<target_arch>`.
-   
-   In case of the application project **with a JNI part**, instead of manual libraries copying you need to modify your ``Android.mk`` file: 
+
+   In case of the application project **with a JNI part**, instead of manual libraries copying you need to modify your ``Android.mk`` file:
   add the following two code lines after the ``"include $(CLEAR_VARS)"`` and before ``"include path_to_OpenCV-2.4.2-android-sdk/sdk/native/jni/OpenCV.mk"``

   .. code-block:: make
-       :linenos:
+      :linenos:
+
+      OPENCV_CAMERA_MODULES:=on
+      OPENCV_INSTALL_MODULES:=on

-       OPENCV_CAMERA_MODULES:=on
-       OPENCV_INSTALL_MODULES:=on
- 
   The result should look like the following:
-  
+
   .. code-block:: make
-       :linenos:
+      :linenos:

-       include $(CLEAR_VARS)
+      include $(CLEAR_VARS)

-       # OpenCV
-       OPENCV_CAMERA_MODULES:=on
-       OPENCV_INSTALL_MODULES:=on
-       include ../../sdk/native/jni/OpenCV.mk
+      # OpenCV
+      OPENCV_CAMERA_MODULES:=on
+      OPENCV_INSTALL_MODULES:=on
+      include ../../sdk/native/jni/OpenCV.mk

   After that the OpenCV libraries will be copied to your application :file:`libs` folder during the JNI part build.

@ -159,28 +159,28 @@ This approach is deprecated for the production code, release package is recommen
 #. The last step of enabling OpenCV in your application is Java initialization code before call to OpenCV API.
   It can be done, for example, in the static section of the ``Activity`` class:

-    .. code-block:: java
-       :linenos:
+   .. code-block:: java
+      :linenos:

-        static {
-            if (!OpenCVLoader.initDebug()) {
-                // Handle initialization error
-            }
-        }
+      static {
+          if (!OpenCVLoader.initDebug()) {
+              // Handle initialization error
+          }
+      }

-    If you application includes other OpenCV-dependent native libraries you should load them **after** OpenCV initialization:
+   If your application includes other OpenCV-dependent native libraries, you should load them **after** OpenCV initialization:

-    .. code-block:: java
-        :linenos:
+   .. code-block:: java
+      :linenos:

-        static {
-            if (!OpenCVLoader.initDebug()) {
-                // Handle initialization error
-            } else {
-                System.loadLibrary("my_jni_lib1");
-                System.loadLibrary("my_jni_lib2");
-            }
-        }
+      static {
+          if (!OpenCVLoader.initDebug()) {
+              // Handle initialization error
+          } else {
+              System.loadLibrary("my_jni_lib1");
+              System.loadLibrary("my_jni_lib2");
+          }
+      }

 Native/C++
 ----------
@ -198,33 +198,33 @@ To build your own Android application, which uses OpenCV from native part, the f

   .. code-block:: make

-           include C:\Work\OpenCV4Android\OpenCV-2.4.2-android-sdk\sdk\native\jni\OpenCV.mk
+      include C:\Work\OpenCV4Android\OpenCV-2.4.2-android-sdk\sdk\native\jni\OpenCV.mk

   should be inserted into the :file:`jni/Android.mk` file **after** the line

   .. code-block:: make

-        include $(CLEAR_VARS)
+      include $(CLEAR_VARS)

 #. Several variables can be used to customize OpenCV stuff, but you **don't need** to use them when your application uses the `async initialization` via the `OpenCV Manager` API.
-   
+
   Note: these variables should be set **before**  the ``"include .../OpenCV.mk"`` line:

   .. code-block:: make

-        OPENCV_INSTALL_MODULES:=on
+      OPENCV_INSTALL_MODULES:=on

   Copies necessary OpenCV dynamic libs to the project ``libs`` folder in order to include them into the APK.

   .. code-block:: make

-        OPENCV_CAMERA_MODULES:=off
+      OPENCV_CAMERA_MODULES:=off

   Skip native OpenCV camera related libs copying to the project ``libs`` folder.

   .. code-block:: make

-        OPENCV_LIB_TYPE:=STATIC
+      OPENCV_LIB_TYPE:=STATIC

   Perform static link with OpenCV. By default dynamic link is used and the project JNI lib depends on ``libopencv_java.so``.

@ -232,14 +232,14 @@ To build your own Android application, which uses OpenCV from native part, the f

   .. code-block:: make

-        APP_STL := gnustl_static
-        APP_CPPFLAGS := -frtti -fexceptions
+      APP_STL := gnustl_static
+      APP_CPPFLAGS := -frtti -fexceptions

   Also the line like this one:

   .. code-block:: make

-                 APP_ABI := armeabi-v7a
+      APP_ABI := armeabi-v7a

   should specify the application target platforms.

@ -249,11 +249,11 @@ To build your own Android application, which uses OpenCV from native part, the f

   .. code-block:: make

-                 APP_PLATFORM := android-9
+      APP_PLATFORM := android-9


 #. Either use :ref:`manual <NDK_build_cli>` ``ndk-build`` invocation or :ref:`setup Eclipse CDT Builder <CDT_Builder>` to build native JNI lib before Java part [re]build and APK creation.
-   
+

 Hello OpenCV Sample
 ===================
@ -262,208 +262,217 @@ Here are basic steps to guide you trough the process of creating a simple OpenCV
 It will be capable of accessing camera output, processing it and displaying the result.

 #. Open Eclipse IDE, create a new clean workspace, create a new Android project (*File -> New -> Android Project*).
-   
+
 #. Set name, target, package and minSDKVersion accordingly.
-   
+
 #. Create a new class (*File -> New -> Class*). Name it for example: *HelloOpenCVView*.
-	.. image:: images/dev_OCV_new_class.png
-         :alt: Add a new class.
-         :align: center

-    * It should extend *SurfaceView* class.
+   .. image:: images/dev_OCV_new_class.png
+        :alt: Add a new class.
+        :align: center

-    * It also should implement *SurfaceHolder.Callback*, *Runnable*.
+   * It should extend *SurfaceView* class.
+   * It also should implement *SurfaceHolder.Callback*, *Runnable*.

 #. Edit *HelloOpenCVView* class.

-    * Add an *import* line for *android.content.context*.
+   * Add an *import* line for *android.content.Context*.

-    * Modify autogenerated stubs: *HelloOpenCVView*, *surfaceCreated*, *surfaceDestroyed* and *surfaceChanged*.
-	 .. code-block:: java
+   * Modify autogenerated stubs: *HelloOpenCVView*, *surfaceCreated*, *surfaceDestroyed* and *surfaceChanged*.

-		  package com.hello.opencv.test;
+     .. code-block:: java
+        :linenos:

-		  import android.content.Context;
+        package com.hello.opencv.test;

-		  public class HelloOpenCVView extends SurfaceView implements Callback, Runnable {
+        import android.content.Context;

-		  public HelloOpenCVView(Context context) {
-		  super(context);
-		  getHolder().addCallback(this);
-		  }
-		  
-		  public void surfaceCreated(SurfaceHolder holder) {
-		  (new Thread(this)).start();
-		  }
-		  
-		  public void surfaceDestroyed(SurfaceHolder holder) {
-		  cameraRelease();
-		  }
-		  
-		  public void surfaceChanged(SurfaceHolder holder, int format, int width,
-		  int height) {
-		  cameraSetup(width, height);
-		  }
+        public class HelloOpenCVView extends SurfaceView implements Callback, Runnable {

-    * Add *cameraOpen*, *cameraRelease* and *cameraSetup* voids as shown below.
+        public HelloOpenCVView(Context context) {
+            super(context);
+            getHolder().addCallback(this);
+        }

-    * Also, don't forget to add the public void *run()* as follows:
-	
-	 .. code-block:: java
+        public void surfaceCreated(SurfaceHolder holder) {
+            (new Thread(this)).start();
+        }

-		  public void run() {
-			// TODO: loop { getFrame(), processFrame(), drawFrame() }
-		  }
+        public void surfaceDestroyed(SurfaceHolder holder) {
+            cameraRelease();
+        }

-		  public boolean cameraOpen() {
-			return false; //TODO: open camera
-		  }
-	
-		  private void cameraRelease() {
-			// TODO release camera
-		  }
+        public void surfaceChanged(SurfaceHolder holder, int format, int width, int height) {
+            cameraSetup(width, height);
+        }

-		  private void cameraSetup(int width, int height) {
-			// TODO setup camera
-		  }
-  	
+        //...

-       ..
+   * Add *cameraOpen*, *cameraRelease* and *cameraSetup* methods as shown below.
+
+   * Also, don't forget to add the public void *run()* as follows:
+
+     .. code-block:: java
+        :linenos:
+
+        public void run() {
+            // TODO: loop { getFrame(), processFrame(), drawFrame() }
+        }
+
+        public boolean cameraOpen() {
+            return false; //TODO: open camera
+        }
+
+        private void cameraRelease() {
+            // TODO release camera
+        }
+
+        private void cameraSetup(int width, int height) {
+            // TODO setup camera
+        }

 #. Create a new *Activity* (*New -> Other -> Android -> Android Activity*) and name it, for example: *HelloOpenCVActivity*. For this activity define *onCreate*, *onResume* and *onPause* voids.
-	 .. code-block:: java

-		  public void onCreate (Bundle savedInstanceState) {
-			super.onCreate(savedInstanceState);
-			mView = new HelloOpenCVView(this);
-			setContentView (mView);
-		  }
+   .. code-block:: java
+      :linenos:

-		  protected void onPause() {
-			super.onPause();
-			mView.cameraRelease();
-		  }
+       public void onCreate (Bundle savedInstanceState) {
+           super.onCreate(savedInstanceState);
+           mView = new HelloOpenCVView(this);
+           setContentView (mView);
+       }

-		  protected void onResume() {
-			super.onResume();
-			if( !mView.cameraOpen() ) {
-				// MessageBox and exit app
-				AlertDialog ad = new AlertDialog.Builder(this).create();
-				ad.setCancelable(false); // This blocks the "BACK" button
-				ad.setMessage("Fatal error: can't open camera!");
-				ad.setButton("OK", new DialogInterface.OnClickListener() {
-					public void onClick(DialogInterface dialog, int which) {
-						dialog.dismiss();
-						finish();
-					}
-				});
-				ad.show();
-			}
-		
-		}
+       protected void onPause() {
+           super.onPause();
+           mView.cameraRelease();
+       }
+
+       protected void onResume() {
+           super.onResume();
+           if( !mView.cameraOpen() ) {
+               // MessageBox and exit app
+               AlertDialog ad = new AlertDialog.Builder(this).create();
+               ad.setCancelable(false); // This blocks the "BACK" button
+               ad.setMessage("Fatal error: can't open camera!");
+               ad.setButton("OK", new DialogInterface.OnClickListener() {
+                   public void onClick(DialogInterface dialog, int which) {
+                       dialog.dismiss();
+                       finish();
+                   }
+               });
+               ad.show();
+           }
+       }

 #. Add the following permissions to the AndroidManifest.xml file:
-	 .. code-block:: xml

-	  </application>
+   .. code-block:: xml
+      :linenos:
+
+      </application>
+
+      <uses-permission android:name="android.permission.CAMERA" />
+      <uses-feature android:name="android.hardware.camera" />
+      <uses-feature android:name="android.hardware.camera.autofocus" />

-	  <uses-permission android:name="android.permission.CAMERA" />
-	  <uses-feature android:name="android.hardware.camera" />
-	  <uses-feature android:name="android.hardware.camera.autofocus" />
-	  
 #. Reference OpenCV library within your project properties.
-	 .. image:: images/dev_OCV_reference.png
-          :alt: Reference OpenCV library.
-          :align: center
+
+   .. image:: images/dev_OCV_reference.png
+        :alt: Reference OpenCV library.
+        :align: center

 #. We now need some code to handle the camera. Update the *HelloOpenCVView* class as follows:
-	 .. code-block:: java

-		  private VideoCapture		mCamera;
-		  
-		  public boolean cameraOpen() {
-			synchronized (this) {
-				cameraRelease();
-				mCamera = new VideoCapture(Highgui.CV_CAP_ANDROID);
-				if (!mCamera.isOpened()) {
-					mCamera.release();
-					mCamera = null;
-					Log.e("HelloOpenCVView", "Failed to open native camera");
-					return false;
-				}
-			}
-			return true;
-		  }
-		  public void cameraRelease() {
-			synchronized(this) {
-				if (mCamera != null) {
-					mCamera.release();
-					mCamera = null;
-				}
-			}
-		  }
-		  private void cameraSetup(int width, int height) {
-			synchronized (this) {
-				if (mCamera != null && mCamera.isOpened()) {
-					List<Size> sizes = mCamera.getSupportedPreviewSizes();
-					int mFrameWidth = width;
-					int mFrameHeight = height;
-					{ // selecting optimal camera preview size
-						double minDiff = Double.MAX_VALUE;
-						for (Size size : sizes) {
-							if (Math.abs(size.height - height) < minDiff) {
-								mFrameWidth = (int) size.width;
-								mFrameHeight = (int) size.height;
-								minDiff = Math.abs(size.height - height);
-							}
-						}
-					}
-					mCamera.set(Highgui.CV_CAP_PROP_FRAME_WIDTH, mFrameWidth);
-					mCamera.set(Highgui.CV_CAP_PROP_FRAME_HEIGHT, mFrameHeight);
-				}
-			}
-		  }
+   .. code-block:: java
+      :linenos:
+
+      private VideoCapture      mCamera;
+
+      public boolean cameraOpen() {
+          synchronized (this) {
+              cameraRelease();
+              mCamera = new VideoCapture(Highgui.CV_CAP_ANDROID);
+              if (!mCamera.isOpened()) {
+                  mCamera.release();
+                  mCamera = null;
+                  Log.e("HelloOpenCVView", "Failed to open native camera");
+                  return false;
+              }
+          }
+          return true;
+      }
+
+      public void cameraRelease() {
+          synchronized(this) {
+              if (mCamera != null) {
+                  mCamera.release();
+                  mCamera = null;
+              }
+          }
+      }
+
+      private void cameraSetup(int width, int height) {
+          synchronized (this) {
+              if (mCamera != null && mCamera.isOpened()) {
+                  List<Size> sizes = mCamera.getSupportedPreviewSizes();
+                  int mFrameWidth = width;
+                  int mFrameHeight = height;
+                  { // selecting optimal camera preview size
+                      double minDiff = Double.MAX_VALUE;
+                      for (Size size : sizes) {
+                          if (Math.abs(size.height - height) < minDiff) {
+                              mFrameWidth = (int) size.width;
+                              mFrameHeight = (int) size.height;
+                              minDiff = Math.abs(size.height - height);
+                          }
+                      }
+                  }
+                  mCamera.set(Highgui.CV_CAP_PROP_FRAME_WIDTH, mFrameWidth);
+                  mCamera.set(Highgui.CV_CAP_PROP_FRAME_HEIGHT, mFrameHeight);
+              }
+          }
+      }

 #. The last step would be to update the *run()* void in *HelloOpenCVView* class as follows:
-	 .. code-block:: java

-		  public void run() {
-			while (true) {
-				Bitmap bmp = null;
-				synchronized (this) {
-					if (mCamera == null)
-						break;
-					if (!mCamera.grab())
-						break;
-				
-					bmp = processFrame(mCamera);
-				}
-				if (bmp != null) {
-					Canvas canvas = getHolder().lockCanvas();
-					if (canvas != null) {
-						canvas.drawBitmap(bmp, (canvas.getWidth() - bmp.getWidth()) / 2,
-								(canvas.getHeight() - bmp.getHeight()) / 2, null);
-						getHolder().unlockCanvasAndPost(canvas);
-					
-					}
-					bmp.recycle();
-				}
-			}
-		  }
+   .. code-block:: java
+      :linenos:

-		  protected Bitmap processFrame(VideoCapture capture) {
-			Mat mRgba = new Mat();
-			capture.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA);
-			//process mRgba
-			Bitmap bmp = Bitmap.createBitmap(mRgba.cols(), mRgba.rows(), Bitmap.Config.ARGB_8888);
-			try {
-				Utils.matToBitmap(mRgba, bmp);
-			} catch(Exception e) {
-				Log.e("processFrame", "Utils.matToBitmap() throws an exception: " + e.getMessage());
-				bmp.recycle();
-				bmp = null;
-			}
-			return bmp;
-		  }
+      public void run() {
+          while (true) {
+              Bitmap bmp = null;
+              synchronized (this) {
+                  if (mCamera == null)
+                      break;
+                  if (!mCamera.grab())
+                      break;

+                  bmp = processFrame(mCamera);
+              }
+              if (bmp != null) {
+                  Canvas canvas = getHolder().lockCanvas();
+                  if (canvas != null) {
+                      canvas.drawBitmap(bmp, (canvas.getWidth()  - bmp.getWidth())  / 2,
+                                             (canvas.getHeight() - bmp.getHeight()) / 2, null);
+                      getHolder().unlockCanvasAndPost(canvas);

+                  }
+                  bmp.recycle();
+              }
+          }
+      }
+
+      protected Bitmap processFrame(VideoCapture capture) {
+          Mat mRgba = new Mat();
+          capture.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA);
+          //process mRgba
+          Bitmap bmp = Bitmap.createBitmap(mRgba.cols(), mRgba.rows(), Bitmap.Config.ARGB_8888);
+          try {
+              Utils.matToBitmap(mRgba, bmp);
+          } catch(Exception e) {
+              Log.e("processFrame", "Utils.matToBitmap() throws an exception: " + e.getMessage());
+              bmp.recycle();
+              bmp = null;
+          }
+          return bmp;
+      }
--- a/ios/cmake/Modules/Platform/iOS.cmake
+++ b/ios/cmake/Modules/Platform/iOS.cmake
@ -42,6 +42,8 @@ set (CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
 set (CMAKE_C_FLAGS "")
 set (CMAKE_CXX_FLAGS "-headerpad_max_install_names -fvisibility=hidden -fvisibility-inlines-hidden")

+set (CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -fomit-frame-pointer -ffast-math")
+
 if (HAVE_FLAG_SEARCH_PATHS_FIRST)
 	set (CMAKE_C_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}")
 	set (CMAKE_CXX_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}")
--- a/modules/calib3d/src/triangulate.cpp
+++ b/modules/calib3d/src/triangulate.cpp
@ -416,10 +416,10 @@ void cv::triangulatePoints( InputArray _projMatr1, InputArray _projMatr2,
    Mat points1 = _projPoints1.getMat(), points2 = _projPoints2.getMat();

    if((points1.rows == 1 || points1.cols == 1) && points1.channels() == 2)
-        points1 = points1.reshape(1, points1.total()).t();
+        points1 = points1.reshape(1, static_cast<int>(points1.total())).t();

    if((points2.rows == 1 || points2.cols == 1) && points2.channels() == 2)
-        points2 = points2.reshape(1, points2.total()).t();
+        points2 = points2.reshape(1, static_cast<int>(points2.total())).t();

    CvMat cvMatr1 = matr1, cvMatr2 = matr2;
    CvMat cvPoints1 = points1, cvPoints2 = points2;
--- a/modules/calib3d/test/test_affine3d_estimator.cpp
+++ b/modules/calib3d/test/test_affine3d_estimator.cpp
@ -48,6 +48,7 @@ using namespace std;
 #include <string>
 #include <iostream>
 #include <fstream>
+#include <functional>
 #include <iterator>
 #include <limits>
 #include <numeric>
--- a/modules/calib3d/test/test_chesscorners.cpp
+++ b/modules/calib3d/test/test_chesscorners.cpp
@ -42,6 +42,7 @@
 #include "test_precomp.hpp"
 #include "test_chessboardgenerator.hpp"

+#include <functional>
 #include <limits>
 #include <numeric>

--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@ -3,13 +3,14 @@ ocv_add_module(core ${ZLIB_LIBRARIES})
 ocv_module_include_directories(${ZLIB_INCLUDE_DIR})

 if(HAVE_CUDA)
-  file(GLOB lib_cuda "src/cuda/*.cu")
-  source_group("Cuda" FILES "${lib_cuda}")
-
+  ocv_source_group("Src\\Cuda" GLOB "src/cuda/*.cu")  
  ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/src" "${OpenCV_SOURCE_DIR}/modules/gpu/src/cuda" ${CUDA_INCLUDE_DIRS})
  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+  
+  file(GLOB lib_cuda "src/cuda/*.cu")
  ocv_cuda_compile(cuda_objs ${lib_cuda})

+  
  set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
 else()
  set(lib_cuda "")
--- a/modules/core/include/opencv2/core/core.hpp
+++ b/modules/core/include/opencv2/core/core.hpp
@ -440,7 +440,7 @@ template<typename _Tp, int m, int n> class CV_EXPORTS Matx
 {
 public:
    typedef _Tp value_type;
-    typedef Matx<_Tp, MIN(m, n), 1> diag_type;
+    typedef Matx<_Tp, (m < n ? m : n), 1> diag_type;
    typedef Matx<_Tp, m, n> mat_type;
    enum { depth = DataDepth<_Tp>::value, rows = m, cols = n, channels = rows*cols,
           type = CV_MAKETYPE(depth, channels) };
@ -4620,6 +4620,34 @@ public:

 CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body);

+/////////////////////////// Synchronization Primitives ///////////////////////////////
+
+class CV_EXPORTS Mutex // copyable handle to a reference-counted, platform-specific lock (Mutex::Impl)
+{
+public:
+    Mutex();                            // creates a fresh Impl
+    ~Mutex();                           // drops this handle's reference to Impl
+    Mutex(const Mutex& m);              // shares m's Impl rather than creating a new lock
+    Mutex& operator = (const Mutex& m); // releases the current Impl and shares m's
+    // The three calls below forward to the shared Impl.
+    void lock();
+    bool trylock();                     // non-blocking attempt; presumably true when the lock was acquired (confirm against the platform try-lock calls)
+    void unlock();
+    
+    struct Impl;                        // defined per platform in the implementation file
+protected:
+    Impl* impl;
+};
+
+// Scoped lock guard: acquires the given Mutex on construction and releases it
+// on destruction.
+class CV_EXPORTS AutoLock
+{
+public:
+    AutoLock(Mutex& m) : mutex(&m) { mutex->lock(); }
+    ~AutoLock() { mutex->unlock(); }
+protected:
+    Mutex* mutex;
+private:
+    // Non-copyable: a copied guard would call unlock() a second time on the
+    // same Mutex when it is destroyed. Declared but intentionally not defined.
+    AutoLock(const AutoLock&);
+    AutoLock& operator = (const AutoLock&);
+};
+
 }

 #endif // __cplusplus
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@ -72,9 +72,11 @@ namespace cv { namespace gpu
        FEATURE_SET_COMPUTE_13 = 13,
        FEATURE_SET_COMPUTE_20 = 20,
        FEATURE_SET_COMPUTE_21 = 21,
+        FEATURE_SET_COMPUTE_30 = 30,
        GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
        SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
-        NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13
+        NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
+        WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
    };

    // Gives information about what GPU archs this OpenCV GPU module was
--- a/modules/core/src/cuda/matrix_operations.cu
+++ b/modules/core/src/cuda/matrix_operations.cu
@ -44,7 +44,7 @@
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    template <typename T> struct shift_and_sizeof;
    template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
@ -272,7 +272,7 @@ namespace cv { namespace gpu { namespace device
    template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
    {
    };
-        
+
    template<typename T, typename D>
    void cvt_(DevMem2Db src, DevMem2Db dst, double alpha, double beta, cudaStream_t stream)
    {
@ -282,6 +282,11 @@ namespace cv { namespace gpu { namespace device
        cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, WithOutMask(), stream);
    }

+#if defined  __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wmissing-declarations"
+#endif
+
    void convert_gpu(DevMem2Db src, int sdepth, DevMem2Db dst, int ddepth, double alpha, double beta, cudaStream_t stream)
    {
        typedef void (*caller_t)(DevMem2Db src, DevMem2Db dst, double alpha, double beta, cudaStream_t stream);
@ -318,4 +323,8 @@ namespace cv { namespace gpu { namespace device

        func(src, dst, alpha, beta, stream);
    }
+
+#if defined __clang__
+# pragma clang diagnostic pop
+#endif
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@ -1199,10 +1199,6 @@ namespace

        void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
        {
-            NppiSize sz;
-            sz.width  = m.cols;
-            sz.height = m.rows;
-
            if (mask.empty())
            {
                if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@ -42,6 +42,16 @@

 #include "precomp.hpp"

+#if !defined HAVE_TBB && !defined HAVE_OPENMP && !defined HAVE_GCD && !defined HAVE_CONCURRENCY
+
+#ifdef __APPLE__
+#define HAVE_GCD
+#elif defined _MSC_VER && _MSC_VER >= 1600
+#define HAVE_CONCURRENCY
+#endif
+
+#endif
+
 #ifdef HAVE_CONCURRENCY
 #  include <ppl.h>
 #elif defined HAVE_OPENMP
@ -106,7 +116,22 @@ namespace cv

 #elif defined HAVE_CONCURRENCY

-        Concurrency::parallel_for(range.start, range.end, body);
+        class ConcurrencyProxyLoopBody // adapts a ParallelLoopBody to a functor that takes a single int index
+        {
+        public:
+            ConcurrencyProxyLoopBody(const ParallelLoopBody& body) : _body(body) {}
+            // Forwards index i to the wrapped body as Range(i, i + 1).
+            void operator ()(int i) const
+            {
+                _body(Range(i, i + 1));
+            }
+
+        private:
+            const ParallelLoopBody& _body; // held by reference; the wrapped body must outlive this proxy
+            ConcurrencyProxyLoopBody& operator=(const ConcurrencyProxyLoopBody&) {return *this;} // private no-op assignment: the reference member cannot be reseated
+        } proxy(body);
+
+        Concurrency::parallel_for(range.start, range.end, proxy);

 #elif defined HAVE_OPENMP

--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@ -930,4 +930,104 @@ BOOL WINAPI DllMain( HINSTANCE, DWORD  fdwReason, LPVOID )
 }
 #endif

+namespace cv
+{
+
+#if defined WIN32 || defined _WIN32 || defined WINCE
+
+struct Mutex::Impl // Windows variant: wraps a CRITICAL_SECTION
+{
+    Impl() { InitializeCriticalSection(&cs); refcount = 1; }
+    ~Impl() { DeleteCriticalSection(&cs); }
+    // lock/trylock/unlock forward directly to the critical-section API.
+    void lock() { EnterCriticalSection(&cs); }
+    bool trylock() { return TryEnterCriticalSection(&cs) != 0; }
+    void unlock() { LeaveCriticalSection(&cs); }
+
+    CRITICAL_SECTION cs;
+    int refcount; // starts at 1; adjusted by the Mutex copy/assign/destroy code via CV_XADD
+};
+
+#elif defined __APPLE__
+
+#include <libkern/OSAtomic.h>
+
+struct Mutex::Impl // Apple variant: wraps an OSSpinLock
+{
+    Impl() { sl = OS_SPINLOCK_INIT; refcount = 1; }
+    ~Impl() {} // nothing to release: the lock is a plain value initialised from OS_SPINLOCK_INIT
+    // lock/trylock/unlock forward directly to the OSSpinLock* calls.
+    void lock() { OSSpinLockLock(&sl); }
+    bool trylock() { return OSSpinLockTry(&sl); }
+    void unlock() { OSSpinLockUnlock(&sl); }
+
+    OSSpinLock sl;
+    int refcount; // starts at 1; adjusted by the Mutex copy/assign/destroy code via CV_XADD
+};
+
+#elif defined __linux__ && !defined ANDROID
+
+struct Mutex::Impl // Linux (non-Android) variant: wraps a pthread spinlock, not a pthread mutex
+{
+    Impl() { pthread_spin_init(&sl, 0); refcount = 1; }
+    ~Impl() { pthread_spin_destroy(&sl); }
+    // lock/trylock/unlock forward directly to the pthread_spin_* calls.
+    void lock() { pthread_spin_lock(&sl); }
+    bool trylock() { return pthread_spin_trylock(&sl) == 0; } // a zero return code is reported as true
+    void unlock() { pthread_spin_unlock(&sl); }
+
+    pthread_spinlock_t sl;
+    int refcount; // starts at 1; adjusted by the Mutex copy/assign/destroy code via CV_XADD
+};
+
+#else
+
+struct Mutex::Impl // fallback variant for all other platforms: wraps a pthread mutex
+{
+    Impl() { pthread_mutex_init(&sl, 0); refcount = 1; }
+    ~Impl() { pthread_mutex_destroy(&sl); }
+    // lock/trylock/unlock forward directly to the pthread_mutex_* calls.
+    void lock() { pthread_mutex_lock(&sl); }
+    bool trylock() { return pthread_mutex_trylock(&sl) == 0; } // a zero return code is reported as true
+    void unlock() { pthread_mutex_unlock(&sl); }
+
+    pthread_mutex_t sl; // named sl like the spinlock variants, but this is a pthread_mutex_t
+    int refcount; // starts at 1; adjusted by the Mutex copy/assign/destroy code via CV_XADD
+};
+
+#endif
+
+Mutex::Mutex()
+{
+    impl = new Mutex::Impl; // Impl's constructor sets refcount to 1 for this first handle
+}
+
+Mutex::~Mutex()
+{
+    if( CV_XADD(&impl->refcount, -1) == 1 ) // drop this handle's reference; == 1 presumably means it was the last one (assumes CV_XADD returns the previous value — TODO confirm)
+        delete impl;
+    impl = 0;
+}
+
+Mutex::Mutex(const Mutex& m)
+{
+    impl = m.impl; // share m's underlying lock instead of creating a new one
+    CV_XADD(&impl->refcount, 1); // record the additional handle
+}
+
+Mutex& Mutex::operator = (const Mutex& m)
+{
+    CV_XADD(&m.impl->refcount, 1); // take the new reference first, so self-assignment cannot drop the count to zero below
+    if( CV_XADD(&impl->refcount, -1) == 1 ) // release the old Impl; == 1 presumably means this was its last handle (assumes CV_XADD returns the previous value — TODO confirm)
+        delete impl;
+    impl = m.impl;
+    return *this;
+}
+
+void Mutex::lock() { impl->lock(); } // these three are thin forwarders to the platform-specific Impl
+void Mutex::unlock() { impl->unlock(); }
+bool Mutex::trylock() { return impl->trylock(); }
+
+}
+
 /* End of file. */
--- a/modules/features2d/src/features2d_init.cpp
+++ b/modules/features2d/src/features2d_init.cpp
@ -59,7 +59,7 @@ CV_INIT_ALGORITHM(BriefDescriptorExtractor, "Feature2D.BRIEF",
 CV_INIT_ALGORITHM(FastFeatureDetector, "Feature2D.FAST",
                  obj.info()->addParam(obj, "threshold", obj.threshold);
                  obj.info()->addParam(obj, "nonmaxSuppression", obj.nonmaxSuppression);
-                  obj.info()->addParam(obj, "type", obj.type, FastFeatureDetector::TYPE_9_16));
+                  obj.info()->addParam(obj, "type", obj.type));

 ///////////////////////////////////////////////////////////////////////////////////////////////////////////

--- a/modules/features2d/test/test_fast.cpp
+++ b/modules/features2d/test/test_fast.cpp
@ -75,8 +75,8 @@ void CV_FastTest::run( int )

    vector<KeyPoint> keypoints1;
    vector<KeyPoint> keypoints2;
-    FAST(gray1, keypoints1, 30, type);
-    FAST(gray2, keypoints2, 30, type);
+    FAST(gray1, keypoints1, 30, true, type);
+    FAST(gray2, keypoints2, 30, true, type);

    for(size_t i = 0; i < keypoints1.size(); ++i)
    {
--- a/modules/features2d/test/test_nearestneighbors.cpp
+++ b/modules/features2d/test/test_nearestneighbors.cpp
@ -200,7 +200,7 @@ int CV_KDTreeTest_CPP::checkGetPoins( const Mat& data )

 int CV_KDTreeTest_CPP::checkFindBoxed()
 {
-    vector<float> min( dims, minValue), max(dims, maxValue);
+    vector<float> min( dims, static_cast<float>(minValue)), max(dims, static_cast<float>(maxValue));
    vector<int> indices;
    tr->findOrthoRange( min, max, indices );
    // TODO check indices
@ -214,8 +214,8 @@ int CV_KDTreeTest_CPP::findNeighbors( Mat& points, Mat& neighbors )
    const int emax = 20;
    Mat neighbors2( neighbors.size(), CV_32SC1 );
    int j;
-    vector<float> min(points.cols, minValue);
-    vector<float> max(points.cols, maxValue);
+    vector<float> min(points.cols, static_cast<float>(minValue));
+    vector<float> max(points.cols, static_cast<float>(maxValue));
    for( int pi = 0; pi < points.rows; pi++ )
    {
        // 1st way
--- a/modules/features2d/test/test_rotation_and_scale_invariance.cpp
+++ b/modules/features2d/test/test_rotation_and_scale_invariance.cpp
@ -54,7 +54,7 @@ static
 Mat generateHomography(float angle)
 {
    // angle - rotation around Oz in degrees
-    float angleRadian = angle * CV_PI / 180.;
+    float angleRadian = static_cast<float>(angle * CV_PI / 180);
    Mat H = Mat::eye(3, 3, CV_32FC1);
    H.at<float>(0,0) = H.at<float>(1,1) = std::cos(angleRadian);
    H.at<float>(0,1) = -std::sin(angleRadian);
@ -69,8 +69,8 @@ Mat rotateImage(const Mat& srcImage, float angle, Mat& dstImage, Mat& dstMask)
    // angle - rotation around Oz in degrees
    float diag = std::sqrt(static_cast<float>(srcImage.cols * srcImage.cols + srcImage.rows * srcImage.rows));
    Mat LUShift = Mat::eye(3, 3, CV_32FC1); // left up
-    LUShift.at<float>(0,2) = -srcImage.cols/2;
-    LUShift.at<float>(1,2) = -srcImage.rows/2;
+    LUShift.at<float>(0,2) = static_cast<float>(-srcImage.cols/2);
+    LUShift.at<float>(1,2) = static_cast<float>(-srcImage.rows/2);
    Mat RDShift = Mat::eye(3, 3, CV_32FC1); // right down
    RDShift.at<float>(0,2) = diag/2;
    RDShift.at<float>(1,2) = diag/2;
@ -114,7 +114,7 @@ void scaleKeyPoints(const vector<KeyPoint>& src, vector<KeyPoint>& dst, float sc
 static
 float calcCirclesIntersectArea(const Point2f& p0, float r0, const Point2f& p1, float r1)
 {
-    float c = norm(p0 - p1), sqr_c = c * c;
+    float c = static_cast<float>(norm(p0 - p1)), sqr_c = c * c;

    float sqr_r0 = r0 * r0;
    float sqr_r1 = r1 * r1;
@ -125,7 +125,7 @@ float calcCirclesIntersectArea(const Point2f& p0, float r0, const Point2f& p1, f
    float minR = std::min(r0, r1);
    float maxR = std::max(r0, r1);
    if(c + minR <= maxR)
-        return CV_PI * minR * minR;
+        return static_cast<float>(CV_PI * minR * minR);

    float cos_halfA0 = (sqr_r0 + sqr_c - sqr_r1) / (2 * r0 * c);
    float cos_halfA1 = (sqr_r1 + sqr_c - sqr_r0) / (2 * r1 * c);
@ -133,15 +133,15 @@ float calcCirclesIntersectArea(const Point2f& p0, float r0, const Point2f& p1, f
    float A0 = 2 * acos(cos_halfA0);
    float A1 = 2 * acos(cos_halfA1);

-    return  0.5 * sqr_r0 * (A0 - sin(A0)) +
-            0.5 * sqr_r1 * (A1 - sin(A1));
+    return  0.5f * sqr_r0 * (A0 - sin(A0)) +
+            0.5f * sqr_r1 * (A1 - sin(A1));
 }

 static
 float calcIntersectRatio(const Point2f& p0, float r0, const Point2f& p1, float r1)
 {
    float intersectArea = calcCirclesIntersectArea(p0, r0, p1, r1);
-    float unionArea = CV_PI * (r0 * r0 + r1 * r1) - intersectArea;
+    float unionArea = static_cast<float>(CV_PI) * (r0 * r0 + r1 * r1) - intersectArea;
    return intersectArea / unionArea;
 }

@ -160,7 +160,7 @@ void matchKeyPoints(const vector<KeyPoint>& keypoints0, const Mat& H,

    matches.clear();
    vector<uchar> usedMask(keypoints1.size(), 0);
-    for(size_t i0 = 0; i0 < keypoints0.size(); i0++)
+    for(int i0 = 0; i0 < static_cast<int>(keypoints0.size()); i0++)
    {
        int nearestPointIndex = -1;
        float maxIntersectRatio = 0.f;
@ -176,7 +176,7 @@ void matchKeyPoints(const vector<KeyPoint>& keypoints0, const Mat& H,
            if(intersectRatio > maxIntersectRatio)
            {
                maxIntersectRatio = intersectRatio;
-                nearestPointIndex = i1;
+                nearestPointIndex = static_cast<int>(i1);
            }
        }

@ -222,7 +222,7 @@ protected:
        const int maxAngle = 360, angleStep = 15;
        for(int angle = 0; angle < maxAngle; angle += angleStep)
        {
-            Mat H = rotateImage(image0, angle, image1, mask1);
+            Mat H = rotateImage(image0, static_cast<float>(angle), image1, mask1);

            vector<KeyPoint> keypoints1;
            featureDetector->detect(image1, keypoints1, mask1);
@ -339,10 +339,10 @@ protected:
        const int maxAngle = 360, angleStep = 15;
        for(int angle = 0; angle < maxAngle; angle += angleStep)
        {
-            Mat H = rotateImage(image0, angle, image1, mask1);
+            Mat H = rotateImage(image0, static_cast<float>(angle), image1, mask1);

            vector<KeyPoint> keypoints1;
-            rotateKeyPoints(keypoints0, H, angle, keypoints1);
+            rotateKeyPoints(keypoints0, H, static_cast<float>(angle), keypoints1);
            Mat descriptors1;
            descriptorExtractor->compute(image1, keypoints1, descriptors1);

@ -457,7 +457,7 @@ protected:
                keyPointMatchesCount++;

                // Check does this inlier have consistent sizes
-                const float maxSizeDiff = 0.8;//0.9f; // grad
+                const float maxSizeDiff = 0.8f;//0.9f; // grad
                float size0 = keypoints0[matches[m].trainIdx].size;
                float size1 = osiKeypoints1[matches[m].queryIdx].size;
                CV_Assert(size0 > 0 && size1 > 0);
@ -545,7 +545,7 @@ protected:
            resize(image0, image1, Size(), 1./scale, 1./scale);

            vector<KeyPoint> keypoints1;
-            scaleKeyPoints(keypoints0, keypoints1, 1./scale);
+            scaleKeyPoints(keypoints0, keypoints1, 1.0f/scale);
            Mat descriptors1;
            descriptorExtractor->compute(image1, keypoints1, descriptors1);

--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@ -111,43 +111,3 @@ ocv_add_accuracy_tests(FILES "Include" ${test_hdrs}
                       FILES "Src" ${test_srcs}
                       ${nvidia})
 ocv_add_perf_tests()
-
-
-
-set(perf_cpu_path "${CMAKE_CURRENT_SOURCE_DIR}/perf_cpu")
-if(BUILD_PERF_TESTS AND EXISTS "${perf_cpu_path}")
-    # opencv_highgui is required for imread/imwrite
-    set(perf_deps ${the_module} opencv_ts opencv_highgui opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree)
-    ocv_check_dependencies(${perf_deps})
-
-    if(OCV_DEPENDENCIES_FOUND)
-      set(the_target "opencv_perf_gpu_cpu")
-
-      ocv_module_include_directories(${perf_deps} "${perf_cpu_path}")
-
-      if(NOT OPENCV_PERF_${the_module}_CPU_SOURCES)
-        file(GLOB perf_srcs "${perf_cpu_path}/*.cpp")
-        file(GLOB perf_hdrs "${perf_cpu_path}/*.hpp" "${perf_cpu_path}/*.h")
-        source_group("Src" FILES ${perf_srcs})
-        source_group("Include" FILES ${perf_hdrs})
-        set(OPENCV_PERF_${the_module}_CPU_SOURCES ${perf_srcs} ${perf_hdrs})
-      endif()
-
-      add_executable(${the_target} ${OPENCV_PERF_${the_module}_CPU_SOURCES})
-      target_link_libraries(${the_target} ${OPENCV_MODULE_${the_module}_DEPS} ${perf_deps} ${OPENCV_LINKER_LIBS})
-
-      # Additional target properties
-      set_target_properties(${the_target} PROPERTIES
-        DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
-        RUNTIME_OUTPUT_DIRECTORY "${EXECUTABLE_OUTPUT_PATH}"
-      )
-
-      if(ENABLE_SOLUTION_FOLDERS)
-        set_target_properties(${the_target} PROPERTIES FOLDER "tests performance")
-      endif()
-
-      ocv_add_precompiled_headers(${the_target})
-    else(OCV_DEPENDENCIES_FOUND)
-      #TODO: warn about unsatisfied dependencies
-    endif(OCV_DEPENDENCIES_FOUND)
-  endif()
--- a/modules/gpu/doc/object_detection.rst
+++ b/modules/gpu/doc/object_detection.rst
@ -204,7 +204,7 @@ gpu::CascadeClassifier_GPU
 --------------------------
 .. ocv:class:: gpu::CascadeClassifier_GPU

-Cascade classifier class used for object detection. ::
+Cascade classifier class used for object detection. Supports HAAR and LBP cascades. ::

    class CV_EXPORTS CascadeClassifier_GPU
    {
@ -219,6 +219,7 @@ Cascade classifier class used for object detection. ::

            /* Returns number of detected objects */
            int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size());
+            int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);

            /* Finds only the largest object. Special mode if training is required.*/
            bool findLargestObject;
@ -233,11 +234,11 @@ Cascade classifier class used for object detection. ::

 gpu::CascadeClassifier_GPU::CascadeClassifier_GPU
 -----------------------------------------------------
-Loads the classifier from a file.
+Loads the classifier from a file. The cascade type is detected automatically from the constructor parameter.

 .. ocv:function:: gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const string& filename)

-    :param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported.
+    :param filename: Name of the file from which the classifier is loaded. For HAAR cascades, only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported; for LBP cascades, only the new type of OpenCV XML cascade is supported.



@ -255,8 +256,7 @@ Loads the classifier from a file. The previous content is destroyed.

 .. ocv:function:: bool gpu::CascadeClassifier_GPU::load(const string& filename)

-    :param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported.
-
+    :param filename: Name of the file from which the classifier is loaded. For HAAR cascades, only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported; for LBP cascades, only the new type of OpenCV XML cascade is supported.


 gpu::CascadeClassifier_GPU::release
@ -273,13 +273,17 @@ Detects objects of different sizes in the input image.

 .. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size())

+.. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4)
+
    :param image: Matrix of type  ``CV_8U``  containing an image where objects should be detected.

    :param objectsBuf: Buffer to store detected objects (rectangles). If it is empty, it is allocated with the default size. If not empty, the function searches not more than N objects, where ``N = sizeof(objectsBufer's data)/sizeof(cv::Rect)``.

-    :param scaleFactor: Value to specify how much the image size is reduced at each image scale.
+    :param maxObjectSize: Maximum possible object size. Objects larger than that are ignored. Used only by the second signature and supported only for LBP cascades.

-    :param minNeighbors: Value to specify how many neighbours each candidate rectangle has to retain.
+    :param scaleFactor:  Parameter specifying how much the image size is reduced at each image scale.
+
+    :param minNeighbors: Parameter specifying how many neighbors each candidate rectangle should have to retain it.

    :param minSize: Minimum possible object size. Objects smaller than that are ignored.

--- a/modules/gpu/doc/video.rst
+++ b/modules/gpu/doc/video.rst
@ -653,7 +653,7 @@ gpu::GMG_GPU
 ------------
 .. ocv:class:: gpu::GMG_GPU

-Class used for background/foreground segmentation. ::
+  Class used for background/foreground segmentation. ::

    class GMG_GPU_GPU
    {
@ -677,9 +677,9 @@ Class used for background/foreground segmentation. ::
        ...
    };

-The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements algorithm described in [GMG2012]_.
+  The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements the algorithm described in [GMG2012]_.

-Here are important members of the class that control the algorithm, which you can set after constructing the class instance:
+  Here are important members of the class that control the algorithm, which you can set after constructing the class instance:

    .. ocv:member:: int maxFeatures

--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@ -820,6 +820,7 @@ private:
    int nLayers_;
 };

+//! HoughLines
 CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
 CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
 CV_EXPORTS void HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta);
--- a/modules/gpu/misc/mark_nvidia.py
+++ b/modules/gpu/misc/mark_nvidia.py
@ -1,255 +1,234 @@
 import sys, re

 spaces = '[\s]*'
-symbols = '[\s\w\d,.=:|]*'
+symbols = '[\s\w\d,.:|]*'

 def pattern1(prefix, test):
-    return re.compile(spaces + 'perf::' + prefix + '/' + test + '::' + '\(' + symbols + '\)' + spaces)
+    return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + '\)' + spaces)

-def pattern2(prefix, test, cvtype):
-    return re.compile(spaces + 'perf::' + prefix + '/' + test + '::' + '\(' + symbols + cvtype + symbols + '\)' + spaces)
+def pattern2(prefix, test, param1):
+    return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + param1 + symbols + '\)' + spaces)

-def pattern3(prefix, test, cvtype, param1):
-    return re.compile(spaces + 'perf::' + prefix + '/' + test + '::' + '\(' + symbols + cvtype + symbols + param1 + symbols + '\)' + spaces)
+def pattern3(prefix, test, param1, param2):
+    return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + param1 + symbols + param2 + symbols + '\)' + spaces)

-def pattern4(prefix, test, cvtype, param1, param2):
-    return re.compile(spaces + 'perf::' + prefix + '/' + test + '::' + '\(' + symbols + cvtype + symbols + param1 + symbols + param2 + symbols + '\)' + spaces)
+def pattern4(prefix, test, param1, param2, param3):
+    return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + param1 + symbols + param2 + symbols + param3 + symbols + '\)' + spaces)
+
+def pattern5(prefix, test, param1, param2, param3, param4):  # like pattern4, but the parenthesised list must contain param1..param4 in this order
+    return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + param1 + symbols + param2 + symbols + param3 + symbols + param4 + symbols + '\)' + spaces)

 npp_patterns = [
    ##############################################################
    # Core
-    
-    # Core/Add_Mat (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Add_Mat', '8U'),
-    pattern2('Core', 'Add_Mat', '16U'),
-    pattern2('Core', 'Add_Mat', '32F'),
-    
-    # Core/Add_Scalar (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Add_Scalar', '8U'),
-    pattern2('Core', 'Add_Scalar', '16U'),
-    pattern2('Core', 'Add_Scalar', '32F'),
-    
-    # Core/Subtract_Mat (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Subtract_Mat', '8U'),
-    pattern2('Core', 'Subtract_Mat', '16U'),
-    pattern2('Core', 'Subtract_Mat', '32F'),
-    
-    # Core/Subtract_Scalar (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Subtract_Scalar', '8U'),
-    pattern2('Core', 'Subtract_Scalar', '16U'),
-    pattern2('Core', 'Subtract_Scalar', '32F'),
-    
-    # Core/Multiply_Mat (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Multiply_Mat', '8U'),
-    pattern2('Core', 'Multiply_Mat', '16U'),
-    pattern2('Core', 'Multiply_Mat', '32F'),
-    
-    # Core/Multiply_Scalar (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Multiply_Scalar', '8U'),
-    pattern2('Core', 'Multiply_Scalar', '16U'),
-    pattern2('Core', 'Multiply_Scalar', '32F'),
-    
-    # Core/Divide_Mat (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Divide_Mat', '8U'),
-    pattern2('Core', 'Divide_Mat', '16U'),
-    pattern2('Core', 'Divide_Mat', '32F'),
-    
-    # Core/Divide_Scalar (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Divide_Scalar', '8U'),
-    pattern2('Core', 'Divide_Scalar', '16U'),
-    pattern2('Core', 'Divide_Scalar', '32F'),
-    
-    # Core/AbsDiff_Mat (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'AbsDiff_Mat', '8U'),
-    pattern2('Core', 'AbsDiff_Mat', '16U'),
-    pattern2('Core', 'AbsDiff_Mat', '32F'),
-    
-    # Core/AbsDiff_Scalar (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'AbsDiff_Scalar', '8U'),
-    pattern2('Core', 'AbsDiff_Scalar', '16U'),
-    pattern2('Core', 'AbsDiff_Scalar', '32F'),

-    # Core/Abs
+    # Core_AddMat (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'AddMat', '8U'),
+    pattern2('Core', 'AddMat', '16U'),
+    pattern2('Core', 'AddMat', '32F'),
+
+    # Core_AddScalar (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'AddScalar', '8U'),
+    pattern2('Core', 'AddScalar', '16U'),
+    pattern2('Core', 'AddScalar', '32F'),
+
+    # Core_SubtractMat (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'SubtractMat', '8U'),
+    pattern2('Core', 'SubtractMat', '16U'),
+    pattern2('Core', 'SubtractMat', '32F'),
+
+    # Core_SubtractScalar (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'SubtractScalar', '8U'),
+    pattern2('Core', 'SubtractScalar', '16U'),
+    pattern2('Core', 'SubtractScalar', '32F'),
+
+    # Core_MultiplyMat (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'MultiplyMat', '8U'),
+    pattern2('Core', 'MultiplyMat', '16U'),
+    pattern2('Core', 'MultiplyMat', '32F'),
+
+    # Core_MultiplyScalar (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'MultiplyScalar', '8U'),
+    pattern2('Core', 'MultiplyScalar', '16U'),
+    pattern2('Core', 'MultiplyScalar', '32F'),
+
+    # Core_DivideMat (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'DivideMat', '8U'),
+    pattern2('Core', 'DivideMat', '16U'),
+    pattern2('Core', 'DivideMat', '32F'),
+
+    # Core_DivideScalar (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'DivideScalar', '8U'),
+    pattern2('Core', 'DivideScalar', '16U'),
+    pattern2('Core', 'DivideScalar', '32F'),
+
+    # Core_AbsDiffMat (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'AbsDiffMat', '8U'),
+    pattern2('Core', 'AbsDiffMat', '16U'),
+    pattern2('Core', 'AbsDiffMat', '32F'),
+
+    # Core_AbsDiffScalar (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'AbsDiffScalar', '8U'),
+    pattern2('Core', 'AbsDiffScalar', '16U'),
+    pattern2('Core', 'AbsDiffScalar', '32F'),
+
+    # Core_Abs
    pattern1('Core', 'Abs'),

-    # Core/Sqr
+    # Core_Sqr
    pattern1('Core', 'Sqr'),

-    # Core/Sqrt
+    # Core_Sqrt
    pattern1('Core', 'Sqrt'),

-    # Core/Log
+    # Core_Log
    pattern1('Core', 'Log'),

-    # Core/Exp
+    # Core_Exp
    pattern1('Core', 'Exp'),

-    # Core/Bitwise_And_Scalar
-    pattern1('Core', 'Bitwise_And_Scalar'),
+    # Core_BitwiseAndScalar
+    pattern1('Core', 'BitwiseAndScalar'),

-    # Core/Bitwise_Or_Scalar
-    pattern1('Core', 'Bitwise_Or_Scalar'),
+    # Core_BitwiseOrScalar
+    pattern1('Core', 'BitwiseOrScalar'),

-    # Core/Bitwise_Xor_Scalar
-    pattern1('Core', 'Bitwise_Xor_Scalar'),
+    # Core_BitwiseXorScalar
+    pattern1('Core', 'BitwiseXorScalar'),

-    # Core/RShift
+    # Core_RShift
    pattern1('Core', 'RShift'),

-    # Core/LShift
+    # Core_LShift
    pattern1('Core', 'LShift'),

-    # Core/Transpose
+    # Core_Transpose
    pattern1('Core', 'Transpose'),

-    # Core/Flip
+    # Core_Flip
    pattern1('Core', 'Flip'),

-    # Core/LUT_OneChannel
-    pattern1('Core', 'LUT_OneChannel'),
+    # Core_LutOneChannel
+    pattern1('Core', 'LutOneChannel'),

-    # Core/LUT_MultiChannel
-    pattern1('Core', 'LUT_MultiChannel'),
+    # Core_LutMultiChannel
+    pattern1('Core', 'LutMultiChannel'),

-    # Core/Magnitude_Complex
-    pattern1('Core', 'Magnitude_Complex'),
+    # Core_MagnitudeComplex
+    pattern1('Core', 'MagnitudeComplex'),

-    # Core/Magnitude_Sqr_Complex
-    pattern1('Core', 'Magnitude_Sqr_Complex'),
+    # Core_MagnitudeSqrComplex
+    pattern1('Core', 'MagnitudeSqrComplex'),

-    # Core/MeanStdDev
+    # Core_MeanStdDev
    pattern1('Core', 'MeanStdDev'),

-    # Core/NormDiff
+    # Core_NormDiff
    pattern1('Core', 'NormDiff'),
-    
+
    ##############################################################
    # Filters

-    # Filters/Blur
+    # Filters_Blur
    pattern1('Filters', 'Blur'),
-    
-    # Filters/Erode
+
+    # Filters_Erode
    pattern1('Filters', 'Erode'),
-    
-    # Filters/Dilate
+
+    # Filters_Dilate
    pattern1('Filters', 'Dilate'),
-    
-    # Filters/MorphologyEx
+
+    # Filters_MorphologyEx
    pattern1('Filters', 'MorphologyEx'),
-    
+
    ##############################################################
    # ImgProc
-    
-    # ImgProc/Resize (8UC1 | 8UC4, INTER_NEAREST | INTER_LINEAR)
-    pattern3('ImgProc', 'Resize', '8UC1', 'INTER_NEAREST'),
-    pattern3('ImgProc', 'Resize', '8UC4', 'INTER_NEAREST'),
-    pattern3('ImgProc', 'Resize', '8UC1', 'INTER_LINEAR'),
-    pattern3('ImgProc', 'Resize', '8UC4', 'INTER_LINEAR'),
-    
-    # ImgProc/Resize (8UC4, INTER_CUBIC)
-    pattern3('ImgProc', 'Resize', '8UC4', 'INTER_CUBIC'),
-    
-    # ImgProc/WarpAffine (8UC1 | 8UC3 | 8UC4 | 32FC1 | 32FC3 | 32FC4, INTER_NEAREST | INTER_LINEAR | INTER_CUBIC, BORDER_CONSTANT)
-    pattern4('ImgProc', 'WarpAffine', '8UC1', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC1', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC1', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC3', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC3', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC3', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC4', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC4', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC4', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC1', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC1', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC1', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC3', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC3', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC3', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC4', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC4', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC4', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    
-    # ImgProc/WarpPerspective (8UC1 | 8UC3 | 8UC4 | 32FC1 | 32FC3 | 32FC4, INTER_NEAREST | INTER_LINEAR | INTER_CUBIC, BORDER_CONSTANT)
-    pattern4('ImgProc', 'WarpPerspective', '8UC1', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC1', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC1', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC3', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC3', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC3', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC4', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC4', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC4', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC1', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC1', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC1', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC3', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC3', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC3', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC4', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC4', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC4', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    
-    # ImgProc/CopyMakeBorder (8UC1 | 8UC4 | 32SC1 | 32FC1, BORDER_CONSTANT)
-    pattern3('ImgProc', 'CopyMakeBorder', '8UC1', 'BORDER_CONSTANT'),
-    pattern3('ImgProc', 'CopyMakeBorder', '8UC4', 'BORDER_CONSTANT'),
-    pattern3('ImgProc', 'CopyMakeBorder', '32SC1', 'BORDER_CONSTANT'),
-    pattern3('ImgProc', 'CopyMakeBorder', '32FC1', 'BORDER_CONSTANT'),
-    
-    # ImgProc/Threshold (32F, THRESH_TRUNC)
+
+    # ImgProc_Resize (8U, 1 | 4, INTER_NEAREST | INTER_LINEAR)
+    pattern4('ImgProc', 'Resize', '8U', '1', 'INTER_NEAREST'),
+    pattern4('ImgProc', 'Resize', '8U', '4', 'INTER_NEAREST'),
+    pattern4('ImgProc', 'Resize', '8U', '1', 'INTER_LINEAR'),
+    pattern4('ImgProc', 'Resize', '8U', '4', 'INTER_LINEAR'),
+
+    # ImgProc_Resize (8U, 4, INTER_CUBIC)
+    pattern4('ImgProc', 'Resize', '8U', '4', 'INTER_CUBIC'),
+
+    # ImgProc_WarpAffine (8U | 32F, INTER_NEAREST | INTER_LINEAR | INTER_CUBIC, BORDER_CONSTANT)
+    pattern4('ImgProc', 'WarpAffine', '8U' , 'INTER_NEAREST', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpAffine', '8U' , 'INTER_LINEAR', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpAffine', '8U' , 'INTER_CUBIC', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpAffine', '32F', 'INTER_NEAREST', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpAffine', '32F', 'INTER_LINEAR', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpAffine', '32F', 'INTER_CUBIC', 'BORDER_CONSTANT'),
+
+    # ImgProc_WarpPerspective (8U | 32F, INTER_NEAREST | INTER_LINEAR | INTER_CUBIC, BORDER_CONSTANT)
+    pattern4('ImgProc', 'WarpPerspective', '8U' , 'INTER_NEAREST', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpPerspective', '8U' , 'INTER_LINEAR', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpPerspective', '8U' , 'INTER_CUBIC', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpPerspective', '32F', 'INTER_NEAREST', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpPerspective', '32F', 'INTER_LINEAR', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpPerspective', '32F', 'INTER_CUBIC', 'BORDER_CONSTANT'),
+
+    # ImgProc_CopyMakeBorder (8UC1 | 8UC4 | 32SC1 | 32FC1, BORDER_CONSTANT)
+    pattern4('ImgProc', 'CopyMakeBorder', '8U' , '1', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'CopyMakeBorder', '8U' , '4', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'CopyMakeBorder', '32S', '1', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'CopyMakeBorder', '32F', '1', 'BORDER_CONSTANT'),
+
+    # ImgProc_Threshold (32F, THRESH_TRUNC)
    pattern3('ImgProc', 'Threshold', '32F', 'THRESH_TRUNC'),

-    # ImgProc/Integral_Sqr
-    pattern1('ImgProc', 'Integral_Sqr'),
+    # ImgProc_IntegralSqr
+    pattern1('ImgProc', 'IntegralSqr'),

-    # ImgProc/HistEven_OneChannel
-    pattern1('ImgProc', 'HistEven_OneChannel'),
+    # ImgProc_HistEvenOneChannel
+    pattern1('ImgProc', 'HistEvenOneChannel'),

-    # ImgProc/HistEven_FourChannel
-    pattern1('ImgProc', 'HistEven_FourChannel'),
+    # ImgProc_HistEvenFourChannel
+    pattern1('ImgProc', 'HistEvenFourChannel'),

-    # ImgProc/Rotate
+    # ImgProc_Rotate
    pattern1('ImgProc', 'Rotate'),

-    # ImgProc/SwapChannels
+    # ImgProc_SwapChannels
    pattern1('ImgProc', 'SwapChannels'),

-    # ImgProc/AlphaComp
+    # ImgProc_AlphaComp
    pattern1('ImgProc', 'AlphaComp'),

-    # ImgProc/ImagePyramid_build
-    pattern1('ImgProc', 'ImagePyramid_build'),
+    # ImgProc_ImagePyramidBuild
+    pattern1('ImgProc', 'ImagePyramidBuild'),
+
+    # ImgProc_ImagePyramidGetLayer
+    pattern1('ImgProc', 'ImagePyramidGetLayer'),

-    # ImgProc/ImagePyramid_getLayer
-    pattern1('ImgProc', 'ImagePyramid_getLayer'),
-    
    ##############################################################
    # MatOp
-    
-    # MatOp/SetTo (8UC4 | 16UC1 | 16UC4 | 32FC1 | 32FC4)
-    pattern2('MatOp', 'SetTo', '8UC4'),
-    pattern2('MatOp', 'SetTo', '16UC1'),
-    pattern2('MatOp', 'SetTo', '16UC4'),
-    pattern2('MatOp', 'SetTo', '32FC1'),
-    pattern2('MatOp', 'SetTo', '32FC4'),
-    
-    # MatOp/SetToMasked (8UC4 | 16UC1 | 16UC4 | 32FC1 | 32FC4)
-    pattern2('MatOp', 'SetToMasked', '8UC4'),
-    pattern2('MatOp', 'SetToMasked', '16UC1'),
-    pattern2('MatOp', 'SetToMasked', '16UC4'),
-    pattern2('MatOp', 'SetToMasked', '32FC1'),
-    pattern2('MatOp', 'SetToMasked', '32FC4'),
-    
-    # MatOp/CopyToMasked (8UC1 | 8UC3 |8UC4 | 16UC1 | 16UC3 | 16UC4 | 32FC1 | 32FC3 | 32FC4)
-    pattern2('MatOp', 'CopyToMasked', '8UC1'),
-    pattern2('MatOp', 'CopyToMasked', '8UC3'),
-    pattern2('MatOp', 'CopyToMasked', '8UC4'),
-    pattern2('MatOp', 'CopyToMasked', '16UC1'),
-    pattern2('MatOp', 'CopyToMasked', '16UC3'),
-    pattern2('MatOp', 'CopyToMasked', '16UC4'),
-    pattern2('MatOp', 'CopyToMasked', '32FC1'),
-    pattern2('MatOp', 'CopyToMasked', '32FC3'),
-    pattern2('MatOp', 'CopyToMasked', '32FC4'),    
+
+    # MatOp_SetTo (8UC4 | 16UC1 | 16UC4 | 32FC1 | 32FC4)
+    pattern3('MatOp', 'SetTo', '8U' , '4'),
+    pattern3('MatOp', 'SetTo', '16U', '1'),
+    pattern3('MatOp', 'SetTo', '16U', '4'),
+    pattern3('MatOp', 'SetTo', '32F', '1'),
+    pattern3('MatOp', 'SetTo', '32F', '4'),
+
+    # MatOp_SetToMasked (8UC4 | 16UC1 | 16UC4 | 32FC1 | 32FC4)
+    pattern3('MatOp', 'SetToMasked', '8U' , '4'),
+    pattern3('MatOp', 'SetToMasked', '16U', '1'),
+    pattern3('MatOp', 'SetToMasked', '16U', '4'),
+    pattern3('MatOp', 'SetToMasked', '32F', '1'),
+    pattern3('MatOp', 'SetToMasked', '32F', '4'),
+
+    # MatOp_CopyToMasked (8UC1 | 8UC3 |8UC4 | 16UC1 | 16UC3 | 16UC4 | 32FC1 | 32FC3 | 32FC4)
+    pattern3('MatOp', 'CopyToMasked', '8U' , '1'),
+    pattern3('MatOp', 'CopyToMasked', '8U' , '3'),
+    pattern3('MatOp', 'CopyToMasked', '8U' , '4'),
+    pattern3('MatOp', 'CopyToMasked', '16U', '1'),
+    pattern3('MatOp', 'CopyToMasked', '16U', '3'),
+    pattern3('MatOp', 'CopyToMasked', '16U', '4'),
+    pattern3('MatOp', 'CopyToMasked', '32F', '1'),
+    pattern3('MatOp', 'CopyToMasked', '32F', '3'),
+    pattern3('MatOp', 'CopyToMasked', '32F', '4'),
 ]

 cublasPattern = pattern1('Core', 'GEMM')
@ -260,7 +239,7 @@ if __name__ == "__main__":
    inputFile = open(sys.argv[1], 'r')
    lines = inputFile.readlines()
    inputFile.close()
-    
+

    for i in range(len(lines)):
        if cublasPattern.match(lines[i]):
--- a/modules/gpu/perf/main.cpp
+++ b/modules/gpu/perf/main.cpp
@ -0,0 +1,125 @@
+#include "perf_precomp.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+using namespace cvtest;
+using namespace testing;
+
+void printOsInfo() // Prints the OS family and bitness chosen at compile time; prints nothing on platforms not listed below.
+{
+#if defined _WIN32
+#   if defined _WIN64
+        cout << "OS: Windows x64 \n" << endl;
+#   else
+        cout << "OS: Windows x32 \n" << endl;
+#   endif
+#elif defined __linux__ // not the bare `linux` macro: GCC omits it in strict -std=c++NN mode, which would silently skip this branch
+#   if defined _LP64
+        cout << "OS: Linux x64 \n" << endl;
+#   else
+        cout << "OS: Linux x32 \n" << endl;
+#   endif
+#elif defined __APPLE__
+#   if defined _LP64
+        cout << "OS: Apple x64 \n" << endl;
+#   else
+        cout << "OS: Apple x32 \n" << endl;
+#   endif
+#endif
+}
+
+void printCudaInfo() // Prints CUDA driver/runtime versions, the compiled GPU archs and a summary of every CUDA device.
+{
+#ifndef HAVE_CUDA
+    cout << "OpenCV was built without CUDA support \n" << endl;
+#else
+    int driver = 0; // initialised so a failed query below prints 0 instead of an indeterminate value
+    cudaDriverGetVersion(&driver); // NOTE(review): return status is not checked
+
+    cout << "CUDA Driver  version: " << driver << '\n';
+    cout << "CUDA Runtime version: " << CUDART_VERSION << '\n';
+
+    cout << endl;
+
+    cout << "GPU module was compiled for the following GPU archs:" << endl;
+    cout << "    BIN: " << CUDA_ARCH_BIN << '\n';
+    cout << "    PTX: " << CUDA_ARCH_PTX << '\n';
+
+    cout << endl;
+
+    int deviceCount = getCudaEnabledDeviceCount();
+    cout << "CUDA device count: " << deviceCount << '\n';
+
+    cout << endl;
+
+    for (int i = 0; i < deviceCount; ++i) // one report per enumerated device
+    {
+        DeviceInfo info(i);
+
+        cout << "Device [" << i << "] \n";
+        cout << "\t Name: " << info.name() << '\n';
+        cout << "\t Compute capability: " << info.majorVersion() << '.' << info.minorVersion()<< '\n';
+        cout << "\t Multi Processor Count: " << info.multiProcessorCount() << '\n';
+        cout << "\t Total memory: " << static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0) << " Mb \n"; // divided by 1024 twice and truncated; presumably bytes -> MB, confirm the unit of totalMemory()
+        cout << "\t Free  memory: " << static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0) << " Mb \n"; // same conversion as the line above
+        if (!info.isCompatible())
+            cout << "\t !!! This device is NOT compatible with current GPU module build \n";
+
+        cout << endl;
+    }
+#endif
+}
+
+int main(int argc, char** argv) // Entry point: prints system info, picks the CPU or GPU path, then runs all registered tests.
+{
+    CommandLineParser cmd(argc, (const char**) argv,
+        "{ print_info_only | print_info_only | false | Print information about system and exit }"
+        "{ device | device | 0 | Device on which tests will be executed }"
+        "{ cpu | cpu | false | Run tests on cpu }"
+    );
+
+    printOsInfo();
+    printCudaInfo();
+
+    if (cmd.get<bool>("print_info_only"))
+        return 0;
+
+    int device = cmd.get<int>("device");
+    bool cpu = cmd.get<bool>("cpu");
+#ifndef HAVE_CUDA
+    cpu = true; // a build without CUDA always takes the CPU path, whatever the command line says
+#endif
+
+    if (cpu)
+    {
+        runOnGpu = false; // flag declared outside this function; the perf tests branch on it
+
+        cout << "Run tests on CPU \n" << endl;
+    }
+    else
+    {
+        runOnGpu = true;
+
+        if (device < 0 || device >= getCudaEnabledDeviceCount()) // reject indices outside the enumerated device range
+        {
+            cerr << "Incorrect device index - " << device << endl;
+            return -1;
+        }
+
+        DeviceInfo info(device);
+        if (!info.isCompatible()) // refuse to run on a device this build reports as incompatible
+        {
+            cerr << "Device " << device << " [" << info.name() << "] is NOT compatible with current GPU module build" << endl;
+            return -1;
+        }
+
+        setDevice(device);
+
+        cout << "Run tests on device " << device << " [" << info.name() << "] \n" << endl;
+    }
+
+    InitGoogleTest(&argc, argv); // takes argc by address; presumably strips the Google Test flags it recognises — confirm
+    perf::TestBase::Init(argc, argv);
+    return RUN_ALL_TESTS(); // test-run status becomes the process exit code
+}
--- a/modules/gpu/perf/perf_calib3d.cpp
+++ b/modules/gpu/perf/perf_calib3d.cpp
@ -1,219 +1,263 @@
 #include "perf_precomp.hpp"

-#ifdef HAVE_CUDA
+using namespace std;
+using namespace testing;
+
+namespace {

 //////////////////////////////////////////////////////////////////////
 // StereoBM

-GPU_PERF_TEST_1(StereoBM, cv::gpu::DeviceInfo)
+typedef pair<string, string> pair_string; // used below as (first image path, second image path)
+DEF_PARAM_TEST_1(ImagePair, pair_string);
+
+PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(make_pair<string, string>("gpu/perf/aloe.jpg", "gpu/perf/aloeR.jpg"))) // Times block-matching stereo on one image pair, on GPU or CPU depending on runOnGpu.
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
-
-    cv::Mat img_l_host = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_l_host.empty());
-
-    cv::Mat img_r_host = readImage("gpu/perf/aloeR.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_r_host.empty());
-
-    cv::gpu::StereoBM_GPU bm(0, 256);
-    cv::gpu::GpuMat img_l(img_l_host);
-    cv::gpu::GpuMat img_r(img_r_host);
-    cv::gpu::GpuMat dst;
-
-    bm(img_l, img_r, dst);
-
    declare.time(5.0);

-    TEST_CYCLE()
+    const cv::Mat imgLeft = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); // both inputs are read as grayscale
+    ASSERT_FALSE(imgLeft.empty());
+
+    const cv::Mat imgRight = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(imgRight.empty());
+
+    const int preset = 0; // the same preset/ndisp pair is passed to the GPU and the CPU matcher
+    const int ndisp = 256;
+
+    if (runOnGpu)
    {
-        bm(img_l, img_r, dst);
+        cv::gpu::StereoBM_GPU d_bm(preset, ndisp);
+
+        cv::gpu::GpuMat d_imgLeft(imgLeft); // GpuMat objects are built from the host images once, outside TEST_CYCLE
+        cv::gpu::GpuMat d_imgRight(imgRight);
+        cv::gpu::GpuMat d_dst;
+
+        d_bm(d_imgLeft, d_imgRight, d_dst); // one call outside TEST_CYCLE (presumably a warm-up; confirm)
+
+        TEST_CYCLE()
+        {
+            d_bm(d_imgLeft, d_imgRight, d_dst);
+        }
+    }
+    else
+    {
+        cv::StereoBM bm(preset, ndisp); // CPU counterpart of the GPU branch
+
+        cv::Mat dst;
+
+        bm(imgLeft, imgRight, dst); // one call outside TEST_CYCLE, mirroring the GPU branch
+
+        TEST_CYCLE()
+        {
+            bm(imgLeft, imgRight, dst);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, StereoBM, ALL_DEVICES);
-
 //////////////////////////////////////////////////////////////////////
 // StereoBeliefPropagation

-GPU_PERF_TEST_1(StereoBeliefPropagation, cv::gpu::DeviceInfo)
+PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation, Values(make_pair<string, string>("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png"))) // Times cv::gpu::StereoBeliefPropagation on one image pair; GPU only.
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
-
-    cv::Mat img_l_host = readImage("gpu/stereobp/aloe-L.png");
-    ASSERT_FALSE(img_l_host.empty());
-
-    cv::Mat img_r_host = readImage("gpu/stereobp/aloe-R.png");
-    ASSERT_FALSE(img_r_host.empty());
-
-    cv::gpu::StereoBeliefPropagation bp(64);
-    cv::gpu::GpuMat img_l(img_l_host);
-    cv::gpu::GpuMat img_r(img_r_host);
-    cv::gpu::GpuMat dst;
-
-    bp(img_l, img_r, dst);
-
    declare.time(10.0);

-    TEST_CYCLE()
+    const cv::Mat imgLeft = readImage(GetParam().first); // no read flag is passed here, unlike the grayscale reads in the StereoBM test
+    ASSERT_FALSE(imgLeft.empty());
+
+    const cv::Mat imgRight = readImage(GetParam().second);
+    ASSERT_FALSE(imgRight.empty());
+
+    const int ndisp = 64; // constructor argument of the matcher (was the literal 64)
+
+    if (runOnGpu)
    {
-        bp(img_l, img_r, dst);
+        cv::gpu::StereoBeliefPropagation d_bp(ndisp);
+
+        cv::gpu::GpuMat d_imgLeft(imgLeft); // device-side inputs are created once, outside TEST_CYCLE
+        cv::gpu::GpuMat d_imgRight(imgRight);
+        cv::gpu::GpuMat d_dst;
+
+        d_bp(d_imgLeft, d_imgRight, d_dst); // one call outside TEST_CYCLE (presumably a warm-up; confirm)
+
+        TEST_CYCLE()
+        {
+            d_bp(d_imgLeft, d_imgRight, d_dst);
+        }
+    }
+    else
+    {
+        FAIL(); // no CPU implementation is benchmarked here: the test fails when runOnGpu is false
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, StereoBeliefPropagation, ALL_DEVICES);
-
 //////////////////////////////////////////////////////////////////////
 // StereoConstantSpaceBP

-GPU_PERF_TEST_1(StereoConstantSpaceBP, cv::gpu::DeviceInfo)
+PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP, Values(make_pair<string, string>("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png"))) // Times cv::gpu::StereoConstantSpaceBP on one image pair; GPU only.
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
-
-    cv::Mat img_l_host = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_l_host.empty());
-
-    cv::Mat img_r_host = readImage("gpu/stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_r_host.empty());
-
-    cv::gpu::StereoConstantSpaceBP csbp(128);
-    cv::gpu::GpuMat img_l(img_l_host);
-    cv::gpu::GpuMat img_r(img_r_host);
-    cv::gpu::GpuMat dst;
-
-    csbp(img_l, img_r, dst);
-
    declare.time(10.0);

-    TEST_CYCLE()
+    const cv::Mat imgLeft = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); // both inputs are read as grayscale
+    ASSERT_FALSE(imgLeft.empty());
+
+    const cv::Mat imgRight = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(imgRight.empty());
+
+    const int ndisp = 128; // constructor argument of the matcher (was the literal 128)
+
+    if (runOnGpu)
    {
-        csbp(img_l, img_r, dst);
+        cv::gpu::StereoConstantSpaceBP d_csbp(ndisp);
+
+        cv::gpu::GpuMat d_imgLeft(imgLeft); // device-side inputs are created once, outside TEST_CYCLE
+        cv::gpu::GpuMat d_imgRight(imgRight);
+        cv::gpu::GpuMat d_dst;
+
+        d_csbp(d_imgLeft, d_imgRight, d_dst); // one call outside TEST_CYCLE (presumably a warm-up; confirm)
+
+        TEST_CYCLE()
+        {
+            d_csbp(d_imgLeft, d_imgRight, d_dst);
+        }
+    }
+    else
+    {
+        FAIL(); // no CPU implementation is benchmarked here: the test fails when runOnGpu is false
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, StereoConstantSpaceBP, ALL_DEVICES);
-
 //////////////////////////////////////////////////////////////////////
 // DisparityBilateralFilter

-GPU_PERF_TEST_1(DisparityBilateralFilter, cv::gpu::DeviceInfo)
+PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter, Values(make_pair<string, string>("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-disp.png"))) // Times cv::gpu::DisparityBilateralFilter; the pair is (image, disparity map). GPU only.
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); // first element of the pair: the image
+    ASSERT_FALSE(img.empty());

-    cv::Mat img_host = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
+    const cv::Mat disp = readImage(GetParam().second, cv::IMREAD_GRAYSCALE); // second element of the pair: the disparity map
+    ASSERT_FALSE(disp.empty());

-    cv::Mat disp_host = readImage("gpu/stereobm/aloe-disp.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(disp_host.empty());
+    const int ndisp = 128; // constructor argument of the filter (was the literal 128)

-    cv::gpu::DisparityBilateralFilter f(128);
-    cv::gpu::GpuMat img(img_host);
-    cv::gpu::GpuMat disp(disp_host);
-    cv::gpu::GpuMat dst;
-
-    f(disp, img, dst);
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
-        f(disp, img, dst);
+        cv::gpu::DisparityBilateralFilter d_filter(ndisp);
+
+        cv::gpu::GpuMat d_img(img); // device-side inputs are created once, outside TEST_CYCLE
+        cv::gpu::GpuMat d_disp(disp);
+        cv::gpu::GpuMat d_dst;
+
+        d_filter(d_disp, d_img, d_dst); // note the argument order: disparity first, then image
+
+        TEST_CYCLE()
+        {
+            d_filter(d_disp, d_img, d_dst);
+        }
+    }
+    else
+    {
+        FAIL(); // no CPU implementation is benchmarked here: the test fails when runOnGpu is false
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, DisparityBilateralFilter, ALL_DEVICES);
-
 //////////////////////////////////////////////////////////////////////
 // TransformPoints

-IMPLEMENT_PARAM_CLASS(Count, int)
+DEF_PARAM_TEST_1(Count, int); // single-int fixture; the int is used below as the number of points

-GPU_PERF_TEST(TransformPoints, cv::gpu::DeviceInfo, Count)
+PERF_TEST_P(Count, Calib3D_TransformPoints, Values(5000, 10000, 20000)) // Times cv::gpu::transformPoints for several point counts; GPU only.
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    const int count = GetParam();

-    int count = GET_PARAM(1);
+    cv::Mat src(1, count, CV_32FC3); // one row of `count` 3-channel float elements
+    fillRandom(src, -100, 100);

-    cv::Mat src_host(1, count, CV_32FC3);
-    fill(src_host, -100, 100);
+    const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1); // rvec and tvec are all-ones 1x3 matrices kept on the host
+    const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);

-    cv::gpu::GpuMat src(src_host);
-    cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
-    cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
-    cv::gpu::GpuMat dst;
-
-    cv::gpu::transformPoints(src, rvec, tvec, dst);
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
-        cv::gpu::transformPoints(src, rvec, tvec, dst);
+        cv::gpu::GpuMat d_src(src); // only the point set is wrapped in a GpuMat
+        cv::gpu::GpuMat d_dst;
+
+        cv::gpu::transformPoints(d_src, rvec, tvec, d_dst); // one call outside TEST_CYCLE (presumably a warm-up; confirm)
+
+        TEST_CYCLE()
+        {
+            cv::gpu::transformPoints(d_src, rvec, tvec, d_dst);
+        }
+    }
+    else
+    {
+        FAIL(); // no CPU implementation is benchmarked here: the test fails when runOnGpu is false
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, TransformPoints, testing::Combine(
-    ALL_DEVICES,
-    testing::Values<Count>(5000, 10000, 20000)));
-
 //////////////////////////////////////////////////////////////////////
 // ProjectPoints

-GPU_PERF_TEST(ProjectPoints, cv::gpu::DeviceInfo, Count)
+PERF_TEST_P(Count, Calib3D_ProjectPoints, Values(5000, 10000, 20000)) // Times projectPoints for several point counts, on GPU or CPU depending on runOnGpu.
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    const int count = GetParam();

-    int count = GET_PARAM(1);
+    cv::Mat src(1, count, CV_32FC3); // one row of `count` 3-channel float elements
+    fillRandom(src, -100, 100);

-    cv::Mat src_host(1, count, CV_32FC3);
-    fill(src_host, -100, 100);
+    const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1); // pose and camera matrix are all-ones matrices shared by both branches
+    const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
+    const cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);

-    cv::gpu::GpuMat src(src_host);
-    cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
-    cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
-    cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);
-    cv::gpu::GpuMat dst;
-
-    cv::gpu::projectPoints(src, rvec, tvec, camera_mat, cv::Mat(), dst);
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
-        cv::gpu::projectPoints(src, rvec, tvec, camera_mat, cv::Mat(), dst);
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+
+        cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst); // an empty cv::Mat is passed in the fifth argument
+
+        TEST_CYCLE()
+        {
+            cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst);
+        }
+    }
+    else
+    {
+        cv::Mat dst;
+
+        cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst); // CPU call passes cv::noArray() where the GPU call passes an empty Mat
+
+        TEST_CYCLE()
+        {
+            cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, ProjectPoints, testing::Combine(
-    ALL_DEVICES,
-    testing::Values<Count>(5000, 10000, 20000)));
-
 //////////////////////////////////////////////////////////////////////
 // SolvePnPRansac

-GPU_PERF_TEST(SolvePnPRansac, cv::gpu::DeviceInfo, Count)
+PERF_TEST_P(Count, Calib3D_SolvePnPRansac, Values(5000, 10000, 20000)) // Times solvePnPRansac on synthetic 3D-2D correspondences, on GPU or CPU depending on runOnGpu.
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(10.0); // replaces the declare.time(3.0) that was previously issued just before TEST_CYCLE

-    int count = GET_PARAM(1);
+    const int count = GetParam();

    cv::Mat object(1, count, CV_32FC3);
-    fill(object, -100, 100);
+    fillRandom(object, -100, 100); // random 3D object points

    cv::Mat camera_mat(3, 3, CV_32FC1);
-    fill(camera_mat, 0.5, 1);
+    fillRandom(camera_mat, 0.5, 1); // random entries; four off-diagonal entries are zeroed right after
    camera_mat.at<float>(0, 1) = 0.f;
    camera_mat.at<float>(1, 0) = 0.f;
    camera_mat.at<float>(2, 0) = 0.f;
    camera_mat.at<float>(2, 1) = 0.f;

-    cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));
+    const cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0)); // eight distortion coefficients, all zero

    std::vector<cv::Point2f> image_vec;
    cv::Mat rvec_gold(1, 3, CV_32FC1);
-    fill(rvec_gold, 0, 1);
+    fillRandom(rvec_gold, 0, 1); // random pose used to project the object points into image_vec
    cv::Mat tvec_gold(1, 3, CV_32FC1);
-    fill(tvec_gold, 0, 1);
+    fillRandom(tvec_gold, 0, 1);
    cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, dist_coef, image_vec);

    cv::Mat image(1, count, CV_32FC2, &image_vec[0]);
@ -221,82 +265,92 @@ GPU_PERF_TEST(SolvePnPRansac, cv::gpu::DeviceInfo, Count)
    cv::Mat rvec;
    cv::Mat tvec;

-    cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
-
-    declare.time(3.0);
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
        cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
+
+        TEST_CYCLE()
+        {
+            cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec); // GPU variant is called with host cv::Mat arguments
+        }
+    }
+    else
+    {
+        cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec); // one call outside TEST_CYCLE, mirroring the GPU branch
+
+        TEST_CYCLE()
+        {
+            cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, SolvePnPRansac, testing::Combine(
-    ALL_DEVICES,
-    testing::Values<Count>(5000, 10000, 20000)));
-
 //////////////////////////////////////////////////////////////////////
 // ReprojectImageTo3D

-GPU_PERF_TEST(ReprojectImageTo3D, cv::gpu::DeviceInfo, cv::Size, MatDepth)
+PERF_TEST_P(Sz_Depth, Calib3D_ReprojectImageTo3D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S))) // Times reprojectImageTo3D over sizes x {CV_8U, CV_16S}, on GPU or CPU depending on runOnGpu.
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    const cv::Size size = GET_PARAM(0); // parameter indices shifted down by one: the device parameter was removed
+    const int depth = GET_PARAM(1);

-    cv::Size size = GET_PARAM(1);
-    int depth = GET_PARAM(2);
-
-    cv::Mat src_host(size, depth);
-    fill(src_host, 5.0, 30.0);
+    cv::Mat src(size, depth); // input of the reprojection, filled with random values from the range given below
+    fillRandom(src, 5.0, 30.0);

    cv::Mat Q(4, 4, CV_32FC1);
-    fill(Q, 0.1, 1.0);
+    fillRandom(Q, 0.1, 1.0); // random 4x4 float matrix passed as Q

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-
-    cv::gpu::reprojectImageTo3D(src, dst, Q);
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
-        cv::gpu::reprojectImageTo3D(src, dst, Q);
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+
+        cv::gpu::reprojectImageTo3D(d_src, d_dst, Q); // one call outside TEST_CYCLE (presumably a warm-up; confirm)
+
+        TEST_CYCLE()
+        {
+            cv::gpu::reprojectImageTo3D(d_src, d_dst, Q);
+        }
+    }
+    else
+    {
+        cv::Mat dst;
+
+        cv::reprojectImageTo3D(src, dst, Q); // CPU counterpart with the same src and Q
+
+        TEST_CYCLE()
+        {
+            cv::reprojectImageTo3D(src, dst, Q);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, ReprojectImageTo3D, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values<MatDepth>(CV_8U, CV_16S)));
-
 //////////////////////////////////////////////////////////////////////
 // DrawColorDisp

-GPU_PERF_TEST(DrawColorDisp, cv::gpu::DeviceInfo, cv::Size, MatDepth)
+PERF_TEST_P(Sz_Depth, Calib3D_DrawColorDisp, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S))) // Times cv::gpu::drawColorDisp over sizes x {CV_8U, CV_16S}; GPU only.
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    const cv::Size size = GET_PARAM(0); // parameter indices shifted down by one: the device parameter was removed
+    const int type = GET_PARAM(1);

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    cv::Mat src(size, type);
+    fillRandom(src, 0, 255); // random input in the range given here; 255 is also the last argument of drawColorDisp below

-    cv::Mat src_host(size, type);
-    fill(src_host, 0, 255);
-
-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-
-    cv::gpu::drawColorDisp(src, dst, 255);
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
-        cv::gpu::drawColorDisp(src, dst, 255);
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+
+        cv::gpu::drawColorDisp(d_src, d_dst, 255); // one call outside TEST_CYCLE (presumably a warm-up; confirm)
+
+        TEST_CYCLE()
+        {
+            cv::gpu::drawColorDisp(d_src, d_dst, 255);
+        }
+    }
+    else
+    {
+        FAIL(); // no CPU implementation is benchmarked here: the test fails when runOnGpu is false
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, DrawColorDisp, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16S))));
-
-#endif
-
+} // namespace
--- a/modules/gpu/perf/perf_core.cpp
+++ b/modules/gpu/perf/perf_core.cpp
--- a/modules/gpu/perf/perf_features2d.cpp
+++ b/modules/gpu/perf/perf_features2d.cpp
@ -1,209 +1,278 @@
 #include "perf_precomp.hpp"

-#ifdef HAVE_CUDA
+using namespace std;
+using namespace testing;
+
+namespace {

 //////////////////////////////////////////////////////////////////////
 // SURF

-GPU_PERF_TEST_1(SURF, cv::gpu::DeviceInfo)
+DEF_PARAM_TEST_1(Image, string); // single-string fixture; the string is passed to readImage below
+
+PERF_TEST_P(Image, Features2D_SURF, Values<string>("gpu/perf/aloe.jpg")) // Times SURF detection + description on one image, on GPU or CPU depending on runOnGpu.
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(50.0); // replaces the declare.time(2.0) that was previously issued just before TEST_CYCLE

-    cv::Mat img_host = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
+    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty());

-    cv::gpu::SURF_GPU surf;
-
-    cv::gpu::GpuMat img(img_host);
-    cv::gpu::GpuMat keypoints, descriptors;
-
-    surf(img, cv::gpu::GpuMat(), keypoints, descriptors);
-
-    declare.time(2.0);
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
-        surf(img, cv::gpu::GpuMat(), keypoints, descriptors);
+        cv::gpu::SURF_GPU d_surf; // default-constructed
+
+        cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat d_keypoints, d_descriptors;
+
+        d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors); // an empty GpuMat is passed in the second argument
+
+        TEST_CYCLE()
+        {
+            d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
+        }
+    }
+    else
+    {
+        cv::SURF surf; // default-constructed, like the GPU object
+
+        std::vector<cv::KeyPoint> keypoints;
+        cv::Mat descriptors;
+
+        surf(img, cv::noArray(), keypoints, descriptors); // cv::noArray() is passed where the GPU call passes an empty GpuMat
+
+        TEST_CYCLE()
+        {
+            keypoints.clear(); // each iteration starts from an empty vector; the clear itself is inside the timed loop
+            surf(img, cv::noArray(), keypoints, descriptors);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Features2D, SURF, ALL_DEVICES);
-
 //////////////////////////////////////////////////////////////////////
 // FAST

-GPU_PERF_TEST_1(FAST, cv::gpu::DeviceInfo)
+PERF_TEST_P(Image, Features2D_FAST, Values<string>("gpu/perf/aloe.jpg")) // Times FAST keypoint detection on one image, on GPU or CPU depending on runOnGpu.
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty());

-    cv::Mat img_host = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
-
-    cv::gpu::FAST_GPU fast(20);
-
-    cv::gpu::GpuMat img(img_host);
-    cv::gpu::GpuMat keypoints;
-
-    fast(img, cv::gpu::GpuMat(), keypoints);
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
-        fast(img, cv::gpu::GpuMat(), keypoints);
+        cv::gpu::FAST_GPU d_fast(20); // the same value 20 is passed to cv::FAST in the CPU branch
+
+        cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat d_keypoints;
+
+        d_fast(d_img, cv::gpu::GpuMat(), d_keypoints); // an empty GpuMat is passed in the second argument
+
+        TEST_CYCLE()
+        {
+            d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
+        }
+    }
+    else
+    {
+        std::vector<cv::KeyPoint> keypoints;
+
+        cv::FAST(img, keypoints, 20); // one call outside TEST_CYCLE, mirroring the GPU branch
+
+        TEST_CYCLE()
+        {
+            keypoints.clear(); // each iteration starts from an empty vector; the clear itself is inside the timed loop
+            cv::FAST(img, keypoints, 20);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Features2D, FAST, ALL_DEVICES);
-
 //////////////////////////////////////////////////////////////////////
 // ORB

-GPU_PERF_TEST_1(ORB, cv::gpu::DeviceInfo)
+PERF_TEST_P(Image, Features2D_ORB, Values<string>("gpu/perf/aloe.jpg")) // Times ORB detection + description on one image, on GPU or CPU depending on runOnGpu.
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty());

-    cv::Mat img_host = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
-
-    cv::gpu::ORB_GPU orb(4000);
-
-    cv::gpu::GpuMat img(img_host);
-    cv::gpu::GpuMat keypoints, descriptors;
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
-        orb(img, cv::gpu::GpuMat(), keypoints, descriptors);
+        cv::gpu::ORB_GPU d_orb(4000); // the same value 4000 is passed to cv::ORB in the CPU branch
+
+        cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat d_keypoints, d_descriptors;
+
+        d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors); // new in this change: the old test had no call before TEST_CYCLE
+
+        TEST_CYCLE()
+        {
+            d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
+        }
+    }
+    else
+    {
+        cv::ORB orb(4000);
+
+        std::vector<cv::KeyPoint> keypoints;
+        cv::Mat descriptors;
+
+        orb(img, cv::noArray(), keypoints, descriptors); // cv::noArray() is passed where the GPU call passes an empty GpuMat
+
+        TEST_CYCLE()
+        {
+            keypoints.clear(); // each iteration starts from an empty vector; the clear itself is inside the timed loop
+            orb(img, cv::noArray(), keypoints, descriptors);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Features2D, ORB, ALL_DEVICES);
+//////////////////////////////////////////////////////////////////////
+// BFMatch
+
+DEF_PARAM_TEST(DescSize_Norm, int, NormType); // fixture parameters: (descriptor size, norm type)
+
+PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING)))) // Times brute-force best-match search, on GPU or CPU depending on runOnGpu.
+{
+    declare.time(20.0);
+
+    int desc_size = GET_PARAM(0);
+    int normType = GET_PARAM(1);
+
+    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F; // Hamming norm gets 8-bit descriptors, the other norms 32-bit float
+
+    cv::Mat query(3000, desc_size, type); // 3000 rows of desc_size columns each, for both query and train sets
+    fillRandom(query); // no explicit range is passed here
+
+    cv::Mat train(3000, desc_size, type);
+    fillRandom(train);
+
+    if (runOnGpu)
+    {
+        cv::gpu::BFMatcher_GPU d_matcher(normType);
+
+        cv::gpu::GpuMat d_query(query); // device-side inputs are created once, outside TEST_CYCLE
+        cv::gpu::GpuMat d_train(train);
+        cv::gpu::GpuMat d_trainIdx, d_distance; // result buffers stay as GpuMat; nothing in this test reads them back
+
+        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance); // one call outside TEST_CYCLE (presumably a warm-up; confirm)
+
+        TEST_CYCLE()
+        {
+            d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+        }
+    }
+    else
+    {
+        cv::BFMatcher matcher(normType);
+
+        std::vector<cv::DMatch> matches;
+
+        matcher.match(query, train, matches); // CPU counterpart writes into a std::vector<cv::DMatch>
+
+        TEST_CYCLE()
+        {
+            matcher.match(query, train, matches);
+        }
+    }
+}

 //////////////////////////////////////////////////////////////////////
-// BruteForceMatcher_match
+// BFKnnMatch

-IMPLEMENT_PARAM_CLASS(DescriptorSize, int)
+DEF_PARAM_TEST(DescSize_K_Norm, int, int, NormType); // fixture parameters: (descriptor size, k, norm type)

-GPU_PERF_TEST(BruteForceMatcher_match, cv::gpu::DeviceInfo, DescriptorSize, NormType)
+PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine( // Times brute-force k-nearest-neighbour matching, on GPU or CPU depending on runOnGpu.
+    Values(64, 128, 256),
+    Values(2, 3),
+    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(30.0);

-    int desc_size = GET_PARAM(1);
+    int desc_size = GET_PARAM(0);
+    int k = GET_PARAM(1); // number of neighbours, passed as the last argument of both knn calls
    int normType = GET_PARAM(2);

    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;

-    cv::Mat query_host(3000, desc_size, type);
-    fill(query_host, 0.0, 10.0);
+    cv::Mat query(3000, desc_size, type);
+    fillRandom(query); // no explicit range is passed; the removed code used fill(..., 0.0, 10.0)

-    cv::Mat train_host(3000, desc_size, type);
-    fill(train_host, 0.0, 10.0);
+    cv::Mat train(3000, desc_size, type);
+    fillRandom(train);

-    cv::gpu::BFMatcher_GPU matcher(normType);
-
-    cv::gpu::GpuMat query(query_host);
-    cv::gpu::GpuMat train(train_host);
-    cv::gpu::GpuMat trainIdx, distance;
-
-    matcher.matchSingle(query, train, trainIdx, distance);
-
-    declare.time(3.0);
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
-        matcher.matchSingle(query, train, trainIdx, distance);
+        cv::gpu::BFMatcher_GPU d_matcher(normType);
+
+        cv::gpu::GpuMat d_query(query); // device-side inputs are created once, outside TEST_CYCLE
+        cv::gpu::GpuMat d_train(train);
+        cv::gpu::GpuMat d_trainIdx, d_distance, d_allDist;
+
+        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k); // one call outside TEST_CYCLE (presumably a warm-up; confirm)
+
+        TEST_CYCLE()
+        {
+            d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
+        }
+    }
+    else
+    {
+        cv::BFMatcher matcher(normType);
+
+        std::vector< std::vector<cv::DMatch> > matches;
+
+        matcher.knnMatch(query, train, matches, k); // CPU counterpart with the same k
+
+        TEST_CYCLE()
+        {
+            matcher.knnMatch(query, train, matches, k);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_match, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
-    testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
-
 //////////////////////////////////////////////////////////////////////
-// BruteForceMatcher_knnMatch
+// BFRadiusMatch

-IMPLEMENT_PARAM_CLASS(K, int)
-
-GPU_PERF_TEST(BruteForceMatcher_knnMatch, cv::gpu::DeviceInfo, DescriptorSize, K, NormType)
+PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING)))) // Times brute-force radius matching (radius 2.0), on GPU or CPU depending on runOnGpu.
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(30.0);

-    int desc_size = GET_PARAM(1);
-    int k = GET_PARAM(2);
-    int normType = GET_PARAM(3);
+    int desc_size = GET_PARAM(0);
+    int normType = GET_PARAM(1);

    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;

-    cv::Mat query_host(3000, desc_size, type);
-    fill(query_host, 0.0, 10.0);
+    cv::Mat query(3000, desc_size, type);
+    fillRandom(query, 0.0, 1.0); // unlike the BFMatch/BFKnnMatch tests, an explicit 0.0..1.0 range is passed here

-    cv::Mat train_host(3000, desc_size, type);
-    fill(train_host, 0.0, 10.0);
+    cv::Mat train(3000, desc_size, type);
+    fillRandom(train, 0.0, 1.0);

-    cv::gpu::BFMatcher_GPU matcher(normType);
-
-    cv::gpu::GpuMat query(query_host);
-    cv::gpu::GpuMat train(train_host);
-    cv::gpu::GpuMat trainIdx, distance, allDist;
-
-    matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, k);
-
-    declare.time(3.0);
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
-        matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, k);
+        cv::gpu::BFMatcher_GPU d_matcher(normType);
+
+        cv::gpu::GpuMat d_query(query); // device-side inputs are created once, outside TEST_CYCLE
+        cv::gpu::GpuMat d_train(train);
+        cv::gpu::GpuMat d_trainIdx, d_nMatches, d_distance;
+
+        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0); // note the argument order: distance before nMatches
+
+        TEST_CYCLE()
+        {
+            d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0);
+        }
+    }
+    else
+    {
+        cv::BFMatcher matcher(normType);
+
+        std::vector< std::vector<cv::DMatch> > matches;
+
+        matcher.radiusMatch(query, train, matches, 2.0); // CPU counterpart with the same radius 2.0
+
+        TEST_CYCLE()
+        {
+            matcher.radiusMatch(query, train, matches, 2.0);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_knnMatch, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
-    testing::Values(K(2), K(3)),
-    testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
-
-//////////////////////////////////////////////////////////////////////
-// BruteForceMatcher_radiusMatch
-
-GPU_PERF_TEST(BruteForceMatcher_radiusMatch, cv::gpu::DeviceInfo, DescriptorSize, NormType)
-{
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
-
-    int desc_size = GET_PARAM(1);
-    int normType = GET_PARAM(2);
-
-    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
-
-    cv::Mat query_host(3000, desc_size, type);
-    fill(query_host, 0.0, 1.0);
-
-    cv::Mat train_host(3000, desc_size, type);
-    fill(train_host, 0.0, 1.0);
-
-    cv::gpu::BFMatcher_GPU matcher(normType);
-
-    cv::gpu::GpuMat query(query_host);
-    cv::gpu::GpuMat train(train_host);
-    cv::gpu::GpuMat trainIdx, nMatches, distance;
-
-    matcher.radiusMatchSingle(query, train, trainIdx, distance, nMatches, 2.0);
-
-    declare.time(3.0);
-
-    TEST_CYCLE()
-    {
-        matcher.radiusMatchSingle(query, train, trainIdx, distance, nMatches, 2.0);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_radiusMatch, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
-    testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
-
-#endif
+} // namespace
--- a/modules/gpu/perf/perf_filters.cpp
+++ b/modules/gpu/perf/perf_filters.cpp
@ -1,308 +1,379 @@
 #include "perf_precomp.hpp"

-#ifdef HAVE_CUDA
+using namespace std;
+using namespace testing;
+
+namespace {

 //////////////////////////////////////////////////////////////////////
 // Blur

-IMPLEMENT_PARAM_CLASS(KernelSize, int)
+DEF_PARAM_TEST(Sz_Type_KernelSz, cv::Size, MatType, int); // parameter tuple read below as (image size, matrix type, kernel size)

-GPU_PERF_TEST(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
+PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), Values(3, 5, 7))) // runs blur with a ksize x ksize kernel through cv::gpu::blur or cv::blur
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // NOTE(review): presumably the per-test time budget in seconds — confirm against the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);
+    int ksize = GET_PARAM(2);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src); // helper defined outside this file; presumably fills src with random values — verify its default range

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-
-    cv::gpu::blur(src, dst, cv::Size(ksize, ksize));
-
-    TEST_CYCLE()
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the cv::gpu branch, false the host cv:: branch
    {
-        cv::gpu::blur(src, dst, cv::Size(ksize, ksize));
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+
+        cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize)); // called once before TEST_CYCLE; NOTE(review): presumably an untimed warm-up — confirm
+
+        TEST_CYCLE()
+        {
+            cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize));
+        }
+    }
+    else
+    {
+        cv::Mat dst;
+
+        cv::blur(src, dst, cv::Size(ksize, ksize)); // same single call before TEST_CYCLE on the host branch
+
+        TEST_CYCLE()
+        {
+            cv::blur(src, dst, cv::Size(ksize, ksize));
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Blur, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7))));
-
 //////////////////////////////////////////////////////////////////////
 // Sobel

-GPU_PERF_TEST(Sobel, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
+PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15))) // runs Sobel through cv::gpu::Sobel or cv::Sobel for each size/type/ksize
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // NOTE(review): presumably the per-test time budget in seconds — confirm against the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);
+    int ksize = GET_PARAM(2);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src); // helper defined outside this file; presumably fills src with random values — verify its default range

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-    cv::gpu::GpuMat buf;
-
-    cv::gpu::Sobel(src, dst, -1, 1, 1, buf, ksize);
-
-    TEST_CYCLE()
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the cv::gpu branch, false the host cv:: branch
    {
-        cv::gpu::Sobel(src, dst, -1, 1, 1, buf, ksize);
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+        cv::gpu::GpuMat d_buf; // scratch buffer handed to cv::gpu::Sobel and reused across iterations
+
+        cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize); // ddepth = -1, dx = 1, dy = 1; called once before TEST_CYCLE
+
+        TEST_CYCLE()
+        {
+            cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize);
+        }
+    }
+    else
+    {
+        cv::Mat dst;
+
+        cv::Sobel(src, dst, -1, 1, 1, ksize); // host call with the same ddepth/dx/dy/ksize as the GPU branch
+
+        TEST_CYCLE()
+        {
+            cv::Sobel(src, dst, -1, 1, 1, ksize);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Sobel, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
-
 //////////////////////////////////////////////////////////////////////
 // Scharr

-GPU_PERF_TEST(Scharr, cv::gpu::DeviceInfo, cv::Size, MatType)
+PERF_TEST_P(Sz_Type, Filters_Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1))) // runs Scharr through cv::gpu::Scharr or cv::Scharr for each size/type
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // NOTE(review): presumably the per-test time budget in seconds — confirm against the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src); // helper defined outside this file; presumably fills src with random values — verify its default range

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-    cv::gpu::GpuMat buf;
-
-    cv::gpu::Scharr(src, dst, -1, 1, 0, buf);
-
-    TEST_CYCLE()
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the cv::gpu branch, false the host cv:: branch
    {
-        cv::gpu::Scharr(src, dst, -1, 1, 0, buf);
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+        cv::gpu::GpuMat d_buf; // scratch buffer handed to cv::gpu::Scharr and reused across iterations
+
+        cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf); // ddepth = -1, dx = 1, dy = 0; called once before TEST_CYCLE
+
+        TEST_CYCLE()
+        {
+            cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf);
+        }
+    }
+    else
+    {
+        cv::Mat dst;
+
+        cv::Scharr(src, dst, -1, 1, 0); // host call with the same ddepth/dx/dy as the GPU branch
+
+        TEST_CYCLE()
+        {
+            cv::Scharr(src, dst, -1, 1, 0);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Scharr, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1))));
-
 //////////////////////////////////////////////////////////////////////
 // GaussianBlur

-GPU_PERF_TEST(GaussianBlur, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
+PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15))) // runs Gaussian blur through cv::gpu::GaussianBlur or cv::GaussianBlur
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // NOTE(review): presumably the per-test time budget in seconds — confirm against the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);
+    int ksize = GET_PARAM(2);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src); // helper defined outside this file; presumably fills src with random values — verify its default range

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-    cv::gpu::GpuMat buf;
-
-    cv::gpu::GaussianBlur(src, dst, cv::Size(ksize, ksize), buf, 0.5);
-
-    TEST_CYCLE()
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the cv::gpu branch, false the host cv:: branch
    {
-        cv::gpu::GaussianBlur(src, dst, cv::Size(ksize, ksize), buf, 0.5);
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+        cv::gpu::GpuMat d_buf; // scratch buffer handed to cv::gpu::GaussianBlur and reused across iterations
+
+        cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5); // square kernel, sigma 0.5; called once before TEST_CYCLE
+
+        TEST_CYCLE()
+        {
+            cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5);
+        }
+    }
+    else
+    {
+        cv::Mat dst;
+
+        cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5); // host call with the same kernel size and sigma as the GPU branch
+
+        TEST_CYCLE()
+        {
+            cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Filters, GaussianBlur, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
-
 //////////////////////////////////////////////////////////////////////
 // Laplacian

-GPU_PERF_TEST(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
+PERF_TEST_P(Sz_Type_KernelSz, Filters_Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3))) // runs Laplacian through cv::gpu::Laplacian or cv::Laplacian for ksize 1 and 3
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // NOTE(review): presumably the per-test time budget in seconds — confirm against the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);
+    int ksize = GET_PARAM(2);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src); // helper defined outside this file; presumably fills src with random values — verify its default range

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-
-    cv::gpu::Laplacian(src, dst, -1, ksize);
-
-    TEST_CYCLE()
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the cv::gpu branch, false the host cv:: branch
    {
-        cv::gpu::Laplacian(src, dst, -1, ksize);
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+
+        cv::gpu::Laplacian(d_src, d_dst, -1, ksize); // ddepth = -1; called once before TEST_CYCLE
+
+        TEST_CYCLE()
+        {
+            cv::gpu::Laplacian(d_src, d_dst, -1, ksize);
+        }
+    }
+    else
+    {
+        cv::Mat dst;
+
+        cv::Laplacian(src, dst, -1, ksize); // host call with the same ddepth and ksize as the GPU branch
+
+        TEST_CYCLE()
+        {
+            cv::Laplacian(src, dst, -1, ksize);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Laplacian, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KernelSize(1), KernelSize(3))));
-
 //////////////////////////////////////////////////////////////////////
 // Erode

-GPU_PERF_TEST(Erode, cv::gpu::DeviceInfo, cv::Size, MatType)
+PERF_TEST_P(Sz_Type, Filters_Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4))) // runs erosion with a 3x3 rectangular element through cv::gpu::erode or cv::erode
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // NOTE(review): presumably the per-test time budget in seconds — confirm against the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src); // helper defined outside this file; presumably fills src with random values — verify its default range

    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-    cv::gpu::GpuMat buf;
-
-    cv::gpu::erode(src, dst, ker, buf);
-
-    TEST_CYCLE()
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the cv::gpu branch, false the host cv:: branch
    {
-        cv::gpu::erode(src, dst, ker, buf);
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+        cv::gpu::GpuMat d_buf; // scratch buffer handed to cv::gpu::erode and reused across iterations
+
+        cv::gpu::erode(d_src, d_dst, ker, d_buf); // the kernel stays a host cv::Mat; called once before TEST_CYCLE
+
+        TEST_CYCLE()
+        {
+            cv::gpu::erode(d_src, d_dst, ker, d_buf);
+        }
+    }
+    else
+    {
+        cv::Mat dst;
+
+        cv::erode(src, dst, ker); // host call with the same structuring element as the GPU branch
+
+        TEST_CYCLE()
+        {
+            cv::erode(src, dst, ker);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Erode, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
-
 //////////////////////////////////////////////////////////////////////
 // Dilate

-GPU_PERF_TEST(Dilate, cv::gpu::DeviceInfo, cv::Size, MatType)
+PERF_TEST_P(Sz_Type, Filters_Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4))) // runs dilation with a 3x3 rectangular element through cv::gpu::dilate or cv::dilate
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // NOTE(review): presumably the per-test time budget in seconds — confirm against the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src); // helper defined outside this file; presumably fills src with random values — verify its default range

    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-    cv::gpu::GpuMat buf;
-
-    cv::gpu::dilate(src, dst, ker, buf);
-
-    TEST_CYCLE()
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the cv::gpu branch, false the host cv:: branch
    {
-        cv::gpu::dilate(src, dst, ker, buf);
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+        cv::gpu::GpuMat d_buf; // scratch buffer handed to cv::gpu::dilate and reused across iterations
+
+        cv::gpu::dilate(d_src, d_dst, ker, d_buf); // the kernel stays a host cv::Mat; called once before TEST_CYCLE
+
+        TEST_CYCLE()
+        {
+            cv::gpu::dilate(d_src, d_dst, ker, d_buf);
+        }
+    }
+    else
+    {
+        cv::Mat dst;
+
+        cv::dilate(src, dst, ker); // host call with the same structuring element as the GPU branch
+
+        TEST_CYCLE()
+        {
+            cv::dilate(src, dst, ker);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Dilate, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
-
 //////////////////////////////////////////////////////////////////////
 // MorphologyEx

 CV_ENUM(MorphOp, cv::MORPH_OPEN, cv::MORPH_CLOSE, cv::MORPH_GRADIENT, cv::MORPH_TOPHAT, cv::MORPH_BLACKHAT)
-#define ALL_MORPH_OPS testing::Values(MorphOp(cv::MORPH_OPEN), MorphOp(cv::MORPH_CLOSE), MorphOp(cv::MORPH_GRADIENT), MorphOp(cv::MORPH_TOPHAT), MorphOp(cv::MORPH_BLACKHAT))
+#define ALL_MORPH_OPS ValuesIn(MorphOp::all())

-GPU_PERF_TEST(MorphologyEx, cv::gpu::DeviceInfo, cv::Size, MatType, MorphOp)
+DEF_PARAM_TEST(Sz_Type_Op, cv::Size, MatType, MorphOp); // parameter tuple read below as (image size, matrix type, morphology operation)
+
+PERF_TEST_P(Sz_Type_Op, Filters_MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), ALL_MORPH_OPS)) // runs each MorphOp with a 3x3 rectangular element through cv::gpu::morphologyEx or cv::morphologyEx
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // NOTE(review): presumably the per-test time budget in seconds — confirm against the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int morphOp = GET_PARAM(3);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);
+    int morphOp = GET_PARAM(2);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src); // helper defined outside this file; presumably fills src with random values — verify its default range

    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-    cv::gpu::GpuMat buf1;
-    cv::gpu::GpuMat buf2;
-
-    cv::gpu::morphologyEx(src, dst, morphOp, ker, buf1, buf2);
-
-    TEST_CYCLE()
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the cv::gpu branch, false the host cv:: branch
    {
-        cv::gpu::morphologyEx(src, dst, morphOp, ker, buf1, buf2);
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+        cv::gpu::GpuMat d_buf1; // two scratch buffers handed to cv::gpu::morphologyEx and reused across iterations
+        cv::gpu::GpuMat d_buf2;
+
+        cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2); // called once before TEST_CYCLE
+
+        TEST_CYCLE()
+        {
+            cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2);
+        }
+    }
+    else
+    {
+        cv::Mat dst;
+
+        cv::morphologyEx(src, dst, morphOp, ker); // host call with the same operation and element as the GPU branch
+
+        TEST_CYCLE()
+        {
+            cv::morphologyEx(src, dst, morphOp, ker);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Filters, MorphologyEx, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
-    ALL_MORPH_OPS));
-
 //////////////////////////////////////////////////////////////////////
 // Filter2D

-GPU_PERF_TEST(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
+PERF_TEST_P(Sz_Type_KernelSz, Filters_Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15))) // runs a ksize x ksize float kernel through cv::gpu::filter2D or cv::filter2D
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // NOTE(review): presumably the per-test time budget in seconds — confirm against the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);
+    int ksize = GET_PARAM(2);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src); // helper defined outside this file; presumably fills src with random values — verify its default range

    cv::Mat kernel(ksize, ksize, CV_32FC1);
-    fill(kernel, 0.0, 1.0);
+    fillRandom(kernel, 0.0, 1.0); // presumably random coefficients between 0.0 and 1.0 — verify the bounds' semantics in fillRandom

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-
-    cv::gpu::filter2D(src, dst, -1, kernel);
-
-    TEST_CYCLE()
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the cv::gpu branch, false the host cv:: branch
    {
-        cv::gpu::filter2D(src, dst, -1, kernel);
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+
+        cv::gpu::filter2D(d_src, d_dst, -1, kernel); // ddepth = -1, the kernel stays a host cv::Mat; called once before TEST_CYCLE
+
+        TEST_CYCLE()
+        {
+            cv::gpu::filter2D(d_src, d_dst, -1, kernel);
+        }
+    }
+    else
+    {
+        cv::Mat dst;
+
+        cv::filter2D(src, dst, -1, kernel); // host call with the same ddepth and kernel as the GPU branch
+
+        TEST_CYCLE()
+        {
+            cv::filter2D(src, dst, -1, kernel);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Filter2D, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
-
-#endif
+} // namespace
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
--- a/modules/gpu/perf/perf_labeling.cpp
+++ b/modules/gpu/perf/perf_labeling.cpp
@ -1,75 +1,141 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-//                          License Agreement
-//               For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//  * Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimer.
-//
-//  * Redistributions in binary form must reproduce the above copyright notice,
-//    this list of conditions and the following disclaimer in the documentation
-//    and/or other materials provided with the distribution.
-//
-//  * The name of the copyright holders may not be used to endorse or promote products
-//    derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//M*/
-
 #include "perf_precomp.hpp"

-#ifdef HAVE_CUDA
+using namespace std;
+using namespace testing;

-GPU_PERF_TEST(ConnectedComponents, cv::gpu::DeviceInfo, cv::Size)
+namespace {
+
+DEF_PARAM_TEST_1(Image, string);
+
+struct GreedyLabeling // host-side flood-fill labeling; the CPU branch of Labeling_ConnectedComponents runs it
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    struct dot // pixel coordinate kept on the flood-fill stack
+    {
+        int x;
+        int y;

-    cv::Mat image = readImage("gpu/labeling/aloe-disp.png", cv::IMREAD_GRAYSCALE);
+        static dot make(int i, int j)
+        {
+            dot d; d.x = i; d.y = j;
+            return d;
+        }
+    };

-    // cv::threshold(image, image, 150, 255, CV_THRESH_BINARY);
+    struct InInterval // predicate: true when a - b lies in [-_lo, _hi]
+    {
+        InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {}
+        const int lo, hi;

-    cv::gpu::GpuMat mask;
-    mask.create(image.rows, image.cols, CV_8UC1);
+        bool operator() (const unsigned char a, const unsigned char b) const
+        {
+            int d = a - b; // both operands are promoted to int, so the difference can be negative
+            return lo <= d && d <= hi;
+        }

-    cv::gpu::GpuMat components;
-    components.create(image.rows, image.cols, CV_32SC1);
+    private:
+        InInterval& operator=(const InInterval&); // declaration only: lo and hi are const, so assignment is disabled
+
-    cv::gpu::connectivityMask(cv::gpu::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2));

-    ASSERT_NO_THROW(cv::gpu::labelComponents(mask, components));
+    };

+    GreedyLabeling(cv::Mat img)
+    : image(img), _labels(image.size(), CV_32SC1, cv::Scalar::all(-1)) {stack = new dot[image.cols * image.rows];} // NOTE(review): one stack slot per pixel, yet a pixel is labeled when popped, not when pushed, so it can be pushed more than once — confirm this capacity is always sufficient
+
+    ~GreedyLabeling(){delete[] stack;}
+
+    void operator() (cv::Mat labels) const // assumes labels is CV_32SC1 with the image's size (the test passes _labels) — TODO confirm for other callers
+    {
+        labels.setTo(cv::Scalar::all(-1)); // -1 marks "not labeled yet"
+        InInterval inInt(0, 2);
+        int cc = -1;
+
+        int* dist_labels = (int*)labels.data;
+        int pitch = static_cast<int>(labels.step1());
+
+        unsigned char* source = (unsigned char*)image.data; // assumes an 8-bit single-channel image (the test loads it as grayscale)
+        int width = image.cols;
+        int height = image.rows;
+
+        for (int j = 0; j < image.rows; ++j)
+            for (int i = 0; i < image.cols; ++i)
+            {
+                if (dist_labels[j * pitch + i] != -1) continue;
+
+                dot* top = stack;
+                *top++ = dot::make(i, j); // seed the stack so the loop never pops from an empty stack
+                cc++;
+
+                dist_labels[j * pitch + i] = cc;
+
+                while (top > stack) // stop as soon as the stack is empty; top never moves below stack
+                {
+                    const dot p = *--top;
+
+                    int*  dl = &dist_labels[p.y * pitch + p.x];
+                    unsigned char* sp = &source[p.y * image.step1() + p.x];
+
+                    dl[0] = cc;
+
+                    //right
+                    if( p.x < (width - 1) && dl[ +1] == -1 && inInt(sp[0], sp[+1]))
+                        *top++ = dot::make(p.x + 1, p.y);
+
+                    //left
+                    if( p.x > 0 && dl[-1] == -1 && inInt(sp[0], sp[-1]))
+                        *top++ = dot::make(p.x - 1, p.y);
+
+                    //bottom
+                    if( p.y < (height - 1) && dl[+pitch] == -1 && inInt(sp[0], sp[+image.step1()]))
+                        *top++ = dot::make(p.x, p.y + 1);
+
+                    //top
+                    if( p.y > 0 && dl[-pitch] == -1 && inInt(sp[0], sp[-static_cast<int>(image.step1())]))
+                        *top++ = dot::make(p.x, p.y - 1);
+                }
+            }
+    }
+
+    cv::Mat image;
+    cv::Mat _labels;
+    dot* stack;
+};
+
+PERF_TEST_P(Image, Labeling_ConnectedComponents, Values<string>("gpu/labeling/aloe-disp.png")) // runs component labeling through cv::gpu::labelComponents or the host GreedyLabeling
+{
    declare.time(1.0);

-    TEST_CYCLE()
+    cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(image.empty()); // fail fast when the test image cannot be loaded instead of timing an empty input
+
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the cv::gpu branch, false the host branch
    {
-        cv::gpu::labelComponents(mask, components);
+        cv::gpu::GpuMat mask;
+        mask.create(image.rows, image.cols, CV_8UC1);
+
+        cv::gpu::GpuMat components;
+        components.create(image.rows, image.cols, CV_32SC1);
+
+        cv::gpu::connectivityMask(cv::gpu::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2)); // the mask is built once, outside TEST_CYCLE
+
+        ASSERT_NO_THROW(cv::gpu::labelComponents(mask, components));
+
+        TEST_CYCLE()
+        {
+            cv::gpu::labelComponents(mask, components);
+        }
+    }
+    else
+    {
+        GreedyLabeling host(image);
+
+        host(host._labels); // called once before TEST_CYCLE, mirroring the GPU branch
+
+        TEST_CYCLE()
+        {
+            host(host._labels);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Labeling, ConnectedComponents, testing::Combine(ALL_DEVICES, testing::Values(cv::Size(261, 262))));
-
-#endif
+} // namespace
--- a/modules/gpu/perf/perf_main.cpp
+++ b/modules/gpu/perf/perf_main.cpp
@ -1,20 +0,0 @@
-#include "perf_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-int main(int argc, char **argv)
-{
-    testing::InitGoogleTest(&argc, argv);
-    perf::TestBase::Init(argc, argv);
-    return RUN_ALL_TESTS();
-}
-
-#else
-
-int main()
-{
-    printf("OpenCV was built without CUDA support\n");
-    return 0;
-}
-
-#endif
--- a/modules/gpu/perf/perf_matop.cpp
+++ b/modules/gpu/perf/perf_matop.cpp
@ -1,141 +1,169 @@
 #include "perf_precomp.hpp"

-#ifdef HAVE_CUDA
+using namespace std;
+using namespace testing;
+
+namespace {

 //////////////////////////////////////////////////////////////////////
 // SetTo

-GPU_PERF_TEST(SetTo, cv::gpu::DeviceInfo, cv::Size, MatType)
+PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(1, 3, 4))) // runs setTo(val) on a GpuMat or a host Mat for each size/depth/channel count
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Size size = GET_PARAM(0);
+    int depth = GET_PARAM(1);
+    int channels = GET_PARAM(2);

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    int type = CV_MAKE_TYPE(depth, channels); // combine depth and channel count into one matrix type

-    cv::gpu::GpuMat src(size, type);
    cv::Scalar val(1, 2, 3, 4);

-    src.setTo(val);
-
-    TEST_CYCLE()
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the GpuMat branch, false the host Mat branch
    {
+        cv::gpu::GpuMat d_src(size, type);
+
+        d_src.setTo(val); // called once before TEST_CYCLE; NOTE(review): presumably an untimed warm-up — confirm
+
+        TEST_CYCLE()
+        {
+            d_src.setTo(val);
+        }
+    }
+    else
+    {
+        cv::Mat src(size, type);
+
        src.setTo(val);
+
+        TEST_CYCLE()
+        {
+            src.setTo(val);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(MatOp, SetTo, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
-                    MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
-
 //////////////////////////////////////////////////////////////////////
 // SetToMasked

-GPU_PERF_TEST(SetToMasked, cv::gpu::DeviceInfo, cv::Size, MatType)
+PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(1, 3, 4))) // runs setTo(val, mask) on a GpuMat or a host Mat for each size/depth/channel count
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Size size = GET_PARAM(0);
+    int depth = GET_PARAM(1);
+    int channels = GET_PARAM(2);

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    int type = CV_MAKE_TYPE(depth, channels); // combine depth and channel count into one matrix type

-    cv::Mat src_host(size, type);
-    fill(src_host, 0, 255);
+    cv::Mat src(size, type);
+    fillRandom(src); // helper defined outside this file; presumably fills src with random values — verify its default range

-    cv::Mat mask_host(size, CV_8UC1);
-    fill(mask_host, 0, 2);
+    cv::Mat mask(size, CV_8UC1);
+    fillRandom(mask, 0, 2); // NOTE(review): presumably yields a mix of zero and non-zero mask bytes — confirm whether the upper bound is exclusive

-    cv::gpu::GpuMat src(src_host);
    cv::Scalar val(1, 2, 3, 4);
-    cv::gpu::GpuMat mask(mask_host);

-    src.setTo(val, mask);
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the GpuMat branch, false the host Mat branch
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_mask(mask);

-    TEST_CYCLE()
+        d_src.setTo(val, d_mask); // called once before TEST_CYCLE; NOTE(review): presumably an untimed warm-up — confirm
+
+        TEST_CYCLE()
+        {
+            d_src.setTo(val, d_mask);
+        }
+    }
+    else
    {
        src.setTo(val, mask);
+
+        TEST_CYCLE()
+        {
+            src.setTo(val, mask);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(MatOp, SetToMasked, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
-                    MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
-
 //////////////////////////////////////////////////////////////////////
 // CopyToMasked

-GPU_PERF_TEST(CopyToMasked, cv::gpu::DeviceInfo, cv::Size, MatType)
+PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(1, 3, 4))) // runs copyTo(dst, mask) on a GpuMat or a host Mat for each size/depth/channel count
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Size size = GET_PARAM(0);
+    int depth = GET_PARAM(1);
+    int channels = GET_PARAM(2);

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    int type = CV_MAKE_TYPE(depth, channels); // combine depth and channel count into one matrix type

-    cv::Mat src_host(size, type);
-    fill(src_host, 0, 255);
+    cv::Mat src(size, type);
+    fillRandom(src); // helper defined outside this file; presumably fills src with random values — verify its default range

-    cv::Mat mask_host(size, CV_8UC1);
-    fill(mask_host, 0, 2);
+    cv::Mat mask(size, CV_8UC1);
+    fillRandom(mask, 0, 2); // NOTE(review): presumably yields a mix of zero and non-zero mask bytes — confirm whether the upper bound is exclusive

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat mask(mask_host);
-    cv::gpu::GpuMat dst;
-
-    src.copyTo(dst, mask);
-
-    TEST_CYCLE()
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the GpuMat branch, false the host Mat branch
    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_mask(mask);
+        cv::gpu::GpuMat d_dst;
+
+        d_src.copyTo(d_dst, d_mask); // called once before TEST_CYCLE; NOTE(review): presumably an untimed warm-up — confirm
+
+        TEST_CYCLE()
+        {
+            d_src.copyTo(d_dst, d_mask);
+        }
+    }
+    else
+    {
+        cv::Mat dst;
+
        src.copyTo(dst, mask);
+
+        TEST_CYCLE()
+        {
+            src.copyTo(dst, mask);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(MatOp, CopyToMasked, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
-                    MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
-
 //////////////////////////////////////////////////////////////////////
 // ConvertTo

-GPU_PERF_TEST(ConvertTo, cv::gpu::DeviceInfo, cv::Size, MatDepth, MatDepth)
+DEF_PARAM_TEST(Sz_2Depth, cv::Size, MatDepth, MatDepth); // parameter tuple read below as (image size, source depth, destination depth)
+
+PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(CV_8U, CV_16U, CV_32F, CV_64F))) // runs convertTo between every listed depth pair on a GpuMat or a host Mat
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Size size = GET_PARAM(0);
+    int depth1 = GET_PARAM(1);
+    int depth2 = GET_PARAM(2);

-    cv::Size size = GET_PARAM(1);
-    int depth1 = GET_PARAM(2);
-    int depth2 = GET_PARAM(3);
+    cv::Mat src(size, depth1); // the depth constant is passed directly as the matrix type
+    fillRandom(src); // helper defined outside this file; presumably fills src with random values — verify its default range

-    cv::Mat src_host(size, depth1);
-    fill(src_host, 0, 255);
-
-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-
-    src.convertTo(dst, depth2, 0.5, 1.0);
-
-    TEST_CYCLE()
+    if (runOnGpu) // runOnGpu is declared outside this file; true takes the GpuMat branch, false the host Mat branch
    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+
+        d_src.convertTo(d_dst, depth2, 0.5, 1.0); // scale 0.5, offset 1.0; called once before TEST_CYCLE
+
+        TEST_CYCLE()
+        {
+            d_src.convertTo(d_dst, depth2, 0.5, 1.0);
+        }
+    }
+    else
+    {
+        cv::Mat dst;
+
        src.convertTo(dst, depth2, 0.5, 1.0);
+
+        TEST_CYCLE()
+        {
+            src.convertTo(dst, depth2, 0.5, 1.0);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(MatOp, ConvertTo, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F)),
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F))));
-
-#endif
+} // namespace
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@ -1,85 +1,131 @@
 #include "perf_precomp.hpp"

-#ifdef HAVE_CUDA
+using namespace std;
+using namespace testing;
+
+namespace {

 ///////////////////////////////////////////////////////////////
 // HOG

-GPU_PERF_TEST_1(HOG, cv::gpu::DeviceInfo)
+DEF_PARAM_TEST_1(Image, string);
+
+PERF_TEST_P(Image, ObjDetect_HOG, Values<string>("gpu/hog/road.png"))
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty());

-    cv::Mat img_host = readImage("gpu/hog/road.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
-
-    cv::gpu::GpuMat img(img_host);
    std::vector<cv::Rect> found_locations;

-    cv::gpu::HOGDescriptor hog;
-    hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
-
-    hog.detectMultiScale(img, found_locations);
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
+        cv::gpu::GpuMat d_img(img);
+
+        cv::gpu::HOGDescriptor d_hog;
+        d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
+
+        d_hog.detectMultiScale(d_img, found_locations);
+
+        TEST_CYCLE()
+        {
+            d_hog.detectMultiScale(d_img, found_locations);
+        }
+    }
+    else
+    {
+        cv::HOGDescriptor hog;
+        hog.setSVMDetector(cv::HOGDescriptor::getDefaultPeopleDetector()); // CPU path must not depend on cv::gpu entry points
+
         hog.detectMultiScale(img, found_locations);
+
+        TEST_CYCLE()
+        {
+            hog.detectMultiScale(img, found_locations);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(ObjDetect, HOG, ALL_DEVICES);
-
 ///////////////////////////////////////////////////////////////
 // HaarClassifier

-GPU_PERF_TEST_1(HaarClassifier, cv::gpu::DeviceInfo)
+typedef pair<string, string> pair_string;
+DEF_PARAM_TEST_1(ImageAndCascade, pair_string);
+
+PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
+    Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/perf/haarcascade_frontalface_alt.xml")))
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty());

-    cv::Mat img_host = readImage("gpu/haarcascade/group_1_640x480_VGA.pgm", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
-
-    cv::gpu::CascadeClassifier_GPU cascade;
-
-    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/perf/haarcascade_frontalface_alt.xml")));
-
-    cv::gpu::GpuMat img(img_host);
-    cv::gpu::GpuMat objects_buffer;
-
-    cascade.detectMultiScale(img, objects_buffer);
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
-        cascade.detectMultiScale(img, objects_buffer);
+        cv::gpu::CascadeClassifier_GPU d_cascade;
+        ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
+
+        cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat d_objects_buffer;
+
+        d_cascade.detectMultiScale(d_img, d_objects_buffer);
+
+        TEST_CYCLE()
+        {
+            d_cascade.detectMultiScale(d_img, d_objects_buffer);
+        }
+    }
+    else
+    {
+        cv::CascadeClassifier cascade;
+        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().second))); // same cascade file as the GPU branch
+
+        std::vector<cv::Rect> rects;
+
+        cascade.detectMultiScale(img, rects);
+
+        TEST_CYCLE()
+        {
+            cascade.detectMultiScale(img, rects);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(ObjDetect, HaarClassifier, ALL_DEVICES);
+///////////////////////////////////////////////////////////////
+// LBP cascade

-//===================== LBP cascade ==========================//
-GPU_PERF_TEST_1(LBPClassifier, cv::gpu::DeviceInfo)
+PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
+    Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/lbpcascade/lbpcascade_frontalface.xml")))
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty());

-    cv::Mat img_host = readImage("gpu/haarcascade/group_1_640x480_VGA.pgm", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
-
-
-
-    cv::gpu::GpuMat img(img_host);
-        cv::gpu::GpuMat gpu_rects;
-    cv::gpu::CascadeClassifier_GPU cascade;
-    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
-
-    cascade.detectMultiScale(img, gpu_rects);
-    TEST_CYCLE()
+    if (runOnGpu)
    {
-        cascade.detectMultiScale(img, gpu_rects);
+        cv::gpu::CascadeClassifier_GPU d_cascade;
+        ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
+
+        cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat d_gpu_rects;
+
+        d_cascade.detectMultiScale(d_img, d_gpu_rects);
+
+        TEST_CYCLE()
+        {
+            d_cascade.detectMultiScale(d_img, d_gpu_rects);
+        }
+    }
+    else
+    {
+        cv::CascadeClassifier cascade;
+        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().second))); // same cascade file as the GPU branch
+
+        std::vector<cv::Rect> rects;
+
+        cascade.detectMultiScale(img, rects);
+
+        TEST_CYCLE()
+        {
+            cascade.detectMultiScale(img, rects);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(ObjDetect, LBPClassifier, ALL_DEVICES);
-
-#endif
+} // namespace
--- a/modules/gpu/perf/perf_precomp.hpp
+++ b/modules/gpu/perf/perf_precomp.hpp
@ -11,6 +11,10 @@

 #include "cvconfig.h"

+#ifdef HAVE_CUDA
+#include <cuda_runtime.h>
+#endif
+
 #include "opencv2/ts/ts.hpp"
 #include "opencv2/ts/ts_perf.hpp"

@ -18,8 +22,12 @@
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/gpu/gpu.hpp"
 #include "opencv2/calib3d/calib3d.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/video/video.hpp"
+#include "opencv2/nonfree/nonfree.hpp"
+#include "opencv2/legacy/legacy.hpp"

-#include "perf_utility.hpp"
+#include "utility.hpp"

 #ifdef GTEST_CREATE_SHARED_LIBRARY
 #error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
--- a/modules/gpu/perf/perf_utility.hpp
+++ b/modules/gpu/perf/perf_utility.hpp
@ -1,77 +0,0 @@
-#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__
-#define __OPENCV_PERF_GPU_UTILITY_HPP__
-
-void fill(cv::Mat& m, double a, double b);
-
-using perf::MatType;
-using perf::MatDepth;
-
-CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
-        CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
-CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING)
-
-struct CvtColorInfo
-{
-    int scn;
-    int dcn;
-    int code;
-
-    explicit CvtColorInfo(int scn_=0, int dcn_=0, int code_=0) : scn(scn_), dcn(dcn_), code(code_) {}
-};
-
-void PrintTo(const CvtColorInfo& info, std::ostream* os);
-
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-    class name \
-    { \
-    public: \
-        name ( type arg = type ()) : val_(arg) {} \
-        operator type () const {return val_;} \
-    private: \
-        type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) \
-    { \
-        *os << #name <<  " = " << testing::PrintToString(static_cast< type >(param)); \
-    }
-
-IMPLEMENT_PARAM_CLASS(Channels, int)
-
-namespace cv { namespace gpu
-{
-    void PrintTo(const cv::gpu::DeviceInfo& info, std::ostream* os);
-}}
-
-#define GPU_PERF_TEST(name, ...) \
-    struct name : perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > \
-    { \
-    public: \
-        name() {} \
-    protected: \
-        void PerfTestBody(); \
-    }; \
-    TEST_P(name, perf){ RunPerfTestBody(); } \
-    void name :: PerfTestBody()
-
-#define GPU_PERF_TEST_1(name, param_type) \
-    struct name : perf::TestBaseWithParam< param_type > \
-    { \
-    public: \
-        name() {} \
-    protected: \
-        void PerfTestBody(); \
-    }; \
-    TEST_P(name, perf){ RunPerfTestBody(); } \
-    void name :: PerfTestBody()
-
-#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::szSXGA, perf::sz1080p, cv::Size(1800, 1500))
-
-cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
-
-const std::vector<cv::gpu::DeviceInfo>& devices();
-
-#define ALL_DEVICES testing::ValuesIn(devices())
-
-#define GET_PARAM(k) std::tr1::get< k >(GetParam())
-
-#endif // __OPENCV_PERF_GPU_UTILITY_HPP__
--- a/modules/gpu/perf/perf_video.cpp
+++ b/modules/gpu/perf/perf_video.cpp
--- a/modules/gpu/perf/perf_utility.cpp
+++ b/modules/gpu/perf/perf_utility.cpp
@ -4,12 +4,19 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;

-void fill(Mat& m, double a, double b)
+bool runOnGpu = true;
+
+void fillRandom(Mat& m, double a, double b)
 {
    RNG rng(123456789);
    rng.fill(m, RNG::UNIFORM, Scalar::all(a), Scalar::all(b));
 }

+Mat readImage(const string& fileName, int flags)
+{
+    return imread(perf::TestBase::getDataPath(fileName), flags);
+}
+
 void PrintTo(const CvtColorInfo& info, ostream* os)
 {
    static const char* str[] =
@ -184,37 +191,3 @@ void PrintTo(const CvtColorInfo& info, ostream* os)

    *os << str[info.code];
 }
-
-void cv::gpu::PrintTo(const DeviceInfo& info, ostream* os)
-{
-    *os << info.name();
-}
-
-Mat readImage(const string& fileName, int flags)
-{
-    return imread(perf::TestBase::getDataPath(fileName), flags);
-}
-
-const vector<DeviceInfo>& devices()
-{
-    static vector<DeviceInfo> devs;
-    static bool first = true;
-
-    if (first)
-    {
-        int deviceCount = getCudaEnabledDeviceCount();
-
-        devs.reserve(deviceCount);
-
-        for (int i = 0; i < deviceCount; ++i)
-        {
-            DeviceInfo info(i);
-            if (info.isCompatible())
-                devs.push_back(info);
-        }
-
-        first = false;
-    }
-
-    return devs;
-}
--- a/modules/gpu/perf/utility.hpp
+++ b/modules/gpu/perf/utility.hpp
@ -0,0 +1,45 @@
+#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__
+#define __OPENCV_PERF_GPU_UTILITY_HPP__
+
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/gpumat.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/ts/ts_perf.hpp"
+
+extern bool runOnGpu;
+
+void fillRandom(cv::Mat& m, double a = 0.0, double b = 255.0);
+cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
+
+using perf::MatType;
+using perf::MatDepth;
+
+CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
+#define ALL_BORDER_MODES testing::ValuesIn(BorderMode::all())
+CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
+#define ALL_INTERPOLATIONS testing::ValuesIn(Interpolation::all())
+CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING)
+
+struct CvtColorInfo
+{
+    int scn;
+    int dcn;
+    int code;
+
+    explicit CvtColorInfo(int scn_=0, int dcn_=0, int code_=0) : scn(scn_), dcn(dcn_), code(code_) {}
+};
+void PrintTo(const CvtColorInfo& info, std::ostream* os);
+
+#define GET_PARAM(k) std::tr1::get< k >(GetParam())
+
+#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name
+#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name
+
+DEF_PARAM_TEST_1(Sz, cv::Size);
+typedef perf::Size_MatType Sz_Type;
+DEF_PARAM_TEST(Sz_Depth, cv::Size, MatDepth);
+DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, int);
+
+#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::szSXGA, perf::sz720p, perf::sz1080p)
+
+#endif // __OPENCV_PERF_GPU_UTILITY_HPP__
--- a/modules/gpu/perf_cpu/perf_calib3d.cpp
+++ b/modules/gpu/perf_cpu/perf_calib3d.cpp
@ -1,136 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-//////////////////////////////////////////////////////////////////////
-// StereoBM
-
-GPU_PERF_TEST_1(StereoBM, cv::gpu::DeviceInfo)
-{
-    cv::Mat img_l = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_l.empty());
-
-    cv::Mat img_r = readImage("gpu/perf/aloeR.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_r.empty());
-
-    cv::StereoBM bm(0, 256);
-
-    cv::Mat dst;
-
-    bm(img_l, img_r, dst);
-
-    declare.time(5.0);
-
-    TEST_CYCLE()
-    {
-        bm(img_l, img_r, dst);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Calib3D, StereoBM, ALL_DEVICES);
-
-//////////////////////////////////////////////////////////////////////
-// ProjectPoints
-
-IMPLEMENT_PARAM_CLASS(Count, int)
-
-GPU_PERF_TEST(ProjectPoints, cv::gpu::DeviceInfo, Count)
-{
-    int count = GET_PARAM(1);
-
-    cv::Mat src(1, count, CV_32FC3);
-    fill(src, -100, 100);
-
-    cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
-    cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
-    cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);
-    cv::Mat dst;
-
-    cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
-
-    TEST_CYCLE()
-    {
-        cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Calib3D, ProjectPoints, testing::Combine(
-    ALL_DEVICES,
-    testing::Values<Count>(5000, 10000, 20000)));
-
-//////////////////////////////////////////////////////////////////////
-// SolvePnPRansac
-
-GPU_PERF_TEST(SolvePnPRansac, cv::gpu::DeviceInfo, Count)
-{
-    int count = GET_PARAM(1);
-
-    cv::Mat object(1, count, CV_32FC3);
-    fill(object, -100, 100);
-
-    cv::Mat camera_mat(3, 3, CV_32FC1);
-    fill(camera_mat, 0.5, 1);
-    camera_mat.at<float>(0, 1) = 0.f;
-    camera_mat.at<float>(1, 0) = 0.f;
-    camera_mat.at<float>(2, 0) = 0.f;
-    camera_mat.at<float>(2, 1) = 0.f;
-
-    cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));
-
-    std::vector<cv::Point2f> image_vec;
-    cv::Mat rvec_gold(1, 3, CV_32FC1);
-    fill(rvec_gold, 0, 1);
-    cv::Mat tvec_gold(1, 3, CV_32FC1);
-    fill(tvec_gold, 0, 1);
-    cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, dist_coef, image_vec);
-
-    cv::Mat image(1, count, CV_32FC2, &image_vec[0]);
-
-    cv::Mat rvec;
-    cv::Mat tvec;
-
-    cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
-
-    declare.time(10.0);
-
-    TEST_CYCLE()
-    {
-        cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Calib3D, SolvePnPRansac, testing::Combine(
-    ALL_DEVICES,
-    testing::Values<Count>(5000, 10000, 20000)));
-
-//////////////////////////////////////////////////////////////////////
-// ReprojectImageTo3D
-
-GPU_PERF_TEST(ReprojectImageTo3D, cv::gpu::DeviceInfo, cv::Size, MatDepth)
-{
-    cv::Size size = GET_PARAM(1);
-    int depth = GET_PARAM(2);
-
-    cv::Mat src(size, depth);
-    fill(src, 5.0, 30.0);
-
-    cv::Mat Q(4, 4, CV_32FC1);
-    fill(Q, 0.1, 1.0);
-
-    cv::Mat dst;
-
-    cv::reprojectImageTo3D(src, dst, Q);
-
-    TEST_CYCLE()
-    {
-        cv::reprojectImageTo3D(src, dst, Q);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Calib3D, ReprojectImageTo3D, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values<MatDepth>(CV_8U, CV_16S)));
-
-#endif
-
--- a/modules/gpu/perf_cpu/perf_core.cpp
+++ b/modules/gpu/perf_cpu/perf_core.cpp
--- a/modules/gpu/perf_cpu/perf_cpu_precomp.cpp
+++ b/modules/gpu/perf_cpu/perf_cpu_precomp.cpp
@ -1 +0,0 @@
-#include "perf_cpu_precomp.hpp"
--- a/modules/gpu/perf_cpu/perf_cpu_precomp.hpp
+++ b/modules/gpu/perf_cpu/perf_cpu_precomp.hpp
@ -1,32 +0,0 @@
-#ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wmissing-declarations"
-#  pragma GCC diagnostic ignored "-Wmissing-prototypes" //OSX
-#endif
-
-#ifndef __OPENCV_PERF_CPU_PRECOMP_HPP__
-#define __OPENCV_PERF_CPU_PRECOMP_HPP__
-
-#include <cstdio>
-#include <iostream>
-
-#include "cvconfig.h"
-
-#include "opencv2/ts/ts.hpp"
-#include "opencv2/ts/ts_perf.hpp"
-
-#include "opencv2/core/core.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/gpu/gpu.hpp"
-#include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/video/video.hpp"
-#include "opencv2/calib3d/calib3d.hpp"
-#include "opencv2/nonfree/nonfree.hpp"
-#include "opencv2/legacy/legacy.hpp"
-
-#include "perf_utility.hpp"
-
-#ifdef GTEST_CREATE_SHARED_LIBRARY
-#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
-#endif
-
-#endif
--- a/modules/gpu/perf_cpu/perf_features2d.cpp
+++ b/modules/gpu/perf_cpu/perf_features2d.cpp
@ -1,187 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-//////////////////////////////////////////////////////////////////////
-// SURF
-
-GPU_PERF_TEST_1(SURF, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    cv::SURF surf;
-
-    std::vector<cv::KeyPoint> keypoints;
-    cv::Mat descriptors;
-
-    surf(img, cv::noArray(), keypoints, descriptors);
-
-    declare.time(50.0);
-
-    TEST_CYCLE()
-    {
-        keypoints.clear();
-        surf(img, cv::noArray(), keypoints, descriptors);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Features2D, SURF, ALL_DEVICES);
-
-//////////////////////////////////////////////////////////////////////
-// FAST
-
-GPU_PERF_TEST_1(FAST, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    std::vector<cv::KeyPoint> keypoints;
-
-    cv::FAST(img, keypoints, 20);
-
-    TEST_CYCLE()
-    {
-        keypoints.clear();
-        cv::FAST(img, keypoints, 20);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Features2D, FAST, ALL_DEVICES);
-
-//////////////////////////////////////////////////////////////////////
-// ORB
-
-GPU_PERF_TEST_1(ORB, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    cv::ORB orb(4000);
-
-    std::vector<cv::KeyPoint> keypoints;
-    cv::Mat descriptors;
-
-    orb(img, cv::noArray(), keypoints, descriptors);
-
-    TEST_CYCLE()
-    {
-        keypoints.clear();
-        orb(img, cv::noArray(), keypoints, descriptors);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Features2D, ORB, ALL_DEVICES);
-
-//////////////////////////////////////////////////////////////////////
-// BruteForceMatcher_match
-
-IMPLEMENT_PARAM_CLASS(DescriptorSize, int)
-
-GPU_PERF_TEST(BruteForceMatcher_match, cv::gpu::DeviceInfo, DescriptorSize, NormType)
-{
-    int desc_size = GET_PARAM(1);
-    int normType = GET_PARAM(2);
-
-    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
-
-    cv::Mat query(3000, desc_size, type);
-    fill(query, 0.0, 10.0);
-
-    cv::Mat train(3000, desc_size, type);
-    fill(train, 0.0, 10.0);
-
-    cv::BFMatcher matcher(normType);
-
-    std::vector<cv::DMatch> matches;
-
-    matcher.match(query, train, matches);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        matcher.match(query, train, matches);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_match, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
-    testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
-
-//////////////////////////////////////////////////////////////////////
-// BruteForceMatcher_knnMatch
-
-IMPLEMENT_PARAM_CLASS(K, int)
-
-GPU_PERF_TEST(BruteForceMatcher_knnMatch, cv::gpu::DeviceInfo, DescriptorSize, K, NormType)
-{
-    int desc_size = GET_PARAM(1);
-    int k = GET_PARAM(2);
-    int normType = GET_PARAM(3);
-
-    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
-
-    cv::Mat query(3000, desc_size, type);
-    fill(query, 0.0, 10.0);
-
-    cv::Mat train(3000, desc_size, type);
-    fill(train, 0.0, 10.0);
-
-    cv::BFMatcher matcher(normType);
-
-    std::vector< std::vector<cv::DMatch> > matches;
-
-    matcher.knnMatch(query, train, matches, k);
-
-    declare.time(30.0);
-
-    TEST_CYCLE()
-    {
-        matcher.knnMatch(query, train, matches, k);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_knnMatch, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
-    testing::Values(K(2), K(3)),
-    testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
-
-//////////////////////////////////////////////////////////////////////
-// BruteForceMatcher_radiusMatch
-
-GPU_PERF_TEST(BruteForceMatcher_radiusMatch, cv::gpu::DeviceInfo, DescriptorSize, NormType)
-{
-    int desc_size = GET_PARAM(1);
-    int normType = GET_PARAM(2);
-
-    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
-
-    cv::Mat query(3000, desc_size, type);
-    fill(query, 0.0, 1.0);
-
-    cv::Mat train(3000, desc_size, type);
-    fill(train, 0.0, 1.0);
-
-    cv::BFMatcher matcher(normType);
-
-    std::vector< std::vector<cv::DMatch> > matches;
-
-    matcher.radiusMatch(query, train, matches, 2.0);
-
-    declare.time(30.0);
-
-    TEST_CYCLE()
-    {
-        matcher.radiusMatch(query, train, matches, 2.0);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_radiusMatch, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
-    testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
-
-#endif
--- a/modules/gpu/perf_cpu/perf_filters.cpp
+++ b/modules/gpu/perf_cpu/perf_filters.cpp
@ -1,283 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-IMPLEMENT_PARAM_CLASS(KernelSize, int)
-
-//////////////////////////////////////////////////////////////////////
-// Blur
-
-GPU_PERF_TEST(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat dst;
-
-    cv::blur(src, dst, cv::Size(ksize, ksize));
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::blur(src, dst, cv::Size(ksize, ksize));
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Blur, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7))));
-
-//////////////////////////////////////////////////////////////////////
-// Sobel
-
-GPU_PERF_TEST(Sobel, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat dst;
-
-    cv::Sobel(src, dst, -1, 1, 1, ksize);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::Sobel(src, dst, -1, 1, 1, ksize);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Sobel, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
-
-//////////////////////////////////////////////////////////////////////
-// Scharr
-
-GPU_PERF_TEST(Scharr, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat dst;
-
-    cv::Scharr(src, dst, -1, 1, 0);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::Scharr(src, dst, -1, 1, 0);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Scharr, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1))));
-
-//////////////////////////////////////////////////////////////////////
-// GaussianBlur
-
-GPU_PERF_TEST(GaussianBlur, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat dst;
-
-    cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, GaussianBlur, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
-
-//////////////////////////////////////////////////////////////////////
-// Laplacian
-
-GPU_PERF_TEST(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat dst;
-
-    cv::Laplacian(src, dst, -1, ksize);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::Laplacian(src, dst, -1, ksize);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Laplacian, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KernelSize(1), KernelSize(3))));
-
-//////////////////////////////////////////////////////////////////////
-// Erode
-
-GPU_PERF_TEST(Erode, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
-
-    cv::Mat dst;
-
-    cv::erode(src, dst, ker);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::erode(src, dst, ker);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Erode, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
-
-//////////////////////////////////////////////////////////////////////
-// Dilate
-
-GPU_PERF_TEST(Dilate, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
-
-    cv::Mat dst;
-
-    cv::dilate(src, dst, ker);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::dilate(src, dst, ker);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Dilate, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
-
-//////////////////////////////////////////////////////////////////////
-// MorphologyEx
-
-CV_ENUM(MorphOp, cv::MORPH_OPEN, cv::MORPH_CLOSE, cv::MORPH_GRADIENT, cv::MORPH_TOPHAT, cv::MORPH_BLACKHAT)
-#define ALL_MORPH_OPS testing::Values(MorphOp(cv::MORPH_OPEN), MorphOp(cv::MORPH_CLOSE), MorphOp(cv::MORPH_GRADIENT), MorphOp(cv::MORPH_TOPHAT), MorphOp(cv::MORPH_BLACKHAT))
-
-GPU_PERF_TEST(MorphologyEx, cv::gpu::DeviceInfo, cv::Size, MatType, MorphOp)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int morphOp = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat dst;
-
-    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
-
-    cv::morphologyEx(src, dst, morphOp, ker);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::morphologyEx(src, dst, morphOp, ker);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, MorphologyEx, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
-    ALL_MORPH_OPS));
-
-//////////////////////////////////////////////////////////////////////
-// Filter2D
-
-GPU_PERF_TEST(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat kernel(ksize, ksize, CV_32FC1);
-    fill(kernel, 0.0, 1.0);
-
-    cv::Mat dst;
-
-    cv::filter2D(src, dst, -1, kernel);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::filter2D(src, dst, -1, kernel);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Filter2D, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
-
-#endif
--- a/modules/gpu/perf_cpu/perf_imgproc.cpp
+++ b/modules/gpu/perf_cpu/perf_imgproc.cpp
@ -1,771 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-//////////////////////////////////////////////////////////////////////
-// Remap
-
-GPU_PERF_TEST(Remap, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, BorderMode)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int interpolation = GET_PARAM(3);
-    int borderMode = GET_PARAM(4);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat xmap(size, CV_32FC1);
-    fill(xmap, 0, size.width);
-
-    cv::Mat ymap(size, CV_32FC1);
-    fill(ymap, 0, size.height);
-
-    cv::Mat dst;
-
-    cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Remap, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-    testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT), BorderMode(cv::BORDER_WRAP))));
-
-
-//////////////////////////////////////////////////////////////////////
-// Resize
-
-IMPLEMENT_PARAM_CLASS(Scale, double)
-
-GPU_PERF_TEST(Resize, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, Scale)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int interpolation = GET_PARAM(3);
-    double f = GET_PARAM(4);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::resize(src, dst, cv::Size(), f, f, interpolation);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::resize(src, dst, cv::Size(), f, f, interpolation);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR),
-                    Interpolation(cv::INTER_CUBIC),   Interpolation(cv::INTER_AREA)),
-    testing::Values(Scale(0.5), Scale(0.3), Scale(2.0))));
-
-GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int interpolation = cv::INTER_AREA;
-    double f = GET_PARAM(3);
-
-    cv::Mat src_host(size, type);
-    fill(src_host, 0, 255);
-
-    cv::Mat src(src_host);
-    cv::Mat dst;
-
-    cv::resize(src, dst, cv::Size(), f, f, interpolation);
-
-    declare.time(1.0);
-
-    TEST_CYCLE()
-    {
-        cv::resize(src, dst, cv::Size(), f, f, interpolation);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(perf::sz1080p, cv::Size(4096, 2048)),
-    testing::Values(MatType(CV_8UC1)/*,  MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
-    testing::Values(Scale(0.2),Scale(0.1),Scale(0.05))));
-
-//////////////////////////////////////////////////////////////////////
-// WarpAffine
-
-GPU_PERF_TEST(WarpAffine, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, BorderMode)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int interpolation = GET_PARAM(3);
-    int borderMode = GET_PARAM(4);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    const double aplha = CV_PI / 4;
-    double mat[2][3] = { {std::cos(aplha), -std::sin(aplha), src.cols / 2},
-                         {std::sin(aplha),  std::cos(aplha), 0}};
-    cv::Mat M(2, 3, CV_64F, (void*) mat);
-
-    cv::warpAffine(src, dst, M, size, interpolation, borderMode);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::warpAffine(src, dst, M, size, interpolation, borderMode);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, WarpAffine, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-    testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT), BorderMode(cv::BORDER_WRAP))));
-
-//////////////////////////////////////////////////////////////////////
-// WarpPerspective
-
-GPU_PERF_TEST(WarpPerspective, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, BorderMode)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int interpolation = GET_PARAM(3);
-    int borderMode = GET_PARAM(4);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    const double aplha = CV_PI / 4;
-    double mat[3][3] = { {std::cos(aplha), -std::sin(aplha), src.cols / 2},
-                         {std::sin(aplha),  std::cos(aplha), 0},
-                         {0.0,              0.0,             1.0}};
-    cv::Mat M(3, 3, CV_64F, (void*) mat);
-
-    cv::warpPerspective(src, dst, M, size, interpolation, borderMode);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::warpPerspective(src, dst, M, size, interpolation, borderMode);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, WarpPerspective, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-    testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT), BorderMode(cv::BORDER_WRAP))));
-
-//////////////////////////////////////////////////////////////////////
-// CopyMakeBorder
-
-GPU_PERF_TEST(CopyMakeBorder, cv::gpu::DeviceInfo, cv::Size, MatType, BorderMode)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int borderType = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::copyMakeBorder(src, dst, 5, 5, 5, 5, borderType);
-
-    TEST_CYCLE()
-    {
-        cv::copyMakeBorder(src, dst, 5, 5, 5, 5, borderType);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, CopyMakeBorder, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT), BorderMode(cv::BORDER_WRAP))));
-
-//////////////////////////////////////////////////////////////////////
-// Threshold
-
-CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
-#define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))
-
-GPU_PERF_TEST(Threshold, cv::gpu::DeviceInfo, cv::Size, MatDepth, ThreshOp)
-{
-    cv::Size size = GET_PARAM(1);
-    int depth = GET_PARAM(2);
-    int threshOp = GET_PARAM(3);
-
-    cv::Mat src(size, depth);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::threshold(src, dst, 100.0, 255.0, threshOp);
-
-    TEST_CYCLE()
-    {
-        cv::threshold(src, dst, 100.0, 255.0, threshOp);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Threshold, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F)),
-    ALL_THRESH_OPS));
-
-//////////////////////////////////////////////////////////////////////
-// Integral
-
-GPU_PERF_TEST(Integral, cv::gpu::DeviceInfo, cv::Size)
-{
-    cv::Size size = GET_PARAM(1);
-
-    cv::Mat src(size, CV_8UC1);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::integral(src, dst);
-
-    TEST_CYCLE()
-    {
-        cv::integral(src, dst);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Integral, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES));
-
-//////////////////////////////////////////////////////////////////////
-// HistEven_OneChannel
-
-GPU_PERF_TEST(HistEven_OneChannel, cv::gpu::DeviceInfo, cv::Size, MatDepth)
-{
-    cv::Size size = GET_PARAM(1);
-    int depth = GET_PARAM(2);
-
-    cv::Mat src(size, depth);
-    fill(src, 0, 255);
-
-    int hbins = 30;
-    float hranges[] = {0.0f, 180.0f};
-    cv::Mat hist;
-    int histSize[] = {hbins};
-    const float* ranges[] = {hranges};
-    int channels[] = {0};
-
-    cv::calcHist(&src, 1, channels, cv::Mat(), hist, 1, histSize, ranges);
-
-    TEST_CYCLE()
-    {
-        cv::calcHist(&src, 1, channels, cv::Mat(), hist, 1, histSize, ranges);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, HistEven_OneChannel, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_16S))));
-
-//////////////////////////////////////////////////////////////////////
-// EqualizeHist
-
-GPU_PERF_TEST(EqualizeHist, cv::gpu::DeviceInfo, cv::Size)
-{
-    cv::Size size = GET_PARAM(1);
-
-    cv::Mat src(size, CV_8UC1);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::equalizeHist(src, dst);
-
-    TEST_CYCLE()
-    {
-        cv::equalizeHist(src, dst);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, EqualizeHist, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES));
-
-//////////////////////////////////////////////////////////////////////
-// Canny
-
-IMPLEMENT_PARAM_CLASS(AppertureSize, int)
-IMPLEMENT_PARAM_CLASS(L2gradient, bool)
-
-GPU_PERF_TEST(Canny, cv::gpu::DeviceInfo, AppertureSize, L2gradient)
-{
-    int apperture_size = GET_PARAM(1);
-    bool useL2gradient = GET_PARAM(2);
-
-    cv::Mat image = readImage("perf/1280x1024.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(image.empty());
-
-    cv::Mat dst;
-
-    cv::Canny(image, dst, 50.0, 100.0, apperture_size, useL2gradient);
-
-    TEST_CYCLE()
-    {
-        cv::Canny(image, dst, 50.0, 100.0, apperture_size, useL2gradient);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Canny, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(AppertureSize(3), AppertureSize(5)),
-    testing::Values(L2gradient(false), L2gradient(true))));
-
-//////////////////////////////////////////////////////////////////////
-// MeanShiftFiltering
-
-GPU_PERF_TEST_1(MeanShiftFiltering, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/meanshift/cones.png");
-    ASSERT_FALSE(img.empty());
-
-    cv::Mat dst;
-
-    cv::pyrMeanShiftFiltering(img, dst, 50, 50);
-
-    declare.time(15.0);
-
-    TEST_CYCLE()
-    {
-        cv::pyrMeanShiftFiltering(img, dst, 50, 50);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, MeanShiftFiltering, ALL_DEVICES);
-
-//////////////////////////////////////////////////////////////////////
-// Convolve
-
-IMPLEMENT_PARAM_CLASS(KSize, int)
-IMPLEMENT_PARAM_CLASS(Ccorr, bool)
-
-GPU_PERF_TEST(Convolve, cv::gpu::DeviceInfo, cv::Size, KSize, Ccorr)
-{
-    cv::Size size = GET_PARAM(1);
-    int templ_size = GET_PARAM(2);
-    bool ccorr = GET_PARAM(3);
-
-    ASSERT_FALSE(ccorr);
-
-    cv::Mat image(size, CV_32FC1);
-    image.setTo(1.0);
-
-    cv::Mat templ(templ_size, templ_size, CV_32FC1);
-    templ.setTo(1.0);
-
-    cv::Mat dst;
-
-    cv::filter2D(image, dst, image.depth(), templ);
-
-    declare.time(10.0);
-
-    TEST_CYCLE()
-    {
-        cv::filter2D(image, dst, image.depth(), templ);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Convolve, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(KSize(3), KSize(9), KSize(17), KSize(27), KSize(32), KSize(64)),
-    testing::Values(Ccorr(false), Ccorr(true))));
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate_8U
-
-CV_ENUM(TemplateMethod, cv::TM_SQDIFF, cv::TM_SQDIFF_NORMED, cv::TM_CCORR, cv::TM_CCORR_NORMED, cv::TM_CCOEFF, cv::TM_CCOEFF_NORMED)
-#define ALL_TEMPLATE_METHODS testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_SQDIFF_NORMED), TemplateMethod(cv::TM_CCORR), TemplateMethod(cv::TM_CCORR_NORMED), TemplateMethod(cv::TM_CCOEFF), TemplateMethod(cv::TM_CCOEFF_NORMED))
-
-IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size)
-
-GPU_PERF_TEST(MatchTemplate_8U, cv::gpu::DeviceInfo, cv::Size, TemplateSize, Channels, TemplateMethod)
-{
-    cv::Size size = GET_PARAM(1);
-    cv::Size templ_size = GET_PARAM(2);
-    int cn = GET_PARAM(3);
-    int method = GET_PARAM(4);
-
-    cv::Mat image(size, CV_MAKE_TYPE(CV_8U, cn));
-    fill(image, 0, 255);
-
-    cv::Mat templ(templ_size, CV_MAKE_TYPE(CV_8U, cn));
-    fill(templ, 0, 255);
-
-    cv::Mat dst;
-
-    cv::matchTemplate(image, templ, dst, method);
-
-    TEST_CYCLE()
-    {
-        cv::matchTemplate(image, templ, dst, method);
-    }
-};
-
-INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplate_8U, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
-    testing::Values(Channels(1), Channels(3), Channels(4)),
-    ALL_TEMPLATE_METHODS));
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate_32F
-
-GPU_PERF_TEST(MatchTemplate_32F, cv::gpu::DeviceInfo, cv::Size, TemplateSize, Channels, TemplateMethod)
-{
-    cv::Size size = GET_PARAM(1);
-    cv::Size templ_size = GET_PARAM(2);
-    int cn = GET_PARAM(3);
-    int method = GET_PARAM(4);
-
-    cv::Mat image(size, CV_MAKE_TYPE(CV_32F, cn));
-    fill(image, 0, 255);
-
-    cv::Mat templ(templ_size, CV_MAKE_TYPE(CV_32F, cn));
-    fill(templ, 0, 255);
-
-    cv::Mat dst;
-
-    cv::matchTemplate(image, templ, dst, method);
-
-    TEST_CYCLE()
-    {
-        cv::matchTemplate(image, templ, dst, method);
-    }
-};
-
-INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplate_32F, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
-    testing::Values(Channels(1), Channels(3), Channels(4)),
-    testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
-
-//////////////////////////////////////////////////////////////////////
-// MulSpectrums
-
-CV_FLAGS(DftFlags, 0, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT)
-
-GPU_PERF_TEST(MulSpectrums, cv::gpu::DeviceInfo, cv::Size, DftFlags)
-{
-    cv::Size size = GET_PARAM(1);
-    int flag = GET_PARAM(2);
-
-    cv::Mat a(size, CV_32FC2);
-    fill(a, 0, 100);
-
-    cv::Mat b(size, CV_32FC2);
-    fill(b, 0, 100);
-
-    cv::Mat dst;
-
-    cv::mulSpectrums(a, b, dst, flag);
-
-    TEST_CYCLE()
-    {
-        cv::mulSpectrums(a, b, dst, flag);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, MulSpectrums, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(DftFlags(0), DftFlags(cv::DFT_ROWS))));
-
-//////////////////////////////////////////////////////////////////////
-// Dft
-
-GPU_PERF_TEST(Dft, cv::gpu::DeviceInfo, cv::Size, DftFlags)
-{
-    cv::Size size = GET_PARAM(1);
-    int flag = GET_PARAM(2);
-
-    cv::Mat src(size, CV_32FC2);
-    fill(src, 0, 100);
-
-    cv::Mat dst;
-
-    cv::dft(src, dst, flag);
-
-    declare.time(10.0);
-
-    TEST_CYCLE()
-    {
-        cv::dft(src, dst, flag);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Dft, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(DftFlags(0), DftFlags(cv::DFT_ROWS), DftFlags(cv::DFT_INVERSE))));
-
-//////////////////////////////////////////////////////////////////////
-// CornerHarris
-
-IMPLEMENT_PARAM_CLASS(BlockSize, int)
-IMPLEMENT_PARAM_CLASS(ApertureSize, int)
-
-GPU_PERF_TEST(CornerHarris, cv::gpu::DeviceInfo, MatType, BorderMode, BlockSize, ApertureSize)
-{
-    int type = GET_PARAM(1);
-    int borderType = GET_PARAM(2);
-    int blockSize = GET_PARAM(3);
-    int apertureSize = GET_PARAM(4);
-
-    cv::Mat img = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    img.convertTo(img, type, type == CV_32F ? 1.0 / 255.0 : 1.0);
-
-    cv::Mat dst;
-
-    double k = 0.5;
-
-    cv::cornerHarris(img, dst, blockSize, apertureSize, k, borderType);
-
-    TEST_CYCLE()
-    {
-        cv::cornerHarris(img, dst, blockSize, apertureSize, k, borderType);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, CornerHarris, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
-    testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_REFLECT)),
-    testing::Values(BlockSize(3), BlockSize(5), BlockSize(7)),
-    testing::Values(ApertureSize(0), ApertureSize(3), ApertureSize(5), ApertureSize(7))));
-
-//////////////////////////////////////////////////////////////////////
-// CornerMinEigenVal
-
-GPU_PERF_TEST(CornerMinEigenVal, cv::gpu::DeviceInfo, MatType, BorderMode, BlockSize, ApertureSize)
-{
-    int type = GET_PARAM(1);
-    int borderType = GET_PARAM(2);
-    int blockSize = GET_PARAM(3);
-    int apertureSize = GET_PARAM(4);
-
-    cv::Mat img = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    img.convertTo(img, type, type == CV_32F ? 1.0 / 255.0 : 1.0);
-
-    cv::Mat dst;
-
-    cv::cornerMinEigenVal(img, dst, blockSize, apertureSize, borderType);
-
-    TEST_CYCLE()
-    {
-        cv::cornerMinEigenVal(img, dst, blockSize, apertureSize, borderType);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, CornerMinEigenVal, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
-    testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_REFLECT)),
-    testing::Values(BlockSize(3), BlockSize(5), BlockSize(7)),
-    testing::Values(ApertureSize(0), ApertureSize(3), ApertureSize(5), ApertureSize(7))));
-
-//////////////////////////////////////////////////////////////////////
-// PyrDown
-
-GPU_PERF_TEST(PyrDown, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::pyrDown(src, dst);
-
-    TEST_CYCLE()
-    {
-        cv::pyrDown(src, dst);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, PyrDown, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4))));
-
-//////////////////////////////////////////////////////////////////////
-// PyrUp
-
-GPU_PERF_TEST(PyrUp, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::pyrUp(src, dst);
-
-    TEST_CYCLE()
-    {
-        cv::pyrUp(src, dst);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, PyrUp, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4))));
-
-//////////////////////////////////////////////////////////////////////
-// CvtColor
-
-GPU_PERF_TEST(CvtColor, cv::gpu::DeviceInfo, cv::Size, MatDepth, CvtColorInfo)
-{
-    cv::Size size = GET_PARAM(1);
-    int depth = GET_PARAM(2);
-    CvtColorInfo info = GET_PARAM(3);
-
-    cv::Mat src(size, CV_MAKETYPE(depth, info.scn));
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::cvtColor(src, dst, info.code, info.dcn);
-
-    TEST_CYCLE()
-    {
-        cv::cvtColor(src, dst, info.code, info.dcn);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F)),
-    testing::Values(CvtColorInfo(4, 4, cv::COLOR_RGBA2BGRA),
-                    CvtColorInfo(4, 1, cv::COLOR_BGRA2GRAY),
-                    CvtColorInfo(1, 4, cv::COLOR_GRAY2BGRA),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2XYZ),
-                    CvtColorInfo(3, 3, cv::COLOR_XYZ2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2YCrCb),
-                    CvtColorInfo(3, 3, cv::COLOR_YCrCb2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2YUV),
-                    CvtColorInfo(3, 3, cv::COLOR_YUV2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2HSV),
-                    CvtColorInfo(3, 3, cv::COLOR_HSV2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2HLS),
-                    CvtColorInfo(3, 3, cv::COLOR_HLS2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2Lab),
-                    CvtColorInfo(3, 3, cv::COLOR_RGB2Lab),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2Luv),
-                    CvtColorInfo(3, 3, cv::COLOR_RGB2Luv),
-                    CvtColorInfo(3, 3, cv::COLOR_Lab2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_Lab2RGB),
-                    CvtColorInfo(3, 3, cv::COLOR_Luv2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_Luv2RGB),
-                    CvtColorInfo(1, 3, cv::COLOR_BayerBG2BGR),
-                    CvtColorInfo(1, 3, cv::COLOR_BayerGB2BGR),
-                    CvtColorInfo(1, 3, cv::COLOR_BayerRG2BGR),
-                    CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR),
-                    CvtColorInfo(4, 4, cv::COLOR_RGBA2mRGBA))));
-
-//////////////////////////////////////////////////////////////////////
-// HoughLines
-
-IMPLEMENT_PARAM_CLASS(DoSort, bool)
-
-GPU_PERF_TEST(HoughLines, cv::gpu::DeviceInfo, cv::Size, DoSort)
-{
-    declare.time(30.0);
-
-    const cv::Size size = GET_PARAM(1);
-
-    const float rho = 1.0f;
-    const float theta = CV_PI / 180.0f;
-    const int threshold = 300;
-
-    cv::RNG rng(123456789);
-
-    cv::Mat src(size, CV_8UC1, cv::Scalar::all(0));
-
-    const int numLines = rng.uniform(500, 2000);
-    for (int i = 0; i < numLines; ++i)
-    {
-        cv::Point p1(rng.uniform(0, src.cols), rng.uniform(0, src.rows));
-        cv::Point p2(rng.uniform(0, src.cols), rng.uniform(0, src.rows));
-        cv::line(src, p1, p2, cv::Scalar::all(255), 2);
-    }
-
-    std::vector<cv::Vec2f> lines;
-    cv::HoughLines(src, lines, rho, theta, threshold);
-
-    TEST_CYCLE()
-    {
-        cv::HoughLines(src, lines, rho, theta, threshold);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, HoughLines, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(DoSort(false), DoSort(true))));
-
-#endif
--- a/modules/gpu/perf_cpu/perf_labeling.cpp
+++ b/modules/gpu/perf_cpu/perf_labeling.cpp
@ -1,158 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-//                          License Agreement
-//               For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//  * Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimer.
-//
-//  * Redistributions in binary form must reproduce the above copyright notice,
-//    this list of conditions and the following disclaimer in the documentation
-//    and/or other materials provided with the distribution.
-//
-//  * The name of the copyright holders may not be used to endorse or promote products
-//    derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//M*/
-
-#include "perf_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-namespace {
-
-    struct GreedyLabeling
-    {
-        struct dot
-        {
-            int x;
-            int y;
-
-            static dot make(int i, int j)
-            {
-                dot d; d.x = i; d.y = j;
-                return d;
-            }
-        };
-
-        struct InInterval
-        {
-            InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {};
-            const int lo, hi;
-
-            bool operator() (const unsigned char a, const unsigned char b) const
-            {
-                int d = a - b;
-                return lo <= d && d <= hi;
-            }
-        };
-
-        GreedyLabeling(cv::Mat img)
-        : image(img), _labels(image.size(), CV_32SC1, cv::Scalar::all(-1)) {stack = new dot[image.cols * image.rows];}
-
-        ~GreedyLabeling(){delete[] stack;}
-
-        void operator() (cv::Mat labels) const
-        {
-            labels.setTo(cv::Scalar::all(-1));
-            InInterval inInt(0, 2);
-            int cc = -1;
-
-            int* dist_labels = (int*)labels.data;
-            int pitch = labels.step1();
-
-            unsigned char* source = (unsigned char*)image.data;
-            int width = image.cols;
-            int height = image.rows;
-
-            for (int j = 0; j < image.rows; ++j)
-                for (int i = 0; i < image.cols; ++i)
-                {
-                    if (dist_labels[j * pitch + i] != -1) continue;
-
-                    dot* top = stack;
-                    dot p = dot::make(i, j);
-                    cc++;
-
-                    dist_labels[j * pitch + i] = cc;
-
-                    while (top >= stack)
-                    {
-                        int*  dl = &dist_labels[p.y * pitch + p.x];
-                        unsigned char* sp = &source[p.y * image.step1() + p.x];
-
-                        dl[0] = cc;
-
-                        //right
-                        if( p.x < (width - 1) && dl[ +1] == -1 && inInt(sp[0], sp[+1]))
-                            *top++ = dot::make(p.x + 1, p.y);
-
-                        //left
-                        if( p.x > 0 && dl[-1] == -1 && inInt(sp[0], sp[-1]))
-                            *top++ = dot::make(p.x - 1, p.y);
-
-                        //bottom
-                        if( p.y < (height - 1) && dl[+pitch] == -1 && inInt(sp[0], sp[+image.step1()]))
-                            *top++ = dot::make(p.x, p.y + 1);
-
-                        //top
-                        if( p.y > 0 && dl[-pitch] == -1 && inInt(sp[0], sp[-image.step1()]))
-                            *top++ = dot::make(p.x, p.y - 1);
-
-                        p = *--top;
-                    }
-                }
-        }
-
-        cv::Mat image;
-        cv::Mat _labels;
-        dot* stack;
-    };
-}
-
-GPU_PERF_TEST(ConnectedComponents, cv::gpu::DeviceInfo, cv::Size)
-{
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
-
-    cv::Mat image = readImage("gpu/labeling/aloe-disp.png", cv::IMREAD_GRAYSCALE);
-
-    GreedyLabeling host(image);
-
-    host(host._labels);
-
-    declare.time(1.0);
-
-    TEST_CYCLE()
-    {
-        host(host._labels);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Labeling, ConnectedComponents, testing::Combine(ALL_DEVICES, testing::Values(cv::Size(261, 262))));
-
-#endif
--- a/modules/gpu/perf_cpu/perf_main.cpp
+++ b/modules/gpu/perf_cpu/perf_main.cpp
@ -1,20 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-int main(int argc, char **argv)
-{
-    testing::InitGoogleTest(&argc, argv);
-    perf::TestBase::Init(argc, argv);
-    return RUN_ALL_TESTS();
-}
-
-#else
-
-int main()
-{
-    printf("OpenCV was built without CUDA support\n");
-    return 0;
-}
-
-#endif
--- a/modules/gpu/perf_cpu/perf_matop.cpp
+++ b/modules/gpu/perf_cpu/perf_matop.cpp
@ -1,124 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-//////////////////////////////////////////////////////////////////////
-// SetTo
-
-GPU_PERF_TEST(SetTo, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    cv::Scalar val(1, 2, 3, 4);
-
-    src.setTo(val);
-
-    TEST_CYCLE()
-    {
-        src.setTo(val);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(MatOp, SetTo, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
-                    MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
-
-//////////////////////////////////////////////////////////////////////
-// SetToMasked
-
-GPU_PERF_TEST(SetToMasked, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat mask(size, CV_8UC1);
-    fill(mask, 0, 2);
-
-    cv::Scalar val(1, 2, 3, 4);
-
-    src.setTo(val, mask);
-
-    TEST_CYCLE()
-    {
-        src.setTo(val, mask);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(MatOp, SetToMasked, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
-                    MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
-
-//////////////////////////////////////////////////////////////////////
-// CopyToMasked
-
-GPU_PERF_TEST(CopyToMasked, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat mask(size, CV_8UC1);
-    fill(mask, 0, 2);
-
-    cv::Mat dst;
-
-    src.copyTo(dst, mask);
-
-    TEST_CYCLE()
-    {
-        src.copyTo(dst, mask);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(MatOp, CopyToMasked, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
-                    MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
-
-//////////////////////////////////////////////////////////////////////
-// ConvertTo
-
-GPU_PERF_TEST(ConvertTo, cv::gpu::DeviceInfo, cv::Size, MatDepth, MatDepth)
-{
-    cv::Size size = GET_PARAM(1);
-    int depth1 = GET_PARAM(2);
-    int depth2 = GET_PARAM(3);
-
-    cv::Mat src(size, depth1);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    src.convertTo(dst, depth2, 0.5, 1.0);
-
-    TEST_CYCLE()
-    {
-        src.convertTo(dst, depth2, 0.5, 1.0);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(MatOp, ConvertTo, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F)),
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F))));
-
-#endif
--- a/modules/gpu/perf_cpu/perf_objdetect.cpp
+++ b/modules/gpu/perf_cpu/perf_objdetect.cpp
@ -1,74 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-///////////////////////////////////////////////////////////////
-// HOG
-
-GPU_PERF_TEST_1(HOG, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/hog/road.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    std::vector<cv::Rect> found_locations;
-
-    cv::HOGDescriptor hog;
-    hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
-
-    hog.detectMultiScale(img, found_locations);
-
-    TEST_CYCLE()
-    {
-        hog.detectMultiScale(img, found_locations);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ObjDetect, HOG, ALL_DEVICES);
-
-///////////////////////////////////////////////////////////////
-// HaarClassifier
-
-GPU_PERF_TEST_1(HaarClassifier, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/haarcascade/group_1_640x480_VGA.pgm", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    cv::CascadeClassifier cascade;
-
-    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/perf/haarcascade_frontalface_alt.xml")));
-
-    std::vector<cv::Rect> rects;
-
-    cascade.detectMultiScale(img, rects);
-
-    TEST_CYCLE()
-    {
-        cascade.detectMultiScale(img, rects);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ObjDetect, HaarClassifier, ALL_DEVICES);
-
-//===================== LBP cascade ==========================//
-GPU_PERF_TEST_1(LBPClassifier, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/haarcascade/group_1_640x480_VGA.pgm", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    cv::CascadeClassifier cascade;
-
-    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
-
-    std::vector<cv::Rect> rects;
-
-    cascade.detectMultiScale(img, rects);
-
-    TEST_CYCLE()
-    {
-        cascade.detectMultiScale(img, rects);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ObjDetect, LBPClassifier, ALL_DEVICES);
-
-#endif
--- a/modules/gpu/perf_cpu/perf_utility.cpp
+++ b/modules/gpu/perf_cpu/perf_utility.cpp
@ -1,220 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-void fill(Mat& m, double a, double b)
-{
-    RNG rng(123456789);
-    rng.fill(m, RNG::UNIFORM, a, b);
-}
-
-void PrintTo(const CvtColorInfo& info, ostream* os)
-{
-    static const char* str[] =
-    {
-        "BGR2BGRA",
-        "BGRA2BGR",
-        "BGR2RGBA",
-        "RGBA2BGR",
-        "BGR2RGB",
-        "BGRA2RGBA",
-
-        "BGR2GRAY",
-        "RGB2GRAY",
-        "GRAY2BGR",
-        "GRAY2BGRA",
-        "BGRA2GRAY",
-        "RGBA2GRAY",
-
-        "BGR2BGR565",
-        "RGB2BGR565",
-        "BGR5652BGR",
-        "BGR5652RGB",
-        "BGRA2BGR565",
-        "RGBA2BGR565",
-        "BGR5652BGRA",
-        "BGR5652RGBA",
-
-        "GRAY2BGR565",
-        "BGR5652GRAY",
-
-        "BGR2BGR555",
-        "RGB2BGR555",
-        "BGR5552BGR",
-        "BGR5552RGB",
-        "BGRA2BGR555",
-        "RGBA2BGR555",
-        "BGR5552BGRA",
-        "BGR5552RGBA",
-
-        "GRAY2BGR555",
-        "BGR5552GRAY",
-
-        "BGR2XYZ",
-        "RGB2XYZ",
-        "XYZ2BGR",
-        "XYZ2RGB",
-
-        "BGR2YCrCb",
-        "RGB2YCrCb",
-        "YCrCb2BGR",
-        "YCrCb2RGB",
-
-        "BGR2HSV",
-        "RGB2HSV",
-
-        "",
-        "",
-
-        "BGR2Lab",
-        "RGB2Lab",
-
-        "BayerBG2BGR",
-        "BayerGB2BGR",
-        "BayerRG2BGR",
-        "BayerGR2BGR",
-
-        "BGR2Luv",
-        "RGB2Luv",
-
-        "BGR2HLS",
-        "RGB2HLS",
-
-        "HSV2BGR",
-        "HSV2RGB",
-
-        "Lab2BGR",
-        "Lab2RGB",
-        "Luv2BGR",
-        "Luv2RGB",
-
-        "HLS2BGR",
-        "HLS2RGB",
-
-        "BayerBG2BGR_VNG",
-        "BayerGB2BGR_VNG",
-        "BayerRG2BGR_VNG",
-        "BayerGR2BGR_VNG",
-
-        "BGR2HSV_FULL",
-        "RGB2HSV_FULL",
-        "BGR2HLS_FULL",
-        "RGB2HLS_FULL",
-
-        "HSV2BGR_FULL",
-        "HSV2RGB_FULL",
-        "HLS2BGR_FULL",
-        "HLS2RGB_FULL",
-
-        "LBGR2Lab",
-        "LRGB2Lab",
-        "LBGR2Luv",
-        "LRGB2Luv",
-
-        "Lab2LBGR",
-        "Lab2LRGB",
-        "Luv2LBGR",
-        "Luv2LRGB",
-
-        "BGR2YUV",
-        "RGB2YUV",
-        "YUV2BGR",
-        "YUV2RGB",
-
-        "BayerBG2GRAY",
-        "BayerGB2GRAY",
-        "BayerRG2GRAY",
-        "BayerGR2GRAY",
-
-        //YUV 4:2:0 formats family
-        "YUV2RGB_NV12",
-        "YUV2BGR_NV12",
-        "YUV2RGB_NV21",
-        "YUV2BGR_NV21",
-
-        "YUV2RGBA_NV12",
-        "YUV2BGRA_NV12",
-        "YUV2RGBA_NV21",
-        "YUV2BGRA_NV21",
-
-        "YUV2RGB_YV12",
-        "YUV2BGR_YV12",
-        "YUV2RGB_IYUV",
-        "YUV2BGR_IYUV",
-
-        "YUV2RGBA_YV12",
-        "YUV2BGRA_YV12",
-        "YUV2RGBA_IYUV",
-        "YUV2BGRA_IYUV",
-
-        "YUV2GRAY_420",
-
-        //YUV 4:2:2 formats family
-        "YUV2RGB_UYVY",
-        "YUV2BGR_UYVY",
-        "YUV2RGB_VYUY",
-        "YUV2BGR_VYUY",
-
-        "YUV2RGBA_UYVY",
-        "YUV2BGRA_UYVY",
-        "YUV2RGBA_VYUY",
-        "YUV2BGRA_VYUY",
-
-        "YUV2RGB_YUY2",
-        "YUV2BGR_YUY2",
-        "YUV2RGB_YVYU",
-        "YUV2BGR_YVYU",
-
-        "YUV2RGBA_YUY2",
-        "YUV2BGRA_YUY2",
-        "YUV2RGBA_YVYU",
-        "YUV2BGRA_YVYU",
-
-        "YUV2GRAY_UYVY",
-        "YUV2GRAY_YUY2",
-
-        // alpha premultiplication
-        "RGBA2mRGBA",
-        "mRGBA2RGBA",
-
-        "COLORCVT_MAX"
-    };
-
-    *os << str[info.code];
-}
-
-void cv::gpu::PrintTo(const DeviceInfo& info, ostream* os)
-{
-    *os << info.name();
-}
-
-Mat readImage(const string& fileName, int flags)
-{
-    return imread(perf::TestBase::getDataPath(fileName), flags);
-}
-
-const vector<DeviceInfo>& devices()
-{
-    static vector<DeviceInfo> devs;
-    static bool first = true;
-
-    if (first)
-    {
-        int deviceCount = getCudaEnabledDeviceCount();
-
-        devs.reserve(deviceCount);
-
-        for (int i = 0; i < deviceCount; ++i)
-        {
-            DeviceInfo info(i);
-            if (info.isCompatible())
-                devs.push_back(info);
-        }
-
-        first = false;
-    }
-
-    return devs;
-}
--- a/modules/gpu/perf_cpu/perf_utility.hpp
+++ b/modules/gpu/perf_cpu/perf_utility.hpp
@ -1,77 +0,0 @@
-#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__
-#define __OPENCV_PERF_GPU_UTILITY_HPP__
-
-void fill(cv::Mat& m, double a, double b);
-
-using perf::MatType;
-using perf::MatDepth;
-
-CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
-        CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
-CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING)
-
-struct CvtColorInfo
-{
-    int scn;
-    int dcn;
-    int code;
-
-    explicit CvtColorInfo(int scn_=0, int dcn_=0, int code_=0) : scn(scn_), dcn(dcn_), code(code_) {}
-};
-
-void PrintTo(const CvtColorInfo& info, std::ostream* os);
-
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-    class name \
-    { \
-    public: \
-        name ( type arg = type ()) : val_(arg) {} \
-        operator type () const {return val_;} \
-    private: \
-        type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) \
-    { \
-        *os << #name <<  " = " << testing::PrintToString(static_cast< type >(param)); \
-    }
-
-IMPLEMENT_PARAM_CLASS(Channels, int)
-
-namespace cv { namespace gpu
-{
-    void PrintTo(const cv::gpu::DeviceInfo& info, std::ostream* os);
-}}
-
-#define GPU_PERF_TEST(name, ...) \
-    struct name : perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > \
-    { \
-    public: \
-        name() {} \
-    protected: \
-        void PerfTestBody(); \
-    }; \
-    TEST_P(name, perf){ RunPerfTestBody(); } \
-    void name :: PerfTestBody()
-
-#define GPU_PERF_TEST_1(name, param_type) \
-    struct name : perf::TestBaseWithParam< param_type > \
-    { \
-    public: \
-        name() {} \
-    protected: \
-        void PerfTestBody(); \
-    }; \
-    TEST_P(name, perf){ RunPerfTestBody(); } \
-    void name :: PerfTestBody()
-
-#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::szSXGA, perf::sz1080p, cv::Size(1800, 1500))
-
-cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
-
-const std::vector<cv::gpu::DeviceInfo>& devices();
-
-#define ALL_DEVICES testing::ValuesIn(devices())
-
-#define GET_PARAM(k) std::tr1::get< k >(GetParam())
-
-#endif // __OPENCV_PERF_GPU_UTILITY_HPP__
--- a/modules/gpu/perf_cpu/perf_video.cpp
+++ b/modules/gpu/perf_cpu/perf_video.cpp
@ -1,466 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-//////////////////////////////////////////////////////
-// GoodFeaturesToTrack
-
-IMPLEMENT_PARAM_CLASS(MinDistance, double)
-
-GPU_PERF_TEST(GoodFeaturesToTrack, cv::gpu::DeviceInfo, MinDistance)
-{
-    double minDistance = GET_PARAM(1);
-
-    cv::Mat image = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(image.empty());
-
-    cv::Mat corners;
-
-    cv::goodFeaturesToTrack(image, corners, 8000, 0.01, minDistance);
-
-    TEST_CYCLE()
-    {
-        cv::goodFeaturesToTrack(image, corners, 8000, 0.01, minDistance);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, GoodFeaturesToTrack, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(MinDistance(0.0), MinDistance(3.0))));
-
-//////////////////////////////////////////////////////
-// PyrLKOpticalFlowSparse
-
-IMPLEMENT_PARAM_CLASS(GraySource, bool)
-IMPLEMENT_PARAM_CLASS(Points, int)
-IMPLEMENT_PARAM_CLASS(WinSize, int)
-IMPLEMENT_PARAM_CLASS(Levels, int)
-IMPLEMENT_PARAM_CLASS(Iters, int)
-
-GPU_PERF_TEST(PyrLKOpticalFlowSparse, cv::gpu::DeviceInfo, GraySource, Points, WinSize, Levels, Iters)
-{
-    bool useGray = GET_PARAM(1);
-    int points = GET_PARAM(2);
-    int win_size = GET_PARAM(3);
-    int levels = GET_PARAM(4);
-    int iters = GET_PARAM(5);
-
-    cv::Mat frame0 = readImage("gpu/opticalflow/frame0.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
-    ASSERT_FALSE(frame0.empty());
-
-    cv::Mat frame1 = readImage("gpu/opticalflow/frame1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
-    ASSERT_FALSE(frame1.empty());
-
-    cv::Mat gray_frame;
-    if (useGray)
-        gray_frame = frame0;
-    else
-        cv::cvtColor(frame0, gray_frame, cv::COLOR_BGR2GRAY);
-
-    cv::Mat pts;
-    cv::goodFeaturesToTrack(gray_frame, pts, points, 0.01, 0.0);
-
-    cv::Mat nextPts;
-    cv::Mat status;
-
-    cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, cv::noArray(),
-                             cv::Size(win_size, win_size), levels - 1,
-                             cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, iters, 0.01));
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, cv::noArray(),
-                                 cv::Size(win_size, win_size), levels - 1,
-                                 cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, iters, 0.01));
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlowSparse, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(GraySource(true), GraySource(false)),
-    testing::Values(Points(1000), Points(2000), Points(4000), Points(8000)),
-    testing::Values(WinSize(9), WinSize(13), WinSize(17), WinSize(21)),
-    testing::Values(Levels(1), Levels(2), Levels(3)),
-    testing::Values(Iters(1), Iters(10), Iters(30))));
-
-//////////////////////////////////////////////////////
-// FarnebackOpticalFlowTest
-
-GPU_PERF_TEST_1(FarnebackOpticalFlowTest, cv::gpu::DeviceInfo)
-{
-    cv::Mat frame0 = readImage("gpu/opticalflow/frame0.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    cv::Mat frame1 = readImage("gpu/opticalflow/frame1.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    cv::Mat flow;
-
-    int numLevels = 5;
-    double pyrScale = 0.5;
-    int winSize = 13;
-    int numIters = 10;
-    int polyN = 5;
-    double polySigma = 1.1;
-    int flags = 0;
-
-    cv::calcOpticalFlowFarneback(frame0, frame1, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags);
-
-    declare.time(10);
-
-    TEST_CYCLE()
-    {
-        cv::calcOpticalFlowFarneback(frame0, frame1, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, FarnebackOpticalFlowTest, ALL_DEVICES);
-
-//////////////////////////////////////////////////////
-// FGDStatModel
-
-namespace cv
-{
-    template<> void Ptr<CvBGStatModel>::delete_obj()
-    {
-        cvReleaseBGStatModel(&obj);
-    }
-}
-
-GPU_PERF_TEST(FGDStatModel, cv::gpu::DeviceInfo, std::string)
-{
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-    cap >> frame;
-    ASSERT_FALSE(frame.empty());
-
-    IplImage ipl_frame = frame;
-    cv::Ptr<CvBGStatModel> model(cvCreateFGDStatModel(&ipl_frame));
-
-    declare.time(60);
-
-    for (int i = 0; i < 10; ++i)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        ipl_frame = frame;
-
-        startTimer();
-        next();
-
-        cvUpdateBGStatModel(&ipl_frame, model);
-
-        stopTimer();
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, FGDStatModel, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
-
-//////////////////////////////////////////////////////
-// MOG
-
-IMPLEMENT_PARAM_CLASS(LearningRate, double)
-
-GPU_PERF_TEST(MOG, cv::gpu::DeviceInfo, std::string, Channels, LearningRate)
-{
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-    int cn = GET_PARAM(2);
-    double learningRate = GET_PARAM(3);
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-
-    cv::BackgroundSubtractorMOG mog;
-    cv::Mat foreground;
-
-    cap >> frame;
-    ASSERT_FALSE(frame.empty());
-
-    if (cn != 3)
-    {
-        cv::Mat temp;
-        if (cn == 1)
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-        else
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-        cv::swap(temp, frame);
-    }
-
-    mog(frame, foreground, learningRate);
-
-    for (int i = 0; i < 10; ++i)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        if (cn != 3)
-        {
-            cv::Mat temp;
-            if (cn == 1)
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-            else
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-            cv::swap(temp, frame);
-        }
-
-        startTimer(); next();
-        mog(frame, foreground, learningRate);
-        stopTimer();
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, MOG, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi")),
-    testing::Values(Channels(1), Channels(3)/*, Channels(4)*/),
-    testing::Values(LearningRate(0.0), LearningRate(0.01))));
-
-//////////////////////////////////////////////////////
-// MOG2
-
-GPU_PERF_TEST(MOG2_update, cv::gpu::DeviceInfo, std::string, Channels)
-{
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-    int cn = GET_PARAM(2);
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-
-    cv::BackgroundSubtractorMOG2 mog2;
-    cv::Mat foreground;
-
-    cap >> frame;
-    ASSERT_FALSE(frame.empty());
-
-    if (cn != 3)
-    {
-        cv::Mat temp;
-        if (cn == 1)
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-        else
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-        cv::swap(temp, frame);
-    }
-
-    mog2(frame, foreground);
-
-    for (int i = 0; i < 10; ++i)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        if (cn != 3)
-        {
-            cv::Mat temp;
-            if (cn == 1)
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-            else
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-            cv::swap(temp, frame);
-        }
-
-        startTimer(); next();
-        mog2(frame, foreground);
-        stopTimer();
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, MOG2_update, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi")),
-    testing::Values(Channels(1), Channels(3)/*, Channels(4)*/)));
-
-GPU_PERF_TEST(MOG2_getBackgroundImage, cv::gpu::DeviceInfo, std::string, Channels)
-{
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-    int cn = GET_PARAM(2);
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-
-    cv::BackgroundSubtractorMOG2 mog2;
-    cv::Mat foreground;
-
-    for (int i = 0; i < 10; ++i)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        if (cn != 3)
-        {
-            cv::Mat temp;
-            if (cn == 1)
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-            else
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-            cv::swap(temp, frame);
-        }
-
-        mog2(frame, foreground);
-    }
-
-    cv::Mat background;
-    mog2.getBackgroundImage(background);
-
-    TEST_CYCLE()
-    {
-        mog2.getBackgroundImage(background);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, MOG2_getBackgroundImage, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi")),
-    testing::Values(/*Channels(1),*/ Channels(3)/*, Channels(4)*/)));
-
-//////////////////////////////////////////////////////
-// GMG
-
-IMPLEMENT_PARAM_CLASS(MaxFeatures, int)
-
-GPU_PERF_TEST(GMG, cv::gpu::DeviceInfo, std::string, Channels, MaxFeatures)
-{
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-    int cn = GET_PARAM(2);
-    int maxFeatures = GET_PARAM(3);
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-    cap >> frame;
-    ASSERT_FALSE(frame.empty());
-
-    if (cn != 3)
-    {
-        cv::Mat temp;
-        if (cn == 1)
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-        else
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-        cv::swap(temp, frame);
-    }
-
-    cv::Mat fgmask;
-    cv::Mat zeros(frame.size(), CV_8UC1, cv::Scalar::all(0));
-
-    cv::BackgroundSubtractorGMG gmg;
-    gmg.set("maxFeatures", maxFeatures);
-    gmg.initialize(frame.size(), 0.0, 255.0);
-
-    gmg(frame, fgmask);
-
-    for (int i = 0; i < 150; ++i)
-    {
-        cap >> frame;
-        if (frame.empty())
-        {
-            cap.open(inputFile);
-            cap >> frame;
-        }
-
-        if (cn != 3)
-        {
-            cv::Mat temp;
-            if (cn == 1)
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-            else
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-            cv::swap(temp, frame);
-        }
-
-        startTimer(); next();
-        gmg(frame, fgmask);
-        stopTimer();
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, GMG, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi")),
-    testing::Values(Channels(1), Channels(3), Channels(4)),
-    testing::Values(MaxFeatures(20), MaxFeatures(40), MaxFeatures(60))));
-
-//////////////////////////////////////////////////////
-// VideoWriter
-
-#ifdef WIN32
-
-GPU_PERF_TEST(VideoWriter, cv::gpu::DeviceInfo, std::string)
-{
-    const double FPS = 25.0;
-
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-    std::string outputFile = cv::tempfile(".avi");
-
-    cv::VideoCapture reader(inputFile);
-    ASSERT_TRUE( reader.isOpened() );
-
-    cv::VideoWriter writer;
-
-    cv::Mat frame;
-
-    declare.time(30);
-
-    for (int i = 0; i < 10; ++i)
-    {
-        reader >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        if (!writer.isOpened())
-            writer.open(outputFile, CV_FOURCC('X', 'V', 'I', 'D'), FPS, frame.size());
-
-        startTimer(); next();
-        writer.write(frame);
-        stopTimer();
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, VideoWriter, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
-
-#endif // WIN32
-
-//////////////////////////////////////////////////////
-// VideoReader
-
-GPU_PERF_TEST(VideoReader, cv::gpu::DeviceInfo, std::string)
-{
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-
-    cv::VideoCapture reader(inputFile);
-    ASSERT_TRUE( reader.isOpened() );
-
-    cv::Mat frame;
-
-    reader >> frame;
-
-    declare.time(20);
-
-    TEST_CYCLE_N(10)
-    {
-        reader >> frame;
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, VideoReader, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
-
-#endif
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@ -420,16 +420,16 @@ void cv::gpu::BFMatcher_GPU::matchConvert(const Mat& trainIdx, const Mat& imgIdx
    const float* distance_ptr =  distance.ptr<float>();
    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
    {
-        int trainIdx = *trainIdx_ptr;
+        int _trainIdx = *trainIdx_ptr;

-        if (trainIdx == -1)
+        if (_trainIdx == -1)
            continue;

-        int imgIdx = *imgIdx_ptr;
+        int _imgIdx = *imgIdx_ptr;

-        float distance = *distance_ptr;
+        float _distance = *distance_ptr;

-        DMatch m(queryIdx, trainIdx, imgIdx, distance);
+        DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);

        matches.push_back(m);
    }
@ -558,13 +558,13 @@ void cv::gpu::BFMatcher_GPU::knnMatchConvert(const Mat& trainIdx, const Mat& dis

        for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
        {
-            int trainIdx = *trainIdx_ptr;
+            int _trainIdx = *trainIdx_ptr;

-            if (trainIdx != -1)
+            if (_trainIdx != -1)
            {
-                float distance = *distance_ptr;
+                float _distance = *distance_ptr;

-                DMatch m(queryIdx, trainIdx, 0, distance);
+                DMatch m(queryIdx, _trainIdx, 0, _distance);

                curMatches.push_back(m);
            }
@ -680,15 +680,15 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Convert(const Mat& trainIdx, const Mat& im

        for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
        {
-            int trainIdx = *trainIdx_ptr;
+            int _trainIdx = *trainIdx_ptr;

-            if (trainIdx != -1)
+            if (_trainIdx != -1)
            {
-                int imgIdx = *imgIdx_ptr;
+                int _imgIdx = *imgIdx_ptr;

-                float distance = *distance_ptr;
+                float _distance = *distance_ptr;

-                DMatch m(queryIdx, trainIdx, imgIdx, distance);
+                DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);

                curMatches.push_back(m);
            }
@ -868,25 +868,25 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&
        const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
        const float* distance_ptr = distance.ptr<float>(queryIdx);

-        const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
+        const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);

-        if (nMatches == 0)
+        if (nMatched == 0)
        {
            if (!compactResult)
                matches.push_back(vector<DMatch>());
            continue;
        }

-        matches.push_back(vector<DMatch>(nMatches));
+        matches.push_back(vector<DMatch>(nMatched));
        vector<DMatch>& curMatches = matches.back();

-        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)
+        for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++distance_ptr)
        {
-            int trainIdx = *trainIdx_ptr;
+            int _trainIdx = *trainIdx_ptr;

-            float distance = *distance_ptr;
+            float _distance = *distance_ptr;

-            DMatch m(queryIdx, trainIdx, 0, distance);
+            DMatch m(queryIdx, _trainIdx, 0, _distance);

            curMatches[i] = m;
        }
@ -1009,9 +1009,9 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&
        const int* imgIdx_ptr = imgIdx.ptr<int>(queryIdx);
        const float* distance_ptr = distance.ptr<float>(queryIdx);

-        const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
+        const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);

-        if (nMatches == 0)
+        if (nMatched == 0)
        {
            if (!compactResult)
                matches.push_back(vector<DMatch>());
@ -1020,9 +1020,9 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&

        matches.push_back(vector<DMatch>());
        vector<DMatch>& curMatches = matches.back();
-        curMatches.reserve(nMatches);
+        curMatches.reserve(nMatched);

-        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
+        for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
        {
            int _trainIdx = *trainIdx_ptr;
            int _imgIdx = *imgIdx_ptr;
--- a/modules/gpu/src/calib3d.cpp
+++ b/modules/gpu/src/calib3d.cpp
@ -56,14 +56,14 @@ void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat

 #else

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace transform_points 
+    namespace transform_points
    {
        void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
    }

-    namespace project_points 
+    namespace project_points
    {
        void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
    }
@ -154,11 +154,11 @@ namespace
    class TransformHypothesesGenerator
    {
    public:
-        TransformHypothesesGenerator(const Mat& object_, const Mat& image_, const Mat& dist_coef_, 
-                                     const Mat& camera_mat_, int num_points_, int subset_size_, 
+        TransformHypothesesGenerator(const Mat& object_, const Mat& image_, const Mat& dist_coef_,
+                                     const Mat& camera_mat_, int num_points_, int subset_size_,
                                     Mat rot_matrices_, Mat transl_vectors_)
-                : object(&object_), image(&image_), dist_coef(&dist_coef_), camera_mat(&camera_mat_), 
-                  num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_), 
+                : object(&object_), image(&image_), dist_coef(&dist_coef_), camera_mat(&camera_mat_),
+                  num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_),
                  transl_vectors(transl_vectors_) {}

        void operator()(const BlockedRange& range) const
@ -211,9 +211,10 @@ namespace

 void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,
                             const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess,
-                             int num_iters, float max_dist, int min_inlier_count, 
+                             int num_iters, float max_dist, int min_inlier_count,
                             vector<int>* inliers)
 {
+    (void)min_inlier_count;
    CV_Assert(object.rows == 1 && object.cols > 0 && object.type() == CV_32FC3);
    CV_Assert(image.rows == 1 && image.cols > 0 && image.type() == CV_32FC2);
    CV_Assert(object.cols == image.cols);
@ -236,7 +237,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
    Mat transl_vectors(1, num_iters * 3, CV_32F);

    // Generate set of hypotheses using small subsets of the input data
-    TransformHypothesesGenerator body(object, image_normalized, empty_dist_coef, eye_camera_mat, 
+    TransformHypothesesGenerator body(object, image_normalized, empty_dist_coef, eye_camera_mat,
                                      num_points, subset_size, rot_matrices, transl_vectors);
    parallel_for(BlockedRange(0, num_iters), body);

@ -246,7 +247,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
    GpuMat d_hypothesis_scores(1, num_iters, CV_32S);
    solve_pnp_ransac::computeHypothesisScores(
            num_iters, num_points, rot_matrices.ptr<float>(), transl_vectors.ptr<float3>(),
-            d_object.ptr<float3>(), d_image_normalized.ptr<float2>(), max_dist * max_dist, 
+            d_object.ptr<float3>(), d_image_normalized.ptr<float2>(), max_dist * max_dist,
            d_hypothesis_scores.ptr<int>());

    // Find the best hypothesis index
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@ -143,7 +143,7 @@ public:
    }

    unsigned int process(const GpuMat& image, GpuMat& objectsBuf, float scaleFactor, int minNeighbors,
-                      bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size maxObjectSize)
+                      bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size /*maxObjectSize*/)
    {
        CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);

@ -380,12 +380,12 @@ public:
    LbpCascade(){}
    virtual ~LbpCascade(){}

-    virtual unsigned int process(const GpuMat& image, GpuMat& objects, float scaleFactor, int groupThreshold, bool findLargestObject,
-        bool visualizeInPlace, cv::Size minObjectSize, cv::Size maxObjectSize)
+    virtual unsigned int process(const GpuMat& image, GpuMat& objects, float scaleFactor, int groupThreshold, bool /*findLargestObject*/,
+        bool /*visualizeInPlace*/, cv::Size minObjectSize, cv::Size maxObjectSize)
    {
        CV_Assert(scaleFactor > 1 && image.depth() == CV_8U);

-        const int defaultObjSearchNum = 100;
+        // const int defaultObjSearchNum = 100;
        const float grouping_eps = 0.2f;

        if( !objects.empty() && objects.depth() == CV_32S)
--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
@ -56,7 +56,7 @@ namespace cv { namespace gpu { namespace device
        __global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
            PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
        {
-            #if __CUDA_ARCH__ >= 110
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

            extern __shared__ int smem[];

@ -168,7 +168,7 @@ namespace cv { namespace gpu { namespace device
        __global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
            PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
        {
-            #if __CUDA_ARCH__ >= 110
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

            extern __shared__ int smem[];

--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@ -261,7 +261,7 @@ namespace cv { namespace gpu { namespace device

        __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
        {
-            #if __CUDA_ARCH__ >= 120
+            #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)

            __shared__ int smem[18][18];

@ -358,7 +358,7 @@ namespace cv { namespace gpu { namespace device

        __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
        {
-            #if __CUDA_ARCH__ >= 120
+            #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120

            const int stack_size = 512;

--- a/modules/gpu/src/cuda/ccomponetns.cu
+++ b/modules/gpu/src/cuda/ccomponetns.cu
@ -316,7 +316,7 @@ namespace cv { namespace gpu { namespace device
                        }
                    }

-                changed = Emulation::sycthOr(changed);
+                changed = Emulation::syncthreadsOr(changed);

                if (!changed)
                    break;
@ -474,7 +474,7 @@ namespace cv { namespace gpu { namespace device
                        }
                    }
                }
-            } while (Emulation::sycthOr(changed));
+            } while (Emulation::syncthreadsOr(changed));
        }

        __global__ void flatten(const DevMem2D edges, DevMem2Di comps)
--- a/modules/gpu/src/cuda/column_filter.cu
+++ b/modules/gpu/src/cuda/column_filter.cu
@ -64,7 +64,7 @@ namespace cv { namespace gpu { namespace device
        template <int KSIZE, typename T, typename D, typename B>
        __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
        {
-            #if __CUDA_ARCH__ >= 200
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
                const int BLOCK_DIM_X = 16;
                const int BLOCK_DIM_Y = 16;
                const int PATCH_PER_BLOCK = 4;
--- a/modules/gpu/src/cuda/fast.cu
+++ b/modules/gpu/src/cuda/fast.cu
@ -223,7 +223,7 @@ namespace cv { namespace gpu { namespace device
        template <bool calcScore, class Mask>
        __global__ void calcKeypoints(const DevMem2Db img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)
        {
-            #if __CUDA_ARCH__ >= 110
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

            const int j = threadIdx.x + blockIdx.x * blockDim.x + 3;
            const int i = threadIdx.y + blockIdx.y * blockDim.y + 3;
@ -325,7 +325,7 @@ namespace cv { namespace gpu { namespace device

        __global__ void nonmaxSupression(const short2* kpLoc, int count, const DevMem2Di scoreMat, short2* locFinal, float* responseFinal)
        {
-            #if __CUDA_ARCH__ >= 110
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

            const int kpIdx = threadIdx.x + blockIdx.x * blockDim.x;

--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device

    #define MERGE_THREADBLOCK_SIZE 256

-    #define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
+    #define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120))

    namespace hist
    {
--- a/modules/gpu/src/cuda/hough.cu
+++ b/modules/gpu/src/cuda/hough.cu
@ -59,7 +59,7 @@ namespace cv { namespace gpu { namespace device
        {
            __shared__ int s_queues[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_qsize[4];
-            __shared__ int s_start[4];
+            __shared__ int s_globStart[4];

            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -73,9 +73,10 @@ namespace cv { namespace gpu { namespace device
            __syncthreads();

            // fill the queue
+            const uchar* srcRow = src.ptr(y);
            for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
            {
-                if (src(y, xx))
+                if (srcRow[xx])
                {
                    const unsigned int val = (y << 16) | xx;
                    const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
@ -89,36 +90,34 @@ namespace cv { namespace gpu { namespace device
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
-                int total_size = 0;
+                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
-                    s_start[i] = total_size;
-                    total_size += s_qsize[i];
+                    s_globStart[i] = totalSize;
+                    totalSize += s_qsize[i];
                }

                // calculate the offset in the global list
-                const int global_offset = atomicAdd(&g_counter, total_size);
+                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
-                    s_start[i] += global_offset;
+                    s_globStart[i] += globalOffset;
            }

            __syncthreads();

            // copy local queues to global queue
            const int qsize = s_qsize[threadIdx.y];
-            for(int i = threadIdx.x; i < qsize; i += blockDim.x)
-            {
-                const unsigned int val = s_queues[threadIdx.y][i];
-                list[s_start[threadIdx.y] + i] = val;
-            }
+            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
+            for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
+                list[gidx] = s_queues[threadIdx.y][i];
        }

        int buildPointList_gpu(DevMem2Db src, unsigned int* list)
        {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
+            void* counterPtr;
+            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 4);
            const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
@ -130,10 +129,10 @@ namespace cv { namespace gpu { namespace device

            cudaSafeCall( cudaDeviceSynchronize() );

-            int total_count;
-            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+            int totalCount;
+            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

-            return total_count;
+            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
@ -144,24 +143,26 @@ namespace cv { namespace gpu { namespace device
            const int n = blockIdx.x;
            const float ang = n * theta;

-            float sin_ang;
-            float cos_ang;
-            sincosf(ang, &sin_ang, &cos_ang);
+            float sinVal;
+            float cosVal;
+            sincosf(ang, &sinVal, &cosVal);
+            sinVal *= irho;
+            cosVal *= irho;

-            const float tabSin = sin_ang * irho;
-            const float tabCos = cos_ang * irho;
+            const int shift = (numrho - 1) / 2;

+            int* accumRow = accum.ptr(n + 1);
            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
-                const unsigned int qvalue = list[i];
+                const unsigned int val = list[i];

-                const int x = (qvalue & 0x0000FFFF);
-                const int y = (qvalue >> 16) & 0x0000FFFF;
+                const int x = (val & 0xFFFF);
+                const int y = (val >> 16) & 0xFFFF;

-                int r = __float2int_rn(x * tabCos + y * tabSin);
-                r += (numrho - 1) / 2;
+                int r = __float2int_rn(x * cosVal + y * sinVal);
+                r += shift;

-                ::atomicAdd(accum.ptr(n + 1) + r + 1, 1);
+                ::atomicAdd(accumRow + r + 1, 1);
            }
        }

@ -177,30 +178,32 @@ namespace cv { namespace gpu { namespace device
            const int n = blockIdx.x;
            const float ang = n * theta;

-            float sin_ang;
-            float cos_ang;
-            sincosf(ang, &sin_ang, &cos_ang);
+            float sinVal;
+            float cosVal;
+            sincosf(ang, &sinVal, &cosVal);
+            sinVal *= irho;
+            cosVal *= irho;

-            const float tabSin = sin_ang * irho;
-            const float tabCos = cos_ang * irho;
+            const int shift = (numrho - 1) / 2;

            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
-                const unsigned int qvalue = list[i];
+                const unsigned int val = list[i];

-                const int x = (qvalue & 0x0000FFFF);
-                const int y = (qvalue >> 16) & 0x0000FFFF;
+                const int x = (val & 0xFFFF);
+                const int y = (val >> 16) & 0xFFFF;

-                int r = __float2int_rn(x * tabCos + y * tabSin);
-                r += (numrho - 1) / 2;
+                int r = __float2int_rn(x * cosVal + y * sinVal);
+                r += shift;

                Emulation::smem::atomicAdd(&smem[r + 1], 1);
            }

            __syncthreads();

-            for (int i = threadIdx.x; i < numrho; i += blockDim.x)
-                accum(n + 1, i) = smem[i];
+            int* accumRow = accum.ptr(n + 1);
+            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
+                accumRow[i] = smem[i];
        }

        void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
@ -225,21 +228,21 @@ namespace cv { namespace gpu { namespace device
        ////////////////////////////////////////////////////////////////////////
        // linesGetResult

-        __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float threshold, const float theta, const float rho, const int numrho)
+        __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const float threshold, const int numrho)
        {
            __shared__ int smem[8][32];

-            int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
-            int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
+            const int x = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
+            const int y = blockIdx.y * (blockDim.y - 2) + threadIdx.y;

-            if (r >= accum.cols || n >= accum.rows)
+            if (x >= accum.cols || y >= accum.rows)
                return;

-            smem[threadIdx.y][threadIdx.x] = accum(n, r);
+            smem[threadIdx.y][threadIdx.x] = accum(y, x);
            __syncthreads();

-            r -= 1;
-            n -= 1;
+            const int r = x - 1;
+            const int n = y - 1;

            if (threadIdx.x == 0 || threadIdx.x == blockDim.x - 1 || threadIdx.y == 0 || threadIdx.y == blockDim.y - 1 || r >= accum.cols - 2 || n >= accum.rows - 2)
                return;
@ -264,32 +267,32 @@ namespace cv { namespace gpu { namespace device

        int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort)
        {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
+            void* counterPtr;
+            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2));

-            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, threshold, theta, rho, accum.cols - 2);
+            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

-            int total_count;
-            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+            int totalCount;
+            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

-            total_count = ::min(total_count, maxSize);
+            totalCount = ::min(totalCount, maxSize);

-            if (doSort && total_count > 0)
+            if (doSort && totalCount > 0)
            {
-                thrust::device_ptr<float2> out_ptr(out);
-                thrust::device_ptr<int> votes_ptr(votes);
-                thrust::sort_by_key(votes_ptr, votes_ptr + total_count, out_ptr, thrust::greater<int>());
+                thrust::device_ptr<float2> outPtr(out);
+                thrust::device_ptr<int> votesPtr(votes);
+                thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
            }

-            return total_count;
+            return totalCount;
        }
    }
 }}}
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@ -0,0 +1,385 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "opencv2/gpu/device/common.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        // Utility function to extract unsigned chars from an unsigned integer.
+        // Splits a 32-bit word into its four bytes: x is the least-significant byte, w the most.
+        // Each byte is isolated with a bitwise mask (&, not logical &&) and shifted down to bits 0..7.
+        __device__ uchar4 int_to_uchar4(unsigned int in)
+        {
+            uchar4 bytes;
+            bytes.x = (in & 0x000000ff) >>  0;
+            bytes.y = (in & 0x0000ff00) >>  8;
+            bytes.z = (in & 0x00ff0000) >> 16;
+            bytes.w = (in & 0xff000000) >> 24;
+            return bytes;
+        }
+
+        // Row-wise inclusive prefix sum: block blockIdx.x processes row blockIdx.x of img.
+        // Each thread loads one uint4 (16 bytes) and the block stores the running sums to
+        // integral as uint4 values (four stores per thread).
+        // The body is compiled only for __CUDA_ARCH__ >= 300; otherwise the kernel is empty.
+        __global__ void shfl_integral_horizontal(const PtrStep_<uint4> img, PtrStep_<uint4> integral)
+        {
+        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+            __shared__ int sums[128];
+
+            const int id = threadIdx.x;
+            const int lane_id = id % warpSize;
+            const int warp_id = id / warpSize;
+
+            const uint4 data = img(blockIdx.x, id);
+
+            const uchar4 a = int_to_uchar4(data.x);
+            const uchar4 b = int_to_uchar4(data.y);
+            const uchar4 c = int_to_uchar4(data.z);
+            const uchar4 d = int_to_uchar4(data.w);
+
+            int result[16];
+
+            result[0]  =              a.x;
+            result[1]  = result[0]  + a.y;
+            result[2]  = result[1]  + a.z;
+            result[3]  = result[2]  + a.w;
+
+            result[4]  = result[3]  + b.x;
+            result[5]  = result[4]  + b.y;
+            result[6]  = result[5]  + b.z;
+            result[7]  = result[6]  + b.w;
+
+            result[8]  = result[7]  + c.x;
+            result[9]  = result[8]  + c.y;
+            result[10] = result[9]  + c.z;
+            result[11] = result[10] + c.w;
+
+            result[12] = result[11] + d.x;
+            result[13] = result[12] + d.y;
+            result[14] = result[13] + d.z;
+            result[15] = result[14] + d.w;
+
+            int sum = result[15];
+
+            // the prefix sum over each thread's 16 values is now computed;
+            // next the per-thread totals (result[15]) need to be shared
+            // with the other threads and added.  To do this,
+            // the __shfl_up() instruction is used and a shuffle scan
+            // operation is performed to distribute the sums to the correct
+            // threads
+            #pragma unroll
+            for (int i = 1; i < 32; i *= 2)
+            {
+                const int n = __shfl_up(sum, i, 32);
+
+                if (lane_id >= i)
+                {
+                    // the inner i shadows the scan offset; the lane_id test above used the outer one
+                    #pragma unroll
+                    for (int i = 0; i < 16; ++i)
+                        result[i] += n;
+
+                    sum += n;
+                }
+            }
+
+            // Now the final sum for the warp must be shared
+            // between warps.  This is done by each warp
+            // having a thread store to shared memory, then
+            // having warp 0 load the values and
+            // compute a prefix sum, again by using __shfl_up.
+            // The results are uniformly added back to the warps.
+            // The last thread in each warp holds the sum of the warp
+            // and places it in shared memory
+            if (threadIdx.x % warpSize == warpSize - 1)
+                sums[warp_id] = result[15];
+
+            __syncthreads();
+
+            if (warp_id == 0)
+            {
+                int warp_sum = sums[lane_id];
+
+                // NOTE(review): the i == 32 step looks like a no-op, since lane_id >= 32
+                // should never hold if warpSize is 32 -- confirm
+                #pragma unroll
+                for (int i = 1; i <= 32; i *= 2)
+                {
+                    const int n = __shfl_up(warp_sum, i, 32);
+
+                    if (lane_id >= i)
+                        warp_sum += n;
+                }
+
+                sums[lane_id] = warp_sum;
+            }
+
+            __syncthreads();
+
+            int blockSum = 0;
+
+            // add the scanned total of the preceding warps (warp 0 has none before it)
+            if (warp_id > 0)
+            {
+                blockSum = sums[warp_id - 1];
+
+                #pragma unroll
+                for (int i = 0; i < 16; ++i)
+                    result[i] += blockSum;
+            }
+
+            // assemble result
+            // Each thread has 16 values to write, which are
+            // now integer data (to avoid overflow).  Instead of
+            // each thread writing consecutive uint4s, the
+            // approach shown here experiments using
+            // the shuffle command to reformat the data
+            // inside the registers so that each thread holds
+            // consecutive data to be written so larger contiguous
+            // segments can be assembled for writing.
+
+            /*
+                For example data that needs to be written as
+
+                GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
+                but is stored in registers (r0..r3), in four threads (0..3) as:
+
+                threadId   0  1  2  3
+                  r0      x0 y0 z0 w0
+                  r1      x1 y1 z1 w1
+                  r2      x2 y2 z2 w2
+                  r3      x3 y3 z3 w3
+
+                  after applying __shfl_xor operations to move data between registers r1..r3:
+
+                threadId  00 01 10 11
+                          x0 y0 z0 w0
+                 xor(01)->y1 x1 w1 z1
+                 xor(10)->z2 w2 x2 y2
+                 xor(11)->w3 z3 y3 x3
+
+                 and now x0..x3, and z0..z3 can be written out in order by all threads.
+
+                 In the current code, each register above is actually representing
+                 four integers to be written as uint4's to GMEM.
+            */
+
+            result[4]  = __shfl_xor(result[4] , 1, 32);
+            result[5]  = __shfl_xor(result[5] , 1, 32);
+            result[6]  = __shfl_xor(result[6] , 1, 32);
+            result[7]  = __shfl_xor(result[7] , 1, 32);
+
+            result[8]  = __shfl_xor(result[8] , 2, 32);
+            result[9]  = __shfl_xor(result[9] , 2, 32);
+            result[10] = __shfl_xor(result[10], 2, 32);
+            result[11] = __shfl_xor(result[11], 2, 32);
+
+            result[12] = __shfl_xor(result[12], 3, 32);
+            result[13] = __shfl_xor(result[13], 3, 32);
+            result[14] = __shfl_xor(result[14], 3, 32);
+            result[15] = __shfl_xor(result[15], 3, 32);
+
+            uint4* integral_row = integral.ptr(blockIdx.x);
+            uint4 output;
+
+            ///////
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
+
+            ///////
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
+
+            // continuing from the above example,
+            // this use of __shfl_xor() places the y0..y3 and w0..w3 data
+            // in order.
+
+            #pragma unroll
+            for (int i = 0; i < 16; ++i)
+                result[i] = __shfl_xor(result[i], 1, 32);
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
+
+            ///////
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
+        #endif
+        }
+
+        // This kernel computes columnwise prefix sums.  When the data input is
+        // the row sums from above, this completes the integral image.
+        // The approach here is to have each block compute a local set of sums.
+        // First, the data covered by the block is loaded into shared memory,
+        // then instead of performing a sum in shared memory using __syncthreads
+        // between stages, the data is reformatted so that the necessary sums
+        // occur inside warps and the shuffle scan operation is used.
+        // The final set of sums from the block is then propagated, with the block
+        // computing "down" the image and adding the running sum to the local
+        // block sums.
+        // The result is written in place over integral.  The body is compiled only for
+        // __CUDA_ARCH__ >= 300; otherwise the kernel is empty.
+        __global__ void shfl_integral_vertical(DevMem2D_<unsigned int> integral)
+        {
+        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+            __shared__ unsigned int sums[32][9];
+
+            const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
+            const int lane_id = tidx % 8;
+
+            // NOTE(review): threads leaving here skip the __syncthreads() and __shfl_up calls
+            // below; presumably callers pad integral.cols to a multiple of blockDim.x -- confirm
+            if (tidx >= integral.cols)
+                return;
+
+            sums[threadIdx.x][threadIdx.y] = 0;
+            __syncthreads();
+
+            // running total of everything above the rows handled in the current iteration
+            unsigned int stepSum = 0;
+
+            for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
+            {
+                unsigned int* p = integral.ptr(y) + tidx;
+
+                unsigned int sum = *p;
+
+                sums[threadIdx.x][threadIdx.y] = sum;
+                __syncthreads();
+
+                // place into SMEM
+                // shfl scan reduce the SMEM, reformatting so the column
+                // sums are computed in a warp
+                // then read out properly
+                const int j = threadIdx.x % 8;
+                const int k = threadIdx.x / 8 + threadIdx.y * 4;
+
+                int partial_sum = sums[k][j];
+
+                // NOTE(review): lane_id is tidx % 8, so the i == 8 step looks like a no-op -- confirm
+                for (int i = 1; i <= 8; i *= 2)
+                {
+                    int n = __shfl_up(partial_sum, i, 32);
+
+                    if (lane_id >= i)
+                        partial_sum += n;
+                }
+
+                sums[k][j] = partial_sum;
+                __syncthreads();
+
+                // add the scanned sum of the rows above this one within the current step
+                if (threadIdx.y > 0)
+                    sum += sums[threadIdx.x][threadIdx.y - 1];
+
+                sum += stepSum;
+                stepSum += sums[threadIdx.x][blockDim.y - 1];
+
+                // barrier before sums[] is overwritten by the next iteration
+                __syncthreads();
+
+                *p = sum;
+            }
+        #endif
+        }
+
+        // Host-side launcher: runs shfl_integral_horizontal over img into integral, then
+        // shfl_integral_vertical in place on integral, both on the given stream.
+        // When stream is 0 the call also waits for the device to finish.
+        void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream)
+        {
+            {
+                // each thread handles 16 values, use 1 block/row
+                // NOTE(review): the division truncates; presumably callers guarantee img.cols is a
+                // multiple of 16 and that img.cols / 16 is a valid block size -- confirm
+                const int block = img.cols / 16;
+
+                // launch 1 block / row
+                const int grid = img.rows;
+
+                cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
+
+                shfl_integral_horizontal<<<grid, block, 0, stream>>>((DevMem2D_<uint4>) img, (DevMem2D_<uint4>) integral);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            {
+                // one row of 32x8 blocks across the columns; each block walks down all rows itself
+                const dim3 block(32, 8);
+                const dim3 grid(divUp(integral.cols, block.x), 1);
+
+                shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    }
+}}}
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@ -279,7 +279,7 @@ namespace cv { namespace gpu { namespace device
                rect.z = __float2int_rn(windowW * scale);
                rect.w = __float2int_rn(windowH * scale);

-                int res = Emulation::smem::atomicInc(classified, (unsigned int)objects.cols);
+                int res = atomicInc(classified, (unsigned int)objects.cols);
                objects(0, res) = rect;
            }
        }
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@ -215,7 +215,7 @@ namespace cv { namespace gpu { namespace device
                    maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
                }

-            #if __CUDA_ARCH__ >= 110
+            #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
 		        __shared__ bool is_last;

 		        if (tid == 0)
@ -535,7 +535,7 @@ namespace cv { namespace gpu { namespace device

                findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);

-            #if __CUDA_ARCH__ >= 110
+            #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
 		        __shared__ bool is_last;

 		        if (tid == 0)
@ -841,7 +841,7 @@ namespace cv { namespace gpu { namespace device

                sumInSmem<nthreads, uint>(scount, tid);

-            #if __CUDA_ARCH__ >= 110
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
 		        __shared__ bool is_last;

 		        if (tid == 0)
@ -1034,7 +1034,7 @@ namespace cv { namespace gpu { namespace device

                sumInSmem<nthreads, R>(smem, tid);

-            #if __CUDA_ARCH__ >= 110
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
                __shared__ bool is_last;

                if (tid == 0)
@ -1115,7 +1115,7 @@ namespace cv { namespace gpu { namespace device
                sumInSmem<nthreads, R>(smem, tid);
                sumInSmem<nthreads, R>(smem + nthreads, tid);

-            #if __CUDA_ARCH__ >= 110
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
                __shared__ bool is_last;

                if (tid == 0)
@ -1222,7 +1222,7 @@ namespace cv { namespace gpu { namespace device
                sumInSmem<nthreads, R>(smem + nthreads, tid);
                sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);

-            #if __CUDA_ARCH__ >= 110
+            #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
                __shared__ bool is_last;

                if (tid == 0)
@ -1339,7 +1339,7 @@ namespace cv { namespace gpu { namespace device
                sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
                sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);

-            #if __CUDA_ARCH__ >= 110
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
                __shared__ bool is_last;

                if (tid == 0)
@ -1975,7 +1975,7 @@ namespace cv { namespace gpu { namespace device
            for (int c = 0; c < cn; ++c)
                myVal[c] = op.startValue();

-        #if __CUDA_ARCH__ >= 200
+        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200

            // For cc >= 2.0 prefer L1 cache
            for (int x = threadIdx.x; x < src.cols; x += 256)
--- a/modules/gpu/src/cuda/pyrlk.cu
+++ b/modules/gpu/src/cuda/pyrlk.cu
@ -82,7 +82,7 @@ namespace cv { namespace gpu { namespace device
            smem3[tid] = val3;
            __syncthreads();

-#if __CUDA_ARCH__ > 110
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
            if (tid < 128)
            {
                smem1[tid] = val1 += smem1[tid + 128];
@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
            smem2[tid] = val2;
            __syncthreads();

-#if __CUDA_ARCH__ > 110
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
            if (tid < 128)
            {
                smem1[tid] = val1 += smem1[tid + 128];
@ -184,7 +184,7 @@ namespace cv { namespace gpu { namespace device
            smem1[tid] = val1;
            __syncthreads();

-#if __CUDA_ARCH__ > 110
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
            if (tid < 128)
            {
                smem1[tid] = val1 += smem1[tid + 128];
@ -271,7 +271,7 @@ namespace cv { namespace gpu { namespace device
        template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
        __global__ void lkSparse(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
        {
-#if __CUDA_ARCH__ <= 110
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ <= 110)
            __shared__ float smem1[128];
            __shared__ float smem2[128];
            __shared__ float smem3[128];
--- a/modules/gpu/src/cuda/row_filter.cu
+++ b/modules/gpu/src/cuda/row_filter.cu
@ -64,7 +64,7 @@ namespace cv { namespace gpu { namespace device
        template <int KSIZE, typename T, typename D, typename B>
        __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
        {
-            #if __CUDA_ARCH__ >= 200
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
                const int BLOCK_DIM_X = 32;
                const int BLOCK_DIM_Y = 8;
                const int PATCH_PER_BLOCK = 4;
--- a/modules/gpu/src/cuda/texture_binder.hpp
+++ b/modules/gpu/src/cuda/texture_binder.hpp
@ -0,0 +1,92 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_GPU_TEXTURE_BINDER_HPP_
+#define OPENCV_GPU_TEXTURE_BINDER_HPP_
+
+#include "opencv2/gpu/devmem2d.hpp"
+#include <safe_call.hpp>
+
+namespace cv
+{
+  namespace gpu
+  {
+    // Scoped guard for a CUDA texture reference: every constructor binds device
+    // memory to the given texture, and the destructor unbinds it again.
+    //
+    // NOTE(review): the class is copyable, so a copy would unbind the same
+    // texture reference a second time - keep instances as plain stack locals.
+    class TextureBinder
+    {
+    public:
+      // Binds the 2D array `arr` to `tex`; the channel format is derived from T.
+      template<class T, enum cudaTextureReadMode readMode>
+      TextureBinder(const PtrStepSz<T>& arr, const struct texture<T, 2, readMode>& tex) : texref(&tex)
+      {
+        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
+        cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) );
+      }
+
+      // Binds the linear array `arr` to the 1D texture `tex`; the bound size in
+      // bytes is arr.size * arr.elemSize().
+      template<class T, enum cudaTextureReadMode readMode>
+      TextureBinder(const PtrSz<T>& arr, const struct texture<T, 1, readMode> &tex) : texref(&tex)
+      {
+        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
+        cudaSafeCall( cudaBindTexture(0, tex, arr.data, desc, arr.size * arr.elemSize()) );
+      }
+
+      // Binds any 2D container exposing data/cols/rows/step to `tex` using the
+      // caller-supplied channel format `desc`.
+      template<class A, class T, enum cudaTextureReadMode readMode>
+      TextureBinder(const A& arr, const struct texture<T, 2, readMode>& tex, const cudaChannelFormatDesc& desc) : texref(&tex)
+      {
+        cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) );
+      }
+
+
+      // Unbinds the texture that was bound by the constructor.
+      // NOTE(review): if cudaSafeCall can throw, this throws from a destructor - confirm.
+      ~TextureBinder()
+      {
+        cudaSafeCall( cudaUnbindTexture(texref) );
+      }
+    private:
+      // Texture reference bound by the constructor; not owned by this object.
+      const struct textureReference *texref;
+    };
+  }
+
+  namespace device
+  {
+      // The binder is declared in cv::gpu; there is no pcl namespace in this
+      // header, so the alias must name cv::gpu::TextureBinder.
+      // NOTE(review): other device code in this library lives in cv::gpu::device,
+      // whereas this alias is injected into cv::device - confirm that is intended.
+      using cv::gpu::TextureBinder;
+  }
+}
+
+#endif /* OPENCV_GPU_TEXTURE_BINDER_HPP_*/
--- a/modules/gpu/src/graphcuts.cpp
+++ b/modules/gpu/src/graphcuts.cpp
@ -48,7 +48,7 @@ void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Gpu
 void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }

 void cv::gpu::connectivityMask(const GpuMat&, GpuMat&, const cv::Scalar&, const cv::Scalar&, Stream&) { throw_nogpu(); }
-void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, int, Stream& stream) { throw_nogpu(); }
+void cv::gpu::labelComponents(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }

 #else /* !defined (HAVE_CUDA) */

--- a/modules/gpu/src/hog.cpp
+++ b/modules/gpu/src/hog.cpp
@ -315,7 +315,7 @@ void cv::gpu::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, vect
  double scale = 1.;
  int levels = 0;

-  for (levels = 0; levels < conf_out.size(); levels++)
+  for (levels = 0; levels < (int)conf_out.size(); levels++)
    {
      scale = conf_out[levels].scale;
      level_scale.push_back(scale);
@ -332,8 +332,8 @@ void cv::gpu::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, vect

  for (size_t i = 0; i < level_scale.size(); i++)
    {
-      double scale = level_scale[i];
-      Size sz(cvRound(img.cols / scale), cvRound(img.rows / scale));
+      double _scale = level_scale[i];
+      Size sz(cvRound(img.cols / _scale), cvRound(img.rows / _scale));
      GpuMat smaller_img;

      if (sz == img.size())
--- a/modules/gpu/src/hough.cpp
+++ b/modules/gpu/src/hough.cpp
@ -57,11 +57,27 @@ namespace cv { namespace gpu { namespace device
    namespace hough
    {
        int buildPointList_gpu(DevMem2Db src, unsigned int* list);
+
        void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
        int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort);
    }
 }}}

+//////////////////////////////////////////////////////////
+// HoughLines
+
+// Convenience overload: uses function-local accumulator and point-list buffers
+// and forwards every argument to the buffered HoughLines overload.
+void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
+{
+    GpuMat accum, buf;
+    HoughLines(src, lines, accum, buf, rho, theta, threshold, doSort, maxLines);
+}
+
+// Buffered overload: runs the two stages back to back with caller-provided
+// storage - HoughLinesTransform fills `accum` (using `buf` as scratch), then
+// HoughLinesGet extracts the result from `accum` into `lines`.
+void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
+{
+    HoughLinesTransform(src, accum, buf, rho, theta);
+    HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines);
+}
+
 void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta)
 {
    using namespace cv::gpu::device::hough;
@ -80,23 +96,23 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf,
    CV_Assert(numangle > 0 && numrho > 0);

    ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum);
-    accum.setTo(cv::Scalar::all(0));
+    accum.setTo(Scalar::all(0));

-    cv::gpu::DeviceInfo devInfo;
+    DeviceInfo devInfo;

    if (count > 0)
-        linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(cv::gpu::FEATURE_SET_COMPUTE_20));
+        linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
 }

 void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
 {
-    using namespace cv::gpu::device;
+    using namespace cv::gpu::device::hough;

    CV_Assert(accum.type() == CV_32SC1);

    ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);

-    int count = hough::linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
+    int count = linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);

    if (count > 0)
        lines.cols = count;
@ -104,18 +120,6 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float
        lines.release();
 }

-void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
-{
-    cv::gpu::GpuMat accum, buf;
-    HoughLines(src, lines, accum, buf, rho, theta, threshold, doSort, maxLines);
-}
-
-void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
-{
-    HoughLinesTransform(src, accum, buf, rho, theta);
-    HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines);
-}
-
 void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, OutputArray h_votes_)
 {
    if (d_lines.empty())
@ -129,14 +133,14 @@ void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, Ou
    CV_Assert(d_lines.rows == 2 && d_lines.type() == CV_32FC2);

    h_lines_.create(1, d_lines.cols, CV_32FC2);
-    cv::Mat h_lines = h_lines_.getMat();
+    Mat h_lines = h_lines_.getMat();
    d_lines.row(0).download(h_lines);

    if (h_votes_.needed())
    {
        h_votes_.create(1, d_lines.cols, CV_32SC1);
-        cv::Mat h_votes = h_votes_.getMat();
-        cv::gpu::GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1)));
+        Mat h_votes = h_votes_.getMat();
+        GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1)));
        d_votes.download(h_votes);
    }
 }
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@ -223,7 +223,7 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyz, const Mat& Q,
    using namespace cv::gpu::device::imgproc;

    typedef void (*func_t)(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
-    static const func_t funcs[2][4] = 
+    static const func_t funcs[2][4] =
    {
        {reprojectImageTo3D_gpu<uchar, float3>, 0, 0, reprojectImageTo3D_gpu<short, float3>},
        {reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>}
@ -261,6 +261,12 @@ namespace
    }
 }

+// Npp32s_a is an Npp32s that may alias other types, so reading a float's bits
+// through it does not violate strict aliasing. The minor-version bound only
+// applies to the GCC 3.x series; later majors (4.0, 5.1, 10.2, ...) always qualify.
+#if defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 4))
+typedef Npp32s __attribute__((__may_alias__)) Npp32s_a;
+#else
+typedef Npp32s Npp32s_a;
+#endif
+
 void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value, Stream& s)
 {
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
@ -308,7 +314,7 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
        case CV_32FC1:
            {
                Npp32f val = saturate_cast<Npp32f>(value[0]);
-                Npp32s nVal = *(reinterpret_cast<Npp32s*>(&val));
+                Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val));
                nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
@ -527,32 +533,86 @@ void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
    integralBuffered(src, sum, buffer, s);
 }

+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream);
+    }
+}}}
+
 void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
 {
    CV_Assert(src.type() == CV_8UC1);
-    if (sum.cols != src.cols + 1 && sum.rows != src.rows + 1)
-        sum.create(src.rows + 1, src.cols + 1, CV_32S);
-
-    NcvSize32u roiSize;
-    roiSize.width = src.cols;
-    roiSize.height = src.rows;
-
-    cudaDeviceProp prop;
-    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
-
-    Ncv32u bufSize;
-    ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
-    ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);

    cudaStream_t stream = StreamAccessor::getStream(s);

-    NppStStreamHandler h(stream);
+    DeviceInfo info;

-    ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
-        sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
+    if (info.supports(WARP_SHUFFLE_FUNCTIONS))
+    {
+        GpuMat src16;

-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (src.cols % 16 == 0)
+            src16 = src;
+        else
+        {
+            ensureSizeIsEnough(src.rows, ((src.cols + 15) / 16) * 16, src.type(), buffer);
+
+            GpuMat inner = buffer(Rect(0, 0, src.cols, src.rows));
+
+            if (s)
+            {
+                s.enqueueMemSet(buffer, Scalar::all(0));
+                s.enqueueCopy(src, inner);
+            }
+            else
+            {
+                buffer.setTo(Scalar::all(0));
+                src.copyTo(inner);
+            }
+
+            src16 = buffer;
+        }
+
+        sum.create(src16.rows + 1, src16.cols + 1, CV_32SC1);
+
+        if (s)
+            s.enqueueMemSet(sum, Scalar::all(0));
+        else
+            sum.setTo(Scalar::all(0));
+
+        GpuMat inner = sum(Rect(1, 1, src16.cols, src16.rows));
+
+        cv::gpu::device::imgproc::shfl_integral_gpu(src16, inner, stream);
+
+        if (src16.cols != src.cols)
+            sum = sum(Rect(0, 0, src.cols + 1, src.rows + 1));
+    }
+    else
+    {
+        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
+
+        NcvSize32u roiSize;
+        roiSize.width = src.cols;
+        roiSize.height = src.rows;
+
+        cudaDeviceProp prop;
+        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+        Ncv32u bufSize;
+        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
+        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
+
+
+        NppStStreamHandler h(stream);
+
+        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
+            sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }

 //////////////////////////////////////////////////////////////////////////////
@ -1334,7 +1394,7 @@ Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size
    int width = (result_size.width + 2) / 3;
    int height = (result_size.height + 2) / 3;
    width = std::min(width, result_size.width);
-    height = std::min(height, result_size.height);    
+    height = std::min(height, result_size.height);
    return Size(width, height);
 }

@ -1374,7 +1434,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,

    cufftHandle planR2C, planC2R;
    cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
-    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));   
+    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));

    cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
    cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
--- a/modules/gpu/src/match_template.cpp
+++ b/modules/gpu/src/match_template.cpp
@ -52,9 +52,9 @@ void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&)

 #else

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace match_template 
+    namespace match_template
    {
        void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
        void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
@ -71,47 +71,47 @@ namespace cv { namespace gpu { namespace device
        void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_8UC2(
            int w, int h,
-            const DevMem2D_<unsigned int> image_sum_r, 
-            const DevMem2D_<unsigned int> image_sum_g, 
+            const DevMem2D_<unsigned int> image_sum_r,
+            const DevMem2D_<unsigned int> image_sum_g,
            unsigned int templ_sum_r,
-            unsigned int templ_sum_g, 
+            unsigned int templ_sum_g,
            DevMem2Df result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_8UC3(
-                int w, int h, 
-                const DevMem2D_<unsigned int> image_sum_r, 
+                int w, int h,
+                const DevMem2D_<unsigned int> image_sum_r,
                const DevMem2D_<unsigned int> image_sum_g,
                const DevMem2D_<unsigned int> image_sum_b,
-                unsigned int templ_sum_r, 
-                unsigned int templ_sum_g, 
-                unsigned int templ_sum_b, 
+                unsigned int templ_sum_r,
+                unsigned int templ_sum_g,
+                unsigned int templ_sum_b,
                DevMem2Df result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_8UC4(
-                int w, int h, 
-                const DevMem2D_<unsigned int> image_sum_r, 
+                int w, int h,
+                const DevMem2D_<unsigned int> image_sum_r,
                const DevMem2D_<unsigned int> image_sum_g,
                const DevMem2D_<unsigned int> image_sum_b,
                const DevMem2D_<unsigned int> image_sum_a,
-                unsigned int templ_sum_r, 
-                unsigned int templ_sum_g, 
-                unsigned int templ_sum_b, 
-                unsigned int templ_sum_a, 
+                unsigned int templ_sum_r,
+                unsigned int templ_sum_g,
+                unsigned int templ_sum_b,
+                unsigned int templ_sum_a,
                DevMem2Df result, cudaStream_t stream);


        void matchTemplatePrepared_CCOFF_NORMED_8U(
-                int w, int h, const DevMem2D_<unsigned int> image_sum, 
+                int w, int h, const DevMem2D_<unsigned int> image_sum,
                const DevMem2D_<unsigned long long> image_sqsum,
                unsigned int templ_sum, unsigned long long templ_sqsum,
                DevMem2Df result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
-                int w, int h, 
+                int w, int h,
                const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
                const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                DevMem2Df result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
-                int w, int h, 
+                int w, int h,
                const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
                const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
                const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
@ -120,7 +120,7 @@ namespace cv { namespace gpu { namespace device
                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                DevMem2Df result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
-                int w, int h, 
+                int w, int h,
                const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
                const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
                const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
@ -131,7 +131,7 @@ namespace cv { namespace gpu { namespace device
                unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
                DevMem2Df result, cudaStream_t stream);

-        void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, 
+        void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
                          unsigned long long templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream);

        void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream);
@ -140,17 +140,17 @@ namespace cv { namespace gpu { namespace device

 using namespace ::cv::gpu::device::match_template;

-namespace 
+namespace
 {

-    // Evaluates optimal template's area threshold. If 
-    // template's area is less  than the threshold, we use naive match 
+    // Evaluates optimal template's area threshold. If
+    // template's area is less  than the threshold, we use naive match
    // template version, otherwise FFT-based (if available)
    int getTemplateThreshold(int method, int depth)
    {
        switch (method)
        {
-        case CV_TM_CCORR: 
+        case CV_TM_CCORR:
            if (depth == CV_32F) return 250;
            if (depth == CV_8U) return 300;
            break;
@ -162,10 +162,10 @@ namespace
        return 0;
    }

-    
+
    void matchTemplate_CCORR_32F(
            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
-    {        
+    {
        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
        if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_32F))
        {
@ -223,10 +223,11 @@ namespace
        normalize_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
    }

-    
+
    void matchTemplate_SQDIFF_32F(
            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
    {
+        (void)buf;
        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
        matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
    }
@ -362,7 +363,7 @@ namespace
            {
            case 2:
                matchTemplatePrepared_CCOFF_NORMED_8UC2(
-                        templ.cols, templ.rows, 
+                        templ.cols, templ.rows,
                        buf.image_sums[0], buf.image_sqsums[0],
                        buf.image_sums[1], buf.image_sqsums[1],
                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
@ -371,7 +372,7 @@ namespace
                break;
            case 3:
                matchTemplatePrepared_CCOFF_NORMED_8UC3(
-                        templ.cols, templ.rows, 
+                        templ.cols, templ.rows,
                        buf.image_sums[0], buf.image_sqsums[0],
                        buf.image_sums[1], buf.image_sqsums[1],
                        buf.image_sums[2], buf.image_sqsums[2],
@ -382,7 +383,7 @@ namespace
                break;
            case 4:
                matchTemplatePrepared_CCOFF_NORMED_8UC4(
-                        templ.cols, templ.rows, 
+                        templ.cols, templ.rows,
                        buf.image_sums[0], buf.image_sqsums[0],
                        buf.image_sums[1], buf.image_sqsums[1],
                        buf.image_sums[2], buf.image_sqsums[2],
@ -391,7 +392,7 @@ namespace
                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
                        (unsigned int)templ_sum[3], (unsigned long long)templ_sqsum[3],
-                        result, StreamAccessor::getStream(stream));                
+                        result, StreamAccessor::getStream(stream));
                break;
            default:
                CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
@ -67,7 +67,11 @@
 // Guaranteed size cross-platform classifier structures
 //
 //==============================================================================
-
+// Ncv32f_a is an Ncv32f that may alias other types; the classifier structures
+// below use it to read and write float values stored in unsigned-int fields.
+// The minor-version bound only applies to the GCC 3.x series; later majors
+// (4.0, 5.1, 10.2, ...) always qualify.
+#if defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 4))
+typedef Ncv32f __attribute__((__may_alias__)) Ncv32f_a;
+#else
+typedef Ncv32f Ncv32f_a;
+#endif

 struct HaarFeature64
 {
@ -87,7 +91,7 @@ struct HaarFeature64

    __host__ NCVStatus setWeight(Ncv32f weight)
    {
-        ((Ncv32f*)&(this->_ui2.y))[0] = weight;
+        ((Ncv32f_a*)&(this->_ui2.y))[0] = weight;
        return NCV_SUCCESS;
    }

@ -102,7 +106,7 @@ struct HaarFeature64

    __device__ __host__ Ncv32f getWeight(void)
    {
-        return *(Ncv32f*)(&this->_ui2.y);
+        return *(Ncv32f_a*)(&this->_ui2.y);
    }
 };

@ -168,14 +172,13 @@ public:
    }
 };

-
 struct HaarClassifierNodeDescriptor32
 {
    uint1 _ui1;

    __host__ NCVStatus create(Ncv32f leafValue)
    {
-        *(Ncv32f *)&this->_ui1 = leafValue;
+        *(Ncv32f_a *)&this->_ui1 = leafValue;
        return NCV_SUCCESS;
    }

@ -187,7 +190,7 @@ struct HaarClassifierNodeDescriptor32

    __host__ Ncv32f getLeafValueHost(void)
    {
-        return *(Ncv32f *)&this->_ui1.x;
+        return *(Ncv32f_a *)&this->_ui1.x;
    }

 #ifdef __CUDACC__
@ -203,6 +206,11 @@ struct HaarClassifierNodeDescriptor32
    }
 };

+// Ncv32u_a is an Ncv32u that may alias other types; the setters below use it to
+// copy the raw bits of a float or node descriptor into an unsigned-int field.
+// The minor-version bound only applies to the GCC 3.x series; later majors
+// (4.0, 5.1, 10.2, ...) always qualify.
+#if defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 4))
+typedef Ncv32u __attribute__((__may_alias__)) Ncv32u_a;
+#else
+typedef Ncv32u Ncv32u_a;
+#endif

 struct HaarClassifierNode128
 {
@ -216,19 +224,19 @@ struct HaarClassifierNode128

    __host__ NCVStatus setThreshold(Ncv32f t)
    {
-        this->_ui4.y = *(Ncv32u *)&t;
+        this->_ui4.y = *(Ncv32u_a *)&t;
        return NCV_SUCCESS;
    }

    __host__ NCVStatus setLeftNodeDesc(HaarClassifierNodeDescriptor32 nl)
    {
-        this->_ui4.z = *(Ncv32u *)&nl;
+        this->_ui4.z = *(Ncv32u_a *)&nl;
        return NCV_SUCCESS;
    }

    __host__ NCVStatus setRightNodeDesc(HaarClassifierNodeDescriptor32 nr)
    {
-        this->_ui4.w = *(Ncv32u *)&nr;
+        this->_ui4.w = *(Ncv32u_a *)&nr;
        return NCV_SUCCESS;
    }

@ -239,7 +247,7 @@ struct HaarClassifierNode128

    __host__ __device__ Ncv32f getThreshold(void)
    {
-        return *(Ncv32f*)&this->_ui4.y;
+        return *(Ncv32f_a*)&this->_ui4.y;
    }

    __host__ __device__ HaarClassifierNodeDescriptor32 getLeftNodeDesc(void)
@ -264,7 +272,7 @@ struct HaarStage64

    __host__ NCVStatus setStageThreshold(Ncv32f t)
    {
-        this->_ui2.x = *(Ncv32u *)&t;
+        this->_ui2.x = *(Ncv32u_a *)&t;
        return NCV_SUCCESS;
    }

@ -290,7 +298,7 @@ struct HaarStage64

    __host__ __device__ Ncv32f getStageThreshold(void)
    {
-        return *(Ncv32f*)&this->_ui2.x;
+        return *(Ncv32f_a*)&this->_ui2.x;
    }

    __host__ __device__ Ncv32u getStartClassifierRootNodeOffset(void)
--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
@ -1423,7 +1423,7 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
                    (d_hierSums.ptr() + partSumOffsets[i],
                     partSumNums[i], NULL,
                     d_hierSums.ptr() + partSumOffsets[i+1],
-                     NULL);
+                     0);
            }
            else
            {
@ -1433,7 +1433,7 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
                    (d_hierSums.ptr() + partSumOffsets[i],
                     partSumNums[i], NULL,
                     NULL,
-                     NULL);
+                     0);
            }

            ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
@ -1557,16 +1557,21 @@ NCVStatus nppsStCompact_32s(Ncv32s *d_src, Ncv32u srcLen,
 }


+#if defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)) // may_alias needs GCC >= 3.3; compare major first, then minor
+typedef Ncv32u __attribute__((__may_alias__)) Ncv32u_a; // Ncv32u allowed to alias other types, for reads such as *(Ncv32u_a *)&elemRemove
+#else
+typedef Ncv32u Ncv32u_a; // fallback: plain Ncv32u, no aliasing exemption
+#endif
+
 NCVStatus nppsStCompact_32f(Ncv32f *d_src, Ncv32u srcLen,
                            Ncv32f *d_dst, Ncv32u *p_dstLen,
                            Ncv32f elemRemove, Ncv8u *pBuffer,
                            Ncv32u bufSize, cudaDeviceProp &devProp)
 {
    return nppsStCompact_32u((Ncv32u *)d_src, srcLen, (Ncv32u *)d_dst, p_dstLen,
-                             *(Ncv32u *)&elemRemove, pBuffer, bufSize, devProp);
+                             *(Ncv32u_a *)&elemRemove, pBuffer, bufSize, devProp);
 }

-
 NCVStatus nppsStCompact_32u_host(Ncv32u *h_src, Ncv32u srcLen,
                                 Ncv32u *h_dst, Ncv32u *dstLen, Ncv32u elemRemove)
 {
@ -1602,17 +1607,16 @@ NCVStatus nppsStCompact_32u_host(Ncv32u *h_src, Ncv32u srcLen,
 NCVStatus nppsStCompact_32s_host(Ncv32s *h_src, Ncv32u srcLen,
                                 Ncv32s *h_dst, Ncv32u *dstLen, Ncv32s elemRemove)
 {
-    return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u *)&elemRemove);
+    return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u_a *)&elemRemove);
 }


 NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
                                 Ncv32f *h_dst, Ncv32u *dstLen, Ncv32f elemRemove)
 {
-    return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u *)&elemRemove);
+    return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u_a *)&elemRemove);
 }

-
 //==============================================================================
 //
 // Filter.cu
@ -2066,7 +2070,7 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState)
 //==============================================================================


-#if __CUDA_ARCH__ < 200
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)

 // FP32 atomic add
 static __forceinline__ __device__ float _atomicAdd(float *addr, float val)
--- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp
@ -51,11 +51,11 @@ namespace cv { namespace gpu { namespace device
    struct Emulation
    {

-        static __device__ __forceinline__ int sycthOr(int pred)
+        static __device__ __forceinline__ int syncthreadsOr(int pred)
        {
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
                // just campilation stab
-                return false;
+                return 0;
 #else
                return __syncthreads_or(pred);
 #endif
--- a/modules/gpu/src/split_merge.cpp
+++ b/modules/gpu/src/split_merge.cpp
@ -119,7 +119,6 @@ namespace

        int depth = src.depth();
        int num_channels = src.channels();
-        Size size = src.size();

        if (depth == CV_64F)
        {
--- a/modules/gpu/src/video_decoder.cpp
+++ b/modules/gpu/src/video_decoder.cpp
@ -49,36 +49,36 @@ void cv::gpu::detail::VideoDecoder::create(const VideoReader_GPU::FormatInfo& vi
 {
    release();

-    cudaVideoCodec codec = static_cast<cudaVideoCodec>(videoFormat.codec);
-    cudaVideoChromaFormat chromaFormat = static_cast<cudaVideoChromaFormat>(videoFormat.chromaFormat);
+    cudaVideoCodec _codec = static_cast<cudaVideoCodec>(videoFormat.codec);
+    cudaVideoChromaFormat _chromaFormat = static_cast<cudaVideoChromaFormat>(videoFormat.chromaFormat);

-    cudaVideoCreateFlags videoCreateFlags = (codec == cudaVideoCodec_JPEG || codec == cudaVideoCodec_MPEG2) ?
+    cudaVideoCreateFlags videoCreateFlags = (_codec == cudaVideoCodec_JPEG || _codec == cudaVideoCodec_MPEG2) ?
                                            cudaVideoCreate_PreferCUDA :
                                            cudaVideoCreate_PreferCUVID;

    // Validate video format.  These are the currently supported formats via NVCUVID
-    CV_Assert(cudaVideoCodec_MPEG1 == codec ||
-              cudaVideoCodec_MPEG2 == codec ||
-              cudaVideoCodec_MPEG4 == codec ||
-              cudaVideoCodec_VC1   == codec ||
-              cudaVideoCodec_H264  == codec ||
-              cudaVideoCodec_JPEG  == codec ||
-              cudaVideoCodec_YUV420== codec ||
-              cudaVideoCodec_YV12  == codec ||
-              cudaVideoCodec_NV12  == codec ||
-              cudaVideoCodec_YUYV  == codec ||
-              cudaVideoCodec_UYVY  == codec );
+    CV_Assert(cudaVideoCodec_MPEG1 == _codec ||
+              cudaVideoCodec_MPEG2 == _codec ||
+              cudaVideoCodec_MPEG4 == _codec ||
+              cudaVideoCodec_VC1   == _codec ||
+              cudaVideoCodec_H264  == _codec ||
+              cudaVideoCodec_JPEG  == _codec ||
+              cudaVideoCodec_YUV420== _codec ||
+              cudaVideoCodec_YV12  == _codec ||
+              cudaVideoCodec_NV12  == _codec ||
+              cudaVideoCodec_YUYV  == _codec ||
+              cudaVideoCodec_UYVY  == _codec );

-    CV_Assert(cudaVideoChromaFormat_Monochrome == chromaFormat ||
-              cudaVideoChromaFormat_420        == chromaFormat ||
-              cudaVideoChromaFormat_422        == chromaFormat ||
-              cudaVideoChromaFormat_444        == chromaFormat);
+    CV_Assert(cudaVideoChromaFormat_Monochrome == _chromaFormat ||
+              cudaVideoChromaFormat_420        == _chromaFormat ||
+              cudaVideoChromaFormat_422        == _chromaFormat ||
+              cudaVideoChromaFormat_444        == _chromaFormat);

    // Fill the decoder-create-info struct from the given video-format struct.
    std::memset(&createInfo_, 0, sizeof(CUVIDDECODECREATEINFO));

    // Create video decoder
-    createInfo_.CodecType           = codec;
+    createInfo_.CodecType           = _codec;
    createInfo_.ulWidth             = videoFormat.width;
    createInfo_.ulHeight            = videoFormat.height;
    createInfo_.ulNumDecodeSurfaces = FrameQueue::MaximumSize;
@ -87,7 +87,7 @@ void cv::gpu::detail::VideoDecoder::create(const VideoReader_GPU::FormatInfo& vi
    while (createInfo_.ulNumDecodeSurfaces * videoFormat.width * videoFormat.height > 16 * 1024 * 1024)
        createInfo_.ulNumDecodeSurfaces--;

-    createInfo_.ChromaFormat    = chromaFormat;
+    createInfo_.ChromaFormat    = _chromaFormat;
    createInfo_.OutputFormat    = cudaVideoSurfaceFormat_NV12;
    createInfo_.DeinterlaceMode = cudaVideoDeinterlaceMode_Adaptive;

--- a/modules/gpu/test/main.cpp
+++ b/modules/gpu/test/main.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"

 #ifdef HAVE_CUDA

@ -49,87 +49,128 @@ using namespace cv::gpu;
 using namespace cvtest;
 using namespace testing;

-void print_info()
+void printOsInfo()
 {
-    printf("\n");
 #if defined _WIN32
 #   if defined _WIN64
-        puts("OS: Windows 64");
+        cout << "OS: Windows x64 \n" << endl;
 #   else
-        puts("OS: Windows 32");
+        cout << "OS: Windows x32 \n" << endl;
 #   endif
 #elif defined linux
 #   if defined _LP64
-        puts("OS: Linux 64");
+        cout << "OS: Linux x64 \n" << endl;
 #   else
-        puts("OS: Linux 32");
+        cout << "OS: Linux x32 \n" << endl;
 #   endif
 #elif defined __APPLE__
 #   if defined _LP64
-        puts("OS: Apple 64");
+        cout << "OS: Apple x64 \n" << endl;
 #   else
-        puts("OS: Apple 32");
+        cout << "OS: Apple x32 \n" << endl;
 #   endif
 #endif
+}

-    int deviceCount = getCudaEnabledDeviceCount();
+void printCudaInfo()
+{
+#ifndef HAVE_CUDA
+    cout << "OpenCV was built without CUDA support \n" << endl;
+#else
    int driver;
    cudaDriverGetVersion(&driver);

-    printf("CUDA Driver  version: %d\n", driver);
-    printf("CUDA Runtime version: %d\n", CUDART_VERSION);
-    printf("CUDA device count: %d\n\n", deviceCount);
+    cout << "CUDA Driver  version: " << driver << '\n';
+    cout << "CUDA Runtime version: " << CUDART_VERSION << '\n';
+
+    cout << endl;
+
+    cout << "GPU module was compiled for the following GPU archs:" << endl;
+    cout << "    BIN: " << CUDA_ARCH_BIN << '\n';
+    cout << "    PTX: " << CUDA_ARCH_PTX << '\n';
+
+    cout << endl;
+
+    int deviceCount = getCudaEnabledDeviceCount();
+    cout << "CUDA device count: " << deviceCount << '\n';
+
+    cout << endl;

    for (int i = 0; i < deviceCount; ++i)
    {
        DeviceInfo info(i);

-        printf("Device %d:\n", i);
-        printf("    Name: %s\n", info.name().c_str());
-        printf("    Compute capability version: %d.%d\n", info.majorVersion(), info.minorVersion());
-        printf("    Total memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0));
-        printf("    Free  memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0));
-        if (info.isCompatible())
-            puts("    This device is compatible with current GPU module build\n");
-        else
-            puts("    This device is NOT compatible with current GPU module build\n");
+        cout << "Device [" << i << "] \n";
+        cout << "\t Name: " << info.name() << '\n';
+        cout << "\t Compute capability: " << info.majorVersion() << '.' << info.minorVersion()<< '\n';
+        cout << "\t Multi Processor Count: " << info.multiProcessorCount() << '\n';
+        cout << "\t Total memory: " << static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0) << " Mb \n";
+        cout << "\t Free  memory: " << static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0) << " Mb \n";
+        if (!info.isCompatible())
+            cout << "\t !!! This device is NOT compatible with current GPU module build \n";
+
+        cout << endl;
    }
-
-    puts("GPU module was compiled for the following GPU archs:");
-    printf("    BIN: %s\n", CUDA_ARCH_BIN);
-    printf("    PTX: %s\n\n", CUDA_ARCH_PTX);
+#endif
 }

-enum OutputLevel
-{
-    OutputLevelNone,
-    OutputLevelCompact,
-    OutputLevelFull
-};
-
-extern OutputLevel nvidiaTestOutputLevel;
-
 int main(int argc, char** argv)
 {
-    TS::ptr()->init("gpu");
-    InitGoogleTest(&argc, argv);
+    try
+    {
+        CommandLineParser cmd(argc, (const char**)argv,
+            "{ print_info_only | print_info_only | false | Print information about system and exit }"
+            "{ device | device | -1 | Device on which tests will be executed (-1 means all devices) }"
+            "{ nvtest_output_level | nvtest_output_level | compact | NVidia test verbosity level }"
+        );

-    const char* keys ="{ nvtest_output_level | nvtest_output_level | compact | NVidia test verbosity level }";
+        printOsInfo();
+        printCudaInfo();

-    CommandLineParser parser(argc, (const char**)argv, keys);
+        if (cmd.get<bool>("print_info_only"))
+            return 0;

-    string outputLevel = parser.get<string>("nvtest_output_level", "none");
+        int device = cmd.get<int>("device");
+        if (device < 0)
+        {
+            DeviceManager::instance().loadAll();

-    if (outputLevel == "none")
-        nvidiaTestOutputLevel = OutputLevelNone;
-    else if (outputLevel == "compact")
-        nvidiaTestOutputLevel = OutputLevelCompact;
-    else if (outputLevel == "full")
-        nvidiaTestOutputLevel = OutputLevelFull;
+            cout << "Run tests on all supported devices \n" << endl;
+        }
+        else
+        {
+            DeviceManager::instance().load(device);

-    print_info();
+            DeviceInfo info(device);
+            cout << "Run tests on device " << device << " [" << info.name() << "] \n" << endl;
+        }

-    return RUN_ALL_TESTS();
+        string outputLevel = cmd.get<string>("nvtest_output_level");
+
+        if (outputLevel == "none")
+            nvidiaTestOutputLevel = OutputLevelNone;
+        else if (outputLevel == "compact")
+            nvidiaTestOutputLevel = OutputLevelCompact;
+        else if (outputLevel == "full")
+            nvidiaTestOutputLevel = OutputLevelFull;
+
+        TS::ptr()->init("gpu");
+        InitGoogleTest(&argc, argv);
+
+        return RUN_ALL_TESTS();
+    }
+    catch (const exception& e)
+    {
+        cerr << e.what() << endl;
+        return -1;
+    }
+    catch (...)
+    {
+        cerr << "Unknown error" << endl;
+        return -1;
+    }
+
+    return 0;
 }

 #else // HAVE_CUDA
--- a/modules/gpu/test/main_test_nvidia.h
+++ b/modules/gpu/test/main_test_nvidia.h
@ -1,7 +1,7 @@
 #ifndef __main_test_nvidia_h__
 #define __main_test_nvidia_h__

-#include<string>
+#include <string>

 enum OutputLevel
 {
@ -10,6 +10,8 @@ enum OutputLevel
    OutputLevelFull
 };

+extern OutputLevel nvidiaTestOutputLevel;
+
 bool nvidia_NPPST_Integral_Image(const std::string& test_data_path, OutputLevel outputLevel);
 bool nvidia_NPPST_Squared_Integral_Image(const std::string& test_data_path, OutputLevel outputLevel);
 bool nvidia_NPPST_RectStdDev(const std::string& test_data_path, OutputLevel outputLevel);
--- a/modules/gpu/test/nvidia/TestHaarCascadeApplication.cpp
+++ b/modules/gpu/test/nvidia/TestHaarCascadeApplication.cpp
@ -245,8 +245,8 @@ bool TestHaarCascadeApplication::process()

    int devId;
    ncvAssertCUDAReturn(cudaGetDevice(&devId), false);
-    cudaDeviceProp devProp;
-    ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), false);
+    cudaDeviceProp _devProp;
+    ncvAssertCUDAReturn(cudaGetDeviceProperties(&_devProp, devId), false);

    ncvStat = ncvApplyHaarClassifierCascade_device(
        d_integralImage, d_rectStdDev, d_pixelMask,
@ -254,7 +254,7 @@ bool TestHaarCascadeApplication::process()
        haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,
        searchRoiU, 1, 1.0f,
        *this->allocatorGPU.get(), *this->allocatorCPU.get(),
-        devProp, 0);
+        _devProp, 0);
    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);

    NCVMatrixAlloc<Ncv32u> h_pixelMask_d(*this->allocatorCPU.get(), this->width, this->height);
--- a/modules/gpu/test/nvidia/main_nvidia.cpp
+++ b/modules/gpu/test/nvidia/main_nvidia.cpp
@ -1,4 +1,6 @@
-#pragma warning (disable : 4408 4201 4100)
+#if defined _MSC_VER && _MSC_VER >= 1200
+# pragma warning (disable : 4408 4201 4100)
+#endif

 #include <cstdio>

--- a/modules/gpu/test/test_calib3d.cpp
+++ b/modules/gpu/test/test_calib3d.cpp
@ -39,7 +39,9 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 namespace {

@ -329,7 +331,7 @@ TEST_P(ReprojectImageTo3D, Accuracy)

    cv::gpu::GpuMat dst;
    cv::gpu::reprojectImageTo3D(loadMat(disp, useRoi), dst, Q, 3);
-    
+
    cv::Mat dst_gold;
    cv::reprojectImageTo3D(disp, dst_gold, Q, false);

@ -343,3 +345,5 @@ INSTANTIATE_TEST_CASE_P(GPU_Calib3D, ReprojectImageTo3D, testing::Combine(
    WHOLE_SUBMAT));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_color.cpp
+++ b/modules/gpu/test/test_color.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/test_copy_make_border.cpp
+++ b/modules/gpu/test/test_copy_make_border.cpp
@ -39,7 +39,9 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 namespace {

@ -98,3 +100,5 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
    WHOLE_SUBMAT));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@ -39,7 +39,9 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 namespace {

@ -3396,3 +3398,5 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
    WHOLE_SUBMAT));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
@ -39,7 +39,9 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 namespace {

@ -984,3 +986,5 @@ INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine(
    testing::Values(UseMask(false), UseMask(true))));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_filters.cpp
+++ b/modules/gpu/test/test_filters.cpp
@ -39,7 +39,9 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 namespace {

@ -552,3 +554,5 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
    WHOLE_SUBMAT));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/Show More
+++ b/Show More