diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd2095db86..71d714b9b0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -194,6 +194,7 @@ OCV_OPTION(BUILD_WITH_STATIC_CRT    "Enables use of staticaly linked CRT for sta
 OCV_OPTION(BUILD_FAT_JAVA_LIB       "Create fat java wrapper containing the whole OpenCV library" ON IF NOT BUILD_SHARED_LIBS AND CMAKE_COMPILER_IS_GNUCXX )
 OCV_OPTION(BUILD_ANDROID_SERVICE    "Build OpenCV Manager for Google Play" OFF IF ANDROID AND ANDROID_SOURCE_TREE )
 OCV_OPTION(BUILD_ANDROID_PACKAGE    "Build platform-specific package for Google Play" OFF IF ANDROID )
+OCV_OPTION(BUILD_TINY_GPU_MODULE    "Build tiny gpu module with limited image format support" OFF )
 
 # 3rd party libs
 OCV_OPTION(BUILD_ZLIB               "Build zlib from source"             WIN32 OR APPLE )
@@ -996,6 +997,7 @@ if(HAVE_CUDA)
   status("    NVIDIA GPU arch:"      ${OPENCV_CUDA_ARCH_BIN})
   status("    NVIDIA PTX archs:"     ${OPENCV_CUDA_ARCH_PTX})
   status("    Use fast math:"        CUDA_FAST_MATH THEN YES ELSE NO)
+  status("    Tiny gpu module:"      BUILD_TINY_GPU_MODULE THEN YES ELSE NO)
 endif()
 
 if(HAVE_OPENCL)
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index 6a93d008ee..a69ac8c7e6 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -63,6 +63,10 @@ if(OPENCV_CAN_BREAK_BINARY_COMPATIBILITY)
   add_definitions(-DOPENCV_CAN_BREAK_BINARY_COMPATIBILITY)
 endif()
 
+if(BUILD_TINY_GPU_MODULE)  # option declared in the top-level CMakeLists.txt, OFF by default
+  add_definitions(-DOPENCV_TINY_GPU_MODULE)  # macro tested with #ifdef in the gpu sources and perf tests
+endif()
+
 if(CMAKE_COMPILER_IS_GNUCXX)
   # High level of warnings.
   add_extra_compiler_option(-W)
diff --git a/modules/gpu/perf/perf_core.cpp b/modules/gpu/perf/perf_core.cpp
index ae6ed865b1..87e22c4695 100644
--- a/modules/gpu/perf/perf_core.cpp
+++ b/modules/gpu/perf/perf_core.cpp
@@ -46,7 +46,11 @@ using namespace std;
 using namespace testing;
 using namespace perf;
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: depth list shrinks to CV_8U and CV_32F
+#define ARITHM_MAT_DEPTH Values(CV_8U, CV_32F)
+#else // full build: CV_8U, CV_16U, CV_32F and CV_64F
 #define ARITHM_MAT_DEPTH Values(CV_8U, CV_16U, CV_32F, CV_64F)
+#endif // OPENCV_TINY_GPU_MODULE
 
 //////////////////////////////////////////////////////////////////////
 // Merge
@@ -524,9 +528,17 @@ PERF_TEST_P(Sz_Depth, Core_AbsDiffScalar,
 //////////////////////////////////////////////////////////////////////
 // Abs
 
-PERF_TEST_P(Sz_Depth, Core_Abs,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_16S, CV_32F)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_32F input only (CV_16S dropped)
+PERF_TEST_P(Sz_Depth, Core_Abs, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(MatDepth(CV_32F))
+))
+#else // full build: CV_16S and CV_32F
+PERF_TEST_P(Sz_Depth, Core_Abs, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_16S, CV_32F)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -552,9 +564,17 @@ PERF_TEST_P(Sz_Depth, Core_Abs,
 //////////////////////////////////////////////////////////////////////
 // Sqr
 
-PERF_TEST_P(Sz_Depth, Core_Sqr,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16S, CV_32F)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_32F input only
+PERF_TEST_P(Sz_Depth, Core_Sqr, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(MatDepth(CV_32F))
+))
+#else // full build: CV_8U, CV_16S and CV_32F
+PERF_TEST_P(Sz_Depth, Core_Sqr, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16S, CV_32F)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -580,9 +600,17 @@ PERF_TEST_P(Sz_Depth, Core_Sqr,
 //////////////////////////////////////////////////////////////////////
 // Sqrt
 
-PERF_TEST_P(Sz_Depth, Core_Sqrt,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16S, CV_32F)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_32F input only
+PERF_TEST_P(Sz_Depth, Core_Sqrt, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(MatDepth(CV_32F))
+))
+#else // full build: CV_8U, CV_16S and CV_32F
+PERF_TEST_P(Sz_Depth, Core_Sqrt, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16S, CV_32F)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -612,9 +640,17 @@ PERF_TEST_P(Sz_Depth, Core_Sqrt,
 //////////////////////////////////////////////////////////////////////
 // Log
 
-PERF_TEST_P(Sz_Depth, Core_Log,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16S, CV_32F)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_32F input only
+PERF_TEST_P(Sz_Depth, Core_Log, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(MatDepth(CV_32F))
+))
+#else // full build: CV_8U, CV_16S and CV_32F
+PERF_TEST_P(Sz_Depth, Core_Log, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16S, CV_32F)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -644,9 +680,17 @@ PERF_TEST_P(Sz_Depth, Core_Log,
 //////////////////////////////////////////////////////////////////////
 // Exp
 
-PERF_TEST_P(Sz_Depth, Core_Exp,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16S, CV_32F)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_32F input only
+PERF_TEST_P(Sz_Depth, Core_Exp, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(MatDepth(CV_32F))
+))
+#else // full build: CV_8U, CV_16S and CV_32F
+PERF_TEST_P(Sz_Depth, Core_Exp, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16S, CV_32F)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -678,10 +722,19 @@ PERF_TEST_P(Sz_Depth, Core_Exp,
 
 DEF_PARAM_TEST(Sz_Depth_Power, cv::Size, MatDepth, double);
 
-PERF_TEST_P(Sz_Depth_Power, Core_Pow,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16S, CV_32F),
-                    Values(0.3, 2.0, 2.4)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_32F input only; the exponent list is the same in both branches
+PERF_TEST_P(Sz_Depth_Power, Core_Pow, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(MatDepth(CV_32F)),
+    Values(0.3, 2.0, 2.4)
+))
+#else // full build: CV_8U, CV_16S and CV_32F
+PERF_TEST_P(Sz_Depth_Power, Core_Pow, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16S, CV_32F),
+    Values(0.3, 2.0, 2.4)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -859,10 +912,19 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseAndMat,
 //////////////////////////////////////////////////////////////////////
 // BitwiseAndScalar
 
-PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseAndScalar,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32S),
-                    GPU_CHANNELS_1_3_4))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U with MatCn(Gray) only
+PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseAndScalar, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(MatDepth(CV_8U)),
+    testing::Values(MatCn(Gray))
+))
+#else // full build: CV_8U, CV_16U and CV_32S over GPU_CHANNELS_1_3_4
+PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseAndScalar, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32S),
+    GPU_CHANNELS_1_3_4
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -935,10 +997,19 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseOrMat,
 //////////////////////////////////////////////////////////////////////
 // BitwiseOrScalar
 
-PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseOrScalar,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32S),
-                    GPU_CHANNELS_1_3_4))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U with MatCn(Gray) only
+PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseOrScalar, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(MatDepth(CV_8U)),
+    testing::Values(MatCn(Gray))
+))
+#else // full build: CV_8U, CV_16U and CV_32S over GPU_CHANNELS_1_3_4
+PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseOrScalar, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32S),
+    GPU_CHANNELS_1_3_4
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -1011,10 +1082,19 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseXorMat,
 //////////////////////////////////////////////////////////////////////
 // BitwiseXorScalar
 
-PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseXorScalar,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32S),
-                    GPU_CHANNELS_1_3_4))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U with MatCn(Gray) only
+PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseXorScalar, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(MatDepth(CV_8U)),
+    testing::Values(MatCn(Gray))
+))
+#else // full build: CV_8U, CV_16U and CV_32S over GPU_CHANNELS_1_3_4
+PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseXorScalar, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32S),
+    GPU_CHANNELS_1_3_4
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -1155,9 +1235,17 @@ PERF_TEST_P(Sz_Depth, Core_MinMat,
 //////////////////////////////////////////////////////////////////////
 // MinScalar
 
-PERF_TEST_P(Sz_Depth, Core_MinScalar,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F (CV_16U dropped)
+PERF_TEST_P(Sz_Depth, Core_MinScalar, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F)
+))
+#else // full build: CV_8U, CV_16U and CV_32F
+PERF_TEST_P(Sz_Depth, Core_MinScalar, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -1226,9 +1314,17 @@ PERF_TEST_P(Sz_Depth, Core_MaxMat,
 //////////////////////////////////////////////////////////////////////
 // MaxScalar
 
-PERF_TEST_P(Sz_Depth, Core_MaxScalar,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F (CV_16U dropped)
+PERF_TEST_P(Sz_Depth, Core_MaxScalar, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F)
+))
+#else // full build: CV_8U, CV_16U and CV_32F
+PERF_TEST_P(Sz_Depth, Core_MaxScalar, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -1263,11 +1359,21 @@ PERF_TEST_P(Sz_Depth, Core_MaxScalar,
 
 DEF_PARAM_TEST(Sz_3Depth, cv::Size, MatDepth, MatDepth, MatDepth);
 
-PERF_TEST_P(Sz_3Depth, Core_AddWeighted,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: all three depth parameters pinned to CV_32F
+PERF_TEST_P(Sz_3Depth, Core_AddWeighted, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(MatDepth(CV_32F)),
+    Values(MatDepth(CV_32F)),
+    Values(MatDepth(CV_32F))
+))
+#else // full build: each of the three depth parameters ranges over CV_8U, CV_16U, CV_32F, CV_64F
+PERF_TEST_P(Sz_3Depth, Core_AddWeighted, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+    Values(CV_8U, CV_16U, CV_32F, CV_64F)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth1 = GET_PARAM(1);
@@ -1782,10 +1888,19 @@ PERF_TEST_P(Sz, Core_MeanStdDev,
 
 DEF_PARAM_TEST(Sz_Depth_Norm, cv::Size, MatDepth, NormType);
 
-PERF_TEST_P(Sz_Depth_Norm, Core_Norm,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32S, CV_32F),
-                    Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F (CV_16U, CV_32S dropped); norm types are the same in both branches
+PERF_TEST_P(Sz_Depth_Norm, Core_Norm, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2))
+))
+#else // full build: CV_8U, CV_16U, CV_32S and CV_32F
+PERF_TEST_P(Sz_Depth_Norm, Core_Norm, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32S, CV_32F),
+    Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2))
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -1859,10 +1974,19 @@ PERF_TEST_P(Sz_Norm, Core_NormDiff,
 //////////////////////////////////////////////////////////////////////
 // Sum
 
-PERF_TEST_P(Sz_Depth_Cn, Core_Sum,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F with MatCn(Gray) only
+PERF_TEST_P(Sz_Depth_Cn, Core_Sum, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    testing::Values(MatCn(Gray))
+))
+#else // full build: CV_8U, CV_16U and CV_32F over GPU_CHANNELS_1_3_4
+PERF_TEST_P(Sz_Depth_Cn, Core_Sum, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F),
+    GPU_CHANNELS_1_3_4
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -1896,10 +2020,19 @@ PERF_TEST_P(Sz_Depth_Cn, Core_Sum,
 //////////////////////////////////////////////////////////////////////
 // SumAbs
 
-PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F with MatCn(Gray) only
+PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    testing::Values(MatCn(Gray))
+))
+#else // full build: CV_8U, CV_16U and CV_32F over GPU_CHANNELS_1_3_4
+PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F),
+    GPU_CHANNELS_1_3_4
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -1929,10 +2062,19 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs,
 //////////////////////////////////////////////////////////////////////
 // SumSqr
 
-PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values<MatDepth>(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F with MatCn(Gray) only
+PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    testing::Values(MatCn(Gray))
+))
+#else // full build: CV_8U, CV_16U and CV_32F over GPU_CHANNELS_1_3_4
+PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F),
+    GPU_CHANNELS_1_3_4
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -1962,9 +2104,17 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr,
 //////////////////////////////////////////////////////////////////////
 // MinMax
 
-PERF_TEST_P(Sz_Depth, Core_MinMax,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F (CV_16U, CV_64F dropped)
+PERF_TEST_P(Sz_Depth, Core_MinMax, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F)
+))
+#else // full build: CV_8U, CV_16U, CV_32F and CV_64F
+PERF_TEST_P(Sz_Depth, Core_MinMax, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F, CV_64F)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -2000,9 +2150,17 @@ PERF_TEST_P(Sz_Depth, Core_MinMax,
 //////////////////////////////////////////////////////////////////////
 // MinMaxLoc
 
-PERF_TEST_P(Sz_Depth, Core_MinMaxLoc,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F (CV_16U, CV_64F dropped)
+PERF_TEST_P(Sz_Depth, Core_MinMaxLoc, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F)
+))
+#else // full build: CV_8U, CV_16U, CV_32F and CV_64F
+PERF_TEST_P(Sz_Depth, Core_MinMaxLoc, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F, CV_64F)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -2040,9 +2198,17 @@ PERF_TEST_P(Sz_Depth, Core_MinMaxLoc,
 //////////////////////////////////////////////////////////////////////
 // CountNonZero
 
-PERF_TEST_P(Sz_Depth, Core_CountNonZero,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F (CV_16U, CV_64F dropped)
+PERF_TEST_P(Sz_Depth, Core_CountNonZero, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F)
+))
+#else // full build: CV_8U, CV_16U, CV_32F and CV_64F
+PERF_TEST_P(Sz_Depth, Core_CountNonZero, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F, CV_64F)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -2079,12 +2245,23 @@ CV_ENUM(ReduceDim, Rows, Cols)
 
 DEF_PARAM_TEST(Sz_Depth_Cn_Code_Dim, cv::Size, MatDepth, MatCn, ReduceCode, ReduceDim);
 
-PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_16S, CV_32F),
-                    Values(1, 2, 3, 4),
-                    ReduceCode::all(),
-                    ReduceDim::all()))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F (CV_16U, CV_16S dropped); channels, codes and dims are the same in both branches
+PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    Values(1, 2, 3, 4),
+    ReduceCode::all(),
+    ReduceDim::all()
+))
+#else // full build: CV_8U, CV_16U, CV_16S and CV_32F
+PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_16S, CV_32F),
+    Values(1, 2, 3, 4),
+    ReduceCode::all(),
+    ReduceDim::all()
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -2120,13 +2297,25 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce,
 
 DEF_PARAM_TEST(Sz_Depth_NormType, cv::Size, MatDepth, NormType);
 
-PERF_TEST_P(Sz_Depth_NormType, Core_Normalize,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F, CV_64F),
-                    Values(NormType(cv::NORM_INF),
-                           NormType(cv::NORM_L1),
-                           NormType(cv::NORM_L2),
-                           NormType(cv::NORM_MINMAX))))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F (CV_16U, CV_64F dropped); norm types are the same in both branches
+PERF_TEST_P(Sz_Depth_NormType, Core_Normalize, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    Values(NormType(cv::NORM_INF),
+           NormType(cv::NORM_L1),
+           NormType(cv::NORM_L2),
+           NormType(cv::NORM_MINMAX))
+))
+#else // full build: CV_8U, CV_16U, CV_32F and CV_64F
+PERF_TEST_P(Sz_Depth_NormType, Core_Normalize, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+    Values(NormType(cv::NORM_INF),
+           NormType(cv::NORM_L1),
+           NormType(cv::NORM_L2),
+           NormType(cv::NORM_MINMAX))
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int type = GET_PARAM(1);
diff --git a/modules/gpu/perf/perf_features2d.cpp b/modules/gpu/perf/perf_features2d.cpp
index 2b1ab58129..5a21acdff6 100644
--- a/modules/gpu/perf/perf_features2d.cpp
+++ b/modules/gpu/perf/perf_features2d.cpp
@@ -145,9 +145,17 @@ PERF_TEST_P(Image_NFeatures, Features2D_ORB,
 
 DEF_PARAM_TEST(DescSize_Norm, int, NormType);
 
-PERF_TEST_P(DescSize_Norm, Features2D_BFMatch,
-            Combine(Values(64, 128, 256),
-                    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: NORM_L2 and NORM_HAMMING only (NORM_L1 dropped)
+PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(
+    Values(64, 128, 256),
+    Values(NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))
+))
+#else // full build: NORM_L1, NORM_L2 and NORM_HAMMING
+PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(
+    Values(64, 128, 256),
+    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     declare.time(20.0);
 
@@ -202,10 +210,19 @@ static void toOneRowMatches(const std::vector< std::vector<cv::DMatch> >& src, s
 
 DEF_PARAM_TEST(DescSize_K_Norm, int, int, NormType);
 
-PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch,
-            Combine(Values(64, 128, 256),
-                    Values(2, 3),
-                    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: NORM_L2 only (NORM_L1 dropped)
+PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine(
+    Values(64, 128, 256),
+    Values(2, 3),
+    Values(NormType(cv::NORM_L2))
+))
+#else // full build: NORM_L1 and NORM_L2
+PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine(
+    Values(64, 128, 256),
+    Values(2, 3),
+    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     declare.time(30.0);
 
@@ -257,9 +274,17 @@ PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch,
 //////////////////////////////////////////////////////////////////////
 // BFRadiusMatch
 
-PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch,
-            Combine(Values(64, 128, 256),
-                    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: NORM_L2 only (NORM_L1 dropped)
+PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(
+    Values(64, 128, 256),
+    Values(NormType(cv::NORM_L2))
+))
+#else // full build: NORM_L1 and NORM_L2
+PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(
+    Values(64, 128, 256),
+    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     declare.time(30.0);
 
diff --git a/modules/gpu/perf/perf_filters.cpp b/modules/gpu/perf/perf_filters.cpp
index adfc294f6d..f064dd395e 100644
--- a/modules/gpu/perf/perf_filters.cpp
+++ b/modules/gpu/perf/perf_filters.cpp
@@ -87,7 +87,19 @@ PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur,
 //////////////////////////////////////////////////////////////////////
 // Sobel
 
-PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: kernel sizes 3, 5, 7 only; the type list is the same in both branches
+PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8UC1, CV_8UC4, CV_32FC1),
+    Values(3, 5, 7)
+))
+#else // full build: odd kernel sizes from 3 through 15
+PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8UC1, CV_8UC4, CV_32FC1),
+    Values(3, 5, 7, 9, 11, 13, 15)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     declare.time(20.0);
 
@@ -154,7 +166,19 @@ PERF_TEST_P(Sz_Type, Filters_Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
 //////////////////////////////////////////////////////////////////////
 // GaussianBlur
 
-PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: kernel sizes 3, 5, 7 only; the type list is the same in both branches
+PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8UC1, CV_8UC4, CV_32FC1),
+    Values(3, 5, 7)
+))
+#else // full build: odd kernel sizes from 3 through 15
+PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8UC1, CV_8UC4, CV_32FC1),
+    Values(3, 5, 7, 9, 11, 13, 15)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     declare.time(20.0);
 
diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp
index c7c1022941..f2762e07c0 100644
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -91,13 +91,25 @@ void generateMap(cv::Mat& map_x, cv::Mat& map_y, int remapMode)
 
 DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Border_Mode, cv::Size, MatDepth, MatCn, Interpolation, BorderMode, RemapMode);
 
-PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
-                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-                    ALL_BORDER_MODES,
-                    RemapMode::all()))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: drops CV_16U and INTER_CUBIC; channels, borders and remap modes are the same in both branches
+PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    GPU_CHANNELS_1_3_4,
+    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
+    ALL_BORDER_MODES,
+    RemapMode::all()
+))
+#else // full build: CV_8U, CV_16U, CV_32F with nearest, linear and cubic interpolation
+PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F),
+    GPU_CHANNELS_1_3_4,
+    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+    ALL_BORDER_MODES,
+    RemapMode::all()
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     declare.time(20.0);
 
@@ -143,12 +155,23 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap,
 
 DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Scale, cv::Size, MatDepth, MatCn, Interpolation, double);
 
-PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
-                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-                    Values(0.5, 0.3, 2.0)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: drops CV_16U and INTER_CUBIC; channels and scale factors are the same in both branches
+PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    GPU_CHANNELS_1_3_4,
+    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
+    Values(0.5, 0.3, 2.0)
+))
+#else // full build: CV_8U, CV_16U, CV_32F with nearest, linear and cubic interpolation
+PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F),
+    GPU_CHANNELS_1_3_4,
+    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+    Values(0.5, 0.3, 2.0)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     declare.time(20.0);
 
@@ -187,11 +210,21 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize,
 
 DEF_PARAM_TEST(Sz_Depth_Cn_Scale, cv::Size, MatDepth, MatCn, double);
 
-PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
-                    Values(0.2, 0.1, 0.05)))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F (CV_16U dropped); channels and scale factors are the same in both branches
+PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    GPU_CHANNELS_1_3_4,
+    Values(0.2, 0.1, 0.05)
+))
+#else // full build: CV_8U, CV_16U and CV_32F
+PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F),
+    GPU_CHANNELS_1_3_4,
+    Values(0.2, 0.1, 0.05)
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     declare.time(1.0);
 
@@ -230,12 +263,23 @@ PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea,
 
 DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Border, cv::Size, MatDepth, MatCn, Interpolation, BorderMode);
 
-PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
-                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-                    ALL_BORDER_MODES))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: drops CV_16U and INTER_CUBIC; channels and border modes are the same in both branches
+PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    GPU_CHANNELS_1_3_4,
+    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
+    ALL_BORDER_MODES
+))
+#else // full build: CV_8U, CV_16U, CV_32F with nearest, linear and cubic interpolation
+PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F),
+    GPU_CHANNELS_1_3_4,
+    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+    ALL_BORDER_MODES
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     declare.time(20.0);
 
@@ -280,12 +324,23 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine,
 //////////////////////////////////////////////////////////////////////
 // WarpPerspective
 
-PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
-                    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-                    ALL_BORDER_MODES))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: drops CV_16U and INTER_CUBIC; channels and border modes are the same in both branches
+PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    GPU_CHANNELS_1_3_4,
+    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
+    ALL_BORDER_MODES
+))
+#else // full build: CV_8U, CV_16U, CV_32F with nearest, linear and cubic interpolation
+PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F),
+    GPU_CHANNELS_1_3_4,
+    Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
+    ALL_BORDER_MODES
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     declare.time(20.0);
 
@@ -330,11 +385,21 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective,
 
 DEF_PARAM_TEST(Sz_Depth_Cn_Border, cv::Size, MatDepth, MatCn, BorderMode);
 
-PERF_TEST_P(Sz_Depth_Cn_Border, ImgProc_CopyMakeBorder,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4,
-                    ALL_BORDER_MODES))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F (CV_16U dropped); channels and border modes are the same in both branches
+PERF_TEST_P(Sz_Depth_Cn_Border, ImgProc_CopyMakeBorder, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    GPU_CHANNELS_1_3_4,
+    ALL_BORDER_MODES
+))
+#else // full build: CV_8U, CV_16U and CV_32F
+PERF_TEST_P(Sz_Depth_Cn_Border, ImgProc_CopyMakeBorder, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F),
+    GPU_CHANNELS_1_3_4,
+    ALL_BORDER_MODES
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -372,10 +437,19 @@ CV_ENUM(ThreshOp, THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO,
 
 DEF_PARAM_TEST(Sz_Depth_Op, cv::Size, MatDepth, ThreshOp);
 
-PERF_TEST_P(Sz_Depth_Op, ImgProc_Threshold,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-            Values(CV_8U, CV_16U, CV_32F, CV_64F),
-            ThreshOp::all()))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F (CV_16U, CV_64F dropped); all threshold ops still run
+PERF_TEST_P(Sz_Depth_Op, ImgProc_Threshold, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    ThreshOp::all()
+))
+#else // full build: CV_8U, CV_16U, CV_32F and CV_64F
+PERF_TEST_P(Sz_Depth_Op, ImgProc_Threshold, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F, CV_64F),
+    ThreshOp::all()
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -672,10 +746,19 @@ PERF_TEST_P(Sz, ImgProc_ColumnSum,
 
 DEF_PARAM_TEST(Image_AppertureSz_L2gradient, string, int, bool);
 
-PERF_TEST_P(Image_AppertureSz_L2gradient, ImgProc_Canny,
-            Combine(Values("perf/800x600.png", "perf/1280x1024.png", "perf/1680x1050.png"),
-                    Values(3, 5),
-                    Bool()))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: second parameter limited to 3 (5 dropped); images and the bool flag are the same in both branches
+PERF_TEST_P(Image_AppertureSz_L2gradient, ImgProc_Canny, Combine(
+    Values("perf/800x600.png", "perf/1280x1024.png", "perf/1680x1050.png"),
+    Values(3),
+    Bool()
+))
+#else // full build: second parameter takes 3 and 5
+PERF_TEST_P(Image_AppertureSz_L2gradient, ImgProc_Canny, Combine(
+    Values("perf/800x600.png", "perf/1280x1024.png", "perf/1680x1050.png"),
+    Values(3, 5),
+    Bool()
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const string fileName = GET_PARAM(0);
     const int apperture_size = GET_PARAM(1);
@@ -1300,10 +1383,19 @@ PERF_TEST_P(Sz_Depth_Cn_Inter, ImgProc_Rotate,
 //////////////////////////////////////////////////////////////////////
 // PyrDown
 
-PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F (CV_16U dropped); channel list is the same in both branches
+PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    GPU_CHANNELS_1_3_4
+))
+#else // full build: CV_8U, CV_16U and CV_32F
+PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F),
+    GPU_CHANNELS_1_3_4
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
@@ -1336,10 +1428,19 @@ PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown,
 //////////////////////////////////////////////////////////////////////
 // PyrUp
 
-PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrUp,
-            Combine(GPU_TYPICAL_MAT_SIZES,
-                    Values(CV_8U, CV_16U, CV_32F),
-                    GPU_CHANNELS_1_3_4))
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CV_8U and CV_32F (CV_16U dropped); channel list is the same in both branches
+PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrUp, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_32F),
+    GPU_CHANNELS_1_3_4
+))
+#else // full build: CV_8U, CV_16U and CV_32F
+PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrUp, Combine(
+    GPU_TYPICAL_MAT_SIZES,
+    Values(CV_8U, CV_16U, CV_32F),
+    GPU_CHANNELS_1_3_4
+))
+#endif // OPENCV_TINY_GPU_MODULE
 {
     const cv::Size size = GET_PARAM(0);
     const int depth = GET_PARAM(1);
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index 5da22e156b..e39bce2270 100644
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -204,6 +204,26 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
                              const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                              cudaStream_t stream);
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only matchL2_gpu<float> and matchHamming_gpu<unsigned char> stay non-null
+    static const caller_t callers[3][6] =
+    {
+        {
+            0/*matchL1_gpu<unsigned char>*/, 0/*matchL1_gpu<signed char>*/,
+            0/*matchL1_gpu<unsigned short>*/, 0/*matchL1_gpu<short>*/,
+            0/*matchL1_gpu<int>*/, 0/*matchL1_gpu<float>*/
+        },
+        {
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
+        },
+        {
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            0/*matchHamming_gpu<unsigned short>*/, 0/*matchHamming_gpu<short>*/,
+            0/*matchHamming_gpu<int>*/, 0/*matchHamming_gpu<float>*/
+        }
+    };
+#else // !OPENCV_TINY_GPU_MODULE: full table
     static const caller_t callers[3][6] =
     {
         {
@@ -222,6 +242,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
             matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
         }
     };
+#endif // OPENCV_TINY_GPU_MODULE
 
     CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
     CV_Assert(train.cols == query.cols && train.type() == query.type());
@@ -334,6 +355,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
     GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
     const GpuMat& masks, Stream& stream)
 {
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: this branch only reports an error
+    (void)query; // the (void) casts are the only references to the parameters in this branch
+    (void)trainCollection;
+    (void)trainIdx;
+    (void)imgIdx;
+    (void)distance;
+    (void)masks;
+    (void)stream;
+    CV_Error(CV_StsNotImplemented, "not available in tiny build");
+#else // !OPENCV_TINY_GPU_MODULE
     if (query.empty() || trainCollection.empty())
         return;
 
@@ -374,6 +405,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
     CV_Assert(func != 0);
 
     func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
+#endif // OPENCV_TINY_GPU_MODULE
 }
 
 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
@@ -451,6 +483,26 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
                              const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                              cudaStream_t stream);
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only matchL2_gpu<float> and matchHamming_gpu<unsigned char> stay non-null
+    static const caller_t callers[3][6] =
+    {
+        {
+            0/*matchL1_gpu<unsigned char>*/, 0/*matchL1_gpu<signed char>*/,
+            0/*matchL1_gpu<unsigned short>*/, 0/*matchL1_gpu<short>*/,
+            0/*matchL1_gpu<int>*/, 0/*matchL1_gpu<float>*/
+        },
+        {
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
+        },
+        {
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            0/*matchHamming_gpu<unsigned short>*/, 0/*matchHamming_gpu<short>*/,
+            0/*matchHamming_gpu<int>*/, 0/*matchHamming_gpu<float>*/
+        }
+    };
+#else // !OPENCV_TINY_GPU_MODULE: full table
     static const caller_t callers[3][6] =
     {
         {
@@ -469,6 +521,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
             matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
         }
     };
+#endif // OPENCV_TINY_GPU_MODULE
 
     CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
     CV_Assert(train.type() == query.type() && train.cols == query.cols);
@@ -568,6 +621,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
     GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
     const GpuMat& maskCollection, Stream& stream)
 {
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: this branch only reports an error
+    (void)query; // the (void) casts are the only references to the parameters in this branch
+    (void)trainCollection;
+    (void)trainIdx;
+    (void)imgIdx;
+    (void)distance;
+    (void)maskCollection;
+    (void)stream;
+    CV_Error(CV_StsNotImplemented, "not available in tiny build");
+#else // !OPENCV_TINY_GPU_MODULE
     if (query.empty() || trainCollection.empty())
         return;
 
@@ -613,6 +676,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
     CV_Assert(func != 0);
 
     func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
+#endif // OPENCV_TINY_GPU_MODULE
 }
 
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
@@ -755,6 +819,26 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
                              const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
                              cudaStream_t stream);
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only matchL2_gpu<float> and matchHamming_gpu<unsigned char> stay non-null
+    static const caller_t callers[3][6] =
+    {
+        {
+            0/*matchL1_gpu<unsigned char>*/, 0/*matchL1_gpu<signed char>*/,
+            0/*matchL1_gpu<unsigned short>*/, 0/*matchL1_gpu<short>*/,
+            0/*matchL1_gpu<int>*/, 0/*matchL1_gpu<float>*/
+        },
+        {
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
+        },
+        {
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            0/*matchHamming_gpu<unsigned short>*/, 0/*matchHamming_gpu<short>*/,
+            0/*matchHamming_gpu<int>*/, 0/*matchHamming_gpu<float>*/
+        }
+    };
+#else // !OPENCV_TINY_GPU_MODULE: full table
     static const caller_t callers[3][6] =
     {
         {
@@ -773,6 +857,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
             matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
         }
     };
+#endif // OPENCV_TINY_GPU_MODULE
 
     const int nQuery = query.rows;
     const int nTrain = train.rows;
@@ -872,6 +957,17 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& query, const
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches,
     float maxDistance, const vector<GpuMat>& masks, Stream& stream)
 {
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: this branch only reports an error
+    (void)query; // the (void) casts are the only references to the parameters in this branch
+    (void)trainIdx;
+    (void)imgIdx;
+    (void)distance;
+    (void)nMatches;
+    (void)maxDistance;
+    (void)masks;
+    (void)stream;
+    CV_Error(CV_StsNotImplemented, "not available in tiny build");
+#else // !OPENCV_TINY_GPU_MODULE
     if (query.empty() || empty())
         return;
 
@@ -926,6 +1022,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
 
     func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
         trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
+#endif // OPENCV_TINY_GPU_MODULE
 }
 
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
diff --git a/modules/gpu/src/color.cpp b/modules/gpu/src/color.cpp
index 66a1ad6791..09b8be526f 100644
--- a/modules/gpu/src/color.cpp
+++ b/modules/gpu/src/color.cpp
@@ -71,6 +71,12 @@ namespace cv { namespace gpu {
 
 using namespace ::cv::gpu::device;
 
+#ifdef OPENCV_TINY_GPU_MODULE
+    #define APPEND_16U(func) 0 // tiny build: expands to 0 instead of a 16-bit function name
+#else
+    #define APPEND_16U(func) func ## _16u // pastes the _16u suffix onto func
+#endif // OPENCV_TINY_GPU_MODULE
+
 namespace
 {
     typedef void (*gpu_func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@@ -78,10 +84,11 @@ namespace
     void bgr_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
     {
         using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgr_to_rgb_8u, 0, bgr_to_rgb_16u, 0, 0, bgr_to_rgb_32f};
+        static const gpu_func_t funcs[] = {bgr_to_rgb_8u, 0, APPEND_16U(bgr_to_rgb), 0, 0, bgr_to_rgb_32f};
 
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3);
+        CV_Assert(funcs[src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
 
@@ -91,10 +98,11 @@ namespace
     void bgr_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
     {
         using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgr_to_bgra_8u, 0, bgr_to_bgra_16u, 0, 0, bgr_to_bgra_32f};
+        static const gpu_func_t funcs[] = {bgr_to_bgra_8u, 0, APPEND_16U(bgr_to_bgra), 0, 0, bgr_to_bgra_32f};
 
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3);
+        CV_Assert(funcs[src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
 
@@ -104,10 +112,11 @@ namespace
     void bgr_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
     {
         using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgr_to_rgba_8u, 0, bgr_to_rgba_16u, 0, 0, bgr_to_rgba_32f};
+        static const gpu_func_t funcs[] = {bgr_to_rgba_8u, 0, APPEND_16U(bgr_to_rgba), 0, 0, bgr_to_rgba_32f};
 
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3);
+        CV_Assert(funcs[src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
 
@@ -117,10 +126,11 @@ namespace
     void bgra_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
     {
         using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgra_to_bgr_8u, 0, bgra_to_bgr_16u, 0, 0, bgra_to_bgr_32f};
+        static const gpu_func_t funcs[] = {bgra_to_bgr_8u, 0, APPEND_16U(bgra_to_bgr), 0, 0, bgra_to_bgr_32f};
 
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 4);
+        CV_Assert(funcs[src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
 
@@ -130,10 +140,11 @@ namespace
     void bgra_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
     {
         using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgra_to_rgb_8u, 0, bgra_to_rgb_16u, 0, 0, bgra_to_rgb_32f};
+        static const gpu_func_t funcs[] = {bgra_to_rgb_8u, 0, APPEND_16U(bgra_to_rgb), 0, 0, bgra_to_rgb_32f};
 
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 4);
+        CV_Assert(funcs[src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
 
@@ -143,10 +154,11 @@ namespace
     void bgra_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
     {
         using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgra_to_rgba_8u, 0, bgra_to_rgba_16u, 0, 0, bgra_to_rgba_32f};
+        static const gpu_func_t funcs[] = {bgra_to_rgba_8u, 0, APPEND_16U(bgra_to_rgba), 0, 0, bgra_to_rgba_32f};
 
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 4);
+        CV_Assert(funcs[src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
 
@@ -316,10 +328,11 @@ namespace
     void gray_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
     {
         using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {gray_to_bgr_8u, 0, gray_to_bgr_16u, 0, 0, gray_to_bgr_32f};
+        static const gpu_func_t funcs[] = {gray_to_bgr_8u, 0, APPEND_16U(gray_to_bgr), 0, 0, gray_to_bgr_32f};
 
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 1);
+        CV_Assert(funcs[src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
 
@@ -329,10 +342,11 @@ namespace
     void gray_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
     {
         using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {gray_to_bgra_8u, 0, gray_to_bgra_16u, 0, 0, gray_to_bgra_32f};
+        static const gpu_func_t funcs[] = {gray_to_bgra_8u, 0, APPEND_16U(gray_to_bgra), 0, 0, gray_to_bgra_32f};
 
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 1);
+        CV_Assert(funcs[src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
 
@@ -382,10 +396,11 @@ namespace
     void rgb_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
     {
         using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {rgb_to_gray_8u, 0, rgb_to_gray_16u, 0, 0, rgb_to_gray_32f};
+        static const gpu_func_t funcs[] = {rgb_to_gray_8u, 0, APPEND_16U(rgb_to_gray), 0, 0, rgb_to_gray_32f};
 
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3);
+        CV_Assert(funcs[src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
 
@@ -395,10 +410,11 @@ namespace
     void bgr_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
     {
         using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgr_to_gray_8u, 0, bgr_to_gray_16u, 0, 0, bgr_to_gray_32f};
+        static const gpu_func_t funcs[] = {bgr_to_gray_8u, 0, APPEND_16U(bgr_to_gray), 0, 0, bgr_to_gray_32f};
 
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3);
+        CV_Assert(funcs[src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
 
@@ -408,10 +424,11 @@ namespace
     void rgba_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
     {
         using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {rgba_to_gray_8u, 0, rgba_to_gray_16u, 0, 0, rgba_to_gray_32f};
+        static const gpu_func_t funcs[] = {rgba_to_gray_8u, 0, APPEND_16U(rgba_to_gray), 0, 0, rgba_to_gray_32f};
 
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 4);
+        CV_Assert(funcs[src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
 
@@ -421,10 +438,11 @@ namespace
     void bgra_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
     {
         using namespace cv::gpu::device;
-        static const gpu_func_t funcs[] = {bgra_to_gray_8u, 0, bgra_to_gray_16u, 0, 0, bgra_to_gray_32f};
+        static const gpu_func_t funcs[] = {bgra_to_gray_8u, 0, APPEND_16U(bgra_to_gray), 0, 0, bgra_to_gray_32f};
 
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 4);
+        CV_Assert(funcs[src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
 
@@ -437,12 +455,12 @@ namespace
         static const gpu_func_t funcs[2][2][6] =
         {
             {
-                {rgb_to_yuv_8u, 0, rgb_to_yuv_16u, 0, 0, rgb_to_yuv_32f},
-                {rgba_to_yuv_8u, 0, rgba_to_yuv_16u, 0, 0, rgba_to_yuv_32f}
+                {rgb_to_yuv_8u, 0, APPEND_16U(rgb_to_yuv), 0, 0, rgb_to_yuv_32f},
+                {rgba_to_yuv_8u, 0, APPEND_16U(rgba_to_yuv), 0, 0, rgba_to_yuv_32f}
             },
             {
-                {rgb_to_yuv4_8u, 0, rgb_to_yuv4_16u, 0, 0, rgb_to_yuv4_32f},
-                {rgba_to_yuv4_8u, 0, rgba_to_yuv4_16u, 0, 0, rgba_to_yuv4_32f}
+                {rgb_to_yuv4_8u, 0, APPEND_16U(rgb_to_yuv4), 0, 0, rgb_to_yuv4_32f},
+                {rgba_to_yuv4_8u, 0, APPEND_16U(rgba_to_yuv4), 0, 0, rgba_to_yuv4_32f}
             }
         };
 
@@ -451,6 +469,7 @@ namespace
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3 || src.channels() == 4);
         CV_Assert(dcn == 3 || dcn == 4);
+        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
 
@@ -463,12 +482,12 @@ namespace
         static const gpu_func_t funcs[2][2][6] =
         {
             {
-                {bgr_to_yuv_8u, 0, bgr_to_yuv_16u, 0, 0, bgr_to_yuv_32f},
-                {bgra_to_yuv_8u, 0, bgra_to_yuv_16u, 0, 0, bgra_to_yuv_32f}
+                {bgr_to_yuv_8u, 0, APPEND_16U(bgr_to_yuv), 0, 0, bgr_to_yuv_32f},
+                {bgra_to_yuv_8u, 0, APPEND_16U(bgra_to_yuv), 0, 0, bgra_to_yuv_32f}
             },
             {
-                {bgr_to_yuv4_8u, 0, bgr_to_yuv4_16u, 0, 0, bgr_to_yuv4_32f},
-                {bgra_to_yuv4_8u, 0, bgra_to_yuv4_16u, 0, 0, bgra_to_yuv4_32f}
+                {bgr_to_yuv4_8u, 0, APPEND_16U(bgr_to_yuv4), 0, 0, bgr_to_yuv4_32f},
+                {bgra_to_yuv4_8u, 0, APPEND_16U(bgra_to_yuv4), 0, 0, bgra_to_yuv4_32f}
             }
         };
 
@@ -477,6 +496,7 @@ namespace
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3 || src.channels() == 4);
         CV_Assert(dcn == 3 || dcn == 4);
+        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
 
@@ -489,12 +509,12 @@ namespace
         static const gpu_func_t funcs[2][2][6] =
         {
             {
-                {yuv_to_rgb_8u, 0, yuv_to_rgb_16u, 0, 0, yuv_to_rgb_32f},
-                {yuv4_to_rgb_8u, 0, yuv4_to_rgb_16u, 0, 0, yuv4_to_rgb_32f}
+                {yuv_to_rgb_8u, 0, APPEND_16U(yuv_to_rgb), 0, 0, yuv_to_rgb_32f},
+                {yuv4_to_rgb_8u, 0, APPEND_16U(yuv4_to_rgb), 0, 0, yuv4_to_rgb_32f}
             },
             {
-                {yuv_to_rgba_8u, 0, yuv_to_rgba_16u, 0, 0, yuv_to_rgba_32f},
-                {yuv4_to_rgba_8u, 0, yuv4_to_rgba_16u, 0, 0, yuv4_to_rgba_32f}
+                {yuv_to_rgba_8u, 0, APPEND_16U(yuv_to_rgba), 0, 0, yuv_to_rgba_32f},
+                {yuv4_to_rgba_8u, 0, APPEND_16U(yuv4_to_rgba), 0, 0, yuv4_to_rgba_32f}
             }
         };
 
@@ -503,6 +523,7 @@ namespace
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3 || src.channels() == 4);
         CV_Assert(dcn == 3 || dcn == 4);
+        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
 
@@ -515,12 +536,12 @@ namespace
         static const gpu_func_t funcs[2][2][6] =
         {
             {
-                {yuv_to_bgr_8u, 0, yuv_to_bgr_16u, 0, 0, yuv_to_bgr_32f},
-                {yuv4_to_bgr_8u, 0, yuv4_to_bgr_16u, 0, 0, yuv4_to_bgr_32f}
+                {yuv_to_bgr_8u, 0, APPEND_16U(yuv_to_bgr), 0, 0, yuv_to_bgr_32f},
+                {yuv4_to_bgr_8u, 0, APPEND_16U(yuv4_to_bgr), 0, 0, yuv4_to_bgr_32f}
             },
             {
-                {yuv_to_bgra_8u, 0, yuv_to_bgra_16u, 0, 0, yuv_to_bgra_32f},
-                {yuv4_to_bgra_8u, 0, yuv4_to_bgra_16u, 0, 0, yuv4_to_bgra_32f}
+                {yuv_to_bgra_8u, 0, APPEND_16U(yuv_to_bgra), 0, 0, yuv_to_bgra_32f},
+                {yuv4_to_bgra_8u, 0, APPEND_16U(yuv4_to_bgra), 0, 0, yuv4_to_bgra_32f}
             }
         };
 
@@ -529,6 +550,7 @@ namespace
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3 || src.channels() == 4);
         CV_Assert(dcn == 3 || dcn == 4);
+        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
 
@@ -541,12 +563,12 @@ namespace
         static const gpu_func_t funcs[2][2][6] =
         {
             {
-                {rgb_to_YCrCb_8u, 0, rgb_to_YCrCb_16u, 0, 0, rgb_to_YCrCb_32f},
-                {rgba_to_YCrCb_8u, 0, rgba_to_YCrCb_16u, 0, 0, rgba_to_YCrCb_32f}
+                {rgb_to_YCrCb_8u, 0, APPEND_16U(rgb_to_YCrCb), 0, 0, rgb_to_YCrCb_32f},
+                {rgba_to_YCrCb_8u, 0, APPEND_16U(rgba_to_YCrCb), 0, 0, rgba_to_YCrCb_32f}
             },
             {
-                {rgb_to_YCrCb4_8u, 0, rgb_to_YCrCb4_16u, 0, 0, rgb_to_YCrCb4_32f},
-                {rgba_to_YCrCb4_8u, 0, rgba_to_YCrCb4_16u, 0, 0, rgba_to_YCrCb4_32f}
+                {rgb_to_YCrCb4_8u, 0, APPEND_16U(rgb_to_YCrCb4), 0, 0, rgb_to_YCrCb4_32f},
+                {rgba_to_YCrCb4_8u, 0, APPEND_16U(rgba_to_YCrCb4), 0, 0, rgba_to_YCrCb4_32f}
             }
         };
 
@@ -555,6 +577,7 @@ namespace
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3 || src.channels() == 4);
         CV_Assert(dcn == 3 || dcn == 4);
+        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
 
@@ -567,12 +590,12 @@ namespace
         static const gpu_func_t funcs[2][2][6] =
         {
             {
-                {bgr_to_YCrCb_8u, 0, bgr_to_YCrCb_16u, 0, 0, bgr_to_YCrCb_32f},
-                {bgra_to_YCrCb_8u, 0, bgra_to_YCrCb_16u, 0, 0, bgra_to_YCrCb_32f}
+                {bgr_to_YCrCb_8u, 0, APPEND_16U(bgr_to_YCrCb), 0, 0, bgr_to_YCrCb_32f},
+                {bgra_to_YCrCb_8u, 0, APPEND_16U(bgra_to_YCrCb), 0, 0, bgra_to_YCrCb_32f}
             },
             {
-                {bgr_to_YCrCb4_8u, 0, bgr_to_YCrCb4_16u, 0, 0, bgr_to_YCrCb4_32f},
-                {bgra_to_YCrCb4_8u, 0, bgra_to_YCrCb4_16u, 0, 0, bgra_to_YCrCb4_32f}
+                {bgr_to_YCrCb4_8u, 0, APPEND_16U(bgr_to_YCrCb4), 0, 0, bgr_to_YCrCb4_32f},
+                {bgra_to_YCrCb4_8u, 0, APPEND_16U(bgra_to_YCrCb4), 0, 0, bgra_to_YCrCb4_32f}
             }
         };
 
@@ -581,6 +604,7 @@ namespace
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3 || src.channels() == 4);
         CV_Assert(dcn == 3 || dcn == 4);
+        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
 
@@ -593,12 +617,12 @@ namespace
         static const gpu_func_t funcs[2][2][6] =
         {
             {
-                {YCrCb_to_rgb_8u, 0, YCrCb_to_rgb_16u, 0, 0, YCrCb_to_rgb_32f},
-                {YCrCb4_to_rgb_8u, 0, YCrCb4_to_rgb_16u, 0, 0, YCrCb4_to_rgb_32f}
+                {YCrCb_to_rgb_8u, 0, APPEND_16U(YCrCb_to_rgb), 0, 0, YCrCb_to_rgb_32f},
+                {YCrCb4_to_rgb_8u, 0, APPEND_16U(YCrCb4_to_rgb), 0, 0, YCrCb4_to_rgb_32f}
             },
             {
-                {YCrCb_to_rgba_8u, 0, YCrCb_to_rgba_16u, 0, 0, YCrCb_to_rgba_32f},
-                {YCrCb4_to_rgba_8u, 0, YCrCb4_to_rgba_16u, 0, 0, YCrCb4_to_rgba_32f}
+                {YCrCb_to_rgba_8u, 0, APPEND_16U(YCrCb_to_rgba), 0, 0, YCrCb_to_rgba_32f},
+                {YCrCb4_to_rgba_8u, 0, APPEND_16U(YCrCb4_to_rgba), 0, 0, YCrCb4_to_rgba_32f}
             }
         };
 
@@ -607,6 +631,7 @@ namespace
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3 || src.channels() == 4);
         CV_Assert(dcn == 3 || dcn == 4);
+        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
 
@@ -619,12 +644,12 @@ namespace
         static const gpu_func_t funcs[2][2][6] =
         {
             {
-                {YCrCb_to_bgr_8u, 0, YCrCb_to_bgr_16u, 0, 0, YCrCb_to_bgr_32f},
-                {YCrCb4_to_bgr_8u, 0, YCrCb4_to_bgr_16u, 0, 0, YCrCb4_to_bgr_32f}
+                {YCrCb_to_bgr_8u, 0, APPEND_16U(YCrCb_to_bgr), 0, 0, YCrCb_to_bgr_32f},
+                {YCrCb4_to_bgr_8u, 0, APPEND_16U(YCrCb4_to_bgr), 0, 0, YCrCb4_to_bgr_32f}
             },
             {
-                {YCrCb_to_bgra_8u, 0, YCrCb_to_bgra_16u, 0, 0, YCrCb_to_bgra_32f},
-                {YCrCb4_to_bgra_8u, 0, YCrCb4_to_bgra_16u, 0, 0, YCrCb4_to_bgra_32f}
+                {YCrCb_to_bgra_8u, 0, APPEND_16U(YCrCb_to_bgra), 0, 0, YCrCb_to_bgra_32f},
+                {YCrCb4_to_bgra_8u, 0, APPEND_16U(YCrCb4_to_bgra), 0, 0, YCrCb4_to_bgra_32f}
             }
         };
 
@@ -633,6 +658,7 @@ namespace
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3 || src.channels() == 4);
         CV_Assert(dcn == 3 || dcn == 4);
+        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
 
@@ -645,12 +671,12 @@ namespace
         static const gpu_func_t funcs[2][2][6] =
         {
             {
-                {rgb_to_xyz_8u, 0, rgb_to_xyz_16u, 0, 0, rgb_to_xyz_32f},
-                {rgba_to_xyz_8u, 0, rgba_to_xyz_16u, 0, 0, rgba_to_xyz_32f}
+                {rgb_to_xyz_8u, 0, APPEND_16U(rgb_to_xyz), 0, 0, rgb_to_xyz_32f},
+                {rgba_to_xyz_8u, 0, APPEND_16U(rgba_to_xyz), 0, 0, rgba_to_xyz_32f}
             },
             {
-                {rgb_to_xyz4_8u, 0, rgb_to_xyz4_16u, 0, 0, rgb_to_xyz4_32f},
-                {rgba_to_xyz4_8u, 0, rgba_to_xyz4_16u, 0, 0, rgba_to_xyz4_32f}
+                {rgb_to_xyz4_8u, 0, APPEND_16U(rgb_to_xyz4), 0, 0, rgb_to_xyz4_32f},
+                {rgba_to_xyz4_8u, 0, APPEND_16U(rgba_to_xyz4), 0, 0, rgba_to_xyz4_32f}
             }
         };
 
@@ -659,6 +685,7 @@ namespace
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3 || src.channels() == 4);
         CV_Assert(dcn == 3 || dcn == 4);
+        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
 
@@ -671,12 +698,12 @@ namespace
         static const gpu_func_t funcs[2][2][6] =
         {
             {
-                {bgr_to_xyz_8u, 0, bgr_to_xyz_16u, 0, 0, bgr_to_xyz_32f},
-                {bgra_to_xyz_8u, 0, bgra_to_xyz_16u, 0, 0, bgra_to_xyz_32f}
+                {bgr_to_xyz_8u, 0, APPEND_16U(bgr_to_xyz), 0, 0, bgr_to_xyz_32f},
+                {bgra_to_xyz_8u, 0, APPEND_16U(bgra_to_xyz), 0, 0, bgra_to_xyz_32f}
             },
             {
-                {bgr_to_xyz4_8u, 0, bgr_to_xyz4_16u, 0, 0, bgr_to_xyz4_32f},
-                {bgra_to_xyz4_8u, 0, bgra_to_xyz4_16u, 0, 0, bgra_to_xyz4_32f}
+                {bgr_to_xyz4_8u, 0, APPEND_16U(bgr_to_xyz4), 0, 0, bgr_to_xyz4_32f},
+                {bgra_to_xyz4_8u, 0, APPEND_16U(bgra_to_xyz4), 0, 0, bgra_to_xyz4_32f}
             }
         };
 
@@ -685,6 +712,7 @@ namespace
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3 || src.channels() == 4);
         CV_Assert(dcn == 3 || dcn == 4);
+        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
 
@@ -697,12 +725,12 @@ namespace
         static const gpu_func_t funcs[2][2][6] =
         {
             {
-                {xyz_to_rgb_8u, 0, xyz_to_rgb_16u, 0, 0, xyz_to_rgb_32f},
-                {xyz4_to_rgb_8u, 0, xyz4_to_rgb_16u, 0, 0, xyz4_to_rgb_32f}
+                {xyz_to_rgb_8u, 0, APPEND_16U(xyz_to_rgb), 0, 0, xyz_to_rgb_32f},
+                {xyz4_to_rgb_8u, 0, APPEND_16U(xyz4_to_rgb), 0, 0, xyz4_to_rgb_32f}
             },
             {
-                {xyz_to_rgba_8u, 0, xyz_to_rgba_16u, 0, 0, xyz_to_rgba_32f},
-                {xyz4_to_rgba_8u, 0, xyz4_to_rgba_16u, 0, 0, xyz4_to_rgba_32f}
+                {xyz_to_rgba_8u, 0, APPEND_16U(xyz_to_rgba), 0, 0, xyz_to_rgba_32f},
+                {xyz4_to_rgba_8u, 0, APPEND_16U(xyz4_to_rgba), 0, 0, xyz4_to_rgba_32f}
             }
         };
 
@@ -711,6 +739,7 @@ namespace
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3 || src.channels() == 4);
         CV_Assert(dcn == 3 || dcn == 4);
+        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
 
@@ -723,12 +752,12 @@ namespace
         static const gpu_func_t funcs[2][2][6] =
         {
             {
-                {xyz_to_bgr_8u, 0, xyz_to_bgr_16u, 0, 0, xyz_to_bgr_32f},
-                {xyz4_to_bgr_8u, 0, xyz4_to_bgr_16u, 0, 0, xyz4_to_bgr_32f}
+                {xyz_to_bgr_8u, 0, APPEND_16U(xyz_to_bgr), 0, 0, xyz_to_bgr_32f},
+                {xyz4_to_bgr_8u, 0, APPEND_16U(xyz4_to_bgr), 0, 0, xyz4_to_bgr_32f}
             },
             {
-                {xyz_to_bgra_8u, 0, xyz_to_bgra_16u, 0, 0, xyz_to_bgra_32f},
-                {xyz4_to_bgra_8u, 0, xyz4_to_bgra_16u, 0, 0, xyz4_to_bgra_32f}
+                {xyz_to_bgra_8u, 0, APPEND_16U(xyz_to_bgra), 0, 0, xyz_to_bgra_32f},
+                {xyz4_to_bgra_8u, 0, APPEND_16U(xyz4_to_bgra), 0, 0, xyz4_to_bgra_32f}
             }
         };
 
@@ -737,6 +766,7 @@ namespace
         CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
         CV_Assert(src.channels() == 3 || src.channels() == 4);
         CV_Assert(dcn == 3 || dcn == 4);
+        CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
 
         dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
 
diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu
index 3e5bc741ff..640dafb816 100644
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@@ -1168,12 +1168,14 @@ namespace cv { namespace gpu { namespace device
                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // the matchL1_gpu explicit instantiations below are skipped in the tiny build
         template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
         //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
         template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
         template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
         template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
         template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+#endif // OPENCV_TINY_GPU_MODULE
 
         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
@@ -1185,11 +1187,13 @@ namespace cv { namespace gpu { namespace device
                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // every line inside this guard is commented out, so it currently excludes nothing
         //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
         //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
         //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
         //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
         //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE -- the float instantiation below is compiled in every configuration
         template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
 
         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
@@ -1203,10 +1207,12 @@ namespace cv { namespace gpu { namespace device
         }
 
         template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE // tiny build keeps only the uchar instantiation above
         //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
         template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
         //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
         template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
 
         template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
@@ -1218,12 +1224,14 @@ namespace cv { namespace gpu { namespace device
                 match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance,  stream);
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // tiny build: none of the match2L1_gpu instantiations below are compiled
         template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         template void match2L1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
 
         template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
@@ -1235,12 +1243,14 @@ namespace cv { namespace gpu { namespace device
                 match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // NOTE(review): the float instantiation is inside this guard too, so tiny builds get no match2L2_gpu at all -- confirm intended
         //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         //template void match2L2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
 
         template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
@@ -1252,11 +1262,13 @@ namespace cv { namespace gpu { namespace device
                 match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // tiny build: no match2Hamming_gpu instantiations are compiled (uchar included)
         template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
         template void match2Hamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
     } // namespace bf_knnmatch
 }}} // namespace cv { namespace gpu { namespace device {
 
diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu
index c2ae48bb30..baaf979e5d 100644
--- a/modules/gpu/src/cuda/bf_match.cu
+++ b/modules/gpu/src/cuda/bf_match.cu
@@ -644,12 +644,14 @@ namespace cv { namespace gpu { namespace device
             }
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // tiny build: none of the single-train matchL1_gpu instantiations below are compiled
         template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
         //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
         template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
         template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
         template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
         template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
 
         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
                                                const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
@@ -669,11 +671,13 @@ namespace cv { namespace gpu { namespace device
             }
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // every line inside this guard is commented out, so it currently excludes nothing
         //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
         //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
         //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
         //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
         //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE -- the float instantiation below is compiled in every configuration
         template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
 
         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
@@ -695,10 +699,12 @@ namespace cv { namespace gpu { namespace device
         }
 
         template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE // tiny build keeps only the uchar instantiation above
         //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
         template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
         //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
         template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
 
         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                                const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
@@ -718,12 +724,14 @@ namespace cv { namespace gpu { namespace device
             }
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // tiny build: none of the multi-train (trains/masks) matchL1_gpu instantiations below are compiled
         template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
 
         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                                const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
@@ -743,12 +751,14 @@ namespace cv { namespace gpu { namespace device
             }
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // NOTE(review): the float instantiation is inside this guard too, so tiny builds get no multi-train matchL2_gpu -- confirm intended
         //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         //template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
 
         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                                     const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
@@ -768,11 +778,13 @@ namespace cv { namespace gpu { namespace device
             }
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // tiny build: no multi-train matchHamming_gpu instantiations are compiled (uchar included)
         template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
         template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
     } // namespace bf_match
 }}} // namespace cv { namespace gpu { namespace device {
 
diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu
index d83f9f7f96..8493b4e065 100644
--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
@@ -356,12 +356,14 @@ namespace cv { namespace gpu { namespace device
             }
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // tiny build: none of the single-train radius matchL1_gpu instantiations below are compiled
         template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
 
         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
             const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
@@ -381,11 +383,13 @@ namespace cv { namespace gpu { namespace device
             }
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // every line inside this guard is commented out, so it currently excludes nothing
         //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE -- the float instantiation below is compiled in every configuration
         template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
 
         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
@@ -407,10 +411,12 @@ namespace cv { namespace gpu { namespace device
         }
 
         template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE // tiny build keeps only the uchar instantiation above
         //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
 
         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
@@ -421,12 +427,14 @@ namespace cv { namespace gpu { namespace device
                 stream);
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // tiny build: none of the multi-train (trains array) radius matchL1_gpu instantiations below are compiled
         template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
 
         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
@@ -437,12 +445,14 @@ namespace cv { namespace gpu { namespace device
                 stream);
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // NOTE(review): the float instantiation is inside this guard too, so tiny builds get no multi-train radius matchL2_gpu -- confirm intended
         //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         //template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
 
         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
@@ -453,11 +463,13 @@ namespace cv { namespace gpu { namespace device
                 stream);
         }
 
+#ifndef OPENCV_TINY_GPU_MODULE // tiny build: no multi-train radius matchHamming_gpu instantiations are compiled (uchar included)
         template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
         template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
+#endif // !OPENCV_TINY_GPU_MODULE
     } // namespace bf_radius_match
 }}} // namespace cv { namespace gpu { namespace device
 
diff --git a/modules/gpu/src/cuda/bilateral_filter.cu b/modules/gpu/src/cuda/bilateral_filter.cu
index 15e72a8b75..69f58aabd9 100644
--- a/modules/gpu/src/cuda/bilateral_filter.cu
+++ b/modules/gpu/src/cuda/bilateral_filter.cu
@@ -149,6 +149,16 @@ namespace cv { namespace gpu { namespace device
         {
             typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream);
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only the first two border-mode callers are instantiated
+            static caller_t funcs[] =
+            {
+                bilateral_caller<T, BrdReflect101>,
+                bilateral_caller<T, BrdReplicate>,
+                0, // this and the following slots are null: no caller exists for those border modes in the tiny build
+                0,
+                0,
+            };
+#else // full build
             static caller_t funcs[] =
             {
                 bilateral_caller<T, BrdReflect101>,
@@ -157,7 +167,13 @@ namespace cv { namespace gpu { namespace device
                 bilateral_caller<T, BrdReflect>,
                 bilateral_caller<T, BrdWrap>,
             };
-            funcs[borderMode](src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
+#endif // OPENCV_TINY_GPU_MODULE
+
+            const caller_t caller = funcs[borderMode]; // borderMode indexes the table directly; no range check is done here
+            if (!caller) // null slot => the requested border mode is not available in this build
+                cv::gpu::error("Unsupported input parameters for bilateral_filter", __FILE__, __LINE__, ""); // NOTE(review): presumably throws; if it returned, the null caller below would be invoked -- confirm
+
+            caller(src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
         }
     }
 }}}
@@ -171,6 +187,7 @@ OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
 OCV_INSTANTIATE_BILATERAL_FILTER(uchar3)
 OCV_INSTANTIATE_BILATERAL_FILTER(uchar4)
 
+#ifndef OPENCV_TINY_GPU_MODULE
 //OCV_INSTANTIATE_BILATERAL_FILTER(schar)
 //OCV_INSTANTIATE_BILATERAL_FILTER(schar2)
 //OCV_INSTANTIATE_BILATERAL_FILTER(schar3)
@@ -190,6 +207,7 @@ OCV_INSTANTIATE_BILATERAL_FILTER(ushort4)
 //OCV_INSTANTIATE_BILATERAL_FILTER(int2)
 //OCV_INSTANTIATE_BILATERAL_FILTER(int3)
 //OCV_INSTANTIATE_BILATERAL_FILTER(int4)
+#endif
 
 OCV_INSTANTIATE_BILATERAL_FILTER(float)
 //OCV_INSTANTIATE_BILATERAL_FILTER(float2)
diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu
index 5d8f6cbbb5..3ac0c111c8 100644
--- a/modules/gpu/src/cuda/color.cu
+++ b/modules/gpu/src/cuda/color.cu
@@ -235,10 +235,16 @@ namespace cv { namespace gpu { namespace device
 #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
     OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
 
-#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: _ALL expands to the 8u and 32f variants only (no 16u)
+    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
+#else // full build: 8u, 16u and 32f variants
+    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
+#endif // OPENCV_TINY_GPU_MODULE
 
 #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
     OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
diff --git a/modules/gpu/src/cuda/column_filter.10.cu b/modules/gpu/src/cuda/column_filter.10.cu
index b71e25207e..81e4fe7a0a 100644
--- a/modules/gpu/src/cuda/column_filter.10.cu
+++ b/modules/gpu/src/cuda/column_filter.10.cu
@@ -44,9 +44,13 @@
 
 #include "column_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* linearColumn<float, unsigned short> is left out of the tiny build */
+
 namespace filter
 {
     template void linearColumn<float, unsigned short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* !OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/column_filter.11.cu b/modules/gpu/src/cuda/column_filter.11.cu
index ccfbf8e773..34a065453b 100644
--- a/modules/gpu/src/cuda/column_filter.11.cu
+++ b/modules/gpu/src/cuda/column_filter.11.cu
@@ -44,9 +44,13 @@
 
 #include "column_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* linearColumn<float3, ushort3> is left out of the tiny build */
+
 namespace filter
 {
     template void linearColumn<float3, ushort3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* !OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/column_filter.12.cu b/modules/gpu/src/cuda/column_filter.12.cu
index a38f93b531..bc0a45bc3f 100644
--- a/modules/gpu/src/cuda/column_filter.12.cu
+++ b/modules/gpu/src/cuda/column_filter.12.cu
@@ -44,9 +44,13 @@
 
 #include "column_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* linearColumn<float4, ushort4> is left out of the tiny build */
+
 namespace filter
 {
     template void linearColumn<float4, ushort4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* !OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/column_filter.13.cu b/modules/gpu/src/cuda/column_filter.13.cu
index 40eec7a83f..b7facb6c03 100644
--- a/modules/gpu/src/cuda/column_filter.13.cu
+++ b/modules/gpu/src/cuda/column_filter.13.cu
@@ -44,9 +44,13 @@
 
 #include "column_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* linearColumn<float3, int3> is left out of the tiny build */
+
 namespace filter
 {
     template void linearColumn<float3, int3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* !OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/column_filter.14.cu b/modules/gpu/src/cuda/column_filter.14.cu
index 08151ac6d0..6db983786b 100644
--- a/modules/gpu/src/cuda/column_filter.14.cu
+++ b/modules/gpu/src/cuda/column_filter.14.cu
@@ -44,9 +44,13 @@
 
 #include "column_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* linearColumn<float4, int4> is left out of the tiny build */
+
 namespace filter
 {
     template void linearColumn<float4, int4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* !OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/column_filter.3.cu b/modules/gpu/src/cuda/column_filter.3.cu
index 7304565b96..339bdabc68 100644
--- a/modules/gpu/src/cuda/column_filter.3.cu
+++ b/modules/gpu/src/cuda/column_filter.3.cu
@@ -44,9 +44,13 @@
 
 #include "column_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* linearColumn<float3, short3> is left out of the tiny build */
+
 namespace filter
 {
     template void linearColumn<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* !OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/column_filter.4.cu b/modules/gpu/src/cuda/column_filter.4.cu
index 8c9db6985b..37f9bd718e 100644
--- a/modules/gpu/src/cuda/column_filter.4.cu
+++ b/modules/gpu/src/cuda/column_filter.4.cu
@@ -44,9 +44,13 @@
 
 #include "column_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* linearColumn<float, int> is left out of the tiny build */
+
 namespace filter
 {
     template void linearColumn<float, int>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* !OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/column_filter.8.cu b/modules/gpu/src/cuda/column_filter.8.cu
index 0a63a1dd43..b4ad5bd02e 100644
--- a/modules/gpu/src/cuda/column_filter.8.cu
+++ b/modules/gpu/src/cuda/column_filter.8.cu
@@ -44,9 +44,13 @@
 
 #include "column_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* linearColumn<float, short> is left out of the tiny build */
+
 namespace filter
 {
     template void linearColumn<float, short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* !OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/column_filter.9.cu b/modules/gpu/src/cuda/column_filter.9.cu
index 758d9289d9..da64c32225 100644
--- a/modules/gpu/src/cuda/column_filter.9.cu
+++ b/modules/gpu/src/cuda/column_filter.9.cu
@@ -44,9 +44,13 @@
 
 #include "column_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* linearColumn<float4, short4> is left out of the tiny build */
+
 namespace filter
 {
     template void linearColumn<float4, short4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* !OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/column_filter.h b/modules/gpu/src/cuda/column_filter.h
index 46e3583153..139a6ef20a 100644
--- a/modules/gpu/src/cuda/column_filter.h
+++ b/modules/gpu/src/cuda/column_filter.h
@@ -183,6 +183,186 @@ namespace filter
     {
         typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
 
+#ifdef OPENCV_TINY_GPU_MODULE
+        static const caller_t callers[5][33] =
+        {
+            {
+                0,
+                0,
+                0,
+                column_filter::caller< 3, T, D, BrdColReflect101>,
+                0,
+                column_filter::caller< 5, T, D, BrdColReflect101>,
+                0,
+                column_filter::caller< 7, T, D, BrdColReflect101>,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            },
+            {
+                0,
+                0,
+                0,
+                column_filter::caller< 3, T, D, BrdColReplicate>,
+                0,
+                column_filter::caller< 5, T, D, BrdColReplicate>,
+                0,
+                column_filter::caller< 7, T, D, BrdColReplicate>,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            },
+            {
+                0,
+                0,
+                0,
+                column_filter::caller< 3, T, D, BrdColConstant>,
+                0,
+                column_filter::caller< 5, T, D, BrdColConstant>,
+                0,
+                column_filter::caller< 7, T, D, BrdColConstant>,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            },
+            {
+                0,
+                0,
+                0,
+                column_filter::caller< 3, T, D, BrdColReflect>,
+                0,
+                column_filter::caller< 5, T, D, BrdColReflect>,
+                0,
+                column_filter::caller< 7, T, D, BrdColReflect>,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            },
+            {
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0
+            }
+        };
+#else
         static const caller_t callers[5][33] =
         {
             {
@@ -361,12 +541,17 @@ namespace filter
                 column_filter::caller<32, T, D, BrdColWrap>
             }
         };
+#endif
+
+        const caller_t caller = callers[brd_type][ksize]; // NOTE(review): no range check on brd_type/ksize here - presumably validated by the caller; confirm
+        if (!caller) // 0 entry: combination not compiled in; checked before c_kernel is written below
+            cv::gpu::error("Unsupported input parameters for column_filter", __FILE__, __LINE__, ""); // NOTE(review): presumably does not return (throws) - confirm
 
         if (stream == 0)
             cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
         else
             cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
 
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+        caller((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
     }
 }
diff --git a/modules/gpu/src/cuda/copy_make_border.cu b/modules/gpu/src/cuda/copy_make_border.cu
index 5553810ebd..ca5a4f779e 100644
--- a/modules/gpu/src/cuda/copy_make_border.cu
+++ b/modules/gpu/src/cuda/copy_make_border.cu
@@ -90,10 +90,18 @@ namespace cv { namespace gpu { namespace device
                 CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
                 CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
                 CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
+    #ifdef OPENCV_TINY_GPU_MODULE
+                0, // BrdWrap dispatcher is compiled out of the tiny build; the slot is kept so borderMode indexing is unchanged
+    #else
                 CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
+    #endif // OPENCV_TINY_GPU_MODULE
             };
 
-            callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
+            const caller_t caller = callers[borderMode]; // pick the dispatcher for this border mode
+            if (!caller) // slot is 0 when the border mode was compiled out
+                cv::gpu::error("Unsupported input parameters for copyMakeBorder", __FILE__, __LINE__, ""); // NOTE(review): presumably does not return (throws) - confirm, otherwise the null pointer is called below
+
+            caller(PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
         }
 
         template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
@@ -101,6 +109,7 @@ namespace cv { namespace gpu { namespace device
         template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
         template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
 
+#ifndef OPENCV_TINY_GPU_MODULE
         //template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
         //template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
         //template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
@@ -120,6 +129,7 @@ namespace cv { namespace gpu { namespace device
         //template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
         //template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
         //template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
+#endif
 
         template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
         //template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index f606f0c8c1..bca89ad643 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -234,6 +234,7 @@ namespace arithm
     }
 
     template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void addMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     template void addMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     template void addMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -278,7 +279,9 @@ namespace arithm
     //template void addMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     //template void addMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     //template void addMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#endif
     template void addMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void addMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
     //template void addMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -288,6 +291,7 @@ namespace arithm
     //template void addMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     //template void addMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     template void addMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#endif
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -329,6 +333,7 @@ namespace arithm
     }
 
     template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void addScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     template void addScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     template void addScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -373,7 +378,9 @@ namespace arithm
     //template void addScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     //template void addScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     //template void addScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#endif
     template void addScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void addScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
     //template void addScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -383,6 +390,7 @@ namespace arithm
     //template void addScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     //template void addScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     template void addScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#endif
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -469,6 +477,7 @@ namespace arithm
     }
 
     template void subMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void subMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     template void subMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     template void subMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -513,7 +522,9 @@ namespace arithm
     //template void subMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     //template void subMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     //template void subMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#endif
     template void subMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void subMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
     //template void subMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -523,6 +534,7 @@ namespace arithm
     //template void subMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     //template void subMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     template void subMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#endif
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -542,6 +554,7 @@ namespace arithm
     }
 
     template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -586,7 +599,9 @@ namespace arithm
     //template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     //template void subScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     //template void subScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#endif
     template void subScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void subScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
     //template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -596,6 +611,7 @@ namespace arithm
     //template void subScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     //template void subScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     template void subScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+#endif
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -700,6 +716,7 @@ namespace arithm
     }
 
     template void mulMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void mulMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
     template void mulMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
     template void mulMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
@@ -744,7 +761,9 @@ namespace arithm
     //template void mulMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
     //template void mulMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
     //template void mulMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+#endif
     template void mulMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void mulMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 
     //template void mulMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
@@ -754,6 +773,7 @@ namespace arithm
     //template void mulMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
     //template void mulMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
     template void mulMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+#endif
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -791,6 +811,7 @@ namespace arithm
     }
 
     template void mulScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void mulScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     template void mulScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     template void mulScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@@ -835,7 +856,9 @@ namespace arithm
     //template void mulScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     //template void mulScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     //template void mulScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+#endif
     template void mulScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void mulScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 
     //template void mulScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@@ -845,6 +868,7 @@ namespace arithm
     //template void mulScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     //template void mulScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     template void mulScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+#endif
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -968,6 +992,7 @@ namespace arithm
     }
 
     template void divMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void divMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
     template void divMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
     template void divMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
@@ -1012,7 +1037,9 @@ namespace arithm
     //template void divMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
     //template void divMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
     //template void divMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+#endif
     template void divMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void divMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 
     //template void divMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
@@ -1022,6 +1049,7 @@ namespace arithm
     //template void divMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
     //template void divMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
     template void divMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+#endif
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1037,6 +1065,7 @@ namespace arithm
     }
 
     template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@@ -1081,7 +1110,9 @@ namespace arithm
     //template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     //template void divScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     //template void divScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+#endif
     template void divScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void divScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 
     //template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@@ -1091,6 +1122,7 @@ namespace arithm
     //template void divScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     //template void divScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     template void divScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+#endif
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1128,6 +1160,7 @@ namespace arithm
     }
 
     template void divInv<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void divInv<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     template void divInv<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     template void divInv<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@@ -1172,7 +1205,9 @@ namespace arithm
     //template void divInv<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     //template void divInv<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     //template void divInv<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+#endif
     template void divInv<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void divInv<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 
     //template void divInv<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@@ -1182,6 +1217,7 @@ namespace arithm
     //template void divInv<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     //template void divInv<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     template void divInv<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+#endif
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1278,12 +1314,16 @@ namespace arithm
     }
 
     template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1323,12 +1363,16 @@ namespace arithm
     }
 
     template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
     template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
     template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
     template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1349,13 +1393,17 @@ namespace arithm
         transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, abs_func<T>(), WithOutMask(), stream);
     }
 
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void absMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void absMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void absMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void absMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void absMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void absMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void absMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1390,13 +1438,17 @@ namespace arithm
         transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Sqr<T>(), WithOutMask(), stream);
     }
 
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void sqrMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void sqrMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void sqrMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void sqrMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void sqrMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void sqrMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void sqrMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1417,13 +1469,17 @@ namespace arithm
         transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, sqrt_func<T>(), WithOutMask(), stream);
     }
 
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void sqrtMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void sqrtMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void sqrtMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void sqrtMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void sqrtMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void sqrtMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void sqrtMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1444,13 +1500,17 @@ namespace arithm
         transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, log_func<T>(), WithOutMask(), stream);
     }
 
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void logMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void logMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void logMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void logMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void logMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void logMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void logMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1486,13 +1546,17 @@ namespace arithm
         transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Exp<T>(), WithOutMask(), stream);
     }
 
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void expMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void expMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void expMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void expMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     template void expMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void expMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void expMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 }
 
 //////////////////////////////////////////////////////////////////////////////////////
@@ -1620,36 +1684,52 @@ namespace arithm
     }
 
     template void cmpMatEq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpMatEq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void cmpMatEq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void cmpMatEq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void cmpMatEq<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void cmpMatEq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpMatEq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 
     template void cmpMatNe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpMatNe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void cmpMatNe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void cmpMatNe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void cmpMatNe<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void cmpMatNe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpMatNe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 
     template void cmpMatLt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpMatLt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void cmpMatLt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void cmpMatLt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void cmpMatLt<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void cmpMatLt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpMatLt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 
     template void cmpMatLe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpMatLe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void cmpMatLe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void cmpMatLe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void cmpMatLe<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void cmpMatLe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 }
 
 //////////////////////////////////////////////////////////////////////////////////////
@@ -1824,52 +1904,76 @@ namespace arithm
     }
 
     template void cmpScalarEq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpScalarEq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarEq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarEq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarEq<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void cmpScalarEq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpScalarEq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 
     template void cmpScalarNe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpScalarNe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarNe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarNe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarNe<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void cmpScalarNe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpScalarNe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 
     template void cmpScalarLt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpScalarLt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarLt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarLt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarLt<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void cmpScalarLt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpScalarLt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 
     template void cmpScalarLe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpScalarLe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarLe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarLe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarLe<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void cmpScalarLe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpScalarLe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 
     template void cmpScalarGt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpScalarGt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarGt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarGt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarGt<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void cmpScalarGt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpScalarGt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 
     template void cmpScalarGe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpScalarGe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarGe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarGe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     template void cmpScalarGe<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void cmpScalarGe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void cmpScalarGe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 }
 
 //////////////////////////////////////////////////////////////////////////////////////
@@ -1981,19 +2085,25 @@ namespace arithm
     }
 
     template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
     template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
     template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 
     template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
     template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
     template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 
     template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
     template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
     template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -2067,18 +2177,27 @@ namespace arithm
     }
 
     template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void minMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void minMat<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void minMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void minMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 
     template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
     {
         transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
     }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+    template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minScalar<int   >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+#else
     template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
     template void minScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
     template void minScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
@@ -2086,6 +2205,7 @@ namespace arithm
     template void minScalar<int   >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
     template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
     template void minScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+#endif
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -2159,12 +2279,16 @@ namespace arithm
     }
 
     template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void maxMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     template void maxMat<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void maxMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void maxMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 
     template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
     {
@@ -2172,12 +2296,16 @@ namespace arithm
     }
 
     template void maxScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void maxScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
     template void maxScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
     template void maxScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
     template void maxScalar<int   >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void maxScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void maxScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -2233,12 +2361,16 @@ namespace arithm
     }
 
     template void threshold<uchar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void threshold<schar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
     template void threshold<ushort>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
     template void threshold<short>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
     template void threshold<int>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void threshold<float>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void threshold<double>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -2312,13 +2444,17 @@ namespace arithm
         transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, PowOp<T>(power), WithOutMask(), stream);
     }
 
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void pow<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
     template void pow<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
     template void pow<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
     template void pow<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
     template void pow<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
     template void pow<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE /* excluded from the tiny GPU module build */
     template void pow<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+#endif /* !OPENCV_TINY_GPU_MODULE */
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -2397,6 +2533,7 @@ namespace arithm
     }
 
     template void addWeighted<uchar, uchar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void addWeighted<uchar, uchar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<uchar, uchar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<uchar, uchar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@@ -2451,9 +2588,10 @@ namespace arithm
     template void addWeighted<uchar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<uchar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<uchar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+#endif
 
 
-
+#ifndef OPENCV_TINY_GPU_MODULE
     template void addWeighted<schar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<schar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<schar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@@ -2501,9 +2639,10 @@ namespace arithm
     template void addWeighted<schar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<schar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<schar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+#endif
 
 
-
+#ifndef OPENCV_TINY_GPU_MODULE
     template void addWeighted<ushort, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<ushort, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<ushort, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@@ -2543,9 +2682,10 @@ namespace arithm
     template void addWeighted<ushort, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<ushort, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<ushort, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+#endif
 
 
-
+#ifndef OPENCV_TINY_GPU_MODULE
     template void addWeighted<short, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<short, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<short, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@@ -2577,9 +2717,10 @@ namespace arithm
     template void addWeighted<short, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<short, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<short, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+#endif
 
 
-
+#ifndef OPENCV_TINY_GPU_MODULE
     template void addWeighted<int, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<int, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<int, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@@ -2603,15 +2744,18 @@ namespace arithm
     template void addWeighted<int, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<int, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<int, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+#endif
 
 
-
+#ifndef OPENCV_TINY_GPU_MODULE
     template void addWeighted<float, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<float, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<float, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<float, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<float, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+#endif
     template void addWeighted<float, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void addWeighted<float, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
     template void addWeighted<float, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@@ -2621,9 +2765,11 @@ namespace arithm
     template void addWeighted<float, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<float, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<float, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+#endif
 
 
 
+#ifndef OPENCV_TINY_GPU_MODULE
     template void addWeighted<double, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<double, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<double, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@@ -2631,6 +2777,7 @@ namespace arithm
     template void addWeighted<double, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<double, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     template void addWeighted<double, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+#endif
 }
 
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index 067dfaf640..2a1bca4ad9 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -985,6 +985,16 @@ namespace cv { namespace gpu { namespace device
                           int borderMode, const float* borderValue, cudaStream_t stream)
         {
             typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
+#ifdef OPENCV_TINY_GPU_MODULE
+            static const func_t funcs[] =
+            {
+                Filter2DCaller<T, D, BrdReflect101>::call,
+                Filter2DCaller<T, D, BrdReplicate>::call,
+                Filter2DCaller<T, D, BrdConstant>::call,
+                Filter2DCaller<T, D, BrdReflect>::call,
+                0
+            };
+#else
             static const func_t funcs[] =
             {
                 Filter2DCaller<T, D, BrdReflect101>::call,
@@ -993,19 +1003,26 @@ namespace cv { namespace gpu { namespace device
                 Filter2DCaller<T, D, BrdReflect>::call,
                 Filter2DCaller<T, D, BrdWrap>::call
             };
+#endif
+
+            const func_t func = funcs[borderMode];
+            if (!func)
+                cv::gpu::error("Unsupported input parameters for filter2D", __FILE__, __LINE__, "");
 
             if (stream == 0)
                 cudaSafeCall( cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
             else
                 cudaSafeCall( cudaMemcpyToSymbolAsync(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
 
-            funcs[borderMode](static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
+            func(static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
         }
 
         template void filter2D_gpu<uchar, uchar>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
         template void filter2D_gpu<uchar4, uchar4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+#ifndef OPENCV_TINY_GPU_MODULE
         template void filter2D_gpu<ushort, ushort>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
         template void filter2D_gpu<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+#endif
         template void filter2D_gpu<float, float>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
         template void filter2D_gpu<float4, float4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
     } // namespace imgproc
diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu
index 745daca1db..15d6612832 100644
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -462,6 +462,7 @@ namespace sum
     }
 
     template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@@ -485,8 +486,10 @@ namespace sum
     template void run<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void run<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void run<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+#endif
 
     template void run<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void run<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void run<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void run<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@@ -495,6 +498,7 @@ namespace sum
     template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+#endif
 
     template <typename T, int cn>
     void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
@@ -504,6 +508,7 @@ namespace sum
     }
 
     template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@@ -527,8 +532,10 @@ namespace sum
     template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+#endif
 
     template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@@ -537,6 +544,7 @@ namespace sum
     template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+#endif
 
     template <typename T> struct Sqr : unary_function<T, T>
     {
@@ -553,6 +561,7 @@ namespace sum
     }
 
     template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@@ -576,8 +585,10 @@ namespace sum
     template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+#endif
 
     template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@@ -586,6 +597,7 @@ namespace sum
     template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
     template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
+#endif
 }
 
 /////////////////////////////////////////////////////////////
@@ -773,12 +785,16 @@ namespace minMax
     }
 
     template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
     template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
     template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
     template void run<int   >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+#endif
     template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+#endif
 }
 
 /////////////////////////////////////////////////////////////
@@ -955,12 +971,16 @@ namespace minMaxLoc
     }
 
     template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void run<signed char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
     template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
     template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+#endif
     template void run<int   >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
     template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+#ifndef OPENCV_TINY_GPU_MODULE
     template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+#endif
 }
 
 /////////////////////////////////////////////////////////////
@@ -1079,12 +1099,16 @@ namespace countNonZero
     }
 
     template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
+#ifndef OPENCV_TINY_GPU_MODULE
     template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
     template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
     template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
     template int run<int   >(const PtrStepSzb src, PtrStep<unsigned int> buf);
+#endif
     template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
+#ifndef OPENCV_TINY_GPU_MODULE
     template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
+#endif
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1257,6 +1281,11 @@ namespace reduce
         funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
     }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+    template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+    template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+    template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+#else
     template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
     template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
     template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
@@ -1280,6 +1309,7 @@ namespace reduce
     template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
 
     template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+#endif
 
     ///////////////////////////////////////////////////////////
 
@@ -1338,6 +1368,11 @@ namespace reduce
         funcs[cn][op](src, dst, stream);
     }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+    template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+    template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+    template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+#else
     template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
     template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
     template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
@@ -1361,6 +1396,7 @@ namespace reduce
     template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
 
     template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+#endif
 }
 
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/pyr_down.cu b/modules/gpu/src/cuda/pyr_down.cu
index eac7928826..af0e18d888 100644
--- a/modules/gpu/src/cuda/pyr_down.cu
+++ b/modules/gpu/src/cuda/pyr_down.cu
@@ -197,6 +197,7 @@ namespace cv { namespace gpu { namespace device
         template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
         template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 
+#ifndef OPENCV_TINY_GPU_MODULE
         //template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
         //template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
         //template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@@ -216,6 +217,7 @@ namespace cv { namespace gpu { namespace device
         //template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
         //template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
         //template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#endif
 
         template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
         //template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
diff --git a/modules/gpu/src/cuda/pyr_up.cu b/modules/gpu/src/cuda/pyr_up.cu
index b14d124e7c..ffb6276622 100644
--- a/modules/gpu/src/cuda/pyr_up.cu
+++ b/modules/gpu/src/cuda/pyr_up.cu
@@ -166,6 +166,7 @@ namespace cv { namespace gpu { namespace device
         template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
         template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 
+#ifndef OPENCV_TINY_GPU_MODULE
         //template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
         //template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
         //template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@@ -185,6 +186,7 @@ namespace cv { namespace gpu { namespace device
         //template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
         //template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
         //template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+#endif
 
         template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
         //template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
diff --git a/modules/gpu/src/cuda/remap.cu b/modules/gpu/src/cuda/remap.cu
index f40ada0302..77bf976140 100644
--- a/modules/gpu/src/cuda/remap.cu
+++ b/modules/gpu/src/cuda/remap.cu
@@ -209,6 +209,7 @@ namespace cv { namespace gpu { namespace device
             typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
                 PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
 
+#ifdef OPENCV_TINY_GPU_MODULE
             static const caller_t callers[3][5] =
             {
                 {
@@ -216,25 +217,55 @@ namespace cv { namespace gpu { namespace device
                     RemapDispatcher<PointFilter, BrdReplicate, T>::call,
                     RemapDispatcher<PointFilter, BrdConstant, T>::call,
                     RemapDispatcher<PointFilter, BrdReflect, T>::call,
-                    RemapDispatcher<PointFilter, BrdWrap, T>::call
+                    0/*RemapDispatcher<PointFilter, BrdWrap, T>::call*/,
                 },
                 {
                     RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
                     RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
                     RemapDispatcher<LinearFilter, BrdConstant, T>::call,
                     RemapDispatcher<LinearFilter, BrdReflect, T>::call,
-                    RemapDispatcher<LinearFilter, BrdWrap, T>::call
+                    0/*RemapDispatcher<LinearFilter, BrdWrap, T>::call*/,
+                },
+                {
+                    0/*RemapDispatcher<CubicFilter, BrdReflect101, T>::call*/,
+                    0/*RemapDispatcher<CubicFilter, BrdReplicate, T>::call*/,
+                    0/*RemapDispatcher<CubicFilter, BrdConstant, T>::call*/,
+                    0/*RemapDispatcher<CubicFilter, BrdReflect, T>::call*/,
+                    0/*RemapDispatcher<CubicFilter, BrdWrap, T>::call*/,
+                }
+            };
+#else
+            static const caller_t callers[3][5] =
+            {
+                {
+                    RemapDispatcher<PointFilter, BrdReflect101, T>::call,
+                    RemapDispatcher<PointFilter, BrdReplicate, T>::call,
+                    RemapDispatcher<PointFilter, BrdConstant, T>::call,
+                    RemapDispatcher<PointFilter, BrdReflect, T>::call,
+                    RemapDispatcher<PointFilter, BrdWrap, T>::call,
+                },
+                {
+                    RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
+                    RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
+                    RemapDispatcher<LinearFilter, BrdConstant, T>::call,
+                    RemapDispatcher<LinearFilter, BrdReflect, T>::call,
+                    RemapDispatcher<LinearFilter, BrdWrap, T>::call,
                 },
                 {
                     RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
                     RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
                     RemapDispatcher<CubicFilter, BrdConstant, T>::call,
                     RemapDispatcher<CubicFilter, BrdReflect, T>::call,
-                    RemapDispatcher<CubicFilter, BrdWrap, T>::call
+                    RemapDispatcher<CubicFilter, BrdWrap, T>::call,
                 }
             };
+#endif
 
-            callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
+            const caller_t caller = callers[interpolation][borderMode];
+            if (!caller)
+                cv::gpu::error("Unsupported input parameters for remap", __FILE__, __LINE__, "");
+
+            caller(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
                 static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
         }
 
@@ -243,6 +274,7 @@ namespace cv { namespace gpu { namespace device
         template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
+#ifndef OPENCV_TINY_GPU_MODULE
         //template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@@ -262,6 +294,7 @@ namespace cv { namespace gpu { namespace device
         //template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+#endif
 
         template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
diff --git a/modules/gpu/src/cuda/resize.cu b/modules/gpu/src/cuda/resize.cu
index 1998b3b07c..11a90ab243 100644
--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@@ -342,11 +342,13 @@ namespace cv { namespace gpu { namespace device
     template <> struct ResizeNearestDispatcher<uchar> : SelectImplForNearest<uchar> {};
     template <> struct ResizeNearestDispatcher<uchar4> : SelectImplForNearest<uchar4> {};
 
+#ifndef OPENCV_TINY_GPU_MODULE
     template <> struct ResizeNearestDispatcher<ushort> : SelectImplForNearest<ushort> {};
     template <> struct ResizeNearestDispatcher<ushort4> : SelectImplForNearest<ushort4> {};
 
     template <> struct ResizeNearestDispatcher<short> : SelectImplForNearest<short> {};
     template <> struct ResizeNearestDispatcher<short4> : SelectImplForNearest<short4> {};
+#endif
 
     template <> struct ResizeNearestDispatcher<float> : SelectImplForNearest<float> {};
     template <> struct ResizeNearestDispatcher<float4> : SelectImplForNearest<float4> {};
@@ -380,11 +382,13 @@ namespace cv { namespace gpu { namespace device
     template <> struct ResizeLinearDispatcher<uchar> : SelectImplForLinear<uchar> {};
     template <> struct ResizeLinearDispatcher<uchar4> : SelectImplForLinear<uchar4> {};
 
+#ifndef OPENCV_TINY_GPU_MODULE
     template <> struct ResizeLinearDispatcher<ushort> : SelectImplForLinear<ushort> {};
     template <> struct ResizeLinearDispatcher<ushort4> : SelectImplForLinear<ushort4> {};
 
     template <> struct ResizeLinearDispatcher<short> : SelectImplForLinear<short> {};
     template <> struct ResizeLinearDispatcher<short4> : SelectImplForLinear<short4> {};
+#endif
 
     template <> struct ResizeLinearDispatcher<float> : SelectImplForLinear<float> {};
     template <> struct ResizeLinearDispatcher<float4> : SelectImplForLinear<float4> {};
@@ -410,6 +414,7 @@ namespace cv { namespace gpu { namespace device
         }
     };
 
+#ifndef OPENCV_TINY_GPU_MODULE
     template <> struct ResizeCubicDispatcher<uchar> : SelectImplForCubic<uchar> {};
     template <> struct ResizeCubicDispatcher<uchar4> : SelectImplForCubic<uchar4> {};
 
@@ -421,6 +426,7 @@ namespace cv { namespace gpu { namespace device
 
     template <> struct ResizeCubicDispatcher<float> : SelectImplForCubic<float> {};
     template <> struct ResizeCubicDispatcher<float4> : SelectImplForCubic<float4> {};
+#endif
 
     // ResizeAreaDispatcher
 
@@ -467,7 +473,11 @@ namespace cv { namespace gpu { namespace device
         {
             ResizeNearestDispatcher<T>::call,
             ResizeLinearDispatcher<T>::call,
+#ifdef OPENCV_TINY_GPU_MODULE
+            0,
+#else
             ResizeCubicDispatcher<T>::call,
+#endif
             ResizeAreaDispatcher<T>::call
         };
 
@@ -475,13 +485,18 @@ namespace cv { namespace gpu { namespace device
         if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
             interpolation = 1;
 
-        funcs[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), yoff, xoff, static_cast< PtrStepSz<T> >(dst), fy, fx, stream);
+        const func_t func = funcs[interpolation];
+        if (!func)
+            cv::gpu::error("Unsupported input parameters for resize", __FILE__, __LINE__, "");
+
+        func(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), yoff, xoff, static_cast< PtrStepSz<T> >(dst), fy, fx, stream);
     }
 
     template void resize<uchar >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
     template void resize<uchar3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
     template void resize<uchar4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
 
+#ifndef OPENCV_TINY_GPU_MODULE
     template void resize<ushort >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
     template void resize<ushort3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
     template void resize<ushort4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
@@ -489,6 +504,7 @@ namespace cv { namespace gpu { namespace device
     template void resize<short >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
     template void resize<short3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
     template void resize<short4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+#endif
 
     template void resize<float >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
     template void resize<float3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
diff --git a/modules/gpu/src/cuda/row_filter.10.cu b/modules/gpu/src/cuda/row_filter.10.cu
index 7d93ee31ac..c910270a64 100644
--- a/modules/gpu/src/cuda/row_filter.10.cu
+++ b/modules/gpu/src/cuda/row_filter.10.cu
@@ -44,9 +44,13 @@
 
 #include "row_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE
+
 namespace filter
 {
     template void linearRow<unsigned short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/row_filter.11.cu b/modules/gpu/src/cuda/row_filter.11.cu
index 31bccc48b6..c5e1fbcd96 100644
--- a/modules/gpu/src/cuda/row_filter.11.cu
+++ b/modules/gpu/src/cuda/row_filter.11.cu
@@ -44,9 +44,13 @@
 
 #include "row_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE
+
 namespace filter
 {
     template void linearRow<ushort3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/row_filter.12.cu b/modules/gpu/src/cuda/row_filter.12.cu
index 7be543f6b2..017aff8e7f 100644
--- a/modules/gpu/src/cuda/row_filter.12.cu
+++ b/modules/gpu/src/cuda/row_filter.12.cu
@@ -44,9 +44,13 @@
 
 #include "row_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* the explicit instantiation below is left out when the tiny gpu module is built */
+
 namespace filter
 {
     template void linearRow<ushort4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/row_filter.13.cu b/modules/gpu/src/cuda/row_filter.13.cu
index bd700b1bb2..676f5ae826 100644
--- a/modules/gpu/src/cuda/row_filter.13.cu
+++ b/modules/gpu/src/cuda/row_filter.13.cu
@@ -44,9 +44,13 @@
 
 #include "row_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* the explicit instantiation below is left out when the tiny gpu module is built */
+
 namespace filter
 {
     template void linearRow<int3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/row_filter.14.cu b/modules/gpu/src/cuda/row_filter.14.cu
index 97df2f128a..e8d0ec501a 100644
--- a/modules/gpu/src/cuda/row_filter.14.cu
+++ b/modules/gpu/src/cuda/row_filter.14.cu
@@ -44,9 +44,13 @@
 
 #include "row_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* the explicit instantiation below is left out when the tiny gpu module is built */
+
 namespace filter
 {
     template void linearRow<int4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/row_filter.3.cu b/modules/gpu/src/cuda/row_filter.3.cu
index fe84666954..57013781c5 100644
--- a/modules/gpu/src/cuda/row_filter.3.cu
+++ b/modules/gpu/src/cuda/row_filter.3.cu
@@ -44,9 +44,13 @@
 
 #include "row_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* the explicit instantiation below is left out when the tiny gpu module is built */
+
 namespace filter
 {
     template void linearRow<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/row_filter.4.cu b/modules/gpu/src/cuda/row_filter.4.cu
index 050f7af04e..277ab7f87d 100644
--- a/modules/gpu/src/cuda/row_filter.4.cu
+++ b/modules/gpu/src/cuda/row_filter.4.cu
@@ -44,9 +44,13 @@
 
 #include "row_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* the explicit instantiation below is left out when the tiny gpu module is built */
+
 namespace filter
 {
     template void linearRow<int, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/row_filter.8.cu b/modules/gpu/src/cuda/row_filter.8.cu
index b899e87a7a..e9dfd7f4a3 100644
--- a/modules/gpu/src/cuda/row_filter.8.cu
+++ b/modules/gpu/src/cuda/row_filter.8.cu
@@ -44,9 +44,13 @@
 
 #include "row_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* the explicit instantiation below is left out when the tiny gpu module is built */
+
 namespace filter
 {
     template void linearRow<short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/row_filter.9.cu b/modules/gpu/src/cuda/row_filter.9.cu
index 516dd8fe7c..eaad54d343 100644
--- a/modules/gpu/src/cuda/row_filter.9.cu
+++ b/modules/gpu/src/cuda/row_filter.9.cu
@@ -44,9 +44,13 @@
 
 #include "row_filter.h"
 
+#ifndef OPENCV_TINY_GPU_MODULE /* the explicit instantiation below is left out when the tiny gpu module is built */
+
 namespace filter
 {
     template void linearRow<short4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
 }
 
+#endif /* OPENCV_TINY_GPU_MODULE */
+
 #endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/row_filter.h b/modules/gpu/src/cuda/row_filter.h
index 933f900293..9bfaf7f3d8 100644
--- a/modules/gpu/src/cuda/row_filter.h
+++ b/modules/gpu/src/cuda/row_filter.h
@@ -182,6 +182,186 @@ namespace filter
     {
         typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
 
+#ifdef OPENCV_TINY_GPU_MODULE /* tiny build: callers[brd_type][ksize] is non-null only for ksize 3, 5, 7 in the Reflect101, Replicate, Constant and Reflect rows; the fifth row is all null */
+        static const caller_t callers[5][33] =
+        {
+            {
+                0,
+                0,
+                0,
+                row_filter::caller< 3, T, D, BrdRowReflect101>,
+                0,
+                row_filter::caller< 5, T, D, BrdRowReflect101>,
+                0,
+                row_filter::caller< 7, T, D, BrdRowReflect101>,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            },
+            {
+                0,
+                0,
+                0,
+                row_filter::caller< 3, T, D, BrdRowReplicate>,
+                0,
+                row_filter::caller< 5, T, D, BrdRowReplicate>,
+                0,
+                row_filter::caller< 7, T, D, BrdRowReplicate>,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            },
+            {
+                0,
+                0,
+                0,
+                row_filter::caller< 3, T, D, BrdRowConstant>,
+                0,
+                row_filter::caller< 5, T, D, BrdRowConstant>,
+                0,
+                row_filter::caller< 7, T, D, BrdRowConstant>,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            },
+            {
+                0,
+                0,
+                0,
+                row_filter::caller< 3, T, D, BrdRowReflect>,
+                0,
+                row_filter::caller< 5, T, D, BrdRowReflect>,
+                0,
+                row_filter::caller< 7, T, D, BrdRowReflect>,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            },
+            {
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            }
+        };
+#else
         static const caller_t callers[5][33] =
         {
             {
@@ -360,12 +540,17 @@ namespace filter
                 row_filter::caller<32, T, D, BrdRowWrap>
             }
         };
+#endif
+
+        const caller_t caller = callers[brd_type][ksize]; // null => no caller was compiled for this border type / kernel size
+        if (!caller)
+            cv::gpu::error("Unsupported input parameters for row_filter", __FILE__, __LINE__, "linearRow");
 
         if (stream == 0)
             cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
         else
             cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
 
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+        caller((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
     }
 }
diff --git a/modules/gpu/src/cuda/stereobm.cu b/modules/gpu/src/cuda/stereobm.cu
index ad256357b8..6082e6c3cc 100644
--- a/modules/gpu/src/cuda/stereobm.cu
+++ b/modules/gpu/src/cuda/stereobm.cu
@@ -330,24 +330,55 @@ namespace cv { namespace gpu { namespace device
 
         typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream);
 
+#ifdef OPENCV_TINY_GPU_MODULE /* tiny build: slots are looked up by winsz >> 1; only 1-5, 9 and 15 are compiled in, null slots are rejected by stereoBM_GPU */
+        const static kernel_caller_t callers[] =
+        {
+            0,
+            kernel_caller< 1>,
+            kernel_caller< 2>,
+            kernel_caller< 3>,
+            kernel_caller< 4>,
+            kernel_caller< 5>,
+            0/*kernel_caller< 6>*/,
+            0/*kernel_caller< 7>*/,
+            0/*kernel_caller< 8>*/,
+            kernel_caller< 9>,
+            0/*kernel_caller<10>*/,
+            0/*kernel_caller<11>*/,
+            0/*kernel_caller<12>*/,
+            0/*kernel_caller<13>*/,
+            0/*kernel_caller<14>*/,
+            kernel_caller<15>,
+            0/*kernel_caller<16>*/,
+            0/*kernel_caller<17>*/,
+            0/*kernel_caller<18>*/,
+            0/*kernel_caller<19>*/,
+            0/*kernel_caller<20>*/,
+            0/*kernel_caller<21>*/,
+            0/*kernel_caller<22>*/,
+            0/*kernel_caller<23>*/,
+            0/*kernel_caller<24>*/,
+            0/*kernel_caller<25>*/,
+        };
+#else
         const static kernel_caller_t callers[] =
         {
             0,
             kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
             kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
-            kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
+            kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<14>, kernel_caller<15>,
             kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
             kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
-
-            //0,0,0, 0,0,0, 0,0,kernel_caller<9>
         };
+#endif
+
         const int calles_num = sizeof(callers)/sizeof(callers[0]);
 
         void stereoBM_GPU(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
         {
             int winsz2 = winsz >> 1;
 
-            if (winsz2 == 0 || winsz2 >= calles_num)
+            if (winsz2 == 0 || winsz2 >= calles_num || callers[winsz2] == 0)
                 cv::gpu::error("Unsupported window size", __FILE__, __LINE__, "stereoBM_GPU");
 
             //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
diff --git a/modules/gpu/src/cuda/warp.cu b/modules/gpu/src/cuda/warp.cu
index 49130d9405..ad867601ed 100644
--- a/modules/gpu/src/cuda/warp.cu
+++ b/modules/gpu/src/cuda/warp.cu
@@ -278,6 +278,7 @@ namespace cv { namespace gpu { namespace device
         {
             typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
 
+#ifdef OPENCV_TINY_GPU_MODULE /* tiny build: funcs[interpolation][borderMode] has null entries for every BrdWrap slot and for the whole CubicFilter row */
             static const func_t funcs[3][5] =
             {
                 {
@@ -285,25 +286,55 @@ namespace cv { namespace gpu { namespace device
                     WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
                     WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
                     WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
-                    WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call
+                    0/*WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call*/,
                 },
                 {
                     WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
                     WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
                     WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
                     WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
-                    WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call
+                    0/*WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call*/,
+                },
+                {
+                    0/*WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call*/,
+                    0/*WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call*/,
+                    0/*WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call*/,
+                    0/*WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call*/,
+                    0/*WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call*/,
+                }
+            };
+#else
+            static const func_t funcs[3][5] =
+            {
+                {
+                    WarpDispatcher<Transform, PointFilter, BrdReflect101, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
+                    WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call,
+                },
+                {
+                    WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
+                    WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call,
                 },
                 {
                     WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call,
                     WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call,
                     WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call,
                     WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call,
-                    WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call
+                    WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call,
                 }
             };
+#endif
 
-            funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
+            const func_t func = funcs[interpolation][borderMode]; // null => this interpolation / border mode pair is not compiled into the build
+            if (!func)
+                cv::gpu::error("Unsupported input parameters for warp_caller", __FILE__, __LINE__, "warp_caller");
+
+            func(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
                 static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
         }
 
@@ -320,6 +351,7 @@ namespace cv { namespace gpu { namespace device
         template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
+#ifndef OPENCV_TINY_GPU_MODULE
         //template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@@ -339,6 +371,7 @@ namespace cv { namespace gpu { namespace device
         //template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+#endif
 
         template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@@ -358,6 +391,7 @@ namespace cv { namespace gpu { namespace device
         template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
+#ifndef OPENCV_TINY_GPU_MODULE
         //template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@@ -377,6 +411,7 @@ namespace cv { namespace gpu { namespace device
         //template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
+#endif
 
         template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
         //template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
diff --git a/modules/gpu/src/cvt_color_internal.h b/modules/gpu/src/cvt_color_internal.h
index 1b7c68f35f..f108da827f 100644
--- a/modules/gpu/src/cvt_color_internal.h
+++ b/modules/gpu/src/cvt_color_internal.h
@@ -48,10 +48,16 @@ namespace cv { namespace gpu { namespace device
 #define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
     void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 
-#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name)       \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)    \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u)   \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
+#ifdef OPENCV_TINY_GPU_MODULE /* tiny build: declare only the _8u and _32f variants */
+    #define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name)       \
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)    \
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
+#else /* full build: declare the _8u, _16u and _32f variants */
+    #define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name)       \
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)    \
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u)   \
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
+#endif /* OPENCV_TINY_GPU_MODULE */
 
 #define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name)    \
     OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)   \
diff --git a/modules/gpu/src/denoising.cpp b/modules/gpu/src/denoising.cpp
index 50fab0cc2e..3eb74a92e6 100644
--- a/modules/gpu/src/denoising.cpp
+++ b/modules/gpu/src/denoising.cpp
@@ -77,6 +77,17 @@ void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, f
 
     typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t s);
 
+#ifdef OPENCV_TINY_GPU_MODULE /* tiny build: only the uchar/uchar3/uchar4 and float/float3/float4 instantiations are non-null */
+    static const func_t funcs[6][4] =
+    {
+        {bilateral_filter_gpu<uchar>       , 0 /*bilateral_filter_gpu<uchar2>*/ , bilateral_filter_gpu<uchar3>       , bilateral_filter_gpu<uchar4>       },
+        {0 /*bilateral_filter_gpu<schar>*/ , 0 /*bilateral_filter_gpu<schar2>*/ , 0 /*bilateral_filter_gpu<schar3>*/ , 0 /*bilateral_filter_gpu<schar4>*/ },
+        {0 /*bilateral_filter_gpu<ushort>*/, 0 /*bilateral_filter_gpu<ushort2>*/, 0 /*bilateral_filter_gpu<ushort3>*/, 0 /*bilateral_filter_gpu<ushort4>*/},
+        {0 /*bilateral_filter_gpu<short>*/ , 0 /*bilateral_filter_gpu<short2>*/ , 0 /*bilateral_filter_gpu<short3>*/ , 0 /*bilateral_filter_gpu<short4>*/ },
+        {0 /*bilateral_filter_gpu<int>*/   , 0 /*bilateral_filter_gpu<int2>*/   , 0 /*bilateral_filter_gpu<int3>*/   , 0 /*bilateral_filter_gpu<int4>*/   },
+        {bilateral_filter_gpu<float>       , 0 /*bilateral_filter_gpu<float2>*/ , bilateral_filter_gpu<float3>       , bilateral_filter_gpu<float4>       }
+    };
+#else
     static const func_t funcs[6][4] =
     {
         {bilateral_filter_gpu<uchar>      , 0 /*bilateral_filter_gpu<uchar2>*/ , bilateral_filter_gpu<uchar3>      , bilateral_filter_gpu<uchar4>      },
@@ -86,6 +97,7 @@ void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, f
         {0 /*bilateral_filter_gpu<int>*/  , 0 /*bilateral_filter_gpu<int2>*/   , 0 /*bilateral_filter_gpu<int3>*/  , 0 /*bilateral_filter_gpu<int4>*/  },
         {bilateral_filter_gpu<float>      , 0 /*bilateral_filter_gpu<float2>*/ , bilateral_filter_gpu<float3>      , bilateral_filter_gpu<float4>      }
     };
+#endif
 
     sigma_color = (sigma_color <= 0 ) ? 1 : sigma_color;
     sigma_spatial = (sigma_spatial <= 0 ) ? 1 : sigma_spatial;
diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp
index 354d614d42..780745d733 100644
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -275,6 +275,75 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE /* tiny build: only addMat<unsigned char, unsigned char> and addMat<float, float> are non-null */
+    static const func_t funcs[7][7] =
+    {
+        {
+            addMat<unsigned char, unsigned char>,
+            0/*addMat<unsigned char, signed char>*/,
+            0/*addMat<unsigned char, unsigned short>*/,
+            0/*addMat<unsigned char, short>*/,
+            0/*addMat<unsigned char, int>*/,
+            0/*addMat<unsigned char, float>*/,
+            0/*addMat<unsigned char, double>*/,
+        },
+        {
+            0/*addMat<signed char, unsigned char>*/,
+            0/*addMat<signed char, signed char>*/,
+            0/*addMat<signed char, unsigned short>*/,
+            0/*addMat<signed char, short>*/,
+            0/*addMat<signed char, int>*/,
+            0/*addMat<signed char, float>*/,
+            0/*addMat<signed char, double>*/,
+        },
+        {
+            0 /*addMat<unsigned short, unsigned char>*/,
+            0 /*addMat<unsigned short, signed char>*/,
+            0/*addMat<unsigned short, unsigned short>*/,
+            0/*addMat<unsigned short, short>*/,
+            0/*addMat<unsigned short, int>*/,
+            0/*addMat<unsigned short, float>*/,
+            0/*addMat<unsigned short, double>*/,
+        },
+        {
+            0 /*addMat<short, unsigned char>*/,
+            0 /*addMat<short, signed char>*/,
+            0/*addMat<short, unsigned short>*/,
+            0/*addMat<short, short>*/,
+            0/*addMat<short, int>*/,
+            0/*addMat<short, float>*/,
+            0/*addMat<short, double>*/,
+        },
+        {
+            0 /*addMat<int, unsigned char>*/,
+            0 /*addMat<int, signed char>*/,
+            0 /*addMat<int, unsigned short>*/,
+            0 /*addMat<int, short>*/,
+            0/*addMat<int, int>*/,
+            0/*addMat<int, float>*/,
+            0/*addMat<int, double>*/,
+        },
+        {
+            0 /*addMat<float, unsigned char>*/,
+            0 /*addMat<float, signed char>*/,
+            0 /*addMat<float, unsigned short>*/,
+            0 /*addMat<float, short>*/,
+            0 /*addMat<float, int>*/,
+            addMat<float, float>,
+            0/*addMat<float, double>*/,
+        },
+        {
+            0 /*addMat<double, unsigned char>*/,
+            0 /*addMat<double, signed char>*/,
+            0 /*addMat<double, unsigned short>*/,
+            0 /*addMat<double, short>*/,
+            0 /*addMat<double, int>*/,
+            0 /*addMat<double, float>*/,
+            0/*addMat<double, double>*/,
+        }
+    };
+#else
     static const func_t funcs[7][7] =
     {
         {
@@ -284,7 +353,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
             addMat<unsigned char, short>,
             addMat<unsigned char, int>,
             addMat<unsigned char, float>,
-            addMat<unsigned char, double>
+            addMat<unsigned char, double>,
         },
         {
             addMat<signed char, unsigned char>,
@@ -293,7 +362,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
             addMat<signed char, short>,
             addMat<signed char, int>,
             addMat<signed char, float>,
-            addMat<signed char, double>
+            addMat<signed char, double>,
         },
         {
             0 /*addMat<unsigned short, unsigned char>*/,
@@ -302,7 +371,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
             addMat<unsigned short, short>,
             addMat<unsigned short, int>,
             addMat<unsigned short, float>,
-            addMat<unsigned short, double>
+            addMat<unsigned short, double>,
         },
         {
             0 /*addMat<short, unsigned char>*/,
@@ -311,7 +380,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
             addMat<short, short>,
             addMat<short, int>,
             addMat<short, float>,
-            addMat<short, double>
+            addMat<short, double>,
         },
         {
             0 /*addMat<int, unsigned char>*/,
@@ -320,7 +389,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
             0 /*addMat<int, short>*/,
             addMat<int, int>,
             addMat<int, float>,
-            addMat<int, double>
+            addMat<int, double>,
         },
         {
             0 /*addMat<float, unsigned char>*/,
@@ -329,7 +398,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
             0 /*addMat<float, short>*/,
             0 /*addMat<float, int>*/,
             addMat<float, float>,
-            addMat<float, double>
+            addMat<float, double>,
         },
         {
             0 /*addMat<double, unsigned char>*/,
@@ -338,9 +407,10 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
             0 /*addMat<double, short>*/,
             0 /*addMat<double, int>*/,
             0 /*addMat<double, float>*/,
-            addMat<double, double>
+            addMat<double, double>,
         }
     };
+#endif
 
     if (dtype < 0)
         dtype = src1.depth();
@@ -421,6 +491,75 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE /* tiny build: only addScalar<unsigned char, float, unsigned char> and addScalar<float, float, float> are non-null */
+    static const func_t funcs[7][7] =
+    {
+        {
+            addScalar<unsigned char, float, unsigned char>,
+            0/*addScalar<unsigned char, float, signed char>*/,
+            0/*addScalar<unsigned char, float, unsigned short>*/,
+            0/*addScalar<unsigned char, float, short>*/,
+            0/*addScalar<unsigned char, float, int>*/,
+            0/*addScalar<unsigned char, float, float>*/,
+            0/*addScalar<unsigned char, double, double>*/,
+        },
+        {
+            0/*addScalar<signed char, float, unsigned char>*/,
+            0/*addScalar<signed char, float, signed char>*/,
+            0/*addScalar<signed char, float, unsigned short>*/,
+            0/*addScalar<signed char, float, short>*/,
+            0/*addScalar<signed char, float, int>*/,
+            0/*addScalar<signed char, float, float>*/,
+            0/*addScalar<signed char, double, double>*/,
+        },
+        {
+            0 /*addScalar<unsigned short, float, unsigned char>*/,
+            0 /*addScalar<unsigned short, float, signed char>*/,
+            0/*addScalar<unsigned short, float, unsigned short>*/,
+            0/*addScalar<unsigned short, float, short>*/,
+            0/*addScalar<unsigned short, float, int>*/,
+            0/*addScalar<unsigned short, float, float>*/,
+            0/*addScalar<unsigned short, double, double>*/,
+        },
+        {
+            0 /*addScalar<short, float, unsigned char>*/,
+            0 /*addScalar<short, float, signed char>*/,
+            0/*addScalar<short, float, unsigned short>*/,
+            0/*addScalar<short, float, short>*/,
+            0/*addScalar<short, float, int>*/,
+            0/*addScalar<short, float, float>*/,
+            0/*addScalar<short, double, double>*/,
+        },
+        {
+            0 /*addScalar<int, float, unsigned char>*/,
+            0 /*addScalar<int, float, signed char>*/,
+            0 /*addScalar<int, float, unsigned short>*/,
+            0 /*addScalar<int, float, short>*/,
+            0/*addScalar<int, float, int>*/,
+            0/*addScalar<int, float, float>*/,
+            0/*addScalar<int, double, double>*/,
+        },
+        {
+            0 /*addScalar<float, float, unsigned char>*/,
+            0 /*addScalar<float, float, signed char>*/,
+            0 /*addScalar<float, float, unsigned short>*/,
+            0 /*addScalar<float, float, short>*/,
+            0 /*addScalar<float, float, int>*/,
+            addScalar<float, float, float>,
+            0/*addScalar<float, double, double>*/,
+        },
+        {
+            0 /*addScalar<double, double, unsigned char>*/,
+            0 /*addScalar<double, double, signed char>*/,
+            0 /*addScalar<double, double, unsigned short>*/,
+            0 /*addScalar<double, double, short>*/,
+            0 /*addScalar<double, double, int>*/,
+            0 /*addScalar<double, double, float>*/,
+            0/*addScalar<double, double, double>*/,
+        }
+    };
+#else
     static const func_t funcs[7][7] =
     {
         {
@@ -430,7 +569,7 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
             addScalar<unsigned char, float, short>,
             addScalar<unsigned char, float, int>,
             addScalar<unsigned char, float, float>,
-            addScalar<unsigned char, double, double>
+            addScalar<unsigned char, double, double>,
         },
         {
             addScalar<signed char, float, unsigned char>,
@@ -439,7 +578,7 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
             addScalar<signed char, float, short>,
             addScalar<signed char, float, int>,
             addScalar<signed char, float, float>,
-            addScalar<signed char, double, double>
+            addScalar<signed char, double, double>,
         },
         {
             0 /*addScalar<unsigned short, float, unsigned char>*/,
@@ -448,7 +587,7 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
             addScalar<unsigned short, float, short>,
             addScalar<unsigned short, float, int>,
             addScalar<unsigned short, float, float>,
-            addScalar<unsigned short, double, double>
+            addScalar<unsigned short, double, double>,
         },
         {
             0 /*addScalar<short, float, unsigned char>*/,
@@ -457,7 +596,7 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
             addScalar<short, float, short>,
             addScalar<short, float, int>,
             addScalar<short, float, float>,
-            addScalar<short, double, double>
+            addScalar<short, double, double>,
         },
         {
             0 /*addScalar<int, float, unsigned char>*/,
@@ -466,7 +605,7 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
             0 /*addScalar<int, float, short>*/,
             addScalar<int, float, int>,
             addScalar<int, float, float>,
-            addScalar<int, double, double>
+            addScalar<int, double, double>,
         },
         {
             0 /*addScalar<float, float, unsigned char>*/,
@@ -475,7 +614,7 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
             0 /*addScalar<float, float, short>*/,
             0 /*addScalar<float, float, int>*/,
             addScalar<float, float, float>,
-            addScalar<float, double, double>
+            addScalar<float, double, double>,
         },
         {
             0 /*addScalar<double, double, unsigned char>*/,
@@ -484,9 +623,10 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
             0 /*addScalar<double, double, short>*/,
             0 /*addScalar<double, double, int>*/,
             0 /*addScalar<double, double, float>*/,
-            addScalar<double, double, double>
+            addScalar<double, double, double>,
         }
     };
+#endif
 
     typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
     static const npp_func_t npp_funcs[7][4] =
@@ -555,6 +695,75 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE
+    static const func_t funcs[7][7] = // OPENCV_TINY_GPU_MODULE variant of the subMat table: only [0][0] and [5][5] are populated
+    { // every other slot is a null func_t; NOTE(review): the lookup code presumably rejects null entries as unsupported - confirm
+        { // row 0: first template argument is unsigned char
+            subMat<unsigned char, unsigned char>, // only populated entry in this row
+            0/*subMat<unsigned char, signed char>*/,
+            0/*subMat<unsigned char, unsigned short>*/,
+            0/*subMat<unsigned char, short>*/,
+            0/*subMat<unsigned char, int>*/,
+            0/*subMat<unsigned char, float>*/,
+            0/*subMat<unsigned char, double>*/,
+        },
+        { // row 1: signed char - fully disabled
+            0/*subMat<signed char, unsigned char>*/,
+            0/*subMat<signed char, signed char>*/,
+            0/*subMat<signed char, unsigned short>*/,
+            0/*subMat<signed char, short>*/,
+            0/*subMat<signed char, int>*/,
+            0/*subMat<signed char, float>*/,
+            0/*subMat<signed char, double>*/,
+        },
+        { // row 2: unsigned short - fully disabled
+            0 /*subMat<unsigned short, unsigned char>*/,
+            0 /*subMat<unsigned short, signed char>*/,
+            0/*subMat<unsigned short, unsigned short>*/,
+            0/*subMat<unsigned short, short>*/,
+            0/*subMat<unsigned short, int>*/,
+            0/*subMat<unsigned short, float>*/,
+            0/*subMat<unsigned short, double>*/,
+        },
+        { // row 3: short - fully disabled
+            0 /*subMat<short, unsigned char>*/,
+            0 /*subMat<short, signed char>*/,
+            0/*subMat<short, unsigned short>*/,
+            0/*subMat<short, short>*/,
+            0/*subMat<short, int>*/,
+            0/*subMat<short, float>*/,
+            0/*subMat<short, double>*/,
+        },
+        { // row 4: int - fully disabled
+            0 /*subMat<int, unsigned char>*/,
+            0 /*subMat<int, signed char>*/,
+            0 /*subMat<int, unsigned short>*/,
+            0 /*subMat<int, short>*/,
+            0/*subMat<int, int>*/,
+            0/*subMat<int, float>*/,
+            0/*subMat<int, double>*/,
+        },
+        { // row 5: first template argument is float
+            0 /*subMat<float, unsigned char>*/,
+            0 /*subMat<float, signed char>*/,
+            0 /*subMat<float, unsigned short>*/,
+            0 /*subMat<float, short>*/,
+            0 /*subMat<float, int>*/,
+            subMat<float, float>, // only populated entry in this row
+            0/*subMat<float, double>*/,
+        },
+        { // row 6: double - fully disabled
+            0 /*subMat<double, unsigned char>*/,
+            0 /*subMat<double, signed char>*/,
+            0 /*subMat<double, unsigned short>*/,
+            0 /*subMat<double, short>*/,
+            0 /*subMat<double, int>*/,
+            0 /*subMat<double, float>*/,
+            0/*subMat<double, double>*/,
+        }
+    };
+#else
     static const func_t funcs[7][7] =
     {
         {
@@ -564,7 +773,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
             subMat<unsigned char, short>,
             subMat<unsigned char, int>,
             subMat<unsigned char, float>,
-            subMat<unsigned char, double>
+            subMat<unsigned char, double>,
         },
         {
             subMat<signed char, unsigned char>,
@@ -573,7 +782,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
             subMat<signed char, short>,
             subMat<signed char, int>,
             subMat<signed char, float>,
-            subMat<signed char, double>
+            subMat<signed char, double>,
         },
         {
             0 /*subMat<unsigned short, unsigned char>*/,
@@ -582,7 +791,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
             subMat<unsigned short, short>,
             subMat<unsigned short, int>,
             subMat<unsigned short, float>,
-            subMat<unsigned short, double>
+            subMat<unsigned short, double>,
         },
         {
             0 /*subMat<short, unsigned char>*/,
@@ -591,7 +800,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
             subMat<short, short>,
             subMat<short, int>,
             subMat<short, float>,
-            subMat<short, double>
+            subMat<short, double>,
         },
         {
             0 /*subMat<int, unsigned char>*/,
@@ -600,7 +809,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
             0 /*subMat<int, short>*/,
             subMat<int, int>,
             subMat<int, float>,
-            subMat<int, double>
+            subMat<int, double>,
         },
         {
             0 /*subMat<float, unsigned char>*/,
@@ -609,7 +818,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
             0 /*subMat<float, short>*/,
             0 /*subMat<float, int>*/,
             subMat<float, float>,
-            subMat<float, double>
+            subMat<float, double>,
         },
         {
             0 /*subMat<double, unsigned char>*/,
@@ -618,9 +827,10 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
             0 /*subMat<double, short>*/,
             0 /*subMat<double, int>*/,
             0 /*subMat<double, float>*/,
-            subMat<double, double>
+            subMat<double, double>,
         }
     };
+#endif
 
     if (dtype < 0)
         dtype = src1.depth();
@@ -701,6 +911,75 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE
+    static const func_t funcs[7][7] = // OPENCV_TINY_GPU_MODULE variant of the subScalar table: only [0][0] and [5][5] are populated
+    { // every other slot is a null func_t; NOTE(review): the lookup code presumably rejects null entries as unsupported - confirm
+        { // row 0: first template argument is unsigned char
+            subScalar<unsigned char, float, unsigned char>, // only populated entry in this row
+            0/*subScalar<unsigned char, float, signed char>*/,
+            0/*subScalar<unsigned char, float, unsigned short>*/,
+            0/*subScalar<unsigned char, float, short>*/,
+            0/*subScalar<unsigned char, float, int>*/,
+            0/*subScalar<unsigned char, float, float>*/,
+            0/*subScalar<unsigned char, double, double>*/,
+        },
+        { // row 1: signed char - fully disabled
+            0/*subScalar<signed char, float, unsigned char>*/,
+            0/*subScalar<signed char, float, signed char>*/,
+            0/*subScalar<signed char, float, unsigned short>*/,
+            0/*subScalar<signed char, float, short>*/,
+            0/*subScalar<signed char, float, int>*/,
+            0/*subScalar<signed char, float, float>*/,
+            0/*subScalar<signed char, double, double>*/,
+        },
+        { // row 2: unsigned short - fully disabled
+            0 /*subScalar<unsigned short, float, unsigned char>*/,
+            0 /*subScalar<unsigned short, float, signed char>*/,
+            0/*subScalar<unsigned short, float, unsigned short>*/,
+            0/*subScalar<unsigned short, float, short>*/,
+            0/*subScalar<unsigned short, float, int>*/,
+            0/*subScalar<unsigned short, float, float>*/,
+            0/*subScalar<unsigned short, double, double>*/,
+        },
+        { // row 3: short - fully disabled
+            0 /*subScalar<short, float, unsigned char>*/,
+            0 /*subScalar<short, float, signed char>*/,
+            0/*subScalar<short, float, unsigned short>*/,
+            0/*subScalar<short, float, short>*/,
+            0/*subScalar<short, float, int>*/,
+            0/*subScalar<short, float, float>*/,
+            0/*subScalar<short, double, double>*/,
+        },
+        { // row 4: int - fully disabled
+            0 /*subScalar<int, float, unsigned char>*/,
+            0 /*subScalar<int, float, signed char>*/,
+            0 /*subScalar<int, float, unsigned short>*/,
+            0 /*subScalar<int, float, short>*/,
+            0/*subScalar<int, float, int>*/,
+            0/*subScalar<int, float, float>*/,
+            0/*subScalar<int, double, double>*/,
+        },
+        { // row 5: first template argument is float
+            0 /*subScalar<float, float, unsigned char>*/,
+            0 /*subScalar<float, float, signed char>*/,
+            0 /*subScalar<float, float, unsigned short>*/,
+            0 /*subScalar<float, float, short>*/,
+            0 /*subScalar<float, float, int>*/,
+            subScalar<float, float, float>, // only populated entry in this row
+            0/*subScalar<float, double, double>*/,
+        },
+        { // row 6: double - fully disabled
+            0 /*subScalar<double, double, unsigned char>*/,
+            0 /*subScalar<double, double, signed char>*/,
+            0 /*subScalar<double, double, unsigned short>*/,
+            0 /*subScalar<double, double, short>*/,
+            0 /*subScalar<double, double, int>*/,
+            0 /*subScalar<double, double, float>*/,
+            0/*subScalar<double, double, double>*/,
+        }
+    };
+#else
     static const func_t funcs[7][7] =
     {
         {
@@ -710,7 +989,7 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
             subScalar<unsigned char, float, short>,
             subScalar<unsigned char, float, int>,
             subScalar<unsigned char, float, float>,
-            subScalar<unsigned char, double, double>
+            subScalar<unsigned char, double, double>,
         },
         {
             subScalar<signed char, float, unsigned char>,
@@ -719,7 +998,7 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
             subScalar<signed char, float, short>,
             subScalar<signed char, float, int>,
             subScalar<signed char, float, float>,
-            subScalar<signed char, double, double>
+            subScalar<signed char, double, double>,
         },
         {
             0 /*subScalar<unsigned short, float, unsigned char>*/,
@@ -728,7 +1007,7 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
             subScalar<unsigned short, float, short>,
             subScalar<unsigned short, float, int>,
             subScalar<unsigned short, float, float>,
-            subScalar<unsigned short, double, double>
+            subScalar<unsigned short, double, double>,
         },
         {
             0 /*subScalar<short, float, unsigned char>*/,
@@ -737,7 +1016,7 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
             subScalar<short, float, short>,
             subScalar<short, float, int>,
             subScalar<short, float, float>,
-            subScalar<short, double, double>
+            subScalar<short, double, double>,
         },
         {
             0 /*subScalar<int, float, unsigned char>*/,
@@ -746,7 +1025,7 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
             0 /*subScalar<int, float, short>*/,
             subScalar<int, float, int>,
             subScalar<int, float, float>,
-            subScalar<int, double, double>
+            subScalar<int, double, double>,
         },
         {
             0 /*subScalar<float, float, unsigned char>*/,
@@ -755,7 +1034,7 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
             0 /*subScalar<float, float, short>*/,
             0 /*subScalar<float, float, int>*/,
             subScalar<float, float, float>,
-            subScalar<float, double, double>
+            subScalar<float, double, double>,
         },
         {
             0 /*subScalar<double, double, unsigned char>*/,
@@ -764,9 +1043,10 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
             0 /*subScalar<double, double, short>*/,
             0 /*subScalar<double, double, int>*/,
             0 /*subScalar<double, double, float>*/,
-            subScalar<double, double, double>
+            subScalar<double, double, double>,
         }
     };
+#endif
 
     typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
     static const npp_func_t npp_funcs[7][4] =
@@ -856,6 +1136,75 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
     else
     {
         typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE
+        static const func_t funcs[7][7] = // OPENCV_TINY_GPU_MODULE variant of the mulMat table: only [0][0] and [5][5] are populated
+        { // every other slot is a null func_t; NOTE(review): the lookup code presumably rejects null entries as unsupported - confirm
+            { // row 0: first template argument is unsigned char
+                mulMat<unsigned char, float, unsigned char>, // only populated entry in this row
+                0/*mulMat<unsigned char, float, signed char>*/,
+                0/*mulMat<unsigned char, float, unsigned short>*/,
+                0/*mulMat<unsigned char, float, short>*/,
+                0/*mulMat<unsigned char, float, int>*/,
+                0/*mulMat<unsigned char, float, float>*/,
+                0/*mulMat<unsigned char, double, double>*/,
+            },
+            { // row 1: signed char - fully disabled
+                0/*mulMat<signed char, float, unsigned char>*/,
+                0/*mulMat<signed char, float, signed char>*/,
+                0/*mulMat<signed char, float, unsigned short>*/,
+                0/*mulMat<signed char, float, short>*/,
+                0/*mulMat<signed char, float, int>*/,
+                0/*mulMat<signed char, float, float>*/,
+                0/*mulMat<signed char, double, double>*/,
+            },
+            { // row 2: unsigned short - fully disabled
+                0 /*mulMat<unsigned short, float, unsigned char>*/,
+                0 /*mulMat<unsigned short, float, signed char>*/,
+                0/*mulMat<unsigned short, float, unsigned short>*/,
+                0/*mulMat<unsigned short, float, short>*/,
+                0/*mulMat<unsigned short, float, int>*/,
+                0/*mulMat<unsigned short, float, float>*/,
+                0/*mulMat<unsigned short, double, double>*/,
+            },
+            { // row 3: short - fully disabled
+                0 /*mulMat<short, float, unsigned char>*/,
+                0 /*mulMat<short, float, signed char>*/,
+                0/*mulMat<short, float, unsigned short>*/,
+                0/*mulMat<short, float, short>*/,
+                0/*mulMat<short, float, int>*/,
+                0/*mulMat<short, float, float>*/,
+                0/*mulMat<short, double, double>*/,
+            },
+            { // row 4: int - fully disabled
+                0 /*mulMat<int, float, unsigned char>*/,
+                0 /*mulMat<int, float, signed char>*/,
+                0 /*mulMat<int, float, unsigned short>*/,
+                0 /*mulMat<int, float, short>*/,
+                0/*mulMat<int, float, int>*/,
+                0/*mulMat<int, float, float>*/,
+                0/*mulMat<int, double, double>*/,
+            },
+            { // row 5: first template argument is float
+                0 /*mulMat<float, float, unsigned char>*/,
+                0 /*mulMat<float, float, signed char>*/,
+                0 /*mulMat<float, float, unsigned short>*/,
+                0 /*mulMat<float, float, short>*/,
+                0 /*mulMat<float, float, int>*/,
+                mulMat<float, float, float>, // only populated entry in this row
+                0/*mulMat<float, double, double>*/,
+            },
+            { // row 6: double - fully disabled
+                0 /*mulMat<double, double, unsigned char>*/,
+                0 /*mulMat<double, double, signed char>*/,
+                0 /*mulMat<double, double, unsigned short>*/,
+                0 /*mulMat<double, double, short>*/,
+                0 /*mulMat<double, double, int>*/,
+                0 /*mulMat<double, double, float>*/,
+                0/*mulMat<double, double, double>*/,
+            }
+        };
+#else
         static const func_t funcs[7][7] =
         {
             {
@@ -865,7 +1214,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
                 mulMat<unsigned char, float, short>,
                 mulMat<unsigned char, float, int>,
                 mulMat<unsigned char, float, float>,
-                mulMat<unsigned char, double, double>
+                mulMat<unsigned char, double, double>,
             },
             {
                 mulMat<signed char, float, unsigned char>,
@@ -874,7 +1223,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
                 mulMat<signed char, float, short>,
                 mulMat<signed char, float, int>,
                 mulMat<signed char, float, float>,
-                mulMat<signed char, double, double>
+                mulMat<signed char, double, double>,
             },
             {
                 0 /*mulMat<unsigned short, float, unsigned char>*/,
@@ -883,7 +1232,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
                 mulMat<unsigned short, float, short>,
                 mulMat<unsigned short, float, int>,
                 mulMat<unsigned short, float, float>,
-                mulMat<unsigned short, double, double>
+                mulMat<unsigned short, double, double>,
             },
             {
                 0 /*mulMat<short, float, unsigned char>*/,
@@ -892,7 +1241,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
                 mulMat<short, float, short>,
                 mulMat<short, float, int>,
                 mulMat<short, float, float>,
-                mulMat<short, double, double>
+                mulMat<short, double, double>,
             },
             {
                 0 /*mulMat<int, float, unsigned char>*/,
@@ -901,7 +1250,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
                 0 /*mulMat<int, float, short>*/,
                 mulMat<int, float, int>,
                 mulMat<int, float, float>,
-                mulMat<int, double, double>
+                mulMat<int, double, double>,
             },
             {
                 0 /*mulMat<float, float, unsigned char>*/,
@@ -910,7 +1259,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
                 0 /*mulMat<float, float, short>*/,
                 0 /*mulMat<float, float, int>*/,
                 mulMat<float, float, float>,
-                mulMat<float, double, double>
+                mulMat<float, double, double>,
             },
             {
                 0 /*mulMat<double, double, unsigned char>*/,
@@ -919,9 +1268,10 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
                 0 /*mulMat<double, double, short>*/,
                 0 /*mulMat<double, double, int>*/,
                 0 /*mulMat<double, double, float>*/,
-                mulMat<double, double, double>
+                mulMat<double, double, double>,
             }
         };
+#endif
 
         if (dtype < 0)
             dtype = src1.depth();
@@ -965,6 +1315,75 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE
+    static const func_t funcs[7][7] = // OPENCV_TINY_GPU_MODULE variant of the mulScalar table: only [0][0] and [5][5] are populated
+    { // every other slot is a null func_t; NOTE(review): the lookup code presumably rejects null entries as unsupported - confirm
+        { // row 0: first template argument is unsigned char
+            mulScalar<unsigned char, float, unsigned char>, // only populated entry in this row
+            0/*mulScalar<unsigned char, float, signed char>*/,
+            0/*mulScalar<unsigned char, float, unsigned short>*/,
+            0/*mulScalar<unsigned char, float, short>*/,
+            0/*mulScalar<unsigned char, float, int>*/,
+            0/*mulScalar<unsigned char, float, float>*/,
+            0/*mulScalar<unsigned char, double, double>*/,
+        },
+        { // row 1: signed char - fully disabled
+            0/*mulScalar<signed char, float, unsigned char>*/,
+            0/*mulScalar<signed char, float, signed char>*/,
+            0/*mulScalar<signed char, float, unsigned short>*/,
+            0/*mulScalar<signed char, float, short>*/,
+            0/*mulScalar<signed char, float, int>*/,
+            0/*mulScalar<signed char, float, float>*/,
+            0/*mulScalar<signed char, double, double>*/,
+        },
+        { // row 2: unsigned short - fully disabled
+            0 /*mulScalar<unsigned short, float, unsigned char>*/,
+            0 /*mulScalar<unsigned short, float, signed char>*/,
+            0/*mulScalar<unsigned short, float, unsigned short>*/,
+            0/*mulScalar<unsigned short, float, short>*/,
+            0/*mulScalar<unsigned short, float, int>*/,
+            0/*mulScalar<unsigned short, float, float>*/,
+            0/*mulScalar<unsigned short, double, double>*/,
+        },
+        { // row 3: short - fully disabled
+            0 /*mulScalar<short, float, unsigned char>*/,
+            0 /*mulScalar<short, float, signed char>*/,
+            0/*mulScalar<short, float, unsigned short>*/,
+            0/*mulScalar<short, float, short>*/,
+            0/*mulScalar<short, float, int>*/,
+            0/*mulScalar<short, float, float>*/,
+            0/*mulScalar<short, double, double>*/,
+        },
+        { // row 4: int - fully disabled
+            0 /*mulScalar<int, float, unsigned char>*/,
+            0 /*mulScalar<int, float, signed char>*/,
+            0 /*mulScalar<int, float, unsigned short>*/,
+            0 /*mulScalar<int, float, short>*/,
+            0/*mulScalar<int, float, int>*/,
+            0/*mulScalar<int, float, float>*/,
+            0/*mulScalar<int, double, double>*/,
+        },
+        { // row 5: first template argument is float
+            0 /*mulScalar<float, float, unsigned char>*/,
+            0 /*mulScalar<float, float, signed char>*/,
+            0 /*mulScalar<float, float, unsigned short>*/,
+            0 /*mulScalar<float, float, short>*/,
+            0 /*mulScalar<float, float, int>*/,
+            mulScalar<float, float, float>, // only populated entry in this row
+            0/*mulScalar<float, double, double>*/,
+        },
+        { // row 6: double - fully disabled
+            0 /*mulScalar<double, double, unsigned char>*/,
+            0 /*mulScalar<double, double, signed char>*/,
+            0 /*mulScalar<double, double, unsigned short>*/,
+            0 /*mulScalar<double, double, short>*/,
+            0 /*mulScalar<double, double, int>*/,
+            0 /*mulScalar<double, double, float>*/,
+            0/*mulScalar<double, double, double>*/,
+        }
+    };
+#else
     static const func_t funcs[7][7] =
     {
         {
@@ -974,7 +1393,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
             mulScalar<unsigned char, float, short>,
             mulScalar<unsigned char, float, int>,
             mulScalar<unsigned char, float, float>,
-            mulScalar<unsigned char, double, double>
+            mulScalar<unsigned char, double, double>,
         },
         {
             mulScalar<signed char, float, unsigned char>,
@@ -983,7 +1402,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
             mulScalar<signed char, float, short>,
             mulScalar<signed char, float, int>,
             mulScalar<signed char, float, float>,
-            mulScalar<signed char, double, double>
+            mulScalar<signed char, double, double>,
         },
         {
             0 /*mulScalar<unsigned short, float, unsigned char>*/,
@@ -992,7 +1411,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
             mulScalar<unsigned short, float, short>,
             mulScalar<unsigned short, float, int>,
             mulScalar<unsigned short, float, float>,
-            mulScalar<unsigned short, double, double>
+            mulScalar<unsigned short, double, double>,
         },
         {
             0 /*mulScalar<short, float, unsigned char>*/,
@@ -1001,7 +1420,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
             mulScalar<short, float, short>,
             mulScalar<short, float, int>,
             mulScalar<short, float, float>,
-            mulScalar<short, double, double>
+            mulScalar<short, double, double>,
         },
         {
             0 /*mulScalar<int, float, unsigned char>*/,
@@ -1010,7 +1429,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
             0 /*mulScalar<int, float, short>*/,
             mulScalar<int, float, int>,
             mulScalar<int, float, float>,
-            mulScalar<int, double, double>
+            mulScalar<int, double, double>,
         },
         {
             0 /*mulScalar<float, float, unsigned char>*/,
@@ -1019,7 +1438,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
             0 /*mulScalar<float, float, short>*/,
             0 /*mulScalar<float, float, int>*/,
             mulScalar<float, float, float>,
-            mulScalar<float, double, double>
+            mulScalar<float, double, double>,
         },
         {
             0 /*mulScalar<double, double, unsigned char>*/,
@@ -1028,9 +1447,10 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
             0 /*mulScalar<double, double, short>*/,
             0 /*mulScalar<double, double, int>*/,
             0 /*mulScalar<double, double, float>*/,
-            mulScalar<double, double, double>
+            mulScalar<double, double, double>,
         }
     };
+#endif
 
     typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
     static const npp_func_t npp_funcs[7][4] =
@@ -1121,6 +1541,75 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
     else
     {
         typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE
+        static const func_t funcs[7][7] = // OPENCV_TINY_GPU_MODULE variant of the divMat table: only [0][0] and [5][5] are populated
+        { // every other slot is a null func_t; NOTE(review): the lookup code presumably rejects null entries as unsupported - confirm
+            { // row 0: first template argument is unsigned char
+                divMat<unsigned char, float, unsigned char>, // only populated entry in this row
+                0/*divMat<unsigned char, float, signed char>*/,
+                0/*divMat<unsigned char, float, unsigned short>*/,
+                0/*divMat<unsigned char, float, short>*/,
+                0/*divMat<unsigned char, float, int>*/,
+                0/*divMat<unsigned char, float, float>*/,
+                0/*divMat<unsigned char, double, double>*/,
+            },
+            { // row 1: signed char - fully disabled
+                0/*divMat<signed char, float, unsigned char>*/,
+                0/*divMat<signed char, float, signed char>*/,
+                0/*divMat<signed char, float, unsigned short>*/,
+                0/*divMat<signed char, float, short>*/,
+                0/*divMat<signed char, float, int>*/,
+                0/*divMat<signed char, float, float>*/,
+                0/*divMat<signed char, double, double>*/,
+            },
+            { // row 2: unsigned short - fully disabled
+                0 /*divMat<unsigned short, float, unsigned char>*/,
+                0 /*divMat<unsigned short, float, signed char>*/,
+                0/*divMat<unsigned short, float, unsigned short>*/,
+                0/*divMat<unsigned short, float, short>*/,
+                0/*divMat<unsigned short, float, int>*/,
+                0/*divMat<unsigned short, float, float>*/,
+                0/*divMat<unsigned short, double, double>*/,
+            },
+            { // row 3: short - fully disabled
+                0 /*divMat<short, float, unsigned char>*/,
+                0 /*divMat<short, float, signed char>*/,
+                0/*divMat<short, float, unsigned short>*/,
+                0/*divMat<short, float, short>*/,
+                0/*divMat<short, float, int>*/,
+                0/*divMat<short, float, float>*/,
+                0/*divMat<short, double, double>*/,
+            },
+            { // row 4: int - fully disabled
+                0 /*divMat<int, float, unsigned char>*/,
+                0 /*divMat<int, float, signed char>*/,
+                0 /*divMat<int, float, unsigned short>*/,
+                0 /*divMat<int, float, short>*/,
+                0/*divMat<int, float, int>*/,
+                0/*divMat<int, float, float>*/,
+                0/*divMat<int, double, double>*/,
+            },
+            { // row 5: first template argument is float
+                0 /*divMat<float, float, unsigned char>*/,
+                0 /*divMat<float, float, signed char>*/,
+                0 /*divMat<float, float, unsigned short>*/,
+                0 /*divMat<float, float, short>*/,
+                0 /*divMat<float, float, int>*/,
+                divMat<float, float, float>, // only populated entry in this row
+                0/*divMat<float, double, double>*/,
+            },
+            { // row 6: double - fully disabled
+                0 /*divMat<double, double, unsigned char>*/,
+                0 /*divMat<double, double, signed char>*/,
+                0 /*divMat<double, double, unsigned short>*/,
+                0 /*divMat<double, double, short>*/,
+                0 /*divMat<double, double, int>*/,
+                0 /*divMat<double, double, float>*/,
+                0/*divMat<double, double, double>*/,
+            }
+        };
+#else
         static const func_t funcs[7][7] =
         {
             {
@@ -1130,7 +1619,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
                 divMat<unsigned char, float, short>,
                 divMat<unsigned char, float, int>,
                 divMat<unsigned char, float, float>,
-                divMat<unsigned char, double, double>
+                divMat<unsigned char, double, double>,
             },
             {
                 divMat<signed char, float, unsigned char>,
@@ -1139,7 +1628,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
                 divMat<signed char, float, short>,
                 divMat<signed char, float, int>,
                 divMat<signed char, float, float>,
-                divMat<signed char, double, double>
+                divMat<signed char, double, double>,
             },
             {
                 0 /*divMat<unsigned short, float, unsigned char>*/,
@@ -1148,7 +1637,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
                 divMat<unsigned short, float, short>,
                 divMat<unsigned short, float, int>,
                 divMat<unsigned short, float, float>,
-                divMat<unsigned short, double, double>
+                divMat<unsigned short, double, double>,
             },
             {
                 0 /*divMat<short, float, unsigned char>*/,
@@ -1157,7 +1646,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
                 divMat<short, float, short>,
                 divMat<short, float, int>,
                 divMat<short, float, float>,
-                divMat<short, double, double>
+                divMat<short, double, double>,
             },
             {
                 0 /*divMat<int, float, unsigned char>*/,
@@ -1166,7 +1655,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
                 0 /*divMat<int, float, short>*/,
                 divMat<int, float, int>,
                 divMat<int, float, float>,
-                divMat<int, double, double>
+                divMat<int, double, double>,
             },
             {
                 0 /*divMat<float, float, unsigned char>*/,
@@ -1175,7 +1664,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
                 0 /*divMat<float, float, short>*/,
                 0 /*divMat<float, float, int>*/,
                 divMat<float, float, float>,
-                divMat<float, double, double>
+                divMat<float, double, double>,
             },
             {
                 0 /*divMat<double, double, unsigned char>*/,
@@ -1184,9 +1673,10 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
                 0 /*divMat<double, double, short>*/,
                 0 /*divMat<double, double, int>*/,
                 0 /*divMat<double, double, float>*/,
-                divMat<double, double, double>
+                divMat<double, double, double>,
             }
         };
+#endif
 
         if (dtype < 0)
             dtype = src1.depth();
@@ -1230,6 +1720,75 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE // reduced build (defined by the BUILD_TINY_GPU_MODULE CMake option): limited format support
+    static const func_t funcs[7][7] = // rows follow divScalar's first template argument, columns its last; 0 marks a kernel left out of this build
+    {
+        {
+            divScalar<unsigned char, float, unsigned char>,
+            0/*divScalar<unsigned char, float, signed char>*/,
+            0/*divScalar<unsigned char, float, unsigned short>*/,
+            0/*divScalar<unsigned char, float, short>*/,
+            0/*divScalar<unsigned char, float, int>*/,
+            0/*divScalar<unsigned char, float, float>*/,
+            0/*divScalar<unsigned char, double, double>*/,
+        },
+        {
+            0/*divScalar<signed char, float, unsigned char>*/,
+            0/*divScalar<signed char, float, signed char>*/,
+            0/*divScalar<signed char, float, unsigned short>*/,
+            0/*divScalar<signed char, float, short>*/,
+            0/*divScalar<signed char, float, int>*/,
+            0/*divScalar<signed char, float, float>*/,
+            0/*divScalar<signed char, double, double>*/,
+        },
+        {
+            0 /*divScalar<unsigned short, float, unsigned char>*/,
+            0 /*divScalar<unsigned short, float, signed char>*/,
+            0/*divScalar<unsigned short, float, unsigned short>*/,
+            0/*divScalar<unsigned short, float, short>*/,
+            0/*divScalar<unsigned short, float, int>*/,
+            0/*divScalar<unsigned short, float, float>*/,
+            0/*divScalar<unsigned short, double, double>*/,
+        },
+        {
+            0 /*divScalar<short, float, unsigned char>*/,
+            0 /*divScalar<short, float, signed char>*/,
+            0/*divScalar<short, float, unsigned short>*/,
+            0/*divScalar<short, float, short>*/,
+            0/*divScalar<short, float, int>*/,
+            0/*divScalar<short, float, float>*/,
+            0/*divScalar<short, double, double>*/,
+        },
+        {
+            0 /*divScalar<int, float, unsigned char>*/,
+            0 /*divScalar<int, float, signed char>*/,
+            0 /*divScalar<int, float, unsigned short>*/,
+            0 /*divScalar<int, float, short>*/,
+            0/*divScalar<int, float, int>*/,
+            0/*divScalar<int, float, float>*/,
+            0/*divScalar<int, double, double>*/,
+        },
+        {
+            0 /*divScalar<float, float, unsigned char>*/,
+            0 /*divScalar<float, float, signed char>*/,
+            0 /*divScalar<float, float, unsigned short>*/,
+            0 /*divScalar<float, float, short>*/,
+            0 /*divScalar<float, float, int>*/,
+            divScalar<float, float, float>,
+            0/*divScalar<float, double, double>*/,
+        },
+        {
+            0 /*divScalar<double, double, unsigned char>*/,
+            0 /*divScalar<double, double, signed char>*/,
+            0 /*divScalar<double, double, unsigned short>*/,
+            0 /*divScalar<double, double, short>*/,
+            0 /*divScalar<double, double, int>*/,
+            0 /*divScalar<double, double, float>*/,
+            0/*divScalar<double, double, double>*/,
+        }
+    };
+#else // default build: unreduced table follows
     static const func_t funcs[7][7] =
     {
         {
@@ -1239,7 +1798,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
             divScalar<unsigned char, float, short>,
             divScalar<unsigned char, float, int>,
             divScalar<unsigned char, float, float>,
-            divScalar<unsigned char, double, double>
+            divScalar<unsigned char, double, double>,
         },
         {
             divScalar<signed char, float, unsigned char>,
@@ -1248,7 +1807,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
             divScalar<signed char, float, short>,
             divScalar<signed char, float, int>,
             divScalar<signed char, float, float>,
-            divScalar<signed char, double, double>
+            divScalar<signed char, double, double>,
         },
         {
             0 /*divScalar<unsigned short, float, unsigned char>*/,
@@ -1257,7 +1816,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
             divScalar<unsigned short, float, short>,
             divScalar<unsigned short, float, int>,
             divScalar<unsigned short, float, float>,
-            divScalar<unsigned short, double, double>
+            divScalar<unsigned short, double, double>,
         },
         {
             0 /*divScalar<short, float, unsigned char>*/,
@@ -1266,7 +1825,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
             divScalar<short, float, short>,
             divScalar<short, float, int>,
             divScalar<short, float, float>,
-            divScalar<short, double, double>
+            divScalar<short, double, double>,
         },
         {
             0 /*divScalar<int, float, unsigned char>*/,
@@ -1275,7 +1834,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
             0 /*divScalar<int, float, short>*/,
             divScalar<int, float, int>,
             divScalar<int, float, float>,
-            divScalar<int, double, double>
+            divScalar<int, double, double>,
         },
         {
             0 /*divScalar<float, float, unsigned char>*/,
@@ -1284,7 +1843,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
             0 /*divScalar<float, float, short>*/,
             0 /*divScalar<float, float, int>*/,
             divScalar<float, float, float>,
-            divScalar<float, double, double>
+            divScalar<float, double, double>,
         },
         {
             0 /*divScalar<double, double, unsigned char>*/,
@@ -1293,9 +1852,10 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
             0 /*divScalar<double, double, short>*/,
             0 /*divScalar<double, double, int>*/,
             0 /*divScalar<double, double, float>*/,
-            divScalar<double, double, double>
+            divScalar<double, double, double>,
         }
     };
+#endif
 
     typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
     static const npp_func_t npp_funcs[7][4] =
@@ -1359,6 +1919,75 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE // reduced build (defined by the BUILD_TINY_GPU_MODULE CMake option): limited format support
+    static const func_t funcs[7][7] = // rows follow divInv's first template argument, columns its last; 0 marks a kernel left out of this build
+    {
+        {
+            divInv<unsigned char, float, unsigned char>,
+            0/*divInv<unsigned char, float, signed char>*/,
+            0/*divInv<unsigned char, float, unsigned short>*/,
+            0/*divInv<unsigned char, float, short>*/,
+            0/*divInv<unsigned char, float, int>*/,
+            0/*divInv<unsigned char, float, float>*/,
+            0/*divInv<unsigned char, double, double>*/,
+        },
+        {
+            0/*divInv<signed char, float, unsigned char>*/,
+            0/*divInv<signed char, float, signed char>*/,
+            0/*divInv<signed char, float, unsigned short>*/,
+            0/*divInv<signed char, float, short>*/,
+            0/*divInv<signed char, float, int>*/,
+            0/*divInv<signed char, float, float>*/,
+            0/*divInv<signed char, double, double>*/,
+        },
+        {
+            0 /*divInv<unsigned short, float, unsigned char>*/,
+            0 /*divInv<unsigned short, float, signed char>*/,
+            0/*divInv<unsigned short, float, unsigned short>*/,
+            0/*divInv<unsigned short, float, short>*/,
+            0/*divInv<unsigned short, float, int>*/,
+            0/*divInv<unsigned short, float, float>*/,
+            0/*divInv<unsigned short, double, double>*/,
+        },
+        {
+            0 /*divInv<short, float, unsigned char>*/,
+            0 /*divInv<short, float, signed char>*/,
+            0/*divInv<short, float, unsigned short>*/,
+            0/*divInv<short, float, short>*/,
+            0/*divInv<short, float, int>*/,
+            0/*divInv<short, float, float>*/,
+            0/*divInv<short, double, double>*/,
+        },
+        {
+            0 /*divInv<int, float, unsigned char>*/,
+            0 /*divInv<int, float, signed char>*/,
+            0 /*divInv<int, float, unsigned short>*/,
+            0 /*divInv<int, float, short>*/,
+            0/*divInv<int, float, int>*/,
+            0/*divInv<int, float, float>*/,
+            0/*divInv<int, double, double>*/,
+        },
+        {
+            0 /*divInv<float, float, unsigned char>*/,
+            0 /*divInv<float, float, signed char>*/,
+            0 /*divInv<float, float, unsigned short>*/,
+            0 /*divInv<float, float, short>*/,
+            0 /*divInv<float, float, int>*/,
+            divInv<float, float, float>,
+            0/*divInv<float, double, double>*/,
+        },
+        {
+            0 /*divInv<double, double, unsigned char>*/,
+            0 /*divInv<double, double, signed char>*/,
+            0 /*divInv<double, double, unsigned short>*/,
+            0 /*divInv<double, double, short>*/,
+            0 /*divInv<double, double, int>*/,
+            0 /*divInv<double, double, float>*/,
+            0/*divInv<double, double, double>*/,
+        }
+    };
+#else // default build: unreduced table follows
     static const func_t funcs[7][7] =
     {
         {
@@ -1368,7 +1997,7 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
             divInv<unsigned char, float, short>,
             divInv<unsigned char, float, int>,
             divInv<unsigned char, float, float>,
-            divInv<unsigned char, double, double>
+            divInv<unsigned char, double, double>,
         },
         {
             divInv<signed char, float, unsigned char>,
@@ -1377,7 +2006,7 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
             divInv<signed char, float, short>,
             divInv<signed char, float, int>,
             divInv<signed char, float, float>,
-            divInv<signed char, double, double>
+            divInv<signed char, double, double>,
         },
         {
             0 /*divInv<unsigned short, float, unsigned char>*/,
@@ -1386,7 +2015,7 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
             divInv<unsigned short, float, short>,
             divInv<unsigned short, float, int>,
             divInv<unsigned short, float, float>,
-            divInv<unsigned short, double, double>
+            divInv<unsigned short, double, double>,
         },
         {
             0 /*divInv<short, float, unsigned char>*/,
@@ -1395,7 +2024,7 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
             divInv<short, float, short>,
             divInv<short, float, int>,
             divInv<short, float, float>,
-            divInv<short, double, double>
+            divInv<short, double, double>,
         },
         {
             0 /*divInv<int, float, unsigned char>*/,
@@ -1404,7 +2033,7 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
             0 /*divInv<int, float, short>*/,
             divInv<int, float, int>,
             divInv<int, float, float>,
-            divInv<int, double, double>
+            divInv<int, double, double>,
         },
         {
             0 /*divInv<float, float, unsigned char>*/,
@@ -1413,7 +2042,7 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
             0 /*divInv<float, float, short>*/,
             0 /*divInv<float, float, int>*/,
             divInv<float, float, float>,
-            divInv<float, double, double>
+            divInv<float, double, double>,
         },
         {
             0 /*divInv<double, double, unsigned char>*/,
@@ -1422,9 +2051,10 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
             0 /*divInv<double, double, short>*/,
             0 /*divInv<double, double, int>*/,
             0 /*divInv<double, double, float>*/,
-            divInv<double, double, double>
+            divInv<double, double, double>,
         }
     };
+#endif
 
     if (dtype < 0)
         dtype = src.depth();
@@ -1471,6 +2101,19 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE // reduced build: only the unsigned char and float kernels stay in the table
+    static const func_t funcs[] = // 0 marks a kernel left out; the comment beside it names the dropped specialization
+    {
+        absDiffMat<unsigned char>,
+        0/*absDiffMat<signed char>*/,
+        0/*absDiffMat<unsigned short>*/,
+        0/*absDiffMat<short>*/,
+        0/*absDiffMat<int>*/,
+        absDiffMat<float>,
+        0/*absDiffMat<double>*/,
+    };
+#else // default build: unreduced table follows
     static const func_t funcs[] =
     {
         absDiffMat<unsigned char>,
@@ -1479,8 +2122,9 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
         absDiffMat<short>,
         absDiffMat<int>,
         absDiffMat<float>,
-        absDiffMat<double>
+        absDiffMat<double>,
     };
+#endif
 
     const int depth = src1.depth();
     const int cn = src1.channels();
@@ -1556,6 +2200,19 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE // reduced build: only the unsigned char and float kernels stay in the table
+    static const func_t funcs[] = // indexed by src1.depth() further down; 0 marks a kernel left out
+    {
+        absDiffScalar<unsigned char, float>,
+        0/*absDiffScalar<signed char, float>*/,
+        0/*absDiffScalar<unsigned short, float>*/,
+        0/*absDiffScalar<short, float>*/,
+        0/*absDiffScalar<int, float>*/,
+        absDiffScalar<float, float>,
+        0/*absDiffScalar<double, double>*/,
+    };
+#else // default build: unreduced table follows
     static const func_t funcs[] =
     {
         absDiffScalar<unsigned char, float>,
@@ -1564,8 +2221,9 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
         absDiffScalar<short, float>,
         absDiffScalar<int, float>,
         absDiffScalar<float, float>,
-        absDiffScalar<double, double>
+        absDiffScalar<double, double>,
     };
+#endif
 
     const int depth = src1.depth();
 
@@ -1578,9 +2236,13 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
+    const func_t func = funcs[depth]; // fetched ahead of dst.create() so the null check runs before dst is touched
+    if (!func) // slot is 0 for depths dropped by the OPENCV_TINY_GPU_MODULE table
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
     dst.create(src1.size(), src1.type());
 
-    funcs[depth](src1, src2.val[0], dst, StreamAccessor::getStream(stream));
+    func(src1, src2.val[0], dst, StreamAccessor::getStream(stream));
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1597,6 +2259,19 @@ void cv::gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream)
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE // reduced build: only the float kernel stays in the table
+    static const func_t funcs[] = // indexed by src.depth() further down; 0 marks a kernel left out
+    {
+        0/*absMat<unsigned char>*/,
+        0/*absMat<signed char>*/,
+        0/*absMat<unsigned short>*/,
+        0/*absMat<short>*/,
+        0/*absMat<int>*/,
+        absMat<float>,
+        0/*absMat<double>*/,
+    };
+#else // default build: unreduced table follows
     static const func_t funcs[] =
     {
         absMat<unsigned char>,
@@ -1605,8 +2280,9 @@ void cv::gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream)
         absMat<short>,
         absMat<int>,
         absMat<float>,
-        absMat<double>
+        absMat<double>,
     };
+#endif
 
     const int depth = src.depth();
 
@@ -1619,9 +2295,13 @@ void cv::gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream)
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
+    const func_t func = funcs[depth]; // fetched ahead of dst.create() so the null check runs before dst is touched
+    if (!func) // slot is 0 for depths dropped by the OPENCV_TINY_GPU_MODULE table
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
     dst.create(src.size(), src.type());
 
-    funcs[depth](src, dst, StreamAccessor::getStream(stream));
+    func(src, dst, StreamAccessor::getStream(stream));
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1638,6 +2318,19 @@ void cv::gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream)
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE // reduced build: only the float kernel stays in the table
+    static const func_t funcs[] = // indexed by src.depth() further down; 0 marks a kernel left out
+    {
+        0/*sqrMat<unsigned char>*/,
+        0/*sqrMat<signed char>*/,
+        0/*sqrMat<unsigned short>*/,
+        0/*sqrMat<short>*/,
+        0/*sqrMat<int>*/,
+        sqrMat<float>,
+        0/*sqrMat<double>*/,
+    };
+#else // default build: unreduced table follows
     static const func_t funcs[] =
     {
         sqrMat<unsigned char>,
@@ -1646,8 +2339,9 @@ void cv::gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream)
         sqrMat<short>,
         sqrMat<int>,
         sqrMat<float>,
-        sqrMat<double>
+        sqrMat<double>,
     };
+#endif
 
     const int depth = src.depth();
 
@@ -1660,9 +2354,13 @@ void cv::gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream)
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
+    const func_t func = funcs[depth]; // fetched ahead of dst.create() so the null check runs before dst is touched
+    if (!func) // slot is 0 for depths dropped by the OPENCV_TINY_GPU_MODULE table
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
     dst.create(src.size(), src.type());
 
-    funcs[depth](src, dst, StreamAccessor::getStream(stream));
+    func(src, dst, StreamAccessor::getStream(stream));
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1679,6 +2377,19 @@ void cv::gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream)
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE // reduced build: only the float kernel stays in the table
+    static const func_t funcs[] = // indexed by src.depth() further down; 0 marks a kernel left out
+    {
+        0/*sqrtMat<unsigned char>*/,
+        0/*sqrtMat<signed char>*/,
+        0/*sqrtMat<unsigned short>*/,
+        0/*sqrtMat<short>*/,
+        0/*sqrtMat<int>*/,
+        sqrtMat<float>,
+        0/*sqrtMat<double>*/,
+    };
+#else // default build: unreduced table follows
     static const func_t funcs[] =
     {
         sqrtMat<unsigned char>,
@@ -1687,8 +2398,9 @@ void cv::gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream)
         sqrtMat<short>,
         sqrtMat<int>,
         sqrtMat<float>,
-        sqrtMat<double>
+        sqrtMat<double>,
     };
+#endif
 
     const int depth = src.depth();
 
@@ -1701,9 +2413,13 @@ void cv::gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream)
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
+    const func_t func = funcs[depth]; // fetched ahead of dst.create() so the null check runs before dst is touched
+    if (!func) // slot is 0 for depths dropped by the OPENCV_TINY_GPU_MODULE table
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
     dst.create(src.size(), src.type());
 
-    funcs[depth](src, dst, StreamAccessor::getStream(stream));
+    func(src, dst, StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -1720,6 +2436,19 @@ void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream)
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE // reduced build: only the float kernel stays in the table
+    static const func_t funcs[] = // indexed by src.depth() further down; 0 marks a kernel left out
+    {
+        0/*logMat<unsigned char>*/,
+        0/*logMat<signed char>*/,
+        0/*logMat<unsigned short>*/,
+        0/*logMat<short>*/,
+        0/*logMat<int>*/,
+        logMat<float>,
+        0/*logMat<double>*/,
+    };
+#else // default build: unreduced table follows
     static const func_t funcs[] =
     {
         logMat<unsigned char>,
@@ -1728,8 +2457,9 @@ void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream)
         logMat<short>,
         logMat<int>,
         logMat<float>,
-        logMat<double>
+        logMat<double>,
     };
+#endif
 
     const int depth = src.depth();
 
@@ -1742,9 +2472,13 @@ void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream)
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
+    const func_t func = funcs[depth]; // fetched ahead of dst.create() so the null check runs before dst is touched
+    if (!func) // slot is 0 for depths dropped by the OPENCV_TINY_GPU_MODULE table
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
     dst.create(src.size(), src.type());
 
-    funcs[depth](src, dst, StreamAccessor::getStream(stream));
+    func(src, dst, StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -1761,6 +2495,19 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE // reduced build: only the float kernel stays in the table
+    static const func_t funcs[] = // indexed by src.depth() further down; 0 marks a kernel left out
+    {
+        0/*expMat<unsigned char>*/,
+        0/*expMat<signed char>*/,
+        0/*expMat<unsigned short>*/,
+        0/*expMat<short>*/,
+        0/*expMat<int>*/,
+        expMat<float>,
+        0/*expMat<double>*/,
+    };
+#else // default build: unreduced table follows
     static const func_t funcs[] =
     {
         expMat<unsigned char>,
@@ -1769,8 +2516,9 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)
         expMat<short>,
         expMat<int>,
         expMat<float>,
-        expMat<double>
+        expMat<double>,
     };
+#endif
 
     const int depth = src.depth();
 
@@ -1783,9 +2531,13 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
+    const func_t func = funcs[depth]; // fetched ahead of dst.create() so the null check runs before dst is touched
+    if (!func) // slot is 0 for depths dropped by the OPENCV_TINY_GPU_MODULE table
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
     dst.create(src.size(), src.type());
 
-    funcs[depth](src, dst, StreamAccessor::getStream(stream));
+    func(src, dst, StreamAccessor::getStream(stream));
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1809,6 +2561,19 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE // reduced build: comparison kernels kept only for unsigned char (row 0) and float (row 5)
+    static const func_t funcs[7][4] = // one row per depth; columns are the Eq, Ne, Lt, Le kernels; all-zero rows are dropped depths
+    {
+        {cmpMatEq<unsigned char> , cmpMatNe<unsigned char> , cmpMatLt<unsigned char> , cmpMatLe<unsigned char> },
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {cmpMatEq<float>         , cmpMatNe<float>         , cmpMatLt<float>         , cmpMatLe<float>         },
+        {0, 0, 0, 0},
+    };
+#else // default build: unreduced table follows
     static const func_t funcs[7][4] =
     {
         {cmpMatEq<unsigned char> , cmpMatNe<unsigned char> , cmpMatLt<unsigned char> , cmpMatLe<unsigned char> },
@@ -1819,6 +2584,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
         {cmpMatEq<float>         , cmpMatNe<float>         , cmpMatLt<float>         , cmpMatLe<float>         },
         {cmpMatEq<double>        , cmpMatNe<double>        , cmpMatLt<double>        , cmpMatLe<double>        }
     };
+#endif
 
     typedef void (*func_v4_t)(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
     static const func_v4_t funcs_v4[] =
@@ -1839,10 +2605,6 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src1.size(), CV_MAKE_TYPE(CV_8U, cn));
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
     static const int codes[] =
     {
         0, 2, 3, 2, 3, 1
@@ -1857,6 +2619,15 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
     };
 
     const int code = codes[cmpop];
+
+    const func_t func = funcs[depth][code]; // lookup now sits ahead of dst.create(); it used to be the last step before the call
+    if (!func) // null slot: depth has an all-zero row in the OPENCV_TINY_GPU_MODULE table
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
+    dst.create(src1.size(), CV_MAKE_TYPE(CV_8U, cn)); // reached only after the kernel lookup above passed its null check
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
     PtrStepSzb src1_(src1.rows, src1.cols * cn, psrc1[cmpop]->data, psrc1[cmpop]->step);
     PtrStepSzb src2_(src1.rows, src1.cols * cn, psrc2[cmpop]->data, psrc2[cmpop]->step);
     PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
@@ -1882,8 +2653,6 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
         }
     }
 
-    const func_t func = funcs[depth][code];
-
     func(src1_, src2_, dst_, stream);
 }
 
@@ -1913,6 +2682,31 @@ void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stre
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    typedef void (*cast_func_t)(Scalar& sc); // declared ahead of the #ifdef so both cast_func table variants can use it
+
+#ifdef OPENCV_TINY_GPU_MODULE // reduced build: kernels kept only for unsigned char (row 0) and float (row 5)
+    static const func_t funcs[7][6] = // one row per depth; columns are the Eq, Gt, Ge, Lt, Le, Ne kernels; all-zero rows are dropped depths
+    {
+        {cmpScalarEq<unsigned char> , cmpScalarGt<unsigned char> , cmpScalarGe<unsigned char> , cmpScalarLt<unsigned char> , cmpScalarLe<unsigned char> , cmpScalarNe<unsigned char> },
+        {0, 0, 0, 0, 0, 0},
+        {0, 0, 0, 0, 0, 0},
+        {0, 0, 0, 0, 0, 0},
+        {0, 0, 0, 0, 0, 0},
+        {cmpScalarEq<float>         , cmpScalarGt<float>         , cmpScalarGe<float>         , cmpScalarLt<float>         , cmpScalarLe<float>         , cmpScalarNe<float>         },
+        {0, 0, 0, 0, 0, 0},
+    };
+
+    static const cast_func_t cast_func[] = // null at exactly the depths whose funcs row above is all zero
+    {
+        castScalar<unsigned char>,
+        0/*castScalar<signed char>*/,
+        0/*castScalar<unsigned short>*/,
+        0/*castScalar<short>*/,
+        0/*castScalar<int>*/,
+        castScalar<float>,
+        0/*castScalar<double>*/,
+    };
+#else // default build: unreduced tables follow
     static const func_t funcs[7][6] =
     {
         {cmpScalarEq<unsigned char> , cmpScalarGt<unsigned char> , cmpScalarGe<unsigned char> , cmpScalarLt<unsigned char> , cmpScalarLe<unsigned char> , cmpScalarNe<unsigned char> },
@@ -1924,11 +2718,11 @@ void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stre
         {cmpScalarEq<double>        , cmpScalarGt<double>        , cmpScalarGe<double>        , cmpScalarLt<double>        , cmpScalarLe<double>        , cmpScalarNe<double>        }
     };
 
-    typedef void (*cast_func_t)(Scalar& sc);
     static const cast_func_t cast_func[] =
     {
         castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
     };
+#endif
 
     const int depth = src.depth();
     const int cn = src.channels();
@@ -1943,11 +2737,15 @@ void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stre
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
+    const func_t func = funcs[depth][cmpop]; // fetched ahead of dst.create() and of the cast_func[depth] call below
+    if (!func) // tiny-build funcs and cast_func are null at the same depths, so this also keeps cast_func[depth] from being called through a null slot
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
     dst.create(src.size(), CV_MAKE_TYPE(CV_8U, cn));
 
     cast_func[depth](sc);
 
-    funcs[depth][cmpop](src, cn, sc.val, dst, StreamAccessor::getStream(stream));
+    func(src, cn, sc.val, dst, StreamAccessor::getStream(stream));
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -2391,14 +3189,56 @@ void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre
     using namespace arithm;
 
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE
     static const func_t funcs[5][4] =
     {
-        {BitScalar<unsigned char, bitScalarAnd<unsigned char> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarAnd<unsigned int> >::call},
+        {
+            BitScalar<unsigned char, bitScalarAnd<unsigned char> >::call,
+            0,
+            0/*NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call*/,
+            0/*BitScalar4< bitScalarAnd<unsigned int> >::call*/,
+        },
         {0,0,0,0},
-        {BitScalar<unsigned short, bitScalarAnd<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
+        {
+            0/*BitScalar<unsigned short, bitScalarAnd<unsigned short> >::call*/,
+            0,
+            0/*NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call*/,
+            0/*NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call*/,
+        },
         {0,0,0,0},
-        {BitScalar<int, bitScalarAnd<int> >::call                      , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
+        {
+            0/*BitScalar<int, bitScalarAnd<int> >::call*/,
+            0,
+            0/*NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call*/,
+            0/*NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call*/,
+        }
     };
+#else
+    static const func_t funcs[5][4] =
+    {
+        {
+            BitScalar<unsigned char, bitScalarAnd<unsigned char> >::call,
+            0,
+            NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call,
+            BitScalar4< bitScalarAnd<unsigned int> >::call
+        },
+        {0,0,0,0},
+        {
+            BitScalar<unsigned short, bitScalarAnd<unsigned short> >::call,
+            0,
+            NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call,
+            NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call
+        },
+        {0,0,0,0},
+        {
+            BitScalar<int, bitScalarAnd<int> >::call,
+            0,
+            NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call,
+            NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call
+        }
+    };
+#endif
 
     const int depth = src.depth();
     const int cn = src.channels();
@@ -2406,9 +3246,13 @@ void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre
     CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S );
     CV_Assert( cn == 1 || cn == 3 || cn == 4 );
 
+    const func_t func = funcs[depth][cn - 1];
+    if (!func)
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
     dst.create(src.size(), src.type());
 
-    funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream));
+    func(src, sc, dst, StreamAccessor::getStream(stream));
 }
 
 void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
@@ -2416,14 +3260,56 @@ void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Strea
     using namespace arithm;
 
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE
     static const func_t funcs[5][4] =
     {
-        {BitScalar<unsigned char, bitScalarOr<unsigned char> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOr<unsigned int> >::call},
+        {
+            BitScalar<unsigned char, bitScalarOr<unsigned char> >::call,
+            0,
+            0/*NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call*/,
+            0/*BitScalar4< bitScalarOr<unsigned int> >::call*/,
+        },
         {0,0,0,0},
-        {BitScalar<unsigned short, bitScalarOr<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
+        {
+            0/*BitScalar<unsigned short, bitScalarOr<unsigned short> >::call*/,
+            0,
+            0/*NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call*/,
+            0/*NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call*/,
+        },
         {0,0,0,0},
-        {BitScalar<int, bitScalarOr<int> >::call                      , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
+        {
+            0/*BitScalar<int, bitScalarOr<int> >::call*/,
+            0,
+            0/*NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call*/,
+            0/*NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call*/,
+        }
     };
+#else
+    static const func_t funcs[5][4] =
+    {
+        {
+            BitScalar<unsigned char, bitScalarOr<unsigned char> >::call,
+            0,
+            NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call,
+            BitScalar4< bitScalarOr<unsigned int> >::call
+        },
+        {0,0,0,0},
+        {
+            BitScalar<unsigned short, bitScalarOr<unsigned short> >::call,
+            0,
+            NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call,
+            NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call
+        },
+        {0,0,0,0},
+        {
+            BitScalar<int, bitScalarOr<int> >::call,
+            0,
+            NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call,
+            NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call
+        }
+    };
+#endif
 
     const int depth = src.depth();
     const int cn = src.channels();
@@ -2431,9 +3317,13 @@ void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Strea
     CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S );
     CV_Assert( cn == 1 || cn == 3 || cn == 4 );
 
+    const func_t func = funcs[depth][cn - 1];
+    if (!func)
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
     dst.create(src.size(), src.type());
 
-    funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream));
+    func(src, sc, dst, StreamAccessor::getStream(stream));
 }
 
 void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
@@ -2441,14 +3331,56 @@ void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre
     using namespace arithm;
 
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE
     static const func_t funcs[5][4] =
     {
-        {BitScalar<unsigned char, bitScalarXor<unsigned char> >::call  , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarXor<unsigned int> >::call},
+        {
+            BitScalar<unsigned char, bitScalarXor<unsigned char> >::call,
+            0,
+            0/*NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call*/,
+            0/*BitScalar4< bitScalarXor<unsigned int> >::call*/,
+        },
         {0,0,0,0},
-        {BitScalar<unsigned short, bitScalarXor<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
+        {
+            0/*BitScalar<unsigned short, bitScalarXor<unsigned short> >::call*/,
+            0,
+            0/*NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call*/,
+            0/*NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call*/,
+        },
         {0,0,0,0},
-        {BitScalar<int, bitScalarXor<int> >::call                      , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
+        {
+            0/*BitScalar<int, bitScalarXor<int> >::call*/,
+            0,
+            0/*NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call*/,
+            0/*NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call*/,
+        }
     };
+#else
+    static const func_t funcs[5][4] =
+    {
+        {
+            BitScalar<unsigned char, bitScalarXor<unsigned char> >::call,
+            0,
+            NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call,
+            BitScalar4< bitScalarXor<unsigned int> >::call
+        },
+        {0,0,0,0},
+        {
+            BitScalar<unsigned short, bitScalarXor<unsigned short> >::call,
+            0,
+            NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call,
+            NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call
+        },
+        {0,0,0,0},
+        {
+            BitScalar<int, bitScalarXor<int> >::call,
+            0,
+            NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call,
+            NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call
+        }
+    };
+#endif
 
     const int depth = src.depth();
     const int cn = src.channels();
@@ -2456,9 +3388,13 @@ void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre
     CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S );
     CV_Assert( cn == 1 || cn == 3 || cn == 4 );
 
+    const func_t func = funcs[depth][cn - 1];
+    if (!func)
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
     dst.create(src.size(), src.type());
 
-    funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream));
+    func(src, sc, dst, StreamAccessor::getStream(stream));
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -2578,6 +3514,19 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE
+    static const func_t funcs[] =
+    {
+        minMat<unsigned char>,
+        0/*minMat<signed char>*/,
+        0/*minMat<unsigned short>*/,
+        0/*minMat<short>*/,
+        0/*minMat<int>*/,
+        minMat<float>,
+        0/*minMat<double>*/,
+    };
+#else
     static const func_t funcs[] =
     {
         minMat<unsigned char>,
@@ -2586,8 +3535,9 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
         minMat<short>,
         minMat<int>,
         minMat<float>,
-        minMat<double>
+        minMat<double>,
     };
+#endif
 
     const int depth = src1.depth();
     const int cn = src1.channels();
@@ -2657,6 +3607,19 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE
+    static const func_t funcs[] =
+    {
+        maxMat<unsigned char>,
+        0/*maxMat<signed char>*/,
+        0/*maxMat<unsigned short>*/,
+        0/*maxMat<short>*/,
+        0/*maxMat<int>*/,
+        maxMat<float>,
+        0/*maxMat<double>*/,
+    };
+#else
     static const func_t funcs[] =
     {
         maxMat<unsigned char>,
@@ -2665,8 +3628,9 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
         maxMat<short>,
         maxMat<int>,
         maxMat<float>,
-        maxMat<double>
+        maxMat<double>,
     };
+#endif
 
     const int depth = src1.depth();
     const int cn = src1.channels();
@@ -2744,6 +3708,31 @@ void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    typedef double (*cast_func_t)(double sc);
+
+#ifdef OPENCV_TINY_GPU_MODULE
+    static const func_t funcs[] = // indexed by src depth; tiny build keeps only CV_8U and CV_32F, 0 = not built
+    {
+        minScalar<unsigned char>,
+        0/*minScalar<signed char>*/,
+        0/*minScalar<unsigned short>*/,
+        0/*minScalar<short>*/,
+        0/*minScalar<int>*/,
+        minScalar<float>,
+        0/*minScalar<double>*/,
+    };
+
+    static const cast_func_t cast_func[] = // must stay 0 exactly where funcs is 0 (same depth index)
+    {
+        castScalar<unsigned char>,
+        0/*castScalar<signed char>*/,
+        0/*castScalar<unsigned short>*/,
+        0/*castScalar<short>*/,
+        0/*castScalar<int>*/,
+        castScalar<float>,
+        0/*castScalar<double>*/,
+    };
+#else
     static const func_t funcs[] =
     {
         minScalar<unsigned char>,
@@ -2752,14 +3741,20 @@ void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
         minScalar<short>,
         minScalar<int>,
         minScalar<float>,
-        minScalar<double>
+        minScalar<double>,
     };
 
-    typedef double (*cast_func_t)(double sc);
     static const cast_func_t cast_func[] =
     {
-        castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
+        castScalar<unsigned char>,
+        castScalar<signed char>,
+        castScalar<unsigned short>,
+        castScalar<short>,
+        castScalar<int>,
+        castScalar<float>,
+        castScalar<double>,
     };
+#endif
 
     const int depth = src.depth();
 
@@ -2772,9 +3767,13 @@ void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
+    const func_t func = funcs[depth]; // 0 when this depth is disabled under OPENCV_TINY_GPU_MODULE
+    if (!func) // must run before cast_func[depth](val): that table is 0 at the same depths
+        CV_Error(CV_StsNotImplemented, "not available in tiny build");
+
     dst.create(src.size(), src.type());
 
-    funcs[depth](src, cast_func[depth](val), dst, StreamAccessor::getStream(stream));
+    func(src, cast_func[depth](val), dst, StreamAccessor::getStream(stream));
 }
 
 void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
@@ -2782,6 +3781,31 @@ void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
     using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    typedef double (*cast_func_t)(double sc);
+
+#ifdef OPENCV_TINY_GPU_MODULE
+    static const func_t funcs[] =
+    {
+        maxScalar<unsigned char>,
+        0/*maxScalar<signed char>*/,
+        0/*maxScalar<unsigned short>*/,
+        0/*maxScalar<short>*/,
+        0/*maxScalar<int>*/,
+        maxScalar<float>,
+        0/*maxScalar<double>*/,
+    };
+
+    static const cast_func_t cast_func[] =
+    {
+        castScalar<unsigned char>,
+        0/*castScalar<signed char>*/,
+        0/*castScalar<unsigned short>*/,
+        0/*castScalar<short>*/,
+        0/*castScalar<int>*/,
+        castScalar<float>,
+        0/*castScalar<double>*/
+    };
+#else
     static const func_t funcs[] =
     {
         maxScalar<unsigned char>,
@@ -2790,14 +3814,20 @@ void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
         maxScalar<short>,
         maxScalar<int>,
         maxScalar<float>,
-        maxScalar<double>
+        maxScalar<double>,
     };
 
-    typedef double (*cast_func_t)(double sc);
     static const cast_func_t cast_func[] =
     {
-        castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
+        castScalar<unsigned char>,
+        castScalar<signed char>,
+        castScalar<unsigned short>,
+        castScalar<short>,
+        castScalar<int>,
+        castScalar<float>,
+        castScalar<double>,
     };
+#endif
 
     const int depth = src.depth();
 
@@ -2810,9 +3840,13 @@ void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
+    const func_t func = funcs[depth];
+    if (!func)
+        CV_Error(CV_StsNotImplemented, "not available in tiny build");
+
     dst.create(src.size(), src.type());
 
-    funcs[depth](src, cast_func[depth](val), dst, StreamAccessor::getStream(stream));
+    func(src, cast_func[depth](val), dst, StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -2858,6 +3892,18 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
     else
     {
         typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+#ifdef OPENCV_TINY_GPU_MODULE
+        static const func_t funcs[] =
+        {
+            arithm::threshold<unsigned char>,
+            0/*arithm::threshold<signed char>*/,
+            0/*arithm::threshold<unsigned short>*/,
+            0/*arithm::threshold<short>*/,
+            0/*arithm::threshold<int>*/,
+            arithm::threshold<float>,
+            0/*arithm::threshold<double>*/
+        };
+#else
         static const func_t funcs[] =
         {
             arithm::threshold<unsigned char>,
@@ -2868,6 +3914,11 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
             arithm::threshold<float>,
             arithm::threshold<double>
         };
+#endif
+
+        const func_t func = funcs[depth];
+        if (!func)
+            CV_Error(CV_StsNotImplemented, "not available in tiny build");
 
         if (depth != CV_32F && depth != CV_64F)
         {
@@ -2875,7 +3926,7 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
             maxVal = cvRound(maxVal);
         }
 
-        funcs[depth](src, dst, thresh, maxVal, type, stream);
+        func(src, dst, thresh, maxVal, type, stream);
     }
 
     return thresh;
@@ -2892,6 +3943,18 @@ namespace arithm
 void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
 {
     typedef void (*func_t)(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+#ifdef OPENCV_TINY_GPU_MODULE
+    static const func_t funcs[] =
+    {
+        0/*arithm::pow<unsigned char>*/,
+        0/*arithm::pow<signed char>*/,
+        0/*arithm::pow<unsigned short>*/,
+        0/*arithm::pow<short>*/,
+        0/*arithm::pow<int>*/,
+        arithm::pow<float>,
+        0/*arithm::pow<double>*/,
+    };
+#else
     static const func_t funcs[] =
     {
         arithm::pow<unsigned char>,
@@ -2902,6 +3965,7 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
         arithm::pow<float>,
         arithm::pow<double>
     };
+#endif
 
     const int depth = src.depth();
     const int cn = src.channels();
@@ -2914,12 +3978,16 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
+    const func_t func = funcs[depth];
+    if (!func)
+        CV_Error(CV_StsNotImplemented, "not available in tiny build");
+
     dst.create(src.size(), src.type());
 
     PtrStepSzb src_(src.rows, src.cols * cn, src.data, src.step);
     PtrStepSzb dst_(src.rows, src.cols * cn, dst.data, dst.step);
 
-    funcs[depth](src_, power, dst_, StreamAccessor::getStream(stream));
+    func(src_, power, dst_, StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -3007,6 +4075,466 @@ namespace arithm
 void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int ddepth, Stream& stream)
 {
     typedef void (*func_t)(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+#ifdef OPENCV_TINY_GPU_MODULE
+    static const func_t funcs[7][7][7] =
+    {
+        {
+            {
+                arithm::addWeighted<unsigned char, unsigned char, unsigned char >,
+                0/*arithm::addWeighted<unsigned char, unsigned char, signed char >*/,
+                0/*arithm::addWeighted<unsigned char, unsigned char, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned char, unsigned char, short >*/,
+                0/*arithm::addWeighted<unsigned char, unsigned char, int   >*/,
+                0/*arithm::addWeighted<unsigned char, unsigned char, float >*/,
+                0/*arithm::addWeighted<unsigned char, unsigned char, double>*/
+            },
+            {
+                0/*arithm::addWeighted<unsigned char, signed char, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned char, signed char, signed char >*/,
+                0/*arithm::addWeighted<unsigned char, signed char, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned char, signed char, short >*/,
+                0/*arithm::addWeighted<unsigned char, signed char, int   >*/,
+                0/*arithm::addWeighted<unsigned char, signed char, float >*/,
+                0/*arithm::addWeighted<unsigned char, signed char, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<unsigned char, unsigned short, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned char, unsigned short, signed char >*/,
+                0/*arithm::addWeighted<unsigned char, unsigned short, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned char, unsigned short, short >*/,
+                0/*arithm::addWeighted<unsigned char, unsigned short, int   >*/,
+                0/*arithm::addWeighted<unsigned char, unsigned short, float >*/,
+                0/*arithm::addWeighted<unsigned char, unsigned short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<unsigned char, short, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned char, short, signed char >*/,
+                0/*arithm::addWeighted<unsigned char, short, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned char, short, short >*/,
+                0/*arithm::addWeighted<unsigned char, short, int   >*/,
+                0/*arithm::addWeighted<unsigned char, short, float >*/,
+                0/*arithm::addWeighted<unsigned char, short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<unsigned char, int, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned char, int, signed char >*/,
+                0/*arithm::addWeighted<unsigned char, int, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned char, int, short >*/,
+                0/*arithm::addWeighted<unsigned char, int, int   >*/,
+                0/*arithm::addWeighted<unsigned char, int, float >*/,
+                0/*arithm::addWeighted<unsigned char, int, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<unsigned char, float, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned char, float, signed char >*/,
+                0/*arithm::addWeighted<unsigned char, float, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned char, float, short >*/,
+                0/*arithm::addWeighted<unsigned char, float, int   >*/,
+                0/*arithm::addWeighted<unsigned char, float, float >*/,
+                0/*arithm::addWeighted<unsigned char, float, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<unsigned char, double, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned char, double, signed char >*/,
+                0/*arithm::addWeighted<unsigned char, double, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned char, double, short >*/,
+                0/*arithm::addWeighted<unsigned char, double, int   >*/,
+                0/*arithm::addWeighted<unsigned char, double, float >*/,
+                0/*arithm::addWeighted<unsigned char, double, double>*/,
+            }
+        },
+        {
+            {
+                0/*arithm::addWeighted<signed char, unsigned char, unsigned char >*/,
+                0/*arithm::addWeighted<signed char, unsigned char, signed char >*/,
+                0/*arithm::addWeighted<signed char, unsigned char, unsigned short>*/,
+                0/*arithm::addWeighted<signed char, unsigned char, short >*/,
+                0/*arithm::addWeighted<signed char, unsigned char, int   >*/,
+                0/*arithm::addWeighted<signed char, unsigned char, float >*/,
+                0/*arithm::addWeighted<signed char, unsigned char, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<signed char, signed char, unsigned char >*/,
+                0/*arithm::addWeighted<signed char, signed char, signed char >*/,
+                0/*arithm::addWeighted<signed char, signed char, unsigned short>*/,
+                0/*arithm::addWeighted<signed char, signed char, short >*/,
+                0/*arithm::addWeighted<signed char, signed char, int   >*/,
+                0/*arithm::addWeighted<signed char, signed char, float >*/,
+                0/*arithm::addWeighted<signed char, signed char, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<signed char, unsigned short, unsigned char >*/,
+                0/*arithm::addWeighted<signed char, unsigned short, signed char >*/,
+                0/*arithm::addWeighted<signed char, unsigned short, unsigned short>*/,
+                0/*arithm::addWeighted<signed char, unsigned short, short >*/,
+                0/*arithm::addWeighted<signed char, unsigned short, int   >*/,
+                0/*arithm::addWeighted<signed char, unsigned short, float >*/,
+                0/*arithm::addWeighted<signed char, unsigned short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<signed char, short, unsigned char >*/,
+                0/*arithm::addWeighted<signed char, short, signed char >*/,
+                0/*arithm::addWeighted<signed char, short, unsigned short>*/,
+                0/*arithm::addWeighted<signed char, short, short >*/,
+                0/*arithm::addWeighted<signed char, short, int   >*/,
+                0/*arithm::addWeighted<signed char, short, float >*/,
+                0/*arithm::addWeighted<signed char, short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<signed char, int, unsigned char >*/,
+                0/*arithm::addWeighted<signed char, int, signed char >*/,
+                0/*arithm::addWeighted<signed char, int, unsigned short>*/,
+                0/*arithm::addWeighted<signed char, int, short >*/,
+                0/*arithm::addWeighted<signed char, int, int   >*/,
+                0/*arithm::addWeighted<signed char, int, float >*/,
+                0/*arithm::addWeighted<signed char, int, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<signed char, float, unsigned char >*/,
+                0/*arithm::addWeighted<signed char, float, signed char >*/,
+                0/*arithm::addWeighted<signed char, float, unsigned short>*/,
+                0/*arithm::addWeighted<signed char, float, short >*/,
+                0/*arithm::addWeighted<signed char, float, int   >*/,
+                0/*arithm::addWeighted<signed char, float, float >*/,
+                0/*arithm::addWeighted<signed char, float, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<signed char, double, unsigned char >*/,
+                0/*arithm::addWeighted<signed char, double, signed char >*/,
+                0/*arithm::addWeighted<signed char, double, unsigned short>*/,
+                0/*arithm::addWeighted<signed char, double, short >*/,
+                0/*arithm::addWeighted<signed char, double, int   >*/,
+                0/*arithm::addWeighted<signed char, double, float >*/,
+                0/*arithm::addWeighted<signed char, double, double>*/,
+            }
+        },
+        {
+            {
+                0/*arithm::addWeighted<unsigned short, unsigned char, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned char, signed char >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned char, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned short, unsigned char, short >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned char, int   >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned char, float >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned char, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<unsigned short, signed char, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned short, signed char, signed char >*/,
+                0/*arithm::addWeighted<unsigned short, signed char, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned short, signed char, short >*/,
+                0/*arithm::addWeighted<unsigned short, signed char, int   >*/,
+                0/*arithm::addWeighted<unsigned short, signed char, float >*/,
+                0/*arithm::addWeighted<unsigned short, signed char, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<unsigned short, unsigned short, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned short, signed char >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned short, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned short, unsigned short, short >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned short, int   >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned short, float >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<unsigned short, short, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned short, short, signed char >*/,
+                0/*arithm::addWeighted<unsigned short, short, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned short, short, short >*/,
+                0/*arithm::addWeighted<unsigned short, short, int   >*/,
+                0/*arithm::addWeighted<unsigned short, short, float >*/,
+                0/*arithm::addWeighted<unsigned short, short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<unsigned short, int, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned short, int, signed char >*/,
+                0/*arithm::addWeighted<unsigned short, int, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned short, int, short >*/,
+                0/*arithm::addWeighted<unsigned short, int, int   >*/,
+                0/*arithm::addWeighted<unsigned short, int, float >*/,
+                0/*arithm::addWeighted<unsigned short, int, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<unsigned short, float, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned short, float, signed char >*/,
+                0/*arithm::addWeighted<unsigned short, float, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned short, float, short >*/,
+                0/*arithm::addWeighted<unsigned short, float, int   >*/,
+                0/*arithm::addWeighted<unsigned short, float, float >*/,
+                0/*arithm::addWeighted<unsigned short, float, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<unsigned short, double, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned short, double, signed char >*/,
+                0/*arithm::addWeighted<unsigned short, double, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned short, double, short >*/,
+                0/*arithm::addWeighted<unsigned short, double, int   >*/,
+                0/*arithm::addWeighted<unsigned short, double, float >*/,
+                0/*arithm::addWeighted<unsigned short, double, double>*/,
+            }
+        },
+        {
+            {
+                0/*arithm::addWeighted<short, unsigned char, unsigned char >*/,
+                0/*arithm::addWeighted<short, unsigned char, signed char >*/,
+                0/*arithm::addWeighted<short, unsigned char, unsigned short>*/,
+                0/*arithm::addWeighted<short, unsigned char, short >*/,
+                0/*arithm::addWeighted<short, unsigned char, int   >*/,
+                0/*arithm::addWeighted<short, unsigned char, float >*/,
+                0/*arithm::addWeighted<short, unsigned char, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<short, signed char, unsigned char >*/,
+                0/*arithm::addWeighted<short, signed char, signed char >*/,
+                0/*arithm::addWeighted<short, signed char, unsigned short>*/,
+                0/*arithm::addWeighted<short, signed char, short >*/,
+                0/*arithm::addWeighted<short, signed char, int   >*/,
+                0/*arithm::addWeighted<short, signed char, float >*/,
+                0/*arithm::addWeighted<short, signed char, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<short, unsigned short, unsigned char >*/,
+                0/*arithm::addWeighted<short, unsigned short, signed char >*/,
+                0/*arithm::addWeighted<short, unsigned short, unsigned short>*/,
+                0/*arithm::addWeighted<short, unsigned short, short >*/,
+                0/*arithm::addWeighted<short, unsigned short, int   >*/,
+                0/*arithm::addWeighted<short, unsigned short, float >*/,
+                0/*arithm::addWeighted<short, unsigned short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<short, short, unsigned char >*/,
+                0/*arithm::addWeighted<short, short, signed char >*/,
+                0/*arithm::addWeighted<short, short, unsigned short>*/,
+                0/*arithm::addWeighted<short, short, short >*/,
+                0/*arithm::addWeighted<short, short, int   >*/,
+                0/*arithm::addWeighted<short, short, float >*/,
+                0/*arithm::addWeighted<short, short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<short, int, unsigned char >*/,
+                0/*arithm::addWeighted<short, int, signed char >*/,
+                0/*arithm::addWeighted<short, int, unsigned short>*/,
+                0/*arithm::addWeighted<short, int, short >*/,
+                0/*arithm::addWeighted<short, int, int   >*/,
+                0/*arithm::addWeighted<short, int, float >*/,
+                0/*arithm::addWeighted<short, int, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<short, float, unsigned char >*/,
+                0/*arithm::addWeighted<short, float, signed char >*/,
+                0/*arithm::addWeighted<short, float, unsigned short>*/,
+                0/*arithm::addWeighted<short, float, short >*/,
+                0/*arithm::addWeighted<short, float, int   >*/,
+                0/*arithm::addWeighted<short, float, float >*/,
+                0/*arithm::addWeighted<short, float, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<short, double, unsigned char >*/,
+                0/*arithm::addWeighted<short, double, signed char >*/,
+                0/*arithm::addWeighted<short, double, unsigned short>*/,
+                0/*arithm::addWeighted<short, double, short >*/,
+                0/*arithm::addWeighted<short, double, int   >*/,
+                0/*arithm::addWeighted<short, double, float >*/,
+                0/*arithm::addWeighted<short, double, double>*/,
+            }
+        },
+        {
+            {
+                0/*arithm::addWeighted<int, unsigned char, unsigned char >*/,
+                0/*arithm::addWeighted<int, unsigned char, signed char >*/,
+                0/*arithm::addWeighted<int, unsigned char, unsigned short>*/,
+                0/*arithm::addWeighted<int, unsigned char, short >*/,
+                0/*arithm::addWeighted<int, unsigned char, int   >*/,
+                0/*arithm::addWeighted<int, unsigned char, float >*/,
+                0/*arithm::addWeighted<int, unsigned char, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<int, signed char, unsigned char >*/,
+                0/*arithm::addWeighted<int, signed char, signed char >*/,
+                0/*arithm::addWeighted<int, signed char, unsigned short>*/,
+                0/*arithm::addWeighted<int, signed char, short >*/,
+                0/*arithm::addWeighted<int, signed char, int   >*/,
+                0/*arithm::addWeighted<int, signed char, float >*/,
+                0/*arithm::addWeighted<int, signed char, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<int, unsigned short, unsigned char >*/,
+                0/*arithm::addWeighted<int, unsigned short, signed char >*/,
+                0/*arithm::addWeighted<int, unsigned short, unsigned short>*/,
+                0/*arithm::addWeighted<int, unsigned short, short >*/,
+                0/*arithm::addWeighted<int, unsigned short, int   >*/,
+                0/*arithm::addWeighted<int, unsigned short, float >*/,
+                0/*arithm::addWeighted<int, unsigned short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<int, short, unsigned char >*/,
+                0/*arithm::addWeighted<int, short, signed char >*/,
+                0/*arithm::addWeighted<int, short, unsigned short>*/,
+                0/*arithm::addWeighted<int, short, short >*/,
+                0/*arithm::addWeighted<int, short, int   >*/,
+                0/*arithm::addWeighted<int, short, float >*/,
+                0/*arithm::addWeighted<int, short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<int, int, unsigned char >*/,
+                0/*arithm::addWeighted<int, int, signed char >*/,
+                0/*arithm::addWeighted<int, int, unsigned short>*/,
+                0/*arithm::addWeighted<int, int, short >*/,
+                0/*arithm::addWeighted<int, int, int   >*/,
+                0/*arithm::addWeighted<int, int, float >*/,
+                0/*arithm::addWeighted<int, int, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<int, float, unsigned char >*/,
+                0/*arithm::addWeighted<int, float, signed char >*/,
+                0/*arithm::addWeighted<int, float, unsigned short>*/,
+                0/*arithm::addWeighted<int, float, short >*/,
+                0/*arithm::addWeighted<int, float, int   >*/,
+                0/*arithm::addWeighted<int, float, float >*/,
+                0/*arithm::addWeighted<int, float, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<int, double, unsigned char >*/,
+                0/*arithm::addWeighted<int, double, signed char >*/,
+                0/*arithm::addWeighted<int, double, unsigned short>*/,
+                0/*arithm::addWeighted<int, double, short >*/,
+                0/*arithm::addWeighted<int, double, int   >*/,
+                0/*arithm::addWeighted<int, double, float >*/,
+                0/*arithm::addWeighted<int, double, double>*/,
+            }
+        },
+        {
+            {
+                0/*arithm::addWeighted<float, unsigned char, unsigned char >*/,
+                0/*arithm::addWeighted<float, unsigned char, signed char >*/,
+                0/*arithm::addWeighted<float, unsigned char, unsigned short>*/,
+                0/*arithm::addWeighted<float, unsigned char, short >*/,
+                0/*arithm::addWeighted<float, unsigned char, int   >*/,
+                0/*arithm::addWeighted<float, unsigned char, float >*/,
+                0/*arithm::addWeighted<float, unsigned char, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<float, signed char, unsigned char >*/,
+                0/*arithm::addWeighted<float, signed char, signed char >*/,
+                0/*arithm::addWeighted<float, signed char, unsigned short>*/,
+                0/*arithm::addWeighted<float, signed char, short >*/,
+                0/*arithm::addWeighted<float, signed char, int   >*/,
+                0/*arithm::addWeighted<float, signed char, float >*/,
+                0/*arithm::addWeighted<float, signed char, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<float, unsigned short, unsigned char >*/,
+                0/*arithm::addWeighted<float, unsigned short, signed char >*/,
+                0/*arithm::addWeighted<float, unsigned short, unsigned short>*/,
+                0/*arithm::addWeighted<float, unsigned short, short >*/,
+                0/*arithm::addWeighted<float, unsigned short, int   >*/,
+                0/*arithm::addWeighted<float, unsigned short, float >*/,
+                0/*arithm::addWeighted<float, unsigned short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<float, short, unsigned char >*/,
+                0/*arithm::addWeighted<float, short, signed char >*/,
+                0/*arithm::addWeighted<float, short, unsigned short>*/,
+                0/*arithm::addWeighted<float, short, short >*/,
+                0/*arithm::addWeighted<float, short, int   >*/,
+                0/*arithm::addWeighted<float, short, float >*/,
+                0/*arithm::addWeighted<float, short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<float, int, unsigned char >*/,
+                0/*arithm::addWeighted<float, int, signed char >*/,
+                0/*arithm::addWeighted<float, int, unsigned short>*/,
+                0/*arithm::addWeighted<float, int, short >*/,
+                0/*arithm::addWeighted<float, int, int   >*/,
+                0/*arithm::addWeighted<float, int, float >*/,
+                0/*arithm::addWeighted<float, int, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<float, float, unsigned char >*/,
+                0/*arithm::addWeighted<float, float, signed char >*/,
+                0/*arithm::addWeighted<float, float, unsigned short>*/,
+                0/*arithm::addWeighted<float, float, short >*/,
+                0/*arithm::addWeighted<float, float, int   >*/,
+                arithm::addWeighted<float, float, float >,
+                0/*arithm::addWeighted<float, float, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<float, double, unsigned char >*/,
+                0/*arithm::addWeighted<float, double, signed char >*/,
+                0/*arithm::addWeighted<float, double, unsigned short>*/,
+                0/*arithm::addWeighted<float, double, short >*/,
+                0/*arithm::addWeighted<float, double, int   >*/,
+                0/*arithm::addWeighted<float, double, float >*/,
+                0/*arithm::addWeighted<float, double, double>*/,
+            }
+        },
+        {
+            {
+                0/*arithm::addWeighted<double, unsigned char, unsigned char >*/,
+                0/*arithm::addWeighted<double, unsigned char, signed char >*/,
+                0/*arithm::addWeighted<double, unsigned char, unsigned short>*/,
+                0/*arithm::addWeighted<double, unsigned char, short >*/,
+                0/*arithm::addWeighted<double, unsigned char, int   >*/,
+                0/*arithm::addWeighted<double, unsigned char, float >*/,
+                0/*arithm::addWeighted<double, unsigned char, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<double, signed char, unsigned char >*/,
+                0/*arithm::addWeighted<double, signed char, signed char >*/,
+                0/*arithm::addWeighted<double, signed char, unsigned short>*/,
+                0/*arithm::addWeighted<double, signed char, short >*/,
+                0/*arithm::addWeighted<double, signed char, int   >*/,
+                0/*arithm::addWeighted<double, signed char, float >*/,
+                0/*arithm::addWeighted<double, signed char, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<double, unsigned short, unsigned char >*/,
+                0/*arithm::addWeighted<double, unsigned short, signed char >*/,
+                0/*arithm::addWeighted<double, unsigned short, unsigned short>*/,
+                0/*arithm::addWeighted<double, unsigned short, short >*/,
+                0/*arithm::addWeighted<double, unsigned short, int   >*/,
+                0/*arithm::addWeighted<double, unsigned short, float >*/,
+                0/*arithm::addWeighted<double, unsigned short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<double, short, unsigned char >*/,
+                0/*arithm::addWeighted<double, short, signed char >*/,
+                0/*arithm::addWeighted<double, short, unsigned short>*/,
+                0/*arithm::addWeighted<double, short, short >*/,
+                0/*arithm::addWeighted<double, short, int   >*/,
+                0/*arithm::addWeighted<double, short, float >*/,
+                0/*arithm::addWeighted<double, short, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<double, int, unsigned char >*/,
+                0/*arithm::addWeighted<double, int, signed char >*/,
+                0/*arithm::addWeighted<double, int, unsigned short>*/,
+                0/*arithm::addWeighted<double, int, short >*/,
+                0/*arithm::addWeighted<double, int, int   >*/,
+                0/*arithm::addWeighted<double, int, float >*/,
+                0/*arithm::addWeighted<double, int, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<double, float, unsigned char >*/,
+                0/*arithm::addWeighted<double, float, signed char >*/,
+                0/*arithm::addWeighted<double, float, unsigned short>*/,
+                0/*arithm::addWeighted<double, float, short >*/,
+                0/*arithm::addWeighted<double, float, int   >*/,
+                0/*arithm::addWeighted<double, float, float >*/,
+                0/*arithm::addWeighted<double, float, double>*/,
+            },
+            {
+                0/*arithm::addWeighted<double, double, unsigned char >*/,
+                0/*arithm::addWeighted<double, double, signed char >*/,
+                0/*arithm::addWeighted<double, double, unsigned short>*/,
+                0/*arithm::addWeighted<double, double, short >*/,
+                0/*arithm::addWeighted<double, double, int   >*/,
+                0/*arithm::addWeighted<double, double, float >*/,
+                0/*arithm::addWeighted<double, double, double>*/,
+            }
+        }
+    };
+#else
     static const func_t funcs[7][7][7] =
     {
         {
@@ -3465,6 +4993,7 @@ void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2,
             }
         }
     };
+#endif
 
     int sdepth1 = src1.depth();
     int sdepth2 = src2.depth();
diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp
index 8905eaed64..c7fd61a9c9 100644
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
@@ -789,12 +789,14 @@ Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const
     case CV_8UC4:
         func = filter2D_gpu<uchar4, uchar4>;
         break;
+#ifndef OPENCV_TINY_GPU_MODULE // the 16-bit unsigned filter2D cases below are compiled only when the tiny GPU module is NOT requested
     case CV_16UC1:
         func = filter2D_gpu<ushort, ushort>;
         break;
     case CV_16UC4:
         func = filter2D_gpu<ushort4, ushort4>;
         break;
+#endif // !OPENCV_TINY_GPU_MODULE
     case CV_32FC1:
         func = filter2D_gpu<float, float>;
         break;
@@ -893,6 +895,18 @@ namespace
 
 Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
 {
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only rows 0 (uchar input) and 5 (float input) carry row-filter kernels; all other entries are null
+    static const gpuFilter1D_t funcs[7][4] =
+    {
+        {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
+        {0, 0, 0, 0}
+    };
+#else // OPENCV_TINY_GPU_MODULE not defined: use the regular table
     static const gpuFilter1D_t funcs[7][4] =
     {
         {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
@@ -903,6 +917,7 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
         {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
         {0, 0, 0, 0}
     };
+#endif
     static const nppFilter1D_t npp_funcs[] =
     {
         0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R
@@ -998,6 +1013,18 @@ namespace
 
 Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
 {
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only rows 0 and 5 carry column-filter kernels (float -> uchar and float -> float); all other entries are null
+    static const gpuFilter1D_t funcs[7][4] =
+    {
+        {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
+        {0, 0, 0, 0}
+    };
+#else // OPENCV_TINY_GPU_MODULE not defined: use the regular table
     static const gpuFilter1D_t funcs[7][4] =
     {
         {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
@@ -1008,6 +1035,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
         {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
         {0, 0, 0, 0}
     };
+#endif
     static const nppFilter1D_t npp_funcs[] =
     {
         0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index 66f838f77a..c0dfc44d13 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -336,6 +336,17 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
 #endif
     {
         typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only the uchar (row 0) and float (row 5) callers are instantiated; the table is indexed as callers[depth][channels - 1]
+        static const caller_t callers[6][4] =
+        {
+            {   copyMakeBorder_caller<uchar, 1>  ,  copyMakeBorder_caller<uchar, 2>     ,    copyMakeBorder_caller<uchar, 3>  ,    copyMakeBorder_caller<uchar, 4>},
+            {0, 0, 0, 0},
+            {0, 0, 0, 0},
+            {0, 0, 0, 0},
+            {0, 0, 0, 0},
+            {   copyMakeBorder_caller<float, 1>  ,  0/*copyMakeBorder_caller<float, 2>*/,    copyMakeBorder_caller<float, 3>  ,    copyMakeBorder_caller<float ,4>}
+        };
+#else // OPENCV_TINY_GPU_MODULE not defined: use the regular table
         static const caller_t callers[6][4] =
         {
             {   copyMakeBorder_caller<uchar, 1>  ,    copyMakeBorder_caller<uchar, 2>   ,    copyMakeBorder_caller<uchar, 3>  ,    copyMakeBorder_caller<uchar, 4>},
@@ -345,6 +356,7 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
             {0/*copyMakeBorder_caller<int,   1>*/, 0/*copyMakeBorder_caller<int,   2>*/ , 0/*copyMakeBorder_caller<int,   3>*/, 0/*copyMakeBorder_caller<int  , 4>*/},
             {   copyMakeBorder_caller<float, 1>  , 0/*copyMakeBorder_caller<float, 2>*/ ,    copyMakeBorder_caller<float, 3>  ,    copyMakeBorder_caller<float ,4>}
         };
+#endif
 
         caller_t func = callers[src.depth()][src.channels() - 1];
         CV_Assert(func != 0);
diff --git a/modules/gpu/src/matrix_reductions.cpp b/modules/gpu/src/matrix_reductions.cpp
index 4e09246e78..c22790e35d 100644
--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@@ -261,6 +261,18 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
 Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
 {
     typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only single-channel uchar ([0][1]) and float ([5][1]) sums are instantiated; the table is indexed as funcs[depth][channels]
+    static const func_t funcs[7][5] =
+    {
+        {0, ::sum::run<uchar , 1>, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+        {0, ::sum::run<float , 1>, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+    };
+#else // OPENCV_TINY_GPU_MODULE not defined: use the regular table
     static const func_t funcs[7][5] =
     {
         {0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
@@ -271,6 +283,7 @@ Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
         {0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
         {0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
     };
+#endif
 
     CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
 
@@ -286,6 +299,8 @@ Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
     buf.setTo(Scalar::all(0));
 
     const func_t func = funcs[src.depth()][src.channels()];
+    if (!func) // null table entry: no sum kernel exists for this source depth/channel count in the current build
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported source type");
 
     double result[4];
     func(src, buf.data, result, mask);
@@ -307,6 +322,18 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
 Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
 {
     typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only single-channel uchar ([0][1]) and float ([5][1]) absolute sums are instantiated; the table is indexed as funcs[depth][channels]
+    static const func_t funcs[7][5] =
+    {
+        {0, ::sum::runAbs<uchar , 1>, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+        {0, ::sum::runAbs<float , 1>, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+    };
+#else // OPENCV_TINY_GPU_MODULE not defined: use the regular table
     static const func_t funcs[7][5] =
     {
         {0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
@@ -317,6 +344,7 @@ Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
         {0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
         {0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
     };
+#endif
 
     CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
 
@@ -332,6 +360,8 @@ Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
     buf.setTo(Scalar::all(0));
 
     const func_t func = funcs[src.depth()][src.channels()];
+    if (!func) // null table entry: no absSum kernel exists for this source depth/channel count in the current build
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported source type");
 
     double result[4];
     func(src, buf.data, result, mask);
@@ -353,6 +383,18 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
 Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
 {
     typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only single-channel uchar ([0][1]) and float ([5][1]) squared sums are instantiated; the table is indexed as funcs[depth][channels]
+    static const func_t funcs[7][5] =
+    {
+        {0, ::sum::runSqr<uchar , 1>, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+        {0, ::sum::runSqr<float , 1>, 0, 0, 0},
+        {0, 0, 0, 0, 0},
+    };
+#else // OPENCV_TINY_GPU_MODULE not defined: use the regular table
     static const func_t funcs[7][5] =
     {
         {0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
@@ -363,6 +405,7 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
         {0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
         {0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
     };
+#endif
 
     CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
 
@@ -378,6 +421,8 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
     buf.setTo(Scalar::all(0));
 
     const func_t func = funcs[src.depth()][src.channels()];
+    if (!func) // null table entry: no sqrSum kernel exists for this source depth/channel count in the current build
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported source type");
 
     double result[4];
     func(src, buf.data, result, mask);
@@ -405,6 +450,18 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
 void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
 {
     typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only the uchar and float minMax kernels are instantiated; the table is indexed by source depth
+    static const func_t funcs[] =
+    {
+        ::minMax::run<uchar>,
+        0/*::minMax::run<schar>*/,
+        0/*::minMax::run<ushort>*/,
+        0/*::minMax::run<short>*/,
+        0/*::minMax::run<int>*/,
+        ::minMax::run<float>,
+        0/*::minMax::run<double>*/,
+    };
+#else // OPENCV_TINY_GPU_MODULE not defined: use the regular table
     static const func_t funcs[] =
     {
         ::minMax::run<uchar>,
@@ -413,8 +470,9 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
         ::minMax::run<short>,
         ::minMax::run<int>,
         ::minMax::run<float>,
-        ::minMax::run<double>
+        ::minMax::run<double>,
     };
+#endif
 
     CV_Assert( src.channels() == 1 );
     CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
@@ -430,6 +488,8 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
     ensureSizeIsEnough(buf_size, CV_8U, buf);
 
     const func_t func = funcs[src.depth()];
+    if (!func) // null table entry: no minMax kernel exists for this source depth in the current build
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported source type");
 
     double temp1, temp2;
     func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf);
@@ -456,6 +516,18 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
                         const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
 {
     typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: the uchar, int and float minMaxLoc kernels are instantiated (note: int is kept here, unlike the minMax table); indexed by source depth
+    static const func_t funcs[] =
+    {
+        ::minMaxLoc::run<uchar>,
+        0/*::minMaxLoc::run<schar>*/,
+        0/*::minMaxLoc::run<ushort>*/,
+        0/*::minMaxLoc::run<short>*/,
+        ::minMaxLoc::run<int>,
+        ::minMaxLoc::run<float>,
+        0/*::minMaxLoc::run<double>*/,
+    };
+#else // OPENCV_TINY_GPU_MODULE not defined: use the regular table
     static const func_t funcs[] =
     {
         ::minMaxLoc::run<uchar>,
@@ -464,8 +536,9 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
         ::minMaxLoc::run<short>,
         ::minMaxLoc::run<int>,
         ::minMaxLoc::run<float>,
-        ::minMaxLoc::run<double>
+        ::minMaxLoc::run<double>,
     };
+#endif
 
     CV_Assert( src.channels() == 1 );
     CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
@@ -482,6 +555,8 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
     ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
 
     const func_t func = funcs[src.depth()];
+    if (!func) // null table entry: no minMaxLoc kernel exists for this source depth in the current build
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported source type");
 
     double temp1, temp2;
     Point temp3, temp4;
@@ -508,6 +583,18 @@ int cv::gpu::countNonZero(const GpuMat& src)
 int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
 {
     typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only the uchar and float countNonZero kernels are instantiated; the table is indexed by source depth
+    static const func_t funcs[] =
+    {
+        ::countNonZero::run<uchar>,
+        0/*::countNonZero::run<schar>*/,
+        0/*::countNonZero::run<ushort>*/,
+        0/*::countNonZero::run<short>*/,
+        0/*::countNonZero::run<int>*/,
+        ::countNonZero::run<float>,
+        0/*::countNonZero::run<double>*/,
+    };
+#else // OPENCV_TINY_GPU_MODULE not defined: use the regular table
     static const func_t funcs[] =
     {
         ::countNonZero::run<uchar>,
@@ -516,8 +603,9 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
         ::countNonZero::run<short>,
         ::countNonZero::run<int>,
         ::countNonZero::run<float>,
-        ::countNonZero::run<double>
+        ::countNonZero::run<double>,
     };
+#endif
 
     CV_Assert(src.channels() == 1);
 
@@ -532,6 +620,8 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
     ensureSizeIsEnough(buf_size, CV_8U, buf);
 
     const func_t func = funcs[src.depth()];
+    if (!func) // null table entry: no countNonZero kernel exists for this source depth in the current build
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported source type");
 
     return func(src, buf);
 }
@@ -562,6 +652,74 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
     if (dim == 0)
     {
         typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only three row-reduction kernels are instantiated ([0][0] uchar->uchar, [0][5] uchar->float, [5][5] float->float); every other entry is null
+        static const func_t funcs[7][7] =
+        {
+            {
+                ::reduce::rows<unsigned char, int, unsigned char>,
+                0/*::reduce::rows<unsigned char, int, signed char>*/,
+                0/*::reduce::rows<unsigned char, int, unsigned short>*/,
+                0/*::reduce::rows<unsigned char, int, short>*/,
+                0/*::reduce::rows<unsigned char, int, int>*/,
+                ::reduce::rows<unsigned char, float, float>,
+                0/*::reduce::rows<unsigned char, double, double>*/,
+            },
+            {
+                0/*::reduce::rows<signed char, int, unsigned char>*/,
+                0/*::reduce::rows<signed char, int, signed char>*/,
+                0/*::reduce::rows<signed char, int, unsigned short>*/,
+                0/*::reduce::rows<signed char, int, short>*/,
+                0/*::reduce::rows<signed char, int, int>*/,
+                0/*::reduce::rows<signed char, float, float>*/,
+                0/*::reduce::rows<signed char, double, double>*/,
+            },
+            {
+                0/*::reduce::rows<unsigned short, int, unsigned char>*/,
+                0/*::reduce::rows<unsigned short, int, signed char>*/,
+                0/*::reduce::rows<unsigned short, int, unsigned short>*/,
+                0/*::reduce::rows<unsigned short, int, short>*/,
+                0/*::reduce::rows<unsigned short, int, int>*/,
+                0/*::reduce::rows<unsigned short, float, float>*/,
+                0/*::reduce::rows<unsigned short, double, double>*/,
+            },
+            {
+                0/*::reduce::rows<short, int, unsigned char>*/,
+                0/*::reduce::rows<short, int, signed char>*/,
+                0/*::reduce::rows<short, int, unsigned short>*/,
+                0/*::reduce::rows<short, int, short>*/,
+                0/*::reduce::rows<short, int, int>*/,
+                0/*::reduce::rows<short, float, float>*/,
+                0/*::reduce::rows<short, double, double>*/,
+            },
+            {
+                0/*::reduce::rows<int, int, unsigned char>*/,
+                0/*::reduce::rows<int, int, signed char>*/,
+                0/*::reduce::rows<int, int, unsigned short>*/,
+                0/*::reduce::rows<int, int, short>*/,
+                0/*::reduce::rows<int, int, int>*/,
+                0/*::reduce::rows<int, float, float>*/,
+                0/*::reduce::rows<int, double, double>*/,
+            },
+            {
+                0/*::reduce::rows<float, float, unsigned char>*/,
+                0/*::reduce::rows<float, float, signed char>*/,
+                0/*::reduce::rows<float, float, unsigned short>*/,
+                0/*::reduce::rows<float, float, short>*/,
+                0/*::reduce::rows<float, float, int>*/,
+                ::reduce::rows<float, float, float>,
+                0/*::reduce::rows<float, double, double>*/,
+            },
+            {
+                0/*::reduce::rows<double, double, unsigned char>*/,
+                0/*::reduce::rows<double, double, signed char>*/,
+                0/*::reduce::rows<double, double, unsigned short>*/,
+                0/*::reduce::rows<double, double, short>*/,
+                0/*::reduce::rows<double, double, int>*/,
+                0/*::reduce::rows<double, double, float>*/,
+                0/*::reduce::rows<double, double, double>*/,
+            }
+        };
+#else // OPENCV_TINY_GPU_MODULE not defined: use the regular table
         static const func_t funcs[7][7] =
         {
             {
@@ -571,7 +729,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 0/*::reduce::rows<unsigned char, int, short>*/,
                 ::reduce::rows<unsigned char, int, int>,
                 ::reduce::rows<unsigned char, float, float>,
-                ::reduce::rows<unsigned char, double, double>
+                ::reduce::rows<unsigned char, double, double>,
             },
             {
                 0/*::reduce::rows<signed char, int, unsigned char>*/,
@@ -580,7 +738,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 0/*::reduce::rows<signed char, int, short>*/,
                 0/*::reduce::rows<signed char, int, int>*/,
                 0/*::reduce::rows<signed char, float, float>*/,
-                0/*::reduce::rows<signed char, double, double>*/
+                0/*::reduce::rows<signed char, double, double>*/,
             },
             {
                 0/*::reduce::rows<unsigned short, int, unsigned char>*/,
@@ -589,7 +747,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 0/*::reduce::rows<unsigned short, int, short>*/,
                 ::reduce::rows<unsigned short, int, int>,
                 ::reduce::rows<unsigned short, float, float>,
-                ::reduce::rows<unsigned short, double, double>
+                ::reduce::rows<unsigned short, double, double>,
             },
             {
                 0/*::reduce::rows<short, int, unsigned char>*/,
@@ -598,7 +756,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 ::reduce::rows<short, int, short>,
                 ::reduce::rows<short, int, int>,
                 ::reduce::rows<short, float, float>,
-                ::reduce::rows<short, double, double>
+                ::reduce::rows<short, double, double>,
             },
             {
                 0/*::reduce::rows<int, int, unsigned char>*/,
@@ -607,7 +765,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 0/*::reduce::rows<int, int, short>*/,
                 ::reduce::rows<int, int, int>,
                 ::reduce::rows<int, float, float>,
-                ::reduce::rows<int, double, double>
+                ::reduce::rows<int, double, double>,
             },
             {
                 0/*::reduce::rows<float, float, unsigned char>*/,
@@ -616,7 +774,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 0/*::reduce::rows<float, float, short>*/,
                 0/*::reduce::rows<float, float, int>*/,
                 ::reduce::rows<float, float, float>,
-                ::reduce::rows<float, double, double>
+                ::reduce::rows<float, double, double>,
             },
             {
                 0/*::reduce::rows<double, double, unsigned char>*/,
@@ -625,9 +783,10 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 0/*::reduce::rows<double, double, short>*/,
                 0/*::reduce::rows<double, double, int>*/,
                 0/*::reduce::rows<double, double, float>*/,
-                ::reduce::rows<double, double, double>
+                ::reduce::rows<double, double, double>,
             }
         };
+#endif
 
         const func_t func = funcs[src.depth()][dst.depth()];
 
@@ -639,6 +798,74 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
     else
     {
         typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only three reduce::cols instantiations are referenced below
+        static const func_t funcs[7][7] = // looked up as funcs[src.depth()][dst.depth()]; 0 means no kernel is referenced for that pair
+        {
+            {
+                ::reduce::cols<unsigned char, int, unsigned char>,
+                0/*::reduce::cols<unsigned char, int, signed char>*/,
+                0/*::reduce::cols<unsigned char, int, unsigned short>*/,
+                0/*::reduce::cols<unsigned char, int, short>*/,
+                0/*::reduce::cols<unsigned char, int, int>*/,
+                ::reduce::cols<unsigned char, float, float>,
+                0/*::reduce::cols<unsigned char, double, double>*/,
+            },
+            {
+                0/*::reduce::cols<signed char, int, unsigned char>*/,
+                0/*::reduce::cols<signed char, int, signed char>*/,
+                0/*::reduce::cols<signed char, int, unsigned short>*/,
+                0/*::reduce::cols<signed char, int, short>*/,
+                0/*::reduce::cols<signed char, int, int>*/,
+                0/*::reduce::cols<signed char, float, float>*/,
+                0/*::reduce::cols<signed char, double, double>*/,
+            },
+            {
+                0/*::reduce::cols<unsigned short, int, unsigned char>*/,
+                0/*::reduce::cols<unsigned short, int, signed char>*/,
+                0/*::reduce::cols<unsigned short, int, unsigned short>*/,
+                0/*::reduce::cols<unsigned short, int, short>*/,
+                0/*::reduce::cols<unsigned short, int, int>*/,
+                0/*::reduce::cols<unsigned short, float, float>*/,
+                0/*::reduce::cols<unsigned short, double, double>*/,
+            },
+            {
+                0/*::reduce::cols<short, int, unsigned char>*/,
+                0/*::reduce::cols<short, int, signed char>*/,
+                0/*::reduce::cols<short, int, unsigned short>*/,
+                0/*::reduce::cols<short, int, short>*/,
+                0/*::reduce::cols<short, int, int>*/,
+                0/*::reduce::cols<short, float, float>*/,
+                0/*::reduce::cols<short, double, double>*/,
+            },
+            {
+                0/*::reduce::cols<int, int, unsigned char>*/,
+                0/*::reduce::cols<int, int, signed char>*/,
+                0/*::reduce::cols<int, int, unsigned short>*/,
+                0/*::reduce::cols<int, int, short>*/,
+                0/*::reduce::cols<int, int, int>*/,
+                0/*::reduce::cols<int, float, float>*/,
+                0/*::reduce::cols<int, double, double>*/,
+            },
+            {
+                0/*::reduce::cols<float, float, unsigned char>*/,
+                0/*::reduce::cols<float, float, signed char>*/,
+                0/*::reduce::cols<float, float, unsigned short>*/,
+                0/*::reduce::cols<float, float, short>*/,
+                0/*::reduce::cols<float, float, int>*/,
+                ::reduce::cols<float, float, float>,
+                0/*::reduce::cols<float, double, double>*/,
+            },
+            {
+                0/*::reduce::cols<double, double, unsigned char>*/,
+                0/*::reduce::cols<double, double, signed char>*/,
+                0/*::reduce::cols<double, double, unsigned short>*/,
+                0/*::reduce::cols<double, double, short>*/,
+                0/*::reduce::cols<double, double, int>*/,
+                0/*::reduce::cols<double, double, float>*/,
+                0/*::reduce::cols<double, double, double>*/,
+            }
+        };
+#else // full build: the pre-existing table follows
         static const func_t funcs[7][7] =
         {
             {
@@ -648,7 +875,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 0/*::reduce::cols<unsigned char, int, short>*/,
                 ::reduce::cols<unsigned char, int, int>,
                 ::reduce::cols<unsigned char, float, float>,
-                ::reduce::cols<unsigned char, double, double>
+                ::reduce::cols<unsigned char, double, double>,
             },
             {
                 0/*::reduce::cols<signed char, int, unsigned char>*/,
@@ -657,7 +884,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 0/*::reduce::cols<signed char, int, short>*/,
                 0/*::reduce::cols<signed char, int, int>*/,
                 0/*::reduce::cols<signed char, float, float>*/,
-                0/*::reduce::cols<signed char, double, double>*/
+                0/*::reduce::cols<signed char, double, double>*/,
             },
             {
                 0/*::reduce::cols<unsigned short, int, unsigned char>*/,
@@ -666,7 +893,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 0/*::reduce::cols<unsigned short, int, short>*/,
                 ::reduce::cols<unsigned short, int, int>,
                 ::reduce::cols<unsigned short, float, float>,
-                ::reduce::cols<unsigned short, double, double>
+                ::reduce::cols<unsigned short, double, double>,
             },
             {
                 0/*::reduce::cols<short, int, unsigned char>*/,
@@ -675,7 +902,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 ::reduce::cols<short, int, short>,
                 ::reduce::cols<short, int, int>,
                 ::reduce::cols<short, float, float>,
-                ::reduce::cols<short, double, double>
+                ::reduce::cols<short, double, double>,
             },
             {
                 0/*::reduce::cols<int, int, unsigned char>*/,
@@ -684,7 +911,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 0/*::reduce::cols<int, int, short>*/,
                 ::reduce::cols<int, int, int>,
                 ::reduce::cols<int, float, float>,
-                ::reduce::cols<int, double, double>
+                ::reduce::cols<int, double, double>,
             },
             {
                 0/*::reduce::cols<float, float, unsigned char>*/,
@@ -693,7 +920,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 0/*::reduce::cols<float, float, short>*/,
                 0/*::reduce::cols<float, float, int>*/,
                 ::reduce::cols<float, float, float>,
-                ::reduce::cols<float, double, double>
+                ::reduce::cols<float, double, double>,
             },
             {
                 0/*::reduce::cols<double, double, unsigned char>*/,
@@ -702,9 +929,10 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
                 0/*::reduce::cols<double, double, short>*/,
                 0/*::reduce::cols<double, double, int>*/,
                 0/*::reduce::cols<double, double, float>*/,
-                ::reduce::cols<double, double, double>
+                ::reduce::cols<double, double, double>,
             }
         };
+#endif
 
         const func_t func = funcs[src.depth()][dst.depth()];
 
diff --git a/modules/gpu/src/pyramids.cpp b/modules/gpu/src/pyramids.cpp
index 85fb99040c..b4d4676587 100644
--- a/modules/gpu/src/pyramids.cpp
+++ b/modules/gpu/src/pyramids.cpp
@@ -68,6 +68,17 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
 
     typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only the first (uchar) and last (float) rows reference pyrDown_gpu kernels
+    static const func_t funcs[6][4] =
+    {
+        {pyrDown_gpu<uchar>      , 0 /*pyrDown_gpu<uchar2>*/ , pyrDown_gpu<uchar3>      , pyrDown_gpu<uchar4>      },
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {pyrDown_gpu<float>      , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3>      , pyrDown_gpu<float4>      }
+    };
+#else // full build: the pre-existing table follows
     static const func_t funcs[6][4] =
     {
         {pyrDown_gpu<uchar>      , 0 /*pyrDown_gpu<uchar2>*/ , pyrDown_gpu<uchar3>      , pyrDown_gpu<uchar4>      },
@@ -77,6 +88,7 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
         {0 /*pyrDown_gpu<int>*/  , 0 /*pyrDown_gpu<int2>*/   , 0 /*pyrDown_gpu<int3>*/  , 0 /*pyrDown_gpu<int4>*/  },
         {pyrDown_gpu<float>      , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3>      , pyrDown_gpu<float4>      }
     };
+#endif
 
     CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
 
@@ -106,6 +118,17 @@ void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
 
     typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only the first (uchar) and last (float) rows reference pyrUp_gpu kernels
+    static const func_t funcs[6][4] =
+    {
+        {pyrUp_gpu<uchar>      , 0 /*pyrUp_gpu<uchar2>*/ , pyrUp_gpu<uchar3>      , pyrUp_gpu<uchar4>      },
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {pyrUp_gpu<float>      , 0 /*pyrUp_gpu<float2>*/ , pyrUp_gpu<float3>      , pyrUp_gpu<float4>      }
+    };
+#else // full build: the pre-existing table follows
     static const func_t funcs[6][4] =
     {
         {pyrUp_gpu<uchar>      , 0 /*pyrUp_gpu<uchar2>*/ , pyrUp_gpu<uchar3>      , pyrUp_gpu<uchar4>      },
@@ -115,6 +138,7 @@ void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
         {0 /*pyrUp_gpu<int>*/  , 0 /*pyrUp_gpu<int2>*/   , 0 /*pyrUp_gpu<int3>*/  , 0 /*pyrUp_gpu<int4>*/  },
         {pyrUp_gpu<float>      , 0 /*pyrUp_gpu<float2>*/ , pyrUp_gpu<float3>      , pyrUp_gpu<float4>      }
     };
+#endif
 
     CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
 
diff --git a/modules/gpu/src/remap.cpp b/modules/gpu/src/remap.cpp
index 4b87286331..3e13c7285c 100644
--- a/modules/gpu/src/remap.cpp
+++ b/modules/gpu/src/remap.cpp
@@ -65,6 +65,17 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
     typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation,
         int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only the first (uchar) and last (float) rows reference remap_gpu kernels
+    static const func_t funcs[6][4] =
+    {
+        {remap_gpu<uchar>      , 0 /*remap_gpu<uchar2>*/ , remap_gpu<uchar3>     , remap_gpu<uchar4>     },
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {remap_gpu<float>      , 0 /*remap_gpu<float2>*/ , remap_gpu<float3>     , remap_gpu<float4>     }
+    };
+#else // full build: the pre-existing table follows
     static const func_t funcs[6][4] =
     {
         {remap_gpu<uchar>      , 0 /*remap_gpu<uchar2>*/ , remap_gpu<uchar3>     , remap_gpu<uchar4>     },
@@ -74,6 +85,7 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
         {0 /*remap_gpu<int>*/  , 0 /*remap_gpu<int2>*/   , 0 /*remap_gpu<int3>*/ , 0 /*remap_gpu<int4>*/ },
         {remap_gpu<float>      , 0 /*remap_gpu<float2>*/ , remap_gpu<float3>     , remap_gpu<float4>     }
     };
+#endif
 
     CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
     CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
diff --git a/modules/gpu/src/resize.cpp b/modules/gpu/src/resize.cpp
index e1b502672a..66a771668d 100644
--- a/modules/gpu/src/resize.cpp
+++ b/modules/gpu/src/resize.cpp
@@ -57,6 +57,18 @@ namespace cv { namespace gpu { namespace device
 void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& stream)
 {
     typedef void (*func_t)(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only the first (uchar) and last (float) rows reference device::resize kernels
+    static const func_t funcs[6][4] =
+    {
+        {device::resize<uchar>      , 0 /*device::resize<uchar2>*/ , device::resize<uchar3>     , device::resize<uchar4>     },
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {0, 0, 0, 0},
+        {device::resize<float>      , 0 /*device::resize<float2>*/ , device::resize<float3>     , device::resize<float4>     }
+    };
+#else // full build: the pre-existing table follows
     static const func_t funcs[6][4] =
     {
         {device::resize<uchar>      , 0 /*device::resize<uchar2>*/ , device::resize<uchar3>     , device::resize<uchar4>     },
@@ -66,6 +78,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
         {0 /*device::resize<int>*/  , 0 /*device::resize<int2>*/   , 0 /*device::resize<int3>*/ , 0 /*device::resize<int4>*/ },
         {device::resize<float>      , 0 /*device::resize<float2>*/ , device::resize<float3>     , device::resize<float4>     }
     };
+#endif
 
     CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
     CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA );
diff --git a/modules/gpu/src/warp.cpp b/modules/gpu/src/warp.cpp
index 827d5219f1..c963235b72 100644
--- a/modules/gpu/src/warp.cpp
+++ b/modules/gpu/src/warp.cpp
@@ -277,6 +277,17 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
         typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
             int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only the first (uchar) and last (float) rows reference warpAffine_gpu kernels
+        static const func_t funcs[6][4] = // looked up as funcs[src.depth()][src.channels() - 1]; a 0 entry fails CV_Assert(func != 0)
+        {
+            {warpAffine_gpu<uchar>      , 0 /*warpAffine_gpu<uchar2>*/ , warpAffine_gpu<uchar3>     , warpAffine_gpu<uchar4>     },
+            {0, 0, 0, 0},
+            {0, 0, 0, 0},
+            {0, 0, 0, 0},
+            {0, 0, 0, 0},
+            {warpAffine_gpu<float>      , 0 /*warpAffine_gpu<float2>*/ , warpAffine_gpu<float3>     , warpAffine_gpu<float4>     }
+        };
+#else // full build: the pre-existing table follows
         static const func_t funcs[6][4] =
         {
             {warpAffine_gpu<uchar>      , 0 /*warpAffine_gpu<uchar2>*/ , warpAffine_gpu<uchar3>     , warpAffine_gpu<uchar4>     },
@@ -286,6 +297,7 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
             {0 /*warpAffine_gpu<int>*/  , 0 /*warpAffine_gpu<int2>*/   , 0 /*warpAffine_gpu<int3>*/ , 0 /*warpAffine_gpu<int4>*/ },
             {warpAffine_gpu<float>      , 0 /*warpAffine_gpu<float2>*/ , warpAffine_gpu<float3>     , warpAffine_gpu<float4>     }
         };
+#endif
 
         const func_t func = funcs[src.depth()][src.channels() - 1];
         CV_Assert(func != 0);
@@ -415,6 +427,17 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
         typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
             int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: only the first (uchar) and last (float) rows reference warpPerspective_gpu kernels
+        static const func_t funcs[6][4] = // looked up as funcs[src.depth()][src.channels() - 1]; a 0 entry fails CV_Assert(func != 0)
+        {
+            {warpPerspective_gpu<uchar>      , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3>     , warpPerspective_gpu<uchar4>     },
+            {0, 0, 0, 0},
+            {0, 0, 0, 0},
+            {0, 0, 0, 0},
+            {0, 0, 0, 0},
+            {warpPerspective_gpu<float>      , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3>     , warpPerspective_gpu<float4>     }
+        };
+#else // full build: the pre-existing table follows
         static const func_t funcs[6][4] =
         {
             {warpPerspective_gpu<uchar>      , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3>     , warpPerspective_gpu<uchar4>     },
@@ -424,6 +447,7 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
             {0 /*warpPerspective_gpu<int>*/  , 0 /*warpPerspective_gpu<int2>*/   , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ },
             {warpPerspective_gpu<float>      , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3>     , warpPerspective_gpu<float4>     }
         };
+#endif
 
         const func_t func = funcs[src.depth()][src.channels() - 1];
         CV_Assert(func != 0);
diff --git a/modules/gpu/test/test_color.cpp b/modules/gpu/test/test_color.cpp
index 5720e0c9d0..6d4c8c5423 100644
--- a/modules/gpu/test/test_color.cpp
+++ b/modules/gpu/test/test_color.cpp
@@ -2285,11 +2285,19 @@ GPU_TEST_P(CvtColor, BayerGR2Gray)
     EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CvtColor tests run for CV_8U and CV_32F only (the full list below also has CV_16U)
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CvtColor, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)),
+    WHOLE_SUBMAT));
+#else // full build
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CvtColor, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F)),
     WHOLE_SUBMAT));
+#endif
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Demosaicing
diff --git a/modules/gpu/test/test_copy_make_border.cpp b/modules/gpu/test/test_copy_make_border.cpp
index 24a75c0235..b06f795411 100644
--- a/modules/gpu/test/test_copy_make_border.cpp
+++ b/modules/gpu/test/test_copy_make_border.cpp
@@ -87,6 +87,20 @@ GPU_TEST_P(CopyMakeBorder, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: CopyMakeBorder tests run for 8U and 32F types with 1, 3 and 4 channels
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1),
+                    MatType(CV_8UC3),
+                    MatType(CV_8UC4),
+                    MatType(CV_32FC1),
+                    MatType(CV_32FC3),
+                    MatType(CV_32FC4)),
+    testing::Values(Border(1), Border(10), Border(50)),
+    ALL_BORDER_TYPES,
+    WHOLE_SUBMAT));
+#else // full build
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -102,5 +116,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
     testing::Values(Border(1), Border(10), Border(50)),
     ALL_BORDER_TYPES,
     WHOLE_SUBMAT));
+#endif
 
 #endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_core.cpp b/modules/gpu/test/test_core.cpp
index b8b83ef10c..dae80c72df 100644
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@@ -1341,11 +1341,19 @@ GPU_TEST_P(Abs, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: Abs tests run for CV_32F only (the full list below also has CV_16S)
+INSTANTIATE_TEST_CASE_P(GPU_Core, Abs, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_32F)),
+    WHOLE_SUBMAT));
+#else // full build
 INSTANTIATE_TEST_CASE_P(GPU_Core, Abs, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     testing::Values(MatDepth(CV_16S), MatDepth(CV_32F)),
     WHOLE_SUBMAT));
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // Sqr
@@ -1381,6 +1389,13 @@ GPU_TEST_P(Sqr, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: Sqr tests run for CV_32F only
+INSTANTIATE_TEST_CASE_P(GPU_Core, Sqr, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_32F)),
+    WHOLE_SUBMAT));
+#else // full build
 INSTANTIATE_TEST_CASE_P(GPU_Core, Sqr, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -1389,6 +1404,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Sqr, testing::Combine(
                     MatDepth(CV_16S),
                     MatDepth(CV_32F)),
     WHOLE_SUBMAT));
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // Sqrt
@@ -1451,6 +1467,13 @@ GPU_TEST_P(Sqrt, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-5);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: Sqrt tests run for CV_32F only
+INSTANTIATE_TEST_CASE_P(GPU_Core, Sqrt, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_32F)),
+    WHOLE_SUBMAT));
+#else // full build
 INSTANTIATE_TEST_CASE_P(GPU_Core, Sqrt, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -1459,6 +1482,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Sqrt, testing::Combine(
                     MatDepth(CV_16S),
                     MatDepth(CV_32F)),
     WHOLE_SUBMAT));
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // Log
@@ -1521,6 +1545,13 @@ GPU_TEST_P(Log, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-6);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: Log tests run for CV_32F only
+INSTANTIATE_TEST_CASE_P(GPU_Core, Log, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_32F)),
+    WHOLE_SUBMAT));
+#else // full build
 INSTANTIATE_TEST_CASE_P(GPU_Core, Log, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -1529,6 +1560,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Log, testing::Combine(
                     MatDepth(CV_16S),
                     MatDepth(CV_32F)),
     WHOLE_SUBMAT));
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // Exp
@@ -1601,6 +1633,13 @@ GPU_TEST_P(Exp, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-2);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: Exp tests run for CV_32F only
+INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_32F)),
+    WHOLE_SUBMAT));
+#else // full build
 INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -1609,6 +1648,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine(
                     MatDepth(CV_16S),
                     MatDepth(CV_32F)),
     WHOLE_SUBMAT));
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // Compare_Array
@@ -1775,12 +1815,21 @@ GPU_TEST_P(Compare_Scalar, Accuracy)
     }
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: explicit 8U/32F types with 1, 3 and 4 channels replace TYPES(CV_8U, CV_64F, 1, 4)
+INSTANTIATE_TEST_CASE_P(GPU_Core, Compare_Scalar, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    CmpCode::all(),
+    WHOLE_SUBMAT));
+#else // full build
 INSTANTIATE_TEST_CASE_P(GPU_Core, Compare_Scalar, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     TYPES(CV_8U, CV_64F, 1, 4),
     CmpCode::all(),
     WHOLE_SUBMAT));
+#endif
 
 //////////////////////////////////////////////////////////////////////////////
 // Bitwise_Array
@@ -1936,11 +1985,19 @@ GPU_TEST_P(Bitwise_Scalar, Xor)
     EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: Bitwise_Scalar tests run for CV_8U with a single channel only
+INSTANTIATE_TEST_CASE_P(GPU_Core, Bitwise_Scalar, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U)),
+    testing::Values(Channels(1))));
+#else // full build: CV_8U, CV_16U and CV_32S over IMAGE_CHANNELS
 INSTANTIATE_TEST_CASE_P(GPU_Core, Bitwise_Scalar, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32S)),
     IMAGE_CHANNELS));
+#endif
 
 //////////////////////////////////////////////////////////////////////////////
 // RShift
@@ -2317,11 +2374,19 @@ GPU_TEST_P(Pow, Accuracy)
     }
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: Pow tests run for CV_32F only instead of ALL_DEPTH
+INSTANTIATE_TEST_CASE_P(GPU_Core, Pow, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_32F)),
+    WHOLE_SUBMAT));
+#else // full build
 INSTANTIATE_TEST_CASE_P(GPU_Core, Pow, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     ALL_DEPTH,
     WHOLE_SUBMAT));
+#endif
 
 //////////////////////////////////////////////////////////////////////////////
 // AddWeighted
@@ -2380,6 +2445,23 @@ GPU_TEST_P(AddWeighted, Accuracy)
     }
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: two instantiations, one with all three depth parameters CV_8U and one with all three CV_32F
+INSTANTIATE_TEST_CASE_P(GPU_Core_1, AddWeighted, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U)),
+    testing::Values(MatDepth(CV_8U)),
+    testing::Values(MatDepth(CV_8U)),
+    WHOLE_SUBMAT));
+
+INSTANTIATE_TEST_CASE_P(GPU_Core_2, AddWeighted, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_32F)),
+    testing::Values(MatDepth(CV_32F)),
+    testing::Values(MatDepth(CV_32F)),
+    WHOLE_SUBMAT));
+#else // full build: a single GPU_Core instantiation
 INSTANTIATE_TEST_CASE_P(GPU_Core, AddWeighted, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -2387,6 +2469,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, AddWeighted, testing::Combine(
     ALL_DEPTH,
     ALL_DEPTH,
     WHOLE_SUBMAT));
+#endif
 
 //////////////////////////////////////////////////////////////////////////////
 // GEMM
@@ -2953,6 +3036,15 @@ GPU_TEST_P(Norm, Accuracy)
     EXPECT_NEAR(val_gold, val, depth < CV_32F ? 0.0 : 1.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: Norm tests run for CV_8U and CV_32F with the same three norm codes as the full build
+INSTANTIATE_TEST_CASE_P(GPU_Core, Norm, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U),
+                    MatDepth(CV_32F)),
+    testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF)),
+    WHOLE_SUBMAT));
+#else // full build
 INSTANTIATE_TEST_CASE_P(GPU_Core, Norm, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -2964,6 +3056,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Norm, testing::Combine(
                     MatDepth(CV_32F)),
     testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF)),
     WHOLE_SUBMAT));
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // normDiff
@@ -3136,11 +3229,19 @@ GPU_TEST_P(Sum, Sqr)
     EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: Sum tests run for CV_8UC1 and CV_32FC1 instead of TYPES(CV_8U, CV_64F, 1, 4)
+INSTANTIATE_TEST_CASE_P(GPU_Core, Sum, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
+    WHOLE_SUBMAT));
+#else // full build
 INSTANTIATE_TEST_CASE_P(GPU_Core, Sum, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     TYPES(CV_8U, CV_64F, 1, 4),
     WHOLE_SUBMAT));
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // MinMax
@@ -3513,11 +3614,19 @@ PARAM_TEST_CASE(Reduce, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels, Reduc
         type = CV_MAKE_TYPE(depth, channels);
 
         if (reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN)
+        {
             dst_depth = depth;
+        }
+#ifndef OPENCV_TINY_GPU_MODULE
         else if (reduceOp == CV_REDUCE_SUM)
+        {
             dst_depth = depth == CV_8U ? CV_32S : depth < CV_64F ? CV_32F : depth;
+        }
+#endif
         else
+        {
             dst_depth = depth < CV_32F ? CV_32F : depth;
+        }
 
         dst_type = CV_MAKE_TYPE(dst_depth, channels);
     }
@@ -3553,6 +3662,16 @@ GPU_TEST_P(Reduce, Cols)
     EXPECT_MAT_NEAR(dst_gold, dst, dst_depth < CV_32F ? 0.0 : 0.02);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE // tiny build: Reduce tests run for CV_8U and CV_32F source depths
+INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U),
+                    MatDepth(CV_32F)),
+    ALL_CHANNELS,
+    ALL_REDUCE_CODES,
+    WHOLE_SUBMAT));
+#else // full build
 INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -3564,6 +3683,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
     ALL_CHANNELS,
     ALL_REDUCE_CODES,
     WHOLE_SUBMAT));
+#endif
 
 //////////////////////////////////////////////////////////////////////////////
 // Normalize
diff --git a/modules/gpu/test/test_features2d.cpp b/modules/gpu/test/test_features2d.cpp
index 697483657e..99d9b2e8f6 100644
--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
@@ -310,6 +310,7 @@ GPU_TEST_P(BruteForceMatcher, Match_Single)
     ASSERT_EQ(0, badCount);
 }
 
+#ifndef OPENCV_TINY_GPU_MODULE
 GPU_TEST_P(BruteForceMatcher, Match_Collection)
 {
     cv::gpu::BFMatcher_GPU matcher(normCode);
@@ -363,6 +364,7 @@ GPU_TEST_P(BruteForceMatcher, Match_Collection)
 
     ASSERT_EQ(0, badCount);
 }
+#endif
 
 GPU_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
 {
@@ -442,6 +444,7 @@ GPU_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
     ASSERT_EQ(0, badCount);
 }
 
+#ifndef OPENCV_TINY_GPU_MODULE
 GPU_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
 {
     cv::gpu::BFMatcher_GPU matcher(normCode);
@@ -565,6 +568,7 @@ GPU_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
 
     ASSERT_EQ(0, badCount);
 }
+#endif
 
 GPU_TEST_P(BruteForceMatcher, RadiusMatch_Single)
 {
@@ -615,6 +619,7 @@ GPU_TEST_P(BruteForceMatcher, RadiusMatch_Single)
     }
 }
 
+#ifndef OPENCV_TINY_GPU_MODULE
 GPU_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
 {
     cv::gpu::BFMatcher_GPU matcher(normCode);
@@ -693,11 +698,20 @@ GPU_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
         ASSERT_EQ(0, badCount);
     }
 }
+#endif
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(NormCode(cv::NORM_L2)),
+    testing::Values(DescriptorSize(57), DescriptorSize(64), DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304)),
+    testing::Values(UseMask(false), UseMask(true))));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine(
     ALL_DEVICES,
     testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2)),
     testing::Values(DescriptorSize(57), DescriptorSize(64), DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304)),
     testing::Values(UseMask(false), UseMask(true))));
+#endif
 
 #endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_filters.cpp b/modules/gpu/test/test_filters.cpp
index cbb6db8a2a..cac3c70d79 100644
--- a/modules/gpu/test/test_filters.cpp
+++ b/modules/gpu/test/test_filters.cpp
@@ -164,6 +164,21 @@ GPU_TEST_P(Sobel, Accuracy)
     EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_Filter, Sobel, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)),
+    IMAGE_CHANNELS,
+    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7))),
+    testing::Values(Deriv_X(0), Deriv_X(1), Deriv_X(2)),
+    testing::Values(Deriv_Y(0), Deriv_Y(1), Deriv_Y(2)),
+    testing::Values(BorderType(cv::BORDER_REFLECT101),
+                    BorderType(cv::BORDER_REPLICATE),
+                    BorderType(cv::BORDER_CONSTANT),
+                    BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_Filter, Sobel, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -177,6 +192,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, Sobel, testing::Combine(
                     BorderType(cv::BORDER_CONSTANT),
                     BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
+#endif
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Scharr
@@ -227,6 +243,20 @@ GPU_TEST_P(Scharr, Accuracy)
     EXPECT_MAT_NEAR(getInnerROI(dst_gold, cv::Size(3, 3)), getInnerROI(dst, cv::Size(3, 3)), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_Filter, Scharr, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)),
+    IMAGE_CHANNELS,
+    testing::Values(Deriv_X(0), Deriv_X(1)),
+    testing::Values(Deriv_Y(0), Deriv_Y(1)),
+    testing::Values(BorderType(cv::BORDER_REFLECT101),
+                    BorderType(cv::BORDER_REPLICATE),
+                    BorderType(cv::BORDER_CONSTANT),
+                    BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_Filter, Scharr, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -239,6 +269,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, Scharr, testing::Combine(
                     BorderType(cv::BORDER_CONSTANT),
                     BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
+#endif
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // GaussianBlur
@@ -301,6 +332,21 @@ GPU_TEST_P(GaussianBlur, Accuracy)
     }
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_Filter, GaussianBlur, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)),
+    IMAGE_CHANNELS,
+    testing::Values(KSize(cv::Size(3, 3)),
+                    KSize(cv::Size(5, 5)),
+                    KSize(cv::Size(7, 7))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101),
+                    BorderType(cv::BORDER_REPLICATE),
+                    BorderType(cv::BORDER_CONSTANT),
+                    BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_Filter, GaussianBlur, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -326,6 +372,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, GaussianBlur, testing::Combine(
                     BorderType(cv::BORDER_CONSTANT),
                     BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
+#endif
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Laplacian
@@ -565,6 +612,16 @@ GPU_TEST_P(Filter2D, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
+    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
+    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -573,5 +630,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
     testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
     testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
+#endif
 
 #endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp
index aa27bfe206..c6c0bf1868 100644
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -357,11 +357,19 @@ GPU_TEST_P(Canny, Accuracy)
     }
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(AppertureSize(3)),
+    testing::Values(L2gradient(false), L2gradient(true)),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine(
     ALL_DEVICES,
     testing::Values(AppertureSize(3), AppertureSize(5)),
     testing::Values(L2gradient(false), L2gradient(true)),
     WHOLE_SUBMAT));
+#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // MeanShift
diff --git a/modules/gpu/test/test_pyramids.cpp b/modules/gpu/test/test_pyramids.cpp
index 6b0540fc10..5ddecf49ac 100644
--- a/modules/gpu/test/test_pyramids.cpp
+++ b/modules/gpu/test/test_pyramids.cpp
@@ -80,11 +80,19 @@ GPU_TEST_P(PyrDown, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
     WHOLE_SUBMAT));
+#endif
 
 ////////////////////////////////////////////////////////
 // pyrUp
@@ -120,10 +128,18 @@ GPU_TEST_P(PyrUp, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
     WHOLE_SUBMAT));
+#endif
 
 #endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_remap.cpp b/modules/gpu/test/test_remap.cpp
index eb4b9ece85..cd0520070b 100644
--- a/modules/gpu/test/test_remap.cpp
+++ b/modules/gpu/test/test_remap.cpp
@@ -169,6 +169,15 @@ GPU_TEST_P(Remap, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-3 : 1.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Remap, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Remap, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -176,5 +185,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Remap, testing::Combine(
     testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
     testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),
     WHOLE_SUBMAT));
+#endif
 
 #endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_resize.cpp b/modules/gpu/test/test_resize.cpp
index 25f0f0e2bb..99cbfec3b4 100644
--- a/modules/gpu/test/test_resize.cpp
+++ b/modules/gpu/test/test_resize.cpp
@@ -174,6 +174,15 @@ GPU_TEST_P(Resize, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Resize, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    testing::Values(0.3, 0.5, 1.5, 2.0),
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Resize, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -181,6 +190,7 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Resize, testing::Combine(
     testing::Values(0.3, 0.5, 1.5, 2.0),
     testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
     WHOLE_SUBMAT));
+#endif
 
 /////////////////
 
@@ -221,6 +231,15 @@ GPU_TEST_P(ResizeSameAsHost, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : src.depth() == CV_8U ? 4.0 : 1.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeSameAsHost, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    testing::Values(0.3, 0.5),
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_AREA)),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeSameAsHost, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -228,7 +247,17 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeSameAsHost, testing::Combine(
     testing::Values(0.3, 0.5),
     testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_AREA)),
     WHOLE_SUBMAT));
+#endif
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc2, ResizeSameAsHost, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    testing::Values(0.3, 0.5, 1.5, 2.0),
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc2, ResizeSameAsHost, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -236,5 +265,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc2, ResizeSameAsHost, testing::Combine(
     testing::Values(0.3, 0.5, 1.5, 2.0),
     testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
     WHOLE_SUBMAT));
+#endif
 
 #endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_threshold.cpp b/modules/gpu/test/test_threshold.cpp
index 52ebd7f592..cd06c17dd9 100644
--- a/modules/gpu/test/test_threshold.cpp
+++ b/modules/gpu/test/test_threshold.cpp
@@ -83,11 +83,20 @@ GPU_TEST_P(Threshold, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Threshold, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
+    ThreshOp::all(),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Threshold, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     testing::Values(MatType(CV_8UC1), MatType(CV_16SC1), MatType(CV_32FC1)),
     ThreshOp::all(),
     WHOLE_SUBMAT));
+#endif
 
 #endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_warp_affine.cpp b/modules/gpu/test/test_warp_affine.cpp
index 43bf0f6d9e..a20bbbeb4d 100644
--- a/modules/gpu/test/test_warp_affine.cpp
+++ b/modules/gpu/test/test_warp_affine.cpp
@@ -222,6 +222,16 @@ GPU_TEST_P(WarpAffine, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-1 : 1.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffine, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    DIRECT_INVERSE,
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffine, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -230,6 +240,7 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffine, testing::Combine(
     testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
     testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),
     WHOLE_SUBMAT));
+#endif
 
 ///////////////////////////////////////////////////////////////////
 // Test NPP
@@ -271,10 +282,18 @@ GPU_TEST_P(WarpAffineNPP, Accuracy)
     EXPECT_MAT_SIMILAR(dst_gold, dst, 2e-2);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffineNPP, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    DIRECT_INVERSE,
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR))));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffineNPP, testing::Combine(
     ALL_DEVICES,
     testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
     DIRECT_INVERSE,
     testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
+#endif
 
 #endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_warp_perspective.cpp b/modules/gpu/test/test_warp_perspective.cpp
index d225e58b66..892704dd30 100644
--- a/modules/gpu/test/test_warp_perspective.cpp
+++ b/modules/gpu/test/test_warp_perspective.cpp
@@ -225,6 +225,16 @@ GPU_TEST_P(WarpPerspective, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-1 : 1.0);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspective, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    DIRECT_INVERSE,
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspective, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
@@ -233,6 +243,7 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspective, testing::Combine(
     testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
     testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),
     WHOLE_SUBMAT));
+#endif
 
 ///////////////////////////////////////////////////////////////////
 // Test NPP
@@ -274,10 +285,18 @@ GPU_TEST_P(WarpPerspectiveNPP, Accuracy)
     EXPECT_MAT_SIMILAR(dst_gold, dst, 2e-2);
 }
 
+#ifdef OPENCV_TINY_GPU_MODULE
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspectiveNPP, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    DIRECT_INVERSE,
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR))));
+#else
 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspectiveNPP, testing::Combine(
     ALL_DEVICES,
     testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
     DIRECT_INVERSE,
     testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
+#endif
 
 #endif // HAVE_CUDA
diff --git a/modules/ts/include/opencv2/ts/gpu_perf.hpp b/modules/ts/include/opencv2/ts/gpu_perf.hpp
index b7b73b7bc5..d74d7ea031 100644
--- a/modules/ts/include/opencv2/ts/gpu_perf.hpp
+++ b/modules/ts/include/opencv2/ts/gpu_perf.hpp
@@ -50,8 +50,13 @@
 
 namespace perf
 {
+#ifdef OPENCV_TINY_GPU_MODULE
+    #define ALL_BORDER_MODES testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT))
+    #define ALL_INTERPOLATIONS testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_AREA))
+#else
     #define ALL_BORDER_MODES BorderMode::all()
     #define ALL_INTERPOLATIONS Interpolation::all()
+#endif
 
     CV_ENUM(BorderMode, BORDER_REFLECT101, BORDER_REPLICATE, BORDER_CONSTANT, BORDER_REFLECT, BORDER_WRAP)
     CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA)
diff --git a/modules/ts/include/opencv2/ts/gpu_test.hpp b/modules/ts/include/opencv2/ts/gpu_test.hpp
index 943a3536ca..01737bc951 100644
--- a/modules/ts/include/opencv2/ts/gpu_test.hpp
+++ b/modules/ts/include/opencv2/ts/gpu_test.hpp
@@ -215,6 +215,12 @@ namespace cvtest
 
     using perf::MatDepth;
 
+#ifdef OPENCV_TINY_GPU_MODULE
+    #define ALL_DEPTH testing::Values(MatDepth(CV_8U), MatDepth(CV_32F))
+
+    #define DEPTH_PAIRS testing::Values(std::make_pair(MatDepth(CV_8U), MatDepth(CV_8U)),   \
+                                        std::make_pair(MatDepth(CV_32F), MatDepth(CV_32F)))
+#else
     #define ALL_DEPTH testing::Values(MatDepth(CV_8U), MatDepth(CV_8S), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32S), MatDepth(CV_32F), MatDepth(CV_64F))
 
     #define DEPTH_PAIRS testing::Values(std::make_pair(MatDepth(CV_8U), MatDepth(CV_8U)),   \
@@ -242,6 +248,7 @@ namespace cvtest
                                         std::make_pair(MatDepth(CV_32F), MatDepth(CV_64F)), \
                                                                                             \
                                         std::make_pair(MatDepth(CV_64F), MatDepth(CV_64F)))
+#endif
 
     // Type
 
@@ -318,7 +325,11 @@ namespace cvtest
     CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA)
 
     CV_ENUM(BorderType, BORDER_REFLECT101, BORDER_REPLICATE, BORDER_CONSTANT, BORDER_REFLECT, BORDER_WRAP)
+#ifdef OPENCV_TINY_GPU_MODULE
+    #define ALL_BORDER_TYPES testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT))
+#else
     #define ALL_BORDER_TYPES testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP))
+#endif
 
     CV_FLAGS(WarpFlags, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, WARP_INVERSE_MAP)