diff --git a/3rdparty/libjpeg-turbo/CMakeLists.txt b/3rdparty/libjpeg-turbo/CMakeLists.txt
index 901669a4a8..3c7f29b08e 100644
--- a/3rdparty/libjpeg-turbo/CMakeLists.txt
+++ b/3rdparty/libjpeg-turbo/CMakeLists.txt
@@ -3,10 +3,10 @@ project(${JPEG_LIBRARY} C)
 ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter -Wsign-compare -Wshorten-64-to-32 -Wimplicit-fallthrough)
 
 set(VERSION_MAJOR 2)
-set(VERSION_MINOR 0)
-set(VERSION_REVISION 6)
+set(VERSION_MINOR 1)
+set(VERSION_REVISION 0)
 set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_REVISION})
-set(LIBJPEG_TURBO_VERSION_NUMBER 2000006)
+set(LIBJPEG_TURBO_VERSION_NUMBER 2001000)
 
 string(TIMESTAMP BUILD "opencv-${OPENCV_VERSION}-libjpeg-turbo")
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -46,7 +46,6 @@ if(UNIX)
   ocv_update(HAVE_UNSIGNED_SHORT 1)
   # undef INCOMPLETE_TYPES_BROKEN
   ocv_update(RIGHT_SHIFT_IS_UNSIGNED 0)
-  ocv_update(__CHAR_UNSIGNED__ 0)
 endif()
 
 
diff --git a/3rdparty/libjpeg-turbo/LICENSE.md b/3rdparty/libjpeg-turbo/LICENSE.md
index 99c9aadcc4..a1cdad52fa 100644
--- a/3rdparty/libjpeg-turbo/LICENSE.md
+++ b/3rdparty/libjpeg-turbo/LICENSE.md
@@ -91,7 +91,7 @@ best of our understanding.
 The Modified (3-clause) BSD License
 ===================================
 
-Copyright (C)2009-2020 D. R. Commander.  All Rights Reserved.
+Copyright (C)2009-2021 D. R. Commander.  All Rights Reserved.<br>
 Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/3rdparty/libjpeg-turbo/README.ijg b/3rdparty/libjpeg-turbo/README.ijg
index d681cf1273..9453c19501 100644
--- a/3rdparty/libjpeg-turbo/README.ijg
+++ b/3rdparty/libjpeg-turbo/README.ijg
@@ -128,7 +128,7 @@ with respect to this software, its quality, accuracy, merchantability, or
 fitness for a particular purpose.  This software is provided "AS IS", and you,
 its user, assume the entire risk as to its quality and accuracy.
 
-This software is copyright (C) 1991-2016, Thomas G. Lane, Guido Vollbeding.
+This software is copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
 All Rights Reserved except as specified below.
 
 Permission is hereby granted to use, copy, modify, and distribute this
@@ -159,19 +159,6 @@ commercial products, provided that all warranty or liability claims are
 assumed by the product vendor.
 
 
-The IJG distribution formerly included code to read and write GIF files.
-To avoid entanglement with the Unisys LZW patent (now expired), GIF reading
-support has been removed altogether, and the GIF writer has been simplified
-to produce "uncompressed GIFs".  This technique does not use the LZW
-algorithm; the resulting GIF files are larger than usual, but are readable
-by all standard GIF decoders.
-
-We are required to state that
-    "The Graphics Interchange Format(c) is the Copyright property of
-    CompuServe Incorporated.  GIF(sm) is a Service Mark property of
-    CompuServe Incorporated."
-
-
 REFERENCES
 ==========
 
diff --git a/3rdparty/libjpeg-turbo/README.md b/3rdparty/libjpeg-turbo/README.md
index 90a4a43ee1..01e391ea7c 100644
--- a/3rdparty/libjpeg-turbo/README.md
+++ b/3rdparty/libjpeg-turbo/README.md
@@ -3,7 +3,7 @@ Background
 
 libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
 baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
-MIPS systems, as well as progressive JPEG compression on x86 and x86-64
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm
 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
 all else being equal.  On other types of systems, libjpeg-turbo can still
 outperform libjpeg by a significant amount, by virtue of its highly-optimized
diff --git a/3rdparty/libjpeg-turbo/jconfig.h.in b/3rdparty/libjpeg-turbo/jconfig.h.in
index 18a69a4814..d4284d97b8 100644
--- a/3rdparty/libjpeg-turbo/jconfig.h.in
+++ b/3rdparty/libjpeg-turbo/jconfig.h.in
@@ -61,11 +61,6 @@
    unsigned. */
 #cmakedefine RIGHT_SHIFT_IS_UNSIGNED 1
 
-/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
-#ifndef __CHAR_UNSIGNED__
-  #cmakedefine __CHAR_UNSIGNED__ 1
-#endif
-
 /* Define to empty if `const' does not conform to ANSI C. */
 /* #undef const */
 
diff --git a/3rdparty/libjpeg-turbo/jconfig.h.win.in b/3rdparty/libjpeg-turbo/jconfig.h.win.in
index 6db0b345b2..13cceef01d 100644
--- a/3rdparty/libjpeg-turbo/jconfig.h.win.in
+++ b/3rdparty/libjpeg-turbo/jconfig.h.win.in
@@ -18,7 +18,6 @@
 #define HAVE_UNSIGNED_SHORT
 #undef INCOMPLETE_TYPES_BROKEN
 #undef RIGHT_SHIFT_IS_UNSIGNED
-#undef __CHAR_UNSIGNED__
 
 /* Define "boolean" as unsigned char, not int, per Windows custom */
 #ifndef __RPCNDR_H__            /* don't conflict if rpcndr.h already read */
diff --git a/3rdparty/libjpeg-turbo/src/jccolext.c b/3rdparty/libjpeg-turbo/src/jccolext.c
index 19c955c9d6..303b322ce6 100644
--- a/3rdparty/libjpeg-turbo/src/jccolext.c
+++ b/3rdparty/libjpeg-turbo/src/jccolext.c
@@ -48,9 +48,9 @@ rgb_ycc_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     outptr2 = output_buf[2][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
+      r = inptr[RGB_RED];
+      g = inptr[RGB_GREEN];
+      b = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
       /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
@@ -100,9 +100,9 @@ rgb_gray_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     outptr = output_buf[0][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
+      r = inptr[RGB_RED];
+      g = inptr[RGB_GREEN];
+      b = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
       /* Y */
       outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
@@ -135,9 +135,9 @@ rgb_rgb_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     outptr2 = output_buf[2][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      outptr0[col] = GETJSAMPLE(inptr[RGB_RED]);
-      outptr1[col] = GETJSAMPLE(inptr[RGB_GREEN]);
-      outptr2[col] = GETJSAMPLE(inptr[RGB_BLUE]);
+      outptr0[col] = inptr[RGB_RED];
+      outptr1[col] = inptr[RGB_GREEN];
+      outptr2[col] = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
     }
   }
diff --git a/3rdparty/libjpeg-turbo/src/jccolor.c b/3rdparty/libjpeg-turbo/src/jccolor.c
index 036f6016d1..bdc563c723 100644
--- a/3rdparty/libjpeg-turbo/src/jccolor.c
+++ b/3rdparty/libjpeg-turbo/src/jccolor.c
@@ -392,11 +392,11 @@ cmyk_ycck_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     outptr3 = output_buf[3][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = MAXJSAMPLE - GETJSAMPLE(inptr[0]);
-      g = MAXJSAMPLE - GETJSAMPLE(inptr[1]);
-      b = MAXJSAMPLE - GETJSAMPLE(inptr[2]);
+      r = MAXJSAMPLE - inptr[0];
+      g = MAXJSAMPLE - inptr[1];
+      b = MAXJSAMPLE - inptr[2];
       /* K passes through as-is */
-      outptr3[col] = inptr[3];  /* don't need GETJSAMPLE here */
+      outptr3[col] = inptr[3];
       inptr += 4;
       /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
@@ -438,7 +438,7 @@ grayscale_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     outptr = output_buf[0][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      outptr[col] = inptr[0];   /* don't need GETJSAMPLE() here */
+      outptr[col] = inptr[0];
       inptr += instride;
     }
   }
@@ -497,7 +497,7 @@ null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
         inptr = *input_buf;
         outptr = output_buf[ci][output_row];
         for (col = 0; col < num_cols; col++) {
-          outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */
+          outptr[col] = inptr[ci];
           inptr += nc;
         }
       }
diff --git a/3rdparty/libjpeg-turbo/src/jcdctmgr.c b/3rdparty/libjpeg-turbo/src/jcdctmgr.c
index c04058e6ce..7dae17a6e1 100644
--- a/3rdparty/libjpeg-turbo/src/jcdctmgr.c
+++ b/3rdparty/libjpeg-turbo/src/jcdctmgr.c
@@ -381,19 +381,19 @@ convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
     elemptr = sample_data[elemr] + start_col;
 
 #if DCTSIZE == 8                /* unroll the inner loop */
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
 #else
     {
       register int elemc;
       for (elemc = DCTSIZE; elemc > 0; elemc--)
-        *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+        *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
     }
 #endif
   }
@@ -533,20 +533,19 @@ convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
   for (elemr = 0; elemr < DCTSIZE; elemr++) {
     elemptr = sample_data[elemr] + start_col;
 #if DCTSIZE == 8                /* unroll the inner loop */
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
 #else
     {
       register int elemc;
       for (elemc = DCTSIZE; elemc > 0; elemc--)
-        *workspaceptr++ = (FAST_FLOAT)
-                          (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+        *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
     }
 #endif
   }
diff --git a/3rdparty/libjpeg-turbo/src/jchuff.c b/3rdparty/libjpeg-turbo/src/jchuff.c
index db85ce114f..2bce767ebd 100644
--- a/3rdparty/libjpeg-turbo/src/jchuff.c
+++ b/3rdparty/libjpeg-turbo/src/jchuff.c
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2014-2016, 2018-2019, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014-2016, 2018-2021, D. R. Commander.
  * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -42,15 +44,19 @@
  * flags (this defines __thumb__).
  */
 
-/* NOTE: Both GCC and Clang define __GNUC__ */
-#if defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))
+#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || \
+    defined(_M_ARM64)  /* _M_ARM, _M_ARM64: MSVC's Arm target macros */
 #if !defined(__thumb__) || defined(__thumb2__)
 #define USE_CLZ_INTRINSIC
 #endif
 #endif
 
 #ifdef USE_CLZ_INTRINSIC
+#if defined(_MSC_VER) && !defined(__clang__)
+#define JPEG_NBITS_NONZERO(x)  (32 - _CountLeadingZeros(x))
+#else
 #define JPEG_NBITS_NONZERO(x)  (32 - __builtin_clz(x))
+#endif
 #define JPEG_NBITS(x)          (x ? JPEG_NBITS_NONZERO(x) : 0)
 #else
 #include "jpeg_nbits_table.h"
@@ -65,32 +71,43 @@
  * but must not be updated permanently until we complete the MCU.
  */
 
+#if defined(__x86_64__) && defined(__ILP32__)  /* x86-64 with ILP32 data model */
+typedef unsigned long long bit_buf_type;  /* bit buffer of encode_one_block() */
+#else
+typedef size_t bit_buf_type;              /* bit buffer of encode_one_block() */
+#endif
+
+/* NOTE: The more optimal Huffman encoding algorithm is only used by the
+ * intrinsics implementation of the Arm Neon SIMD extensions, which is why we
+ * retain the old Huffman encoder behavior when using the GAS implementation.
+ */
+#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__) || \
+                            defined(_M_ARM) || defined(_M_ARM64))
+typedef unsigned long long simd_bit_buf_type;  /* SIMD build, non-Arm target */
+#else  /* WITH_SIMD undefined, or any Arm target */
+typedef bit_buf_type simd_bit_buf_type;
+#endif
+
+#if (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 8) || defined(_WIN64) || \
+    (defined(__x86_64__) && defined(__ILP32__))
+#define BIT_BUF_SIZE  64  /* # of bits in the C encoder's bit buffer */
+#elif (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 4) || defined(_WIN32)
+#define BIT_BUF_SIZE  32  /* # of bits in the C encoder's bit buffer */
+#else
+#error Cannot determine word size
+#endif
+#define SIMD_BIT_BUF_SIZE  (sizeof(simd_bit_buf_type) * 8)  /* also in bits */
+
 typedef struct {
-  size_t put_buffer;                    /* current bit-accumulation buffer */
-  int put_bits;                         /* # of bits now in it */
+  union {
+    bit_buf_type c;                     /* view used by encode_one_block() */
+    simd_bit_buf_type simd;             /* view used when the simd flag is set */
+  } put_buffer;                         /* current bit accumulation buffer */
+  int free_bits;                        /* # of bits available in it */
+                                        /* (Neon GAS: # of bits now in it) */
   int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
 } savable_state;
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest, src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest, src) \
-  ((dest).put_buffer = (src).put_buffer, \
-   (dest).put_bits = (src).put_bits, \
-   (dest).last_dc_val[0] = (src).last_dc_val[0], \
-   (dest).last_dc_val[1] = (src).last_dc_val[1], \
-   (dest).last_dc_val[2] = (src).last_dc_val[2], \
-   (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
 typedef struct {
   struct jpeg_entropy_encoder pub; /* public fields */
 
@@ -123,6 +140,7 @@ typedef struct {
   size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
   savable_state cur;            /* Current bit buffer & DC state */
   j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+  int simd;
 } working_state;
 
 
@@ -201,8 +219,17 @@ start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
   }
 
   /* Initialize bit buffer to empty */
-  entropy->saved.put_buffer = 0;
-  entropy->saved.put_bits = 0;
+  if (entropy->simd) {
+    entropy->saved.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+    entropy->saved.free_bits = 0;
+#else
+    entropy->saved.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+  } else {
+    entropy->saved.put_buffer.c = 0;
+    entropy->saved.free_bits = BIT_BUF_SIZE;
+  }
 
   /* Initialize restart stuff */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -287,6 +314,7 @@ jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC, int tblno,
    * this lets us detect duplicate VAL entries here, and later
    * allows emit_bits to detect any attempt to emit such symbols.
    */
+  MEMZERO(dtbl->ehufco, sizeof(dtbl->ehufco));
   MEMZERO(dtbl->ehufsi, sizeof(dtbl->ehufsi));
 
   /* This is also a convenient place to check for out-of-range
@@ -334,94 +362,94 @@ dump_buffer(working_state *state)
 
 /* Outputting bits to the file */
 
-/* These macros perform the same task as the emit_bits() function in the
- * original libjpeg code.  In addition to reducing overhead by explicitly
- * inlining the code, additional performance is achieved by taking into
- * account the size of the bit buffer and waiting until it is almost full
- * before emptying it.  This mostly benefits 64-bit platforms, since 6
- * bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+/* Output byte b and, speculatively, an additional 0 byte.  0xFF must be
+ * encoded as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the
+ * byte is 0xFF.  Otherwise, the output buffer pointer is advanced by 1, and
+ * the speculative 0 byte will be overwritten by the next byte.
  */
-
-#define EMIT_BYTE() { \
-  JOCTET c; \
-  put_bits -= 8; \
-  c = (JOCTET)GETJOCTET(put_buffer >> put_bits); \
-  *buffer++ = c; \
-  if (c == 0xFF)  /* need to stuff a zero byte? */ \
-    *buffer++ = 0; \
+#define EMIT_BYTE(b) {  /* NOTE: b is evaluated twice */ \
+  buffer[0] = (JOCTET)(b); \
+  buffer[1] = 0;  /* always written, so buffer needs one spare byte */ \
+  buffer -= -2 + ((JOCTET)(b) < 0xFF);  /* += 2 if b is 0xFF, else += 1 */ \
 }
 
-#define PUT_BITS(code, size) { \
-  put_bits += size; \
-  put_buffer = (put_buffer << size) | code; \
-}
+/* Output the entire bit buffer.  If there are no 0xFF bytes in it, then write
+ * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if BIT_BUF_SIZE == 64
 
-#if SIZEOF_SIZE_T != 8 && !defined(_WIN64)
-
-#define CHECKBUF15() { \
-  if (put_bits > 15) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
+#define FLUSH() {  /* 64-bit buffer; uses the caller's put_buffer, buffer */ \
+  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+    EMIT_BYTE(put_buffer >> 56)  /* EMIT_BYTE() adds 0x00 after any 0xFF */ \
+    EMIT_BYTE(put_buffer >> 48) \
+    EMIT_BYTE(put_buffer >> 40) \
+    EMIT_BYTE(put_buffer >> 32) \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else {  /* plain stores, most significant byte first */ \
+    buffer[0] = (JOCTET)(put_buffer >> 56); \
+    buffer[1] = (JOCTET)(put_buffer >> 48); \
+    buffer[2] = (JOCTET)(put_buffer >> 40); \
+    buffer[3] = (JOCTET)(put_buffer >> 32); \
+    buffer[4] = (JOCTET)(put_buffer >> 24); \
+    buffer[5] = (JOCTET)(put_buffer >> 16); \
+    buffer[6] = (JOCTET)(put_buffer >> 8); \
+    buffer[7] = (JOCTET)(put_buffer); \
+    buffer += 8; \
   } \
 }
 
-#endif
-
-#define CHECKBUF31() { \
-  if (put_bits > 31) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-  } \
-}
-
-#define CHECKBUF47() { \
-  if (put_bits > 47) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-  } \
-}
-
-#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T)
-#error Cannot determine word size
-#endif
-
-#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
-
-#define EMIT_BITS(code, size) { \
-  CHECKBUF47() \
-  PUT_BITS(code, size) \
-}
-
-#define EMIT_CODE(code, size) { \
-  temp2 &= (((JLONG)1) << nbits) - 1; \
-  CHECKBUF31() \
-  PUT_BITS(code, size) \
-  PUT_BITS(temp2, nbits) \
-}
-
 #else
 
-#define EMIT_BITS(code, size) { \
-  PUT_BITS(code, size) \
-  CHECKBUF15() \
-}
-
-#define EMIT_CODE(code, size) { \
-  temp2 &= (((JLONG)1) << nbits) - 1; \
-  PUT_BITS(code, size) \
-  CHECKBUF15() \
-  PUT_BITS(temp2, nbits) \
-  CHECKBUF15() \
+#define FLUSH() {  /* 32-bit buffer; uses the caller's put_buffer, buffer */ \
+  if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+    EMIT_BYTE(put_buffer >> 24)  /* EMIT_BYTE() adds 0x00 after any 0xFF */ \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else {  /* plain stores, most significant byte first */ \
+    buffer[0] = (JOCTET)(put_buffer >> 24); \
+    buffer[1] = (JOCTET)(put_buffer >> 16); \
+    buffer[2] = (JOCTET)(put_buffer >> 8); \
+    buffer[3] = (JOCTET)(put_buffer); \
+    buffer += 4; \
+  } \
 }
 
 #endif
 
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ * PUT_BITS() calls this with free_bits < 0; -free_bits bits are left over. */
+#define PUT_AND_FLUSH(code, size) { \
+  put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+  FLUSH() \
+  free_bits += BIT_BUF_SIZE;  /* space left after the leftover bits */ \
+  put_buffer = code;  /* leftover low bits of code start the next buffer */ \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) {  /* uses the caller's put_buffer, free_bits */ \
+  free_bits -= size;  /* negative result: code does not fit entirely */ \
+  if (free_bits < 0) \
+    PUT_AND_FLUSH(code, size) \
+  else  /* code fits (possibly filling the buffer exactly): just append */ \
+    put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size) {  /* reads and clobbers caller's temp, nbits */ \
+  temp &= (((JLONG)1) << nbits) - 1;  /* keep the low nbits of the value */ \
+  temp |= code << nbits;  /* put the Huffman code in front of them */ \
+  nbits += size;  /* combined length of code + value bits */ \
+  PUT_BITS(temp, nbits)  /* emit both with a single buffer update */ \
+}
+
 
 /* Although it is exceedingly rare, it is possible for a Huffman-encoded
  * coefficient block to be larger than the 128-byte unencoded block.  For each
@@ -444,6 +472,7 @@ dump_buffer(working_state *state)
 
 #define STORE_BUFFER() { \
   if (localbuf) { \
+    size_t bytes, bytestocopy; \
     bytes = buffer - _buffer; \
     buffer = _buffer; \
     while (bytes > 0) { \
@@ -466,20 +495,46 @@ dump_buffer(working_state *state)
 LOCAL(boolean)
 flush_bits(working_state *state)
 {
-  JOCTET _buffer[BUFSIZE], *buffer;
-  size_t put_buffer;  int put_bits;
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  JOCTET _buffer[BUFSIZE], *buffer, temp;
+  simd_bit_buf_type put_buffer;  int put_bits;
+  int localbuf = 0;
+
+  if (state->simd) {
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+    put_bits = state->cur.free_bits;
+#else
+    put_bits = SIMD_BIT_BUF_SIZE - state->cur.free_bits;
+#endif
+    put_buffer = state->cur.put_buffer.simd;
+  } else {
+    put_bits = BIT_BUF_SIZE - state->cur.free_bits;
+    put_buffer = state->cur.put_buffer.c;
+  }
 
-  put_buffer = state->cur.put_buffer;
-  put_bits = state->cur.put_bits;
   LOAD_BUFFER()
 
-  /* fill any partial byte with ones */
-  PUT_BITS(0x7F, 7)
-  while (put_bits >= 8) EMIT_BYTE()
+  while (put_bits >= 8) {
+    put_bits -= 8;
+    temp = (JOCTET)(put_buffer >> put_bits);
+    EMIT_BYTE(temp)
+  }
+  if (put_bits) {
+    /* fill partial byte with ones */
+    temp = (JOCTET)((put_buffer << (8 - put_bits)) | (0xFF >> put_bits));
+    EMIT_BYTE(temp)
+  }
 
-  state->cur.put_buffer = 0;    /* and reset bit-buffer to empty */
-  state->cur.put_bits = 0;
+  if (state->simd) {                    /* and reset bit buffer to empty */
+    state->cur.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+    state->cur.free_bits = 0;
+#else
+    state->cur.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+  } else {
+    state->cur.put_buffer.c = 0;
+    state->cur.free_bits = BIT_BUF_SIZE;
+  }
   STORE_BUFFER()
 
   return TRUE;
@@ -493,7 +548,7 @@ encode_one_block_simd(working_state *state, JCOEFPTR block, int last_dc_val,
                       c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
   JOCTET _buffer[BUFSIZE], *buffer;
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  int localbuf = 0;
 
   LOAD_BUFFER()
 
@@ -509,53 +564,41 @@ LOCAL(boolean)
 encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
-  int temp, temp2, temp3;
-  int nbits;
-  int r, code, size;
+  int temp, nbits, free_bits;
+  bit_buf_type put_buffer;
   JOCTET _buffer[BUFSIZE], *buffer;
-  size_t put_buffer;  int put_bits;
-  int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0];
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  int localbuf = 0;
 
-  put_buffer = state->cur.put_buffer;
-  put_bits = state->cur.put_bits;
+  free_bits = state->cur.free_bits;
+  put_buffer = state->cur.put_buffer.c;
   LOAD_BUFFER()
 
   /* Encode the DC coefficient difference per section F.1.2.1 */
 
-  temp = temp2 = block[0] - last_dc_val;
+  temp = block[0] - last_dc_val;
 
   /* This is a well-known technique for obtaining the absolute value without a
    * branch.  It is derived from an assembly language technique presented in
    * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
-   * Agner Fog.
+   * Agner Fog.  This code assumes we are on a two's complement machine.
    */
-  temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-  temp ^= temp3;
-  temp -= temp3;
-
-  /* For a negative input, want temp2 = bitwise complement of abs(input) */
-  /* This code assumes we are on a two's complement machine */
-  temp2 += temp3;
+  nbits = temp >> (CHAR_BIT * sizeof(int) - 1);
+  temp += nbits;
+  nbits ^= temp;
 
   /* Find the number of bits needed for the magnitude of the coefficient */
-  nbits = JPEG_NBITS(temp);
+  nbits = JPEG_NBITS(nbits);
 
-  /* Emit the Huffman-coded symbol for the number of bits */
-  code = dctbl->ehufco[nbits];
-  size = dctbl->ehufsi[nbits];
-  EMIT_BITS(code, size)
-
-  /* Mask off any extra bits in code */
-  temp2 &= (((JLONG)1) << nbits) - 1;
-
-  /* Emit that number of bits of the value, if positive, */
-  /* or the complement of its magnitude, if negative. */
-  EMIT_BITS(temp2, nbits)
+  /* Emit the Huffman-coded symbol for the number of bits.
+   * Emit that number of bits of the value, if positive,
+   * or the complement of its magnitude, if negative.
+   */
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits])
 
   /* Encode the AC coefficients per section F.1.2.2 */
 
-  r = 0;                        /* r = run length of zeros */
+  {
+    int r = 0;                  /* r = run length of zeros */
 
 /* Manually unroll the k loop to eliminate the counter variable.  This
  * improves performance greatly on systems with a limited number of
@@ -563,51 +606,46 @@ encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
  */
 #define kloop(jpeg_natural_order_of_k) { \
   if ((temp = block[jpeg_natural_order_of_k]) == 0) { \
-    r++; \
+    r += 16;  /* r is kept as (run length of zeros) * 16 */ \
   } else { \
-    temp2 = temp; \
     /* Branch-less absolute value, bitwise complement, etc., same as above */ \
-    temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); \
-    temp ^= temp3; \
-    temp -= temp3; \
-    temp2 += temp3; \
-    nbits = JPEG_NBITS_NONZERO(temp); \
+    nbits = temp >> (CHAR_BIT * sizeof(int) - 1);  /* sign mask of temp */ \
+    temp += nbits;  /* value bits: temp, or temp - 1 if temp was negative */ \
+    nbits ^= temp;  /* magnitude of the original coefficient */ \
+    nbits = JPEG_NBITS_NONZERO(nbits); \
     /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
-    while (r > 15) { \
-      EMIT_BITS(code_0xf0, size_0xf0) \
-      r -= 16; \
+    while (r >= 16 * 16) {  /* i.e. 16 or more zeros still pending */ \
+      r -= 16 * 16; \
+      PUT_BITS(actbl->ehufco[0xf0], actbl->ehufsi[0xf0]) \
     } \
     /* Emit Huffman symbol for run length / number of bits */ \
-    temp3 = (r << 4) + nbits; \
-    code = actbl->ehufco[temp3]; \
-    size = actbl->ehufsi[temp3]; \
-    EMIT_CODE(code, size) \
+    r += nbits;  /* (run length * 16) + nbits is the table index */ \
+    PUT_CODE(actbl->ehufco[r], actbl->ehufsi[r]) \
     r = 0; \
   } \
 }
 
-  /* One iteration for each value in jpeg_natural_order[] */
-  kloop(1);   kloop(8);   kloop(16);  kloop(9);   kloop(2);   kloop(3);
-  kloop(10);  kloop(17);  kloop(24);  kloop(32);  kloop(25);  kloop(18);
-  kloop(11);  kloop(4);   kloop(5);   kloop(12);  kloop(19);  kloop(26);
-  kloop(33);  kloop(40);  kloop(48);  kloop(41);  kloop(34);  kloop(27);
-  kloop(20);  kloop(13);  kloop(6);   kloop(7);   kloop(14);  kloop(21);
-  kloop(28);  kloop(35);  kloop(42);  kloop(49);  kloop(56);  kloop(57);
-  kloop(50);  kloop(43);  kloop(36);  kloop(29);  kloop(22);  kloop(15);
-  kloop(23);  kloop(30);  kloop(37);  kloop(44);  kloop(51);  kloop(58);
-  kloop(59);  kloop(52);  kloop(45);  kloop(38);  kloop(31);  kloop(39);
-  kloop(46);  kloop(53);  kloop(60);  kloop(61);  kloop(54);  kloop(47);
-  kloop(55);  kloop(62);  kloop(63);
+    /* One iteration for each value in jpeg_natural_order[] */
+    kloop(1);   kloop(8);   kloop(16);  kloop(9);   kloop(2);   kloop(3);
+    kloop(10);  kloop(17);  kloop(24);  kloop(32);  kloop(25);  kloop(18);
+    kloop(11);  kloop(4);   kloop(5);   kloop(12);  kloop(19);  kloop(26);
+    kloop(33);  kloop(40);  kloop(48);  kloop(41);  kloop(34);  kloop(27);
+    kloop(20);  kloop(13);  kloop(6);   kloop(7);   kloop(14);  kloop(21);
+    kloop(28);  kloop(35);  kloop(42);  kloop(49);  kloop(56);  kloop(57);
+    kloop(50);  kloop(43);  kloop(36);  kloop(29);  kloop(22);  kloop(15);
+    kloop(23);  kloop(30);  kloop(37);  kloop(44);  kloop(51);  kloop(58);
+    kloop(59);  kloop(52);  kloop(45);  kloop(38);  kloop(31);  kloop(39);
+    kloop(46);  kloop(53);  kloop(60);  kloop(61);  kloop(54);  kloop(47);
+    kloop(55);  kloop(62);  kloop(63);
 
-  /* If the last coef(s) were zero, emit an end-of-block code */
-  if (r > 0) {
-    code = actbl->ehufco[0];
-    size = actbl->ehufsi[0];
-    EMIT_BITS(code, size)
+    /* If the last coef(s) were zero, emit an end-of-block code */
+    if (r > 0) {
+      PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+    }
   }
 
-  state->cur.put_buffer = put_buffer;
-  state->cur.put_bits = put_bits;
+  state->cur.put_buffer.c = put_buffer;
+  state->cur.free_bits = free_bits;
   STORE_BUFFER()
 
   return TRUE;
@@ -654,8 +692,9 @@ encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Load up working state */
   state.next_output_byte = cinfo->dest->next_output_byte;
   state.free_in_buffer = cinfo->dest->free_in_buffer;
-  ASSIGN_STATE(state.cur, entropy->saved);
+  state.cur = entropy->saved;
   state.cinfo = cinfo;
+  state.simd = entropy->simd;
 
   /* Emit restart marker if needed */
   if (cinfo->restart_interval) {
@@ -694,7 +733,7 @@ encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Completed MCU, so update state */
   cinfo->dest->next_output_byte = state.next_output_byte;
   cinfo->dest->free_in_buffer = state.free_in_buffer;
-  ASSIGN_STATE(entropy->saved, state.cur);
+  entropy->saved = state.cur;
 
   /* Update restart-interval state too */
   if (cinfo->restart_interval) {
@@ -723,8 +762,9 @@ finish_pass_huff(j_compress_ptr cinfo)
   /* Load up working state ... flush_bits needs it */
   state.next_output_byte = cinfo->dest->next_output_byte;
   state.free_in_buffer = cinfo->dest->free_in_buffer;
-  ASSIGN_STATE(state.cur, entropy->saved);
+  state.cur = entropy->saved;
   state.cinfo = cinfo;
+  state.simd = entropy->simd;
 
   /* Flush out the last data */
   if (!flush_bits(&state))
@@ -733,7 +773,7 @@ finish_pass_huff(j_compress_ptr cinfo)
   /* Update state */
   cinfo->dest->next_output_byte = state.next_output_byte;
   cinfo->dest->free_in_buffer = state.free_in_buffer;
-  ASSIGN_STATE(entropy->saved, state.cur);
+  entropy->saved = state.cur;
 }
 
 
diff --git a/3rdparty/libjpeg-turbo/src/jcphuff.c b/3rdparty/libjpeg-turbo/src/jcphuff.c
index a8b94bed84..bd14fc27d5 100644
--- a/3rdparty/libjpeg-turbo/src/jcphuff.c
+++ b/3rdparty/libjpeg-turbo/src/jcphuff.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, 2015, 2018, D. R. Commander.
+ * Copyright (C) 2011, 2015, 2018, 2021, D. R. Commander.
  * Copyright (C) 2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -51,15 +52,19 @@
  * flags (this defines __thumb__).
  */
 
-/* NOTE: Both GCC and Clang define __GNUC__ */
-#if defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))
+#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || \
+    defined(_M_ARM64)
 #if !defined(__thumb__) || defined(__thumb2__)
 #define USE_CLZ_INTRINSIC
 #endif
 #endif
 
 #ifdef USE_CLZ_INTRINSIC
+#if defined(_MSC_VER) && !defined(__clang__)
+#define JPEG_NBITS_NONZERO(x)  (32 - _CountLeadingZeros(x))
+#else
 #define JPEG_NBITS_NONZERO(x)  (32 - __builtin_clz(x))
+#endif
 #define JPEG_NBITS(x)          (x ? JPEG_NBITS_NONZERO(x) : 0)
 #else
 #include "jpeg_nbits_table.h"
@@ -169,24 +174,26 @@ INLINE
 METHODDEF(int)
 count_zeroes(size_t *x)
 {
-  int result;
 #if defined(HAVE_BUILTIN_CTZL)
+  int result;
   result = __builtin_ctzl(*x);
   *x >>= result;
 #elif defined(HAVE_BITSCANFORWARD64)
+  unsigned long result;
   _BitScanForward64(&result, *x);
   *x >>= result;
 #elif defined(HAVE_BITSCANFORWARD)
+  unsigned long result;
   _BitScanForward(&result, *x);
   *x >>= result;
 #else
-  result = 0;
+  int result = 0;
   while ((*x & 1) == 0) {
     ++result;
     *x >>= 1;
   }
 #endif
-  return result;
+  return (int)result;
 }
 
 
@@ -860,7 +867,7 @@ encode_mcu_AC_refine_prepare(const JCOEF *block,
 
 #define ENCODE_COEFS_AC_REFINE(label) { \
   while (zerobits) { \
-    int idx = count_zeroes(&zerobits); \
+    idx = count_zeroes(&zerobits); \
     r += idx; \
     cabsvalue += idx; \
     signbits >>= idx; \
@@ -917,7 +924,7 @@ METHODDEF(boolean)
 encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
   phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
-  register int temp, r;
+  register int temp, r, idx;
   char *BR_buffer;
   unsigned int BR;
   int Sl = cinfo->Se - cinfo->Ss + 1;
@@ -968,7 +975,7 @@ encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 
   if (zerobits) {
     int diff = ((absvalues + DCTSIZE2 / 2) - cabsvalue);
-    int idx = count_zeroes(&zerobits);
+    idx = count_zeroes(&zerobits);
     signbits >>= idx;
     idx += diff;
     r += idx;
diff --git a/3rdparty/libjpeg-turbo/src/jcsample.c b/3rdparty/libjpeg-turbo/src/jcsample.c
index bd27b84e06..e8515ebf0f 100644
--- a/3rdparty/libjpeg-turbo/src/jcsample.c
+++ b/3rdparty/libjpeg-turbo/src/jcsample.c
@@ -6,7 +6,7 @@
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -103,7 +103,7 @@ expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
       ptr = image_data[row] + input_cols;
-      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
+      pixval = ptr[-1];
       for (count = numcols; count > 0; count--)
         *ptr++ = pixval;
     }
@@ -174,7 +174,7 @@ int_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
       for (v = 0; v < v_expand; v++) {
         inptr = input_data[inrow + v] + outcol_h;
         for (h = 0; h < h_expand; h++) {
-          outvalue += (JLONG)GETJSAMPLE(*inptr++);
+          outvalue += (JLONG)(*inptr++);
         }
       }
       *outptr++ = (JSAMPLE)((outvalue + numpix2) / numpix);
@@ -237,8 +237,7 @@ h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     inptr = input_data[outrow];
     bias = 0;                   /* bias = 0,1,0,1,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
-      *outptr++ =
-        (JSAMPLE)((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1]) + bias) >> 1);
+      *outptr++ = (JSAMPLE)((inptr[0] + inptr[1] + bias) >> 1);
       bias ^= 1;                /* 0=>1, 1=>0 */
       inptr += 2;
     }
@@ -277,8 +276,7 @@ h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     bias = 1;                   /* bias = 1,2,1,2,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
       *outptr++ =
-        (JSAMPLE)((GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                   GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]) + bias) >> 2);
+        (JSAMPLE)((inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1] + bias) >> 2);
       bias ^= 3;                /* 1=>2, 2=>1 */
       inptr0 += 2;  inptr1 += 2;
     }
@@ -337,33 +335,25 @@ h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     below_ptr = input_data[inrow + 2];
 
     /* Special case for first column: pretend column -1 is same as column 0 */
-    membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
-    neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-               GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-               GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[2]) +
-               GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[2]);
+    membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+    neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+               inptr0[0] + inptr0[2] + inptr1[0] + inptr1[2];
     neighsum += neighsum;
-    neighsum += GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[2]) +
-                GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[2]);
+    neighsum += above_ptr[0] + above_ptr[2] + below_ptr[0] + below_ptr[2];
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
     inptr0 += 2;  inptr1 += 2;  above_ptr += 2;  below_ptr += 2;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
       /* sum of pixels directly mapped to this output element */
-      membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                  GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
+      membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
       /* sum of edge-neighbor pixels */
-      neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-                 GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-                 GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[2]) +
-                 GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[2]);
+      neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+                 inptr0[-1] + inptr0[2] + inptr1[-1] + inptr1[2];
       /* The edge-neighbors count twice as much as corner-neighbors */
       neighsum += neighsum;
       /* Add in the corner-neighbors */
-      neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[2]) +
-                  GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[2]);
+      neighsum += above_ptr[-1] + above_ptr[2] + below_ptr[-1] + below_ptr[2];
       /* form final output scaled up by 2^16 */
       membersum = membersum * memberscale + neighsum * neighscale;
       /* round, descale and output it */
@@ -372,15 +362,11 @@ h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     }
 
     /* Special case for last column */
-    membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
-    neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-               GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-               GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[1]) +
-               GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[1]);
+    membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+    neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+               inptr0[-1] + inptr0[1] + inptr1[-1] + inptr1[1];
     neighsum += neighsum;
-    neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[1]) +
-                GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[1]);
+    neighsum += above_ptr[-1] + above_ptr[1] + below_ptr[-1] + below_ptr[1];
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr = (JSAMPLE)((membersum + 32768) >> 16);
 
@@ -429,21 +415,18 @@ fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     below_ptr = input_data[outrow + 1];
 
     /* Special case for first column */
-    colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) +
-             GETJSAMPLE(*inptr);
-    membersum = GETJSAMPLE(*inptr++);
-    nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
-                 GETJSAMPLE(*inptr);
+    colsum = (*above_ptr++) + (*below_ptr++) + inptr[0];
+    membersum = *inptr++;
+    nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
     neighsum = colsum + (colsum - membersum) + nextcolsum;
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
     lastcolsum = colsum;  colsum = nextcolsum;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
-      membersum = GETJSAMPLE(*inptr++);
+      membersum = *inptr++;
       above_ptr++;  below_ptr++;
-      nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
-                   GETJSAMPLE(*inptr);
+      nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
       neighsum = lastcolsum + (colsum - membersum) + nextcolsum;
       membersum = membersum * memberscale + neighsum * neighscale;
       *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
@@ -451,7 +434,7 @@ fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     }
 
     /* Special case for last column */
-    membersum = GETJSAMPLE(*inptr);
+    membersum = *inptr;
     neighsum = lastcolsum + (colsum - membersum) + colsum;
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr = (JSAMPLE)((membersum + 32768) >> 16);
diff --git a/3rdparty/libjpeg-turbo/src/jdapistd.c b/3rdparty/libjpeg-turbo/src/jdapistd.c
index 38bd1110d9..695a620099 100644
--- a/3rdparty/libjpeg-turbo/src/jdapistd.c
+++ b/3rdparty/libjpeg-turbo/src/jdapistd.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2015-2018, 2020, D. R. Commander.
+ * Copyright (C) 2010, 2015-2020, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -319,6 +319,8 @@ read_and_discard_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
 {
   JDIMENSION n;
   my_master_ptr master = (my_master_ptr)cinfo->master;
+  JSAMPLE dummy_sample[1] = { 0 };
+  JSAMPROW dummy_row = dummy_sample;
   JSAMPARRAY scanlines = NULL;
   void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                          JDIMENSION input_row, JSAMPARRAY output_buf,
@@ -329,6 +331,10 @@ read_and_discard_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
   if (cinfo->cconvert && cinfo->cconvert->color_convert) {
     color_convert = cinfo->cconvert->color_convert;
     cinfo->cconvert->color_convert = noop_convert;
+    /* This just prevents UBSan from complaining about adding 0 to a NULL
+     * pointer.  The pointer isn't actually used.
+     */
+    scanlines = &dummy_row;
   }
 
   if (cinfo->cquantize && cinfo->cquantize->color_quantize) {
@@ -532,6 +538,8 @@ jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
          * decoded coefficients.  This is ~5% faster for large subsets, but
          * it's tough to tell a difference for smaller images.
          */
+        if (!cinfo->entropy->insufficient_data)
+          cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
         (*cinfo->entropy->decode_mcu) (cinfo, NULL);
       }
     }
diff --git a/3rdparty/libjpeg-turbo/src/jdarith.c b/3rdparty/libjpeg-turbo/src/jdarith.c
index 6002481e24..7f0d3a785c 100644
--- a/3rdparty/libjpeg-turbo/src/jdarith.c
+++ b/3rdparty/libjpeg-turbo/src/jdarith.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2015 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2018, D. R. Commander.
+ * Copyright (C) 2015-2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -80,7 +80,7 @@ get_byte(j_decompress_ptr cinfo)
     if (!(*src->fill_input_buffer) (cinfo))
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
   src->bytes_in_buffer--;
-  return GETJOCTET(*src->next_input_byte++);
+  return *src->next_input_byte++;
 }
 
 
@@ -665,8 +665,16 @@ bad:
     for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
       int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
       int *coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+      int *prev_coef_bit_ptr =
+        &cinfo->coef_bits[cindex + cinfo->num_components][0];
       if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
         WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+      for (coefi = MIN(cinfo->Ss, 1); coefi <= MAX(cinfo->Se, 9); coefi++) {
+        if (cinfo->input_scan_number > 1)
+          prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
+        else
+          prev_coef_bit_ptr[coefi] = 0;
+      }
       for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
         int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
         if (cinfo->Ah != expected)
@@ -727,6 +735,7 @@ bad:
   entropy->c = 0;
   entropy->a = 0;
   entropy->ct = -16;    /* force reading 2 initial bytes to fill C */
+  entropy->pub.insufficient_data = FALSE;
 
   /* Initialize restart counter */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -763,7 +772,7 @@ jinit_arith_decoder(j_decompress_ptr cinfo)
     int *coef_bit_ptr, ci;
     cinfo->coef_bits = (int (*)[DCTSIZE2])
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                  cinfo->num_components * DCTSIZE2 *
+                                  cinfo->num_components * 2 * DCTSIZE2 *
                                   sizeof(int));
     coef_bit_ptr = &cinfo->coef_bits[0][0];
     for (ci = 0; ci < cinfo->num_components; ci++)
diff --git a/3rdparty/libjpeg-turbo/src/jdcoefct.c b/3rdparty/libjpeg-turbo/src/jdcoefct.c
index 2ba6aa11e4..15e6cded62 100644
--- a/3rdparty/libjpeg-turbo/src/jdcoefct.c
+++ b/3rdparty/libjpeg-turbo/src/jdcoefct.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, 2015-2016, D. R. Commander.
+ * Copyright (C) 2010, 2015-2016, 2019-2020, D. R. Commander.
  * Copyright (C) 2015, 2020, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -102,6 +102,8 @@ decompress_onepass(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
       /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
       jzero_far((void *)coef->MCU_buffer[0],
                 (size_t)(cinfo->blocks_in_MCU * sizeof(JBLOCK)));
+      if (!cinfo->entropy->insufficient_data)
+        cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
       if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
@@ -227,6 +229,8 @@ consume_data(j_decompress_ptr cinfo)
           }
         }
       }
+      if (!cinfo->entropy->insufficient_data)
+        cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
       /* Try to fetch the MCU. */
       if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
@@ -326,19 +330,22 @@ decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 #ifdef BLOCK_SMOOTHING_SUPPORTED
 
 /*
- * This code applies interblock smoothing as described by section K.8
- * of the JPEG standard: the first 5 AC coefficients are estimated from
- * the DC values of a DCT block and its 8 neighboring blocks.
+ * This code applies interblock smoothing; the first 9 AC coefficients are
+ * estimated from the DC values of a DCT block and its 24 neighboring blocks.
  * We apply smoothing only for progressive JPEG decoding, and only if
  * the coefficients it can estimate are not yet known to full precision.
  */
 
-/* Natural-order array positions of the first 5 zigzag-order coefficients */
+/* Natural-order array positions of the first 9 zigzag-order coefficients */
 #define Q01_POS  1
 #define Q10_POS  8
 #define Q20_POS  16
 #define Q11_POS  9
 #define Q02_POS  2
+#define Q03_POS  3
+#define Q12_POS  10
+#define Q21_POS  17
+#define Q30_POS  24
 
 /*
  * Determine whether block smoothing is applicable and safe.
@@ -356,8 +363,8 @@ smoothing_ok(j_decompress_ptr cinfo)
   int ci, coefi;
   jpeg_component_info *compptr;
   JQUANT_TBL *qtable;
-  int *coef_bits;
-  int *coef_bits_latch;
+  int *coef_bits, *prev_coef_bits;
+  int *coef_bits_latch, *prev_coef_bits_latch;
 
   if (!cinfo->progressive_mode || cinfo->coef_bits == NULL)
     return FALSE;
@@ -366,34 +373,47 @@ smoothing_ok(j_decompress_ptr cinfo)
   if (coef->coef_bits_latch == NULL)
     coef->coef_bits_latch = (int *)
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                  cinfo->num_components *
+                                  cinfo->num_components * 2 *
                                   (SAVED_COEFS * sizeof(int)));
   coef_bits_latch = coef->coef_bits_latch;
+  prev_coef_bits_latch =
+    &coef->coef_bits_latch[cinfo->num_components * SAVED_COEFS];
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* All components' quantization values must already be latched. */
     if ((qtable = compptr->quant_table) == NULL)
       return FALSE;
-    /* Verify DC & first 5 AC quantizers are nonzero to avoid zero-divide. */
+    /* Verify DC & first 9 AC quantizers are nonzero to avoid zero-divide. */
     if (qtable->quantval[0] == 0 ||
         qtable->quantval[Q01_POS] == 0 ||
         qtable->quantval[Q10_POS] == 0 ||
         qtable->quantval[Q20_POS] == 0 ||
         qtable->quantval[Q11_POS] == 0 ||
-        qtable->quantval[Q02_POS] == 0)
+        qtable->quantval[Q02_POS] == 0 ||
+        qtable->quantval[Q03_POS] == 0 ||
+        qtable->quantval[Q12_POS] == 0 ||
+        qtable->quantval[Q21_POS] == 0 ||
+        qtable->quantval[Q30_POS] == 0)
       return FALSE;
     /* DC values must be at least partly known for all components. */
     coef_bits = cinfo->coef_bits[ci];
+    prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components];
     if (coef_bits[0] < 0)
       return FALSE;
+    coef_bits_latch[0] = coef_bits[0];
     /* Block smoothing is helpful if some AC coefficients remain inaccurate. */
-    for (coefi = 1; coefi <= 5; coefi++) {
+    for (coefi = 1; coefi < SAVED_COEFS; coefi++) {
+      if (cinfo->input_scan_number > 1)
+        prev_coef_bits_latch[coefi] = prev_coef_bits[coefi];
+      else
+        prev_coef_bits_latch[coefi] = -1;
       coef_bits_latch[coefi] = coef_bits[coefi];
       if (coef_bits[coefi] != 0)
         smoothing_useful = TRUE;
     }
     coef_bits_latch += SAVED_COEFS;
+    prev_coef_bits_latch += SAVED_COEFS;
   }
 
   return smoothing_useful;
@@ -412,17 +432,20 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
   JDIMENSION block_num, last_block_column;
   int ci, block_row, block_rows, access_rows;
   JBLOCKARRAY buffer;
-  JBLOCKROW buffer_ptr, prev_block_row, next_block_row;
+  JBLOCKROW buffer_ptr, prev_prev_block_row, prev_block_row;
+  JBLOCKROW next_block_row, next_next_block_row;
   JSAMPARRAY output_ptr;
   JDIMENSION output_col;
   jpeg_component_info *compptr;
   inverse_DCT_method_ptr inverse_DCT;
-  boolean first_row, last_row;
+  boolean change_dc;
   JCOEF *workspace;
   int *coef_bits;
   JQUANT_TBL *quanttbl;
-  JLONG Q00, Q01, Q02, Q10, Q11, Q20, num;
-  int DC1, DC2, DC3, DC4, DC5, DC6, DC7, DC8, DC9;
+  JLONG Q00, Q01, Q02, Q03 = 0, Q10, Q11, Q12 = 0, Q20, Q21 = 0, Q30 = 0, num;
+  int DC01, DC02, DC03, DC04, DC05, DC06, DC07, DC08, DC09, DC10, DC11, DC12,
+      DC13, DC14, DC15, DC16, DC17, DC18, DC19, DC20, DC21, DC22, DC23, DC24,
+      DC25;
   int Al, pred;
 
   /* Keep a local variable to avoid looking it up more than once */
@@ -434,10 +457,10 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
     if (cinfo->input_scan_number == cinfo->output_scan_number) {
       /* If input is working on current scan, we ordinarily want it to
        * have completed the current row.  But if input scan is DC,
-       * we want it to keep one row ahead so that next block row's DC
+       * we want it to keep two rows ahead so that next two block rows' DC
        * values are up to date.
        */
-      JDIMENSION delta = (cinfo->Ss == 0) ? 1 : 0;
+      JDIMENSION delta = (cinfo->Ss == 0) ? 2 : 0;
       if (cinfo->input_iMCU_row > cinfo->output_iMCU_row + delta)
         break;
     }
@@ -452,34 +475,53 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
     if (!compptr->component_needed)
       continue;
     /* Count non-dummy DCT block rows in this iMCU row. */
-    if (cinfo->output_iMCU_row < last_iMCU_row) {
+    if (cinfo->output_iMCU_row + 1 < last_iMCU_row) {
+      block_rows = compptr->v_samp_factor;
+      access_rows = block_rows * 3; /* this and next two iMCU rows */
+    } else if (cinfo->output_iMCU_row < last_iMCU_row) {
       block_rows = compptr->v_samp_factor;
       access_rows = block_rows * 2; /* this and next iMCU row */
-      last_row = FALSE;
     } else {
       /* NB: can't use last_row_height here; it is input-side-dependent! */
       block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
       if (block_rows == 0) block_rows = compptr->v_samp_factor;
       access_rows = block_rows; /* this iMCU row only */
-      last_row = TRUE;
     }
     /* Align the virtual buffer for this component. */
-    if (cinfo->output_iMCU_row > 0) {
-      access_rows += compptr->v_samp_factor; /* prior iMCU row too */
+    if (cinfo->output_iMCU_row > 1) {
+      access_rows += 2 * compptr->v_samp_factor; /* prior two iMCU rows too */
+      buffer = (*cinfo->mem->access_virt_barray)
+        ((j_common_ptr)cinfo, coef->whole_image[ci],
+         (cinfo->output_iMCU_row - 2) * compptr->v_samp_factor,
+         (JDIMENSION)access_rows, FALSE);
+      buffer += 2 * compptr->v_samp_factor; /* point to current iMCU row */
+    } else if (cinfo->output_iMCU_row > 0) {
       buffer = (*cinfo->mem->access_virt_barray)
         ((j_common_ptr)cinfo, coef->whole_image[ci],
          (cinfo->output_iMCU_row - 1) * compptr->v_samp_factor,
          (JDIMENSION)access_rows, FALSE);
       buffer += compptr->v_samp_factor; /* point to current iMCU row */
-      first_row = FALSE;
     } else {
       buffer = (*cinfo->mem->access_virt_barray)
         ((j_common_ptr)cinfo, coef->whole_image[ci],
          (JDIMENSION)0, (JDIMENSION)access_rows, FALSE);
-      first_row = TRUE;
     }
-    /* Fetch component-dependent info */
-    coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
+    /* Fetch component-dependent info.
+     * If the current scan is incomplete, then we use the component-dependent
+     * info from the previous scan.
+     */
+    if (cinfo->output_iMCU_row > cinfo->master->last_good_iMCU_row)
+      coef_bits =
+        coef->coef_bits_latch + ((ci + cinfo->num_components) * SAVED_COEFS);
+    else
+      coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
+
+    /* We only do DC interpolation if no AC coefficient data is available. */
+    change_dc =
+      coef_bits[1] == -1 && coef_bits[2] == -1 && coef_bits[3] == -1 &&
+      coef_bits[4] == -1 && coef_bits[5] == -1 && coef_bits[6] == -1 &&
+      coef_bits[7] == -1 && coef_bits[8] == -1 && coef_bits[9] == -1;
+
     quanttbl = compptr->quant_table;
     Q00 = quanttbl->quantval[0];
     Q01 = quanttbl->quantval[Q01_POS];
@@ -487,27 +529,51 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
     Q20 = quanttbl->quantval[Q20_POS];
     Q11 = quanttbl->quantval[Q11_POS];
     Q02 = quanttbl->quantval[Q02_POS];
+    if (change_dc) {
+      Q03 = quanttbl->quantval[Q03_POS];
+      Q12 = quanttbl->quantval[Q12_POS];
+      Q21 = quanttbl->quantval[Q21_POS];
+      Q30 = quanttbl->quantval[Q30_POS];
+    }
     inverse_DCT = cinfo->idct->inverse_DCT[ci];
     output_ptr = output_buf[ci];
     /* Loop over all DCT blocks to be processed. */
     for (block_row = 0; block_row < block_rows; block_row++) {
       buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci];
-      if (first_row && block_row == 0)
+
+      if (block_row > 0 || cinfo->output_iMCU_row > 0)
+        prev_block_row =
+          buffer[block_row - 1] + cinfo->master->first_MCU_col[ci];
+      else
         prev_block_row = buffer_ptr;
+
+      if (block_row > 1 || cinfo->output_iMCU_row > 1)
+        prev_prev_block_row =
+          buffer[block_row - 2] + cinfo->master->first_MCU_col[ci];
+      else
+        prev_prev_block_row = prev_block_row;
+
+      if (block_row < block_rows - 1 || cinfo->output_iMCU_row < last_iMCU_row)
+        next_block_row =
+          buffer[block_row + 1] + cinfo->master->first_MCU_col[ci];
       else
-        prev_block_row = buffer[block_row - 1] +
-                         cinfo->master->first_MCU_col[ci];
-      if (last_row && block_row == block_rows - 1)
         next_block_row = buffer_ptr;
+
+      if (block_row < block_rows - 2 ||
+          cinfo->output_iMCU_row + 1 < last_iMCU_row)
+        next_next_block_row =
+          buffer[block_row + 2] + cinfo->master->first_MCU_col[ci];
       else
-        next_block_row = buffer[block_row + 1] +
-                         cinfo->master->first_MCU_col[ci];
+        next_next_block_row = next_block_row;
+
       /* We fetch the surrounding DC values using a sliding-register approach.
-       * Initialize all nine here so as to do the right thing on narrow pics.
+       * Initialize all 25 here so as to do the right thing on narrow pics.
        */
-      DC1 = DC2 = DC3 = (int)prev_block_row[0][0];
-      DC4 = DC5 = DC6 = (int)buffer_ptr[0][0];
-      DC7 = DC8 = DC9 = (int)next_block_row[0][0];
+      DC01 = DC02 = DC03 = DC04 = DC05 = (int)prev_prev_block_row[0][0];
+      DC06 = DC07 = DC08 = DC09 = DC10 = (int)prev_block_row[0][0];
+      DC11 = DC12 = DC13 = DC14 = DC15 = (int)buffer_ptr[0][0];
+      DC16 = DC17 = DC18 = DC19 = DC20 = (int)next_block_row[0][0];
+      DC21 = DC22 = DC23 = DC24 = DC25 = (int)next_next_block_row[0][0];
       output_col = 0;
       last_block_column = compptr->width_in_blocks - 1;
       for (block_num = cinfo->master->first_MCU_col[ci];
@@ -515,18 +581,39 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         /* Fetch current DCT block into workspace so we can modify it. */
         jcopy_block_row(buffer_ptr, (JBLOCKROW)workspace, (JDIMENSION)1);
         /* Update DC values */
-        if (block_num < last_block_column) {
-          DC3 = (int)prev_block_row[1][0];
-          DC6 = (int)buffer_ptr[1][0];
-          DC9 = (int)next_block_row[1][0];
+        if (block_num == cinfo->master->first_MCU_col[ci] &&
+            block_num < last_block_column) {
+          DC04 = (int)prev_prev_block_row[1][0];
+          DC09 = (int)prev_block_row[1][0];
+          DC14 = (int)buffer_ptr[1][0];
+          DC19 = (int)next_block_row[1][0];
+          DC24 = (int)next_next_block_row[1][0];
         }
-        /* Compute coefficient estimates per K.8.
-         * An estimate is applied only if coefficient is still zero,
-         * and is not known to be fully accurate.
+        if (block_num + 1 < last_block_column) {
+          DC05 = (int)prev_prev_block_row[2][0];
+          DC10 = (int)prev_block_row[2][0];
+          DC15 = (int)buffer_ptr[2][0];
+          DC20 = (int)next_block_row[2][0];
+          DC25 = (int)next_next_block_row[2][0];
+        }
+        /* If DC interpolation is enabled, compute coefficient estimates using
+         * a Gaussian-like kernel, keeping the averages of the DC values.
+         *
+         * If DC interpolation is disabled, compute coefficient estimates using
+         * an algorithm similar to the one described in Section K.8 of the JPEG
+         * standard, except applied to a 5x5 window rather than a 3x3 window.
+         *
+         * An estimate is applied only if the coefficient is still zero and is
+         * not known to be fully accurate.
          */
         /* AC01 */
         if ((Al = coef_bits[1]) != 0 && workspace[1] == 0) {
-          num = 36 * Q00 * (DC4 - DC6);
+          num = Q00 * (change_dc ?
+                (-DC01 - DC02 + DC04 + DC05 - 3 * DC06 + 13 * DC07 -
+                 13 * DC09 + 3 * DC10 - 3 * DC11 + 38 * DC12 - 38 * DC14 +
+                 3 * DC15 - 3 * DC16 + 13 * DC17 - 13 * DC19 + 3 * DC20 -
+                 DC21 - DC22 + DC24 + DC25) :
+                (-7 * DC11 + 50 * DC12 - 50 * DC14 + 7 * DC15));
           if (num >= 0) {
             pred = (int)(((Q01 << 7) + num) / (Q01 << 8));
             if (Al > 0 && pred >= (1 << Al))
@@ -541,7 +628,12 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         }
         /* AC10 */
         if ((Al = coef_bits[2]) != 0 && workspace[8] == 0) {
-          num = 36 * Q00 * (DC2 - DC8);
+          num = Q00 * (change_dc ?
+                (-DC01 - 3 * DC02 - 3 * DC03 - 3 * DC04 - DC05 - DC06 +
+                 13 * DC07 + 38 * DC08 + 13 * DC09 - DC10 + DC16 -
+                 13 * DC17 - 38 * DC18 - 13 * DC19 + DC20 + DC21 +
+                 3 * DC22 + 3 * DC23 + 3 * DC24 + DC25) :
+                (-7 * DC03 + 50 * DC08 - 50 * DC18 + 7 * DC23));
           if (num >= 0) {
             pred = (int)(((Q10 << 7) + num) / (Q10 << 8));
             if (Al > 0 && pred >= (1 << Al))
@@ -556,7 +648,10 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         }
         /* AC20 */
         if ((Al = coef_bits[3]) != 0 && workspace[16] == 0) {
-          num = 9 * Q00 * (DC2 + DC8 - 2 * DC5);
+          num = Q00 * (change_dc ?
+                (DC03 + 2 * DC07 + 7 * DC08 + 2 * DC09 - 5 * DC12 - 14 * DC13 -
+                 5 * DC14 + 2 * DC17 + 7 * DC18 + 2 * DC19 + DC23) :
+                (-DC03 + 13 * DC08 - 24 * DC13 + 13 * DC18 - DC23));
           if (num >= 0) {
             pred = (int)(((Q20 << 7) + num) / (Q20 << 8));
             if (Al > 0 && pred >= (1 << Al))
@@ -571,7 +666,11 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         }
         /* AC11 */
         if ((Al = coef_bits[4]) != 0 && workspace[9] == 0) {
-          num = 5 * Q00 * (DC1 - DC3 - DC7 + DC9);
+          num = Q00 * (change_dc ?
+                (-DC01 + DC05 + 9 * DC07 - 9 * DC09 - 9 * DC17 +
+                 9 * DC19 + DC21 - DC25) :
+                (DC10 + DC16 - 10 * DC17 + 10 * DC19 - DC02 - DC20 + DC22 -
+                 DC24 + DC04 - DC06 + 10 * DC07 - 10 * DC09));
           if (num >= 0) {
             pred = (int)(((Q11 << 7) + num) / (Q11 << 8));
             if (Al > 0 && pred >= (1 << Al))
@@ -586,7 +685,10 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         }
         /* AC02 */
         if ((Al = coef_bits[5]) != 0 && workspace[2] == 0) {
-          num = 9 * Q00 * (DC4 + DC6 - 2 * DC5);
+          num = Q00 * (change_dc ?
+                (2 * DC07 - 5 * DC08 + 2 * DC09 + DC11 + 7 * DC12 - 14 * DC13 +
+                 7 * DC14 + DC15 + 2 * DC17 - 5 * DC18 + 2 * DC19) :
+                (-DC11 + 13 * DC12 - 24 * DC13 + 13 * DC14 - DC15));
           if (num >= 0) {
             pred = (int)(((Q02 << 7) + num) / (Q02 << 8));
             if (Al > 0 && pred >= (1 << Al))
@@ -599,14 +701,96 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
           }
           workspace[2] = (JCOEF)pred;
         }
+        if (change_dc) {
+          /* AC03 */
+          if ((Al = coef_bits[6]) != 0 && workspace[3] == 0) {
+            num = Q00 * (DC07 - DC09 + 2 * DC12 - 2 * DC14 + DC17 - DC19);
+            if (num >= 0) {
+              pred = (int)(((Q03 << 7) + num) / (Q03 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+            } else {
+              pred = (int)(((Q03 << 7) - num) / (Q03 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+              pred = -pred;
+            }
+            workspace[3] = (JCOEF)pred;
+          }
+          /* AC12 */
+          if ((Al = coef_bits[7]) != 0 && workspace[10] == 0) {
+            num = Q00 * (DC07 - 3 * DC08 + DC09 - DC17 + 3 * DC18 - DC19);
+            if (num >= 0) {
+              pred = (int)(((Q12 << 7) + num) / (Q12 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+            } else {
+              pred = (int)(((Q12 << 7) - num) / (Q12 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+              pred = -pred;
+            }
+            workspace[10] = (JCOEF)pred;
+          }
+          /* AC21 */
+          if ((Al = coef_bits[8]) != 0 && workspace[17] == 0) {
+            num = Q00 * (DC07 - DC09 - 3 * DC12 + 3 * DC14 + DC17 - DC19);
+            if (num >= 0) {
+              pred = (int)(((Q21 << 7) + num) / (Q21 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+            } else {
+              pred = (int)(((Q21 << 7) - num) / (Q21 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+              pred = -pred;
+            }
+            workspace[17] = (JCOEF)pred;
+          }
+          /* AC30 */
+          if ((Al = coef_bits[9]) != 0 && workspace[24] == 0) {
+            num = Q00 * (DC07 + 2 * DC08 + DC09 - DC17 - 2 * DC18 - DC19);
+            if (num >= 0) {
+              pred = (int)(((Q30 << 7) + num) / (Q30 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+            } else {
+              pred = (int)(((Q30 << 7) - num) / (Q30 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+              pred = -pred;
+            }
+            workspace[24] = (JCOEF)pred;
+          }
+          /* coef_bits[0] is non-negative.  Otherwise this function would not
+           * be called.
+           */
+          num = Q00 *
+                (-2 * DC01 - 6 * DC02 - 8 * DC03 - 6 * DC04 - 2 * DC05 -
+                 6 * DC06 + 6 * DC07 + 42 * DC08 + 6 * DC09 - 6 * DC10 -
+                 8 * DC11 + 42 * DC12 + 152 * DC13 + 42 * DC14 - 8 * DC15 -
+                 6 * DC16 + 6 * DC17 + 42 * DC18 + 6 * DC19 - 6 * DC20 -
+                 2 * DC21 - 6 * DC22 - 8 * DC23 - 6 * DC24 - 2 * DC25);
+          if (num >= 0) {
+            pred = (int)(((Q00 << 7) + num) / (Q00 << 8));
+          } else {
+            pred = (int)(((Q00 << 7) - num) / (Q00 << 8));
+            pred = -pred;
+          }
+          workspace[0] = (JCOEF)pred;
+        }  /* change_dc */
+
         /* OK, do the IDCT */
         (*inverse_DCT) (cinfo, compptr, (JCOEFPTR)workspace, output_ptr,
                         output_col);
         /* Advance for next column */
-        DC1 = DC2;  DC2 = DC3;
-        DC4 = DC5;  DC5 = DC6;
-        DC7 = DC8;  DC8 = DC9;
-        buffer_ptr++, prev_block_row++, next_block_row++;
+        DC01 = DC02;  DC02 = DC03;  DC03 = DC04;  DC04 = DC05;
+        DC06 = DC07;  DC07 = DC08;  DC08 = DC09;  DC09 = DC10;
+        DC11 = DC12;  DC12 = DC13;  DC13 = DC14;  DC14 = DC15;
+        DC16 = DC17;  DC17 = DC18;  DC18 = DC19;  DC19 = DC20;
+        DC21 = DC22;  DC22 = DC23;  DC23 = DC24;  DC24 = DC25;
+        buffer_ptr++, prev_block_row++, next_block_row++,
+          prev_prev_block_row++, next_next_block_row++;
         output_col += compptr->_DCT_scaled_size;
       }
       output_ptr += compptr->_DCT_scaled_size;
@@ -655,7 +839,7 @@ jinit_d_coef_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
 #ifdef BLOCK_SMOOTHING_SUPPORTED
       /* If block smoothing could be used, need a bigger window */
       if (cinfo->progressive_mode)
-        access_rows *= 3;
+        access_rows *= 5;
 #endif
       coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
         ((j_common_ptr)cinfo, JPOOL_IMAGE, TRUE,
diff --git a/3rdparty/libjpeg-turbo/src/jdcoefct.h b/3rdparty/libjpeg-turbo/src/jdcoefct.h
index c4d1943dd4..9a0e780663 100644
--- a/3rdparty/libjpeg-turbo/src/jdcoefct.h
+++ b/3rdparty/libjpeg-turbo/src/jdcoefct.h
@@ -5,6 +5,7 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2020, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  */
@@ -51,7 +52,7 @@ typedef struct {
 #ifdef BLOCK_SMOOTHING_SUPPORTED
   /* When doing block smoothing, we latch coefficient Al values here */
   int *coef_bits_latch;
-#define SAVED_COEFS  6          /* we save coef_bits[0..5] */
+#define SAVED_COEFS  10         /* we save coef_bits[0..9] */
 #endif
 } my_coef_controller;
 
diff --git a/3rdparty/libjpeg-turbo/src/jdcol565.c b/3rdparty/libjpeg-turbo/src/jdcol565.c
index 40068ef84f..53c7bd9187 100644
--- a/3rdparty/libjpeg-turbo/src/jdcol565.c
+++ b/3rdparty/libjpeg-turbo/src/jdcol565.c
@@ -45,9 +45,9 @@ ycc_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr = *output_buf++;
 
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
@@ -58,18 +58,18 @@ ycc_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
       b = range_limit[y + Cbbtab[cb]];
       rgb = PACK_SHORT_565(r, g, b);
 
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
@@ -80,9 +80,9 @@ ycc_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       outptr += 4;
     }
     if (num_cols & 1) {
-      y  = GETJSAMPLE(*inptr0);
-      cb = GETJSAMPLE(*inptr1);
-      cr = GETJSAMPLE(*inptr2);
+      y  = *inptr0;
+      cb = *inptr1;
+      cr = *inptr2;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
@@ -125,9 +125,9 @@ ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -139,9 +139,9 @@ ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -150,9 +150,9 @@ ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_SHORT_565(r, g, b);
 
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -165,9 +165,9 @@ ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       outptr += 4;
     }
     if (num_cols & 1) {
-      y  = GETJSAMPLE(*inptr0);
-      cb = GETJSAMPLE(*inptr1);
-      cr = GETJSAMPLE(*inptr2);
+      y  = *inptr0;
+      cb = *inptr1;
+      cr = *inptr2;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -202,32 +202,32 @@ rgb_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_SHORT_565(r, g, b);
 
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
 
       WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
       outptr += 4;
     }
     if (num_cols & 1) {
-      r = GETJSAMPLE(*inptr0);
-      g = GETJSAMPLE(*inptr1);
-      b = GETJSAMPLE(*inptr2);
+      r = *inptr0;
+      g = *inptr1;
+      b = *inptr2;
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
     }
@@ -259,24 +259,24 @@ rgb_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_SHORT_565(r, g, b);
 
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
 
@@ -284,9 +284,9 @@ rgb_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       outptr += 4;
     }
     if (num_cols & 1) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2), d0)];
+      r = range_limit[DITHER_565_R(*inptr0, d0)];
+      g = range_limit[DITHER_565_G(*inptr1, d0)];
+      b = range_limit[DITHER_565_B(*inptr2, d0)];
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
     }
diff --git a/3rdparty/libjpeg-turbo/src/jdcolext.c b/3rdparty/libjpeg-turbo/src/jdcolext.c
index 72a5301070..863c7a2fbc 100644
--- a/3rdparty/libjpeg-turbo/src/jdcolext.c
+++ b/3rdparty/libjpeg-turbo/src/jdcolext.c
@@ -53,9 +53,9 @@ ycc_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      y  = GETJSAMPLE(inptr0[col]);
-      cb = GETJSAMPLE(inptr1[col]);
-      cr = GETJSAMPLE(inptr2[col]);
+      y  = inptr0[col];
+      cb = inptr1[col];
+      cr = inptr2[col];
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
       outptr[RGB_GREEN] = range_limit[y +
@@ -93,7 +93,6 @@ gray_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     inptr = input_buf[0][input_row++];
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      /* We can dispense with GETJSAMPLE() here */
       outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
       /* Set unused byte to 0xFF so it can be interpreted as an opaque */
       /* alpha channel value */
@@ -128,7 +127,6 @@ rgb_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      /* We can dispense with GETJSAMPLE() here */
       outptr[RGB_RED] = inptr0[col];
       outptr[RGB_GREEN] = inptr1[col];
       outptr[RGB_BLUE] = inptr2[col];
diff --git a/3rdparty/libjpeg-turbo/src/jdcolor.c b/3rdparty/libjpeg-turbo/src/jdcolor.c
index d3ae40c7da..8da2b4eaf2 100644
--- a/3rdparty/libjpeg-turbo/src/jdcolor.c
+++ b/3rdparty/libjpeg-turbo/src/jdcolor.c
@@ -341,9 +341,9 @@ rgb_gray_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr0[col]);
-      g = GETJSAMPLE(inptr1[col]);
-      b = GETJSAMPLE(inptr2[col]);
+      r = inptr0[col];
+      g = inptr1[col];
+      b = inptr2[col];
       /* Y */
       outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
                                ctab[b + B_Y_OFF]) >> SCALEBITS);
@@ -550,9 +550,9 @@ ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      y  = GETJSAMPLE(inptr0[col]);
-      cb = GETJSAMPLE(inptr1[col]);
-      cr = GETJSAMPLE(inptr2[col]);
+      y  = inptr0[col];
+      cb = inptr1[col];
+      cr = inptr2[col];
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];   /* red */
       outptr[1] = range_limit[MAXJSAMPLE - (y +                 /* green */
@@ -560,7 +560,7 @@ ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                                                  SCALEBITS)))];
       outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];   /* blue */
       /* K passes through unchanged */
-      outptr[3] = inptr3[col];  /* don't need GETJSAMPLE here */
+      outptr[3] = inptr3[col];
       outptr += 4;
     }
   }
diff --git a/3rdparty/libjpeg-turbo/src/jdhuff.c b/3rdparty/libjpeg-turbo/src/jdhuff.c
index a1128178b0..f786c10547 100644
--- a/3rdparty/libjpeg-turbo/src/jdhuff.c
+++ b/3rdparty/libjpeg-turbo/src/jdhuff.c
@@ -5,6 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2009-2011, 2016, 2018-2019, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -39,24 +40,6 @@ typedef struct {
   int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
 } savable_state;
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest, src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest, src) \
-  ((dest).last_dc_val[0] = (src).last_dc_val[0], \
-   (dest).last_dc_val[1] = (src).last_dc_val[1], \
-   (dest).last_dc_val[2] = (src).last_dc_val[2], \
-   (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
 typedef struct {
   struct jpeg_entropy_decoder pub; /* public fields */
 
@@ -325,7 +308,7 @@ jpeg_fill_bit_buffer(bitread_working_state *state,
         bytes_in_buffer = cinfo->src->bytes_in_buffer;
       }
       bytes_in_buffer--;
-      c = GETJOCTET(*next_input_byte++);
+      c = *next_input_byte++;
 
       /* If it's 0xFF, check and discard stuffed zero byte */
       if (c == 0xFF) {
@@ -342,7 +325,7 @@ jpeg_fill_bit_buffer(bitread_working_state *state,
             bytes_in_buffer = cinfo->src->bytes_in_buffer;
           }
           bytes_in_buffer--;
-          c = GETJOCTET(*next_input_byte++);
+          c = *next_input_byte++;
         } while (c == 0xFF);
 
         if (c == 0) {
@@ -405,8 +388,8 @@ no_more_bytes:
 
 #define GET_BYTE { \
   register int c0, c1; \
-  c0 = GETJOCTET(*buffer++); \
-  c1 = GETJOCTET(*buffer); \
+  c0 = *buffer++; \
+  c1 = *buffer; \
   /* Pre-execute most common case */ \
   get_buffer = (get_buffer << 8) | c0; \
   bits_left += 8; \
@@ -423,7 +406,7 @@ no_more_bytes:
   } \
 }
 
-#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64) || (defined(__x86_64__) && defined(__ILP32__))
 
 /* Pre-fetch 48 bytes, because the holding register is 64-bit */
 #define FILL_BIT_BUFFER_FAST \
@@ -557,6 +540,12 @@ process_restart(j_decompress_ptr cinfo)
 }
 
 
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("signed-integer-overflow"),
+               no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
 LOCAL(boolean)
 decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
@@ -568,7 +557,7 @@ decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 
   /* Load up working state */
   BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
-  ASSIGN_STATE(state, entropy->saved);
+  state = entropy->saved;
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -589,11 +578,15 @@ decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     if (entropy->dc_needed[blkn]) {
       /* Convert DC difference to actual value, update last_dc_val */
       int ci = cinfo->MCU_membership[blkn];
-      /* This is really just
-       *   s += state.last_dc_val[ci];
-       * It is written this way in order to shut up UBSan.
+      /* Certain malformed JPEG images produce repeated DC coefficient
+       * differences of 2047 or -2047, which causes state.last_dc_val[ci] to
+       * grow until it overflows or underflows a 32-bit signed integer.  This
+       * behavior is, to the best of our understanding, innocuous, and it is
+       * unclear how to work around it without potentially affecting
+       * performance.  Thus, we (hopefully temporarily) suppress UBSan integer
+       * overflow errors for this function.
        */
-      s = (int)((unsigned int)s + (unsigned int)state.last_dc_val[ci]);
+      s += state.last_dc_val[ci];
       state.last_dc_val[ci] = s;
       if (block) {
         /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
@@ -653,7 +646,7 @@ decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 
   /* Completed MCU, so update state */
   BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
-  ASSIGN_STATE(entropy->saved, state);
+  entropy->saved = state;
   return TRUE;
 }
 
@@ -671,7 +664,7 @@ decode_mcu_fast(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Load up working state */
   BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
   buffer = (JOCTET *)br_state.next_input_byte;
-  ASSIGN_STATE(state, entropy->saved);
+  state = entropy->saved;
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -688,7 +681,7 @@ decode_mcu_fast(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 
     if (entropy->dc_needed[blkn]) {
       int ci = cinfo->MCU_membership[blkn];
-      s = (int)((unsigned int)s + (unsigned int)state.last_dc_val[ci]);
+      s += state.last_dc_val[ci];
       state.last_dc_val[ci] = s;
       if (block)
         (*block)[0] = (JCOEF)s;
@@ -740,7 +733,7 @@ decode_mcu_fast(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
   br_state.next_input_byte = buffer;
   BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
-  ASSIGN_STATE(entropy->saved, state);
+  entropy->saved = state;
   return TRUE;
 }
 
@@ -795,7 +788,8 @@ use_slow:
   }
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 }
diff --git a/3rdparty/libjpeg-turbo/src/jdhuff.h b/3rdparty/libjpeg-turbo/src/jdhuff.h
index 6a8d90f402..cfa0b7f558 100644
--- a/3rdparty/libjpeg-turbo/src/jdhuff.h
+++ b/3rdparty/libjpeg-turbo/src/jdhuff.h
@@ -4,7 +4,8 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010-2011, 2015-2016, D. R. Commander.
+ * Copyright (C) 2010-2011, 2015-2016, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -78,6 +79,11 @@ EXTERN(void) jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC,
 typedef size_t bit_buf_type;            /* type of bit-extraction buffer */
 #define BIT_BUF_SIZE  64                /* size of buffer in bits */
 
+#elif defined(__x86_64__) && defined(__ILP32__)
+
+typedef unsigned long long bit_buf_type; /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE  64                 /* size of buffer in bits */
+
 #else
 
 typedef unsigned long bit_buf_type;     /* type of bit-extraction buffer */
@@ -228,7 +234,10 @@ slowlabel: \
       s |= GET_BITS(1); \
       nb++; \
     } \
-    s = htbl->pub->huffval[(int)(s + htbl->valoffset[nb]) & 0xFF]; \
+    if (nb > 16) \
+      s = 0; \
+    else \
+      s = htbl->pub->huffval[(int)(s + htbl->valoffset[nb]) & 0xFF]; \
   }
 
 /* Out-of-line case for Huffman code fetching */
diff --git a/3rdparty/libjpeg-turbo/src/jdicc.c b/3rdparty/libjpeg-turbo/src/jdicc.c
index 7224695816..a1a5b867ae 100644
--- a/3rdparty/libjpeg-turbo/src/jdicc.c
+++ b/3rdparty/libjpeg-turbo/src/jdicc.c
@@ -38,18 +38,18 @@ marker_is_icc(jpeg_saved_marker_ptr marker)
     marker->marker == ICC_MARKER &&
     marker->data_length >= ICC_OVERHEAD_LEN &&
     /* verify the identifying string */
-    GETJOCTET(marker->data[0]) == 0x49 &&
-    GETJOCTET(marker->data[1]) == 0x43 &&
-    GETJOCTET(marker->data[2]) == 0x43 &&
-    GETJOCTET(marker->data[3]) == 0x5F &&
-    GETJOCTET(marker->data[4]) == 0x50 &&
-    GETJOCTET(marker->data[5]) == 0x52 &&
-    GETJOCTET(marker->data[6]) == 0x4F &&
-    GETJOCTET(marker->data[7]) == 0x46 &&
-    GETJOCTET(marker->data[8]) == 0x49 &&
-    GETJOCTET(marker->data[9]) == 0x4C &&
-    GETJOCTET(marker->data[10]) == 0x45 &&
-    GETJOCTET(marker->data[11]) == 0x0;
+    marker->data[0] == 0x49 &&
+    marker->data[1] == 0x43 &&
+    marker->data[2] == 0x43 &&
+    marker->data[3] == 0x5F &&
+    marker->data[4] == 0x50 &&
+    marker->data[5] == 0x52 &&
+    marker->data[6] == 0x4F &&
+    marker->data[7] == 0x46 &&
+    marker->data[8] == 0x49 &&
+    marker->data[9] == 0x4C &&
+    marker->data[10] == 0x45 &&
+    marker->data[11] == 0x0;
 }
 
 
@@ -102,12 +102,12 @@ jpeg_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
   for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) {
     if (marker_is_icc(marker)) {
       if (num_markers == 0)
-        num_markers = GETJOCTET(marker->data[13]);
-      else if (num_markers != GETJOCTET(marker->data[13])) {
+        num_markers = marker->data[13];
+      else if (num_markers != marker->data[13]) {
         WARNMS(cinfo, JWRN_BOGUS_ICC);  /* inconsistent num_markers fields */
         return FALSE;
       }
-      seq_no = GETJOCTET(marker->data[12]);
+      seq_no = marker->data[12];
       if (seq_no <= 0 || seq_no > num_markers) {
         WARNMS(cinfo, JWRN_BOGUS_ICC);  /* bogus sequence number */
         return FALSE;
@@ -154,7 +154,7 @@ jpeg_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
       JOCTET FAR *src_ptr;
       JOCTET *dst_ptr;
       unsigned int length;
-      seq_no = GETJOCTET(marker->data[12]);
+      seq_no = marker->data[12];
       dst_ptr = icc_data + data_offset[seq_no];
       src_ptr = marker->data + ICC_OVERHEAD_LEN;
       length = data_length[seq_no];
diff --git a/3rdparty/libjpeg-turbo/src/jdmarker.c b/3rdparty/libjpeg-turbo/src/jdmarker.c
index c9c7ef6399..b964c3a1a6 100644
--- a/3rdparty/libjpeg-turbo/src/jdmarker.c
+++ b/3rdparty/libjpeg-turbo/src/jdmarker.c
@@ -151,7 +151,7 @@ typedef my_marker_reader *my_marker_ptr;
 #define INPUT_BYTE(cinfo, V, action) \
   MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
             bytes_in_buffer--; \
-            V = GETJOCTET(*next_input_byte++); )
+            V = *next_input_byte++; )
 
 /* As above, but read two bytes interpreted as an unsigned 16-bit integer.
  * V should be declared unsigned int or perhaps JLONG.
@@ -159,10 +159,10 @@ typedef my_marker_reader *my_marker_ptr;
 #define INPUT_2BYTES(cinfo, V, action) \
   MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
             bytes_in_buffer--; \
-            V = ((unsigned int)GETJOCTET(*next_input_byte++)) << 8; \
+            V = ((unsigned int)(*next_input_byte++)) << 8; \
             MAKE_BYTE_AVAIL(cinfo, action); \
             bytes_in_buffer--; \
-            V += GETJOCTET(*next_input_byte++); )
+            V += *next_input_byte++; )
 
 
 /*
@@ -608,18 +608,18 @@ examine_app0(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
   JLONG totallen = (JLONG)datalen + remaining;
 
   if (datalen >= APP0_DATA_LEN &&
-      GETJOCTET(data[0]) == 0x4A &&
-      GETJOCTET(data[1]) == 0x46 &&
-      GETJOCTET(data[2]) == 0x49 &&
-      GETJOCTET(data[3]) == 0x46 &&
-      GETJOCTET(data[4]) == 0) {
+      data[0] == 0x4A &&
+      data[1] == 0x46 &&
+      data[2] == 0x49 &&
+      data[3] == 0x46 &&
+      data[4] == 0) {
     /* Found JFIF APP0 marker: save info */
     cinfo->saw_JFIF_marker = TRUE;
-    cinfo->JFIF_major_version = GETJOCTET(data[5]);
-    cinfo->JFIF_minor_version = GETJOCTET(data[6]);
-    cinfo->density_unit = GETJOCTET(data[7]);
-    cinfo->X_density = (GETJOCTET(data[8]) << 8) + GETJOCTET(data[9]);
-    cinfo->Y_density = (GETJOCTET(data[10]) << 8) + GETJOCTET(data[11]);
+    cinfo->JFIF_major_version = data[5];
+    cinfo->JFIF_minor_version = data[6];
+    cinfo->density_unit = data[7];
+    cinfo->X_density = (data[8] << 8) + data[9];
+    cinfo->Y_density = (data[10] << 8) + data[11];
     /* Check version.
      * Major version must be 1, anything else signals an incompatible change.
      * (We used to treat this as an error, but now it's a nonfatal warning,
@@ -634,24 +634,22 @@ examine_app0(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
              cinfo->JFIF_major_version, cinfo->JFIF_minor_version,
              cinfo->X_density, cinfo->Y_density, cinfo->density_unit);
     /* Validate thumbnail dimensions and issue appropriate messages */
-    if (GETJOCTET(data[12]) | GETJOCTET(data[13]))
-      TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL,
-               GETJOCTET(data[12]), GETJOCTET(data[13]));
+    if (data[12] | data[13])
+      TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL, data[12], data[13]);
     totallen -= APP0_DATA_LEN;
-    if (totallen !=
-        ((JLONG)GETJOCTET(data[12]) * (JLONG)GETJOCTET(data[13]) * (JLONG)3))
+    if (totallen != ((JLONG)data[12] * (JLONG)data[13] * (JLONG)3))
       TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int)totallen);
   } else if (datalen >= 6 &&
-             GETJOCTET(data[0]) == 0x4A &&
-             GETJOCTET(data[1]) == 0x46 &&
-             GETJOCTET(data[2]) == 0x58 &&
-             GETJOCTET(data[3]) == 0x58 &&
-             GETJOCTET(data[4]) == 0) {
+             data[0] == 0x4A &&
+             data[1] == 0x46 &&
+             data[2] == 0x58 &&
+             data[3] == 0x58 &&
+             data[4] == 0) {
     /* Found JFIF "JFXX" extension APP0 marker */
     /* The library doesn't actually do anything with these,
      * but we try to produce a helpful trace message.
      */
-    switch (GETJOCTET(data[5])) {
+    switch (data[5]) {
     case 0x10:
       TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int)totallen);
       break;
@@ -662,8 +660,7 @@ examine_app0(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
       TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int)totallen);
       break;
     default:
-      TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION,
-               GETJOCTET(data[5]), (int)totallen);
+      TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION, data[5], (int)totallen);
       break;
     }
   } else {
@@ -684,16 +681,16 @@ examine_app14(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
   unsigned int version, flags0, flags1, transform;
 
   if (datalen >= APP14_DATA_LEN &&
-      GETJOCTET(data[0]) == 0x41 &&
-      GETJOCTET(data[1]) == 0x64 &&
-      GETJOCTET(data[2]) == 0x6F &&
-      GETJOCTET(data[3]) == 0x62 &&
-      GETJOCTET(data[4]) == 0x65) {
+      data[0] == 0x41 &&
+      data[1] == 0x64 &&
+      data[2] == 0x6F &&
+      data[3] == 0x62 &&
+      data[4] == 0x65) {
     /* Found Adobe APP14 marker */
-    version = (GETJOCTET(data[5]) << 8) + GETJOCTET(data[6]);
-    flags0 = (GETJOCTET(data[7]) << 8) + GETJOCTET(data[8]);
-    flags1 = (GETJOCTET(data[9]) << 8) + GETJOCTET(data[10]);
-    transform = GETJOCTET(data[11]);
+    version = (data[5] << 8) + data[6];
+    flags0 = (data[7] << 8) + data[8];
+    flags1 = (data[9] << 8) + data[10];
+    transform = data[11];
     TRACEMS4(cinfo, 1, JTRC_ADOBE, version, flags0, flags1, transform);
     cinfo->saw_Adobe_marker = TRUE;
     cinfo->Adobe_transform = (UINT8)transform;
diff --git a/3rdparty/libjpeg-turbo/src/jdmaster.c b/3rdparty/libjpeg-turbo/src/jdmaster.c
index b20906438e..cbc8774b1f 100644
--- a/3rdparty/libjpeg-turbo/src/jdmaster.c
+++ b/3rdparty/libjpeg-turbo/src/jdmaster.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2019, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -22,7 +22,6 @@
 #include "jpeglib.h"
 #include "jpegcomp.h"
 #include "jdmaster.h"
-#include "jsimd.h"
 
 
 /*
@@ -70,17 +69,6 @@ use_merged_upsample(j_decompress_ptr cinfo)
       cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
       cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
     return FALSE;
-#ifdef WITH_SIMD
-  /* If YCbCr-to-RGB color conversion is SIMD-accelerated but merged upsampling
-     isn't, then disabling merged upsampling is likely to be faster when
-     decompressing YCbCr JPEG images. */
-  if (!jsimd_can_h2v2_merged_upsample() && !jsimd_can_h2v1_merged_upsample() &&
-      jsimd_can_ycc_rgb() && cinfo->jpeg_color_space == JCS_YCbCr &&
-      (cinfo->out_color_space == JCS_RGB ||
-       (cinfo->out_color_space >= JCS_EXT_RGB &&
-        cinfo->out_color_space <= JCS_EXT_ARGB)))
-    return FALSE;
-#endif
   /* ??? also need to test for upsample-time rescaling, when & if supported */
   return TRUE;                  /* by golly, it'll work... */
 #else
@@ -580,6 +568,7 @@ master_selection(j_decompress_ptr cinfo)
    */
   cinfo->master->first_iMCU_col = 0;
   cinfo->master->last_iMCU_col = cinfo->MCUs_per_row - 1;
+  cinfo->master->last_good_iMCU_row = 0;
 
 #ifdef D_MULTISCAN_FILES_SUPPORTED
   /* If jpeg_start_decompress will read the whole file, initialize
diff --git a/3rdparty/libjpeg-turbo/src/jdmrg565.c b/3rdparty/libjpeg-turbo/src/jdmrg565.c
index 53f1e16700..980a4e216e 100644
--- a/3rdparty/libjpeg-turbo/src/jdmrg565.c
+++ b/3rdparty/libjpeg-turbo/src/jdmrg565.c
@@ -43,20 +43,20 @@ h2v1_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -68,12 +68,12 @@ h2v1_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -115,21 +115,21 @@ h2v1_merged_upsample_565D_internal(j_decompress_ptr cinfo,
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     d0 = DITHER_ROTATE(d0);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -142,12 +142,12 @@ h2v1_merged_upsample_565D_internal(j_decompress_ptr cinfo,
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -189,20 +189,20 @@ h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -211,13 +211,13 @@ h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     WRITE_TWO_PIXELS(outptr0, rgb);
     outptr0 += 4;
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -229,20 +229,20 @@ h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
     *(INT16 *)outptr0 = (INT16)rgb;
 
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -287,21 +287,21 @@ h2v2_merged_upsample_565D_internal(j_decompress_ptr cinfo,
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     d0 = DITHER_ROTATE(d0);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -311,14 +311,14 @@ h2v2_merged_upsample_565D_internal(j_decompress_ptr cinfo,
     WRITE_TWO_PIXELS(outptr0, rgb);
     outptr0 += 4;
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
     d1 = DITHER_ROTATE(d1);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
@@ -331,20 +331,20 @@ h2v2_merged_upsample_565D_internal(j_decompress_ptr cinfo,
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     rgb = PACK_SHORT_565(r, g, b);
     *(INT16 *)outptr0 = (INT16)rgb;
 
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
diff --git a/3rdparty/libjpeg-turbo/src/jdmrgext.c b/3rdparty/libjpeg-turbo/src/jdmrgext.c
index c9a44d8219..9bf4f1a307 100644
--- a/3rdparty/libjpeg-turbo/src/jdmrgext.c
+++ b/3rdparty/libjpeg-turbo/src/jdmrgext.c
@@ -46,13 +46,13 @@ h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -60,7 +60,7 @@ h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr[RGB_ALPHA] = 0xFF;
 #endif
     outptr += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -71,12 +71,12 @@ h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   }
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -120,13 +120,13 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
@@ -134,7 +134,7 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
     outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
@@ -142,7 +142,7 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
     outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
@@ -150,7 +150,7 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr1[RGB_ALPHA] = 0xFF;
 #endif
     outptr1 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
@@ -161,19 +161,19 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   }
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
diff --git a/3rdparty/libjpeg-turbo/src/jdphuff.c b/3rdparty/libjpeg-turbo/src/jdphuff.c
index 9e82636bbd..c6d82ca14b 100644
--- a/3rdparty/libjpeg-turbo/src/jdphuff.c
+++ b/3rdparty/libjpeg-turbo/src/jdphuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, 2018, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018-2021, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -41,25 +41,6 @@ typedef struct {
   int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
 } savable_state;
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest, src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest, src) \
-  ((dest).EOBRUN = (src).EOBRUN, \
-   (dest).last_dc_val[0] = (src).last_dc_val[0], \
-   (dest).last_dc_val[1] = (src).last_dc_val[1], \
-   (dest).last_dc_val[2] = (src).last_dc_val[2], \
-   (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
 typedef struct {
   struct jpeg_entropy_decoder pub; /* public fields */
 
@@ -102,7 +83,7 @@ start_pass_phuff_decoder(j_decompress_ptr cinfo)
   boolean is_DC_band, bad;
   int ci, coefi, tbl;
   d_derived_tbl **pdtbl;
-  int *coef_bit_ptr;
+  int *coef_bit_ptr, *prev_coef_bit_ptr;
   jpeg_component_info *compptr;
 
   is_DC_band = (cinfo->Ss == 0);
@@ -143,8 +124,15 @@ start_pass_phuff_decoder(j_decompress_ptr cinfo)
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     int cindex = cinfo->cur_comp_info[ci]->component_index;
     coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+    prev_coef_bit_ptr = &cinfo->coef_bits[cindex + cinfo->num_components][0];
     if (!is_DC_band && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
       WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+    for (coefi = MIN(cinfo->Ss, 1); coefi <= MAX(cinfo->Se, 9); coefi++) {
+      if (cinfo->input_scan_number > 1)
+        prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
+      else
+        prev_coef_bit_ptr[coefi] = 0;
+    }
     for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
       int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
       if (cinfo->Ah != expected)
@@ -323,7 +311,7 @@ decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 
     /* Load up working state */
     BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
-    ASSIGN_STATE(state, entropy->saved);
+    state = entropy->saved;
 
     /* Outer loop handles each block in the MCU */
 
@@ -356,11 +344,12 @@ decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 
     /* Completed MCU, so update state */
     BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
-    ASSIGN_STATE(entropy->saved, state);
+    entropy->saved = state;
   }
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 }
@@ -444,7 +433,8 @@ decode_mcu_AC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   }
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 }
@@ -495,7 +485,8 @@ decode_mcu_DC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 }
@@ -638,7 +629,8 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   }
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 
@@ -676,7 +668,7 @@ jinit_phuff_decoder(j_decompress_ptr cinfo)
   /* Create progression status table */
   cinfo->coef_bits = (int (*)[DCTSIZE2])
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                cinfo->num_components * DCTSIZE2 *
+                                cinfo->num_components * 2 * DCTSIZE2 *
                                 sizeof(int));
   coef_bit_ptr = &cinfo->coef_bits[0][0];
   for (ci = 0; ci < cinfo->num_components; ci++)
diff --git a/3rdparty/libjpeg-turbo/src/jdsample.c b/3rdparty/libjpeg-turbo/src/jdsample.c
index 50a68b3013..eaad72a030 100644
--- a/3rdparty/libjpeg-turbo/src/jdsample.c
+++ b/3rdparty/libjpeg-turbo/src/jdsample.c
@@ -8,7 +8,7 @@
  * Copyright (C) 2010, 2015-2016, D. R. Commander.
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2015, Google, Inc.
- * Copyright (C) 2019, Arm Limited.
+ * Copyright (C) 2019-2020, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -177,7 +177,7 @@ int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       for (h = h_expand; h > 0; h--) {
         *outptr++ = invalue;
       }
@@ -213,7 +213,7 @@ h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     outptr = output_data[inrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
@@ -242,7 +242,7 @@ h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
@@ -283,20 +283,20 @@ h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     inptr = input_data[inrow];
     outptr = output_data[inrow];
     /* Special case for first column */
-    invalue = GETJSAMPLE(*inptr++);
+    invalue = *inptr++;
     *outptr++ = (JSAMPLE)invalue;
-    *outptr++ = (JSAMPLE)((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2);
+    *outptr++ = (JSAMPLE)((invalue * 3 + inptr[0] + 2) >> 2);
 
     for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
       /* General case: 3/4 * nearer pixel + 1/4 * further pixel */
-      invalue = GETJSAMPLE(*inptr++) * 3;
-      *outptr++ = (JSAMPLE)((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2);
-      *outptr++ = (JSAMPLE)((invalue + GETJSAMPLE(*inptr) + 2) >> 2);
+      invalue = (*inptr++) * 3;
+      *outptr++ = (JSAMPLE)((invalue + inptr[-2] + 1) >> 2);
+      *outptr++ = (JSAMPLE)((invalue + inptr[0] + 2) >> 2);
     }
 
     /* Special case for last column */
-    invalue = GETJSAMPLE(*inptr);
-    *outptr++ = (JSAMPLE)((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
+    invalue = *inptr;
+    *outptr++ = (JSAMPLE)((invalue * 3 + inptr[-1] + 1) >> 2);
     *outptr++ = (JSAMPLE)invalue;
   }
 }
@@ -338,7 +338,7 @@ h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
       outptr = output_data[outrow++];
 
       for (colctr = 0; colctr < compptr->downsampled_width; colctr++) {
-        thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+        thiscolsum = (*inptr0++) * 3 + (*inptr1++);
         *outptr++ = (JSAMPLE)((thiscolsum + bias) >> 2);
       }
     }
@@ -381,8 +381,8 @@ h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
       outptr = output_data[outrow++];
 
       /* Special case for first column */
-      thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-      nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+      thiscolsum = (*inptr0++) * 3 + (*inptr1++);
+      nextcolsum = (*inptr0++) * 3 + (*inptr1++);
       *outptr++ = (JSAMPLE)((thiscolsum * 4 + 8) >> 4);
       *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
       lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
@@ -390,7 +390,7 @@ h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
       for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
         /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
         /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
-        nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+        nextcolsum = (*inptr0++) * 3 + (*inptr1++);
         *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
         *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
         lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
@@ -477,7 +477,13 @@ jinit_upsampler(j_decompress_ptr cinfo)
     } else if (h_in_group == h_out_group &&
                v_in_group * 2 == v_out_group && do_fancy) {
       /* Non-fancy upsampling is handled by the generic method */
-      upsample->methods[ci] = h1v2_fancy_upsample;
+#if defined(__arm__) || defined(__aarch64__) || \
+    defined(_M_ARM) || defined(_M_ARM64)
+      if (jsimd_can_h1v2_fancy_upsample())
+        upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
+      else
+#endif
+        upsample->methods[ci] = h1v2_fancy_upsample;
       upsample->pub.need_context_rows = TRUE;
     } else if (h_in_group * 2 == h_out_group &&
                v_in_group * 2 == v_out_group) {
diff --git a/3rdparty/libjpeg-turbo/src/jerror.h b/3rdparty/libjpeg-turbo/src/jerror.h
index 933a3690fd..4476df2c93 100644
--- a/3rdparty/libjpeg-turbo/src/jerror.h
+++ b/3rdparty/libjpeg-turbo/src/jerror.h
@@ -207,6 +207,10 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 #endif
 #endif
 JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker")
+#if JPEG_LIB_VERSION < 70
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+         "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
+#endif
 
 #ifdef JMAKE_ENUM_LIST
 
@@ -252,6 +256,15 @@ JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker")
    (cinfo)->err->msg_parm.i[2] = (p3), \
    (cinfo)->err->msg_parm.i[3] = (p4), \
    (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT6(cinfo, code, p1, p2, p3, p4, p5, p6) \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (cinfo)->err->msg_parm.i[2] = (p3), \
+   (cinfo)->err->msg_parm.i[3] = (p4), \
+   (cinfo)->err->msg_parm.i[4] = (p5), \
+   (cinfo)->err->msg_parm.i[5] = (p6), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
 #define ERREXITS(cinfo, code, str) \
   ((cinfo)->err->msg_code = (code), \
    strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
diff --git a/3rdparty/libjpeg-turbo/src/jidctint.c b/3rdparty/libjpeg-turbo/src/jidctint.c
index 50f385da33..bb08748019 100644
--- a/3rdparty/libjpeg-turbo/src/jidctint.c
+++ b/3rdparty/libjpeg-turbo/src/jidctint.c
@@ -3,7 +3,7 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
- * Modification developed 2002-2009 by Guido Vollbeding.
+ * Modification developed 2002-2018 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -417,7 +417,7 @@ jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
 /*
  * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a 7x7 output block.
+ * producing a reduced-size 7x7 output block.
  *
  * Optimized algorithm with 12 multiplications in the 1-D kernel.
  * cK represents sqrt(2) * cos(K*pi/14).
@@ -1258,7 +1258,7 @@ jpeg_idct_10x10(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
 /*
  * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a 11x11 output block.
+ * producing an 11x11 output block.
  *
  * Optimized algorithm with 24 multiplications in the 1-D kernel.
  * cK represents sqrt(2) * cos(K*pi/22).
@@ -2398,7 +2398,7 @@ jpeg_idct_16x16(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp0 += 1 << (CONST_BITS - PASS1_BITS - 1);
+    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
     z1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
diff --git a/3rdparty/libjpeg-turbo/src/jmorecfg.h b/3rdparty/libjpeg-turbo/src/jmorecfg.h
index aa29f0f9f1..fb3a9cf411 100644
--- a/3rdparty/libjpeg-turbo/src/jmorecfg.h
+++ b/3rdparty/libjpeg-turbo/src/jmorecfg.h
@@ -43,25 +43,11 @@
 
 #if BITS_IN_JSAMPLE == 8
 /* JSAMPLE should be the smallest type that will hold the values 0..255.
- * You can use a signed char by having GETJSAMPLE mask it with 0xFF.
  */
 
-#ifdef HAVE_UNSIGNED_CHAR
-
 typedef unsigned char JSAMPLE;
 #define GETJSAMPLE(value)  ((int)(value))
 
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JSAMPLE;
-#ifdef __CHAR_UNSIGNED__
-#define GETJSAMPLE(value)  ((int)(value))
-#else
-#define GETJSAMPLE(value)  ((int)(value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
-
 #define MAXJSAMPLE      255
 #define CENTERJSAMPLE   128
 
@@ -97,22 +83,9 @@ typedef short JCOEF;
  * managers, this is also the data type passed to fread/fwrite.
  */
 
-#ifdef HAVE_UNSIGNED_CHAR
-
 typedef unsigned char JOCTET;
 #define GETJOCTET(value)  (value)
 
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JOCTET;
-#ifdef __CHAR_UNSIGNED__
-#define GETJOCTET(value)  (value)
-#else
-#define GETJOCTET(value)  ((value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
-
 
 /* These typedefs are used for various table entries and so forth.
  * They must be at least as wide as specified; but making them too big
@@ -123,15 +96,7 @@ typedef char JOCTET;
 
 /* UINT8 must hold at least the values 0..255. */
 
-#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char UINT8;
-#else /* not HAVE_UNSIGNED_CHAR */
-#ifdef __CHAR_UNSIGNED__
-typedef char UINT8;
-#else /* not __CHAR_UNSIGNED__ */
-typedef short UINT8;
-#endif /* __CHAR_UNSIGNED__ */
-#endif /* HAVE_UNSIGNED_CHAR */
 
 /* UINT16 must hold at least the values 0..65535. */
 
diff --git a/3rdparty/libjpeg-turbo/src/jpegint.h b/3rdparty/libjpeg-turbo/src/jpegint.h
index ad36ca8b56..195fbcb9b6 100644
--- a/3rdparty/libjpeg-turbo/src/jpegint.h
+++ b/3rdparty/libjpeg-turbo/src/jpegint.h
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, D. R. Commander.
+ * Copyright (C) 2015-2016, 2019, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -158,6 +158,9 @@ struct jpeg_decomp_master {
   JDIMENSION first_MCU_col[MAX_COMPONENTS];
   JDIMENSION last_MCU_col[MAX_COMPONENTS];
   boolean jinit_upsampler_no_alloc;
+
+  /* Last iMCU row that was successfully decoded */
+  JDIMENSION last_good_iMCU_row;
 };
 
 /* Input control module */
diff --git a/3rdparty/libjpeg-turbo/src/jquant1.c b/3rdparty/libjpeg-turbo/src/jquant1.c
index 40bbb28cc7..73b83e16e5 100644
--- a/3rdparty/libjpeg-turbo/src/jquant1.c
+++ b/3rdparty/libjpeg-turbo/src/jquant1.c
@@ -479,7 +479,7 @@ color_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     for (col = width; col > 0; col--) {
       pixcode = 0;
       for (ci = 0; ci < nc; ci++) {
-        pixcode += GETJSAMPLE(colorindex[ci][GETJSAMPLE(*ptrin++)]);
+        pixcode += colorindex[ci][*ptrin++];
       }
       *ptrout++ = (JSAMPLE)pixcode;
     }
@@ -506,9 +506,9 @@ color_quantize3(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     ptrin = input_buf[row];
     ptrout = output_buf[row];
     for (col = width; col > 0; col--) {
-      pixcode  = GETJSAMPLE(colorindex0[GETJSAMPLE(*ptrin++)]);
-      pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*ptrin++)]);
-      pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*ptrin++)]);
+      pixcode  = colorindex0[*ptrin++];
+      pixcode += colorindex1[*ptrin++];
+      pixcode += colorindex2[*ptrin++];
       *ptrout++ = (JSAMPLE)pixcode;
     }
   }
@@ -552,7 +552,7 @@ quantize_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
          * required amount of padding.
          */
         *output_ptr +=
-          colorindex_ci[GETJSAMPLE(*input_ptr) + dither[col_index]];
+          colorindex_ci[*input_ptr + dither[col_index]];
         input_ptr += nc;
         output_ptr++;
         col_index = (col_index + 1) & ODITHER_MASK;
@@ -595,12 +595,9 @@ quantize3_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     col_index = 0;
 
     for (col = width; col > 0; col--) {
-      pixcode  =
-        GETJSAMPLE(colorindex0[GETJSAMPLE(*input_ptr++) + dither0[col_index]]);
-      pixcode +=
-        GETJSAMPLE(colorindex1[GETJSAMPLE(*input_ptr++) + dither1[col_index]]);
-      pixcode +=
-        GETJSAMPLE(colorindex2[GETJSAMPLE(*input_ptr++) + dither2[col_index]]);
+      pixcode  = colorindex0[(*input_ptr++) + dither0[col_index]];
+      pixcode += colorindex1[(*input_ptr++) + dither1[col_index]];
+      pixcode += colorindex2[(*input_ptr++) + dither2[col_index]];
       *output_ptr++ = (JSAMPLE)pixcode;
       col_index = (col_index + 1) & ODITHER_MASK;
     }
@@ -677,15 +674,15 @@ quantize_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
          * The maximum error is +- MAXJSAMPLE; this sets the required size
          * of the range_limit array.
          */
-        cur += GETJSAMPLE(*input_ptr);
-        cur = GETJSAMPLE(range_limit[cur]);
+        cur += *input_ptr;
+        cur = range_limit[cur];
         /* Select output value, accumulate into output code for this pixel */
-        pixcode = GETJSAMPLE(colorindex_ci[cur]);
+        pixcode = colorindex_ci[cur];
         *output_ptr += (JSAMPLE)pixcode;
         /* Compute actual representation error at this pixel */
         /* Note: we can do this even though we don't have the final */
         /* pixel code, because the colormap is orthogonal. */
-        cur -= GETJSAMPLE(colormap_ci[pixcode]);
+        cur -= colormap_ci[pixcode];
         /* Compute error fractions to be propagated to adjacent pixels.
          * Add these into the running sums, and simultaneously shift the
          * next-line error sums left by 1 column.
diff --git a/3rdparty/libjpeg-turbo/src/jquant2.c b/3rdparty/libjpeg-turbo/src/jquant2.c
index 6570613bb9..44efb18cad 100644
--- a/3rdparty/libjpeg-turbo/src/jquant2.c
+++ b/3rdparty/libjpeg-turbo/src/jquant2.c
@@ -215,9 +215,9 @@ prescan_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     ptr = input_buf[row];
     for (col = width; col > 0; col--) {
       /* get pixel value and index into the histogram */
-      histp = &histogram[GETJSAMPLE(ptr[0]) >> C0_SHIFT]
-                        [GETJSAMPLE(ptr[1]) >> C1_SHIFT]
-                        [GETJSAMPLE(ptr[2]) >> C2_SHIFT];
+      histp = &histogram[ptr[0] >> C0_SHIFT]
+                        [ptr[1] >> C1_SHIFT]
+                        [ptr[2] >> C2_SHIFT];
       /* increment, check for overflow and undo increment if so. */
       if (++(*histp) <= 0)
         (*histp)--;
@@ -665,7 +665,7 @@ find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
 
   for (i = 0; i < numcolors; i++) {
     /* We compute the squared-c0-distance term, then add in the other two. */
-    x = GETJSAMPLE(cinfo->colormap[0][i]);
+    x = cinfo->colormap[0][i];
     if (x < minc0) {
       tdist = (x - minc0) * C0_SCALE;
       min_dist = tdist * tdist;
@@ -688,7 +688,7 @@ find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
       }
     }
 
-    x = GETJSAMPLE(cinfo->colormap[1][i]);
+    x = cinfo->colormap[1][i];
     if (x < minc1) {
       tdist = (x - minc1) * C1_SCALE;
       min_dist += tdist * tdist;
@@ -710,7 +710,7 @@ find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
       }
     }
 
-    x = GETJSAMPLE(cinfo->colormap[2][i]);
+    x = cinfo->colormap[2][i];
     if (x < minc2) {
       tdist = (x - minc2) * C2_SCALE;
       min_dist += tdist * tdist;
@@ -788,13 +788,13 @@ find_best_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
 #define STEP_C2  ((1 << C2_SHIFT) * C2_SCALE)
 
   for (i = 0; i < numcolors; i++) {
-    icolor = GETJSAMPLE(colorlist[i]);
+    icolor = colorlist[i];
     /* Compute (square of) distance from minc0/c1/c2 to this color */
-    inc0 = (minc0 - GETJSAMPLE(cinfo->colormap[0][icolor])) * C0_SCALE;
+    inc0 = (minc0 - cinfo->colormap[0][icolor]) * C0_SCALE;
     dist0 = inc0 * inc0;
-    inc1 = (minc1 - GETJSAMPLE(cinfo->colormap[1][icolor])) * C1_SCALE;
+    inc1 = (minc1 - cinfo->colormap[1][icolor]) * C1_SCALE;
     dist0 += inc1 * inc1;
-    inc2 = (minc2 - GETJSAMPLE(cinfo->colormap[2][icolor])) * C2_SCALE;
+    inc2 = (minc2 - cinfo->colormap[2][icolor]) * C2_SCALE;
     dist0 += inc2 * inc2;
     /* Form the initial difference increments */
     inc0 = inc0 * (2 * STEP_C0) + STEP_C0 * STEP_C0;
@@ -879,7 +879,7 @@ fill_inverse_cmap(j_decompress_ptr cinfo, int c0, int c1, int c2)
     for (ic1 = 0; ic1 < BOX_C1_ELEMS; ic1++) {
       cachep = &histogram[c0 + ic0][c1 + ic1][c2];
       for (ic2 = 0; ic2 < BOX_C2_ELEMS; ic2++) {
-        *cachep++ = (histcell)(GETJSAMPLE(*cptr++) + 1);
+        *cachep++ = (histcell)((*cptr++) + 1);
       }
     }
   }
@@ -909,9 +909,9 @@ pass2_no_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     outptr = output_buf[row];
     for (col = width; col > 0; col--) {
       /* get pixel value and index into the cache */
-      c0 = GETJSAMPLE(*inptr++) >> C0_SHIFT;
-      c1 = GETJSAMPLE(*inptr++) >> C1_SHIFT;
-      c2 = GETJSAMPLE(*inptr++) >> C2_SHIFT;
+      c0 = (*inptr++) >> C0_SHIFT;
+      c1 = (*inptr++) >> C1_SHIFT;
+      c2 = (*inptr++) >> C2_SHIFT;
       cachep = &histogram[c0][c1][c2];
       /* If we have not seen this color before, find nearest colormap entry */
       /* and update the cache */
@@ -996,12 +996,12 @@ pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
        * The maximum error is +- MAXJSAMPLE (or less with error limiting);
        * this sets the required size of the range_limit array.
        */
-      cur0 += GETJSAMPLE(inptr[0]);
-      cur1 += GETJSAMPLE(inptr[1]);
-      cur2 += GETJSAMPLE(inptr[2]);
-      cur0 = GETJSAMPLE(range_limit[cur0]);
-      cur1 = GETJSAMPLE(range_limit[cur1]);
-      cur2 = GETJSAMPLE(range_limit[cur2]);
+      cur0 += inptr[0];
+      cur1 += inptr[1];
+      cur2 += inptr[2];
+      cur0 = range_limit[cur0];
+      cur1 = range_limit[cur1];
+      cur2 = range_limit[cur2];
       /* Index into the cache with adjusted pixel value */
       cachep =
         &histogram[cur0 >> C0_SHIFT][cur1 >> C1_SHIFT][cur2 >> C2_SHIFT];
@@ -1015,9 +1015,9 @@ pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
         register int pixcode = *cachep - 1;
         *outptr = (JSAMPLE)pixcode;
         /* Compute representation error for this pixel */
-        cur0 -= GETJSAMPLE(colormap0[pixcode]);
-        cur1 -= GETJSAMPLE(colormap1[pixcode]);
-        cur2 -= GETJSAMPLE(colormap2[pixcode]);
+        cur0 -= colormap0[pixcode];
+        cur1 -= colormap1[pixcode];
+        cur2 -= colormap2[pixcode];
       }
       /* Compute error fractions to be propagated to adjacent pixels.
        * Add these into the running sums, and simultaneously shift the
diff --git a/3rdparty/libjpeg-turbo/src/jsimd.h b/3rdparty/libjpeg-turbo/src/jsimd.h
index 51e2b8c89d..6c203655ef 100644
--- a/3rdparty/libjpeg-turbo/src/jsimd.h
+++ b/3rdparty/libjpeg-turbo/src/jsimd.h
@@ -4,6 +4,7 @@
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2011, 2014, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -75,6 +76,7 @@ EXTERN(void) jsimd_int_upsample(j_decompress_ptr cinfo,
 
 EXTERN(int) jsimd_can_h2v2_fancy_upsample(void);
 EXTERN(int) jsimd_can_h2v1_fancy_upsample(void);
+EXTERN(int) jsimd_can_h1v2_fancy_upsample(void);
 
 EXTERN(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo,
                                        jpeg_component_info *compptr,
@@ -84,6 +86,10 @@ EXTERN(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo,
                                        jpeg_component_info *compptr,
                                        JSAMPARRAY input_data,
                                        JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo,
+                                       jpeg_component_info *compptr,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr);
 
 EXTERN(int) jsimd_can_h2v2_merged_upsample(void);
 EXTERN(int) jsimd_can_h2v1_merged_upsample(void);
diff --git a/3rdparty/libjpeg-turbo/src/jsimd_none.c b/3rdparty/libjpeg-turbo/src/jsimd_none.c
index 3cb6c80f8a..5b38a9fb5c 100644
--- a/3rdparty/libjpeg-turbo/src/jsimd_none.c
+++ b/3rdparty/libjpeg-turbo/src/jsimd_none.c
@@ -4,6 +4,7 @@
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2009-2011, 2014, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -169,6 +170,12 @@ jsimd_can_h2v1_fancy_upsample(void)
   return 0;
 }
 
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  return 0;
+}
+
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -181,6 +188,12 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 {
 }
 
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample(void)
 {
diff --git a/3rdparty/libjpeg-turbo/src/jversion.h b/3rdparty/libjpeg-turbo/src/jversion.h
index 4462b94104..2ab534af41 100644
--- a/3rdparty/libjpeg-turbo/src/jversion.h
+++ b/3rdparty/libjpeg-turbo/src/jversion.h
@@ -2,9 +2,9 @@
  * jversion.h
  *
  * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2012-2020, D. R. Commander.
+ * Copyright (C) 2010, 2012-2021, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -37,9 +37,9 @@
  */
 
 #define JCOPYRIGHT \
-  "Copyright (C) 2009-2020 D. R. Commander\n" \
+  "Copyright (C) 2009-2021 D. R. Commander\n" \
   "Copyright (C) 2015, 2020 Google, Inc.\n" \
-  "Copyright (C) 2019 Arm Limited\n" \
+  "Copyright (C) 2019-2020 Arm Limited\n" \
   "Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
   "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
   "Copyright (C) 2015 Intel Corporation\n" \
@@ -48,7 +48,7 @@
   "Copyright (C) 2009, 2012 Pierre Ossman for Cendio AB\n" \
   "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
   "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
-  "Copyright (C) 1991-2017 Thomas G. Lane, Guido Vollbeding"
+  "Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding"
 
 #define JCOPYRIGHT_SHORT \
-  "Copyright (C) 1991-2020 The libjpeg-turbo Project and many others"
+  "Copyright (C) 1991-2021 The libjpeg-turbo Project and many others"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 78327c1a70..eef357e8ba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -239,7 +239,7 @@ OCV_OPTION(WITH_CAP_IOS "Enable iOS video capture" ON
   VISIBLE_IF IOS
   VERIFY HAVE_CAP_IOS)
 OCV_OPTION(WITH_CAROTENE "Use NVidia carotene acceleration library for ARM platform" ON
-  VISIBLE_IF (ARM OR AARCH64) AND NOT IOS AND NOT (CMAKE_VERSION VERSION_LESS "2.8.11"))
+  VISIBLE_IF (ARM OR AARCH64) AND NOT IOS)
 OCV_OPTION(WITH_CPUFEATURES "Use cpufeatures Android library" ON
   VISIBLE_IF ANDROID
   VERIFY HAVE_CPUFEATURES)
@@ -499,7 +499,7 @@ OCV_OPTION(OPENCV_WARNINGS_ARE_ERRORS "Treat warnings as errors"
 OCV_OPTION(ANDROID_EXAMPLES_WITH_LIBS "Build binaries of Android examples with native libraries" OFF  IF ANDROID )
 OCV_OPTION(ENABLE_IMPL_COLLECTION     "Collect implementation data on function call"             OFF )
 OCV_OPTION(ENABLE_INSTRUMENTATION     "Instrument functions to collect calls trace and performance" OFF )
-OCV_OPTION(ENABLE_GNU_STL_DEBUG       "Enable GNU STL Debug mode (defines _GLIBCXX_DEBUG)"       OFF IF ((NOT CMAKE_VERSION VERSION_LESS "2.8.11") AND CV_GCC) )
+OCV_OPTION(ENABLE_GNU_STL_DEBUG       "Enable GNU STL Debug mode (defines _GLIBCXX_DEBUG)"       OFF IF CV_GCC )
 OCV_OPTION(ENABLE_BUILD_HARDENING     "Enable hardening of the resulting binaries (against security attacks, detects memory corruption, etc)" OFF)
 OCV_OPTION(ENABLE_LTO                 "Enable Link Time Optimization" OFF IF CV_GCC OR MSVC)
 OCV_OPTION(ENABLE_THIN_LTO            "Enable Thin LTO" OFF IF CV_CLANG)
@@ -511,6 +511,7 @@ OCV_OPTION(CV_TRACE                   "Enable OpenCV code trace" ON)
 OCV_OPTION(OPENCV_GENERATE_SETUPVARS  "Generate setup_vars* scripts" ON IF (NOT ANDROID AND NOT APPLE_FRAMEWORK) )
 OCV_OPTION(ENABLE_CONFIG_VERIFICATION "Fail build if actual configuration doesn't match requested (WITH_XXX != HAVE_XXX)" OFF)
 OCV_OPTION(OPENCV_ENABLE_MEMALIGN     "Enable posix_memalign or memalign usage" ON)
+OCV_OPTION(OPENCV_DISABLE_FILESYSTEM_SUPPORT "Disable filesystem support" OFF)
 
 OCV_OPTION(ENABLE_PYLINT              "Add target with Pylint checks"                            (BUILD_DOCS OR BUILD_EXAMPLES) IF (NOT CMAKE_CROSSCOMPILING AND NOT APPLE_FRAMEWORK) )
 OCV_OPTION(ENABLE_FLAKE8              "Add target with Python flake8 checker"                    (BUILD_DOCS OR BUILD_EXAMPLES) IF (NOT CMAKE_CROSSCOMPILING AND NOT APPLE_FRAMEWORK) )
@@ -522,6 +523,10 @@ if(ENABLE_IMPL_COLLECTION)
   add_definitions(-DCV_COLLECT_IMPL_DATA)
 endif()
 
+if(OPENCV_DISABLE_FILESYSTEM_SUPPORT)
+  add_definitions(-DOPENCV_HAVE_FILESYSTEM_SUPPORT=0)
+endif()
+
 set(OPENCV_MATHJAX_RELPATH "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0" CACHE STRING "URI to a MathJax installation")
 
 # ----------------------------------------------------------------------------
@@ -655,6 +660,8 @@ if(UNIX)
       set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} m pthread)
     elseif(EMSCRIPTEN)
       # no need to link to system libs with emscripten
+    elseif(QNXNTO)
+      # no need to link to system libs with QNX
     else()
       set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} dl m pthread rt)
     endif()
@@ -1048,7 +1055,6 @@ endif()
 status("")
 status("  Platform:")
 if(NOT DEFINED OPENCV_TIMESTAMP
-    AND NOT CMAKE_VERSION VERSION_LESS 2.8.11
     AND NOT BUILD_INFO_SKIP_TIMESTAMP
 )
   string(TIMESTAMP OPENCV_TIMESTAMP "" UTC)
@@ -1133,6 +1139,10 @@ endif()
 status("    ccache:"                  OPENCV_COMPILER_IS_CCACHE THEN YES ELSE NO)
 status("    Precompiled headers:"     PCHSupport_FOUND AND ENABLE_PRECOMPILED_HEADERS THEN YES ELSE NO)
 
+if(OPENCV_DISABLE_FILESYSTEM_SUPPORT)
+  status("    Filesystem support is disabled")
+endif()
+
 # ========================== Dependencies ============================
 ocv_get_all_libs(deps_modules deps_extra deps_3rdparty)
 status("    Extra dependencies:" ${deps_extra})
@@ -1432,7 +1442,16 @@ if(WITH_LIBREALSENSE OR HAVE_LIBREALSENSE)
 endif()
 
 if(WITH_MFX OR HAVE_MFX)
-  status("    Intel Media SDK:" HAVE_MFX      THEN "YES (${MFX_LIBRARY})" ELSE NO)
+  if(HAVE_MFX)
+    if(MFX_LIBRARY)
+      set(__details " (${MFX_LIBRARY})")
+    elseif(MFX_LIBRARIES)
+      set(__details " (${MFX_LIBRARIES})")
+    else()
+      set(__details " (unknown)")
+    endif()
+  endif()
+  status("    Intel Media SDK:" HAVE_MFX      THEN "YES${__details}" ELSE NO)
 endif()
 
 if(WITH_GPHOTO2 OR HAVE_GPHOTO2)
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index 40a058d74e..a161b6eb8b 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -227,9 +227,11 @@ if(CV_GCC OR CV_CLANG)
         if(APPLE)
           set(OPENCV_EXTRA_EXE_LINKER_FLAGS "${OPENCV_EXTRA_EXE_LINKER_FLAGS} -Wl,-dead_strip")
           set(OPENCV_EXTRA_SHARED_LINKER_FLAGS "${OPENCV_EXTRA_SHARED_LINKER_FLAGS} -Wl,-dead_strip")
+          set(OPENCV_EXTRA_MODULE_LINKER_FLAGS "${OPENCV_EXTRA_MODULE_LINKER_FLAGS} -Wl,-dead_strip")
         else()
           set(OPENCV_EXTRA_EXE_LINKER_FLAGS "${OPENCV_EXTRA_EXE_LINKER_FLAGS} -Wl,--gc-sections")
           set(OPENCV_EXTRA_SHARED_LINKER_FLAGS "${OPENCV_EXTRA_SHARED_LINKER_FLAGS} -Wl,--gc-sections")
+          set(OPENCV_EXTRA_MODULE_LINKER_FLAGS "${OPENCV_EXTRA_MODULE_LINKER_FLAGS} -Wl,--gc-sections")
         endif()
       endif()
     endif()
@@ -281,6 +283,7 @@ if(MSVC)
     set(OPENCV_EXTRA_FLAGS_RELEASE "${OPENCV_EXTRA_FLAGS_RELEASE} /Zi")
     set(OPENCV_EXTRA_EXE_LINKER_FLAGS_RELEASE "${OPENCV_EXTRA_EXE_LINKER_FLAGS_RELEASE} /debug")
     set(OPENCV_EXTRA_SHARED_LINKER_FLAGS_RELEASE "${OPENCV_EXTRA_SHARED_LINKER_FLAGS_RELEASE} /debug")
+    set(OPENCV_EXTRA_MODULE_LINKER_FLAGS_RELEASE "${OPENCV_EXTRA_MODULE_LINKER_FLAGS_RELEASE} /debug")
   endif()
 
   # Remove unreferenced functions: function level linking
@@ -350,6 +353,7 @@ if(NOT OPENCV_SKIP_LINK_AS_NEEDED)
     if(HAVE_LINK_AS_NEEDED)
       set(OPENCV_EXTRA_EXE_LINKER_FLAGS "${OPENCV_EXTRA_EXE_LINKER_FLAGS} ${_option}")
       set(OPENCV_EXTRA_SHARED_LINKER_FLAGS "${OPENCV_EXTRA_SHARED_LINKER_FLAGS} ${_option}")
+      set(OPENCV_EXTRA_MODULE_LINKER_FLAGS "${OPENCV_EXTRA_MODULE_LINKER_FLAGS} ${_option}")
     endif()
   endif()
 endif()
@@ -368,6 +372,9 @@ if(NOT OPENCV_SKIP_EXTRA_COMPILER_FLAGS)
   set(CMAKE_SHARED_LINKER_FLAGS         "${CMAKE_SHARED_LINKER_FLAGS} ${OPENCV_EXTRA_SHARED_LINKER_FLAGS}")
   set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} ${OPENCV_EXTRA_SHARED_LINKER_FLAGS_RELEASE}")
   set(CMAKE_SHARED_LINKER_FLAGS_DEBUG   "${CMAKE_SHARED_LINKER_FLAGS_DEBUG} ${OPENCV_EXTRA_SHARED_LINKER_FLAGS_DEBUG}")
+  set(CMAKE_MODULE_LINKER_FLAGS         "${CMAKE_MODULE_LINKER_FLAGS} ${OPENCV_EXTRA_MODULE_LINKER_FLAGS}")
+  set(CMAKE_MODULE_LINKER_FLAGS_RELEASE "${CMAKE_MODULE_LINKER_FLAGS_RELEASE} ${OPENCV_EXTRA_MODULE_LINKER_FLAGS_RELEASE}")
+  set(CMAKE_MODULE_LINKER_FLAGS_DEBUG   "${CMAKE_MODULE_LINKER_FLAGS_DEBUG} ${OPENCV_EXTRA_MODULE_LINKER_FLAGS_DEBUG}")
 endif()
 
 if(MSVC)
diff --git a/cmake/OpenCVDetectInferenceEngine.cmake b/cmake/OpenCVDetectInferenceEngine.cmake
index 216c02c3cc..2c0296d634 100644
--- a/cmake/OpenCVDetectInferenceEngine.cmake
+++ b/cmake/OpenCVDetectInferenceEngine.cmake
@@ -134,10 +134,14 @@ endif()
 # Add more features to the target
 
 if(INF_ENGINE_TARGET)
+  if(InferenceEngine_VERSION VERSION_GREATER_EQUAL "2021.4")
+    math(EXPR INF_ENGINE_RELEASE "${InferenceEngine_VERSION_MAJOR} * 1000000 + ${InferenceEngine_VERSION_MINOR} * 10000 + ${InferenceEngine_VERSION_PATCH} * 100")
+  endif()
   if(NOT INF_ENGINE_RELEASE)
     message(WARNING "InferenceEngine version has not been set, 2021.3 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
+    set(INF_ENGINE_RELEASE "2021030000")
   endif()
-  set(INF_ENGINE_RELEASE "2021030000" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2020.1.0.2 -> 2020010002)")
+  set(INF_ENGINE_RELEASE "${INF_ENGINE_RELEASE}" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2020.1.0.2 -> 2020010002)")
   set_target_properties(${INF_ENGINE_TARGET} PROPERTIES
     INTERFACE_COMPILE_DEFINITIONS "HAVE_INF_ENGINE=1;INF_ENGINE_RELEASE=${INF_ENGINE_RELEASE}"
   )
diff --git a/cmake/OpenCVDownload.cmake b/cmake/OpenCVDownload.cmake
index 63cf6d3238..a427a41227 100644
--- a/cmake/OpenCVDownload.cmake
+++ b/cmake/OpenCVDownload.cmake
@@ -23,7 +23,7 @@ set(OPENCV_DOWNLOAD_LOG "${OpenCV_BINARY_DIR}/CMakeDownloadLog.txt")
 set(OPENCV_DOWNLOAD_WITH_CURL "${OpenCV_BINARY_DIR}/download_with_curl.sh")
 set(OPENCV_DOWNLOAD_WITH_WGET "${OpenCV_BINARY_DIR}/download_with_wget.sh")
 set(OPENCV_DOWNLOAD_TRIES_LIST 1 CACHE STRING "List of download tries") # a list
-set(OPENCV_DOWNLOAD_PARAMS INACTIVITY_TIMEOUT 60 TIMEOUT 600 CACHE STRING "Download parameters to be passed to file(DOWNLAOD ...)")
+set(OPENCV_DOWNLOAD_PARAMS INACTIVITY_TIMEOUT 60 TIMEOUT 600 CACHE STRING "Download parameters to be passed to file(DOWNLOAD ...)")
 mark_as_advanced(OPENCV_DOWNLOAD_TRIES_LIST OPENCV_DOWNLOAD_PARAMS)
 
 # Init download cache directory and log file and helper scripts
diff --git a/cmake/OpenCVFindLibsGUI.cmake b/cmake/OpenCVFindLibsGUI.cmake
index e3593d4dc9..8030e8b0c0 100644
--- a/cmake/OpenCVFindLibsGUI.cmake
+++ b/cmake/OpenCVFindLibsGUI.cmake
@@ -11,7 +11,7 @@ if(WITH_WIN32UI)
     CMAKE_FLAGS "-DLINK_LIBRARIES:STRING=user32;gdi32")
 endif()
 
-# --- QT4 ---
+# --- QT4/5 ---
 ocv_clear_vars(HAVE_QT HAVE_QT5)
 if(WITH_QT)
   if(NOT WITH_QT EQUAL 4)
@@ -34,41 +34,6 @@ if(WITH_QT)
   endif()
 endif()
 
-# --- GTK ---
-ocv_clear_vars(HAVE_GTK HAVE_GTK3 HAVE_GTHREAD HAVE_GTKGLEXT)
-if(WITH_GTK AND NOT HAVE_QT)
-  if(NOT WITH_GTK_2_X)
-    ocv_check_modules(GTK3 gtk+-3.0)
-    if(HAVE_GTK3)
-      ocv_append_build_options(HIGHGUI GTK3)
-      set(HAVE_GTK TRUE)
-    endif()
-  endif()
-  if(NOT HAVE_GTK)
-    ocv_check_modules(GTK2 gtk+-2.0)
-    if(HAVE_GTK2)
-      if (GTK2_VERSION VERSION_LESS MIN_VER_GTK)
-        message (FATAL_ERROR "GTK support requires a minimum version of ${MIN_VER_GTK} (${GTK2_VERSION} found)")
-      else()
-        ocv_append_build_options(HIGHGUI GTK2)
-        set(HAVE_GTK TRUE)
-      endif()
-    endif()
-  endif()
-  ocv_check_modules(GTHREAD gthread-2.0)
-  if(HAVE_GTK AND NOT HAVE_GTHREAD)
-    message(FATAL_ERROR "gthread not found. This library is required when building with GTK support")
-  else()
-    ocv_append_build_options(HIGHGUI GTHREAD)
-  endif()
-  if(WITH_OPENGL AND NOT HAVE_GTK3)
-    ocv_check_modules(GTKGLEXT gtkglext-1.0)
-    if(HAVE_GTKGLEXT)
-      ocv_append_build_options(HIGHGUI GTKGLEXT)
-    endif()
-  endif()
-endif()
-
 # --- OpenGl ---
 ocv_clear_vars(HAVE_OPENGL HAVE_QT_OPENGL)
 if(WITH_OPENGL)
diff --git a/cmake/OpenCVFindOpenEXR.cmake b/cmake/OpenCVFindOpenEXR.cmake
index ef633e853a..133468243a 100644
--- a/cmake/OpenCVFindOpenEXR.cmake
+++ b/cmake/OpenCVFindOpenEXR.cmake
@@ -9,6 +9,14 @@
 # OPENEXR_LIBRARIES = libraries that are needed to use OpenEXR.
 #
 
+find_package(OpenEXR 3.0 CONFIG QUIET)
+if(TARGET OpenEXR::OpenEXR)
+    SET(OPENEXR_FOUND TRUE)
+    SET(OPENEXR_LIBRARIES OpenEXR::OpenEXR)
+    SET(OPENEXR_VERSION ${OpenEXR_VERSION})
+    return()
+endif()
+
 SET(OPENEXR_LIBRARIES "")
 SET(OPENEXR_LIBSEARCH_SUFFIXES "")
 file(TO_CMAKE_PATH "$ENV{ProgramFiles}" ProgramFiles_ENV_PATH)
diff --git a/cmake/OpenCVMinDepVersions.cmake b/cmake/OpenCVMinDepVersions.cmake
index ce0c0ba816..db225e2ab5 100644
--- a/cmake/OpenCVMinDepVersions.cmake
+++ b/cmake/OpenCVMinDepVersions.cmake
@@ -6,4 +6,3 @@ set(MIN_VER_CUDNN 7.5)
 set(MIN_VER_PYTHON2 2.7)
 set(MIN_VER_PYTHON3 3.2)
 set(MIN_VER_ZLIB 1.2.3)
-set(MIN_VER_GTK 2.18.0)
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index 224953a1f3..7c48aad9c2 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -880,7 +880,9 @@ macro(_ocv_create_module)
 
   ocv_compiler_optimization_process_sources(OPENCV_MODULE_${the_module}_SOURCES OPENCV_MODULE_${the_module}_DEPS_EXT ${the_module})
   set(__module_headers ${OPENCV_MODULE_${the_module}_HEADERS})
-  list(SORT __module_headers)  # fix headers order, useful for bindings
+  if(__module_headers)
+    list(SORT __module_headers)  # fix headers order, useful for bindings
+  endif()
   set(OPENCV_MODULE_${the_module}_HEADERS ${__module_headers} CACHE INTERNAL "List of header files for ${the_module}")
   set(OPENCV_MODULE_${the_module}_SOURCES ${OPENCV_MODULE_${the_module}_SOURCES} CACHE INTERNAL "List of source files for ${the_module}")
 
@@ -1181,6 +1183,9 @@ function(ocv_add_perf_tests)
       if(TARGET opencv_videoio_plugins)
         add_dependencies(${the_target} opencv_videoio_plugins)
       endif()
+      if(TARGET opencv_highgui_plugins)
+        add_dependencies(${the_target} opencv_highgui_plugins)
+      endif()
 
       if(HAVE_HPX)
         message("Linking HPX to Perf test of module ${name}")
@@ -1276,6 +1281,9 @@ function(ocv_add_accuracy_tests)
       if(TARGET opencv_videoio_plugins)
         add_dependencies(${the_target} opencv_videoio_plugins)
       endif()
+      if(TARGET opencv_highgui_plugins)
+        add_dependencies(${the_target} opencv_highgui_plugins)
+      endif()
 
       if(HAVE_HPX)
         message("Linking HPX to Perf test of module ${name}")
@@ -1366,6 +1374,9 @@ function(ocv_add_samples)
         if(TARGET opencv_videoio_plugins)
           add_dependencies(${the_target} opencv_videoio_plugins)
         endif()
+        if(TARGET opencv_highgui_plugins)
+          add_dependencies(${the_target} opencv_highgui_plugins)
+        endif()
 
         if(INSTALL_BIN_EXAMPLES)
           install(TARGETS ${the_target} RUNTIME DESTINATION "${OPENCV_SAMPLES_BIN_INSTALL_PATH}/${module_id}" COMPONENT samples)
diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake
index 0951e06581..5d8ab70e0e 100644
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@@ -866,7 +866,9 @@ macro(ocv_check_modules define)
       foreach(flag ${${define}_LDFLAGS})
         if(flag MATCHES "^-L(.*)")
           list(APPEND _libs_paths ${CMAKE_MATCH_1})
-        elseif(IS_ABSOLUTE "${flag}")
+        elseif(IS_ABSOLUTE "${flag}"
+            OR flag STREQUAL "-lstdc++"
+        )
           list(APPEND _libs "${flag}")
         elseif(flag MATCHES "^-l(.*)")
           set(_lib "${CMAKE_MATCH_1}")
@@ -1578,24 +1580,41 @@ endfunction()
 
 
 function(ocv_add_external_target name inc link def)
-  if(BUILD_SHARED_LIBS)
+  if(BUILD_SHARED_LIBS AND link)
     set(imp IMPORTED)
   endif()
   add_library(ocv.3rdparty.${name} INTERFACE ${imp})
-  set_target_properties(ocv.3rdparty.${name} PROPERTIES
-    INTERFACE_INCLUDE_DIRECTORIES "${inc}"
-    INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${inc}"
-    INTERFACE_COMPILE_DEFINITIONS "${def}")
-  # When cmake version is greater than or equal to 3.11, INTERFACE_LINK_LIBRARIES no longer applies to interface library
-  # See https://github.com/opencv/opencv/pull/18658
-  if (CMAKE_VERSION VERSION_LESS 3.11)
-    set_target_properties(ocv.3rdparty.${name} PROPERTIES
-      INTERFACE_LINK_LIBRARIES "${link}")
-  else()
-    target_link_libraries(ocv.3rdparty.${name} INTERFACE ${link})
+  if(def)
+    if(NOT (CMAKE_VERSION VERSION_LESS "3.11.0"))  # https://gitlab.kitware.com/cmake/cmake/-/merge_requests/1264 : eliminates "Cannot specify compile definitions for imported target" error message
+      target_compile_definitions(ocv.3rdparty.${name} INTERFACE "${def}")
+    else()
+      set_target_properties(ocv.3rdparty.${name} PROPERTIES INTERFACE_COMPILE_DEFINITIONS "${def}")
+    endif()
   endif()
-  #
-  if(NOT BUILD_SHARED_LIBS)
+  if(inc)
+    if(NOT (CMAKE_VERSION VERSION_LESS "3.11.0"))  # https://gitlab.kitware.com/cmake/cmake/-/merge_requests/1264 : eliminates "Cannot specify include directories for imported target" error message
+      target_include_directories(ocv.3rdparty.${name} SYSTEM INTERFACE "$<BUILD_INTERFACE:${inc}>")
+    else()
+      set_target_properties(ocv.3rdparty.${name} PROPERTIES
+          INTERFACE_INCLUDE_DIRECTORIES "$<BUILD_INTERFACE:${inc}>"
+          INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "$<BUILD_INTERFACE:${inc}>"
+      )
+    endif()
+  endif()
+  if(link)
+    # When cmake version is greater than or equal to 3.11, INTERFACE_LINK_LIBRARIES no longer applies to interface library
+    # See https://github.com/opencv/opencv/pull/18658
+    if(CMAKE_VERSION VERSION_LESS 3.11)
+      set_target_properties(ocv.3rdparty.${name} PROPERTIES
+        INTERFACE_LINK_LIBRARIES "${link}")
+    else()
+      target_link_libraries(ocv.3rdparty.${name} INTERFACE ${link})
+    endif()
+  endif()
+  # to install only the targets that are actually used, upgrade CMake (3.13+)
+  if(NOT BUILD_SHARED_LIBS
+      AND CMAKE_VERSION VERSION_LESS "3.13.0"  # https://gitlab.kitware.com/cmake/cmake/-/merge_requests/2152
+  )
     install(TARGETS ocv.3rdparty.${name} EXPORT OpenCVModules)
   endif()
 endfunction()
diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in
index 8cb59c34cd..4b18a49709 100644
--- a/cmake/templates/cvconfig.h.in
+++ b/cmake/templates/cvconfig.h.in
@@ -28,9 +28,6 @@
 /* Clp support */
 #cmakedefine HAVE_CLP
 
-/* Cocoa API */
-#cmakedefine HAVE_COCOA
-
 /* NVIDIA CUDA Runtime API*/
 #cmakedefine HAVE_CUDA
 
@@ -56,12 +53,6 @@
 /* Geospatial Data Abstraction Library */
 #cmakedefine HAVE_GDAL
 
-/* GTK+ 2.0 Thread support */
-#cmakedefine HAVE_GTHREAD
-
-/* GTK+ 2.x toolkit */
-#cmakedefine HAVE_GTK
-
 /* Halide support */
 #cmakedefine HAVE_HALIDE
 
@@ -121,12 +112,6 @@
 /* parallel_for with pthreads */
 #cmakedefine HAVE_PTHREADS_PF
 
-/* Qt support */
-#cmakedefine HAVE_QT
-
-/* Qt OpenGL support */
-#cmakedefine HAVE_QT_OPENGL
-
 /* Intel Threading Building Blocks */
 #cmakedefine HAVE_TBB
 
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
index a321be9878..26ad42b1e5 100644
--- a/doc/Doxyfile.in
+++ b/doc/Doxyfile.in
@@ -31,7 +31,7 @@ MULTILINE_CPP_IS_BRIEF = NO
 INHERIT_DOCS           = YES
 SEPARATE_MEMBER_PAGES  = NO
 TAB_SIZE               = 4
-ALIASES               += add_toggle{1}="@htmlonly[block] <div class='newInnerHTML'>\1</div><div> <script type="text/javascript"> addToggle(); </script>@endhtmlonly"
+ALIASES               += add_toggle{1}="@htmlonly[block] <div class='newInnerHTML'>\1</div><div> <script type='text/javascript'> addToggle(); </script>@endhtmlonly"
 ALIASES               += add_toggle_cpp="@htmlonly[block] <div class='newInnerHTML' title='cpp' style='display: none;'>C++</div><div class='toggleable_div label_cpp' style='display: none;'>@endhtmlonly"
 ALIASES               += add_toggle_java="@htmlonly[block] <div class='newInnerHTML' title='java' style='display: none;'>Java</div><div class='toggleable_div label_java' style='display: none;'>@endhtmlonly"
 ALIASES               += add_toggle_python="@htmlonly[block] <div class='newInnerHTML' title='python' style='display: none;'>Python</div><div class='toggleable_div label_python' style='display: none;'>@endhtmlonly"
diff --git a/doc/opencv.bib b/doc/opencv.bib
index d44b0f5293..d0661e8d5f 100644
--- a/doc/opencv.bib
+++ b/doc/opencv.bib
@@ -850,12 +850,12 @@
   journal = {IEEE Transactions on Robotics and Automation},
   title = {Robot sensor calibration: solving AX=XB on the Euclidean group},
   year = {1994},
+  month = oct,
   volume = {10},
   number = {5},
   pages = {717-721},
   doi = {10.1109/70.326576},
-  ISSN = {1042-296X},
-  month = {Oct}
+  issn = {1042-296X}
 }
 @inproceedings{PM03,
   author = {P{\'e}rez, Patrick and Gangnet, Michel and Blake, Andrew},
@@ -1051,12 +1051,12 @@
   journal = {IEEE Transactions on Robotics and Automation},
   title = {A new technique for fully autonomous and efficient 3D robotics hand/eye calibration},
   year = {1989},
+  month = jun,
   volume = {5},
   number = {3},
   pages = {345-358},
   doi = {10.1109/70.34770},
-  ISSN = {1042-296X},
-  month = {June}
+  issn = {1042-296X}
 }
 @inproceedings{UES01,
   author = {Uyttendaele, Matthew and Eden, Ashley and Skeliski, R},
@@ -1324,3 +1324,13 @@
   pages={5551--5560},
   year={2017}
 }
+@article{umeyama1991least,
+  title={Least-squares estimation of transformation parameters between two point patterns},
+  author={Umeyama, Shinji},
+  journal={IEEE Computer Architecture Letters},
+  volume={13},
+  number={04},
+  pages={376--380},
+  year={1991},
+  publisher={IEEE Computer Society}
+}
diff --git a/doc/py_tutorials/py_feature2d/py_features_harris/py_features_harris.markdown b/doc/py_tutorials/py_feature2d/py_features_harris/py_features_harris.markdown
index e24e692087..60e5686934 100644
--- a/doc/py_tutorials/py_feature2d/py_features_harris/py_features_harris.markdown
+++ b/doc/py_tutorials/py_feature2d/py_features_harris/py_features_harris.markdown
@@ -40,12 +40,12 @@ using **cv.Sobel()**).
 Then comes the main part. After this, they created a score, basically an equation, which
 determines if a window can contain a corner or not.
 
-\f[R = det(M) - k(trace(M))^2\f]
+\f[R = \det(M) - k(\operatorname{trace}(M))^2\f]
 
 where
-    -   \f$det(M) = \lambda_1 \lambda_2\f$
-    -   \f$trace(M) = \lambda_1 + \lambda_2\f$
-    -   \f$\lambda_1\f$ and \f$\lambda_2\f$ are the eigenvalues of M
+    -   \f$\det(M) = \lambda_1 \lambda_2\f$
+    -   \f$\operatorname{trace}(M) = \lambda_1 + \lambda_2\f$
+    -   \f$\lambda_1\f$ and \f$\lambda_2\f$ are the eigenvalues of \f$M\f$
 
 So the magnitudes of these eigenvalues decide whether a region is a corner, an edge, or flat.
 
diff --git a/doc/py_tutorials/py_feature2d/py_shi_tomasi/py_shi_tomasi.markdown b/doc/py_tutorials/py_feature2d/py_shi_tomasi/py_shi_tomasi.markdown
index 1229581ce6..c5d29493e4 100644
--- a/doc/py_tutorials/py_feature2d/py_shi_tomasi/py_shi_tomasi.markdown
+++ b/doc/py_tutorials/py_feature2d/py_shi_tomasi/py_shi_tomasi.markdown
@@ -20,7 +20,7 @@ Harris Corner Detector. The scoring function in Harris Corner Detector was given
 
 Instead of this, Shi-Tomasi proposed:
 
-\f[R = min(\lambda_1, \lambda_2)\f]
+\f[R = \min(\lambda_1, \lambda_2)\f]
 
 If it is a greater than a threshold value, it is considered as a corner. If we plot it in
 \f$\lambda_1 - \lambda_2\f$ space as we did in Harris Corner Detector, we get an image as below:
@@ -28,7 +28,7 @@ If it is a greater than a threshold value, it is considered as a corner. If we p
 ![image](images/shitomasi_space.png)
 
 From the figure, you can see that only when \f$\lambda_1\f$ and \f$\lambda_2\f$ are above a minimum value,
-\f$\lambda_{min}\f$, it is considered as a corner(green region).
+\f$\lambda_{\min}\f$, it is considered as a corner(green region).
 
 Code
 ----
diff --git a/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown b/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown
index dee4df774a..bbbae6a3e6 100644
--- a/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown
+++ b/doc/py_tutorials/py_feature2d/py_sift_intro/py_sift_intro.markdown
@@ -156,7 +156,7 @@ sift = cv.SIFT_create()
 kp, des = sift.detectAndCompute(gray,None)
 @endcode
 Here kp will be a list of keypoints and des is a numpy array of shape
-\f$Number\_of\_Keypoints \times 128\f$.
+\f$\text{(Number of Keypoints)} \times 128\f$.
 
 So we got keypoints, descriptors etc. Now we want to see how to match keypoints in different images.
 That we will learn in coming chapters.
diff --git a/doc/tools/html_functions.py b/doc/tools/html_functions.py
index b76639cea5..204f6d1c1b 100644
--- a/doc/tools/html_functions.py
+++ b/doc/tools/html_functions.py
@@ -107,17 +107,10 @@ def add_signature_to_table(soup, table, signature, language, type):
     """ Add a signature to an html table"""
     row = soup.new_tag('tr')
     row.append(soup.new_tag('td', style='width: 20px;'))
-
-    if 'ret' in signature:
-        row.append(append(soup.new_tag('td'), signature['ret']))
-        row.append(append(soup.new_tag('td'), '='))
-    else:
-        row.append(soup.new_tag('td')) # return values
-        row.append(soup.new_tag('td')) # '='
-
     row.append(append(soup.new_tag('td'), signature['name'] + '('))
     row.append(append(soup.new_tag('td', **{'class': 'paramname'}), signature['arg']))
-    row.append(append(soup.new_tag('td'), ')'))
+    row.append(append(soup.new_tag('td'), ') -> '))
+    row.append(append(soup.new_tag('td'), signature['ret']))
     table.append(row)
 
 
diff --git a/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown b/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown
index 58419f8618..1bba591074 100644
--- a/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown
+++ b/doc/tutorials/calib3d/real_time_pose/real_time_pose.markdown
@@ -87,7 +87,7 @@ The tutorial consists of two main programs:
 
     The application starts up extracting the ORB features and descriptors from the input image and
     then uses the mesh along with the [Möller–Trumbore intersection
-    algorithm](http://http://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm/)
+    algorithm](http://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm/)
     to compute the 3D coordinates of the found features. Finally, the 3D points and the descriptors
     are stored in different lists in a file with YAML format which each row is a different point. The
     technical background on how to store the files can be found in the @ref tutorial_file_input_output_with_xml_yml
diff --git a/doc/tutorials/introduction/config_reference/config_reference.markdown b/doc/tutorials/introduction/config_reference/config_reference.markdown
index 1d4f426c8f..438cc70288 100644
--- a/doc/tutorials/introduction/config_reference/config_reference.markdown
+++ b/doc/tutorials/introduction/config_reference/config_reference.markdown
@@ -396,13 +396,14 @@ There are multiple less popular frameworks which can be used to read and write v
 
 ### videoio plugins
 
-Some _videoio_ backends can be built as plugins thus breaking strict dependency on third-party libraries and making them optional at runtime. Following options can be used to control this mechanism:
+Since version 4.1.0 some _videoio_ backends can be built as plugins thus breaking strict dependency on third-party libraries and making them optional at runtime. Following options can be used to control this mechanism:
 
 | Option | Default | Description |
 | --------| ------ | ------- |
 | `VIDEOIO_ENABLE_PLUGINS` | _ON_ | Enable or disable plugins completely. |
 | `VIDEOIO_PLUGIN_LIST` | _empty_ | Comma- or semicolon-separated list of backend names to be compiled as plugins. Supported names are _ffmpeg_, _gstreamer_, _msmf_, _mfx_ and _all_. |
-| `VIDEOIO_ENABLE_STRICT_PLUGIN_CHECK` | _ON_ | Enable strict runtime version check to only allow plugins built with the same version of OpenCV. |
+
+Check @ref tutorial_general_install for standalone plugins build instructions.
 
 
 ## Parallel processing {#tutorial_config_reference_func_core}
@@ -421,6 +422,17 @@ Some of OpenCV algorithms can use multithreading to accelerate processing. OpenC
 @note OpenCV can download and build TBB library from GitHub, this functionality can be enabled with the `BUILD_TBB` option.
 
 
+### Threading plugins
+
+Since version 4.5.2 OpenCV supports dynamically loaded threading backends. At this moment only separate compilation process is supported: first you have to build OpenCV with some _default_ parallel backend (e.g. pthreads), then build each plugin and copy resulting binaries to the _lib_ or _bin_ folder.
+
+| Option | Default | Description |
+| ------ | ------- | ----------- |
+| `PARALLEL_ENABLE_PLUGINS` | _ON_ | Enable plugin support; if this option is disabled, OpenCV will not try to load anything. |
+
+Check @ref tutorial_general_install for standalone plugins build instructions.
+
+
 ## GUI backends (highgui module) {#tutorial_config_reference_highgui}
 
 OpenCV relies on various GUI libraries for window drawing.
@@ -442,6 +454,18 @@ OpenCV relies on various GUI libraries for window drawing.
 OpenGL integration can be used to draw HW-accelerated windows with following backends: GTK, WIN32 and Qt. And enables basic interoperability with OpenGL, see @ref core_opengl and @ref highgui_opengl for details.
 
 
+### highgui plugins
+
+Since OpenCV 4.5.3 GTK backend can be built as a dynamically loaded plugin. Following options can be used to control this mechanism:
+
+| Option | Default | Description |
+| --------| ------ | ------- |
+| `HIGHGUI_ENABLE_PLUGINS` | _ON_ | Enable or disable plugins completely. |
+| `HIGHGUI_PLUGIN_LIST` | _empty_ | Comma- or semicolon-separated list of backend names to be compiled as plugins. Supported names are _gtk_, _gtk2_, _gtk3_, and _all_. |
+
+Check @ref tutorial_general_install for standalone plugins build instructions.
+
+
 ## Deep learning neural networks inference backends and options (dnn module) {#tutorial_config_reference_dnn}
 
 OpenCV have own DNN inference module which have own build-in engine, but can also use other libraries for optimized processing. Multiple backends can be enabled in single build. Selection happens at runtime automatically or manually.
diff --git a/doc/tutorials/introduction/general_install/general_install.markdown b/doc/tutorials/introduction/general_install/general_install.markdown
index e8c93f430e..7b0c5d2b06 100644
--- a/doc/tutorials/introduction/general_install/general_install.markdown
+++ b/doc/tutorials/introduction/general_install/general_install.markdown
@@ -105,7 +105,7 @@ cmake --build <build-directory> <build-options>
 make
 ```
 
-## Step 3: Install {#tutorial_general_install_sources_4}
+## (optional) Step 3: Install {#tutorial_general_install_sources_4}
 
 During installation procedure build results and other files from build directory will be copied to the install location. Default installation location is `/usr/local` on UNIX and `C:/Program Files` on Windows. This location can be changed at the configuration step by setting `CMAKE_INSTALL_PREFIX` option. To perform installation run the following command:
 ```
@@ -117,3 +117,32 @@ This step is optional, OpenCV can be used directly from the build directory.
 
 @note
 If the installation root location is a protected system directory, so the installation process must be run with superuser or administrator privileges (e.g. `sudo cmake ...`).
+
+
+## (optional) Step 4: Build plugins {#tutorial_general_install_plugins_4}
+
+It is possible to decouple some of OpenCV dependencies and make them optional by extracting parts of the code into dynamically-loaded plugins. It helps to produce adaptive binary distributions which can work on systems with fewer dependencies and extend functionality just by installing missing libraries. For now modules _core_, _videoio_ and _highgui_ support this mechanism for some of their dependencies. In some cases it is possible to build plugins together with OpenCV by setting options like `VIDEOIO_PLUGIN_LIST` or `HIGHGUI_PLUGIN_LIST`, more options related to this scenario can be found in the @ref tutorial_config_reference. In other cases plugins should be built separately in their own build procedure and this section describes such standalone build process.
+
+@note It is recommended to use compiler, configuration and build options which are compatible with the ones used for the OpenCV build, otherwise the resulting library can refuse to load or cause other runtime problems. Note that some functionality can be limited or work slower when backends are loaded dynamically due to the extra barrier between OpenCV and the corresponding third-party library.
+
+Build procedure is similar to the main OpenCV build, but you have to use special CMake projects located in corresponding subdirectories, these folders can also contain reference scripts and Docker images. It is important to use `opencv_<module>_<backend>` name prefix for plugins so that loader is able to find them. Each supported prefix can be used to load only one library, however multiple candidates can be probed for a single prefix. For example, you can have _libopencv_videoio_ffmpeg_3.so_ and _libopencv_videoio_ffmpeg_4.so_ plugins and the first one which can be loaded successfully will occupy internal slot and stop probing process. Possible prefixes and project locations are presented in the table below:
+
+| module | backends | location |
+| ------ | -------- | -------- |
+| core | parallel_tbb, parallel_onetbb, parallel_openmp | _opencv/modules/core/misc/plugins_ |
+| highgui | gtk, gtk2, gtk3 | _opencv/modules/highgui/misc/plugins_ |
+| videoio | ffmpeg, gstreamer, intel_mfx, msmf | _opencv/modules/videoio/misc_ |
+
+Example:
+```.sh
+# set-up environment for TBB detection, for example:
+#   export TBB_DIR=<dir-with-tbb-cmake-config>
+cmake -G<generator> \
+    -DOPENCV_PLUGIN_NAME=opencv_core_parallel_tbb_<suffix> \
+    -DOPENCV_PLUGIN_DESTINATION=<dest-folder> \
+    -DCMAKE_BUILD_TYPE=<config> \
+    <opencv>/modules/core/misc/plugins/parallel_tbb
+cmake --build . --config <config>
+```
+
+@note On Windows plugins must be linked with existing OpenCV build. Set `OpenCV_DIR` environment or CMake variable to the directory with _OpenCVConfig.cmake_ file, it can be OpenCV build directory or some path in the location where you performed installation.
diff --git a/doc/tutorials/introduction/linux_install/linux_install.markdown b/doc/tutorials/introduction/linux_install/linux_install.markdown
index af06810cdd..46f67ac61c 100644
--- a/doc/tutorials/introduction/linux_install/linux_install.markdown
+++ b/doc/tutorials/introduction/linux_install/linux_install.markdown
@@ -108,7 +108,7 @@ CMake package files will be located in the build root:
 ## Install
 
 @warning
-Installation process only copies files to predefined locations and do minor patching. Library installed using this method is not integrated into the system package registry and can not be uninstalled automatically. We do not recommend system-wide installation to regular users due to possible conflicts with system packages.
+The installation process only copies files to predefined locations and does minor patching. Installing using this method does not integrate opencv into the system package registry and thus, for example, opencv can not be uninstalled automatically. We do not recommend system-wide installation to regular users due to possible conflicts with system packages.
 
 By default OpenCV will be installed to the `/usr/local` directory, all files will be copied to following locations:
 * `/usr/local/bin` - executable files
diff --git a/modules/3d/include/opencv2/3d.hpp b/modules/3d/include/opencv2/3d.hpp
index 6984b705a2..7fdd4cc4e4 100644
--- a/modules/3d/include/opencv2/3d.hpp
+++ b/modules/3d/include/opencv2/3d.hpp
@@ -1852,6 +1852,33 @@ CV_EXPORTS_W  int estimateAffine3D(InputArray src, InputArray dst,
                                    OutputArray out, OutputArray inliers,
                                    double ransacThreshold = 3, double confidence = 0.99);
 
+/** @brief Computes an optimal affine transformation between two 3D point sets.
+
+It computes \f$R,s,t\f$ minimizing \f$\sum_{i} \| dst_i - (s \cdot R \cdot src_i + t) \|^2\f$
+where \f$R\f$ is a 3x3 rotation matrix, \f$t\f$ is a 3x1 translation vector and \f$s\f$ is a
+scalar size value. This is an implementation of the algorithm by Umeyama \cite umeyama1991least .
+The estimated affine transform has a homogeneous scale which is a subclass of affine
+transformations with 7 degrees of freedom. The paired point sets need to comprise at least 3
+points each.
+
+@param src First input 3D point set.
+@param dst Second input 3D point set.
+@param scale If null is passed, the scale parameter \f$s\f$ will be assumed to be 1.0.
+Else the pointed-to variable will be set to the optimal scale.
+@param force_rotation If true, the returned rotation will never be a reflection.
+This might be unwanted, e.g. when optimizing a transform between a right- and a
+left-handed coordinate system.
+@return 3D affine transformation matrix \f$3 \times 4\f$ of the form
+\f[T =
+\begin{bmatrix}
+R & t\\
+\end{bmatrix}
+\f]
+(\f$R\f$ is stored without the scale factor; apply \f$s\f$ separately when mapping points).
+ */
+CV_EXPORTS_W   cv::Mat estimateAffine3D(InputArray src, InputArray dst,
+                                        CV_OUT double* scale = nullptr, bool force_rotation = true);
+
 /** @brief Computes an optimal translation between two 3D point sets.
  *
  * It computes
diff --git a/modules/3d/src/ptsetreg.cpp b/modules/3d/src/ptsetreg.cpp
index 04f665fdab..6482d872be 100644
--- a/modules/3d/src/ptsetreg.cpp
+++ b/modules/3d/src/ptsetreg.cpp
@@ -899,6 +899,86 @@ int estimateAffine3D(InputArray _from, InputArray _to,
     return createRANSACPointSetRegistrator(makePtr<Affine3DEstimatorCallback>(), 4, ransacThreshold, confidence)->run(dFrom, dTo, _out, _inliers);
 }
 
+Mat    estimateAffine3D(InputArray _from, InputArray _to,
+                        CV_OUT double* _scale, bool force_rotation)
+{ // Closed-form (Umeyama) fit of to ~ scale * R * from + t; returns the 3x4 matrix [R | t].
+    CV_INSTRUMENT_REGION();
+    Mat from = _from.getMat(), to = _to.getMat();
+    int count = from.checkVector(3);
+    // Validate the inputs, then view each set as a count x 3 single-channel matrix of doubles.
+    CV_CheckGE(count, 3, "Umeyama algorithm needs at least 3 points for affine transformation estimation.");
+    CV_CheckEQ(to.checkVector(3), count, "Point sets need to have the same size");
+    from = from.reshape(1, count);
+    to = to.reshape(1, count);
+    if(from.type() != CV_64F)
+        from.convertTo(from, CV_64F);
+    if(to.type() != CV_64F)
+        to.convertTo(to, CV_64F);
+
+    const double one_over_n = 1./count;
+    // Column-wise mean of m: sum over all rows (reduce along dimension 0), scaled by 1/count.
+    const auto colwise_mean = [one_over_n](const Mat& m)
+    {
+        Mat my;
+        reduce(m, my, 0, REDUCE_SUM, CV_64F);
+        return my * one_over_n;
+    };
+    // Returns a count x 3 copy of A with the row vector mean subtracted from every row.
+    const auto demean = [count](const Mat& A, const Mat& mean)
+    {
+        Mat A_centered = Mat::zeros(count, 3, CV_64F);
+        for(int i = 0; i < count; i++)
+        {
+            A_centered.row(i) = A.row(i) - mean;
+        }
+        return A_centered;
+    };
+
+    Mat from_mean = colwise_mean(from);
+    Mat to_mean = colwise_mean(to);
+
+    Mat from_centered = demean(from, from_mean);
+    Mat to_centered = demean(to, to_mean);
+    // 3 x 3 cross-covariance between the centered destination and source points.
+    Mat cov = to_centered.t() * from_centered * one_over_n;
+
+    Mat u,d,vt;
+    SVD::compute(cov, d, u, vt, SVD::MODIFY_A | SVD::FULL_UV);
+    // Rank check: at least two singular values must be non-zero (exact comparison with zero).
+    CV_CheckGE(countNonZero(d), 2, "Points cannot be colinear");
+
+    Mat S = Mat::eye(3, 3, CV_64F);
+    // det(d) can only ever be >=0, so we can always use this here (compared to the original formula by Umeyama)
+    if (force_rotation && (determinant(u) * determinant(vt) < 0))
+    {
+        S.at<double>(2, 2) = -1;
+    }
+    Mat rmat = u*S*vt;
+    // The scale is only estimated when the caller passes a pointer; otherwise it stays at 1.
+    double scale = 1.0;
+    if (_scale)
+    {
+        double var_from = 0.;
+        scale = 0.;
+        for(int i = 0; i < 3; i++)
+        {
+            var_from += norm(from_centered.col(i), NORM_L2SQR); // total squared deviation of the source set
+            scale += d.at<double>(i, 0) * S.at<double>(i, i); // accumulates trace(D * S)
+        }
+        double inverse_var = count / var_from;
+        scale *= inverse_var;
+        *_scale = scale;
+    }
+    Mat new_to = scale * rmat * from_mean.t();
+    // Pack [R | t] with t = to_mean - scale * R * from_mean; R itself is copied unscaled.
+    Mat transform;
+    transform.create(3, 4, CV_64F);
+    Mat r_part(transform(Rect(0, 0, 3, 3)));
+    rmat.copyTo(r_part);
+    transform.col(3) = to_mean.t() - new_to;
+    return transform;
+}
+
 int estimateTranslation3D(InputArray _from, InputArray _to,
                           OutputArray _out, OutputArray _inliers,
                           double ransacThreshold, double confidence)
diff --git a/modules/3d/src/solvepnp.cpp b/modules/3d/src/solvepnp.cpp
index 03fb6f88c0..e358557859 100644
--- a/modules/3d/src/solvepnp.cpp
+++ b/modules/3d/src/solvepnp.cpp
@@ -402,7 +402,14 @@ bool solvePnPRansac( InputArray objectPoints, InputArray imagePoints,
     Ptr<usac::RansacOutput> ransac_output;
     if (usac::run(model_params, imagePoints, objectPoints, model_params->getRandomGeneratorState(),
             ransac_output, cameraMatrix, noArray(), distCoeffs, noArray())) {
-        usac::saveMask(inliers, ransac_output->getInliersMask());
+        if (inliers.needed()) {
+            const auto &inliers_mask = ransac_output->getInliersMask();
+            Mat inliers_;
+            for (int i = 0; i < (int)inliers_mask.size(); i++)
+                if (inliers_mask[i])
+                    inliers_.push_back(i);
+            inliers_.copyTo(inliers);
+        }
         const Mat &model = ransac_output->getModel();
         model.col(0).copyTo(rvec);
         model.col(1).copyTo(tvec);
diff --git a/modules/3d/src/usac/ransac_solvers.cpp b/modules/3d/src/usac/ransac_solvers.cpp
index 0c7637d582..b7f3e6e0c1 100644
--- a/modules/3d/src/usac/ransac_solvers.cpp
+++ b/modules/3d/src/usac/ransac_solvers.cpp
@@ -408,10 +408,11 @@ int mergePoints (InputArray pts1_, InputArray pts2_, Mat &pts, bool ispnp) {
 void saveMask (OutputArray mask, const std::vector<bool> &inliers_mask) {
     if (mask.needed()) {
         const int points_size = (int) inliers_mask.size();
-        mask.create(points_size, 1, CV_8U);
-        auto * maskptr = mask.getMat().ptr<uchar>();
+        Mat tmp_mask(points_size, 1, CV_8U);
+        auto * maskptr = tmp_mask.ptr<uchar>();
         for (int i = 0; i < points_size; i++)
             maskptr[i] = (uchar) inliers_mask[i];
+        tmp_mask.copyTo(mask);
     }
 }
 void setParameters (Ptr<Model> &params, EstimationMethod estimator, const UsacParams &usac_params,
@@ -538,23 +539,26 @@ Mat findEssentialMat (InputArray points1, InputArray points2, InputArray cameraM
 bool solvePnPRansac( InputArray objectPoints, InputArray imagePoints,
        InputArray cameraMatrix, InputArray distCoeffs, OutputArray rvec, OutputArray tvec,
        bool /*useExtrinsicGuess*/, int max_iters, float thr, double conf,
-       OutputArray mask, int method) {
+       OutputArray inliers, int method) {
     Ptr<Model> params;
     setParameters(method, params, cameraMatrix.empty() ? EstimationMethod ::P6P : EstimationMethod ::P3P,
-            thr, max_iters, conf, mask.needed());
+            thr, max_iters, conf, inliers.needed());
     Ptr<RansacOutput> ransac_output;
     if (run(params, imagePoints, objectPoints, params->getRandomGeneratorState(),
             ransac_output, cameraMatrix, noArray(), distCoeffs, noArray())) {
-        saveMask(mask, ransac_output->getInliersMask());
+        if (inliers.needed()) {
+            const auto &inliers_mask = ransac_output->getInliersMask();
+            Mat inliers_;
+            for (int i = 0; i < (int)inliers_mask.size(); i++)
+                if (inliers_mask[i])
+                    inliers_.push_back(i);
+            inliers_.copyTo(inliers);
+        }
         const Mat &model = ransac_output->getModel();
         model.col(0).copyTo(rvec);
         model.col(1).copyTo(tvec);
         return true;
     }
-    if (mask.needed()){
-        mask.create(std::max(objectPoints.getMat().rows, objectPoints.getMat().cols), 1, CV_8U);
-        mask.setTo(Scalar::all(0));
-    }
     return false;
 }
 
diff --git a/modules/3d/test/test_affine3d_estimator.cpp b/modules/3d/test/test_affine3d_estimator.cpp
index 521b01ac08..f5a118da5d 100644
--- a/modules/3d/test/test_affine3d_estimator.cpp
+++ b/modules/3d/test/test_affine3d_estimator.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "test_precomp.hpp"
+#include "opencv2/core/affine.hpp"
 
 namespace opencv_test { namespace {
 
@@ -201,4 +202,25 @@ TEST(Calib3d_EstimateAffine3D, regression_16007)
     EXPECT_EQ(1, res);
 }
 
+TEST(Calib3d_EstimateAffine3D, umeyama_3_pt)
+{ // Minimal 3-point case: points are mapped by a known (R, t) and the estimate must recover it.
+    std::vector<cv::Vec3d> points =   {{{0.80549149, 0.8225781, 0.79949521},
+                                        {0.28906756, 0.57158557, 0.9864789},
+                                        {0.58266182, 0.65474983, 0.25078834}}};
+    cv::Mat R =   (cv::Mat_<double>(3,3) << 0.9689135, -0.0232753, 0.2463025,
+                                            0.0236362,  0.9997195, 0.0014915,
+                                            -0.2462682, 0.0043765, 0.9691918);
+    cv::Vec3d t(1., 2., 3.);
+    cv::Affine3d transform(R, t); // ground-truth transform applied to every input point below
+    std::vector<cv::Vec3d> transformed_points(points.size());
+    std::transform(points.begin(), points.end(), transformed_points.begin(), [transform](const cv::Vec3d v){return transform * v;});
+    double scale; // written by estimateAffine3D because a non-null pointer is passed
+    cv::Mat trafo_est = estimateAffine3D(points, transformed_points, &scale);
+    Mat R_est(trafo_est(Rect(0, 0, 3, 3))); // left 3x3 block of the 3x4 result
+    EXPECT_LE(cvtest::norm(R_est, R, NORM_INF), 1e-6);
+    Vec3d t_est = trafo_est.col(3); // last column of the 3x4 result
+    EXPECT_LE(cvtest::norm(t_est, t, NORM_INF), 1e-6);
+    EXPECT_NEAR(scale, 1.0, 1e-6); // no scaling was applied to the points, so the expected scale is 1
+}
+
 }} // namespace
diff --git a/modules/3d/test/test_usac.cpp b/modules/3d/test/test_usac.cpp
index fb5641bd1e..6e6d8cecf3 100644
--- a/modules/3d/test/test_usac.cpp
+++ b/modules/3d/test/test_usac.cpp
@@ -345,11 +345,16 @@ TEST(usac_P3P, accuracy) {
                    log(1 - pow(inl_ratio, 3 /* sample size */));
 
         for (auto flag : flags) {
+            std::vector<int> inliers;
             cv::Mat rvec, tvec, mask, R, P;
             CV_Assert(cv::solvePnPRansac(obj_pts, img_pts, K1, cv::noArray(), rvec, tvec,
-                    false, (int)max_iters, (float)thr, conf, mask, flag));
+                    false, (int)max_iters, (float)thr, conf, inliers, flag));
             cv::Rodrigues(rvec, R);
             cv::hconcat(K1 * R, K1 * tvec, P);
+            mask.create(pts_size, 1, CV_8U);
+            mask.setTo(Scalar::all(0));
+            for (auto inl : inliers)
+                mask.at<uchar>(inl) = true;
             checkInliersMask(TestSolver ::PnP, inl_size, thr, img_pts, obj_pts, P, mask);
         }
     }
@@ -416,19 +421,27 @@ TEST(usac_testUsacParams, accuracy) {
             // CV_Error(cv::Error::StsError, "Essential matrix estimation failed!");
     }
 
+    std::vector<int> inliers(pts_size);
     // P3P
     inl_size = generatePoints(rng, pts1, pts2, K1, K2, false, pts_size, TestSolver::PnP,
     getInlierRatio(usac_params.maxIterations, 3, usac_params.confidence), 0.01, gt_inliers);
-    CV_Assert(cv::solvePnPRansac(pts2, pts1, K1, dist_coeff, rvec, tvec, mask, usac_params));
+    CV_Assert(cv::solvePnPRansac(pts2, pts1, K1, dist_coeff, rvec, tvec, inliers, usac_params));
     cv::Rodrigues(rvec, R); cv::hconcat(K1 * R, K1 * tvec, model);
+    mask.create(pts_size, 1, CV_8U);
+    mask.setTo(Scalar::all(0));
+    for (auto inl : inliers)
+        mask.at<uchar>(inl) = true;
     checkInliersMask(TestSolver::PnP, inl_size, usac_params.threshold, pts1, pts2, model, mask);
 
     // P6P
     inl_size = generatePoints(rng, pts1, pts2, K1, K2, false, pts_size, TestSolver::PnP,
     getInlierRatio(usac_params.maxIterations, 6, usac_params.confidence), 0.1, gt_inliers);
     cv::Mat K_est;
-    CV_Assert(cv::solvePnPRansac(pts2, pts1, K_est, dist_coeff, rvec, tvec, mask, usac_params));
+    CV_Assert(cv::solvePnPRansac(pts2, pts1, K_est, dist_coeff, rvec, tvec, inliers, usac_params));
     cv::Rodrigues(rvec, R); cv::hconcat(K_est * R, K_est * tvec, model);
+    mask.setTo(Scalar::all(0));
+    for (auto inl : inliers)
+        mask.at<uchar>(inl) = true;
     checkInliersMask(TestSolver::PnP, inl_size, usac_params.threshold, pts1, pts2, model, mask);
 
     // Affine2D
diff --git a/modules/calib/include/opencv2/calib.hpp b/modules/calib/include/opencv2/calib.hpp
index efcdd5d9e1..eab7097d1b 100644
--- a/modules/calib/include/opencv2/calib.hpp
+++ b/modules/calib/include/opencv2/calib.hpp
@@ -738,8 +738,8 @@ concatenated together.
 @param imageSize Size of the image used only to initialize the camera intrinsic matrix.
 @param cameraMatrix Input/output 3x3 floating-point camera intrinsic matrix
 \f$\cameramatrix{A}\f$ . If @ref CALIB_USE_INTRINSIC_GUESS
-and/or @ref CALIB_FIX_ASPECT_RATIO are specified, some or all of fx, fy, cx, cy must be
-initialized before calling the function.
+and/or @ref CALIB_FIX_ASPECT_RATIO, @ref CALIB_FIX_PRINCIPAL_POINT or @ref CALIB_FIX_FOCAL_LENGTH
+are specified, some or all of fx, fy, cx, cy must be initialized before calling the function.
 @param distCoeffs Input/output vector of distortion coefficients
 \f$\distcoeffs\f$.
 @param rvecs Output vector of rotation vectors (@ref Rodrigues ) estimated for each pattern view
@@ -765,7 +765,7 @@ the number of pattern views. \f$R_i, T_i\f$ are concatenated 1x3 vectors.
 fx, fy, cx, cy that are optimized further. Otherwise, (cx, cy) is initially set to the image
 center ( imageSize is used), and focal distances are computed in a least-squares fashion.
 Note, that if intrinsic parameters are known, there is no need to use this function just to
-estimate extrinsic parameters. Use solvePnP instead.
+estimate extrinsic parameters. Use @ref solvePnP instead.
 -   @ref CALIB_FIX_PRINCIPAL_POINT The principal point is not changed during the global
 optimization. It stays at the center or at a different location specified when
  @ref CALIB_USE_INTRINSIC_GUESS is set too.
@@ -775,24 +775,23 @@ ratio fx/fy stays the same as in the input cameraMatrix . When
 ignored, only their ratio is computed and used further.
 -   @ref CALIB_ZERO_TANGENT_DIST Tangential distortion coefficients \f$(p_1, p_2)\f$ are set
 to zeros and stay zero.
+-   @ref CALIB_FIX_FOCAL_LENGTH The focal length is not changed during the global optimization if
+ @ref CALIB_USE_INTRINSIC_GUESS is set.
 -   @ref CALIB_FIX_K1,..., @ref CALIB_FIX_K6 The corresponding radial distortion
 coefficient is not changed during the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is
 set, the coefficient from the supplied distCoeffs matrix is used. Otherwise, it is set to 0.
 -   @ref CALIB_RATIONAL_MODEL Coefficients k4, k5, and k6 are enabled. To provide the
 backward compatibility, this extra flag should be explicitly specified to make the
-calibration function use the rational model and return 8 coefficients. If the flag is not
-set, the function computes and returns only 5 distortion coefficients.
+calibration function use the rational model and return 8 coefficients or more.
 -   @ref CALIB_THIN_PRISM_MODEL Coefficients s1, s2, s3 and s4 are enabled. To provide the
 backward compatibility, this extra flag should be explicitly specified to make the
-calibration function use the thin prism model and return 12 coefficients. If the flag is not
-set, the function computes and returns only 5 distortion coefficients.
+calibration function use the thin prism model and return 12 coefficients or more.
 -   @ref CALIB_FIX_S1_S2_S3_S4 The thin prism distortion coefficients are not changed during
 the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the
 supplied distCoeffs matrix is used. Otherwise, it is set to 0.
 -   @ref CALIB_TILTED_MODEL Coefficients tauX and tauY are enabled. To provide the
 backward compatibility, this extra flag should be explicitly specified to make the
-calibration function use the tilted sensor model and return 14 coefficients. If the flag is not
-set, the function computes and returns only 5 distortion coefficients.
+calibration function use the tilted sensor model and return 14 coefficients.
 -   @ref CALIB_FIX_TAUX_TAUY The coefficients of the tilted sensor model are not changed during
 the optimization. If @ref CALIB_USE_INTRINSIC_GUESS is set, the coefficient from the
 supplied distCoeffs matrix is used. Otherwise, it is set to 0.
@@ -817,12 +816,12 @@ The algorithm performs the following steps:
     zeros initially unless some of CALIB_FIX_K? are specified.
 
 -   Estimate the initial camera pose as if the intrinsic parameters have been already known. This is
-    done using solvePnP .
+    done using @ref solvePnP .
 
 -   Run the global Levenberg-Marquardt optimization algorithm to minimize the reprojection error,
     that is, the total sum of squared distances between the observed feature points imagePoints and
     the projected (using the current estimates for camera parameters and the poses) object points
-    objectPoints. See projectPoints for details.
+    objectPoints. See @ref projectPoints for details.
 
 @note
     If you use a non-square (i.e. non-N-by-N) grid and @ref findChessboardCorners for calibration,
diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
index fe15e51e4e..8365b10ba9 100644
--- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h
+++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h
@@ -142,6 +142,11 @@
 #  define CV_NEON 1
 #endif
 
+#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_vector_071)
+# include<riscv-vector.h>
+# define CV_RVV071 1
+#endif
+
 #if defined(__ARM_NEON__) || defined(__aarch64__)
 #  include <arm_neon.h>
 #endif
@@ -338,6 +343,10 @@ struct VZeroUpperGuard {
 #  define CV_NEON 0
 #endif
 
+#ifndef CV_RVV071
+#  define CV_RVV071 0
+#endif
+
 #ifndef CV_VSX
 #  define CV_VSX 0
 #endif
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 6a55995fc9..0b3d5328f1 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -271,6 +271,8 @@ namespace cv {
 
 #define CV_CPU_MSA              150
 
+#define CV_CPU_RISCVV           170
+
 #define CV_CPU_VSX              200
 #define CV_CPU_VSX3             201
 
@@ -325,6 +327,8 @@ enum CpuFeatures {
 
     CPU_MSA             = 150,
 
+    CPU_RISCVV          = 170,
+
     CPU_VSX             = 200,
     CPU_VSX3            = 201,
 
@@ -681,7 +685,7 @@ __CV_ENUM_FLAGS_BITWISE_XOR_EQ   (EnumType, EnumType)
 #  define CV_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
 #else
   #ifdef OPENCV_FORCE_UNSAFE_XADD
-    CV_INLINE CV_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; }
+    CV_INLINE int CV_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; }
   #else
     #error "OpenCV: can't define safe CV_XADD macro for current platform (unsupported). Define CV_XADD macro through custom port header (see OPENCV_INCLUDE_PORT_FILE)"
   #endif
diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 6f5b8e1788..ac331f2154 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -200,7 +200,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 #   undef CV_RVV
 #endif
 
-#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV) && !defined(CV_FORCE_SIMD128_CPP)
+#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_RVV) && !defined(CV_FORCE_SIMD128_CPP)
 #define CV__SIMD_FORWARD 128
 #include "opencv2/core/hal/intrin_forward.hpp"
 #endif
@@ -214,6 +214,10 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
 
 #include "opencv2/core/hal/intrin_neon.hpp"
 
+#elif CV_RVV071 && !defined(CV_FORCE_SIMD128_CPP)
+#define CV_SIMD128_CPP 0
+#include "opencv2/core/hal/intrin_rvv071.hpp"
+
 #elif CV_VSX && !defined(CV_FORCE_SIMD128_CPP)
 
 #include "opencv2/core/hal/intrin_vsx.hpp"
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 785648575a..e17972a3fc 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -538,49 +538,81 @@ inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                          v_int16x8& c, v_int16x8& d)
 {
     c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_s8(a.val, b.val);
+#else // #if CV_NEON_AARCH64
     d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
+#endif // #if CV_NEON_AARCH64
 }
 
 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                          v_uint16x8& c, v_uint16x8& d)
 {
     c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_u8(a.val, b.val);
+#else // #if CV_NEON_AARCH64
     d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
+#endif // #if CV_NEON_AARCH64
 }
 
 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                          v_int32x4& c, v_int32x4& d)
 {
     c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_s16(a.val, b.val);
+#else // #if CV_NEON_AARCH64
     d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
+#endif // #if CV_NEON_AARCH64
 }
 
 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                          v_uint32x4& c, v_uint32x4& d)
 {
     c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_u16(a.val, b.val);
+#else // #if CV_NEON_AARCH64
     d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
+#endif // #if CV_NEON_AARCH64
 }
 
 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                          v_uint64x2& c, v_uint64x2& d)
 {
     c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_u32(a.val, b.val);
+#else // #if CV_NEON_AARCH64
     d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
+#endif // #if CV_NEON_AARCH64
 }
 
 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
 {
     return v_int16x8(vcombine_s16(
                                   vshrn_n_s32(vmull_s16( vget_low_s16(a.val),  vget_low_s16(b.val)), 16),
-                                  vshrn_n_s32(vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)), 16)
+                                  vshrn_n_s32(
+#if CV_NEON_AARCH64
+                                    vmull_high_s16(a.val, b.val)
+#else // #if CV_NEON_AARCH64
+                                    vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val))
+#endif // #if CV_NEON_AARCH64
+                                    , 16)
                                  ));
 }
 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
 {
     return v_uint16x8(vcombine_u16(
                                    vshrn_n_u32(vmull_u16( vget_low_u16(a.val),  vget_low_u16(b.val)), 16),
-                                   vshrn_n_u32(vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)), 16)
+                                   vshrn_n_u32(
+#if CV_NEON_AARCH64
+                                    vmull_high_u16(a.val, b.val)
+#else // #if CV_NEON_AARCH64
+                                    vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val))
+#endif // #if CV_NEON_AARCH64
+                                    , 16)
                                   ));
 }
 
@@ -1254,29 +1286,56 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
 
 inline unsigned v_reduce_sum(const v_uint8x16& a)
 {
+#if CV_NEON_AARCH64
+    uint16_t t0 = vaddlvq_u8(a.val);
+    return t0;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline int v_reduce_sum(const v_int8x16& a)
 {
+#if CV_NEON_AARCH64
+    int16_t t0 = vaddlvq_s8(a.val);
+    return t0;
+#else // #if CV_NEON_AARCH64
     int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
     int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
     return vget_lane_s32(vpadd_s32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline unsigned v_reduce_sum(const v_uint16x8& a)
 {
+#if CV_NEON_AARCH64
+    uint32_t t0 = vaddlvq_u16(a.val);
+    return t0;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vpaddlq_u16(a.val);
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline int v_reduce_sum(const v_int16x8& a)
 {
+#if CV_NEON_AARCH64
+    int32_t t0 = vaddlvq_s16(a.val);
+    return t0;
+#else // #if CV_NEON_AARCH64
     int32x4_t t0 = vpaddlq_s16(a.val);
     int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
     return vget_lane_s32(vpadd_s32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    return v##vectorfunc##vq_##suffix(a.val); \
+}
+#else // #if CV_NEON_AARCH64
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
@@ -1285,12 +1344,20 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
     a0 = vp##vectorfunc##_##suffix(a0, a0); \
     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
 }
+#endif // #if CV_NEON_AARCH64
 
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, max, max, u8)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, min, min, u8)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)
 
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    return v##vectorfunc##vq_##suffix(a.val); \
+}
+#else // #if CV_NEON_AARCH64
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
@@ -1298,18 +1365,27 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
     a0 = vp##vectorfunc##_##suffix(a0, a0); \
     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
 }
+#endif // #if CV_NEON_AARCH64
 
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
 
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    return v##vectorfunc##vq_##suffix(a.val); \
+}
+#else // #if CV_NEON_AARCH64
 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
 inline scalartype v_reduce_##func(const _Tpvec& a) \
 { \
     _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
     return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
 }
+#endif // #if CV_NEON_AARCH64
 
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
@@ -1322,9 +1398,21 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
 
 inline uint64 v_reduce_sum(const v_uint64x2& a)
-{ return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0); }
+{
+#if CV_NEON_AARCH64
+    return vaddvq_u64(a.val);
+#else // #if CV_NEON_AARCH64
+    return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0);
+#endif // #if CV_NEON_AARCH64
+}
 inline int64 v_reduce_sum(const v_int64x2& a)
-{ return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0); }
+{
+#if CV_NEON_AARCH64
+    return vaddvq_s64(a.val);
+#else // #if CV_NEON_AARCH64
+    return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0);
+#endif // #if CV_NEON_AARCH64
+}
 #if CV_SIMD128_64F
 inline double v_reduce_sum(const v_float64x2& a)
 {
@@ -1335,6 +1423,11 @@ inline double v_reduce_sum(const v_float64x2& a)
 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                  const v_float32x4& c, const v_float32x4& d)
 {
+#if CV_NEON_AARCH64
+    float32x4_t ab = vpaddq_f32(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
+    float32x4_t cd = vpaddq_f32(c.val, d.val); // c0+c1 c2+c3 d0+d1 d2+d3
+    return v_float32x4(vpaddq_f32(ab, cd));  // sumA sumB sumC sumD
+#else // #if CV_NEON_AARCH64
     float32x4x2_t ab = vtrnq_f32(a.val, b.val);
     float32x4x2_t cd = vtrnq_f32(c.val, d.val);
 
@@ -1345,49 +1438,91 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
     float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));
 
     return v_float32x4(vaddq_f32(v0, v1));
+#endif // #if CV_NEON_AARCH64
 }
 
 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
 {
+#if CV_NEON_AARCH64
+    uint8x16_t t0 = vabdq_u8(a.val, b.val);
+    uint16_t t1 = vaddlvq_u8(t0);
+    return t1;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vabdq_u8(a.val, b.val)));
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
 {
+#if CV_NEON_AARCH64
+    uint8x16_t t0 = vreinterpretq_u8_s8(vabdq_s8(a.val, b.val));
+    uint16_t t1 = vaddlvq_u8(t0);
+    return t1;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s8(vabdq_s8(a.val, b.val))));
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
 {
+#if CV_NEON_AARCH64
+    uint16x8_t t0 = vabdq_u16(a.val, b.val);
+    uint32_t t1 = vaddlvq_u16(t0);
+    return t1;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vpaddlq_u16(vabdq_u16(a.val, b.val));
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
 {
+#if CV_NEON_AARCH64
+    uint16x8_t t0 = vreinterpretq_u16_s16(vabdq_s16(a.val, b.val));
+    uint32_t t1 = vaddlvq_u16(t0);
+    return t1;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vpaddlq_u16(vreinterpretq_u16_s16(vabdq_s16(a.val, b.val)));
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
 {
+#if CV_NEON_AARCH64
+    uint32x4_t t0 = vabdq_u32(a.val, b.val);
+    uint32_t t1 = vaddvq_u32(t0);
+    return t1;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vabdq_u32(a.val, b.val);
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
 {
+#if CV_NEON_AARCH64
+    uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
+    uint32_t t1 = vaddvq_u32(t0);
+    return t1;
+#else // #if CV_NEON_AARCH64
     uint32x4_t t0 = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
     uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
     return vget_lane_u32(vpadd_u32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
 {
+#if CV_NEON_AARCH64
+    float32x4_t t0 = vabdq_f32(a.val, b.val);
+    return vaddvq_f32(t0);
+#else // #if CV_NEON_AARCH64
     float32x4_t t0 = vabdq_f32(a.val, b.val);
     float32x2_t t1 = vpadd_f32(vget_low_f32(t0), vget_high_f32(t0));
     return vget_lane_f32(vpadd_f32(t1, t1), 0);
+#endif // #if CV_NEON_AARCH64
 }
 
 inline v_uint8x16 v_popcount(const v_uint8x16& a)
@@ -1409,30 +1544,54 @@ inline v_uint64x2 v_popcount(const v_int64x2& a)
 
 inline int v_signmask(const v_uint8x16& a)
 {
+#if CV_NEON_AARCH64
+    const int8x16_t signPosition = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7};
+    const uint8x16_t byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15};
+    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), signPosition);
+    uint8x16_t v1 = vqtbl1q_u8(v0, byteOrder);
+    uint32_t t0 = vaddlvq_u16(vreinterpretq_u16_u8(v1));
+    return t0;
+#else // #if CV_NEON_AARCH64
     int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
     uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
+#endif // #if CV_NEON_AARCH64
 }
+
 inline int v_signmask(const v_int8x16& a)
 { return v_signmask(v_reinterpret_as_u8(a)); }
 
 inline int v_signmask(const v_uint16x8& a)
 {
+#if CV_NEON_AARCH64
+    const int16x8_t signPosition = {0,1,2,3,4,5,6,7};
+    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), signPosition);
+    uint32_t t0 = vaddlvq_u16(v0);
+    return t0;
+#else // #if CV_NEON_AARCH64
     int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
     uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
+#endif // #if CV_NEON_AARCH64
 }
 inline int v_signmask(const v_int16x8& a)
 { return v_signmask(v_reinterpret_as_u16(a)); }
 
 inline int v_signmask(const v_uint32x4& a)
 {
+#if CV_NEON_AARCH64
+    const int32x4_t signPosition = {0,1,2,3};
+    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), signPosition);
+    uint32_t t0 = vaddvq_u32(v0);
+    return t0;
+#else // #if CV_NEON_AARCH64
     int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
     uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
     uint64x2_t v1 = vpaddlq_u32(v0);
     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
+#endif // #if CV_NEON_AARCH64
 }
 inline int v_signmask(const v_int32x4& a)
 { return v_signmask(v_reinterpret_as_u32(a)); }
@@ -1440,9 +1599,16 @@ inline int v_signmask(const v_float32x4& a)
 { return v_signmask(v_reinterpret_as_u32(a)); }
 inline int v_signmask(const v_uint64x2& a)
 {
+#if CV_NEON_AARCH64
+    const int64x2_t signPosition = {0,1};
+    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition);
+    uint64_t t0 = vaddvq_u64(v0);
+    return t0;
+#else // #if CV_NEON_AARCH64
     int64x1_t m0 = vdup_n_s64(0);
     uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
     return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
+#endif // #if CV_NEON_AARCH64
 }
 inline int v_signmask(const v_int64x2& a)
 { return v_signmask(v_reinterpret_as_u64(a)); }
@@ -1464,19 +1630,31 @@ inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signma
 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
 #endif
 
-#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
-inline bool v_check_all(const v_##_Tpvec& a) \
-{ \
-    _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
-    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
-    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
-} \
-inline bool v_check_any(const v_##_Tpvec& a) \
-{ \
-    _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
-    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
-    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
-}
+#if CV_NEON_AARCH64
+    #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
+    inline bool v_check_all(const v_##_Tpvec& a) \
+    { \
+        return (vminvq_##suffix(a.val) >> shift) != 0; \
+    } \
+    inline bool v_check_any(const v_##_Tpvec& a) \
+    { \
+        return (vmaxvq_##suffix(a.val) >> shift) != 0; \
+    }
+#else // #if CV_NEON_AARCH64
+    #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
+    inline bool v_check_all(const v_##_Tpvec& a) \
+    { \
+        _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
+        uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
+        return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
+    } \
+    inline bool v_check_any(const v_##_Tpvec& a) \
+    { \
+        _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
+        uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
+        return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
+    }
+#endif // #if CV_NEON_AARCH64
 
 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
@@ -1829,6 +2007,37 @@ inline v_int32x4 v_trunc(const v_float64x2& a)
 }
 #endif
 
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
+inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
+                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
+                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
+                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
+{ \
+    /* -- Pass 1: 64b transpose */ \
+    _Tpvec##_t t0 = vreinterpretq_##suffix##32_##suffix##64( \
+                        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
+    _Tpvec##_t t1 = vreinterpretq_##suffix##32_##suffix##64( \
+                        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
+    _Tpvec##_t t2 = vreinterpretq_##suffix##32_##suffix##64( \
+                        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
+    _Tpvec##_t t3 = vreinterpretq_##suffix##32_##suffix##64( \
+                        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
+    /* -- Pass 2: 32b transpose */ \
+    b0.val = vtrn1q_##suffix##32(t0, t1); \
+    b1.val = vtrn2q_##suffix##32(t0, t1); \
+    b2.val = vtrn1q_##suffix##32(t2, t3); \
+    b3.val = vtrn2q_##suffix##32(t2, t3); \
+}
+
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f)
+#else // #if CV_NEON_AARCH64
 #define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                          const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
@@ -1854,6 +2063,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
 OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
+#endif // #if CV_NEON_AARCH64
 
 #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
new file mode 100644
index 0000000000..2bdc622ffd
--- /dev/null
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp
@@ -0,0 +1,2545 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Copyright (C) 2015, PingTouGe Semiconductor Co., Ltd., all rights reserved.
+
+#ifndef OPENCV_HAL_INTRIN_RISCVV_HPP
+#define OPENCV_HAL_INTRIN_RISCVV_HPP
+
+#include <float.h>
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+//////////// Types ////////////
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+
+    v_uint8x16() {}
+    explicit v_uint8x16(vuint8m1_t v) : val(v) {}
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = (vuint8m1_t)vle_v_u8m1((unsigned char*)v, 16);
+    }
+    uchar get0() const
+    {
+        return vmv_x_s_u8m1_u8(val, 16);
+    }
+
+    vuint8m1_t val;
+};
+
+struct v_int8x16
+{
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+
+    v_int8x16() {}
+    explicit v_int8x16(vint8m1_t v) : val(v) {}
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = (vint8m1_t)vle_v_i8m1((schar*)v, 16);
+    }
+    schar get0() const
+    {
+        return vmv_x_s_i8m1_i8(val, 16);
+    }
+
+    vint8m1_t val;
+};
+
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+
+    v_uint16x8() {}
+    explicit v_uint16x8(vuint16m1_t v) : val(v) {}
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = (vuint16m1_t)vle_v_u16m1((unsigned short*)v, 8);
+    }
+    ushort get0() const
+    {
+        return vmv_x_s_u16m1_u16(val, 8);
+    }
+
+    vuint16m1_t val;
+};
+
+struct v_int16x8
+{
+    typedef short lane_type;
+    enum { nlanes = 8 };
+
+    v_int16x8() {}
+    explicit v_int16x8(vint16m1_t v) : val(v) {}
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = (vint16m1_t)vle_v_i16m1((signed short*)v, 8);
+    }
+    short get0() const
+    {
+        return vmv_x_s_i16m1_i16(val, 8);
+    }
+
+    vint16m1_t val;
+};
+
+struct v_uint32x4
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 4 };
+
+    v_uint32x4() {}
+    explicit v_uint32x4(vuint32m1_t v) : val(v) {}
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        unsigned v[] = {v0, v1, v2, v3};
+        val = (vuint32m1_t)vle_v_u32m1((unsigned int*)v, 4);
+    }
+    unsigned get0() const
+    {
+        return vmv_x_s_u32m1_u32(val, 4);
+    }
+
+    vuint32m1_t val;
+};
+
+struct v_int32x4
+{
+    typedef int lane_type;
+    enum { nlanes = 4 };
+
+    v_int32x4() {}
+    explicit v_int32x4(vint32m1_t v) : val(v) {}
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        int v[] = {v0, v1, v2, v3};
+        val = (vint32m1_t)vle_v_i32m1((signed int*)v, 4);
+    }
+    int get0() const
+    {
+        return vmv_x_s_i32m1_i32(val, 4);
+    }
+    vint32m1_t val;
+};
+
+struct v_float32x4
+{
+    typedef float lane_type;
+    enum { nlanes = 4 };
+
+    v_float32x4() {}
+    explicit v_float32x4(vfloat32m1_t v) : val(v) {}
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        float v[] = {v0, v1, v2, v3};
+        val = (vfloat32m1_t)vle_v_f32m1((float*)v, 4);
+    }
+    float get0() const
+    {
+        return vfmv_f_s_f32m1_f32(val, 4);
+    }
+    vfloat32m1_t val;
+};
+
+struct v_uint64x2
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 2 };
+
+    v_uint64x2() {}
+    explicit v_uint64x2(vuint64m1_t v) : val(v) {}
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+        uint64 v[] = {v0, v1};
+        val = (vuint64m1_t)vle_v_u64m1((unsigned long*)v, 2);
+    }
+    uint64 get0() const
+    {
+        return vmv_x_s_u64m1_u64(val, 2);
+    }
+    vuint64m1_t val;
+};
+
+struct v_int64x2
+{
+    typedef int64 lane_type;
+    enum { nlanes = 2 };
+
+    v_int64x2() {}
+    explicit v_int64x2(vint64m1_t v) : val(v) {}
+    v_int64x2(int64 v0, int64 v1)
+    {
+        int64 v[] = {v0, v1};
+        val = (vint64m1_t)vle_v_i64m1((long*)v, 2);
+    }
+    int64 get0() const
+    {
+        return vmv_x_s_i64m1_i64(val, 2);
+    }
+    vint64m1_t val;
+};
+
+struct v_float64x2
+{
+    typedef double lane_type;
+    enum { nlanes = 2 };
+
+    v_float64x2() {}
+    explicit v_float64x2(vfloat64m1_t v) : val(v) {}
+    v_float64x2(double v0, double v1)
+    {
+        double v[] = {v0, v1};
+        val = (vfloat64m1_t)vle_v_f64m1((double*)v, 2);
+    }
+    double get0() const
+    {
+        return vfmv_f_s_f64m1_f64(val, 2);
+    }
+    vfloat64m1_t val;
+};
+
+#define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \
+inline _Tp##m1_t vreinterpretq_##suffix##_##suffix(_Tp##m1_t v) { return v; } \
+inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \
+inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \
+inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \
+inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8((vint16m1_t)(v.val)); } \
+inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \
+inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \
+inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \
+inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2((vint64m1_t)(v.val)); } \
+inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4((vfloat32m1_t)(v.val)); }\
+inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2((vfloat64m1_t)(v.val)); }
+
+
+OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8)
+OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, s8)
+OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16)
+OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, s16)
+OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32)
+OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, s32)
+OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64)
+OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, s64)
+OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64)
+OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32)
+#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) /* v_setzero_* / v_setall_*: splat a scalar over num lanes */ \
+inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); }     \
+inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }
+
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(schar, int8, s8, i8, 16) // schar, not char: plain char is unsigned on RISC-V
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4)
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(uint64, uint64, u64, u64, 2) // lane_type, so the scalar stays 64-bit even where long is 32-bit
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(int64, int64, s64, i64, 2)
+inline v_float32x4 v_setzero_f32() { return v_float32x4(vfmv_v_f_f32m1(0, 4)); } // same splat form as v_setzero_f64
+inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, 4)); }
+
+inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); }
+inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); }
+
+
+#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val); \
+    return a; \
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val, num)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val, num); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vadd_vv_i32m1, 4) // 32/64-bit lanes wrap; only 8/16-bit lanes saturate
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vsub_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vadd_vv_i64m1, 2) // wrapping, matching v_uint64x2 below
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vsub_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
+inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
+{
+    return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4)); // lane-wise a / b over 4 float32 lanes
+}
+inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
+{
+    a.val = vfdiv_vv_f32m1(a.val, b.val, 4); // in-place lane-wise division
+    return a;
+}
+
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2) // float64 arithmetic, 2 lanes
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
+inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
+{
+    return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2)); // lane-wise a / b over 2 float64 lanes
+}
+inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
+{
+    a.val = vfdiv_vv_f64m1(a.val, b.val, 2); // in-place lane-wise division
+    return a;
+}
+// TODO: exp, log, sin, cos
+
+#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) /* defines func(a, b) as intrin(a.val, b.val); no vector-length argument is passed */ \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(_Tpvec, func, intrin, num) /* defines func(a, b) as intrin(a.val, b.val, num), num being the vector length passed to the intrinsic */ \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val, num)); \
+}
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16) // v_min / v_max; unsigned types use the vminu/vmaxu intrinsics
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4) // floating-point variants
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
+
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{
+    return v_float32x4(vfsqrt_v_f32m1(x.val, 4)); // lane-wise square root
+}
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
+    return v_float32x4(vfrdiv_vf_f32m1(vfsqrt_v_f32m1(x.val, 4), 1, 4)); // 1 / sqrt(x): reverse divide of the scalar 1 by each lane
+}
+
+inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    const vfloat32m1_t sum_sq = vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4); // a*a + b*b
+    return v_float32x4(vfsqrt_v_f32m1(sum_sq, 4)); // sqrt of the sum of squares, per lane
+}
+
+inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4)); // a*a + b*b, no square root
+}
+
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return v_float32x4(vfmacc_vv_f32m1(c.val, a.val, b.val, 4)); // a*b + c (c is the accumulator operand)
+}
+
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_int32x4(vmacc_vv_i32m1(c.val, a.val, b.val, 4)); // integer a*b + c
+}
+
+inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return v_fma(a, b, c); // forwards to v_fma
+}
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_fma(a, b, c); // forwards to v_fma
+}
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4); // res  = m0 * v[0]
+    res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);        // res += m1 * v[1]
+    res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);        // res += m2 * v[2]
+    res = vfmacc_vf_f32m1(res, v.val[3], m3.val, 4);        // res += m3 * v[3]
+    return v_float32x4(res);
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{
+    vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4); // res  = m0 * v[0]
+    res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);        // res += m1 * v[1]
+    res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);        // res += m2 * v[2]
+    res = vfadd_vv_f32m1(res, a.val, 4);                    // res += a (v[3] is not used)
+    return v_float32x4(res);
+}
+
+inline v_float64x2 v_sqrt(const v_float64x2& x)
+{
+    return v_float64x2(vfsqrt_v_f64m1(x.val, 2)); // lane-wise square root
+}
+
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{
+    return v_float64x2(vfrdiv_vf_f64m1(vfsqrt_v_f64m1(x.val, 2), 1, 2)); // 1 / sqrt(x): reverse divide of the scalar 1 by each lane
+}
+
+inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    const vfloat64m1_t sum_sq = vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2); // a*a + b*b
+    return v_float64x2(vfsqrt_v_f64m1(sum_sq, 2)); // sqrt of the sum of squares, per lane
+}
+
+inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2)); // a*a + b*b, no square root
+}
+
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return v_float64x2(vfmacc_vv_f64m1(c.val, a.val, b.val, 2)); // a*b + c (c is the accumulator operand)
+}
+
+inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return v_fma(a, b, c); // forwards to v_fma
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) /* &, |, ^ through BIN_OPN plus unary ~ for one integer vector type */ \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
+    inline _Tpvec operator ~ (const _Tpvec & a) \
+    { \
+        return _Tpvec(vnot_v_##suffix(a.val, num)); \
+    }
+
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint8x16, u8m1, 16) // instantiated for every integer vector type
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint16x8, u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint32x4, u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint64x2, u64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int8x16,  i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int16x8,  i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4,  i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2,  i64m1, 2)
+
+#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) /* bitwise bin_op and bin_op= on float32 vectors: operands are cast to int32 vectors, combined, and cast back */ \
+inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+{ \
+    return v_float32x4(vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4))); \
+} \
+inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
+{ \
+    a.val = vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4)); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
+
+inline v_float32x4 operator ~ (const v_float32x4& a)
+{
+    return v_float32x4((vfloat32m1_t)(vnot_v_i32m1((vint32m1_t)(a.val), 4))); // bitwise NOT through the int32 view of the lanes
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) /* bitwise bin_op and bin_op= on float64 vectors: operands are cast to int64 vectors, combined, and cast back */ \
+inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+{ \
+    return v_float64x2(vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2))); \
+} \
+inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
+{ \
+    a.val = vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2)); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
+
+inline v_float64x2 operator ~ (const v_float64x2& a)
+{
+    return v_float64x2((vfloat64m1_t)(vnot_v_i64m1((vint64m1_t)(a.val), 2))); // bitwise NOT through the int64 view of the lanes
+}
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    return v_int16x8(vmulh_vv_i16m1(a.val, b.val, 8)); // signed multiply, keeping the high half of each product
+}
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint16x8(vmulhu_vv_u16m1(a.val, b.val, 8)); // unsigned multiply, keeping the high half of each product
+}
+
+//#define OPENCV_HAL_IMPL_RISCVV_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
+//inline _Tpuvec v_abs(const _Tpsvec& a) {    \
+//    E##xm1_t mask=vmflt_vf_e32xm1_f32m1(x.val, 0.0, 4);
+
+//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint8x16, v_int8x16, u8, s8)
+//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint16x8, v_int16x8, u16, s16)
+//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint32x4, v_int32x4, u32, s32)
+
+inline v_uint32x4 v_abs(v_int32x4 x)
+{
+    const vbool32_t negative = vmslt_vx_i32m1_b32(x.val, 0, 4); // lanes with x < 0
+    return v_uint32x4((vuint32m1_t)vrsub_vx_i32m1_m(negative, x.val, x.val, 0, 4)); // 0 - x in those lanes, x elsewhere
+}
+
+inline v_uint16x8 v_abs(v_int16x8 x)
+{
+    const vbool16_t negative = vmslt_vx_i16m1_b16(x.val, 0, 8); // lanes with x < 0
+    return v_uint16x8((vuint16m1_t)vrsub_vx_i16m1_m(negative, x.val, x.val, 0, 8)); // 0 - x in those lanes, x elsewhere
+}
+
+inline v_uint8x16 v_abs(v_int8x16 x)
+{
+    const vbool8_t negative = vmslt_vx_i8m1_b8(x.val, 0, 16); // lanes with x < 0
+    return v_uint8x16((vuint8m1_t)vrsub_vx_i8m1_m(negative, x.val, x.val, 0, 16)); // 0 - x in those lanes, x elsewhere
+}
+
+inline v_float32x4 v_abs(v_float32x4 x)
+{
+    return (v_float32x4)vfsgnjx_vv_f32m1(x.val, x.val, 4); // sign-inject-xor of x with itself clears the sign bit
+}
+
+inline v_float64x2 v_abs(v_float64x2 x)
+{
+    return (v_float64x2)vfsgnjx_vv_f64m1(x.val, x.val, 2); // sign-inject-xor of x with itself clears the sign bit
+}
+
+inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+{
+    vfloat32m1_t ret = vfsub_vv_f32m1(a.val, b.val, 4); // a - b
+    return (v_float32x4)vfsgnjx_vv_f32m1(ret, ret, 4); // |a - b|
+}
+
+inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+{
+    vfloat64m1_t ret = vfsub_vv_f64m1(a.val, b.val, 2); // a - b
+    return (v_float64x2)vfsgnjx_vv_f64m1(ret, ret, 2); // |a - b|
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(bit, num) /* unsigned |a - b| computed as max(a, b) - min(a, b) */ \
+inline v_uint##bit##x##num v_absdiff(v_uint##bit##x##num a, v_uint##bit##x##num b){    \
+    vuint##bit##m1_t vmax = vmaxu_vv_u##bit##m1(a.val, b.val, num);    \
+    vuint##bit##m1_t vmin = vminu_vv_u##bit##m1(a.val, b.val, num);    \
+    return v_uint##bit##x##num(vsub_vv_u##bit##m1(vmax, vmin, num));\
+}
+
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(8, 16)
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(16, 8)
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(32, 4)
+
+/** Saturating absolute difference: (larger - smaller) with a saturating subtract, result stays signed **/
+inline v_int8x16 v_absdiffs(v_int8x16 a, v_int8x16 b){
+    const vint8m1_t hi = vmax_vv_i8m1(a.val, b.val, 16);
+    const vint8m1_t lo = vmin_vv_i8m1(a.val, b.val, 16);
+    return v_int8x16(vssub_vv_i8m1(hi, lo, 16));
+}
+inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){
+    const vint16m1_t hi = vmax_vv_i16m1(a.val, b.val, 8);
+    const vint16m1_t lo = vmin_vv_i16m1(a.val, b.val, 8);
+    return v_int16x8(vssub_vv_i16m1(hi, lo, 8));
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF(_Tpvec, _Tpv, num) \
+inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b) { /* signed |a - b|: (larger - smaller), reinterpreted as unsigned */ \
+    const vint##_Tpv##_t hi = vmax_vv_i##_Tpv(a.val, b.val, num); \
+    const vint##_Tpv##_t lo = vmin_vv_i##_Tpv(a.val, b.val, num); \
+    return v_uint##_Tpvec((vuint##_Tpv##_t)vsub_vv_i##_Tpv(hi, lo, num)); \
+}
+
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF(16x8, 16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF(32x4, 32m1, 4)
+
+//  Multiply and expand: widening multiply, c gets the first m1 register of the result, d the second
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)
+{
+    // 16 int8 x int8 products held as int16 in one LMUL=2 register group.
+    const vint16m2_t wide = vwmul_vv_i16m2(a.val, b.val, 16);
+    c.val = vget_i16m2_i16m1(wide, 0);
+    d.val = vget_i16m2_i16m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)
+{
+    // 16 uint8 x uint8 products held as uint16 in one LMUL=2 register group.
+    const vuint16m2_t wide = vwmulu_vv_u16m2(a.val, b.val, 16);
+    c.val = vget_u16m2_u16m1(wide, 0);
+    d.val = vget_u16m2_u16m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{
+    // 8 int16 x int16 products held as int32 in one LMUL=2 register group.
+    const vint32m2_t wide = vwmul_vv_i32m2(a.val, b.val, 8);
+    c.val = vget_i32m2_i32m1(wide, 0);
+    d.val = vget_i32m2_i32m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{
+    // 8 uint16 x uint16 products held as uint32 in one LMUL=2 register group.
+    const vuint32m2_t wide = vwmulu_vv_u32m2(a.val, b.val, 8);
+    c.val = vget_u32m2_u32m1(wide, 0);
+    d.val = vget_u32m2_u32m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b,
+                         v_int64x2& c, v_int64x2& d)
+{
+    // 4 int32 x int32 products held as int64 in one LMUL=2 register group.
+    const vint64m2_t wide = vwmul_vv_i64m2(a.val, b.val, 4);
+    c.val = vget_i64m2_i64m1(wide, 0);
+    d.val = vget_i64m2_i64m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    // 4 uint32 x uint32 products held as uint64 in one LMUL=2 register group.
+    const vuint64m2_t wide = vwmulu_vv_u64m2(a.val, b.val, 4);
+    c.val = vget_u64m2_u64m1(wide, 0);
+    d.val = vget_u64m2_u64m1(wide, 1);
+}
+
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16) // *_wrap: plain vadd/vsub/vmul rather than the saturating forms
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
+//////// Dot Product ////////
+// 16 >> 32
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{
+    // Widen the 8 products; the gather puts even-indexed ones in the low half and odd-indexed ones in the high half.
+    const vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    const vint32m2_t split = vrgather_vv_i32m2(prod, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
+    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(split, 0), vget_i32m2_i32m1(split, 1), 4));
+}
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    // Adjacent-pair sums of the widened products, plus the accumulator c.
+    const vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    const vint32m2_t split = vrgather_vv_i32m2(prod, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(split, 0),vget_i32m2_i32m1(split, 1), 4), c.val, 4));
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+    // Widen the 4 products; the gather puts indices 0 and 2 in the low half, 1 and 3 in the high half.
+    const vint64m2_t prod = vwmul_vv_i64m2(a.val, b.val, 4);
+    const vint64m2_t split = vrgather_vv_i64m2(prod, (vuint64m2_t){0, 2, 1, 3}, 4);
+    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(split, 0), vget_i64m2_i64m1(split, 1), 2));
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    // Adjacent-pair sums of the widened products, plus the accumulator c.
+    const vint64m2_t prod = vwmul_vv_i64m2(a.val, b.val, 4);
+    const vint64m2_t split = vrgather_vv_i64m2(prod, (vuint64m2_t){0, 2, 1, 3}, 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(split, 0), vget_i64m2_i64m1(split, 1), 2), c.val, 2));
+}
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{
+    // Widen the 16 products, then gather so the low half holds indices 4k, 4k+1
+    // and the high half 4k+2, 4k+3; two rounds of half-adds sum each group of four.
+    const vuint16m2_t prod = vwmulu_vv_u16m2(a.val, b.val, 16);
+    const vuint16m2_t perm = vrgather_vv_u16m2(prod, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
+    const vuint32m2_t pairs = vwaddu_vv_u32m2(vget_u16m2_u16m1(perm, 0), vget_u16m2_u16m1(perm, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(pairs, 0), vget_u32m2_u32m1(pairs, 1), 4));
+}
+
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
+                                   const v_uint32x4& c)
+{
+    // Sums of each group of four adjacent products (gather + two half-adds),
+    // with the accumulator c added at the end.
+    const vuint16m2_t prod = vwmulu_vv_u16m2(a.val, b.val, 16);
+    const vuint16m2_t perm = vrgather_vv_u16m2(prod, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
+    const vuint32m2_t pairs = vwaddu_vv_u32m2(vget_u16m2_u16m1(perm, 0), vget_u16m2_u16m1(perm, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(pairs, 0), vget_u32m2_u32m1(pairs, 1), 4), c.val, 4));
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+    // Signed variant: widen the 16 products, gather indices 4k, 4k+1 into the low
+    // half and 4k+2, 4k+3 into the high half, then sum each group of four.
+    const vint16m2_t prod = vwmul_vv_i16m2(a.val, b.val, 16);
+    const vint16m2_t perm = vrgather_vv_i16m2(prod, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
+    const vint32m2_t pairs = vwadd_vv_i32m2(vget_i16m2_i16m1(perm, 0), vget_i16m2_i16m1(perm, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(pairs, 0), vget_i32m2_i32m1(pairs, 1), 4));
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
+                                   const v_int32x4& c)
+{
+    // Signed sums of each group of four adjacent products (gather + two half-adds),
+    // with the accumulator c added at the end.
+    const vint16m2_t prod = vwmul_vv_i16m2(a.val, b.val, 16);
+    const vint16m2_t perm = vrgather_vv_i16m2(prod, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
+    const vint32m2_t pairs = vwadd_vv_i32m2(vget_i16m2_i16m1(perm, 0), vget_i16m2_i16m1(perm, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(pairs, 0), vget_i32m2_i32m1(pairs, 1), 4), c.val, 4));
+}
+
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // 16 >> 64: widen the 8 products, gather indices 4k, 4k+1 into the low half
+    // and 4k+2, 4k+3 into the high half, then sum each group of four.
+    const vuint32m2_t prod = vwmulu_vv_u32m2(a.val, b.val, 8);
+    const vuint32m2_t perm = vrgather_vv_u32m2(prod, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
+    const vuint64m2_t pairs = vwaddu_vv_u64m2(vget_u32m2_u32m1(perm, 0), vget_u32m2_u32m1(perm, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(pairs, 0), vget_u64m2_u64m1(pairs, 1), 2));
+}
+
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
+                                   const v_uint64x2& c)
+{
+    // Sums of each group of four adjacent products (gather + two half-adds),
+    // with the accumulator c added at the end.
+    const vuint32m2_t prod = vwmulu_vv_u32m2(a.val, b.val, 8);
+    const vuint32m2_t perm = vrgather_vv_u32m2(prod, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
+    const vuint64m2_t pairs = vwaddu_vv_u64m2(vget_u32m2_u32m1(perm, 0), vget_u32m2_u32m1(perm, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(pairs, 0), vget_u64m2_u64m1(pairs, 1), 2), c.val, 2));
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    // Signed 16 >> 64: widen the 8 products, gather indices 4k, 4k+1 into the low
+    // half and 4k+2, 4k+3 into the high half, then sum each group of four.
+    const vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    const vint32m2_t perm = vrgather_vv_i32m2(prod, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
+    const vint64m2_t pairs = vwadd_vv_i64m2(vget_i32m2_i32m1(perm, 0), vget_i32m2_i32m1(perm, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(pairs, 0), vget_i64m2_i64m1(pairs, 1), 2));
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
+                                   const v_int64x2& c)
+{
+    // Signed sums of each group of four adjacent products (gather + two half-adds),
+    // with the accumulator c added at the end.
+    const vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    const vint32m2_t perm = vrgather_vv_i32m2(prod, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
+    const vint64m2_t pairs = vwadd_vv_i64m2(vget_i32m2_i32m1(perm, 0), vget_i32m2_i32m1(perm, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(pairs, 0), vget_i64m2_i64m1(pairs, 1), 2), c.val, 2));
+}
+
+//////// Fast Dot Product ////////
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    // No gather: the two halves of the widened products are added as stored.
+    const vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(prod, 0), vget_i32m2_i32m1(prod, 1), 4));
+}
+
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    // Halves of the widened products added as stored, then the accumulator c.
+    const vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(prod, 0), vget_i32m2_i32m1(prod, 1), 4), c.val, 4));
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    // No gather: the two halves of the widened products are added as stored.
+    const vint64m2_t prod = vwmul_vv_i64m2(a.val, b.val, 4);
+    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(prod, 0), vget_i64m2_i64m1(prod, 1), 2));
+}
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    // Vector lengths: 4 for the widening multiply (four int32 inputs), 2 for the int64 adds (two lanes per result).
+    const vint64m2_t prod = vwmul_vv_i64m2(a.val, b.val, 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(prod, 0), vget_i64m2_i64m1(prod, 1), 2), c.val, 2));
+}
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{
+    // No gather: halves are added as stored, so each output lane sums strided
+    // (not adjacent) products.
+    const vuint16m2_t prod = vwmulu_vv_u16m2(a.val, b.val, 16);
+    const vuint32m2_t sums = vwaddu_vv_u32m2(vget_u16m2_u16m1(prod, 0), vget_u16m2_u16m1(prod, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(sums, 0), vget_u32m2_u32m1(sums, 1), 4));
+}
+
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{
+    // No gather: halves are added as stored (strided products per lane),
+    // then the accumulator c is added.
+    const vuint16m2_t prod = vwmulu_vv_u16m2(a.val, b.val, 16);
+    const vuint32m2_t sums = vwaddu_vv_u32m2(vget_u16m2_u16m1(prod, 0), vget_u16m2_u16m1(prod, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(sums, 0), vget_u32m2_u32m1(sums, 1), 4), c.val, 4));
+}
+
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{
+    // Signed variant, no gather: halves are added as stored, so each output
+    // lane sums strided (not adjacent) products.
+    const vint16m2_t prod = vwmul_vv_i16m2(a.val, b.val, 16);
+    const vint32m2_t sums = vwadd_vv_i32m2(vget_i16m2_i16m1(prod, 0), vget_i16m2_i16m1(prod, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(sums, 0), vget_i32m2_i32m1(sums, 1), 4));
+}
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{
+    // Signed variant, no gather: halves are added as stored (strided products
+    // per lane), then the accumulator c is added.
+    const vint16m2_t prod = vwmul_vv_i16m2(a.val, b.val, 16);
+    const vint32m2_t sums = vwadd_vv_i32m2(vget_i16m2_i16m1(prod, 0), vget_i16m2_i16m1(prod, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(sums, 0), vget_i32m2_i32m1(sums, 1), 4), c.val, 4));
+}
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // No gather: halves are added as stored, so each output lane sums strided
+    // (not adjacent) products.
+    const vuint32m2_t prod = vwmulu_vv_u32m2(a.val, b.val, 8);
+    const vuint64m2_t sums = vwaddu_vv_u64m2(vget_u32m2_u32m1(prod, 0), vget_u32m2_u32m1(prod, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(sums, 0), vget_u64m2_u64m1(sums, 1), 2));
+}
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{
+    // No gather: halves are added as stored (strided products per lane),
+    // then the accumulator c is added.
+    const vuint32m2_t prod = vwmulu_vv_u32m2(a.val, b.val, 8);
+    const vuint64m2_t sums = vwaddu_vv_u64m2(vget_u32m2_u32m1(prod, 0), vget_u32m2_u32m1(prod, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(sums, 0), vget_u64m2_u64m1(sums, 1), 2), c.val, 2));
+}
+
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    // Signed variant, no gather: halves are added as stored, so each output
+    // lane sums strided (not adjacent) products.
+    const vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    const vint64m2_t sums = vwadd_vv_i64m2(vget_i32m2_i32m1(prod, 0), vget_i32m2_i32m1(prod, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(sums, 0), vget_i64m2_i64m1(sums, 1), 2));
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{
+    // Signed variant, no gather: halves are added as stored (strided products
+    // per lane), then the accumulator c is added.
+    const vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    const vint64m2_t sums = vwadd_vv_i64m2(vget_i32m2_i32m1(prod, 0), vget_i32m2_i32m1(prod, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(sums, 0), vget_i64m2_i64m1(sums, 1), 2), c.val, 2));
+}
+
+
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(_Tpvec, _Tpvec2, len, scalartype, func, intrin, num) /* v_reduce_<func> through a widening reduction into the _Tpvec2 element type */ \
+inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
+{\
+    v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); /* zero seed in the wider type */ \
+    val = intrin(val, a.val, val, num);    \
+    return vmv_x_s_##len##m1_##len(val, num);    /* scalar read-back, converted to scalartype */ \
+}
+
+
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num) /* v_reduce_<func> through a same-width reduction seeded with a itself */ \
+inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
+{\
+    v##_Tpvec##m1_t val = (v##_Tpvec##m1_t)vmv_v_x_i8m1(0, num); \
+    val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num);    \
+    return val[0];    /* element 0 of the reduction result */ \
+}
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16) // v_reduce_sum for 8/16/32-bit integer vectors
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int32, int64, i64, int, sum, vwredsum_vs_i32m1_i64m1, 4)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint8, uint16, u16, unsigned, sum, vwredsumu_vs_u8m1_u16m1, 16)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint16, uint32, u32, unsigned, sum, vwredsumu_vs_u16m1_u32m1, 8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu_vs_u32m1_u64m1, 4)
+inline float v_reduce_sum(const v_float32x4& a)
+{
+    const vfloat32m1_t zero = vfmv_v_f_f32m1(0.0, 4); // zero seed for the reduction
+    const vfloat32m1_t total = vfredsum_vs_f32m1_f32m1(zero, a.val, zero, 4);
+    return vfmv_f_s_f32m1_f32(total, 4); // scalar read-back of the reduction result
+}
+inline double v_reduce_sum(const v_float64x2& a)
+{
+    const vfloat64m1_t zero = vfmv_v_f_f64m1(0.0, 2); // zero seed for the reduction
+    const vfloat64m1_t total = vfredsum_vs_f64m1_f64m1(zero, a.val, zero, 2);
+    return vfmv_f_s_f64m1_f64(total, 2); // scalar read-back of the reduction result
+}
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{ return vext_x_v_u64m1_u64((vuint64m1_t)a.val, 0, 2)+vext_x_v_u64m1_u64((vuint64m1_t)a.val, 1, 2); } // element 0 + element 1
+
+inline int64 v_reduce_sum(const v_int64x2& a)
+{ return vext_x_v_i64m1_i64((vint64m1_t)a.val, 0, 2)+vext_x_v_i64m1_i64((vint64m1_t)a.val, 1, 2); } // element 0 + element 1
+
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func)    /* v_reduce_<func> for the int8..int64, uint8..uint32 and float32 vector types */ \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8,  i8, int, func, red##func, 16)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2)    /* NOTE(review): 64-bit lanes with an int return type - confirm the narrowing is intended */ \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8,  u8, unsigned, func, red##func##u, 16)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max) // v_reduce_max
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min) // v_reduce_min
+
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    // Each input is reduced on its own against a zero seed; element 0 of every
+    // reduction result carries that input's total, and the four totals are
+    // packed in argument order.
+    const vfloat32m1_t zero = vfmv_v_f_f32m1(0.0, 4);
+    const vfloat32m1_t sum_a = vfredsum_vs_f32m1_f32m1(zero, a.val, zero, 4);
+    const vfloat32m1_t sum_b = vfredsum_vs_f32m1_f32m1(zero, b.val, zero, 4);
+    const vfloat32m1_t sum_c = vfredsum_vs_f32m1_f32m1(zero, c.val, zero, 4);
+    const vfloat32m1_t sum_d = vfredsum_vs_f32m1_f32m1(zero, d.val, zero, 4);
+    return v_float32x4(sum_a[0], sum_b[0], sum_c[0], sum_d[0]);
+}
+
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4); // zero seed for the final reduction
+    vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4); // a - b
+    vbool32_t mask=vmflt_vf_f32m1_b32(x, 0, 4); // lanes where the difference is negative
+    vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4); // 0 - x in masked lanes, x elsewhere
+    a0 = vfredsum_vs_f32m1_f32m1(a0, val, a0, 4); // sum of the absolute differences
+    return a0[0];
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) /* sum of absolute differences: v_reduce_sum(v_absdiff(a, b)), _Tpvec2 being the v_absdiff result type */ \
+inline unsigned v_reduce_sad(const _Tpvec& a, const _Tpvec&b){    \
+    _Tpvec2 x = v_absdiff(a, b);    \
+    return v_reduce_sum(x);    \
+}
+
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int8x16, v_uint8x16)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint8x16, v_uint8x16)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int16x8, v_uint16x8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint16x8, v_uint16x8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4)
+
+#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    /* lanes where a == b become -1 (all bits set), the rest 0 */ \
+    return _Tpvec(vmerge_vxm_##_Tp(vmseq_vv_##_Tp##_b##_T(a.val, b.val, num), vmv_v_x_##_Tp(0, num), -1, num)); \
+} \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    /* lanes where a != b become -1, the rest 0 */ \
+    return _Tpvec(vmerge_vxm_##_Tp(vmsne_vv_##_Tp##_b##_T(a.val, b.val, num), vmv_v_x_##_Tp(0, num), -1, num)); \
+} \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    /* lanes where a < b become -1, the rest 0 */ \
+    return _Tpvec(vmerge_vxm_##_Tp(vmslt##uv##_Tp##_b##_T(a.val, b.val, num), vmv_v_x_##_Tp(0, num), -1, num)); \
+} \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    /* a > b evaluated as b < a */ \
+    return _Tpvec(vmerge_vxm_##_Tp(vmslt##uv##_Tp##_b##_T(b.val, a.val, num), vmv_v_x_##_Tp(0, num), -1, num)); \
+} \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    /* lanes where a <= b become -1, the rest 0 */ \
+    return _Tpvec(vmerge_vxm_##_Tp(vmsle##uv##_Tp##_b##_T(a.val, b.val, num), vmv_v_x_##_Tp(0, num), -1, num)); \
+} \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    /* a >= b evaluated as b <= a */ \
+    return _Tpvec(vmerge_vxm_##_Tp(vmsle##uv##_Tp##_b##_T(b.val, a.val, num), vmv_v_x_##_Tp(0, num), -1, num)); \
+}
+
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int8x16, i8m1,  8, 16, _vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int16x8, i16m1, 16, 8, _vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int32x4, i32m1, 32, 4, _vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int64x2, i64m1, 64, 2, _vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint8x16, u8m1, 8, 16, u_vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint16x8, u16m1, 16, 8, u_vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
+
+//TODO: ==
+inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
+{
+    const vbool32_t hit = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
+    const vint32m1_t bits = vmerge_vxm_i32m1(hit, vmv_v_x_i32m1(0, 4), -1, 4); // -1 where a == b, else 0
+    return v_float32x4((vfloat32m1_t)bits);
+}
+inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
+{
+    const vbool32_t hit = vmfne_vv_f32m1_b32(a.val, b.val, 4);
+    const vint32m1_t bits = vmerge_vxm_i32m1(hit, vmv_v_x_i32m1(0, 4), -1, 4); // -1 where a != b, else 0
+    return v_float32x4((vfloat32m1_t)bits);
+}
+inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
+{
+    const vbool32_t hit = vmflt_vv_f32m1_b32(a.val, b.val, 4);
+    const vint32m1_t bits = vmerge_vxm_i32m1(hit, vmv_v_x_i32m1(0, 4), -1, 4); // -1 where a < b, else 0
+    return v_float32x4((vfloat32m1_t)bits);
+}
+inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
+{
+    const vbool32_t hit = vmfle_vv_f32m1_b32(a.val, b.val, 4);
+    const vint32m1_t bits = vmerge_vxm_i32m1(hit, vmv_v_x_i32m1(0, 4), -1, 4); // -1 where a <= b, else 0
+    return v_float32x4((vfloat32m1_t)bits);
+}
+inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
+{
+    const vbool32_t hit = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
+    const vint32m1_t bits = vmerge_vxm_i32m1(hit, vmv_v_x_i32m1(0, 4), -1, 4); // -1 where a > b, else 0
+    return v_float32x4((vfloat32m1_t)bits);
+}
+inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
+{
+    const vbool32_t hit = vmfge_vv_f32m1_b32(a.val, b.val, 4);
+    const vint32m1_t bits = vmerge_vxm_i32m1(hit, vmv_v_x_i32m1(0, 4), -1, 4); // -1 where a >= b, else 0
+    return v_float32x4((vfloat32m1_t)bits);
+}
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{
+    const vbool32_t ordered = vmford_vv_f32m1_b32(a.val, a.val, 4); // ordered compare of a with itself
+    const vint32m1_t bits = vmerge_vxm_i32m1(ordered, vmv_v_x_i32m1(0, 4), -1, 4); // -1 where ordered, else 0
+    return v_float32x4((vfloat32m1_t)bits);
+}
+
+//TODO: ==
+inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
+    return v_float64x2((vfloat64m1_t)res);
+}
+inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
+    return v_float64x2((vfloat64m1_t)res);
+}
+inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)  // lane-wise a < b over 2 doubles
+{
+    vbool64_t lt = vmflt_vv_f64m1_b64(a.val, b.val, 2);  // one predicate bit per lane
+    vint64m1_t bits = vmerge_vxm_i64m1(lt, vmv_v_x_i64m1(0, 2), -1, 2);  // -1 where the predicate is set, 0 elsewhere; the integer splat gets an integer zero
+    return v_float64x2((vfloat64m1_t)bits);  // handed back through a cast to the double vector type
+}
+inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)  // lane-wise a <= b over 2 doubles
+{
+    vbool64_t le = vmfle_vv_f64m1_b64(a.val, b.val, 2);  // one predicate bit per lane
+    vint64m1_t bits = vmerge_vxm_i64m1(le, vmv_v_x_i64m1(0, 2), -1, 2);  // -1 where the predicate is set, 0 elsewhere; the integer splat gets an integer zero
+    return v_float64x2((vfloat64m1_t)bits);  // handed back through a cast to the double vector type
+}
+inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)  // lane-wise a > b over 2 doubles
+{
+    vbool64_t gt = vmfgt_vv_f64m1_b64(a.val, b.val, 2);  // one predicate bit per lane
+    vint64m1_t bits = vmerge_vxm_i64m1(gt, vmv_v_x_i64m1(0, 2), -1, 2);  // -1 where the predicate is set, 0 elsewhere; the integer splat gets an integer zero
+    return v_float64x2((vfloat64m1_t)bits);  // handed back through a cast to the double vector type
+}
+inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)  // lane-wise a >= b over 2 doubles
+{
+    vbool64_t ge = vmfge_vv_f64m1_b64(a.val, b.val, 2);  // one predicate bit per lane
+    vint64m1_t bits = vmerge_vxm_i64m1(ge, vmv_v_x_i64m1(0, 2), -1, 2);  // -1 where the predicate is set, 0 elsewhere; the integer splat gets an integer zero
+    return v_float64x2((vfloat64m1_t)bits);  // handed back through a cast to the double vector type
+}
+inline v_float64x2 v_not_nan(const v_float64x2& a)  // lane mask built from comparing a with itself
+{
+    vbool64_t ordered = vmford_vv_f64m1_b64(a.val, a.val, 2);  // "ordered" compare of each lane against itself
+    vint64m1_t bits = vmerge_vxm_i64m1(ordered, vmv_v_x_i64m1(0, 2), -1, 2);  // -1 where the predicate is set, 0 elsewhere; the integer splat gets an integer zero
+    return v_float64x2((vfloat64m1_t)bits);  // handed back through a cast to the double vector type
+}
+#define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) /* defines v_transpose4x4 for v_<_Tp>32x4; _T is the intrinsic type suffix */ \
+inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
+                         const v_##_Tp##32x4& a2, const v_##_Tp##32x4& a3, \
+                         v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
+                         v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
+{ \
+    v##_Tp##32m4_t val = vundefined_##_T##m4(); /* 4-register group that receives a0..a3 back to back */   \
+    val = vset_##_T##m4(val, 0, a0.val);    \
+    val = vset_##_T##m4(val, 1, a1.val);    \
+    val = vset_##_T##m4(val, 2, a2.val);    \
+    val = vset_##_T##m4(val, 3, a3.val);   \
+    val = vrgather_vv_##_T##m4(val, (vuint32m4_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);    \
+    b0.val = vget_##_T##m4_##_T##m1(val, 0); /* gathered elements 0,4,8,12: lane 0 of a0..a3 */  \
+    b1.val = vget_##_T##m4_##_T##m1(val, 1); /* gathered elements 1,5,9,13: lane 1 of a0..a3 */  \
+    b2.val = vget_##_T##m4_##_T##m1(val, 2); /* gathered elements 2,6,10,14: lane 2 of a0..a3 */  \
+    b3.val = vget_##_T##m4_##_T##m1(val, 3); /* gathered elements 3,7,11,15: lane 3 of a0..a3 */  \
+}
+OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)  // v_uint32x4
+OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)  // v_int32x4
+OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)  // v_float32x4
+
+
+#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) /* left shifts for _Tpvec over num lanes; suffix is not referenced in the body */ \
+inline _Tpvec operator << (const _Tpvec& a, int n) /* shift count given at run time */ \
+{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) /* shift count given as template argument */ \
+{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }
+
+#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) /* right shifts for _Tpvec; intric selects the shift intrinsic (srl or sra in the instantiations) */ \
+inline _Tpvec operator >> (const _Tpvec& a, int n) /* shift count given at run time */ \
+{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) /* shift count given as template argument */ \
+{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
+template<int n> inline _Tpvec v_rshr(const _Tpvec& a) /* adds 1<<(n-1) before shifting; NOTE(review): n == 0 would evaluate 1<<-1, and the add happens at lane width - confirm callers */ \
+{ return _Tpvec((v##intric##_vx_##_T##m1(vadd_vx_##_T##m1(a.val, 1<<(n-1), num), n, num))); }
+
+// trade efficiency for convenience
+#define OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(suffix, _T, num, intrin) /* emits both shift families for v_<suffix>x<num> */ \
+OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(v_##suffix##x##num, suffix, _T, num) \
+OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(v_##suffix##x##num, suffix, _T, num, intrin)
+
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint8, u8, 16, srl)  // unsigned types use srl
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint16, u16, 8, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint32, u32, 4, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint64, u64, 2, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int8, i8, 16, sra)  // signed types use sra
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int16, i16, 8, sra)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int32, i32, 4, sra)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int64, i64, 2, sra)
+
+#if 0  // compiled out: the VUP* index-list helpers below are never defined
+#define VUP4(n) {0, 1, 2, 3}  // the parameter n is not used by any of these expansions
+#define VUP8(n) {0, 1, 2, 3, 4, 5, 6, 7}
+#define VUP16(n) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+#define VUP2(n) {0, 1}
+#endif
+#define OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(_Tpvec, suffix, _T, num, num2, vmv, len) /* lane rotations for _Tpvec: num lanes per vector, num2 lanes per two-register group */ \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) /* a slid up by n lanes over a zero-filled vector */ \
+{ \
+    suffix##m1_t shifted = vmv##_##_T##m1(0, num); \
+    shifted = vslideup_vx_##_T##m1_m(vmset_m_##len(num), shifted, a.val, n, num); \
+    return _Tpvec(shifted); \
+} \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) /* a slid down by n lanes */ \
+{ \
+    return _Tpvec(vslidedown_vx_##_T##m1(a.val, n, num)); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) /* zero-lane case: the input is returned as is */ \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) /* low register of the group {a, b} after sliding it down by n lanes */ \
+{ \
+    suffix##m2_t joined = vundefined_##_T##m2(); \
+    joined = vset_##_T##m2(joined, 0, a.val); /* a is the low register */ \
+    joined = vset_##_T##m2(joined, 1, b.val); /* b is the high register */ \
+    joined = vslidedown_vx_##_T##m2(joined, n, num2); \
+    return _Tpvec(vget_##_T##m2_##_T##m1(joined, 0)); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) /* high register of the group {b, a} after sliding it up by n lanes */ \
+{ \
+    suffix##m2_t joined = vundefined_##_T##m2(); \
+    joined = vset_##_T##m2(joined, 0, b.val); /* b is the low register */ \
+    joined = vset_##_T##m2(joined, 1, a.val); /* a is the high register */ \
+    joined = vslideup_vx_##_T##m2(joined, n, num2); \
+    return _Tpvec(vget_##_T##m2_##_T##m1(joined, 1)); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) /* zero-lane case: a is returned as is */ \
+{ \
+    CV_UNUSED(b); return a; \
+}
+
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint8x16, vuint8, u8, 16, 32, vmv_v_x, b8)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int8x16, vint8, i8, 16, 32, vmv_v_x, b8)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint16x8, vuint16, u16, 8, 16, vmv_v_x, b16)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int16x8, vint16, i16, 8, 16, vmv_v_x, b16)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint32x4, vuint32, u32, 4, 8, vmv_v_x, b32)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int32x4, vint32, i32, 4, 8, vmv_v_x, b32)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint64x2, vuint64, u64, 2, 4, vmv_v_x, b64)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)
+
+#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num) /* load/store family for _Tpvec: num lanes in total, hnum lanes per half */ \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) /* one 64-bit word read from ptr0, one from ptr1 */ \
+{ \
+  typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; /* 64-bit type declared with CV_DECL_ALIGNED(1) for the two reads below */ \
+  vuint64m1_t tmp = {*(unaligned_uint64*)ptr0, *(unaligned_uint64*)ptr1};\
+    return _Tpvec(_Tp2##_t(tmp)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) /* loads hnum elements only */ \
+{ return _Tpvec(vle_v_##len(ptr, hnum)); }\
+inline _Tpvec v_load_aligned(const _Tp* ptr) /* same load intrinsic and length as v_load */ \
+{ return _Tpvec(vle_v_##len(ptr, num)); } \
+inline _Tpvec v_load(const _Tp* ptr) /* loads num elements */ \
+{ return _Tpvec((_Tp2##_t)vle_v_##len((const _Tp *)ptr, num)); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* stores the first hnum lanes */ \
+{ vse_v_##len(ptr, a.val, hnum);}\
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) /* stores hnum lanes taken from lane hnum onward */ \
+{ \
+  _Tp2##_t a0 = vslidedown_vx_##len(a.val, hnum, num); /* slide the upper lanes down to lane 0 */   \
+  vse_v_##len(ptr, a0, hnum);}\
+inline void v_store(_Tp* ptr, const _Tpvec& a) /* stores all num lanes */ \
+{ vse_v_##len(ptr, a.val, num); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) /* same store as v_store */ \
+{ vse_v_##len(ptr, a.val, num); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) /* same store as v_store */ \
+{ vse_v_##len(ptr, a.val, num); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) /* the mode argument is unnamed and unused */ \
+{ vse_v_##len(ptr, a.val, num); }
+
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16,  schar, vint8m1, i8m1, 8, 16)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8,  short,  vint16m1, i16m1, 4, 8)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4,  int,     vint32m1, i32m1, 2, 4)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2)  // NOTE(review): element type spelled unsigned long, not uint64 - confirm for the target ABI
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2,  long,     vint64m1, i64m1, 1, 2)  // NOTE(review): element type spelled long, not int64 - confirm for the target ABI
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2)
+
+
+////////////// Lookup table access ////////////////////
+
+inline v_int8x16 v_lut(const schar* tab, const int* idx)  // 16-lane gather: lane i = tab[idx[i]]
+{
+#if 1  // active path: scalar reads staged in a local array
+    schar CV_DECL_ALIGNED(32) elems[16] =
+    {
+        tab[idx[ 0]],
+        tab[idx[ 1]],
+        tab[idx[ 2]],
+        tab[idx[ 3]],
+        tab[idx[ 4]],
+        tab[idx[ 5]],
+        tab[idx[ 6]],
+        tab[idx[ 7]],
+        tab[idx[ 8]],
+        tab[idx[ 9]],
+        tab[idx[10]],
+        tab[idx[11]],
+        tab[idx[12]],
+        tab[idx[13]],
+        tab[idx[14]],
+        tab[idx[15]]
+    };
+    return v_int8x16(vle_v_i8m1(elems, 16));  // load the 16 staged bytes as one vector
+#else  // compiled out: variant built on an indexed load
+    int32xm4_t index32 = vlev_int32xm4(idx, 16);
+    vint16m2_t index16 = vnsra_vx_i16m2_int32xm4(index32, 0, 16);
+    vint8m1_t index = vnsra_vx_i8m1_i16m2(index16, 0, 16);
+    return v_int8x16(vlxbv_i8m1(tab, index, 16));
+#endif
+}
+
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){  // lanes 2i and 2i+1 = tab[idx[i]] and tab[idx[i] + 1], i = 0..7
+    schar CV_DECL_ALIGNED(32) elems[16] =
+    {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[4]],
+        tab[idx[4] + 1],
+        tab[idx[5]],
+        tab[idx[5] + 1],
+        tab[idx[6]],
+        tab[idx[6] + 1],
+        tab[idx[7]],
+        tab[idx[7] + 1]
+    };
+    return v_int8x16(vle_v_i8m1(elems, 16));  // load the 16 staged bytes as one vector
+}
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)  // lanes 4i..4i+3 = tab[idx[i]] .. tab[idx[i] + 3], i = 0..3
+{
+    schar CV_DECL_ALIGNED(32) elems[16] =
+    {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[0] + 2],
+        tab[idx[0] + 3],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[1] + 2],
+        tab[idx[1] + 3],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[2] + 2],
+        tab[idx[2] + 3],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[3] + 2],
+        tab[idx[3] + 3]
+    };
+    return v_int8x16(vle_v_i8m1(elems, 16));  // load the 16 staged bytes as one vector
+}
+
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }  // forwards to the schar overload
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }  // forwards to the schar overload
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }  // forwards to the schar overload
+
+inline v_int16x8 v_lut(const short* tab, const int* idx)  // 8-lane gather: lane i = tab[idx[i]]
+{
+    short CV_DECL_ALIGNED(32) elems[8] =
+    {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]],
+        tab[idx[4]],
+        tab[idx[5]],
+        tab[idx[6]],
+        tab[idx[7]]
+    };
+    return v_int16x8(vle_v_i16m1(elems, 8));  // load the 8 staged values as one vector
+}
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)  // lanes 2i and 2i+1 = tab[idx[i]] and tab[idx[i] + 1], i = 0..3
+{
+    short CV_DECL_ALIGNED(32) elems[8] =
+    {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[3]],
+        tab[idx[3] + 1]
+    };
+    return v_int16x8(vle_v_i16m1(elems, 8));  // load the 8 staged values as one vector
+}
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)  // lanes 4i..4i+3 = tab[idx[i]] .. tab[idx[i] + 3], i = 0..1
+{
+    short CV_DECL_ALIGNED(32) elems[8] =
+    {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[0] + 2],
+        tab[idx[0] + 3],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[1] + 2],
+        tab[idx[1] + 3]
+    };
+    return v_int16x8(vle_v_i16m1(elems, 8));  // load the 8 staged values as one vector
+}
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }  // forwards to the short overload
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }  // forwards to the short overload
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }  // forwards to the short overload
+
+inline v_int32x4 v_lut(const int* tab, const int* idx)  // 4-lane gather: lane i = tab[idx[i]]
+{
+    int CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]]
+    };
+    return v_int32x4(vle_v_i32m1(elems, 4));  // load the 4 staged values as one vector
+}
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)  // lanes = tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1]
+{
+    int CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1]
+    };
+    return v_int32x4(vle_v_i32m1(elems, 4));  // load the 4 staged values as one vector
+}
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx)  // only idx[0] is read
+{
+    return v_int32x4(vle_v_i32m1(tab+idx[0], 4));  // 4 consecutive ints starting at tab[idx[0]]
+}
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }  // forwards to the int overload
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }  // forwards to the int overload
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }  // forwards to the int overload
+
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx)  // 2-lane gather: lane i = tab[idx[i]]
+{
+    vint64m1_t res = {tab[idx[0]], tab[idx[1]]};  // vector built directly from the two scalar reads
+    return v_int64x2(res);
+}
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)  // only idx[0] is read
+{
+    return v_int64x2(vle_v_i64m1(tab+idx[0], 2));  // 2 consecutive values starting at tab[idx[0]]
+}
+
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)  // 2-lane gather: lane i = tab[idx[i]]
+{
+    vuint64m1_t res = {tab[idx[0]], tab[idx[1]]};  // vector built directly from the two scalar reads
+    return v_uint64x2(res);
+}
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx)  // only idx[0] is read
+{
+    return v_uint64x2(vle_v_u64m1(tab+idx[0], 2));  // 2 consecutive values starting at tab[idx[0]]
+}
+
+inline v_float32x4 v_lut(const float* tab, const int* idx)  // 4-lane gather: lane i = tab[idx[i]]
+{
+    float CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]]
+    };
+    return v_float32x4(vle_v_f32m1(elems, 4));  // load the 4 staged values as one vector
+}
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)  // lanes = tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1]
+{
+    float CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idx[0]],
+        tab[idx[0]+1],
+        tab[idx[1]],
+        tab[idx[1]+1]
+    };
+    return v_float32x4(vle_v_f32m1(elems, 4));  // load the 4 staged values as one vector
+}
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx)  // only idx[0] is read
+{
+    return v_float32x4(vle_v_f32m1(tab + idx[0], 4));  // 4 consecutive floats starting at tab[idx[0]]
+}
+inline v_float64x2 v_lut(const double* tab, const int* idx)  // 2-lane gather: lane i = tab[idx[i]]
+{
+    vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]};  // vector built directly from the two scalar reads
+    return v_float64x2(res);
+}
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)  // only idx[0] is read
+{
+    return v_float64x2(vle_v_f64m1(tab+idx[0], 2));  // 2 consecutive doubles starting at tab[idx[0]]
+}
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)  // gather with indices taken from the lanes of idxvec
+{
+    int CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idxvec.val[0]],
+        tab[idxvec.val[1]],
+        tab[idxvec.val[2]],
+        tab[idxvec.val[3]]
+    };
+    return v_int32x4(vle_v_i32m1(elems, 4));  // load the 4 staged values as one vector
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)  // gather with indices taken from the lanes of idxvec
+{
+    unsigned CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idxvec.val[0]],
+        tab[idxvec.val[1]],
+        tab[idxvec.val[2]],
+        tab[idxvec.val[3]]
+    };
+    return v_uint32x4(vle_v_u32m1(elems, 4));  // load the 4 staged values as one vector
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)  // gather with indices taken from the lanes of idxvec
+{
+    float CV_DECL_ALIGNED(32) elems[4] =
+    {
+        tab[idxvec.val[0]],
+        tab[idxvec.val[1]],
+        tab[idxvec.val[2]],
+        tab[idxvec.val[3]]
+    };
+    return v_float32x4(vle_v_f32m1(elems, 4));  // load the 4 staged values as one vector
+}
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)  // only lanes 0 and 1 of idxvec are read
+{
+    vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]};  // vector built directly from the two scalar reads
+    return v_float64x2(res);
+}
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)  // two indexed loads, one for x and one for y
+{
+    vint32m1_t xOffsets = vmul_vx_i32m1(idxvec.val, 4, 4);  // index * 4; presumably a byte offset (4-byte floats) - confirm against the indexed-load intrinsic
+    vint32m1_t yOffsets = vadd_vx_i32m1(xOffsets, 4, 4);  // 4 past the x offset, i.e. the following float under that assumption
+
+    x.val = vlxe_v_f32m1(tab, xOffsets, 4);
+    y.val = vlxe_v_f32m1(tab, yOffsets, 4);
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)  // x = tab[i0], tab[i1]; y = tab[i0+1], tab[i1+1]
+{
+    int CV_DECL_ALIGNED(32) laneIdx[4];  // all four indices are spilled, only the first two are read
+    v_store_aligned(laneIdx, idxvec);
+
+    x = v_float64x2(tab[laneIdx[0]], tab[laneIdx[1]]);
+    y = v_float64x2(tab[laneIdx[0]+1], tab[laneIdx[1]+1]);
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type) /* narrowing packs from v_<_Tp2>x<num2> to v_<_Tp>x<num>; shr is called with shift 0, intrin with shift n */ \
+inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) /* a fills the low half of the result, b the high half */ \
+{ \
+    v##_Tp2##m2_t  tmp = vundefined_##_T2##m2();    \
+    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_##_T2##m2(tmp, 1, b.val);    \
+    return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
+}\
+template<int n> inline \
+v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) /* as v_pack, with a right shift by n inside the narrowing intrinsic */ \
+{ \
+    v##_Tp2##m2_t  tmp = vundefined_##_T2##m2();    \
+    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_##_T2##m2(tmp, 1, b.val);    \
+    return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
+}\
+inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) /* narrows a and stores num2 elements to ptr */ \
+{ \
+    v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
+    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); /* upper half is zero-filled */   \
+    asm("" ::: "memory"); /* empty asm with a memory clobber; NOTE(review): the reason is not visible here, and v_rshr_pack_store has no such statement */ \
+    vse_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
+}\
+template<int n> inline \
+void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) /* as v_pack_store, with a right shift by n */ \
+{ \
+    v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
+    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); /* upper half is zero-filled */   \
+    vse_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
+}
+OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_vx, vnclip_vx, signed char)
+OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_vx, vnclip_vx, signed short)
+OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_vx, vnsra_vx, int)  // 64 -> 32: the shift-0 pack uses vnsra instead of vnclip
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_vx, vnclipu_vx, unsigned char)
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_vx, vnclipu_vx, unsigned short)
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_vx, vnsrl_vx, unsigned int)  // 64 -> 32: the shift-0 pack uses vnsrl instead of vnclipu
+
+// pack boolean
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)  // narrows the 16 lanes of {a, b} to bytes with a plain (non-saturating) shift-by-0
+{
+    vuint16m2_t tmp = vundefined_u16m2();  // two-register group that receives a and b
+    tmp = vset_u16m2(tmp, 0, a.val);  // a becomes lanes 0..7
+    tmp = vset_u16m2(tmp, 1, b.val);  // b becomes lanes 8..15
+    return v_uint8x16(vnsrl_vx_u8m1(tmp, 0, 16));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,  // narrows the 16 lanes of {a, b, c, d} to bytes in two steps
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    vuint32m4_t vabcd = vundefined_u32m4();  // four-register group that receives a..d in order
+    vuint16m2_t v16 = vundefined_u16m2();
+    vabcd = vset_u32m4(vabcd, 0, a.val);
+    vabcd = vset_u32m4(vabcd, 1, b.val);
+    vabcd = vset_u32m4(vabcd, 2, c.val);
+    vabcd = vset_u32m4(vabcd, 3, d.val);
+    v16 = vnsrl_vx_u16m2(vabcd, 0, 16);  // 32 -> 16 bits, shift 0
+    return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));  // 16 -> 8 bits, shift 0
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,  // narrows the 16 lanes of {a..h} to bytes in three steps
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    vuint64m8_t v64 = vundefined_u64m8();  // eight-register group that receives a..h in order
+    vuint32m4_t v32 = vundefined_u32m4();
+    vuint16m2_t v16 = vundefined_u16m2();
+    v64 = vset_u64m8(v64, 0, a.val);
+    v64 = vset_u64m8(v64, 1, b.val);
+    v64 = vset_u64m8(v64, 2, c.val);
+    v64 = vset_u64m8(v64, 3, d.val);
+    v64 = vset_u64m8(v64, 4, e.val);
+    v64 = vset_u64m8(v64, 5, f.val);
+    v64 = vset_u64m8(v64, 6, g.val);
+    v64 = vset_u64m8(v64, 7, h.val);
+    v32 = vnsrl_vx_u32m4(v64, 0, 16);  // 64 -> 32 bits, shift 0
+    v16 = vnsrl_vx_u16m2(v32, 0, 16);  // 32 -> 16 bits, shift 0
+    return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));  // 16 -> 8 bits, shift 0
+}
+
+//inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) \
+//{ \
+//    int16xm2_u tmp;    \
+//    tmp.m1[0] = (vint16m1_t)a.val;    \
+//    tmp.m1[1] = (vint16m1_t)b.val;    \
+//    e8xm1_t mask = (e8xm1_t)vmsge_vx_e16xm2_i16m2(tmp.v, 0, 16);\
+//    return v_uint8x16(vnclipuvi_mask_u8m1_u16m2(vmv_v_x_u8m1(0, 16), (vuint16m2_t)tmp.v, 0, mask, 16));
+//}
+
+#define OPENCV_HAL_IMPL_RISCVV_PACK_U(tp1, num1, tp2, num2, _Tp) /* signed tp2-bit lanes packed into unsigned tp1-bit lanes; num1 narrow lanes, num2 wide lanes per input */ \
+inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
+{ \
+    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
+    tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
+    tmp = vset_##i##tp2##m2(tmp, 1, b.val);    \
+    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1); /* max with 0: negative lanes become 0 before the unsigned narrowing */ \
+    return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1));    \
+} \
+inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) /* stores num2 packed elements; register 1 of tmp is never set here */ \
+{ \
+    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
+    tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
+    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
+    return vse_v_u##tp1##m1(ptr, vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1), num2);    \
+} \
+template<int n> inline \
+v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) /* as v_pack_u, with shift n passed to the narrowing intrinsic */ \
+{ \
+    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
+    tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
+    tmp = vset_##i##tp2##m2(tmp, 1, b.val);    \
+    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
+    return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, n, num1));    \
+} \
+template<int n> inline \
+void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) /* as v_pack_u_store, with shift n */ \
+{ \
+    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
+    tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
+    vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\
+    vuint##tp1##m1_t val = vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val_, n, num1);    \
+    return vse_v_u##tp1##m1(ptr, val, num2);\
+}
+OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char )  // int16 -> uint8
+OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)  // int32 -> uint16
+
+#ifdef __GNUC__  // -Wuninitialized is silenced for the definitions up to the matching pop below
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
+// saturating multiply 8-bit, 16-bit
+#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, _Tpwvec) /* operator * and *= for _Tpvec, computed through the wider _Tpwvec */ \
+    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
+    {                                                            \
+        _Tpwvec c, d; /* declared without initialiser and handed to v_mul_expand; presumably the reason for the pragma above */ \
+        v_mul_expand(a, b, c, d);                                \
+        return v_pack(c, d); /* narrow the two wide products back to _Tpvec */ \
+    }                                                            \
+    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
+    { a = a * b; return a; }
+
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16,  v_int16x8)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8,  v_int32x4)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, v_uint32x4)
+
+#ifdef __GNUC__  // restore the diagnostic state saved by the push above
+#pragma GCC diagnostic pop
+#endif
+static const signed char popCountTable[256] =  // popCountTable[i] = number of set bits in the 8-bit value i
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
+};
+
+inline vuint8m1_t vcnt_u8(vuint8m1_t val){  // per-byte set-bit count: table[val >> 1] + (val & 1)
+    vuint8m1_t lowBit = val & 1;  // bit 0 of every byte, added back after the lookup
+    return vlxe_v_u8m1((unsigned char*)popCountTable, val >> 1, 16)+lowBit;  // the halved value keeps every index in 0..127
+}
+
+// Set-bit count of each of the 16 unsigned bytes, delegated to vcnt_u8.
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{
+    return v_uint8x16(vcnt_u8(a.val));
+}
+
+// Set-bit count of each of the 16 signed bytes; the lanes are cast to unsigned for vcnt_u8.
+inline v_uint8x16 v_popcount(const v_int8x16& a)
+{
+    return v_uint8x16(vcnt_u8((vuint8m1_t)a.val));
+}
+
+inline v_uint16x8
+v_popcount(const v_uint16x8& a)  // set-bit count of each 16-bit lane
+{
+    vuint8m2_t tmp = vundefined_u8m2();
+    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));  // per-byte counts go into register 0 of the pair
+    vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};  // gather indices (little-endian bytes): even bytes for register 0, odd bytes for register 1
+    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
+    vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);  // even-byte count + odd-byte count, widened to 16 bits
+    return v_uint16x8(vget_u16m2_u16m1(res, 0));
+}
+
+inline v_uint16x8
+v_popcount(const v_int16x8& a)  // set-bit count of each 16-bit lane
+{
+    vuint8m2_t tmp = vundefined_u8m2();
+    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));  // per-byte counts go into register 0 of the pair
+    vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};  // gather indices (little-endian bytes): even bytes for register 0, odd bytes for register 1
+    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
+    vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);  // even-byte count + odd-byte count, widened to 16 bits
+    return v_uint16x8(vget_u16m2_u16m1(res, 0));
+}
+
+inline v_uint32x4
+v_popcount(const v_uint32x4& a)  // set-bit count of each 32-bit lane
+{
+    vuint8m2_t tmp = vundefined_u8m2();
+    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));  // per-byte counts go into register 0 of the pair
+    vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,  // gather indices: byte k of every lane grouped together, k = 0..3
+                     0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};  // NOTE(review): index 0xFF is expected to gather 0 (out of range) - confirm for the target vector length
+    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
+    vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);  // first pairwise add, widened to 16 bits
+    vuint32m2_t res  = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);  // second pairwise add, widened to 32 bits
+    return v_uint32x4(vget_u32m2_u32m1(res, 0));
+}
+
+inline v_uint32x4
+v_popcount(const v_int32x4& a)  // set-bit count of each 32-bit lane
+{
+    vuint8m2_t tmp = vundefined_u8m2();
+    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));  // per-byte counts go into register 0 of the pair
+    vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,  // gather indices: byte k of every lane grouped together, k = 0..3
+                     0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};  // NOTE(review): index 0xFF is expected to gather 0 (out of range) - confirm for the target vector length
+    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
+    vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);  // first pairwise add, widened to 16 bits
+    vuint32m2_t res  = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);  // second pairwise add, widened to 32 bits
+    return v_uint32x4(vget_u32m2_u32m1(res, 0));
+}
+
+inline v_uint64x2
+v_popcount(const v_uint64x2& a)  // set-bit count of each 64-bit lane
+{
+    vuint8m2_t tmp = vundefined_u8m2();
+    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));  // per-byte counts go into register 0 of the pair
+    vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,  // gather indices: bytes 0..7 lead register 0,
+                     0x0F0E0D0C0B0A0908, 0x0000000000000000};  // bytes 8..15 lead register 1
+    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
+    vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
+    vuint8m1_t res1 = zero;
+    vuint8m1_t res2 = zero;
+    res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);  // sum of the 8 byte counts of lane 0
+    res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);  // sum of the 8 byte counts of lane 1
+
+    return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
+}
+
+inline v_uint64x2
+v_popcount(const v_int64x2& a)  // set-bit count of each 64-bit lane
+{
+    vuint8m2_t tmp = vundefined_u8m2();
+    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));  // per-byte counts go into register 0 of the pair
+    vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,  // gather indices: bytes 0..7 lead register 0,
+                     0x0F0E0D0C0B0A0908, 0x0000000000000000};  // bytes 8..15 lead register 1
+    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
+    vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
+    vuint8m1_t res1 = zero;
+    vuint8m1_t res2 = zero;
+    res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);  // sum of the 8 byte counts of lane 0
+    res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);  // sum of the 8 byte counts of lane 1
+
+    return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
+}
+
+#define SMASK 1, 2, 4, 8, 16, 32, 64, 128  // per-lane bit weights 2^0..2^7 used by the v_signmask overloads
+inline int v_signmask(const v_uint8x16& a)  // bit i of the result = top bit of lane i
+{
+    vuint8m1_t msb  = vsrl_vx_u8m1(a.val, 7, 16);  // 0 or 1 per lane
+    vuint8m1_t weights  = (vuint8m1_t){SMASK, SMASK};  // 1..128 for lanes 0..7 and again for lanes 8..15
+    vuint16m2_t weighted = vwmulu_vv_u16m2(msb, weights, 16);
+    vuint32m1_t acc = vmv_v_x_u32m1(0, 4);
+    vuint32m2_t upper = vwmulu_vx_u32m2(vget_u16m2_u16m1(weighted, 1), 256, 8);  // lanes 8..15 are scaled by 256 so they land in bits 8..15
+    acc = vredsum_vs_u32m2_u32m1(acc, upper, acc, 8);
+    acc = vwredsumu_vs_u16m1_u32m1(acc, vget_u16m2_u16m1(weighted, 0), acc, 8);  // add the contribution of lanes 0..7
+    return vmv_x_s_u32m1_u32(acc, 8);
+}
+inline int v_signmask(const v_int8x16& a)  // bit i of the result = sign bit of lane i
+{
+    vuint8m1_t msb = vsrl_vx_u8m1((vuint8m1_t)a.val, 7, 16);  // 0 or 1 per lane
+    vuint8m1_t weights = (vuint8m1_t){SMASK, SMASK};  // 1..128 for lanes 0..7 and again for lanes 8..15
+    vint16m2_t weighted = (vint16m2_t)vwmulu_vv_u16m2(msb, weights, 16);
+    vint32m1_t acc = vmv_v_x_i32m1(0, 4);
+    vint32m2_t upper = vwmul_vx_i32m2(vget_i16m2_i16m1(weighted, 1), 256, 8);  // lanes 8..15 are scaled by 256 so they land in bits 8..15
+    acc = vredsum_vs_i32m2_i32m1(acc, upper, acc, 8);
+    acc = vwredsum_vs_i16m1_i32m1(acc, vget_i16m2_i16m1(weighted, 0), acc, 8);  // add the contribution of lanes 0..7
+    return vmv_x_s_i32m1_i32(acc, 8);
+}
+
+inline int v_signmask(const v_int16x8& a)  // bit i of the result = sign bit of lane i
+{
+    vint16m1_t msb = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);  // 0 or 1 per lane
+    vint16m1_t weights = (vint16m1_t){SMASK};  // 1, 2, ..., 128
+    vint16m1_t weighted = vmul_vv_i16m1(msb, weights, 8);
+    vint16m1_t acc = vmv_v_x_i16m1(0, 8);
+    acc = vredsum_vs_i16m1_i16m1(acc, weighted, acc, 8);  // sum of the weights of the lanes whose top bit is set
+    return vmv_x_s_i16m1_i16(acc, 8);
+}
+inline int v_signmask(const v_uint16x8& a)  // bit i of the result = top bit of lane i
+{
+    vint16m1_t msb = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);  // 0 or 1 per lane
+    vint16m1_t weights = (vint16m1_t){SMASK};  // 1, 2, ..., 128
+    vint16m1_t weighted = vmul_vv_i16m1(msb, weights, 8);
+    vint16m1_t acc = vmv_v_x_i16m1(0, 8);
+    acc = vredsum_vs_i16m1_i16m1(acc, weighted, acc, 8);  // sum of the weights of the lanes whose top bit is set
+    return vmv_x_s_i16m1_i16(acc, 8);
+}
+inline int v_signmask(const v_int32x4& a)  // bit i of the result = sign bit of lane i
+{
+    vint32m1_t msb = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);  // 0 or 1 per lane
+    vint32m1_t weights = (vint32m1_t){1, 2, 4, 8};
+    vint32m1_t acc = vmv_v_x_i32m1(0, 4);
+    vint32m1_t weighted = vmul_vv_i32m1(msb, weights, 4);
+    acc = vredsum_vs_i32m1_i32m1(acc, weighted, acc, 4);  // sum of the weights of the lanes whose top bit is set
+    return vmv_x_s_i32m1_i32(acc, 4);
+}
+inline int v_signmask(const v_uint32x4& a)  // bit i of the result = top bit of lane i
+{
+    vint32m1_t msb = (vint32m1_t)vsrl_vx_u32m1(a.val, 31, 4);  // 0 or 1 per lane
+    vint32m1_t weights = (vint32m1_t){1, 2, 4, 8};
+    vint32m1_t acc = vmv_v_x_i32m1(0, 4);
+    vint32m1_t weighted = vmul_vv_i32m1(msb, weights, 4);
+    acc = vredsum_vs_i32m1_i32m1(acc, weighted, acc, 4);  // sum of the weights of the lanes whose top bit is set
+    return vmv_x_s_i32m1_i32(acc, 4);
+}
+inline int v_signmask(const v_uint64x2& a)  // bit 0 = top bit of lane 0, bit 1 = top bit of lane 1
+{
+    vuint64m1_t msb = vsrl_vx_u64m1(a.val, 63, 2);  // 0 or 1 per lane
+    const int lane0 = (int)vext_x_v_u64m1_u64(msb, 0, 2);
+    return lane0 + ((int)vext_x_v_u64m1_u64(msb, 1, 2) << 1);
+}
+inline int v_signmask(const v_int64x2& a)  // forwards to the v_uint64x2 overload
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+inline int v_signmask(const v_float64x2& a)  // forwards to the v_uint64x2 overload
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+inline int v_signmask(const v_float32x4& a)  // bit i of the result = sign bit of lane i
+{
+    vint32m1_t msb = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);  // float lanes are cast to unsigned before the shift; 0 or 1 per lane
+    vint32m1_t weights = (vint32m1_t){1, 2, 4, 8};
+    vint32m1_t acc = vmv_v_x_i32m1(0, 4);
+    vint32m1_t weighted = vmul_vv_i32m1(msb, weights, 4);
+    acc = vredsum_vs_i32m1_i32m1(acc, weighted, acc, 4);  // sum of the weights of the lanes whose top bit is set
+    return vmv_x_s_i32m1_i32(acc, 4);
+}
+
+// v_scan_forward: trailingZeros32 of the sign mask (presumably the lowest lane with its top bit set); 0 for an empty mask.
+inline int v_scan_forward(const v_int8x16& a) {
+    const int signBits = v_signmask(a);
+    return signBits == 0 ? 0 : trailingZeros32(signBits); }
+// Same rule for v_uint8x16.
+inline int v_scan_forward(const v_uint8x16& a) {
+    const int signBits = v_signmask(a);
+    return signBits == 0 ? 0 : trailingZeros32(signBits); }
+// Same rule for v_int16x8.
+inline int v_scan_forward(const v_int16x8& a) {
+    const int signBits = v_signmask(a);
+    return signBits == 0 ? 0 : trailingZeros32(signBits); }
+// Same rule for v_uint16x8.
+inline int v_scan_forward(const v_uint16x8& a) {
+    const int signBits = v_signmask(a);
+    return signBits == 0 ? 0 : trailingZeros32(signBits); }
+// Same rule for v_int32x4.
+inline int v_scan_forward(const v_int32x4& a) {
+    const int signBits = v_signmask(a);
+    return signBits == 0 ? 0 : trailingZeros32(signBits); }
+// Same rule for v_uint32x4.
+inline int v_scan_forward(const v_uint32x4& a) {
+    const int signBits = v_signmask(a);
+    return signBits == 0 ? 0 : trailingZeros32(signBits); }
+// Same rule for v_float32x4.
+inline int v_scan_forward(const v_float32x4& a) {
+    const int signBits = v_signmask(a);
+    return signBits == 0 ? 0 : trailingZeros32(signBits); }
+// Same rule for v_int64x2.
+inline int v_scan_forward(const v_int64x2& a) {
+    const int signBits = v_signmask(a);
+    return signBits == 0 ? 0 : trailingZeros32(signBits); }
+// Same rule for v_uint64x2.
+inline int v_scan_forward(const v_uint64x2& a) {
+    const int signBits = v_signmask(a);
+    return signBits == 0 ? 0 : trailingZeros32(signBits); }
+
+#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num) \
+inline bool v_check_all(const v_##_Tpvec& a) \
+{ \
+    suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); /* top bit of ~a: 1 where a lane's top bit is clear */ \
+    vuint64m1_t v1 = vuint64m1_t(v0); /* recast so the register can be tested as two 64-bit words */ \
+    return (v1[0] | v1[1]) == 0; /* true when no lane had its top bit clear */ \
+} \
+inline bool v_check_any(const v_##_Tpvec& a) \
+{ \
+    suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); /* top bit of each lane */ \
+    vuint64m1_t v1 = vuint64m1_t(v0); \
+    return (v1[0] | v1[1]) != 0; /* true when at least one lane had its top bit set */ \
+}
+
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8,  u8m1, 7, 16)
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8)
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4)
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2)
+
+inline bool v_check_all(const v_int8x16& a)
+{ return v_check_all(v_reinterpret_as_u8(a)); } // signed and float overloads forward to the same-width unsigned one
+inline bool v_check_all(const v_int16x8& a)
+{ return v_check_all(v_reinterpret_as_u16(a)); }
+inline bool v_check_all(const v_int32x4& a)
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+inline bool v_check_all(const v_float32x4& a)
+{ return v_check_all(v_reinterpret_as_u32(a)); }
+inline bool v_check_all(const v_int64x2& a)
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+inline bool v_check_all(const v_float64x2& a)
+{ return v_check_all(v_reinterpret_as_u64(a)); }
+
+inline bool v_check_any(const v_int8x16& a)
+{ return v_check_any(v_reinterpret_as_u8(a)); } // signed and float overloads forward to the same-width unsigned one
+inline bool v_check_any(const v_int16x8& a)
+{ return v_check_any(v_reinterpret_as_u16(a)); }
+inline bool v_check_any(const v_int32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+inline bool v_check_any(const v_float32x4& a)
+{ return v_check_any(v_reinterpret_as_u32(a)); }
+inline bool v_check_any(const v_int64x2& a)
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+inline bool v_check_any(const v_float64x2& a)
+{ return v_check_any(v_reinterpret_as_u64(a)); }
+
+#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vmerge_vvm_##suffix(_Tpvec2(mask.val), b.val, a.val, num)); /* mask.val is recast to the vbool type _Tpvec2; note b is passed before a */ \
+}
+
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16,  i8m1, vbool8_t, 16)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8,  i16m1, vbool16_t, 8)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4,  i32m1, vbool32_t, 4)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4)
+inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b)
+{
+    return v_float32x4((vfloat32m1_t)vmerge_vvm_u32m1((vbool32_t)mask.val, (vuint32m1_t)b.val, (vuint32m1_t)a.val, 4)); // merged as u32 lanes, then cast back to float32
+}
+inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b)
+{
+    return v_float64x2((vfloat64m1_t)vmerge_vvm_u64m1((vbool64_t)mask.val, (vuint64m1_t)b.val, (vuint64m1_t)a.val, 2)); // merged as u64 lanes, then cast back to float64
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2) \
+inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \
+{ \
+    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); /* widening add of a zero vector over all num1 lanes */    \
+    b0.val = vget_##_Tp2##m2_##_Tp2##m1(b, 0); /* part 0 of the m2 result */  \
+    b1.val = vget_##_Tp2##m2_##_Tp2##m1(b, 1); /* part 1 of the m2 result */  \
+} \
+inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \
+{ \
+    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num2), num2); /* only the first num2 lanes are widened */    \
+    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
+} \
+inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \
+{ \
+    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); /* widen all lanes, keep part 1 */    \
+    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 1)); \
+} \
+inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+    _T2##_t val = vle##_v_##_Tp1(ptr, num2); /* loads num2 narrow elements from ptr */    \
+    _T1##_t b = vw##add##_vv_##_Tp2##m2(val, vmv_v_x_##_Tp1(0, num2), num2);    \
+    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
+}
+
+OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort,  u16m1, 8, u32, 4, vuint32m2, vuint16m1)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint,  u32m1, 4, u64, 2, vuint64m2, vuint32m1)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar,  i8m1, 16, i16, 8, vint16m2, vint8m1)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short,  i16m1, 8, i32, 4, vint32m2, vint16m1)
+OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int,  i32m1, 4, i64, 2, vint64m2, vint32m1)
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+    // Loads 4 bytes and widens them twice (u8 -> u16 -> u32) by widening-adding a zero vector.
+    // The stray macro-style line continuations and dead vundefined_*() initialisers were dropped.
+    vuint8m1_t val = vle_v_u8m1(ptr, 4);
+    vuint16m2_t b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4);
+    vuint32m2_t c = vwaddu_vv_u32m2(vget_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4);
+    return v_uint32x4(vget_u32m2_u32m1(c, 0));  // part 0 of the m2 result
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{
+    // Loads 4 signed bytes and widens them twice (i8 -> i16 -> i32) by widening-adding a zero vector.
+    // The stray macro-style line continuations and dead vundefined_*() initialisers were dropped.
+    vint8m1_t val = vle_v_i8m1(ptr, 4);
+    vint16m2_t b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4);
+    vint32m2_t c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4);
+    return v_int32x4(vget_i32m2_i32m1(c, 0));  // part 0 of the m2 result
+}
+#define VITL_16 (vuint64m2_t){0x1303120211011000, 0x1707160615051404, 0x1B0B1A0A19091808, 0x1F0F1E0E1D0D1C0C} // VITL_<n>: gather index tables selected by v_zip through VITL_##num; 8-bit indices
+#define VITL_8 (vuint64m2_t){0x0009000100080000, 0x000B0003000A0002, 0x000D0005000C0004, 0x000F0007000E0006} // 16-bit indices
+#define VITL_4 (vuint64m2_t){0x0000000400000000, 0x0000000500000001, 0x0000000600000002, 0x0000000700000003} // 32-bit indices
+#define VITL_2 (vuint64m2_t){0, 2, 1, 3} // 64-bit indices
+#define LOW_4  0x0000000100000000, 0x0000000500000004 // LOW_* / HIGH_*: no user is visible in this part of the file
+#define LOW_8  0x0003000200010000, 0x000B000A00090008
+#define LOW_16 0x0706050403020100, 0x1716151413121110
+#define HIGH_4  0x0000000300000002, 0x0000000700000006
+#define HIGH_8  0x0007000600050004, 0x000F000E000D000C
+#define HIGH_16 0x0F0E0D0C0B0A0908,  0x1F1E1D1C1B1A1918
+#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
+{ \
+    v##_Tp##m2_t tmp = vundefined_##_T##m2();\
+    tmp = vset_##_T##m2(tmp, 0, a0.val); /* part 0 = a0 */ \
+    tmp = vset_##_T##m2(tmp, 1, a1.val); /* part 1 = a1 */ \
+    vuint64m2_t mask = VITL_##num; /* index table for this lane count */    \
+    tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2); /* gather over both parts (num2 lanes) */    \
+    b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \
+    b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \
+} \
+inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    v##_Tp##m1_t b0 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); /* slide b up by numh lanes on top of a, all-set mask */    \
+    return v_##_Tpvec(b0);\
+} \
+inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); /* bring the upper lanes of b down */    \
+    v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); /* bring the upper lanes of a down */    \
+    v##_Tp##m1_t b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num);    \
+    return v_##_Tpvec(b1);\
+} \
+inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); /* same expression as v_combine_low */    \
+    v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num);    \
+    v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num);    \
+    d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); /* same expression as v_combine_high */    \
+}
+
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1)
+
+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{
+    vuint64m1_t mask = (vuint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607}; // byte indices 15..0, least-significant byte first
+    return v_uint8x16(vrgather_vv_u8m1(a.val, (vuint8m1_t)mask, 16));
+}
+inline v_int8x16 v_reverse(const v_int8x16 &a)
+{
+    vint64m1_t mask = (vint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607}; // byte indices 15..0, least-significant byte first
+    return v_int8x16(vrgather_vv_i8m1(a.val, (vuint8m1_t)mask, 16));
+}
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{
+    vuint64m1_t laneOrder = (vuint64m1_t){0x0004000500060007, 0x0000000100020003}; // 16-bit indices 7,6,5,4 | 3,2,1,0
+    return v_uint16x8(vrgather_vv_u16m1(a.val, (vuint16m1_t)laneOrder, 8));
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a)
+{
+    vint64m1_t laneOrder = (vint64m1_t){0x0004000500060007, 0x0000000100020003}; // 16-bit indices 7,6,5,4 | 3,2,1,0
+    return v_int16x8(vrgather_vv_i16m1(a.val, (vuint16m1_t)laneOrder, 8));
+}
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{
+    return v_uint32x4(vrgather_vv_u32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4)); // result lane i = a lane 3 - i
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a)
+{
+    return v_int32x4(vrgather_vv_i32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4)); // result lane i = a lane 3 - i
+}
+
+inline v_float32x4 v_reverse(const v_float32x4 &a)
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } // reuses the v_uint32x4 overload
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{
+    return v_uint64x2(a.val[1], a.val[0]); // swap the two lanes via element access
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a)
+{
+    return v_int64x2(a.val[1], a.val[0]); // swap the two lanes via element access
+}
+
+inline v_float64x2 v_reverse(const v_float64x2 &a)
+{
+    return v_float64x2(a.val[1], a.val[0]); // swap the two lanes via element access
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) /* suffix and size are not used by the expansion */ \
+template <int n> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ return v_rotate_right<n>(a, b);}
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint8x16, u8, 0)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int8x16, s8, 0)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint16x8, u16, 1)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int16x8, s16, 1)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint32x4, u32, 2)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int32x4, s32, 2)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint64x2, u64, 3)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int64x2, s64, 3)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)
+
+
+#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix) /* suffix is not used by the expansion */ \
+template<int i> inline _Tp v_extract_n(_Tpvec v) { return v.val[i]; }
+
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64)
+
+#define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) /* gathers with the single scalar index i for all num lanes */ \
+template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); }
+
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint8x16, u8, 16)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int8x16, i8, 16)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint16x8, u16, 8)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int16x8, i16, 8)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint32x4, u32, 4)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4)
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+    __builtin_riscv_fsrm(0); // rounding mode 0 (round-to-nearest-even in the RISC-V frm encoding)
+    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); // isolate the float32 exponent field
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // true unless the exponent is all ones (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); // masked-off lanes take the 0 fill
+    __builtin_riscv_fsrm(0); // NOTE(review): sets mode 0 rather than restoring the caller's mode -- confirm intended
+    return v_int32x4(val);
+}
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+    __builtin_riscv_fsrm(2); // rounding mode 2 (round down in the RISC-V frm encoding)
+    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); // isolate the float32 exponent field
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // true unless the exponent is all ones (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); // masked-off lanes take the 0 fill
+    __builtin_riscv_fsrm(0); // NOTE(review): sets mode 0 rather than restoring the caller's mode -- confirm intended
+    return v_int32x4(val);
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+    __builtin_riscv_fsrm(3); // rounding mode 3 (round up in the RISC-V frm encoding)
+    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); // isolate the float32 exponent field
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // true unless the exponent is all ones (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); // masked-off lanes take the 0 fill
+    __builtin_riscv_fsrm(0); // NOTE(review): sets mode 0 rather than restoring the caller's mode -- confirm intended
+    return v_int32x4(val);
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{
+    __builtin_riscv_fsrm(1); // rounding mode 1 (round toward zero in the RISC-V frm encoding)
+    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); // isolate the float32 exponent field
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // true unless the exponent is all ones (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); // masked-off lanes take the 0 fill
+    __builtin_riscv_fsrm(0); // NOTE(review): sets mode 0 rather than restoring the caller's mode -- confirm intended
+    return v_int32x4(val);
+}
+
+inline v_int32x4 v_round(const v_float64x2& a)
+{
+    __builtin_riscv_fsrm(0); // rounding mode 0 (round-to-nearest-even in the RISC-V frm encoding)
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_f64m2(_val, 0, a.val); // part 0: the two input doubles
+    //_val = vset_f64m2(_val, 1, a.val);
+    _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2)); // part 1: zeros, so result lanes 2 and 3 convert from 0.0
+    vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4); // narrowing float64 -> int32 over all four lanes
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(val);
+}
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{
+    __builtin_riscv_fsrm(0); // rounding mode 0 (round-to-nearest-even in the RISC-V frm encoding)
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_f64m2(_val, 0, a.val); // part 0 = a -> result lanes 0 and 1
+    _val = vset_f64m2(_val, 1, b.val); // part 1 = b -> result lanes 2 and 3
+    vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4); // narrowing float64 -> int32 over all four lanes
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(val);
+}
+inline v_int32x4 v_floor(const v_float64x2& a)
+{
+    __builtin_riscv_fsrm(2); // rounding mode 2 (round down in the RISC-V frm encoding)
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_f64m2(_val, 0, a.val); // only part 0 is populated
+    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); // NOTE(review): narrows to float32 before the integer conversion -- confirm the precision loss is acceptable
+
+    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); // NOTE(review): vl is 4 here although only 2 lanes were converted above -- confirm lanes 2..3 are don't-care
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // true unless the exponent is all ones (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); // masked-off lanes take the 0 fill
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(val);
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{
+    __builtin_riscv_fsrm(3); // rounding mode 3 (round up in the RISC-V frm encoding)
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_f64m2(_val, 0, a.val); // only part 0 is populated
+    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); // NOTE(review): narrows to float32 before the integer conversion -- confirm the precision loss is acceptable
+
+    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); // NOTE(review): vl is 4 here although only 2 lanes were converted above -- confirm lanes 2..3 are don't-care
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // true unless the exponent is all ones (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); // masked-off lanes take the 0 fill
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(val);
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{
+    __builtin_riscv_fsrm(1); // rounding mode 1 (round toward zero in the RISC-V frm encoding)
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_f64m2(_val, 0, a.val); // only part 0 is populated
+    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); // NOTE(review): narrows to float32 before the integer conversion -- confirm the precision loss is acceptable
+
+    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); // NOTE(review): vl is 4 here although only 2 lanes were converted above -- confirm lanes 2..3 are don't-care
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // true unless the exponent is all ones (Inf/NaN)
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); // masked-off lanes take the 0 fill
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(val);
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T)    \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
+{ \
+    v##_Tpvec##m1x2_t ret = intrin##2e_v_##_T##m1x2(ptr, num); /* 2-field segment load */\
+    a.val = vget_##_T##m1x2_##_T##m1(ret, 0);  \
+    b.val = vget_##_T##m1x2_##_T##m1(ret, 1);  \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
+{ \
+    v##_Tpvec##m1x3_t ret = intrin##3e_v_##_T##m1x3(ptr, num); /* 3-field segment load */\
+    a.val = vget_##_T##m1x3_##_T##m1(ret, 0);  \
+    b.val = vget_##_T##m1x3_##_T##m1(ret, 1);  \
+    c.val = vget_##_T##m1x3_##_T##m1(ret, 2);  \
+}\
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
+                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
+{ \
+    v##_Tpvec##m1x4_t ret = intrin##4e_v_##_T##m1x4(ptr, num); /* 4-field segment load */\
+    a.val = vget_##_T##m1x4_##_T##m1(ret, 0);  \
+    b.val = vget_##_T##m1x4_##_T##m1(ret, 1);  \
+    c.val = vget_##_T##m1x4_##_T##m1(ret, 2);  \
+    d.val = vget_##_T##m1x4_##_T##m1(ret, 3);  \
+} \
+
+#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T)    \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); /* tuple filled field by field below */      \
+    ret = vset_##_T##m1x2(ret, 0, a.val);  \
+    ret = vset_##_T##m1x2(ret, 1, b.val);  \
+    intrin##2e_v_##_T##m1x2(ptr, ret, num); /* 2-field segment store; the mode argument is ignored */ \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3();       \
+    ret = vset_##_T##m1x3(ret, 0, a.val);  \
+    ret = vset_##_T##m1x3(ret, 1, b.val);  \
+    ret = vset_##_T##m1x3(ret, 2, c.val);  \
+    intrin##3e_v_##_T##m1x3(ptr, ret, num); /* 3-field segment store */ \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ \
+    v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4();             \
+    ret = vset_##_T##m1x4(ret, 0, a.val);  \
+    ret = vset_##_T##m1x4(ret, 1, b.val);  \
+    ret = vset_##_T##m1x4(ret, 2, c.val);  \
+    ret = vset_##_T##m1x4(ret, 3, d.val);  \
+    intrin##4e_v_##_T##m1x4(ptr, ret, num); /* 4-field segment store */ \
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T) /* emits both the load and the store families for one element type */ \
+OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T)    \
+OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T)
+
+//OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, uchar, )
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32)
+
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32)
+
+#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T) /* same shape as the load/store macros above, with vlseg/vsseg spelled out */ \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
+{ \
+    v##_Tpvec##m1x2_t ret = vlseg2e_v_##_T##m1x2(ptr, num); \
+    a.val = vget_##_T##m1x2_##_T##m1(ret, 0);  \
+    b.val = vget_##_T##m1x2_##_T##m1(ret, 1);  \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
+{ \
+    v##_Tpvec##m1x3_t ret = vlseg3e_v_##_T##m1x3(ptr, num);    \
+    a.val = vget_##_T##m1x3_##_T##m1(ret, 0);  \
+    b.val = vget_##_T##m1x3_##_T##m1(ret, 1);  \
+    c.val = vget_##_T##m1x3_##_T##m1(ret, 2);  \
+}\
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
+                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
+{ \
+    v##_Tpvec##m1x4_t ret = vlseg4e_v_##_T##m1x4(ptr, num);    \
+    a.val = vget_##_T##m1x4_##_T##m1(ret, 0);  \
+    b.val = vget_##_T##m1x4_##_T##m1(ret, 1);  \
+    c.val = vget_##_T##m1x4_##_T##m1(ret, 2);  \
+    d.val = vget_##_T##m1x4_##_T##m1(ret, 3);  \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2();    \
+    ret = vset_##_T##m1x2(ret, 0, a.val);  \
+    ret = vset_##_T##m1x2(ret, 1, b.val);  \
+    vsseg2e_v_##_T##m1x2(ptr, ret, num);    \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3();    \
+    ret = vset_##_T##m1x3(ret, 0, a.val);  \
+    ret = vset_##_T##m1x3(ret, 1, b.val);  \
+    ret = vset_##_T##m1x3(ret, 2, c.val);  \
+    vsseg3e_v_##_T##m1x3(ptr, ret, num);    \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ \
+    v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4();    \
+    ret = vset_##_T##m1x4(ret, 0, a.val);  \
+    ret = vset_##_T##m1x4(ret, 1, b.val);  \
+    ret = vset_##_T##m1x4(ret, 2, c.val);  \
+    ret = vset_##_T##m1x4(ret, 3, d.val);  \
+    vsseg4e_v_##_T##m1x4(ptr, ret, num);    \
+}
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64)
+
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64) // NOTE(review): pointer type is unsigned long -- confirm it matches the project's uint64 on this target
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64) // NOTE(review): pointer type is long -- confirm it matches the project's int64 on this target
+
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{
+    return v_float32x4(vfcvt_f_x_v_f32m1(a.val, 4)); // same-width int32 -> float32 conversion, 4 lanes
+}
+
+#if CV_SIMD128_64F
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_f64m2(_val, 0, a.val); // only part 0 is populated
+    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); // narrows 2 lanes; NOTE(review): result lanes 2..3 are not written here -- confirm callers ignore them
+    return v_float32x4(aval);
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_f64m2(_val, 0, a.val); // part 0 = a -> result lanes 0 and 1
+    _val = vset_f64m2(_val, 1, b.val); // part 1 = b -> result lanes 2 and 3
+    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4); // narrowing float64 -> float32 over all four lanes
+    return v_float32x4(aval);
+}
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{
+    // Widen int32 -> float64 in one step; the former detour through float32 rounded
+    // every value that needs more than 24 significant bits.
+    vfloat64m2_t _val = vfwcvt_f_x_v_f64m2(a.val, 4);
+    return v_float64x2(vget_f64m2_f64m1(_val, 0)); }  // part 0 = converted lanes 0 and 1
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{
+    // Widen int32 -> float64 in one step; the former detour through float32 rounded
+    // every value that needs more than 24 significant bits.
+    vfloat64m2_t _val = vfwcvt_f_x_v_f64m2(a.val, 4);
+    return v_float64x2(vget_f64m2_f64m1(_val, 1)); }  // part 1 = converted lanes 2 and 3
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{
+    vfloat64m2_t _val  = vfwcvt_f_f_v_f64m2(a.val, 4); // widen all four float32 lanes into an m2 group
+    return v_float64x2(vget_f64m2_f64m1(_val, 0)); // keep part 0
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{
+    vfloat64m2_t _val  = vfwcvt_f_f_v_f64m2(a.val, 4); // widen all four float32 lanes into an m2 group
+    return v_float64x2(vget_f64m2_f64m1(_val, 1)); // keep part 1
+}
+
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{
+    return v_float64x2(vfcvt_f_x_v_f64m1(a.val, 2)); // same-width int64 -> float64 conversion, 2 lanes
+}
+
+#endif
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{
+    vuint64m1_t m0 = {0x0705060403010200, 0x0F0D0E0C0B090A08}; // byte indices, low byte first: 0,2,1,3,4,6,5,7 | 8,10,9,11,12,14,13,15
+    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
+{
+    return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); // reuses the v_int8x16 overload
+}
+
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{
+    vuint64m1_t m0 = {0x0703060205010400, 0x0F0B0E0A0D090C08}; // byte indices, low byte first: 0,4,1,5,2,6,3,7 | 8,12,9,13,10,14,11,15
+    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
+{
+    return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); // reuses the v_int8x16 overload
+}
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{
+    vuint64m1_t m0 = {0x0706030205040100, 0x0F0E0B0A0D0C0908}; // byte-level table; in 16-bit lanes: 0,2,1,3 | 4,6,5,7
+    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vec.val, (vuint8m1_t)m0, 16)); // gather is done on bytes, then cast back
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{
+    vuint64m1_t m0 = {0x0B0A030209080100, 0x0F0E07060D0C0504}; // byte-level table; in 16-bit lanes: 0,4,1,5 | 2,6,3,7
+    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); // gather is done on bytes, then cast back
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{
+    vuint64m1_t m0 = {0x0B0A090803020100, 0x0F0E0D0C07060504}; // byte-level table; in 32-bit lanes: 0,2 | 1,3
+    return v_int32x4((vint32m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); // gather is done on bytes, then cast back
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{
+    vuint64m1_t m0 = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; // picks bytes 0,1,2, 4,5,6, 8,9,10, 12,13,14; the last four indices are 0xFF
+    return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{
+    vuint64m1_t m0 = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A}; // byte-level table; in 16-bit lanes: 0,1,2, 4,5,6; the last four byte indices are 0xFF
+    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } // 4-lane vectors are returned unchanged
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
+#if CV_SIMD128_64F
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod(a, b)); } // converts the v_dotprod() result with v_cvt_f64()
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a,   const v_int32x4& b,
+                                    const v_float64x2& c)
+{ return v_dotprod_expand(a, b) + c; } // two-argument form plus the accumulator c
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4); // four widened 64-bit products
+    vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), 2); // adds part 0 to part 1 (products 0+2 and 1+3), then converts to float64
+    return v_float64x2(res);
+}
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ v_float64x2 res = v_dotprod_expand_fast(a, b);
+  return res + c; }
+#endif
+////// FP16 support ///////
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+    vfloat16m1_t v = vle_v_f16m1((__fp16*)ptr, 4); // load 4 half-precision values
+    vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4); // widen them to float32
+    return v_float32x4(vget_f32m2_f32m1(v32, 0)); // part 0 of the m2 result
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+    vfloat32m2_t v32 = vundefined_f32m2();
+    v32 = vset_f32m2(v32, 0, v.val); // only part 0 is populated
+    vfloat16m1_t hv = vfncvt_f_f_v_f16m1(v32, 4); // narrow 4 lanes to half precision
+    vse_v_f16m1((__fp16*)ptr, hv, 4); // store 4 half-precision values
+}
+
+
+inline void v_cleanup() {} // no-op in this backend
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+#endif
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 84df297bf9..eeb83c0744 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -2011,6 +2011,11 @@ public:
     template<typename _Tp> MatIterator_<_Tp> begin();
     template<typename _Tp> MatConstIterator_<_Tp> begin() const;
 
+    /** @brief Reverse counterpart of begin(): returns a std::reverse_iterator for traversing the matrix backwards.
+     */
+    template<typename _Tp> std::reverse_iterator<MatIterator_<_Tp>> rbegin();
+    template<typename _Tp> std::reverse_iterator<MatConstIterator_<_Tp>> rbegin() const;
+
     /** @brief Returns the matrix iterator and sets it to the after-last matrix element.
 
     The methods return the matrix read-only or read-write iterators, set to the point following the last
@@ -2019,6 +2024,12 @@ public:
     template<typename _Tp> MatIterator_<_Tp> end();
     template<typename _Tp> MatConstIterator_<_Tp> end() const;
 
+    /** @brief Reverse counterpart of end(): returns the std::reverse_iterator that terminates a backwards traversal.
+     */
+    template<typename _Tp> std::reverse_iterator< MatIterator_<_Tp>> rend();
+    template<typename _Tp> std::reverse_iterator< MatConstIterator_<_Tp>> rend() const;
+
+
     /** @brief Runs the given functor over all matrix elements in parallel.
 
     The operation passed as argument has to be a function pointer, a function object or a lambda(C++11).
@@ -2250,6 +2261,12 @@ public:
     const_iterator begin() const;
     const_iterator end() const;
 
+    // reverse iterators: std::reverse_iterator wrappers around iterator / const_iterator
+    std::reverse_iterator<iterator> rbegin();
+    std::reverse_iterator<iterator> rend();
+    std::reverse_iterator<const_iterator> rbegin() const;
+    std::reverse_iterator<const_iterator> rend() const;
+
     //! template methods for for operation over all matrix elements.
     // the operations take care of skipping gaps in the end of rows (if any)
     template<typename Functor> void forEach(const Functor& operation);
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp
index ff8297ffa4..886b82c6a0 100644
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -863,6 +863,33 @@ const _Tp* Mat::ptr(const int* idx) const
     return (const _Tp*)p;
 }
 
+template<int n> inline
+uchar* Mat::ptr(const Vec<int, n>& idx)
+{
+    return Mat::ptr(idx.val); // forwards the Vec's index array to the pointer-based overload
+}
+
+template<int n> inline
+const uchar* Mat::ptr(const Vec<int, n>& idx) const
+{
+    return Mat::ptr(idx.val); // forwards the Vec's index array to the pointer-based overload
+}
+
+template<typename _Tp, int n> inline
+_Tp* Mat::ptr(const Vec<int, n>& idx)
+{
+    CV_DbgAssert( elemSize() == sizeof(_Tp) ); // debug-only check that _Tp matches the element size
+    return Mat::ptr<_Tp>(idx.val); // forwards the Vec's index array to the typed pointer-based overload
+}
+
+template<typename _Tp, int n> inline
+const _Tp* Mat::ptr(const Vec<int, n>& idx) const
+{
+    CV_DbgAssert( elemSize() == sizeof(_Tp) ); // debug-only check that _Tp matches the element size
+    return Mat::ptr<_Tp>(idx.val); // forwards the Vec's index array to the typed pointer-based overload
+}
+
+
 template<typename _Tp> inline
 _Tp& Mat::at(int i0, int i1)
 {
@@ -988,6 +1015,17 @@ MatConstIterator_<_Tp> Mat::begin() const
     return MatConstIterator_<_Tp>((const Mat_<_Tp>*)this);
 }
 
+template<typename _Tp> inline
+std::reverse_iterator<MatConstIterator_<_Tp>> Mat::rbegin() const
+{
+    if (empty())
+        return std::reverse_iterator<MatConstIterator_<_Tp>>(); // empty matrix: default-constructed iterator
+    CV_DbgAssert( elemSize() == sizeof(_Tp) ); // debug-only check that _Tp matches the element size
+    MatConstIterator_<_Tp> it((const Mat_<_Tp>*)this);
+    it += total(); // advance from the first element by total() elements
+    return std::reverse_iterator<MatConstIterator_<_Tp>> (it); // reverse iteration starts from that position
+}
+
 template<typename _Tp> inline
 MatConstIterator_<_Tp> Mat::end() const
 {
@@ -999,6 +1037,15 @@ MatConstIterator_<_Tp> Mat::end() const
     return it;
 }
 
+template<typename _Tp> inline
+std::reverse_iterator<MatConstIterator_<_Tp>> Mat::rend() const
+{
+    if (empty())
+        return std::reverse_iterator<MatConstIterator_<_Tp>>();  // empty matrix: default-constructed iterator
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );  // debug-only check that _Tp matches the element size
+    return std::reverse_iterator<MatConstIterator_<_Tp>>(MatConstIterator_<_Tp>((const Mat_<_Tp>*)this));  // explicit begin iterator, as in the non-const rend()
+}
+
 template<typename _Tp> inline
 MatIterator_<_Tp> Mat::begin()
 {
@@ -1008,6 +1055,17 @@ MatIterator_<_Tp> Mat::begin()
     return MatIterator_<_Tp>((Mat_<_Tp>*)this);
 }
 
+template<typename _Tp> inline
+std::reverse_iterator<MatIterator_<_Tp>> Mat::rbegin()  // mutable reverse iterator over the elements, viewed as _Tp
+{
+    if (empty())
+        return std::reverse_iterator<MatIterator_<_Tp>>();  // empty Mat: default-constructed reverse iterator
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );  // _Tp must have exactly the matrix element size
+    MatIterator_<_Tp> it((Mat_<_Tp>*)this);  // same construction begin() uses
+    it += total();  // move forward by the total element count
+    return std::reverse_iterator<MatIterator_<_Tp>>(it);  // reverse iterator whose base is that advanced position
+}
+
 template<typename _Tp> inline
 MatIterator_<_Tp> Mat::end()
 {
@@ -1019,6 +1077,15 @@ MatIterator_<_Tp> Mat::end()
     return it;
 }
 
+template<typename _Tp> inline
+std::reverse_iterator<MatIterator_<_Tp>> Mat::rend()  // mutable end of the reverse element sequence, viewed as _Tp
+{
+    if (empty())
+        return std::reverse_iterator<MatIterator_<_Tp>>();  // empty Mat: same default-constructed value rbegin() returns
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );  // _Tp must have exactly the matrix element size
+    return std::reverse_iterator<MatIterator_<_Tp>>(MatIterator_<_Tp>((Mat_<_Tp>*)this));  // base is the iterator begin() returns
+}
+
 template<typename _Tp, typename Functor> inline
 void Mat::forEach(const Functor& operation) {
     this->forEach_impl<_Tp>(operation);
@@ -1686,24 +1753,48 @@ MatConstIterator_<_Tp> Mat_<_Tp>::begin() const
     return Mat::begin<_Tp>();
 }
 
+template<typename _Tp> inline
+std::reverse_iterator<MatConstIterator_<_Tp>> Mat_<_Tp>::rbegin() const
+{
+    return Mat::rbegin<_Tp>();  // delegates to the Mat template, instantiated with this Mat_'s element type
+}
+
 template<typename _Tp> inline
 MatConstIterator_<_Tp> Mat_<_Tp>::end() const
 {
     return Mat::end<_Tp>();
 }
 
+template<typename _Tp> inline
+std::reverse_iterator<MatConstIterator_<_Tp>> Mat_<_Tp>::rend() const
+{
+    return Mat::rend<_Tp>();  // delegates to the Mat template, instantiated with this Mat_'s element type
+}
+
 template<typename _Tp> inline
 MatIterator_<_Tp> Mat_<_Tp>::begin()
 {
     return Mat::begin<_Tp>();
 }
 
+template<typename _Tp> inline
+std::reverse_iterator<MatIterator_<_Tp>> Mat_<_Tp>::rbegin()
+{
+    return Mat::rbegin<_Tp>();  // delegates to the Mat template, instantiated with this Mat_'s element type
+}
+
 template<typename _Tp> inline
 MatIterator_<_Tp> Mat_<_Tp>::end()
 {
     return Mat::end<_Tp>();
 }
 
+template<typename _Tp> inline
+std::reverse_iterator<MatIterator_<_Tp>> Mat_<_Tp>::rend()
+{
+    return Mat::rend<_Tp>();  // delegates to the Mat template, instantiated with this Mat_'s element type
+}
+
 template<typename _Tp> template<typename Functor> inline
 void Mat_<_Tp>::forEach(const Functor& operation) {
     Mat::forEach<_Tp, Functor>(operation);
diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp
index 3a76be2353..f9cc9e019a 100644
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -43,6 +43,8 @@
 #define OPENCV_OPENCL_HPP
 
 #include "opencv2/core.hpp"
+#include <typeinfo>
+#include <typeindex>
 
 namespace cv { namespace ocl {
 
@@ -277,6 +279,12 @@ public:
     /** @returns cl_context value */
     void* ptr() const;
 
+    /**
+     * @brief Get an OpenCL context property that was specified on context creation
+     * @param propertyId Property id (CL_CONTEXT_* as defined in cl_context_properties type)
+     * @returns Property value if the property was specified in the clCreateContext call, or NULL if the context was created without it
+     */
+    void* getOpenCLContextProperty(int propertyId) const;
 
     bool useSVM() const;
     void setUseSVM(bool enabled);
@@ -290,6 +298,21 @@ public:
 
     void release();
 
+    class CV_EXPORTS UserContext {  // base type of the objects passed to setUserContext() / returned by getUserContext()
+    public:
+        virtual ~UserContext();  // virtual: makes the type polymorphic, which the dynamic_pointer_cast in getUserContext<T>() requires
+    };
+    template <typename T>
+    inline void setUserContext(const std::shared_ptr<T>& userContext) {  // typed convenience overload
+        setUserContext(typeid(T), userContext);  // keyed by T's type_index; shared_ptr<T> must convert to shared_ptr<UserContext>
+    }
+    template <typename T>
+    inline std::shared_ptr<T> getUserContext() {  // typed convenience overload
+        return std::dynamic_pointer_cast<T>(getUserContext(typeid(T)));  // looks up by T's type_index; the cast yields an empty pointer if the result is not a T
+    }
+    void setUserContext(std::type_index typeId, const std::shared_ptr<UserContext>& userContext);  // untyped form called by the template overload with typeid(T)
+    std::shared_ptr<UserContext> getUserContext(std::type_index typeId);  // untyped form called by the template overload with typeid(T)
+
     struct Impl;
     inline Impl* getImpl() const { return (Impl*)p; }
     inline bool empty() const { return !p; }
diff --git a/modules/core/include/opencv2/core/types.hpp b/modules/core/include/opencv2/core/types.hpp
index 819fd52817..3f0131da8c 100644
--- a/modules/core/include/opencv2/core/types.hpp
+++ b/modules/core/include/opencv2/core/types.hpp
@@ -714,24 +714,24 @@ public:
     //! the default constructor
     CV_WRAP KeyPoint();
     /**
-    @param _pt x & y coordinates of the keypoint
-    @param _size keypoint diameter
-    @param _angle keypoint orientation
-    @param _response keypoint detector response on the keypoint (that is, strength of the keypoint)
-    @param _octave pyramid octave in which the keypoint has been detected
-    @param _class_id object id
+    @param pt x & y coordinates of the keypoint
+    @param size keypoint diameter
+    @param angle keypoint orientation
+    @param response keypoint detector response on the keypoint (that is, strength of the keypoint)
+    @param octave pyramid octave in which the keypoint has been detected
+    @param class_id object id
      */
-    KeyPoint(Point2f _pt, float _size, float _angle=-1, float _response=0, int _octave=0, int _class_id=-1);
+    KeyPoint(Point2f pt, float size, float angle=-1, float response=0, int octave=0, int class_id=-1);
     /**
     @param x x-coordinate of the keypoint
     @param y y-coordinate of the keypoint
-    @param _size keypoint diameter
-    @param _angle keypoint orientation
-    @param _response keypoint detector response on the keypoint (that is, strength of the keypoint)
-    @param _octave pyramid octave in which the keypoint has been detected
-    @param _class_id object id
+    @param size keypoint diameter
+    @param angle keypoint orientation
+    @param response keypoint detector response on the keypoint (that is, strength of the keypoint)
+    @param octave pyramid octave in which the keypoint has been detected
+    @param class_id object id
      */
-    CV_WRAP KeyPoint(float x, float y, float _size, float _angle=-1, float _response=0, int _octave=0, int _class_id=-1);
+    CV_WRAP KeyPoint(float x, float y, float size, float angle=-1, float response=0, int octave=0, int class_id=-1);
 
     size_t hash() const;
 
diff --git a/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp b/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp
index bc3ae4d08a..d6390fc74a 100644
--- a/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp
+++ b/modules/core/include/opencv2/core/utils/plugin_loader.private.hpp
@@ -80,7 +80,9 @@ LibHandle_t libraryLoad_(const FileSystemPath_t& filename)
     return LoadLibraryW(filename.c_str());
 #endif
 #elif defined(__linux__) || defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) || defined(__GLIBC__)
-    return dlopen(filename.c_str(), RTLD_NOW);
+    void* handle = dlopen(filename.c_str(), RTLD_NOW);
+    CV_LOG_IF_DEBUG(NULL, !handle, "dlopen() error: " << dlerror());
+    return handle;
 #endif
 }
 
diff --git a/modules/core/misc/objc/common/Point2i.h b/modules/core/misc/objc/common/Point2i.h
index e43ee3a8ec..802c99d613 100644
--- a/modules/core/misc/objc/common/Point2i.h
+++ b/modules/core/misc/objc/common/Point2i.h
@@ -21,7 +21,6 @@ NS_ASSUME_NONNULL_BEGIN
 /**
 * Represents a two dimensional point the coordinate values of which are of type `int`
 */
-NS_SWIFT_NAME(Point)
 CV_EXPORTS @interface Point2i : NSObject
 
 # pragma mark - Properties
diff --git a/modules/core/misc/objc/common/Rect2i.h b/modules/core/misc/objc/common/Rect2i.h
index 6ed86d50bd..a9c1c6e04a 100644
--- a/modules/core/misc/objc/common/Rect2i.h
+++ b/modules/core/misc/objc/common/Rect2i.h
@@ -22,7 +22,6 @@ NS_ASSUME_NONNULL_BEGIN
 /**
 * Represents a rectange the coordinate and dimension values of which are of type `int`
 */
-NS_SWIFT_NAME(Rect)
 CV_EXPORTS @interface Rect2i : NSObject
 
 #pragma mark - Properties
diff --git a/modules/core/misc/objc/common/Size2i.h b/modules/core/misc/objc/common/Size2i.h
index cd74e2c84a..473efa3b57 100644
--- a/modules/core/misc/objc/common/Size2i.h
+++ b/modules/core/misc/objc/common/Size2i.h
@@ -21,7 +21,6 @@ NS_ASSUME_NONNULL_BEGIN
 /**
 * Represents the dimensions of a rectangle the values of which are of type `int`
 */
-NS_SWIFT_NAME(Size)
 CV_EXPORTS @interface Size2i : NSObject
 
 #pragma mark - Properties
diff --git a/modules/core/misc/objc/common/Typealiases.swift b/modules/core/misc/objc/common/Typealiases.swift
new file mode 100644
index 0000000000..534dc492fb
--- /dev/null
+++ b/modules/core/misc/objc/common/Typealiases.swift
@@ -0,0 +1,11 @@
+//
+//  Typealiases.swift
+//
+//  Created by Chris Ballinger on 2020/11/18.
+//
+
+import Foundation
+
+public typealias Rect = Rect2i  // keeps the Swift name `Rect` usable now that Rect2i no longer carries NS_SWIFT_NAME(Rect)
+public typealias Point = Point2i  // same for `Point`: NS_SWIFT_NAME(Point) was removed from Point2i
+public typealias Size = Size2i  // same for `Size`: NS_SWIFT_NAME(Size) was removed from Size2i
diff --git a/modules/core/misc/objc/gen_dict.json b/modules/core/misc/objc/gen_dict.json
index c2ee554eba..a645df19f5 100644
--- a/modules/core/misc/objc/gen_dict.json
+++ b/modules/core/misc/objc/gen_dict.json
@@ -113,13 +113,13 @@
             "objc_type": "Point2i*",
             "to_cpp": "%(n)s.nativeRef",
             "from_cpp": "[Point2i fromNative:%(n)s]",
-            "swift_type": "Point"
+            "swift_type": "Point2i"
         },
         "Point2i": {
             "objc_type": "Point2i*",
             "to_cpp": "%(n)s.nativeRef",
             "from_cpp": "[Point2i fromNative:%(n)s]",
-            "swift_type": "Point"
+            "swift_type": "Point2i"
         },
         "Point2f": {
             "objc_type": "Point2f*",
@@ -155,13 +155,13 @@
             "objc_type": "Rect2i*",
             "to_cpp": "%(n)s.nativeRef",
             "from_cpp": "[Rect2i fromNative:%(n)s]",
-            "swift_type": "Rect"
+            "swift_type": "Rect2i"
         },
         "Rect2i": {
             "objc_type": "Rect2i*",
             "to_cpp": "%(n)s.nativeRef",
             "from_cpp": "[Rect2i fromNative:%(n)s]",
-            "swift_type": "Rect"
+            "swift_type": "Rect2i"
         },
         "Rect2f": {
             "objc_type": "Rect2f*",
@@ -187,13 +187,13 @@
             "objc_type": "Size2i*",
             "to_cpp": "%(n)s.nativeRef",
             "from_cpp": "[Size2i fromNative:%(n)s]",
-            "swift_type": "Size"
+            "swift_type": "Size2i"
         },
         "Size2i": {
             "objc_type": "Size2i*",
             "to_cpp": "%(n)s.nativeRef",
             "from_cpp": "[Size2i fromNative:%(n)s]",
-            "swift_type": "Size"
+            "swift_type": "Size2i"
         },
         "Size2f": {
             "objc_type": "Size2f*",
@@ -275,7 +275,7 @@
         "vector_Point": {
             "objc_type": "Point2i*",
             "v_type": "Point2i",
-            "swift_type": "[Point]"
+            "swift_type": "[Point2i]"
         },
         "vector_Point2f": {
             "objc_type": "Point2f*",
@@ -300,7 +300,7 @@
         "vector_Rect": {
             "objc_type": "Rect2i*",
             "v_type": "Rect2i",
-            "swift_type": "[Rect]"
+            "swift_type": "[Rect2i]"
         },
         "vector_Rect2d": {
             "objc_type": "Rect2d*",
@@ -388,7 +388,7 @@
         "vector_vector_Point": {
             "objc_type": "Point2i*",
             "v_v_type": "Point2i",
-            "swift_type": "[[Point]]"
+            "swift_type": "[[Point2i]]"
         },
         "vector_vector_Point2f": {
             "objc_type": "Point2f*",
diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp
index 0cddc90998..f88597aacc 100644
--- a/modules/core/src/arithm.simd.hpp
+++ b/modules/core/src/arithm.simd.hpp
@@ -1910,4 +1910,4 @@ DEFINE_SIMD_ALL(recip, recip_loop)
     #define SIMD_GUARD
 #endif
 
-}} // cv::hal::
\ No newline at end of file
+}} // cv::hal::
diff --git a/modules/core/src/directx.cpp b/modules/core/src/directx.cpp
index 0173f02916..d17adc6b48 100644
--- a/modules/core/src/directx.cpp
+++ b/modules/core/src/directx.cpp
@@ -49,7 +49,6 @@
 #ifdef HAVE_DIRECTX
 #include <vector>
 #include "directx.inc.hpp"
-#include "directx.hpp"
 #else // HAVE_DIRECTX
 #define NO_DIRECTX_SUPPORT_ERROR CV_Error(cv::Error::StsBadFunc, "OpenCV was build without DirectX support")
 #endif
@@ -58,6 +57,8 @@
 #define NO_OPENCL_SUPPORT_ERROR CV_Error(cv::Error::StsBadFunc, "OpenCV was build without OpenCL support")
 #endif // HAVE_OPENCL
 
+using namespace cv::ocl;
+
 namespace cv { namespace directx {
 
 int getTypeFromDXGI_FORMAT(const int iDXGI_FORMAT)
@@ -236,187 +237,121 @@ int getTypeFromD3DFORMAT(const int iD3DFORMAT)
 }
 
 #if defined(HAVE_DIRECTX) && defined(HAVE_OPENCL)
-namespace internal {
-struct OpenCLDirectXImpl
+
+#ifdef HAVE_OPENCL_D3D11_NV
+class OpenCL_D3D11_NV : public ocl::Context::UserContext  // holds the NV D3D11 interop entry points plus a reference to the D3D11 device
 {
-    cl_platform_id platform_;
-
-    cl_platform_id initializedPlatform9 = NULL;
-    cl_platform_id initializedPlatform10 = NULL;
-    cl_platform_id initializedPlatform11 = NULL;
 public:
-    OpenCLDirectXImpl()
-        : platform_(0)
+    OpenCL_D3D11_NV(cl_platform_id platform, ID3D11Device*_device) : device(_device)  // resolves the three entry points on `platform`; raises OpenCLInitError if any is missing
     {
-    }
-
-    bool isDirect3DDevice9Ex = false; // Direct3DDevice9Ex or Direct3DDevice9 was used
-
-#ifdef HAVE_OPENCL_D3D11_NV
-    clCreateFromD3D11Texture2DNV_fn clCreateFromD3D11Texture2DNV = NULL;
-    clEnqueueAcquireD3D11ObjectsNV_fn clEnqueueAcquireD3D11ObjectsNV = NULL;
-    clEnqueueReleaseD3D11ObjectsNV_fn clEnqueueReleaseD3D11ObjectsNV = NULL;
-#endif
-    clCreateFromD3D11Texture2DKHR_fn clCreateFromD3D11Texture2DKHR = NULL;
-    clEnqueueAcquireD3D11ObjectsKHR_fn clEnqueueAcquireD3D11ObjectsKHR = NULL;
-    clEnqueueReleaseD3D11ObjectsKHR_fn clEnqueueReleaseD3D11ObjectsKHR = NULL;
-
-    clCreateFromD3D10Texture2DKHR_fn clCreateFromD3D10Texture2DKHR = NULL;
-    clEnqueueAcquireD3D10ObjectsKHR_fn clEnqueueAcquireD3D10ObjectsKHR = NULL;
-    clEnqueueReleaseD3D10ObjectsKHR_fn clEnqueueReleaseD3D10ObjectsKHR = NULL;
-
-    clCreateFromDX9MediaSurfaceKHR_fn clCreateFromDX9MediaSurfaceKHR = NULL;
-    clEnqueueAcquireDX9MediaSurfacesKHR_fn clEnqueueAcquireDX9MediaSurfacesKHR = NULL;
-    clEnqueueReleaseDX9MediaSurfacesKHR_fn clEnqueueReleaseDX9MediaSurfacesKHR = NULL;
-
-    cl_platform_id getPlatform()
-    {
-        if (!platform_)
+        clCreateFromD3D11Texture2DNV = (clCreateFromD3D11Texture2DNV_fn)
+            clGetExtensionFunctionAddressForPlatform(platform, "clCreateFromD3D11Texture2DNV");
+        clEnqueueAcquireD3D11ObjectsNV = (clEnqueueAcquireD3D11ObjectsNV_fn)
+            clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueAcquireD3D11ObjectsNV");
+        clEnqueueReleaseD3D11ObjectsNV = (clEnqueueReleaseD3D11ObjectsNV_fn)
+            clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueReleaseD3D11ObjectsNV");
+        if (!clCreateFromD3D11Texture2DNV || !clEnqueueAcquireD3D11ObjectsNV || !clEnqueueReleaseD3D11ObjectsNV)
         {
-            CV_Assert(cv::ocl::haveOpenCL());
-
-            cl_device_id device = (cl_device_id)ocl::Device::getDefault().ptr();
-            CV_Assert(device);
-            cl_int status = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform_), &platform_, NULL);
-            if (status != CL_SUCCESS)
-                CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get platform corresponding to device");
+            CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't find functions for D3D11_NV");
         }
-
-        return platform_;
+        device->AddRef();  // reference taken only after the lookups succeeded; dropped in the destructor (device is not null-checked here)
     }
-
-
-    bool initializeD3D11()
-    {
-        using namespace cv::ocl;
-        cl_platform_id platform = getPlatform();
-
-        bool useCLNVEXT = false;
-        size_t exts_len;
-        cl_int status = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, 0, NULL, &exts_len);
-        if (status != CL_SUCCESS)
-            CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get length of CL_PLATFORM_EXTENSIONS");
-        cv::AutoBuffer<char> extensions(exts_len);
-        status = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, exts_len, static_cast<void*>(extensions.data()), NULL);
-        if (status != CL_SUCCESS)
-            CV_Error(cv::Error::OpenCLInitError, "OpenCL: No available CL_PLATFORM_EXTENSIONS");
-        bool is_support_cl_khr_d3d11_sharing = false;
-        if (strstr(extensions.data(), "cl_khr_d3d11_sharing"))
-            is_support_cl_khr_d3d11_sharing = true;
-#ifdef HAVE_OPENCL_D3D11_NV
-        bool is_support_cl_nv_d3d11_sharing = false;
-        if (strstr(extensions.data(), "cl_nv_d3d11_sharing"))
-            is_support_cl_nv_d3d11_sharing = true;
-        if (!is_support_cl_nv_d3d11_sharing && !is_support_cl_khr_d3d11_sharing)
-            CV_Error(cv::Error::OpenCLInitError, "OpenCL: No supported extensions");
-#else
-        if (!is_support_cl_khr_d3d11_sharing)
-            CV_Error(cv::Error::OpenCLInitError, "OpenCL: No supported extensions");
+    ~OpenCL_D3D11_NV() {
+        device->Release();  // balances the AddRef() made in the constructor
+    }
+    ID3D11Device* device;  // NOTE(review): copy operations are not deleted; a copy would Release() twice - confirm instances are only held via shared_ptr
+    clCreateFromD3D11Texture2DNV_fn clCreateFromD3D11Texture2DNV;
+    clEnqueueAcquireD3D11ObjectsNV_fn clEnqueueAcquireD3D11ObjectsNV;
+    clEnqueueReleaseD3D11ObjectsNV_fn clEnqueueReleaseD3D11ObjectsNV;
+};
 #endif
 
-#ifdef HAVE_OPENCL_D3D11_NV
-        if (is_support_cl_nv_d3d11_sharing)
-        {
-            if (initializedPlatform11 != platform)
-            {
-                clCreateFromD3D11Texture2DNV = (clCreateFromD3D11Texture2DNV_fn)
-                    clGetExtensionFunctionAddressForPlatform(platform, "clCreateFromD3D11Texture2DNV");
-                clEnqueueAcquireD3D11ObjectsNV = (clEnqueueAcquireD3D11ObjectsNV_fn)
-                    clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueAcquireD3D11ObjectsNV");
-                clEnqueueReleaseD3D11ObjectsNV = (clEnqueueReleaseD3D11ObjectsNV_fn)
-                    clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueReleaseD3D11ObjectsNV");
-                initializedPlatform11 = platform;
-            }
-            if (clCreateFromD3D11Texture2DNV && clEnqueueAcquireD3D11ObjectsNV && clEnqueueReleaseD3D11ObjectsNV)
-            {
-                useCLNVEXT = true;
-            }
-        }
-        else
-#endif
-        {
-            if (is_support_cl_khr_d3d11_sharing)
-            {
-                if (initializedPlatform11 != platform)
-                {
-                    clCreateFromD3D11Texture2DKHR = (clCreateFromD3D11Texture2DKHR_fn)
-                        clGetExtensionFunctionAddressForPlatform(platform, "clCreateFromD3D11Texture2DKHR");
-                    clEnqueueAcquireD3D11ObjectsKHR = (clEnqueueAcquireD3D11ObjectsKHR_fn)
-                        clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueAcquireD3D11ObjectsKHR");
-                    clEnqueueReleaseD3D11ObjectsKHR = (clEnqueueReleaseD3D11ObjectsKHR_fn)
-                        clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueReleaseD3D11ObjectsKHR");
-                    initializedPlatform11 = platform;
-                }
-                if (!clCreateFromD3D11Texture2DKHR || !clEnqueueAcquireD3D11ObjectsKHR || !clEnqueueReleaseD3D11ObjectsKHR)
-                {
-                    CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't find functions for D3D11");
-                }
-            }
-        }
-        return useCLNVEXT;
-    }
-
-    void initializeD3D9()
+class OpenCL_D3D11 : public ocl::Context::UserContext  // holds the KHR D3D11 interop entry points plus a reference to the D3D11 device
+{
+public:
+    OpenCL_D3D11(cl_platform_id platform, ID3D11Device* _device) : device(_device)  // resolves the three entry points on `platform`; raises OpenCLInitError if any is missing
     {
-        using namespace cv::ocl;
-        cl_platform_id platform = getPlatform();
-        if (initializedPlatform9 != platform)
+        clCreateFromD3D11Texture2DKHR = (clCreateFromD3D11Texture2DKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(platform, "clCreateFromD3D11Texture2DKHR");
+        clEnqueueAcquireD3D11ObjectsKHR = (clEnqueueAcquireD3D11ObjectsKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueAcquireD3D11ObjectsKHR");
+        clEnqueueReleaseD3D11ObjectsKHR = (clEnqueueReleaseD3D11ObjectsKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueReleaseD3D11ObjectsKHR");
+        if (!clCreateFromD3D11Texture2DKHR || !clEnqueueAcquireD3D11ObjectsKHR || !clEnqueueReleaseD3D11ObjectsKHR)
         {
-            clCreateFromDX9MediaSurfaceKHR = (clCreateFromDX9MediaSurfaceKHR_fn)
-                clGetExtensionFunctionAddressForPlatform(platform, "clCreateFromDX9MediaSurfaceKHR");
-            clEnqueueAcquireDX9MediaSurfacesKHR = (clEnqueueAcquireDX9MediaSurfacesKHR_fn)
-                clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueAcquireDX9MediaSurfacesKHR");
-            clEnqueueReleaseDX9MediaSurfacesKHR = (clEnqueueReleaseDX9MediaSurfacesKHR_fn)
-                clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueReleaseDX9MediaSurfacesKHR");
-            initializedPlatform9 = platform;
+            CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't find functions for D3D11");
         }
+        device->AddRef();  // reference taken only after the lookups succeeded; dropped in the destructor (device is not null-checked here)
+    }
+    ~OpenCL_D3D11() {
+        device->Release();  // balances the AddRef() made in the constructor
+    }
+    ID3D11Device* device;  // NOTE(review): copy operations are not deleted; a copy would Release() twice - confirm instances are only held via shared_ptr
+    clCreateFromD3D11Texture2DKHR_fn clCreateFromD3D11Texture2DKHR;
+    clEnqueueAcquireD3D11ObjectsKHR_fn clEnqueueAcquireD3D11ObjectsKHR;
+    clEnqueueReleaseD3D11ObjectsKHR_fn clEnqueueReleaseD3D11ObjectsKHR;
+};
+
+class OpenCL_D3D9 : public ocl::Context::UserContext  // holds the DX9 media-surface interop entry points plus references to the D3D9 device(s)
+{
+public:
+    OpenCL_D3D9(cl_platform_id platform, IDirect3DDevice9* _device, IDirect3DDevice9Ex* _deviceEx)  // either device pointer may be null; both are null-checked below
+        : device(_device)
+        , deviceEx(_deviceEx)
+    {
+        clCreateFromDX9MediaSurfaceKHR = (clCreateFromDX9MediaSurfaceKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(platform, "clCreateFromDX9MediaSurfaceKHR");
+        clEnqueueAcquireDX9MediaSurfacesKHR = (clEnqueueAcquireDX9MediaSurfacesKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueAcquireDX9MediaSurfacesKHR");
+        clEnqueueReleaseDX9MediaSurfacesKHR = (clEnqueueReleaseDX9MediaSurfacesKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueReleaseDX9MediaSurfacesKHR");
         if (!clCreateFromDX9MediaSurfaceKHR || !clEnqueueAcquireDX9MediaSurfacesKHR || !clEnqueueReleaseDX9MediaSurfacesKHR)
         {
             CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't find functions for D3D9");
         }
+        if (device)
+            device->AddRef();  // reference taken only after the lookups succeeded
+        if (deviceEx)
+            deviceEx->AddRef();  // reference taken only after the lookups succeeded
     }
+    ~OpenCL_D3D9() {
+        if (device)
+            device->Release();  // balances the constructor's AddRef()
+        if (deviceEx)
+            deviceEx->Release();  // balances the constructor's AddRef()
+    }
+    IDirect3DDevice9* device;  // NOTE(review): copy operations are not deleted; a copy would Release() twice - confirm instances are only held via shared_ptr
+    IDirect3DDevice9Ex* deviceEx;
+    clCreateFromDX9MediaSurfaceKHR_fn clCreateFromDX9MediaSurfaceKHR;
+    clEnqueueAcquireDX9MediaSurfacesKHR_fn clEnqueueAcquireDX9MediaSurfacesKHR;
+    clEnqueueReleaseDX9MediaSurfacesKHR_fn clEnqueueReleaseDX9MediaSurfacesKHR;
+};
 
-    void initializeD3D10()
+class OpenCL_D3D10 : public ocl::Context::UserContext  // holds the KHR D3D10 interop entry points plus a reference to the D3D10 device
+{
+public:
+    OpenCL_D3D10(cl_platform_id platform, ID3D10Device* _device) : device(_device)  // resolves the three entry points on `platform`; raises OpenCLInitError if any is missing
     {
-        using namespace cv::ocl;
-        cl_platform_id platform = getPlatform();
-        if (initializedPlatform10 != platform)
-        {
-            clCreateFromD3D10Texture2DKHR = (clCreateFromD3D10Texture2DKHR_fn)
-                clGetExtensionFunctionAddressForPlatform(platform, "clCreateFromD3D10Texture2DKHR");
-            clEnqueueAcquireD3D10ObjectsKHR = (clEnqueueAcquireD3D10ObjectsKHR_fn)
-                clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueAcquireD3D10ObjectsKHR");
-            clEnqueueReleaseD3D10ObjectsKHR = (clEnqueueReleaseD3D10ObjectsKHR_fn)
-                clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueReleaseD3D10ObjectsKHR");
-            initializedPlatform10 = platform;
-        }
+        clCreateFromD3D10Texture2DKHR = (clCreateFromD3D10Texture2DKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(platform, "clCreateFromD3D10Texture2DKHR");
+        clEnqueueAcquireD3D10ObjectsKHR = (clEnqueueAcquireD3D10ObjectsKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueAcquireD3D10ObjectsKHR");
+        clEnqueueReleaseD3D10ObjectsKHR = (clEnqueueReleaseD3D10ObjectsKHR_fn)
+            clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueReleaseD3D10ObjectsKHR");
         if (!clCreateFromD3D10Texture2DKHR || !clEnqueueAcquireD3D10ObjectsKHR || !clEnqueueReleaseD3D10ObjectsKHR)
         {
             CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't find functions for D3D10");
         }
+        device->AddRef();  // reference taken only after the lookups succeeded; dropped in the destructor (device is not null-checked here)
     }
+    ~OpenCL_D3D10() {
+        device->Release();  // balances the AddRef() made in the constructor
+    }
+    ID3D10Device* device;  // NOTE(review): copy operations are not deleted; a copy would Release() twice - confirm instances are only held via shared_ptr
+    clCreateFromD3D10Texture2DKHR_fn clCreateFromD3D10Texture2DKHR;
+    clEnqueueAcquireD3D10ObjectsKHR_fn clEnqueueAcquireD3D10ObjectsKHR;
+    clEnqueueReleaseD3D10ObjectsKHR_fn clEnqueueReleaseD3D10ObjectsKHR;
 };
-
-OpenCLDirectXImpl* createDirectXImpl()
-{
-    return new OpenCLDirectXImpl();
-}
-void deleteDirectXImpl(OpenCLDirectXImpl** p)
-{
-    if (*p)
-    {
-        delete (*p);
-        *p = NULL;
-    }
-}
-OpenCLDirectXImpl& getImpl()
-{
-    OpenCLDirectXImpl* i = getDirectXImpl(ocl::Context::getDefault());
-    CV_Assert(i);
-    return *i;
-}
-}
-using namespace internal;
 #endif
 
 namespace ocl {
@@ -443,95 +378,57 @@ Context& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device)
 
     // TODO Filter platforms by name from OPENCV_OPENCL_DEVICE
 
-    size_t exts_len;
-    cv::AutoBuffer<char> extensions;
-    bool is_support_cl_khr_d3d11_sharing = false;
-#ifdef HAVE_OPENCL_D3D11_NV
-    bool is_support_cl_nv_d3d11_sharing = false;
-#endif
     for (int i = 0; i < (int)numPlatforms; i++)
     {
-        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 0, NULL, &exts_len);
-        if (status != CL_SUCCESS)
-            CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get length of CL_PLATFORM_EXTENSIONS");
-        extensions.resize(exts_len);
-        status = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, exts_len, static_cast<void*>(extensions.data()), NULL);
-        if (status != CL_SUCCESS)
-            CV_Error(cv::Error::OpenCLInitError, "OpenCL: No available CL_PLATFORM_EXTENSIONS");
-        if (strstr(extensions.data(), "cl_khr_d3d11_sharing"))
-            is_support_cl_khr_d3d11_sharing = true;
-#ifdef HAVE_OPENCL_D3D11_NV
-        if (strstr(extensions.data(), "cl_nv_d3d11_sharing"))
-            is_support_cl_nv_d3d11_sharing = true;
-#endif
-    }
-#ifdef HAVE_OPENCL_D3D11_NV
-    if (!is_support_cl_nv_d3d11_sharing && !is_support_cl_khr_d3d11_sharing)
-        CV_Error(cv::Error::OpenCLInitError, "OpenCL: No supported extensions");
-#else
-    if (!is_support_cl_khr_d3d11_sharing)
-        CV_Error(cv::Error::OpenCLInitError, "OpenCL: No supported extensions");
-#endif
+        cl_platform_id platform = platforms[i];
+        std::string platformName = PlatformInfo(&platform).name();
 
-    int found = -1;
-    cl_device_id device = NULL;
-    cl_uint numDevices = 0;
-    cl_context context = NULL;
+        int found = -1;
+        cl_device_id device = NULL;
+        cl_uint numDevices = 0;
+        cl_context context = NULL;
 
 #ifdef HAVE_OPENCL_D3D11_NV
-    if (is_support_cl_nv_d3d11_sharing)
-    {
-        // try with CL_PREFERRED_DEVICES_FOR_D3D11_NV
-        for (int i = 0; i < (int)numPlatforms; i++)
-        {
-            clGetDeviceIDsFromD3D11NV_fn clGetDeviceIDsFromD3D11NV = (clGetDeviceIDsFromD3D11NV_fn)
-                    clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D11NV");
-            if (!clGetDeviceIDsFromD3D11NV)
-                continue;
-
-            device = NULL;
-            numDevices = 0;
-            status = clGetDeviceIDsFromD3D11NV(platforms[i], CL_D3D11_DEVICE_NV, pD3D11Device,
-                    CL_PREFERRED_DEVICES_FOR_D3D11_NV, 1, &device, &numDevices);
-            if (status != CL_SUCCESS)
-                continue;
-            if (numDevices > 0)
-            {
-                cl_context_properties properties[] = {
-                        CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
-                        CL_CONTEXT_D3D11_DEVICE_NV, (cl_context_properties)(pD3D11Device),
-                        //CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
-                        0
-                };
-
-                context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
-                if (status != CL_SUCCESS)
-                {
-                    clReleaseDevice(device);
-                }
-                else
-                {
-                    found = i;
-                    break;
-                }
-            }
-        }
-        if (found < 0)
-        {
-            // try with CL_ALL_DEVICES_FOR_D3D11_NV
-            for (int i = 0; i < (int)numPlatforms; i++)
-            {
-                clGetDeviceIDsFromD3D11NV_fn clGetDeviceIDsFromD3D11NV = (clGetDeviceIDsFromD3D11NV_fn)
-                        clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D11NV");
-                if (!clGetDeviceIDsFromD3D11NV)
-                    continue;
-
+        // Get extension function "clGetDeviceIDsFromD3D11NV" (part of OpenCL extension "cl_nv_d3d11_sharing")
+        clGetDeviceIDsFromD3D11NV_fn clGetDeviceIDsFromD3D11NV = (clGetDeviceIDsFromD3D11NV_fn)
+            clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D11NV");
+        if (clGetDeviceIDsFromD3D11NV) {
+            // try with CL_PREFERRED_DEVICES_FOR_D3D11_NV
+            do {
                 device = NULL;
                 numDevices = 0;
                 status = clGetDeviceIDsFromD3D11NV(platforms[i], CL_D3D11_DEVICE_NV, pD3D11Device,
-                        CL_ALL_DEVICES_FOR_D3D11_NV, 1, &device, &numDevices);
+                    CL_PREFERRED_DEVICES_FOR_D3D11_NV, 1, &device, &numDevices);
                 if (status != CL_SUCCESS)
-                    continue;
+                    break;
+                if (numDevices > 0)
+                {
+                    cl_context_properties properties[] = {
+                            CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
+                            CL_CONTEXT_D3D11_DEVICE_NV, (cl_context_properties)(pD3D11Device),
+                            //CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
+                            0
+                    };
+
+                    context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
+                    if (status != CL_SUCCESS)
+                    {
+                        clReleaseDevice(device);
+                    }
+                    else
+                    {
+                        found = i;
+                    }
+                }
+            } while (0);
+            // try with CL_ALL_DEVICES_FOR_D3D11_NV
+            if (found < 0) do {
+                device = NULL;
+                numDevices = 0;
+                status = clGetDeviceIDsFromD3D11NV(platforms[i], CL_D3D11_DEVICE_NV, pD3D11Device,
+                    CL_ALL_DEVICES_FOR_D3D11_NV, 1, &device, &numDevices);
+                if (status != CL_SUCCESS)
+                    break;
                 if (numDevices > 0)
                 {
                     cl_context_properties properties[] = {
@@ -548,33 +445,43 @@ Context& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device)
                     else
                     {
                         found = i;
-                        break;
                     }
                 }
+            } while (0);
+            if (found >= 0) {
+                OpenCLExecutionContext clExecCtx;
+                try
+                {
+                    clExecCtx = OpenCLExecutionContext::create(platformName, platform, context, device);
+                    clExecCtx.getContext().setUserContext(std::make_shared<OpenCL_D3D11_NV>(platform, pD3D11Device));
+                }
+                catch (...)
+                {
+                    clReleaseDevice(device);
+                    clReleaseContext(context);
+                    throw;
+                }
+                clExecCtx.bind();
+                return const_cast<Context&>(clExecCtx.getContext());
             }
         }
-    }
 #endif
-    if (is_support_cl_khr_d3d11_sharing)
-    {
-        if (found < 0)
+        // Get extension function "clGetDeviceIDsFromD3D11KHR" (part of OpenCL extension "cl_khr_d3d11_sharing")
+        clGetDeviceIDsFromD3D11KHR_fn clGetDeviceIDsFromD3D11KHR = (clGetDeviceIDsFromD3D11KHR_fn)
+            clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D11KHR");
+        if (clGetDeviceIDsFromD3D11KHR)
         {
             // try with CL_PREFERRED_DEVICES_FOR_D3D11_KHR
-            for (int i = 0; i < (int)numPlatforms; i++)
-            {
-                clGetDeviceIDsFromD3D11KHR_fn clGetDeviceIDsFromD3D11KHR = (clGetDeviceIDsFromD3D11KHR_fn)
-                        clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D11KHR");
-                if (!clGetDeviceIDsFromD3D11KHR)
-                    continue;
+            do {
 
                 device = NULL;
                 numDevices = 0;
 
                 status = clGetDeviceIDsFromD3D11KHR(platforms[i], CL_D3D11_DEVICE_KHR, pD3D11Device,
-                        CL_PREFERRED_DEVICES_FOR_D3D11_KHR, 1, &device, &numDevices);
+                    CL_PREFERRED_DEVICES_FOR_D3D11_KHR, 1, &device, &numDevices);
 
                 if (status != CL_SUCCESS)
-                    continue;
+                    break;
                 if (numDevices > 0)
                 {
                     cl_context_properties properties[] = {
@@ -591,27 +498,17 @@ Context& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device)
                     else
                     {
                         found = i;
-                        break;
                     }
                 }
-            }
-        }
-        if (found < 0)
-        {
+            } while (0);
             // try with CL_ALL_DEVICES_FOR_D3D11_KHR
-            for (int i = 0; i < (int)numPlatforms; i++)
-            {
-                clGetDeviceIDsFromD3D11KHR_fn clGetDeviceIDsFromD3D11KHR = (clGetDeviceIDsFromD3D11KHR_fn)
-                        clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D11KHR");
-                if (!clGetDeviceIDsFromD3D11KHR)
-                    continue;
-
+            if (found < 0) do {
                 device = NULL;
                 numDevices = 0;
                 status = clGetDeviceIDsFromD3D11KHR(platforms[i], CL_D3D11_DEVICE_KHR, pD3D11Device,
-                        CL_ALL_DEVICES_FOR_D3D11_KHR, 1, &device, &numDevices);
+                    CL_ALL_DEVICES_FOR_D3D11_KHR, 1, &device, &numDevices);
                 if (status != CL_SUCCESS)
-                    continue;
+                    break;
                 if (numDevices > 0)
                 {
                     cl_context_properties properties[] = {
@@ -628,33 +525,30 @@ Context& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device)
                     else
                     {
                         found = i;
-                        break;
                     }
                 }
+            } while (0);
+
+            if (found >= 0) {
+                OpenCLExecutionContext clExecCtx;
+                try
+                {
+                    clExecCtx = OpenCLExecutionContext::create(platformName, platform, context, device);
+                    clExecCtx.getContext().setUserContext(std::make_shared<OpenCL_D3D11>(platform, pD3D11Device));
+                }
+                catch (...)
+                {
+                    clReleaseDevice(device);
+                    clReleaseContext(context);
+                    throw;
+                }
+                clExecCtx.bind();
+                return const_cast<Context&>(clExecCtx.getContext());
             }
         }
     }
-    if (found < 0)
-    {
-        CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't create context for DirectX interop");
-    }
 
-    cl_platform_id platform = platforms[found];
-    std::string platformName = PlatformInfo(&platform).name();
-
-    OpenCLExecutionContext clExecCtx;
-    try
-    {
-        clExecCtx = OpenCLExecutionContext::create(platformName, platform, context, device);
-    }
-    catch (...)
-    {
-        clReleaseDevice(device);
-        clReleaseContext(context);
-        throw;
-    }
-    clExecCtx.bind();
-    return const_cast<Context&>(clExecCtx.getContext());
+    CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't create context for DirectX interop");
 #endif
 }
 
@@ -679,62 +573,28 @@ Context& initializeContextFromD3D10Device(ID3D10Device* pD3D10Device)
         CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get platforms");
 
     // TODO Filter platforms by name from OPENCV_OPENCL_DEVICE
-
-    int found = -1;
-    cl_device_id device = NULL;
-    cl_uint numDevices = 0;
-    cl_context context = NULL;
-
-    // try with CL_PREFERRED_DEVICES_FOR_D3D10_KHR
     for (int i = 0; i < (int)numPlatforms; i++)
     {
+        cl_platform_id platform = platforms[i];
+        std::string platformName = PlatformInfo(&platform).name();
+        int found = -1;
+        cl_device_id device = NULL;
+        cl_uint numDevices = 0;
+        cl_context context = NULL;
+
         clGetDeviceIDsFromD3D10KHR_fn clGetDeviceIDsFromD3D10KHR = (clGetDeviceIDsFromD3D10KHR_fn)
-                clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D10KHR");
+            clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D10KHR");
         if (!clGetDeviceIDsFromD3D10KHR)
             continue;
 
-        device = NULL;
-        numDevices = 0;
-        status = clGetDeviceIDsFromD3D10KHR(platforms[i], CL_D3D10_DEVICE_KHR, pD3D10Device,
-                CL_PREFERRED_DEVICES_FOR_D3D10_KHR, 1, &device, &numDevices);
-        if (status != CL_SUCCESS)
-            continue;
-        if (numDevices > 0)
-        {
-            cl_context_properties properties[] = {
-                    CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
-                    CL_CONTEXT_D3D10_DEVICE_KHR, (cl_context_properties)(pD3D10Device),
-                    CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
-                    NULL, NULL
-            };
-            context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
-            if (status != CL_SUCCESS)
-            {
-                clReleaseDevice(device);
-            }
-            else
-            {
-                found = i;
-                break;
-            }
-        }
-    }
-    if (found < 0)
-    {
-        // try with CL_ALL_DEVICES_FOR_D3D10_KHR
-        for (int i = 0; i < (int)numPlatforms; i++)
-        {
-            clGetDeviceIDsFromD3D10KHR_fn clGetDeviceIDsFromD3D10KHR = (clGetDeviceIDsFromD3D10KHR_fn)
-                    clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromD3D10KHR");
-            if (!clGetDeviceIDsFromD3D10KHR)
-                continue;
-
+        // try with CL_PREFERRED_DEVICES_FOR_D3D10_KHR
+        do {
             device = NULL;
             numDevices = 0;
             status = clGetDeviceIDsFromD3D10KHR(platforms[i], CL_D3D10_DEVICE_KHR, pD3D10Device,
-                    CL_ALL_DEVICES_FOR_D3D10_KHR, 1, &device, &numDevices);
+                CL_PREFERRED_DEVICES_FOR_D3D10_KHR, 1, &device, &numDevices);
             if (status != CL_SUCCESS)
-                continue;
+                break;
             if (numDevices > 0)
             {
                 cl_context_properties properties[] = {
@@ -751,30 +611,56 @@ Context& initializeContextFromD3D10Device(ID3D10Device* pD3D10Device)
                 else
                 {
                     found = i;
-                    break;
                 }
             }
+        } while (0);
+        // try with CL_ALL_DEVICES_FOR_D3D10_KHR
+        if (found < 0) do
+        {
+            device = NULL;
+            numDevices = 0;
+            status = clGetDeviceIDsFromD3D10KHR(platforms[i], CL_D3D10_DEVICE_KHR, pD3D10Device,
+                CL_ALL_DEVICES_FOR_D3D10_KHR, 1, &device, &numDevices);
+            if (status != CL_SUCCESS)
+                break;
+            if (numDevices > 0)
+            {
+                cl_context_properties properties[] = {
+                        CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
+                        CL_CONTEXT_D3D10_DEVICE_KHR, (cl_context_properties)(pD3D10Device),
+                        CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
+                        NULL, NULL
+                };
+                context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
+                if (status != CL_SUCCESS)
+                {
+                    clReleaseDevice(device);
+                }
+                else
+                {
+                    found = i;
+                }
+            }
+        } while (0);
+
+        if (found >= 0) {
+            OpenCLExecutionContext clExecCtx;
+            try
+            {
+                clExecCtx = OpenCLExecutionContext::create(platformName, platform, context, device);
+                clExecCtx.getContext().setUserContext(std::make_shared<OpenCL_D3D10>(platform, pD3D10Device));
+            }
+            catch (...)
+            {
+                clReleaseDevice(device);
+                clReleaseContext(context);
+                throw;
+            }
+            clExecCtx.bind();
+            return const_cast<Context&>(clExecCtx.getContext());
         }
-        if (found < 0)
-            CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't create context for DirectX interop");
     }
-
-    cl_platform_id platform = platforms[found];
-    std::string platformName = PlatformInfo(&platform).name();
-
-    OpenCLExecutionContext clExecCtx;
-    try
-    {
-        clExecCtx = OpenCLExecutionContext::create(platformName, platform, context, device);
-    }
-    catch (...)
-    {
-        clReleaseDevice(device);
-        clReleaseContext(context);
-        throw;
-    }
-    clExecCtx.bind();
-    return const_cast<Context&>(clExecCtx.getContext());
+    CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't create context for DirectX interop");
 #endif
 }
 
@@ -799,64 +685,29 @@ Context& initializeContextFromDirect3DDevice9Ex(IDirect3DDevice9Ex* pDirect3DDev
         CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get platforms");
 
     // TODO Filter platforms by name from OPENCV_OPENCL_DEVICE
-
-    int found = -1;
-    cl_device_id device = NULL;
-    cl_uint numDevices = 0;
-    cl_context context = NULL;
-
-    // try with CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR
     for (int i = 0; i < (int)numPlatforms; i++)
     {
+        cl_platform_id platform = platforms[i];
+        std::string platformName = PlatformInfo(&platform).name();
+        int found = -1;
+        cl_device_id device = NULL;
+        cl_uint numDevices = 0;
+        cl_context context = NULL;
+
         clGetDeviceIDsFromDX9MediaAdapterKHR_fn clGetDeviceIDsFromDX9MediaAdapterKHR = (clGetDeviceIDsFromDX9MediaAdapterKHR_fn)
-                clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromDX9MediaAdapterKHR");
+            clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromDX9MediaAdapterKHR");
         if (!clGetDeviceIDsFromDX9MediaAdapterKHR)
             continue;
 
-        device = NULL;
-        numDevices = 0;
-        cl_dx9_media_adapter_type_khr type = CL_ADAPTER_D3D9EX_KHR;
-        status = clGetDeviceIDsFromDX9MediaAdapterKHR(platforms[i], 1, &type, &pDirect3DDevice9Ex,
-                CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 1, &device, &numDevices);
-        if (status != CL_SUCCESS)
-            continue;
-        if (numDevices > 0)
-        {
-            cl_context_properties properties[] = {
-                    CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
-                    CL_CONTEXT_ADAPTER_D3D9EX_KHR, (cl_context_properties)(pDirect3DDevice9Ex),
-                    CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
-                    NULL, NULL
-            };
-            context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
-            if (status != CL_SUCCESS)
-            {
-                clReleaseDevice(device);
-            }
-            else
-            {
-                found = i;
-                break;
-            }
-        }
-    }
-    if (found < 0)
-    {
-        // try with CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR
-        for (int i = 0; i < (int)numPlatforms; i++)
-        {
-            clGetDeviceIDsFromDX9MediaAdapterKHR_fn clGetDeviceIDsFromDX9MediaAdapterKHR = (clGetDeviceIDsFromDX9MediaAdapterKHR_fn)
-                    clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromDX9MediaAdapterKHR");
-            if (!clGetDeviceIDsFromDX9MediaAdapterKHR)
-                continue;
-
+        // try with CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR
+        do {
             device = NULL;
             numDevices = 0;
             cl_dx9_media_adapter_type_khr type = CL_ADAPTER_D3D9EX_KHR;
             status = clGetDeviceIDsFromDX9MediaAdapterKHR(platforms[i], 1, &type, &pDirect3DDevice9Ex,
-                    CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 1, &device, &numDevices);
+                CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 1, &device, &numDevices);
             if (status != CL_SUCCESS)
-                continue;
+                break;
             if (numDevices > 0)
             {
                 cl_context_properties properties[] = {
@@ -873,31 +724,57 @@ Context& initializeContextFromDirect3DDevice9Ex(IDirect3DDevice9Ex* pDirect3DDev
                 else
                 {
                     found = i;
-                    break;
                 }
             }
+        } while (0);
+        // try with CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR
+        if (found < 0) do
+        {
+            device = NULL;
+            numDevices = 0;
+            cl_dx9_media_adapter_type_khr type = CL_ADAPTER_D3D9EX_KHR;
+            status = clGetDeviceIDsFromDX9MediaAdapterKHR(platforms[i], 1, &type, &pDirect3DDevice9Ex,
+                CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 1, &device, &numDevices);
+            if (status != CL_SUCCESS)
+                break;
+            if (numDevices > 0)
+            {
+                cl_context_properties properties[] = {
+                        CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
+                        CL_CONTEXT_ADAPTER_D3D9EX_KHR, (cl_context_properties)(pDirect3DDevice9Ex),
+                        CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
+                        NULL, NULL
+                };
+                context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
+                if (status != CL_SUCCESS)
+                {
+                    clReleaseDevice(device);
+                }
+                else
+                {
+                    found = i;
+                }
+            }
+        } while (0);
+
+        if (found >= 0) {
+            OpenCLExecutionContext clExecCtx;
+            try
+            {
+                clExecCtx = OpenCLExecutionContext::create(platformName, platform, context, device);
+                clExecCtx.getContext().setUserContext(std::make_shared<OpenCL_D3D9>(platform, nullptr, pDirect3DDevice9Ex));
+            }
+            catch (...)
+            {
+                clReleaseDevice(device);
+                clReleaseContext(context);
+                throw;
+            }
+            clExecCtx.bind();
+            return const_cast<Context&>(clExecCtx.getContext());
         }
-        if (found < 0)
-            CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't create context for DirectX interop");
     }
-
-    cl_platform_id platform = platforms[found];
-    std::string platformName = PlatformInfo(&platform).name();
-
-    OpenCLExecutionContext clExecCtx;
-    try
-    {
-        clExecCtx = OpenCLExecutionContext::create(platformName, platform, context, device);
-    }
-    catch (...)
-    {
-        clReleaseDevice(device);
-        clReleaseContext(context);
-        throw;
-    }
-    clExecCtx.bind();
-    getImpl().isDirect3DDevice9Ex = true;
-    return const_cast<Context&>(clExecCtx.getContext());
+    CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't create context for DirectX interop");
 #endif
 }
 
@@ -922,64 +799,29 @@ Context& initializeContextFromDirect3DDevice9(IDirect3DDevice9* pDirect3DDevice9
         CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get platforms");
 
     // TODO Filter platforms by name from OPENCV_OPENCL_DEVICE
-
-    int found = -1;
-    cl_device_id device = NULL;
-    cl_uint numDevices = 0;
-    cl_context context = NULL;
-
-    // try with CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR
     for (int i = 0; i < (int)numPlatforms; i++)
     {
+        cl_platform_id platform = platforms[i];
+        std::string platformName = PlatformInfo(&platform).name();
+        int found = -1;
+        cl_device_id device = NULL;
+        cl_uint numDevices = 0;
+        cl_context context = NULL;
+
         clGetDeviceIDsFromDX9MediaAdapterKHR_fn clGetDeviceIDsFromDX9MediaAdapterKHR = (clGetDeviceIDsFromDX9MediaAdapterKHR_fn)
-                clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromDX9MediaAdapterKHR");
+            clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromDX9MediaAdapterKHR");
         if (!clGetDeviceIDsFromDX9MediaAdapterKHR)
             continue;
 
-        device = NULL;
-        numDevices = 0;
-        cl_dx9_media_adapter_type_khr type = CL_ADAPTER_D3D9_KHR;
-        status = clGetDeviceIDsFromDX9MediaAdapterKHR(platforms[i], 1, &type, &pDirect3DDevice9,
-                CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 1, &device, &numDevices);
-        if (status != CL_SUCCESS)
-            continue;
-        if (numDevices > 0)
-        {
-            cl_context_properties properties[] = {
-                    CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
-                    CL_CONTEXT_ADAPTER_D3D9_KHR, (cl_context_properties)(pDirect3DDevice9),
-                    CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
-                    NULL, NULL
-            };
-            context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
-            if (status != CL_SUCCESS)
-            {
-                clReleaseDevice(device);
-            }
-            else
-            {
-                found = i;
-                break;
-            }
-        }
-    }
-    if (found < 0)
-    {
-        // try with CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR
-        for (int i = 0; i < (int)numPlatforms; i++)
-        {
-            clGetDeviceIDsFromDX9MediaAdapterKHR_fn clGetDeviceIDsFromDX9MediaAdapterKHR = (clGetDeviceIDsFromDX9MediaAdapterKHR_fn)
-                    clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromDX9MediaAdapterKHR");
-            if (!clGetDeviceIDsFromDX9MediaAdapterKHR)
-                continue;
-
+        // try with CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR
+        do {
             device = NULL;
             numDevices = 0;
             cl_dx9_media_adapter_type_khr type = CL_ADAPTER_D3D9_KHR;
             status = clGetDeviceIDsFromDX9MediaAdapterKHR(platforms[i], 1, &type, &pDirect3DDevice9,
-                    CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 1, &device, &numDevices);
+                CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 1, &device, &numDevices);
             if (status != CL_SUCCESS)
-                continue;
+                break;
             if (numDevices > 0)
             {
                 cl_context_properties properties[] = {
@@ -999,28 +841,56 @@ Context& initializeContextFromDirect3DDevice9(IDirect3DDevice9* pDirect3DDevice9
                     break;
                 }
             }
+        } while (0);
+        // try with CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR
+        if (found < 0) do
+        {
+            device = NULL;
+            numDevices = 0;
+            cl_dx9_media_adapter_type_khr type = CL_ADAPTER_D3D9_KHR;
+            status = clGetDeviceIDsFromDX9MediaAdapterKHR(platforms[i], 1, &type, &pDirect3DDevice9,
+                CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR, 1, &device, &numDevices);
+            if (status != CL_SUCCESS)
+                break;
+            if (numDevices > 0)
+            {
+                cl_context_properties properties[] = {
+                        CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i],
+                        CL_CONTEXT_ADAPTER_D3D9_KHR, (cl_context_properties)(pDirect3DDevice9),
+                        CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
+                        NULL, NULL
+                };
+                context = clCreateContext(properties, 1, &device, NULL, NULL, &status);
+                if (status != CL_SUCCESS)
+                {
+                    clReleaseDevice(device);
+                }
+                else
+                {
+                    found = i;
+                    break;
+                }
+            }
+        } while (0);
+
+        if (found >= 0) {
+            OpenCLExecutionContext clExecCtx;
+            try
+            {
+                clExecCtx = OpenCLExecutionContext::create(platformName, platform, context, device);
+                clExecCtx.getContext().setUserContext(std::make_shared<OpenCL_D3D9>(platform, pDirect3DDevice9, nullptr));
+            }
+            catch (...)
+            {
+                clReleaseDevice(device);
+                clReleaseContext(context);
+                throw;
+            }
+            clExecCtx.bind();
+            return const_cast<Context&>(clExecCtx.getContext());
         }
-        if (found < 0)
-            CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't create context for DirectX interop");
     }
-
-    cl_platform_id platform = platforms[found];
-    std::string platformName = PlatformInfo(&platform).name();
-
-    OpenCLExecutionContext clExecCtx;
-    try
-    {
-        clExecCtx = OpenCLExecutionContext::create(platformName, platform, context, device);
-    }
-    catch (...)
-    {
-        clReleaseDevice(device);
-        clReleaseContext(context);
-        throw;
-    }
-    clExecCtx.bind();
-    getImpl().isDirect3DDevice9Ex = false;
-    return const_cast<Context&>(clExecCtx.getContext());
+    CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't create context for DirectX interop");
 #endif
 }
 
@@ -1104,24 +974,25 @@ static void __convertToD3D11Texture2DKHR(InputArray src, ID3D11Texture2D* pD3D11
 
     cl_mem clBuffer = (cl_mem)u.handle(ACCESS_READ);
 
-    using namespace cv::ocl;
-    Context& ctx = Context::getDefault();
+    ocl::Context& ctx = ocl::OpenCLExecutionContext::getCurrent().getContext();
     cl_context context = (cl_context)ctx.ptr();
-    OpenCLDirectXImpl& impl = getImpl();
+    OpenCL_D3D11* impl = ctx.getUserContext<OpenCL_D3D11>().get();
+    if (nullptr == impl)
+        CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: Context initialized without DirectX interoperability");
 
     cl_int status = 0;
     cl_mem clImage = 0;
 #ifdef HAVE_DIRECTX_NV12
     cl_mem clImageUV = 0;
 #endif
-    clImage = impl.clCreateFromD3D11Texture2DKHR(context, CL_MEM_WRITE_ONLY, pD3D11Texture2D, 0, &status);
+    clImage = impl->clCreateFromD3D11Texture2DKHR(context, CL_MEM_WRITE_ONLY, pD3D11Texture2D, 0, &status);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D11Texture2DKHR failed");
 
 #ifdef HAVE_DIRECTX_NV12
     if(DXGI_FORMAT_NV12 == desc.Format)
     {
-        clImageUV = impl.clCreateFromD3D11Texture2DKHR(context, CL_MEM_WRITE_ONLY, pD3D11Texture2D, 1, &status);
+        clImageUV = impl->clCreateFromD3D11Texture2DKHR(context, CL_MEM_WRITE_ONLY, pD3D11Texture2D, 1, &status);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D11Texture2DKHR failed");
     }
@@ -1129,21 +1000,21 @@ static void __convertToD3D11Texture2DKHR(InputArray src, ID3D11Texture2D* pD3D11
 
     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
 
-    status = impl.clEnqueueAcquireD3D11ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueAcquireD3D11ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D11ObjectsKHR failed");
 
 #ifdef HAVE_DIRECTX_NV12
     if(DXGI_FORMAT_NV12 == desc.Format)
     {
-        status = impl.clEnqueueAcquireD3D11ObjectsKHR(q, 1, &clImageUV, 0, NULL, NULL);
+        status = impl->clEnqueueAcquireD3D11ObjectsKHR(q, 1, &clImageUV, 0, NULL, NULL);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D11ObjectsKHR failed");
 
         if(!ocl::ocl_convert_bgr_to_nv12(clBuffer, (int)u.step[0], u.cols, u.rows, clImage, clImageUV))
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: ocl_convert_bgr_to_nv12 failed");
 
-        status = impl.clEnqueueReleaseD3D11ObjectsKHR(q, 1, &clImageUV, 0, NULL, NULL);
+        status = impl->clEnqueueReleaseD3D11ObjectsKHR(q, 1, &clImageUV, 0, NULL, NULL);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D11ObjectsKHR failed");
     }
@@ -1159,7 +1030,7 @@ static void __convertToD3D11Texture2DKHR(InputArray src, ID3D11Texture2D* pD3D11
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueCopyBufferToImage failed");
     }
 
-    status = impl.clEnqueueReleaseD3D11ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueReleaseD3D11ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D11ObjectsKHR failed");
 
@@ -1203,44 +1074,45 @@ static void __convertToD3D11Texture2DNV(InputArray src, ID3D11Texture2D* pD3D11T
 
     cl_mem clBuffer = (cl_mem)u.handle(ACCESS_READ);
 
-    using namespace cv::ocl;
-    Context& ctx = Context::getDefault();
+    ocl::Context& ctx = ocl::OpenCLExecutionContext::getCurrent().getContext();
     cl_context context = (cl_context)ctx.ptr();
-    OpenCLDirectXImpl& impl = getImpl();
+    OpenCL_D3D11_NV* impl = ctx.getUserContext<OpenCL_D3D11_NV>().get();
+    if (nullptr == impl)
+        CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: Context initialized without DirectX interoperability");
 
     cl_int status = 0;
     cl_mem clImage = 0;
 #ifdef HAVE_DIRECTX_NV12
     cl_mem clImageUV = 0;
 #endif
-    clImage = impl.clCreateFromD3D11Texture2DNV(context, CL_MEM_WRITE_ONLY, pD3D11Texture2D, 0, &status);
+    clImage = impl->clCreateFromD3D11Texture2DNV(context, CL_MEM_WRITE_ONLY, pD3D11Texture2D, 0, &status);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D11Texture2DNV failed");
 
 #ifdef HAVE_DIRECTX_NV12
     if (DXGI_FORMAT_NV12 == desc.Format)
     {
-        clImageUV = impl.clCreateFromD3D11Texture2DNV(context, CL_MEM_WRITE_ONLY, pD3D11Texture2D, 1, &status);
+        clImageUV = impl->clCreateFromD3D11Texture2DNV(context, CL_MEM_WRITE_ONLY, pD3D11Texture2D, 1, &status);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D11Texture2DNV failed");
     }
 #endif
     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
-    status = impl.clEnqueueAcquireD3D11ObjectsNV(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueAcquireD3D11ObjectsNV(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D11ObjectsNV failed");
 
 #ifdef HAVE_DIRECTX_NV12
     if(DXGI_FORMAT_NV12 == desc.Format)
     {
-        status = impl.clEnqueueAcquireD3D11ObjectsNV(q, 1, &clImageUV, 0, NULL, NULL);
+        status = impl->clEnqueueAcquireD3D11ObjectsNV(q, 1, &clImageUV, 0, NULL, NULL);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D11ObjectsNV failed");
 
         if(!ocl::ocl_convert_bgr_to_nv12(clBuffer, (int)u.step[0], u.cols, u.rows, clImage, clImageUV))
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: ocl_convert_bgr_to_nv12 failed");
 
-        status = impl.clEnqueueReleaseD3D11ObjectsNV(q, 1, &clImageUV, 0, NULL, NULL);
+        status = impl->clEnqueueReleaseD3D11ObjectsNV(q, 1, &clImageUV, 0, NULL, NULL);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D11ObjectsNV failed");
     }
@@ -1256,7 +1128,7 @@ static void __convertToD3D11Texture2DNV(InputArray src, ID3D11Texture2D* pD3D11T
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueCopyBufferToImage failed");
     }
 
-    status = impl.clEnqueueReleaseD3D11ObjectsNV(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueReleaseD3D11ObjectsNV(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D11ObjectsNV failed");
 
@@ -1298,15 +1170,16 @@ static void __convertFromD3D11Texture2DKHR(ID3D11Texture2D* pD3D11Texture2D, Out
 
     cl_mem clBuffer = (cl_mem)u.handle(ACCESS_READ);
 
-    using namespace cv::ocl;
-    Context& ctx = Context::getDefault();
+    ocl::Context& ctx = ocl::OpenCLExecutionContext::getCurrent().getContext();
     cl_context context = (cl_context)ctx.ptr();
-    OpenCLDirectXImpl& impl = getImpl();
+    OpenCL_D3D11* impl = ctx.getUserContext<OpenCL_D3D11>().get();
+    if (nullptr == impl)
+        CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: Context initialized without DirectX interoperability");
 
     cl_int status = 0;
     cl_mem clImage = 0;
 
-    clImage = impl.clCreateFromD3D11Texture2DKHR(context, CL_MEM_READ_ONLY, pD3D11Texture2D, 0, &status);
+    clImage = impl->clCreateFromD3D11Texture2DKHR(context, CL_MEM_READ_ONLY, pD3D11Texture2D, 0, &status);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D11Texture2DKHR failed");
 
@@ -1314,7 +1187,7 @@ static void __convertFromD3D11Texture2DKHR(ID3D11Texture2D* pD3D11Texture2D, Out
     cl_mem clImageUV = 0;
     if(DXGI_FORMAT_NV12 == desc.Format)
     {
-        clImageUV = impl.clCreateFromD3D11Texture2DKHR(context, CL_MEM_READ_ONLY, pD3D11Texture2D, 1, &status);
+        clImageUV = impl->clCreateFromD3D11Texture2DKHR(context, CL_MEM_READ_ONLY, pD3D11Texture2D, 1, &status);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D11Texture2DKHR failed");
     }
@@ -1322,21 +1195,21 @@ static void __convertFromD3D11Texture2DKHR(ID3D11Texture2D* pD3D11Texture2D, Out
 
     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
 
-    status = impl.clEnqueueAcquireD3D11ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueAcquireD3D11ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D11ObjectsKHR failed");
 
 #ifdef HAVE_DIRECTX_NV12
     if(DXGI_FORMAT_NV12 == desc.Format)
     {
-        status = impl.clEnqueueAcquireD3D11ObjectsKHR(q, 1, &clImageUV, 0, NULL, NULL);
+        status = impl->clEnqueueAcquireD3D11ObjectsKHR(q, 1, &clImageUV, 0, NULL, NULL);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D11ObjectsKHR failed");
 
         if(!ocl::ocl_convert_nv12_to_bgr(clImage, clImageUV, clBuffer, (int)u.step[0], u.cols, u.rows))
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: ocl_convert_nv12_to_bgr failed");
 
-        status = impl.clEnqueueReleaseD3D11ObjectsKHR(q, 1, &clImageUV, 0, NULL, NULL);
+        status = impl->clEnqueueReleaseD3D11ObjectsKHR(q, 1, &clImageUV, 0, NULL, NULL);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D11ObjectsKHR failed");
     }
@@ -1352,7 +1225,7 @@ static void __convertFromD3D11Texture2DKHR(ID3D11Texture2D* pD3D11Texture2D, Out
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueCopyImageToBuffer failed");
     }
 
-    status = impl.clEnqueueReleaseD3D11ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueReleaseD3D11ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D11ObjectsKHR failed");
 
@@ -1394,15 +1267,16 @@ static void __convertFromD3D11Texture2DNV(ID3D11Texture2D* pD3D11Texture2D, Outp
 
     cl_mem clBuffer = (cl_mem)u.handle(ACCESS_READ);
 
-    using namespace cv::ocl;
-    Context& ctx = Context::getDefault();
+    ocl::Context& ctx = ocl::OpenCLExecutionContext::getCurrent().getContext();
     cl_context context = (cl_context)ctx.ptr();
-    OpenCLDirectXImpl& impl = getImpl();
+    OpenCL_D3D11_NV* impl = ctx.getUserContext<OpenCL_D3D11_NV>().get();
+    if (nullptr == impl)
+        CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: Context initilized without DirectX interoperability");
 
     cl_int status = 0;
     cl_mem clImage = 0;
 
-    clImage = impl.clCreateFromD3D11Texture2DNV(context, CL_MEM_READ_ONLY, pD3D11Texture2D, 0, &status);
+    clImage = impl->clCreateFromD3D11Texture2DNV(context, CL_MEM_READ_ONLY, pD3D11Texture2D, 0, &status);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D11Texture2DNV failed");
 
@@ -1410,28 +1284,28 @@ static void __convertFromD3D11Texture2DNV(ID3D11Texture2D* pD3D11Texture2D, Outp
     cl_mem clImageUV = 0;
     if(DXGI_FORMAT_NV12 == desc.Format)
     {
-        clImageUV = impl.clCreateFromD3D11Texture2DNV(context, CL_MEM_READ_ONLY, pD3D11Texture2D, 1, &status);
+        clImageUV = impl->clCreateFromD3D11Texture2DNV(context, CL_MEM_READ_ONLY, pD3D11Texture2D, 1, &status);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D11Texture2DNV failed");
     }
 #endif
 
     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
-    status = impl.clEnqueueAcquireD3D11ObjectsNV(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueAcquireD3D11ObjectsNV(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D11ObjectsNV failed");
 
 #ifdef HAVE_DIRECTX_NV12
     if (DXGI_FORMAT::DXGI_FORMAT_NV12 == desc.Format)
     {
-        status = impl.clEnqueueAcquireD3D11ObjectsNV(q, 1, &clImageUV, 0, NULL, NULL);
+        status = impl->clEnqueueAcquireD3D11ObjectsNV(q, 1, &clImageUV, 0, NULL, NULL);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D11ObjectsNV failed");
 
         if (!ocl::ocl_convert_nv12_to_bgr(clImage, clImageUV, clBuffer, (int)u.step[0], u.cols, u.rows))
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: ocl_convert_nv12_to_bgr failed");
 
-        status = impl.clEnqueueReleaseD3D11ObjectsNV(q, 1, &clImageUV, 0, NULL, NULL);
+        status = impl->clEnqueueReleaseD3D11ObjectsNV(q, 1, &clImageUV, 0, NULL, NULL);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D11ObjectsNV failed");
     }
@@ -1447,7 +1321,7 @@ static void __convertFromD3D11Texture2DNV(ID3D11Texture2D* pD3D11Texture2D, Outp
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueCopyImageToBuffer failed");
     }
 
-    status = impl.clEnqueueReleaseD3D11ObjectsNV(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueReleaseD3D11ObjectsNV(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D11ObjectsNV failed");
 
@@ -1479,16 +1353,21 @@ void convertToD3D11Texture2D(InputArray src, ID3D11Texture2D* pD3D11Texture2D)
     NO_OPENCL_SUPPORT_ERROR;
 #else
 
-    bool useCLNVEXT = getImpl().initializeD3D11();
-    if(!useCLNVEXT){
-        __convertToD3D11Texture2DKHR(src,pD3D11Texture2D);
-    }
+    ocl::Context& ctx = ocl::OpenCLExecutionContext::getCurrent().getContext();
 #ifdef HAVE_OPENCL_D3D11_NV
-    else
-    {
+    OpenCL_D3D11_NV* impl_nv = ctx.getUserContext<OpenCL_D3D11_NV>().get();
+    if (impl_nv) {
         __convertToD3D11Texture2DNV(src,pD3D11Texture2D);
+        return;
     }
 #endif
+    OpenCL_D3D11* impl = ctx.getUserContext<OpenCL_D3D11>().get();
+    if (impl) {
+        __convertToD3D11Texture2DKHR(src, pD3D11Texture2D);
+    }
+    else {
+        CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: Context initilized without DirectX interoperability");
+    }
 #endif
 }
 
@@ -1501,16 +1380,23 @@ void convertFromD3D11Texture2D(ID3D11Texture2D* pD3D11Texture2D, OutputArray dst
     NO_OPENCL_SUPPORT_ERROR;
 #else
 
-    bool useCLNVEXT = getImpl().initializeD3D11();
-    if(!useCLNVEXT){
-        __convertFromD3D11Texture2DKHR(pD3D11Texture2D,dst);
-    }
+    ocl::Context& ctx = ocl::OpenCLExecutionContext::getCurrent().getContext();
 #ifdef HAVE_OPENCL_D3D11_NV
-    else
-    {
+    // Use the NV interop path when the current context carries an OpenCL_D3D11_NV user context.
+    OpenCL_D3D11_NV* impl_nv = ctx.getUserContext<OpenCL_D3D11_NV>().get();
+    if (impl_nv) {
         __convertFromD3D11Texture2DNV(pD3D11Texture2D,dst);
+        return;  // done: must not fall through to the KHR path (or its error) below
     }
 #endif
+    // Otherwise use the KHR interop path; fail if the context has no OpenCL_D3D11 user context.
+    OpenCL_D3D11* impl = ctx.getUserContext<OpenCL_D3D11>().get();
+    if (impl) {
+        __convertFromD3D11Texture2DKHR(pD3D11Texture2D, dst);
+    }
+    else {
+        CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: Context initilized without DirectX interoperability");
+    }
 #endif
 }
 
@@ -1520,8 +1403,11 @@ void convertToD3D10Texture2D(InputArray src, ID3D10Texture2D* pD3D10Texture2D)
 #if !defined(HAVE_DIRECTX)
     NO_DIRECTX_SUPPORT_ERROR;
 #elif defined(HAVE_OPENCL)
-    OpenCLDirectXImpl& impl = getImpl();
-    impl.initializeD3D10();
+
+    ocl::Context& ctx = ocl::OpenCLExecutionContext::getCurrent().getContext();
+    OpenCL_D3D10* impl = ctx.getUserContext<OpenCL_D3D10>().get();
+    if (nullptr == impl)
+        CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: Context initilized without DirectX interoperability");
 
     D3D10_TEXTURE2D_DESC desc = { 0 };
     pD3D10Texture2D->GetDesc(&desc);
@@ -1533,8 +1419,6 @@ void convertToD3D10Texture2D(InputArray src, ID3D10Texture2D* pD3D10Texture2D)
     Size srcSize = src.size();
     CV_Assert(srcSize.width == (int)desc.Width && srcSize.height == (int)desc.Height);
 
-    using namespace cv::ocl;
-    Context& ctx = Context::getDefault();
     cl_context context = (cl_context)ctx.ptr();
 
     UMat u = src.getUMat();
@@ -1544,14 +1428,14 @@ void convertToD3D10Texture2D(InputArray src, ID3D10Texture2D* pD3D10Texture2D)
     CV_Assert(u.isContinuous());
 
     cl_int status = 0;
-    cl_mem clImage = impl.clCreateFromD3D10Texture2DKHR(context, CL_MEM_WRITE_ONLY, pD3D10Texture2D, 0, &status);
+    cl_mem clImage = impl->clCreateFromD3D10Texture2DKHR(context, CL_MEM_WRITE_ONLY, pD3D10Texture2D, 0, &status);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D10Texture2DKHR failed");
 
     cl_mem clBuffer = (cl_mem)u.handle(ACCESS_READ);
 
     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
-    status = impl.clEnqueueAcquireD3D10ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueAcquireD3D10ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D10ObjectsKHR failed");
     size_t offset = 0; // TODO
@@ -1560,7 +1444,7 @@ void convertToD3D10Texture2D(InputArray src, ID3D10Texture2D* pD3D10Texture2D)
     status = clEnqueueCopyBufferToImage(q, clBuffer, clImage, offset, dst_origin, region, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueCopyBufferToImage failed");
-    status = impl.clEnqueueReleaseD3D10ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueReleaseD3D10ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D10ObjectsKHR failed");
 
@@ -1576,14 +1460,17 @@ void convertToD3D10Texture2D(InputArray src, ID3D10Texture2D* pD3D10Texture2D)
     NO_OPENCL_SUPPORT_ERROR;
 #endif
 }
+
 void convertFromD3D10Texture2D(ID3D10Texture2D* pD3D10Texture2D, OutputArray dst)
 {
     CV_UNUSED(pD3D10Texture2D); CV_UNUSED(dst);
 #if !defined(HAVE_DIRECTX)
     NO_DIRECTX_SUPPORT_ERROR;
 #elif defined(HAVE_OPENCL)
-    OpenCLDirectXImpl& impl = getImpl();
-    impl.initializeD3D10();
+    ocl::Context& ctx = ocl::OpenCLExecutionContext::getCurrent().getContext();
+    OpenCL_D3D10* impl = ctx.getUserContext<OpenCL_D3D10>().get();
+    if (nullptr == impl)
+        CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: Context initilized without DirectX interoperability");
 
     D3D10_TEXTURE2D_DESC desc = { 0 };
     pD3D10Texture2D->GetDesc(&desc);
@@ -1591,8 +1478,6 @@ void convertFromD3D10Texture2D(ID3D10Texture2D* pD3D10Texture2D, OutputArray dst
     int textureType = getTypeFromDXGI_FORMAT(desc.Format);
     CV_Assert(textureType >= 0);
 
-    using namespace cv::ocl;
-    Context& ctx = Context::getDefault();
     cl_context context = (cl_context)ctx.ptr();
 
     // TODO Need to specify ACCESS_WRITE here somehow to prevent useless data copying!
@@ -1604,14 +1489,14 @@ void convertFromD3D10Texture2D(ID3D10Texture2D* pD3D10Texture2D, OutputArray dst
     CV_Assert(u.isContinuous());
 
     cl_int status = 0;
-    cl_mem clImage = impl.clCreateFromD3D10Texture2DKHR(context, CL_MEM_READ_ONLY, pD3D10Texture2D, 0, &status);
+    cl_mem clImage = impl->clCreateFromD3D10Texture2DKHR(context, CL_MEM_READ_ONLY, pD3D10Texture2D, 0, &status);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromD3D10Texture2DKHR failed");
 
     cl_mem clBuffer = (cl_mem)u.handle(ACCESS_READ);
 
     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
-    status = impl.clEnqueueAcquireD3D10ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueAcquireD3D10ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireD3D10ObjectsKHR failed");
     size_t offset = 0; // TODO
@@ -1620,7 +1505,7 @@ void convertFromD3D10Texture2D(ID3D10Texture2D* pD3D10Texture2D, OutputArray dst
     status = clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueCopyImageToBuffer failed");
-    status = impl.clEnqueueReleaseD3D10ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueReleaseD3D10ObjectsKHR(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseD3D10ObjectsKHR failed");
 
@@ -1637,15 +1522,17 @@ void convertFromD3D10Texture2D(ID3D10Texture2D* pD3D10Texture2D, OutputArray dst
 #endif
 }
 
-
 void convertToDirect3DSurface9(InputArray src, IDirect3DSurface9* pDirect3DSurface9, void* surfaceSharedHandle)
 {
     CV_UNUSED(src); CV_UNUSED(pDirect3DSurface9); CV_UNUSED(surfaceSharedHandle);
 #if !defined(HAVE_DIRECTX)
     NO_DIRECTX_SUPPORT_ERROR;
 #elif defined(HAVE_OPENCL)
-    OpenCLDirectXImpl& impl = getImpl();
-    impl.initializeD3D9();
+
+    ocl::Context& ctx = ocl::OpenCLExecutionContext::getCurrent().getContext();
+    OpenCL_D3D9* impl = ctx.getUserContext<OpenCL_D3D9>().get();
+    if (nullptr == impl)
+        CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: Context initilized without DirectX interoperability");
 
     D3DSURFACE_DESC desc;
     if (FAILED(pDirect3DSurface9->GetDesc(&desc)))
@@ -1660,8 +1547,6 @@ void convertToDirect3DSurface9(InputArray src, IDirect3DSurface9* pDirect3DSurfa
     Size srcSize = src.size();
     CV_Assert(srcSize.width == (int)desc.Width && srcSize.height == (int)desc.Height);
 
-    using namespace cv::ocl;
-    Context& ctx = Context::getDefault();
     cl_context context = (cl_context)ctx.ptr();
 
     UMat u = src.getUMat();
@@ -1672,8 +1557,8 @@ void convertToDirect3DSurface9(InputArray src, IDirect3DSurface9* pDirect3DSurfa
 
     cl_int status = 0;
     cl_dx9_surface_info_khr surfaceInfo = {pDirect3DSurface9, (HANDLE)surfaceSharedHandle};
-    cl_mem clImage = impl.clCreateFromDX9MediaSurfaceKHR(context, CL_MEM_WRITE_ONLY,
-        impl.isDirect3DDevice9Ex ? CL_ADAPTER_D3D9EX_KHR : CL_ADAPTER_D3D9_KHR,
+    cl_mem clImage = impl->clCreateFromDX9MediaSurfaceKHR(context, CL_MEM_WRITE_ONLY,
+        impl->deviceEx ? CL_ADAPTER_D3D9EX_KHR : CL_ADAPTER_D3D9_KHR,
             &surfaceInfo, 0, &status);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromDX9MediaSurfaceKHR failed");
@@ -1681,7 +1566,7 @@ void convertToDirect3DSurface9(InputArray src, IDirect3DSurface9* pDirect3DSurfa
     cl_mem clBuffer = (cl_mem)u.handle(ACCESS_READ);
 
     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
-    status = impl.clEnqueueAcquireDX9MediaSurfacesKHR(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueAcquireDX9MediaSurfacesKHR(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireDX9MediaSurfacesKHR failed");
     size_t offset = 0; // TODO
@@ -1690,7 +1575,7 @@ void convertToDirect3DSurface9(InputArray src, IDirect3DSurface9* pDirect3DSurfa
     status = clEnqueueCopyBufferToImage(q, clBuffer, clImage, offset, dst_origin, region, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueCopyBufferToImage failed");
-    status = impl.clEnqueueReleaseDX9MediaSurfacesKHR(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueReleaseDX9MediaSurfacesKHR(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseDX9MediaSurfacesKHR failed");
 
@@ -1713,8 +1598,11 @@ void convertFromDirect3DSurface9(IDirect3DSurface9* pDirect3DSurface9, OutputArr
 #if !defined(HAVE_DIRECTX)
     NO_DIRECTX_SUPPORT_ERROR;
 #elif defined(HAVE_OPENCL)
-    OpenCLDirectXImpl& impl = getImpl();
-    impl.initializeD3D9();
+
+    ocl::Context& ctx = ocl::OpenCLExecutionContext::getCurrent().getContext();
+    OpenCL_D3D9* impl = ctx.getUserContext<OpenCL_D3D9>().get();
+    if (nullptr == impl)
+        CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: Context initilized without DirectX interoperability");
 
     D3DSURFACE_DESC desc;
     if (FAILED(pDirect3DSurface9->GetDesc(&desc)))
@@ -1725,8 +1613,6 @@ void convertFromDirect3DSurface9(IDirect3DSurface9* pDirect3DSurface9, OutputArr
     int surfaceType = getTypeFromD3DFORMAT(desc.Format);
     CV_Assert(surfaceType >= 0);
 
-    using namespace cv::ocl;
-    Context& ctx = Context::getDefault();
     cl_context context = (cl_context)ctx.ptr();
 
     // TODO Need to specify ACCESS_WRITE here somehow to prevent useless data copying!
@@ -1739,8 +1625,8 @@ void convertFromDirect3DSurface9(IDirect3DSurface9* pDirect3DSurface9, OutputArr
 
     cl_int status = 0;
     cl_dx9_surface_info_khr surfaceInfo = {pDirect3DSurface9, (HANDLE)surfaceSharedHandle};
-    cl_mem clImage = impl.clCreateFromDX9MediaSurfaceKHR(context, CL_MEM_READ_ONLY,
-            impl.isDirect3DDevice9Ex ? CL_ADAPTER_D3D9EX_KHR : CL_ADAPTER_D3D9_KHR,
+    cl_mem clImage = impl->clCreateFromDX9MediaSurfaceKHR(context, CL_MEM_READ_ONLY,
+            impl->deviceEx ? CL_ADAPTER_D3D9EX_KHR : CL_ADAPTER_D3D9_KHR,
             &surfaceInfo, 0, &status);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromDX9MediaSurfaceKHR failed");
@@ -1748,7 +1634,7 @@ void convertFromDirect3DSurface9(IDirect3DSurface9* pDirect3DSurface9, OutputArr
     cl_mem clBuffer = (cl_mem)u.handle(ACCESS_WRITE);
 
     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
-    status = impl.clEnqueueAcquireDX9MediaSurfacesKHR(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueAcquireDX9MediaSurfacesKHR(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireDX9MediaSurfacesKHR failed");
     size_t offset = 0; // TODO
@@ -1757,7 +1643,7 @@ void convertFromDirect3DSurface9(IDirect3DSurface9* pDirect3DSurface9, OutputArr
     status = clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueCopyImageToBuffer failed");
-    status = impl.clEnqueueReleaseDX9MediaSurfacesKHR(q, 1, &clImage, 0, NULL, NULL);
+    status = impl->clEnqueueReleaseDX9MediaSurfacesKHR(q, 1, &clImage, 0, NULL, NULL);
     if (status != CL_SUCCESS)
         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseDX9MediaSurfacesKHR failed");
 
diff --git a/modules/core/src/directx.hpp b/modules/core/src/directx.hpp
deleted file mode 100644
index 9f23352d4d..0000000000
--- a/modules/core/src/directx.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#ifndef OPENCV_CORE_SRC_DIRECTX_HPP
-#define OPENCV_CORE_SRC_DIRECTX_HPP
-
-#ifndef HAVE_DIRECTX
-#error Invalid build configuration
-#endif
-
-namespace cv {
-namespace directx {
-namespace internal {
-
-struct OpenCLDirectXImpl;
-OpenCLDirectXImpl* createDirectXImpl();
-void deleteDirectXImpl(OpenCLDirectXImpl**);
-OpenCLDirectXImpl* getDirectXImpl(ocl::Context& ctx);
-
-}}} // namespace internal
-
-#endif  // OPENCV_CORE_SRC_DIRECTX_HPP
diff --git a/modules/core/src/glob.cpp b/modules/core/src/glob.cpp
index fa8592caa5..b7cf1bf236 100644
--- a/modules/core/src/glob.cpp
+++ b/modules/core/src/glob.cpp
@@ -43,7 +43,9 @@
 #include "precomp.hpp"
 
 #include "opencv2/core/utils/filesystem.hpp"
+#include "opencv2/core/utils/filesystem.private.hpp"
 
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
 #if defined _WIN32 || defined WINCE
 # include <windows.h>
 const char dir_separators[] = "/\\";
@@ -131,12 +133,15 @@ namespace
 
 
 }
-#else
+#else // defined _WIN32 || defined WINCE
 # include <dirent.h>
 # include <sys/stat.h>
 const char dir_separators[] = "/";
-#endif
+#endif // defined _WIN32 || defined WINCE
+#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
 
+
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
 static bool isDir(const cv::String& path, DIR* dir)
 {
 #if defined _WIN32 || defined _WIN32_WCE
@@ -168,13 +173,20 @@ static bool isDir(const cv::String& path, DIR* dir)
     return is_dir != 0;
 #endif
 }
+#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
 
 bool cv::utils::fs::isDirectory(const cv::String& path)
 {
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
     CV_INSTRUMENT_REGION();
     return isDir(path, NULL);
+#else
+    CV_UNUSED(path);
+    CV_Error(Error::StsNotImplemented, "File system support is disabled in this OpenCV build!");
+#endif
 }
 
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
 static bool wildcmp(const char *string, const char *wild)
 {
     // Based on wildcmp written by Jack Handy - <A href="mailto:jakkhandy@hotmail.com">jakkhandy@hotmail.com</A>
@@ -267,9 +279,11 @@ static void glob_rec(const cv::String& directory, const cv::String& wildchart, s
         CV_Error_(CV_StsObjectNotFound, ("could not open directory: %s", directory.c_str()));
     }
 }
+#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
 
 void cv::glob(String pattern, std::vector<String>& result, bool recursive)
 {
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
     CV_INSTRUMENT_REGION();
 
     result.clear();
@@ -303,20 +317,44 @@ void cv::glob(String pattern, std::vector<String>& result, bool recursive)
 
     glob_rec(path, wildchart, result, recursive, false, path);
     std::sort(result.begin(), result.end());
+#else // OPENCV_HAVE_FILESYSTEM_SUPPORT
+    CV_UNUSED(pattern);
+    CV_UNUSED(result);
+    CV_UNUSED(recursive);
+    CV_Error(Error::StsNotImplemented, "File system support is disabled in this OpenCV build!");
+#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
 }
 
 void cv::utils::fs::glob(const cv::String& directory, const cv::String& pattern,
         std::vector<cv::String>& result,
         bool recursive, bool includeDirectories)
 {
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
     glob_rec(directory, pattern, result, recursive, includeDirectories, directory);
     std::sort(result.begin(), result.end());
+#else // OPENCV_HAVE_FILESYSTEM_SUPPORT
+    CV_UNUSED(directory);
+    CV_UNUSED(pattern);
+    CV_UNUSED(result);
+    CV_UNUSED(recursive);
+    CV_UNUSED(includeDirectories);
+    CV_Error(Error::StsNotImplemented, "File system support is disabled in this OpenCV build!");
+#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
 }
 
 void cv::utils::fs::glob_relative(const cv::String& directory, const cv::String& pattern,
         std::vector<cv::String>& result,
         bool recursive, bool includeDirectories)
 {
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
     glob_rec(directory, pattern, result, recursive, includeDirectories, cv::String());
     std::sort(result.begin(), result.end());
+#else // OPENCV_HAVE_FILESYSTEM_SUPPORT
+    CV_UNUSED(directory);
+    CV_UNUSED(pattern);
+    CV_UNUSED(result);
+    CV_UNUSED(recursive);
+    CV_UNUSED(includeDirectories);
+    CV_Error(Error::StsNotImplemented, "File system support is disabled in this OpenCV build!");
+#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
 }
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index ac52eeaf99..8749b29ec8 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -113,10 +113,6 @@
 
 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
 
-#ifdef HAVE_DIRECTX
-#include "directx.hpp"
-#endif
-
 #ifdef HAVE_OPENCL_SVM
 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
@@ -2367,9 +2363,6 @@ protected:
         , contextId(CV_XADD(&g_contextId, 1))
         , configuration(configuration_)
         , handle(0)
-#ifdef HAVE_DIRECTX
-        , p_directx_impl(0)
-#endif
 #ifdef HAVE_OPENCL_SVM
         , svmInitialized(false)
 #endif
@@ -2395,11 +2388,10 @@ protected:
                 handle = NULL;
             }
             devices.clear();
-#ifdef HAVE_DIRECTX
-            directx::internal::deleteDirectXImpl(&p_directx_impl);
-#endif
         }
 
+        userContextStorage.clear();
+
         {
             cv::AutoLock lock(cv::getInitializationMutex());
             auto& container = getGlobalContainer();
@@ -2705,18 +2697,20 @@ public:
         return *bufferPoolHostPtr_.get();
     }
 
-#ifdef HAVE_DIRECTX
-    directx::internal::OpenCLDirectXImpl* p_directx_impl;
-
-    directx::internal::OpenCLDirectXImpl* getDirectXImpl()
-    {
-        if (!p_directx_impl)
-        {
-            p_directx_impl = directx::internal::createDirectXImpl();
-        }
-        return p_directx_impl;
+    std::map<std::type_index, std::shared_ptr<UserContext>> userContextStorage;
+    cv::Mutex userContextMutex;
+    void setUserContext(std::type_index typeId, const std::shared_ptr<UserContext>& userContext) {
+        cv::AutoLock lock(userContextMutex);
+        userContextStorage[typeId] = userContext;
+    }
+    std::shared_ptr<UserContext> getUserContext(std::type_index typeId) {
+        cv::AutoLock lock(userContextMutex);
+        auto it = userContextStorage.find(typeId);
+        if (it != userContextStorage.end())
+            return it->second;
+        else
+            return nullptr;
     }
-#endif
 
 #ifdef HAVE_OPENCL_SVM
     bool svmInitialized;
@@ -3036,6 +3030,32 @@ Context Context::create(const std::string& configuration)
     return ctx;
 }
 
+// Looks up propertyId in the context's CL_CONTEXT_PROPERTIES list and returns the paired
+// value as void*. Returns nullptr when the Context is empty (p == NULL), when the list is
+// empty, or when propertyId is not present.
+void* Context::getOpenCLContextProperty(int propertyId) const
+{
+    if (p == NULL)
+        return nullptr;
+    ::size_t size = 0;
+    CV_OCL_CHECK(clGetContextInfo(p->handle, CL_CONTEXT_PROPERTIES, 0, NULL, &size));
+    if (size == 0)
+        return nullptr;  // no property list reported: nothing to fetch or scan
+    std::vector<cl_context_properties> prop(size / sizeof(cl_context_properties), (cl_context_properties)0);
+    CV_OCL_CHECK(clGetContextInfo(p->handle, CL_CONTEXT_PROPERTIES, size, prop.data(), NULL));
+    // Entries are (name, value) pairs; require i + 1 to be in range so an unpaired trailing
+    // element (the list terminator) is never treated as a name with a value stored behind it.
+    for (size_t i = 0; i + 1 < prop.size(); i += 2)
+    {
+        if (prop[i] == (cl_context_properties)propertyId)
+        {
+            CV_LOG_DEBUG(NULL, "OpenCL: found context property=" << propertyId << " => " << (void*)prop[i + 1]);
+            return (void*)prop[i + 1];
+        }
+    }
+    return nullptr;
+}
+
 #ifdef HAVE_OPENCL_SVM
 bool Context::useSVM() const
 {
@@ -3097,6 +3110,21 @@ CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
 } // namespace cv::ocl::svm
 #endif // HAVE_OPENCL_SVM
 
+Context::UserContext::~UserContext()
+{
+}
+
+void Context::setUserContext(std::type_index typeId, const std::shared_ptr<Context::UserContext>& userContext)
+{
+    CV_Assert(p);
+    p->setUserContext(typeId, userContext);
+}
+
+std::shared_ptr<Context::UserContext> Context::getUserContext(std::type_index typeId)
+{
+    CV_Assert(p);
+    return p->getUserContext(typeId);
+}
 
 static void get_platform_name(cl_platform_id id, String& name)
 {
@@ -3454,7 +3482,6 @@ struct Kernel::Impl
     void registerImageArgument(int arg, const Image2D& image)
     {
         CV_CheckGE(arg, 0, "");
-        CV_CheckLT(arg, (int)MAX_ARRS, "");
         if (arg < (int)shadow_images.size() && shadow_images[arg].ptr() != image.ptr())  // TODO future: replace ptr => impl (more strong check)
         {
             CV_Check(arg, !isInProgress, "ocl::Kernel: clearing of pending Image2D arguments is not allowed");
@@ -7505,15 +7532,4 @@ uint64 Timer::durationNS() const
 
 }} // namespace
 
-#ifdef HAVE_DIRECTX
-namespace cv { namespace directx { namespace internal {
-OpenCLDirectXImpl* getDirectXImpl(ocl::Context& ctx)
-{
-    ocl::Context::Impl* i = ctx.getImpl();
-    CV_Assert(i);
-    return i->getDirectXImpl();
-}
-}}} // namespace cv::directx::internal
-#endif
-
 #endif // HAVE_OPENCL
diff --git a/modules/core/src/ocl_disabled.impl.hpp b/modules/core/src/ocl_disabled.impl.hpp
index b5f9c4f69b..a217979a1e 100644
--- a/modules/core/src/ocl_disabled.impl.hpp
+++ b/modules/core/src/ocl_disabled.impl.hpp
@@ -172,9 +172,16 @@ Context& Context::getDefault(bool initialize)
 }
 void* Context::ptr() const { return NULL; }
 
+void* Context::getOpenCLContextProperty(int /*propertyId*/) const { OCL_NOT_AVAILABLE(); }
+
 bool Context::useSVM() const { return false; }
 void Context::setUseSVM(bool enabled) { }
 
+Context::UserContext::~UserContext() { }
+
+void Context::setUserContext(std::type_index /*typeId*/, const std::shared_ptr<Context::UserContext>& /*userContext*/) { OCL_NOT_AVAILABLE(); }
+std::shared_ptr<Context::UserContext> Context::getUserContext(std::type_index /*typeId*/) { OCL_NOT_AVAILABLE(); }
+
 /* static */ Context Context::fromHandle(void* context) { OCL_NOT_AVAILABLE(); }
 /* static */ Context Context::fromDevice(const ocl::Device& device) { OCL_NOT_AVAILABLE(); }
 /* static */ Context Context::create(const std::string& configuration) { OCL_NOT_AVAILABLE(); }
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index 5a0a7637c2..3057729928 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -375,6 +375,8 @@ cv::Mutex& getInitializationMutex();
 #define CV_SINGLETON_LAZY_INIT(TYPE, INITIALIZER) CV_SINGLETON_LAZY_INIT_(TYPE, INITIALIZER, instance)
 #define CV_SINGLETON_LAZY_INIT_REF(TYPE, INITIALIZER) CV_SINGLETON_LAZY_INIT_(TYPE, INITIALIZER, *instance)
 
+CV_EXPORTS void releaseTlsStorageThread();
+
 int cv_snprintf(char* buf, int len, const char* fmt, ...);
 int cv_vsnprintf(char* buf, int len, const char* fmt, va_list args);
 }
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 97a2a289c7..c001de3aac 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -53,6 +53,8 @@
 #include <opencv2/core/utils/tls.hpp>
 #include <opencv2/core/utils/instrumentation.hpp>
 
+#include <opencv2/core/utils/filesystem.private.hpp>
+
 namespace cv {
 
 static void _initSystem()
@@ -393,6 +395,7 @@ struct HWFeatures
         g_hwFeatureNames[CPU_VSX3] = "VSX3";
 
         g_hwFeatureNames[CPU_MSA] = "CPU_MSA";
+        g_hwFeatureNames[CPU_RISCVV] = "RISCVV";
 
         g_hwFeatureNames[CPU_AVX512_COMMON] = "AVX512-COMMON";
         g_hwFeatureNames[CPU_AVX512_SKX] = "AVX512-SKX";
@@ -588,6 +591,9 @@ struct HWFeatures
     #if defined _ARM_ && (defined(_WIN32_WCE) && _WIN32_WCE >= 0x800)
         have[CV_CPU_NEON] = true;
     #endif
+    #ifdef __riscv_vector
+        have[CV_CPU_RISCVV] = true;
+    #endif
     #ifdef __mips_msa
         have[CV_CPU_MSA] = true;
     #endif
@@ -947,6 +953,7 @@ String format( const char* fmt, ... )
 
 String tempfile( const char* suffix )
 {
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
     String fname;
 #ifndef NO_GETENV
     const char *temp_dir = getenv("OPENCV_TEMP_PATH");
@@ -1033,6 +1040,10 @@ String tempfile( const char* suffix )
             return fname + suffix;
     }
     return fname;
+#else // OPENCV_HAVE_FILESYSTEM_SUPPORT
+    CV_UNUSED(suffix);
+    CV_Error(Error::StsNotImplemented, "File system support is disabled in this OpenCV build!");
+#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
 }
 
 static ErrorCallback customErrorCallback = 0;
@@ -1468,6 +1479,9 @@ struct ThreadData
     size_t idx;               // Thread index in TLS storage. This is not OS thread ID!
 };
 
+
+static bool g_isTlsStorageInitialized = false;
+
 // Main TLS storage class
 class TlsStorage
 {
@@ -1477,6 +1491,7 @@ public:
     {
         tlsSlots.reserve(32);
         threads.reserve(32);
+        g_isTlsStorageInitialized = true;
     }
     ~TlsStorage()
     {
@@ -1681,12 +1696,31 @@ static TlsStorage &getTlsStorage()
 #ifndef _WIN32  // pthread key destructor
 static void opencv_tls_destructor(void* pData)
 {
+    if (!g_isTlsStorageInitialized)
+        return;  // nothing to release, so prefer to avoid creation of new global structures
     getTlsStorage().releaseThread(pData);
 }
 #else // _WIN32
 #ifdef CV_USE_FLS
 static void WINAPI opencv_fls_destructor(void* pData)
 {
+    // Empiric detection of ExitProcess call
+    DWORD code = STILL_ACTIVE/*259*/;
+    BOOL res = GetExitCodeProcess(GetCurrentProcess(), &code);
+    if (res && code != STILL_ACTIVE)
+    {
+        // Looks like we are in ExitProcess() call
+        // This is FLS-specific because the FLS callback is called before DllMain().
+        // TLS doesn't have a similar problem: DllMain() is called first, which marks __termination properly.
+        // Note: this workaround conflicts with ExitProcess() steps order described in documentation, however it works:
+        // 3. ... called with DLL_PROCESS_DETACH
+        // 7. The termination status of the process changes from STILL_ACTIVE to the exit value of the process.
+        // (ref: https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-exitprocess)
+        cv::__termination = true;
+    }
+
+    if (!g_isTlsStorageInitialized)
+        return;  // nothing to release, so prefer to avoid creation of new global structures
     getTlsStorage().releaseThread(pData);
 }
 #endif // CV_USE_FLS
@@ -1695,6 +1729,13 @@ static void WINAPI opencv_fls_destructor(void* pData)
 } // namespace details
 using namespace details;
 
+void releaseTlsStorageThread()
+{
+    if (!g_isTlsStorageInitialized)
+        return;  // nothing to release, so prefer to avoid creation of new global structures
+    getTlsStorage().releaseThread();
+}
+
 TLSDataContainer::TLSDataContainer()
 {
     key_ = (int)getTlsStorage().reserveSlot(this); // Reserve key from TLS storage
@@ -1778,7 +1819,7 @@ BOOL WINAPI DllMain(HINSTANCE, DWORD fdwReason, LPVOID lpReserved)
         {
             // Not allowed to free resources if lpReserved is non-null
             // http://msdn.microsoft.com/en-us/library/windows/desktop/ms682583.aspx
-            cv::getTlsStorage().releaseThread();
+            releaseTlsStorageThread();
         }
     }
     return TRUE;
diff --git a/modules/core/src/utils/datafile.cpp b/modules/core/src/utils/datafile.cpp
index 6a53c73499..3af83a5d8f 100644
--- a/modules/core/src/utils/datafile.cpp
+++ b/modules/core/src/utils/datafile.cpp
@@ -16,6 +16,7 @@
 #include "opencv2/core/utils/filesystem.hpp"
 
 #include <opencv2/core/utils/configuration.private.hpp>
+#include "opencv2/core/utils/filesystem.private.hpp"
 
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
@@ -67,6 +68,7 @@ CV_EXPORTS void addDataSearchSubDirectory(const cv::String& subdir)
     _getDataSearchSubDirectory().push_back(subdir);
 }
 
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
 static bool isPathSep(char c)
 {
     return c == '/' || c == '\\';
@@ -96,12 +98,14 @@ static bool isSubDirectory_(const cv::String& base_path, const cv::String& path)
     }
     return true;
 }
+
 static bool isSubDirectory(const cv::String& base_path, const cv::String& path)
 {
     bool res = isSubDirectory_(base_path, path);
     CV_LOG_VERBOSE(NULL, 0, "isSubDirectory(): base: " << base_path << "  path: " << path << "  => result: " << (res ? "TRUE" : "FALSE"));
     return res;
 }
+#endif //OPENCV_HAVE_FILESYSTEM_SUPPORT
 
 static cv::String getModuleLocation(const void* addr)
 {
@@ -188,6 +192,7 @@ cv::String findDataFile(const cv::String& relative_path,
                         const std::vector<String>* search_paths,
                         const std::vector<String>* subdir_paths)
 {
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
     configuration_parameter = configuration_parameter ? configuration_parameter : "OPENCV_DATA_PATH";
     CV_LOG_DEBUG(NULL, cv::format("utils::findDataFile('%s', %s)", relative_path.c_str(), configuration_parameter));
 
@@ -410,10 +415,18 @@ cv::String findDataFile(const cv::String& relative_path,
 #endif
 
     return cv::String();  // not found
+#else // OPENCV_HAVE_FILESYSTEM_SUPPORT
+    CV_UNUSED(relative_path);
+    CV_UNUSED(configuration_parameter);
+    CV_UNUSED(search_paths);
+    CV_UNUSED(subdir_paths);
+    CV_Error(Error::StsNotImplemented, "File system support is disabled in this OpenCV build!");
+#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
 }
 
 cv::String findDataFile(const cv::String& relative_path, bool required, const char* configuration_parameter)
 {
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
     CV_LOG_DEBUG(NULL, cv::format("cv::utils::findDataFile('%s', %s, %s)",
                                   relative_path.c_str(), required ? "true" : "false",
                                   configuration_parameter ? configuration_parameter : "NULL"));
@@ -424,6 +437,12 @@ cv::String findDataFile(const cv::String& relative_path, bool required, const ch
     if (result.empty() && required)
         CV_Error(cv::Error::StsError, cv::format("OpenCV: Can't find required data file: %s", relative_path.c_str()));
     return result;
+#else // OPENCV_HAVE_FILESYSTEM_SUPPORT
+    CV_UNUSED(relative_path);
+    CV_UNUSED(required);
+    CV_UNUSED(configuration_parameter);
+    CV_Error(Error::StsNotImplemented, "File system support is disabled in this OpenCV build!");
+#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
 }
 
 }} // namespace
diff --git a/modules/core/src/utils/samples.cpp b/modules/core/src/utils/samples.cpp
index c1162f85fe..5d1ee5af8b 100644
--- a/modules/core/src/utils/samples.cpp
+++ b/modules/core/src/utils/samples.cpp
@@ -11,6 +11,7 @@
 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_VERBOSE + 1
 #include "opencv2/core/utils/logger.hpp"
 #include "opencv2/core/utils/filesystem.hpp"
+#include "opencv2/core/utils/filesystem.private.hpp"
 
 namespace cv { namespace samples {
 
@@ -49,6 +50,7 @@ CV_EXPORTS void addSamplesDataSearchSubDirectory(const cv::String& subdir)
 
 cv::String findFile(const cv::String& relative_path, bool required, bool silentMode)
 {
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
     CV_LOG_DEBUG(NULL, cv::format("cv::samples::findFile('%s', %s)", relative_path.c_str(), required ? "true" : "false"));
     cv::String result = cv::utils::findDataFile(relative_path,
                                                 "OPENCV_SAMPLES_DATA_PATH",
@@ -61,6 +63,12 @@ cv::String findFile(const cv::String& relative_path, bool required, bool silentM
     if (result.empty() && required)
         CV_Error(cv::Error::StsError, cv::format("OpenCV samples: Can't find required data file: %s", relative_path.c_str()));
     return result;
+#else
+    CV_UNUSED(relative_path);
+    CV_UNUSED(required);
+    CV_UNUSED(silentMode);
+    CV_Error(Error::StsNotImplemented, "File system support is disabled in this OpenCV build!");
+#endif
 }
 
 
diff --git a/modules/core/src/va_intel.cpp b/modules/core/src/va_intel.cpp
index 1d2b1cbf32..a7623c37f4 100644
--- a/modules/core/src/va_intel.cpp
+++ b/modules/core/src/va_intel.cpp
@@ -7,6 +7,8 @@
 
 #include "precomp.hpp"
 
+#include <opencv2/core/utils/logger.hpp>
+
 #ifdef HAVE_VA
 #  include <va/va.h>
 #else  // HAVE_VA
@@ -48,12 +50,28 @@ namespace cv { namespace va_intel {
 
 #ifdef HAVE_VA_INTEL
 
-static clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn clGetDeviceIDsFromVA_APIMediaAdapterINTEL = NULL;
-static clCreateFromVA_APIMediaSurfaceINTEL_fn       clCreateFromVA_APIMediaSurfaceINTEL       = NULL;
-static clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn  clEnqueueAcquireVA_APIMediaSurfacesINTEL  = NULL;
-static clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn  clEnqueueReleaseVA_APIMediaSurfacesINTEL  = NULL;
-
-static bool contextInitialized = false;
+class VAAPIInterop : public ocl::Context::UserContext
+{
+public:
+    VAAPIInterop(cl_platform_id platform) {
+        clCreateFromVA_APIMediaSurfaceINTEL       = (clCreateFromVA_APIMediaSurfaceINTEL_fn)
+                clGetExtensionFunctionAddressForPlatform(platform, "clCreateFromVA_APIMediaSurfaceINTEL");
+        clEnqueueAcquireVA_APIMediaSurfacesINTEL  = (clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)
+                clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueAcquireVA_APIMediaSurfacesINTEL");
+        clEnqueueReleaseVA_APIMediaSurfacesINTEL  = (clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)
+                clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueReleaseVA_APIMediaSurfacesINTEL");
+        if (!clCreateFromVA_APIMediaSurfaceINTEL ||
+            !clEnqueueAcquireVA_APIMediaSurfacesINTEL ||
+            !clEnqueueReleaseVA_APIMediaSurfacesINTEL) {
+            CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't get extension function for VA-API interop");
+        }
+    }
+    virtual ~VAAPIInterop() {
+    }
+    clCreateFromVA_APIMediaSurfaceINTEL_fn       clCreateFromVA_APIMediaSurfaceINTEL;
+    clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn  clEnqueueAcquireVA_APIMediaSurfacesINTEL;
+    clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn  clEnqueueReleaseVA_APIMediaSurfacesINTEL;
+};
 
 #endif // HAVE_VA_INTEL
 
@@ -65,10 +83,8 @@ Context& initializeContextFromVA(VADisplay display, bool tryInterop)
 #if !defined(HAVE_VA)
     NO_VA_SUPPORT_ERROR;
 #else  // !HAVE_VA
-    init_libva();
 
 #   ifdef HAVE_VA_INTEL
-    contextInitialized = false;
     if (tryInterop)
     {
         cl_uint numPlatforms;
@@ -97,20 +113,10 @@ Context& initializeContextFromVA(VADisplay display, bool tryInterop)
         for (int i = 0; i < (int)numPlatforms; ++i)
         {
             // Get extension function pointers
-
+            clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn clGetDeviceIDsFromVA_APIMediaAdapterINTEL;
             clGetDeviceIDsFromVA_APIMediaAdapterINTEL = (clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)
                 clGetExtensionFunctionAddressForPlatform(platforms[i], "clGetDeviceIDsFromVA_APIMediaAdapterINTEL");
-            clCreateFromVA_APIMediaSurfaceINTEL       = (clCreateFromVA_APIMediaSurfaceINTEL_fn)
-                clGetExtensionFunctionAddressForPlatform(platforms[i], "clCreateFromVA_APIMediaSurfaceINTEL");
-            clEnqueueAcquireVA_APIMediaSurfacesINTEL  = (clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)
-                clGetExtensionFunctionAddressForPlatform(platforms[i], "clEnqueueAcquireVA_APIMediaSurfacesINTEL");
-            clEnqueueReleaseVA_APIMediaSurfacesINTEL  = (clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)
-                clGetExtensionFunctionAddressForPlatform(platforms[i], "clEnqueueReleaseVA_APIMediaSurfacesINTEL");
-
-            if (((void*)clGetDeviceIDsFromVA_APIMediaAdapterINTEL == NULL) ||
-                ((void*)clCreateFromVA_APIMediaSurfaceINTEL == NULL) ||
-                ((void*)clEnqueueAcquireVA_APIMediaSurfacesINTEL == NULL) ||
-                ((void*)clEnqueueReleaseVA_APIMediaSurfacesINTEL == NULL))
+            if ((void*)clGetDeviceIDsFromVA_APIMediaAdapterINTEL == NULL)
             {
                 continue;
             }
@@ -151,8 +157,6 @@ Context& initializeContextFromVA(VADisplay display, bool tryInterop)
 
         if (found >= 0)
         {
-            contextInitialized = true;
-
             cl_platform_id platform = platforms[found];
             std::string platformName = PlatformInfo(&platform).name();
 
@@ -160,6 +164,7 @@ Context& initializeContextFromVA(VADisplay display, bool tryInterop)
             try
             {
                 clExecCtx = OpenCLExecutionContext::create(platformName, platform, context, device);
+                clExecCtx.getContext().setUserContext(std::make_shared<VAAPIInterop>(platform));
             }
             catch (...)
             {
@@ -520,7 +525,6 @@ void convertToVASurface(VADisplay display, InputArray src, VASurfaceID surface,
 #if !defined(HAVE_VA)
     NO_VA_SUPPORT_ERROR;
 #else  // !HAVE_VA
-    init_libva();
 
     const int stype = CV_8UC3;
 
@@ -531,7 +535,18 @@ void convertToVASurface(VADisplay display, InputArray src, VASurfaceID surface,
     CV_Assert(srcSize.width == size.width && srcSize.height == size.height);
 
 #ifdef HAVE_VA_INTEL
-    if (contextInitialized)
+    ocl::OpenCLExecutionContext& ocl_context = ocl::OpenCLExecutionContext::getCurrent();
+    VAAPIInterop* interop = ocl_context.getContext().getUserContext<VAAPIInterop>().get();
+    CV_LOG_IF_DEBUG(NULL, !interop,
+        "OpenCL/VA_INTEL: Can't interop with current OpenCL context - missing VAAPIInterop API. "
+        "OpenCL context should be created through initializeContextFromVA()");
+    void* context_display = ocl_context.getContext().getOpenCLContextProperty(CL_CONTEXT_VA_API_DISPLAY_INTEL);
+    CV_LOG_IF_INFO(NULL, interop && !context_display,
+        "OpenCL/VA_INTEL: Can't interop with current OpenCL context - missing VA display, context re-creation is required");
+    bool isValidContextDisplay = (display == context_display);
+    CV_LOG_IF_INFO(NULL, interop && context_display && !isValidContextDisplay,
+        "OpenCL/VA_INTEL: Can't interop with current OpenCL context - VA display mismatch: " << context_display << "(context) vs " << (void*)display << "(surface)");
+    if (isValidContextDisplay && interop)
     {
         UMat u = src.getUMat();
 
@@ -541,28 +556,26 @@ void convertToVASurface(VADisplay display, InputArray src, VASurfaceID surface,
 
         cl_mem clBuffer = (cl_mem)u.handle(ACCESS_READ);
 
-        using namespace cv::ocl;
-        Context& ctx = Context::getDefault();
-        cl_context context = (cl_context)ctx.ptr();
+        cl_context context = (cl_context)ocl_context.getContext().ptr();
 
         cl_int status = 0;
 
-        cl_mem clImageY = clCreateFromVA_APIMediaSurfaceINTEL(context, CL_MEM_WRITE_ONLY, &surface, 0, &status);
+        cl_mem clImageY = interop->clCreateFromVA_APIMediaSurfaceINTEL(context, CL_MEM_WRITE_ONLY, &surface, 0, &status);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromVA_APIMediaSurfaceINTEL failed (Y plane)");
-        cl_mem clImageUV = clCreateFromVA_APIMediaSurfaceINTEL(context, CL_MEM_WRITE_ONLY, &surface, 1, &status);
+        cl_mem clImageUV = interop->clCreateFromVA_APIMediaSurfaceINTEL(context, CL_MEM_WRITE_ONLY, &surface, 1, &status);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromVA_APIMediaSurfaceINTEL failed (UV plane)");
 
-        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
+        cl_command_queue q = (cl_command_queue)ocl_context.getQueue().ptr();
 
         cl_mem images[2] = { clImageY, clImageUV };
-        status = clEnqueueAcquireVA_APIMediaSurfacesINTEL(q, 2, images, 0, NULL, NULL);
+        status = interop->clEnqueueAcquireVA_APIMediaSurfacesINTEL(q, 2, images, 0, NULL, NULL);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireVA_APIMediaSurfacesINTEL failed");
         if (!ocl::ocl_convert_bgr_to_nv12(clBuffer, (int)u.step[0], u.cols, u.rows, clImageY, clImageUV))
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: ocl_convert_bgr_to_nv12 failed");
-        clEnqueueReleaseVA_APIMediaSurfacesINTEL(q, 2, images, 0, NULL, NULL);
+        status = interop->clEnqueueReleaseVA_APIMediaSurfacesINTEL(q, 2, images, 0, NULL, NULL);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseVA_APIMediaSurfacesINTEL failed");
 
@@ -580,6 +593,7 @@ void convertToVASurface(VADisplay display, InputArray src, VASurfaceID surface,
     else
 # endif // HAVE_VA_INTEL
     {
+        init_libva();
         Mat m = src.getMat();
 
         // TODO Add support for roi
@@ -626,7 +640,6 @@ void convertFromVASurface(VADisplay display, VASurfaceID surface, Size size, Out
 #if !defined(HAVE_VA)
     NO_VA_SUPPORT_ERROR;
 #else  // !HAVE_VA
-    init_libva();
 
     const int dtype = CV_8UC3;
 
@@ -634,7 +647,9 @@ void convertFromVASurface(VADisplay display, VASurfaceID surface, Size size, Out
     dst.create(size, dtype);
 
 #ifdef HAVE_VA_INTEL
-    if (contextInitialized)
+    ocl::OpenCLExecutionContext& ocl_context = ocl::OpenCLExecutionContext::getCurrent();
+    VAAPIInterop* interop = ocl_context.getContext().getUserContext<VAAPIInterop>().get();
+    if (display == ocl_context.getContext().getOpenCLContextProperty(CL_CONTEXT_VA_API_DISPLAY_INTEL) && interop)
     {
         UMat u = dst.getUMat();
 
@@ -644,28 +659,26 @@ void convertFromVASurface(VADisplay display, VASurfaceID surface, Size size, Out
 
         cl_mem clBuffer = (cl_mem)u.handle(ACCESS_WRITE);
 
-        using namespace cv::ocl;
-        Context& ctx = Context::getDefault();
-        cl_context context = (cl_context)ctx.ptr();
+        cl_context context = (cl_context)ocl_context.getContext().ptr();
 
         cl_int status = 0;
 
-        cl_mem clImageY = clCreateFromVA_APIMediaSurfaceINTEL(context, CL_MEM_READ_ONLY, &surface, 0, &status);
+        cl_mem clImageY = interop->clCreateFromVA_APIMediaSurfaceINTEL(context, CL_MEM_READ_ONLY, &surface, 0, &status);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromVA_APIMediaSurfaceINTEL failed (Y plane)");
-        cl_mem clImageUV = clCreateFromVA_APIMediaSurfaceINTEL(context, CL_MEM_READ_ONLY, &surface, 1, &status);
+        cl_mem clImageUV = interop->clCreateFromVA_APIMediaSurfaceINTEL(context, CL_MEM_READ_ONLY, &surface, 1, &status);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clCreateFromVA_APIMediaSurfaceINTEL failed (UV plane)");
 
-        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
+        cl_command_queue q = (cl_command_queue)ocl_context.getQueue().ptr();
 
         cl_mem images[2] = { clImageY, clImageUV };
-        status = clEnqueueAcquireVA_APIMediaSurfacesINTEL(q, 2, images, 0, NULL, NULL);
+        status = interop->clEnqueueAcquireVA_APIMediaSurfacesINTEL(q, 2, images, 0, NULL, NULL);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueAcquireVA_APIMediaSurfacesINTEL failed");
         if (!ocl::ocl_convert_nv12_to_bgr(clImageY, clImageUV, clBuffer, (int)u.step[0], u.cols, u.rows))
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: ocl_convert_nv12_to_bgr failed");
-        status = clEnqueueReleaseVA_APIMediaSurfacesINTEL(q, 2, images, 0, NULL, NULL);
+        status = interop->clEnqueueReleaseVA_APIMediaSurfacesINTEL(q, 2, images, 0, NULL, NULL);
         if (status != CL_SUCCESS)
             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL: clEnqueueReleaseVA_APIMediaSurfacesINTEL failed");
 
@@ -683,6 +696,7 @@ void convertFromVASurface(VADisplay display, VASurfaceID surface, Size size, Out
     else
 # endif // HAVE_VA_INTEL
     {
+        init_libva();
         Mat m = dst.getMat();
 
         // TODO Add support for roi
diff --git a/modules/core/test/ocl/test_opencl.cpp b/modules/core/test/ocl/test_opencl.cpp
index e639f72948..daa023534d 100644
--- a/modules/core/test/ocl/test_opencl.cpp
+++ b/modules/core/test/ocl/test_opencl.cpp
@@ -132,6 +132,73 @@ TEST(OpenCL, support_SPIR_programs)
     testOpenCLKernel(k);
 }
 
+
+TEST(OpenCL, image2Dcount_regression_19334)
+{
+    cv::ocl::Context ctx = cv::ocl::Context::getDefault();
+    if (!ctx.ptr())
+    {
+        throw cvtest::SkipTestException("OpenCL is not available");
+    }
+    cv::ocl::Device device = cv::ocl::Device::getDefault();
+    if (!device.compilerAvailable())
+    {
+        throw cvtest::SkipTestException("OpenCL compiler is not available");
+    }
+
+    std::string module_name; // empty to disable OpenCL cache
+
+    static const char* opencl_kernel_src =
+"__kernel void test_kernel(int a,\n"
+"                          __global const uchar* src0, int src0_step, int src0_offset, int src0_rows, int src0_cols,\n"
+"                          __global const uchar* src1, int src1_step, int src1_offset, int src1_rows, int src1_cols,\n"
+"                          __global const uchar* src2, int src2_step, int src2_offset, int src2_rows, int src2_cols,\n"
+"                          __read_only image2d_t image)\n"
+"{\n"
+"}";
+    cv::ocl::ProgramSource src(module_name, "test_opencl_image_arg", opencl_kernel_src, "");
+    cv::String errmsg;
+    cv::ocl::Program program(src, "", errmsg);
+    ASSERT_TRUE(program.ptr() != NULL);
+    cv::ocl::Kernel k("test_kernel", program);
+    ASSERT_FALSE(k.empty());
+
+    std::vector<UMat> images(4);
+    for (size_t i = 0; i < images.size(); ++i)
+        images[i] = UMat(10, 10, CV_8UC1);
+    cv::ocl::Image2D image;
+    try
+    {
+        cv::ocl::Image2D image_(images.back());
+        image = image_;
+    }
+    catch (const cv::Exception&)
+    {
+        throw cvtest::SkipTestException("OpenCL images are not supported");
+    }
+
+    int nargs = 0;
+    int a = 0;
+    nargs = k.set(nargs, a);
+    ASSERT_EQ(1, nargs);
+    nargs = k.set(nargs, images[0]);
+    ASSERT_EQ(6, nargs);
+    nargs = k.set(nargs, images[1]);
+    ASSERT_EQ(11, nargs);
+    nargs = k.set(nargs, images[2]);
+    ASSERT_EQ(16, nargs);
+
+    // do not throw (issue of #19334)
+    ASSERT_NO_THROW(nargs = k.set(nargs, image));
+    ASSERT_EQ(17, nargs);
+
+    // allow to replace image argument if kernel is not running
+    UMat image2(10, 10, CV_8UC1);
+    ASSERT_NO_THROW(nargs = k.set(16, cv::ocl::Image2D(image2)));
+    ASSERT_EQ(17, nargs);
+}
+
+
 TEST(OpenCL, move_construct_assign)
 {
     cv::ocl::Context ctx1 = cv::ocl::Context::getDefault();
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 269ebe0f2a..5c22caaf12 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -577,6 +577,25 @@ template<typename R> struct TheTest
         return *this;
     }
 
+    TheTest & test_mul_hi()
+    {
+        // v_mul_hi keeps the lane count of its inputs, so every lane is verified.
+        Data<R> dataA, dataB(32767);
+        R a = dataA, b = dataB;
+
+        R c = v_mul_hi(a, b);
+
+        Data<R> resC = c;
+        const int n = R::nlanes;
+        for (int i = 0; i < n; ++i)
+        {
+            SCOPED_TRACE(cv::format("i=%d", i));
+            EXPECT_EQ((typename R::lane_type)((dataA[i] * dataB[i]) >> 16), resC[i]);
+        }
+
+        return *this;
+    }
+
     TheTest & test_abs()
     {
         typedef typename V_RegTraits<R>::u_reg Ru;
@@ -1663,6 +1682,7 @@ void test_hal_intrin_uint16()
         .test_arithm_wrap()
         .test_mul()
         .test_mul_expand()
+        .test_mul_hi()
         .test_cmp()
         .test_shift<1>()
         .test_shift<8>()
@@ -1697,6 +1717,7 @@ void test_hal_intrin_int16()
         .test_arithm_wrap()
         .test_mul()
         .test_mul_expand()
+        .test_mul_hi()
         .test_cmp()
         .test_shift<1>()
         .test_shift<8>()
diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp
index 9b6145d733..a5d844e7ad 100644
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@@ -2355,4 +2355,98 @@ TEST(Mat, regression_18473)
 }
 
 
+TEST(Mat, ptrVecni_20044)
+{
+    Mat_<int> m(3,4); m << 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12;
+    Vec2i idx(1,1);
+
+    uchar *u = m.ptr(idx);
+    EXPECT_EQ(int(6), *(int*)(u));
+    const uchar *cu = m.ptr(idx);
+    EXPECT_EQ(int(6), *(int*)(cu));
+
+    int *i = m.ptr<int>(idx);
+    EXPECT_EQ(int(6), *(i));
+    const int *ci = m.ptr<int>(idx);
+    EXPECT_EQ(int(6), *(ci));
+}
+
+TEST(Mat, reverse_iterator_19967)
+{
+    // empty iterator (#16855)
+    cv::Mat m_empty;
+    EXPECT_NO_THROW(m_empty.rbegin<uchar>());
+    EXPECT_NO_THROW(m_empty.rend<uchar>());
+    EXPECT_TRUE(m_empty.rbegin<uchar>() == m_empty.rend<uchar>());
+
+    // 1D test
+    std::vector<uchar> data{0, 1, 2, 3};
+    const std::vector<int> sizes_1d{4};
+
+    //Base class
+    cv::Mat m_1d(sizes_1d, CV_8U, data.data());
+    auto mismatch_it_pair_1d = std::mismatch(data.rbegin(), data.rend(), m_1d.rbegin<uchar>());
+    EXPECT_EQ(mismatch_it_pair_1d.first, data.rend());  // expect no mismatch
+    EXPECT_EQ(mismatch_it_pair_1d.second, m_1d.rend<uchar>());
+
+    //Templated derived class
+    cv::Mat_<uchar> m_1d_t(static_cast<int>(sizes_1d.size()), sizes_1d.data(), data.data());
+    auto mismatch_it_pair_1d_t = std::mismatch(data.rbegin(), data.rend(), m_1d_t.rbegin());
+    EXPECT_EQ(mismatch_it_pair_1d_t.first, data.rend());  // expect no mismatch
+    EXPECT_EQ(mismatch_it_pair_1d_t.second, m_1d_t.rend());
+
+
+    // 2D test
+    const std::vector<int> sizes_2d{2, 2};
+
+    //Base class
+    cv::Mat m_2d(sizes_2d, CV_8U, data.data());
+    auto mismatch_it_pair_2d = std::mismatch(data.rbegin(), data.rend(), m_2d.rbegin<uchar>());
+    EXPECT_EQ(mismatch_it_pair_2d.first, data.rend());
+    EXPECT_EQ(mismatch_it_pair_2d.second, m_2d.rend<uchar>());
+
+    //Templated derived class
+    cv::Mat_<uchar> m_2d_t(static_cast<int>(sizes_2d.size()),sizes_2d.data(), data.data());
+    auto mismatch_it_pair_2d_t = std::mismatch(data.rbegin(), data.rend(), m_2d_t.rbegin());
+    EXPECT_EQ(mismatch_it_pair_2d_t.first, data.rend());
+    EXPECT_EQ(mismatch_it_pair_2d_t.second, m_2d_t.rend());
+
+    // 3D test
+    std::vector<uchar> data_3d{0, 1, 2, 3, 4, 5, 6, 7};
+    const std::vector<int> sizes_3d{2, 2, 2};
+
+    //Base class
+    cv::Mat m_3d(sizes_3d, CV_8U, data_3d.data());
+    auto mismatch_it_pair_3d = std::mismatch(data_3d.rbegin(), data_3d.rend(), m_3d.rbegin<uchar>());
+    EXPECT_EQ(mismatch_it_pair_3d.first, data_3d.rend());
+    EXPECT_EQ(mismatch_it_pair_3d.second, m_3d.rend<uchar>());
+
+    //Templated derived class
+    cv::Mat_<uchar> m_3d_t(static_cast<int>(sizes_3d.size()),sizes_3d.data(), data_3d.data());
+    auto mismatch_it_pair_3d_t = std::mismatch(data_3d.rbegin(), data_3d.rend(), m_3d_t.rbegin());
+    EXPECT_EQ(mismatch_it_pair_3d_t.first, data_3d.rend());
+    EXPECT_EQ(mismatch_it_pair_3d_t.second, m_3d_t.rend());
+
+    // const test base class
+    const cv::Mat m_1d_const(sizes_1d, CV_8U, data.data());
+
+    auto mismatch_it_pair_1d_const = std::mismatch(data.rbegin(), data.rend(), m_1d_const.rbegin<uchar>());
+    EXPECT_EQ(mismatch_it_pair_1d_const.first, data.rend());  // expect no mismatch
+    EXPECT_EQ(mismatch_it_pair_1d_const.second, m_1d_const.rend<uchar>());
+
+    EXPECT_FALSE((std::is_assignable<decltype(m_1d_const.rend<uchar>()), uchar>::value)) << "Constness of const iterator violated.";
+    EXPECT_FALSE((std::is_assignable<decltype(m_1d_const.rbegin<uchar>()), uchar>::value)) << "Constness of const iterator violated.";
+
+    // const test templated derived class
+    const cv::Mat_<uchar> m_1d_const_t(static_cast<int>(sizes_1d.size()), sizes_1d.data(), data.data());
+
+    auto mismatch_it_pair_1d_const_t = std::mismatch(data.rbegin(), data.rend(), m_1d_const_t.rbegin());
+    EXPECT_EQ(mismatch_it_pair_1d_const_t.first, data.rend());  // expect no mismatch
+    EXPECT_EQ(mismatch_it_pair_1d_const_t.second, m_1d_const_t.rend());
+
+    EXPECT_FALSE((std::is_assignable<decltype(m_1d_const_t.rend()), uchar>::value)) << "Constness of const iterator violated.";
+    EXPECT_FALSE((std::is_assignable<decltype(m_1d_const_t.rbegin()), uchar>::value)) << "Constness of const iterator violated.";
+
+}
+
 }} // namespace
diff --git a/modules/core/test/test_utils.cpp b/modules/core/test/test_utils.cpp
index d8789ddfc2..ed5f34603d 100644
--- a/modules/core/test/test_utils.cpp
+++ b/modules/core/test/test_utils.cpp
@@ -9,6 +9,7 @@
 #include "opencv2/core/utils/buffer_area.private.hpp"
 
 #include "test_utils_tls.impl.hpp"
+#include "opencv2/core/utils/filesystem.private.hpp"
 
 namespace opencv_test { namespace {
 
@@ -336,7 +337,7 @@ TEST(Logger, DISABLED_message_if)
     }
 }
 
-
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
 TEST(Samples, findFile)
 {
     cv::utils::logging::LogLevel prev = cv::utils::logging::setLogLevel(cv::utils::logging::LOG_LEVEL_VERBOSE);
@@ -353,6 +354,7 @@ TEST(Samples, findFile_missing)
     ASSERT_ANY_THROW(path = samples::findFile("non-existed.file", true));
     cv::utils::logging::setLogLevel(prev);
 }
+#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
 
 template <typename T>
 inline bool buffers_overlap(T * first, size_t first_num, T * second, size_t second_num)
diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt
index b0811fb223..4c8129cbda 100644
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -176,16 +176,20 @@ ocv_add_perf_tests(${INF_ENGINE_TARGET}
     FILES Include ${perf_hdrs}
 )
 
-ocv_option(${the_module}_PERF_CAFFE "Add performance tests of Caffe framework" OFF)
-ocv_option(${the_module}_PERF_CLCAFFE "Add performance tests of clCaffe framework" OFF)
+ocv_option(OPENCV_DNN_PERF_CAFFE "Add performance tests of Caffe framework" OFF)
+ocv_option(OPENCV_DNN_PERF_CLCAFFE "Add performance tests of clCaffe framework" OFF)
 if(BUILD_PERF_TESTS)
-  if (${the_module}_PERF_CAFFE)
+  if (OPENCV_DNN_PERF_CAFFE
+      OR ${the_module}_PERF_CAFFE  # compatibility for deprecated option
+  )
     find_package(Caffe QUIET)
     if (Caffe_FOUND)
       add_definitions(-DHAVE_CAFFE=1)
       ocv_target_link_libraries(opencv_perf_dnn caffe)
     endif()
-  elseif(${the_module}_PERF_CLCAFFE)
+  elseif(OPENCV_DNN_PERF_CLCAFFE
+         OR ${the_module}_PERF_CLCAFFE  # compatibility for deprecated option
+  )
     find_package(Caffe QUIET)
     if (Caffe_FOUND)
       add_definitions(-DHAVE_CLCAFFE=1)
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index 0743de00ab..255b41de88 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -738,9 +738,11 @@ CV__DNN_INLINE_NS_BEGIN
         CV_WRAP void enableFusion(bool fusion);
 
         /** @brief Returns overall time for inference and timings (in ticks) for layers.
+         *
          * Indexes in returned vector correspond to layers ids. Some layers can be fused with others,
-         * in this case zero ticks count will be return for that skipped layers.
-         * @param timings vector for tick timings for all layers.
+         * in this case zero ticks count will be returned for those skipped layers. Supported by DNN_BACKEND_OPENCV on DNN_TARGET_CPU only.
+         *
+         * @param[out] timings vector for tick timings for all layers.
          * @return overall ticks for model inference.
          */
         CV_WRAP int64 getPerfProfile(CV_OUT std::vector<double>& timings);
diff --git a/modules/dnn/src/ie_ngraph.cpp b/modules/dnn/src/ie_ngraph.cpp
index 49717f8513..7484032714 100644
--- a/modules/dnn/src/ie_ngraph.cpp
+++ b/modules/dnn/src/ie_ngraph.cpp
@@ -20,6 +20,9 @@
 #include <opencv2/core/utils/configuration.private.hpp>
 #include <opencv2/core/utils/logger.hpp>
 
+#include "opencv2/core/utils/filesystem.hpp"
+#include "opencv2/core/utils/filesystem.private.hpp"
+
 namespace cv { namespace dnn {
 
 #ifdef HAVE_DNN_NGRAPH
@@ -683,6 +686,23 @@ void InfEngineNgraphNet::initPlugin(InferenceEngine::CNNNetwork& net)
                 ie.SetConfig({{
                     InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM, format("%d", getNumThreads()),
                 }}, device_name);
+#endif
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2021_2)
+            if (device_name.find("GPU") == 0)
+            {
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
+                std::string cache_path = utils::fs::getCacheDirectory((std::string("dnn_ie_cache_") + device_name).c_str(), "OPENCV_DNN_IE_GPU_CACHE_DIR");
+#else
+                std::string cache_path = utils::getConfigurationParameterString("OPENCV_DNN_IE_GPU_CACHE_DIR", "");
+#endif
+                if (!cache_path.empty() && cache_path != "disabled")
+                {
+                    CV_LOG_INFO(NULL, "OpenCV/nGraph: using GPU kernels cache: " << cache_path);
+                    ie.SetConfig({{
+                        InferenceEngine::PluginConfigParams::KEY_CACHE_DIR, cache_path,
+                    }}, device_name);
+                }
+            }
 #endif
         }
         std::map<std::string, std::string> config;
diff --git a/modules/dnn/src/layers/crop_and_resize_layer.cpp b/modules/dnn/src/layers/crop_and_resize_layer.cpp
index a4443ed3a2..eb8822870f 100644
--- a/modules/dnn/src/layers/crop_and_resize_layer.cpp
+++ b/modules/dnn/src/layers/crop_and_resize_layer.cpp
@@ -133,7 +133,8 @@ public:
         auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
         auto rois = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
 
-        std::vector<size_t> dims = rois->get_shape(), offsets(4, 0);
+        auto rois_shape = rois->get_shape();
+        std::vector<int64_t> dims(rois_shape.begin(), rois_shape.end()), offsets(4, 0);
         offsets[3] = 2;
         dims[3] = 7;
 
@@ -147,7 +148,7 @@ public:
                                       lower_bounds, upper_bounds, strides, std::vector<int64_t>{}, std::vector<int64_t>{});
 
         // Reshape rois from 4D to 2D
-        std::vector<size_t> shapeData = {dims[2], 5};
+        std::vector<int64_t> shapeData = {dims[2], 5};
         auto shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, shapeData.data());
         auto reshape = std::make_shared<ngraph::op::v1::Reshape>(slice, shape, true);
 
diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp
index c5a94a68fd..38bd7ae487 100644
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -61,25 +61,11 @@ easily switch between different algorithms solving the same problem. This sectio
 matching descriptors that are represented as vectors in a multidimensional space. All objects that
 implement vector descriptor matchers inherit the DescriptorMatcher interface.
 
-@note
-   -   An example explaining keypoint matching can be found at
-        opencv_source_code/samples/cpp/descriptor_extractor_matcher.cpp
-    -   An example on descriptor matching evaluation can be found at
-        opencv_source_code/samples/cpp/detector_descriptor_matcher_evaluation.cpp
-    -   An example on one to many image matching can be found at
-        opencv_source_code/samples/cpp/matching_to_many_images.cpp
-
     @defgroup features2d_draw Drawing Function of Keypoints and Matches
     @defgroup features2d_category Object Categorization
 
 This section describes approaches based on local 2D features and used to categorize objects.
 
-@note
-   -   A complete Bag-Of-Words sample can be found at
-        opencv_source_code/samples/cpp/bagofwords_classification.cpp
-    -   (Python) An example using the features2D framework to perform object categorization can be
-        found at opencv_source_code/samples/python/find_obj.py
-
     @defgroup feature2d_hal Hardware Acceleration Layer
     @{
         @defgroup features2d_hal_interface Interface
@@ -90,7 +76,7 @@ This section describes approaches based on local 2D features and used to categor
 namespace cv
 {
 
-//! @addtogroup features2d
+//! @addtogroup features2d_main
 //! @{
 
 // //! writes vector of keypoints to the file storage
@@ -237,9 +223,6 @@ the vector descriptor extractors inherit the DescriptorExtractor interface.
  */
 typedef Feature2D DescriptorExtractor;
 
-//! @addtogroup features2d_main
-//! @{
-
 
 /** @brief Class for implementing the wrapper which makes detectors and extractors to be affine invariant,
 described as ASIFT in @cite YM11 .
@@ -486,20 +469,20 @@ class CV_EXPORTS_W MSER : public Feature2D
 public:
     /** @brief Full constructor for %MSER detector
 
-    @param _delta it compares \f$(size_{i}-size_{i-delta})/size_{i-delta}\f$
-    @param _min_area prune the area which smaller than minArea
-    @param _max_area prune the area which bigger than maxArea
-    @param _max_variation prune the area have similar size to its children
-    @param _min_diversity for color image, trace back to cut off mser with diversity less than min_diversity
-    @param _max_evolution  for color image, the evolution steps
-    @param _area_threshold for color image, the area threshold to cause re-initialize
-    @param _min_margin for color image, ignore too small margin
-    @param _edge_blur_size for color image, the aperture size for edge blur
+    @param delta it compares \f$(size_{i}-size_{i-delta})/size_{i-delta}\f$
+    @param min_area prune the area which smaller than minArea
+    @param max_area prune the area which bigger than maxArea
+    @param max_variation prune the area have similar size to its children
+    @param min_diversity for color image, trace back to cut off mser with diversity less than min_diversity
+    @param max_evolution  for color image, the evolution steps
+    @param area_threshold for color image, the area threshold to cause re-initialize
+    @param min_margin for color image, ignore too small margin
+    @param edge_blur_size for color image, the aperture size for edge blur
      */
-    CV_WRAP static Ptr<MSER> create( int _delta=5, int _min_area=60, int _max_area=14400,
-          double _max_variation=0.25, double _min_diversity=.2,
-          int _max_evolution=200, double _area_threshold=1.01,
-          double _min_margin=0.003, int _edge_blur_size=5 );
+    CV_WRAP static Ptr<MSER> create( int delta=5, int min_area=60, int max_area=14400,
+          double max_variation=0.25, double min_diversity=.2,
+          int max_evolution=200, double area_threshold=1.01,
+          double min_margin=0.003, int edge_blur_size=5 );
 
     /** @brief Detect %MSER regions
 
diff --git a/modules/flann/include/opencv2/flann/any.h b/modules/flann/include/opencv2/flann/any.h
index f5684e9962..4906fec081 100644
--- a/modules/flann/include/opencv2/flann/any.h
+++ b/modules/flann/include/opencv2/flann/any.h
@@ -167,17 +167,15 @@ class SinglePolicy
 
 public:
     static base_any_policy* get_policy();
-
-private:
-    static typename choose_policy<T>::type policy;
 };
 
-template <typename T>
-typename choose_policy<T>::type SinglePolicy<T>::policy;
-
 /// This function will return a different policy for each type.
 template <typename T>
-inline base_any_policy* SinglePolicy<T>::get_policy() { return &policy; }
+inline base_any_policy* SinglePolicy<T>::get_policy()
+{
+    static typename choose_policy<T>::type policy; // function-local static: one instance per T, constructed on the first call
+    return &policy; // every call for a given T returns the address of that same instance
+}
 
 } // namespace anyimpl
 
diff --git a/modules/gapi/CMakeLists.txt b/modules/gapi/CMakeLists.txt
index 6b586c1f99..b26b613e72 100644
--- a/modules/gapi/CMakeLists.txt
+++ b/modules/gapi/CMakeLists.txt
@@ -162,6 +162,9 @@ set(gapi_srcs
     # Python bridge
     src/backends/ie/bindings_ie.cpp
     src/backends/python/gpythonbackend.cpp
+
+    # Utils (ITT tracing)
+    src/utils/itt.cpp
     )
 
 ocv_add_dispatched_file(backends/fluid/gfluidimgproc_func SSE4_1 AVX2)
@@ -178,13 +181,22 @@ ocv_module_include_directories("${CMAKE_CURRENT_LIST_DIR}/src")
 ocv_create_module()
 
 ocv_target_link_libraries(${the_module} PRIVATE ade)
+
 if(OPENCV_GAPI_INF_ENGINE)
   ocv_target_link_libraries(${the_module} PRIVATE ${INF_ENGINE_TARGET})
 endif()
+
 if(HAVE_TBB)
   ocv_target_link_libraries(${the_module} PRIVATE tbb)
 endif()
 
+# TODO: Consider support of ITT in G-API standalone mode.
+if(CV_TRACE AND HAVE_ITT)
+  ocv_target_compile_definitions(${the_module} PRIVATE -DOPENCV_WITH_ITT=1)
+  ocv_module_include_directories(${ITT_INCLUDE_DIRS})
+  ocv_target_link_libraries(${the_module} PRIVATE ${ITT_LIBRARIES})
+endif()
+
 set(__test_extra_deps "")
 if(OPENCV_GAPI_INF_ENGINE)
   list(APPEND __test_extra_deps ${INF_ENGINE_TARGET})
diff --git a/modules/gapi/include/opencv2/gapi/core.hpp b/modules/gapi/include/opencv2/gapi/core.hpp
index cb5d55d13f..cb8a6127d7 100644
--- a/modules/gapi/include/opencv2/gapi/core.hpp
+++ b/modules/gapi/include/opencv2/gapi/core.hpp
@@ -575,6 +575,12 @@ namespace core {
             return std::make_tuple(empty_gopaque_desc(), empty_array_desc(), empty_array_desc());
         }
     };
+
+    G_TYPED_KERNEL(GTranspose, <GMat(GMat)>, "org.opencv.core.transpose") { // kernel signature: one GMat in, one GMat out
+        static GMatDesc outMeta(GMatDesc in) {
+            return in.withSize({in.size.height, in.size.width}); // cv::Size is {width, height}, so the output swaps the input's two dimensions
+        }
+    };
 } // namespace core
 
 namespace streaming {
@@ -1490,7 +1496,7 @@ enlarge an image, it will generally look best with cv::INTER_CUBIC (slow) or cv:
 
 @sa  warpAffine, warpPerspective, remap, resizeP
  */
-GAPI_EXPORTS GMat resize(const GMat& src, const Size& dsize, double fx = 0, double fy = 0, int interpolation = INTER_LINEAR);
+GAPI_EXPORTS_W GMat resize(const GMat& src, const Size& dsize, double fx = 0, double fy = 0, int interpolation = INTER_LINEAR);
 
 /** @brief Resizes a planar image.
 
@@ -1927,6 +1933,21 @@ GAPI_EXPORTS std::tuple<GOpaque<double>,GArray<int>,GArray<Point3f>>
 kmeans(const GArray<Point3f>& data, const int K, const GArray<int>& bestLabels,
        const TermCriteria& criteria, const int attempts, const KmeansFlags flags);
 
+
+/** @brief Transposes a matrix.
+
+The function transposes the matrix:
+\f[\texttt{dst} (i,j) =  \texttt{src} (j,i)\f]
+
+@note
+ - Function textual ID is "org.opencv.core.transpose"
+ - No complex conjugation is done in case of a complex matrix. It should be done separately if needed.
+
+@param src input array.
+*/
+GAPI_EXPORTS GMat transpose(const GMat& src);
+
+
 namespace streaming {
 /** @brief Gets dimensions from Mat.
 
diff --git a/modules/gapi/include/opencv2/gapi/gcommon.hpp b/modules/gapi/include/opencv2/gapi/gcommon.hpp
index 8119e397eb..a9cb015901 100644
--- a/modules/gapi/include/opencv2/gapi/gcommon.hpp
+++ b/modules/gapi/include/opencv2/gapi/gcommon.hpp
@@ -195,6 +195,14 @@ private:
 
 using GCompileArgs = std::vector<GCompileArg>;
 
+inline cv::GCompileArgs& operator += (      cv::GCompileArgs &lhs, // Appends the compile arguments of rhs to lhs
+                                      const cv::GCompileArgs &rhs)
+{
+    lhs.reserve(lhs.size() + rhs.size()); // make room for the combined size before inserting
+    lhs.insert(lhs.end(), rhs.begin(), rhs.end()); // copies of rhs's elements go after the existing ones, in order
+    return lhs; // returned by reference, like the built-in +=
+}
+
 /**
  * @brief Wraps a list of arguments (a parameter pack) into a vector of
  *        compilation arguments (cv::GCompileArg).
diff --git a/modules/gapi/include/opencv2/gapi/gmat.hpp b/modules/gapi/include/opencv2/gapi/gmat.hpp
index 5e567fb107..11c3071f03 100644
--- a/modules/gapi/include/opencv2/gapi/gmat.hpp
+++ b/modules/gapi/include/opencv2/gapi/gmat.hpp
@@ -120,7 +120,7 @@ struct GAPI_EXPORTS_W_SIMPLE GMatDesc
     // Meta combinator: return a new GMatDesc which differs in size by delta
     // (all other fields are taken unchanged from this GMatDesc)
     // FIXME: a better name?
-    GMatDesc withSizeDelta(cv::Size delta) const
+    GAPI_WRAP GMatDesc withSizeDelta(cv::Size delta) const
     {
         GMatDesc desc(*this);
         desc.size += delta;
@@ -130,12 +130,12 @@ struct GAPI_EXPORTS_W_SIMPLE GMatDesc
     // (all other fields are taken unchanged from this GMatDesc)
     //
     // This is an overload.
-    GMatDesc withSizeDelta(int dx, int dy) const
+    GAPI_WRAP GMatDesc withSizeDelta(int dx, int dy) const
     {
         return withSizeDelta(cv::Size{dx,dy});
     }
 
-    GMatDesc withSize(cv::Size sz) const
+    GAPI_WRAP GMatDesc withSize(cv::Size sz) const
     {
         GMatDesc desc(*this);
         desc.size = sz;
@@ -144,7 +144,7 @@ struct GAPI_EXPORTS_W_SIMPLE GMatDesc
 
     // Meta combinator: return a new GMatDesc with specified data depth.
     // (all other fields are taken unchanged from this GMatDesc)
-    GMatDesc withDepth(int ddepth) const
+    GAPI_WRAP GMatDesc withDepth(int ddepth) const
     {
         GAPI_Assert(CV_MAT_CN(ddepth) == 1 || ddepth == -1);
         GMatDesc desc(*this);
@@ -166,7 +166,7 @@ struct GAPI_EXPORTS_W_SIMPLE GMatDesc
     // Meta combinator: return a new GMatDesc with planar flag set
     // (no size changes are performed, only channel interpretation is changed
     // (interleaved -> planar)
-    GMatDesc asPlanar() const
+    GAPI_WRAP GMatDesc asPlanar() const
     {
         GAPI_Assert(planar == false);
         GMatDesc desc(*this);
@@ -177,7 +177,7 @@ struct GAPI_EXPORTS_W_SIMPLE GMatDesc
     // Meta combinator: return a new GMatDesc
     // reinterpreting 1-channel input as planar image
     // (size height is divided by plane number)
-    GMatDesc asPlanar(int planes) const
+    GAPI_WRAP GMatDesc asPlanar(int planes) const
     {
         GAPI_Assert(planar == false);
         GAPI_Assert(chan == 1);
@@ -192,7 +192,7 @@ struct GAPI_EXPORTS_W_SIMPLE GMatDesc
     // Meta combinator: return a new GMatDesc with planar flag set to false
     // (no size changes are performed, only channel interpretation is changed
     // (planar -> interleaved)
-    GMatDesc asInterleaved() const
+    GAPI_WRAP GMatDesc asInterleaved() const
     {
         GAPI_Assert(planar == true);
         GMatDesc desc(*this);
diff --git a/modules/gapi/include/opencv2/gapi/gstreaming.hpp b/modules/gapi/include/opencv2/gapi/gstreaming.hpp
index 4e579caafb..371581345f 100644
--- a/modules/gapi/include/opencv2/gapi/gstreaming.hpp
+++ b/modules/gapi/include/opencv2/gapi/gstreaming.hpp
@@ -371,6 +371,31 @@ protected:
 };
 /** @} */
 
+namespace gapi {
+namespace streaming {
+/**
+ * @brief Specify queue capacity for streaming execution.
+ *
+ * In the streaming mode the pipeline steps are connected with queues
+ * and this compile argument controls every queue's size.
+ */
+struct GAPI_EXPORTS queue_capacity
+{
+    explicit queue_capacity(size_t cap = 1) : capacity(cap) { }
+    size_t capacity; //!< Requested size of every queue (1 when default-constructed).
+};
+/** @} */
+} // namespace streaming
+} // namespace gapi
+
+namespace detail
+{
+template<> struct CompileArgTag<cv::gapi::streaming::queue_capacity> // tag specialization for the queue_capacity compile argument
+{
+    static const char* tag() { return "gapi.queue_capacity"; } // string identifier reported for this argument type
+};
+} // namespace detail
+
 }
 
 #endif // OPENCV_GAPI_GSTREAMING_COMPILED_HPP
diff --git a/modules/gapi/include/opencv2/gapi/imgproc.hpp b/modules/gapi/include/opencv2/gapi/imgproc.hpp
index 25a64a5067..2dbe626ff1 100644
--- a/modules/gapi/include/opencv2/gapi/imgproc.hpp
+++ b/modules/gapi/include/opencv2/gapi/imgproc.hpp
@@ -1341,7 +1341,7 @@ Output image is 8-bit unsigned 3-channel image @ref CV_8UC3.
 @param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
 @sa RGB2BGR
 */
-GAPI_EXPORTS GMat BGR2RGB(const GMat& src);
+GAPI_EXPORTS_W GMat BGR2RGB(const GMat& src);
 
 /** @brief Converts an image from RGB color space to gray-scaled.
 
diff --git a/modules/gapi/include/opencv2/gapi/infer.hpp b/modules/gapi/include/opencv2/gapi/infer.hpp
index 6e71f59df9..7ba3a44881 100644
--- a/modules/gapi/include/opencv2/gapi/infer.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer.hpp
@@ -693,6 +693,14 @@ template<typename... Args>
 cv::gapi::GNetPackage networks(Args&&... args) {
     return cv::gapi::GNetPackage({ cv::detail::strip(args)... });
 }
+
+inline cv::gapi::GNetPackage& operator += (      cv::gapi::GNetPackage& lhs, // Appends the networks of rhs to lhs
+                                           const cv::gapi::GNetPackage& rhs) {
+    lhs.networks.reserve(lhs.networks.size() + rhs.networks.size()); // make room for the combined size before inserting
+    lhs.networks.insert(lhs.networks.end(), rhs.networks.begin(), rhs.networks.end()); // copies of rhs's networks go after the existing ones, in order
+    return lhs; // returned by reference, like the built-in +=
+}
+
 } // namespace gapi
 } // namespace cv
 
diff --git a/modules/gapi/include/opencv2/gapi/infer/ie.hpp b/modules/gapi/include/opencv2/gapi/infer/ie.hpp
index 60137c960c..2bed13abc3 100644
--- a/modules/gapi/include/opencv2/gapi/infer/ie.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/ie.hpp
@@ -235,6 +235,36 @@ public:
         return *this;
     }
 
+    Params& cfgInputReshape(std::map<std::string, std::vector<std::size_t>> && reshape_table) { // rvalue overload: takes over a whole layer-name -> dims table
+        desc.reshape_table = std::move(reshape_table); // replaces (does not merge with) whatever table was stored before
+        return *this; // returns *this so configuration calls can be chained
+    }
+
+    Params& cfgInputReshape(const std::map<std::string, std::vector<std::size_t>>&reshape_table) { // lvalue overload: copies a whole layer-name -> dims table
+        desc.reshape_table = reshape_table; // replaces (does not merge with) whatever table was stored before
+        return *this; // returns *this so configuration calls can be chained
+    }
+
+    Params& cfgInputReshape(std::string && layer_name, std::vector<size_t> && layer_dims) { // rvalue overload: adds one layer-name -> dims entry
+        desc.reshape_table.emplace(std::move(layer_name), std::move(layer_dims)); // named rvalue-reference parameters are lvalues: std::move is required to avoid copying them
+        return *this; // returns *this so configuration calls can be chained
+    }
+
+    Params& cfgInputReshape(const std::string & layer_name, const std::vector<size_t>&layer_dims) { // lvalue overload: adds one layer-name -> dims entry
+        desc.reshape_table.emplace(layer_name, layer_dims); // NOTE(review): if reshape_table is a std::map, emplace keeps an already present entry for layer_name - confirm that is intended
+        return *this; // returns *this so configuration calls can be chained
+    }
+
+    Params& cfgInputReshape(std::unordered_set<std::string> && layer_names) { // rvalue overload: takes over a set of layer names (no dims given)
+        desc.layer_names_to_reshape = std::move(layer_names); // replaces the previously stored set of names
+        return *this; // returns *this so configuration calls can be chained
+    }
+
+    Params& cfgInputReshape(const std::unordered_set<std::string>&layer_names) { // lvalue overload: copies a set of layer names (no dims given)
+        desc.layer_names_to_reshape = layer_names; // replaces the previously stored set of names
+        return *this; // returns *this so configuration calls can be chained
+    }
+
     // BEGIN(G-API's network parametrization API)
     GBackend      backend()    const { return cv::gapi::ie::backend();  }
     std::string   tag()        const { return m_tag; }
diff --git a/modules/gapi/misc/python/package/gapi/__init__.py b/modules/gapi/misc/python/package/gapi/__init__.py
new file mode 100644
index 0000000000..733c980010
--- /dev/null
+++ b/modules/gapi/misc/python/package/gapi/__init__.py
@@ -0,0 +1,246 @@
+__all__ = ['op', 'kernel']
+
+import sys
+import cv2 as cv
+
+# NB: Decorator factory - publishes the decorated object under its own name in module `mname`
+def register(mname):
+    def publish(obj):
+        vars(sys.modules[mname])[obj.__name__] = obj
+        return obj
+    return publish
+
+
+@register('cv2')
+class GOpaque():
+    # NB: Deriving from the C++ class causes a segfault, so nothing is inherited:
+    # every constructor below simply hands back a cv.GOpaqueT instance
+    def __new__(cls, argtype):
+        return cv.GOpaqueT(argtype)
+
+    class Bool():
+        def __new__(cls):
+            return cv.GOpaqueT(cv.gapi.CV_BOOL)
+
+    class Int():
+        def __new__(cls):
+            return cv.GOpaqueT(cv.gapi.CV_INT)
+
+    class Double():
+        def __new__(cls):
+            return cv.GOpaqueT(cv.gapi.CV_DOUBLE)
+
+    class Float():
+        def __new__(cls):
+            return cv.GOpaqueT(cv.gapi.CV_FLOAT)
+
+    class String():
+        def __new__(cls):
+            return cv.GOpaqueT(cv.gapi.CV_STRING)
+
+    class Point():
+        def __new__(cls):
+            return cv.GOpaqueT(cv.gapi.CV_POINT)
+
+    class Point2f():
+        def __new__(cls):
+            return cv.GOpaqueT(cv.gapi.CV_POINT2F)
+
+    class Size():
+        def __new__(cls):
+            return cv.GOpaqueT(cv.gapi.CV_SIZE)
+
+    class Rect():
+        def __new__(cls):
+            return cv.GOpaqueT(cv.gapi.CV_RECT)
+
+    class Any():
+        def __new__(cls):
+            return cv.GOpaqueT(cv.gapi.CV_ANY)
+
+@register('cv2')
+class GArray():
+    # NB: Deriving from the C++ class causes a segfault, so nothing is inherited:
+    # every constructor below simply hands back a cv.GArrayT instance
+    def __new__(cls, argtype):
+        return cv.GArrayT(argtype)
+
+    class Bool():
+        def __new__(cls):
+            return cv.GArrayT(cv.gapi.CV_BOOL)
+
+    class Int():
+        def __new__(cls):
+            return cv.GArrayT(cv.gapi.CV_INT)
+
+    class Double():
+        def __new__(cls):
+            return cv.GArrayT(cv.gapi.CV_DOUBLE)
+
+    class Float():
+        def __new__(cls):
+            return cv.GArrayT(cv.gapi.CV_FLOAT)
+
+    class String():
+        def __new__(cls):
+            return cv.GArrayT(cv.gapi.CV_STRING)
+
+    class Point():
+        def __new__(cls):
+            return cv.GArrayT(cv.gapi.CV_POINT)
+
+    class Point2f():
+        def __new__(cls):
+            return cv.GArrayT(cv.gapi.CV_POINT2F)
+
+    class Size():
+        def __new__(cls):
+            return cv.GArrayT(cv.gapi.CV_SIZE)
+
+    class Rect():
+        def __new__(cls):
+            return cv.GArrayT(cv.gapi.CV_RECT)
+
+    class Scalar():
+        def __new__(cls):
+            return cv.GArrayT(cv.gapi.CV_SCALAR)
+
+    class Mat():
+        def __new__(cls):
+            return cv.GArrayT(cv.gapi.CV_MAT)
+
+    class GMat():
+        def __new__(cls):
+            return cv.GArrayT(cv.gapi.CV_GMAT)
+
+    class Any():
+        def __new__(cls):
+            return cv.GArrayT(cv.gapi.CV_ANY)  # NB: Build GArrayT directly, like every sibling type above
+
+
+# NB: Top lvl decorator takes the operation id and the lists of input/output types
+def op(op_id, in_types, out_types):
+
+    garray_types= {
+            cv.GArray.Bool:    cv.gapi.CV_BOOL,
+            cv.GArray.Int:     cv.gapi.CV_INT,
+            cv.GArray.Double:  cv.gapi.CV_DOUBLE,
+            cv.GArray.Float:   cv.gapi.CV_FLOAT,
+            cv.GArray.String:  cv.gapi.CV_STRING,
+            cv.GArray.Point:   cv.gapi.CV_POINT,
+            cv.GArray.Point2f: cv.gapi.CV_POINT2F,
+            cv.GArray.Size:    cv.gapi.CV_SIZE,
+            cv.GArray.Rect:    cv.gapi.CV_RECT,
+            cv.GArray.Scalar:  cv.gapi.CV_SCALAR,
+            cv.GArray.Mat:     cv.gapi.CV_MAT,
+            cv.GArray.GMat:    cv.gapi.CV_GMAT,
+            cv.GArray.Any:     cv.gapi.CV_ANY
+    }
+
+    gopaque_types= {
+            cv.GOpaque.Size:    cv.gapi.CV_SIZE,
+            cv.GOpaque.Rect:    cv.gapi.CV_RECT,
+            cv.GOpaque.Bool:    cv.gapi.CV_BOOL,
+            cv.GOpaque.Int:     cv.gapi.CV_INT,
+            cv.GOpaque.Double:  cv.gapi.CV_DOUBLE,
+            cv.GOpaque.Float:   cv.gapi.CV_FLOAT,
+            cv.GOpaque.String:  cv.gapi.CV_STRING,
+            cv.GOpaque.Point:   cv.gapi.CV_POINT,
+            cv.GOpaque.Point2f: cv.gapi.CV_POINT2F,
+            cv.GOpaque.Any:     cv.gapi.CV_ANY
+    }
+
+    # NB: Names used in error messages; must cover every kind listed in the tables above
+    type2str = {
+        cv.gapi.CV_BOOL:    'cv.gapi.CV_BOOL' ,
+        cv.gapi.CV_INT:     'cv.gapi.CV_INT' ,
+        cv.gapi.CV_DOUBLE:  'cv.gapi.CV_DOUBLE' ,
+        cv.gapi.CV_FLOAT:   'cv.gapi.CV_FLOAT' ,
+        cv.gapi.CV_STRING:  'cv.gapi.CV_STRING' ,
+        cv.gapi.CV_POINT:   'cv.gapi.CV_POINT' ,
+        cv.gapi.CV_POINT2F: 'cv.gapi.CV_POINT2F' ,
+        cv.gapi.CV_SIZE:    'cv.gapi.CV_SIZE',
+        cv.gapi.CV_RECT:    'cv.gapi.CV_RECT',
+        cv.gapi.CV_SCALAR:  'cv.gapi.CV_SCALAR',
+        cv.gapi.CV_MAT:     'cv.gapi.CV_MAT',
+        cv.gapi.CV_GMAT:    'cv.gapi.CV_GMAT',
+        cv.gapi.CV_ANY:     'cv.gapi.CV_ANY'
+    }
+
+    # NB: Second lvl decorator takes class to decorate
+    def op_with_params(cls):
+        if not in_types:
+            raise Exception('{} operation should have at least one input!'.format(cls.__name__))
+
+        if not out_types:
+            raise Exception('{} operation should have at least one output!'.format(cls.__name__))
+
+        for i, t in enumerate(out_types):
+            if t not in [cv.GMat, cv.GScalar, *garray_types, *gopaque_types]:
+                raise Exception('{} unsupported output type: {} in position: {}'
+                        .format(cls.__name__, t.__name__, i))
+
+        def on(*args):
+            if len(in_types) != len(args):
+                raise Exception('Invalid number of input elements!\nExpected: {}, Actual: {}'
+                        .format(len(in_types), len(args)))
+
+            for i, (t, a) in enumerate(zip(in_types, args)):
+                if t in garray_types:
+                    if not isinstance(a, cv.GArrayT):
+                        raise Exception("{} invalid type for argument {}.\nExpected: {}, Actual: {}"
+                                .format(cls.__name__, i, cv.GArrayT.__name__, type(a).__name__))
+
+                    elif a.type() != garray_types[t]:
+                        raise Exception("{} invalid GArrayT type for argument {}.\nExpected: {}, Actual: {}"
+                                .format(cls.__name__, i, type2str[garray_types[t]], type2str[a.type()]))
+
+                elif t in gopaque_types:
+                    if not isinstance(a, cv.GOpaqueT):
+                        raise Exception("{} invalid type for argument {}.\nExpected: {}, Actual: {}"
+                                .format(cls.__name__, i, cv.GOpaqueT.__name__, type(a).__name__))
+
+                    elif a.type() != gopaque_types[t]:
+                        raise Exception("{} invalid GOpaque type for argument {}.\nExpected: {}, Actual: {}"
+                                .format(cls.__name__, i, type2str[gopaque_types[t]], type2str[a.type()]))
+
+                else:
+                    if t != type(a):
+                        raise Exception('{} invalid input type for argument {}.\nExpected: {}, Actual: {}'
+                                .format(cls.__name__, i, t.__name__, type(a).__name__))
+
+            op = cv.gapi.__op(op_id, cls.outMeta, *args)
+
+            out_protos = []
+            for i, out_type in enumerate(out_types):
+                if out_type == cv.GMat:
+                    out_protos.append(op.getGMat())
+                elif out_type == cv.GScalar:
+                    out_protos.append(op.getGScalar())
+                elif out_type in gopaque_types:
+                    out_protos.append(op.getGOpaque(gopaque_types[out_type]))
+                elif out_type in garray_types:
+                    out_protos.append(op.getGArray(garray_types[out_type]))
+                else:
+                    raise Exception("""In {}: G-API operation can't produce the output with type: {} in position: {}"""
+                            .format(cls.__name__, out_type.__name__, i))
+
+            return tuple(out_protos) if len(out_protos) != 1 else out_protos[0]
+
+        # NB: Extend operation class
+        cls.id = op_id
+        cls.on = staticmethod(on)
+        return cls
+
+    return op_with_params
+
+
+def kernel(op_cls):
+    # NB: Inner decorator receives the kernel class itself
+    def bind_to_op(kernel_cls):
+        # NB: Copy the operation id and meta function onto the kernel class
+        setattr(kernel_cls, 'id', op_cls.id)
+        setattr(kernel_cls, 'outMeta', op_cls.outMeta)
+        return kernel_cls
+
+    return bind_to_op
diff --git a/modules/gapi/misc/python/pyopencv_gapi.hpp b/modules/gapi/misc/python/pyopencv_gapi.hpp
index 56a7e70d88..6b782cfc8d 100644
--- a/modules/gapi/misc/python/pyopencv_gapi.hpp
+++ b/modules/gapi/misc/python/pyopencv_gapi.hpp
@@ -5,7 +5,6 @@
 
 #ifdef _MSC_VER
 #pragma warning(disable: 4503)  // "decorated name length exceeded"
-                                // on empty_meta(const cv::GMetaArgs&, const cv::GArgs&)
 #endif
 
 #include <opencv2/gapi/cpu/gcpukernel.hpp>
@@ -49,6 +48,121 @@ using GArray_GMat    = cv::GArray<cv::GMat>;
 // WA: Create using
 using std::string;
 
+namespace cv
+{
+namespace detail
+{
+
+class PyObjectHolder // Copyable handle to a PyObject*; all copies share one Impl through m_impl
+{
+public:
+    PyObjectHolder(PyObject* o, bool owner = true); // owner == true (default): the Impl takes its own reference to o
+    PyObject* get() const; // returns the stored pointer without changing its reference count
+
+private:
+    class Impl;
+    std::shared_ptr<Impl> m_impl; // shared state: the Impl is destroyed when the last holder copy goes away
+};
+
+} // namespace detail
+} // namespace cv
+
+class cv::detail::PyObjectHolder::Impl // Shared state of PyObjectHolder: stores the PyObject* and releases a reference on destruction
+{
+public:
+    Impl(PyObject* object, bool owner); // owner selects whether an extra reference is taken on construction
+    PyObject* get() const; // returns m_object as stored
+    ~Impl(); // decrements the reference count of m_object when it is not NULL
+
+private:
+    PyObject* m_object; // may be NULL only when constructed with owner == false
+};
+
+cv::detail::PyObjectHolder::Impl::Impl(PyObject* object, bool owner)
+    : m_object(object)
+{
+    // NB: With owner == true, take an additional reference to the PyObject
+    // so that it can still be accessed after the caller that provided it
+    // goes out of scope. With owner == false no reference is added here.
+    if (owner)
+    {
+        // NB: Cannot take a reference to a NULL object.
+        GAPI_Assert(object);
+        Py_INCREF(m_object);
+    }
+}
+
+cv::detail::PyObjectHolder::Impl::~Impl()
+{
+    // NB: Release the held reference; there is nothing to release for NULL.
+    if (m_object)
+    {
+        Py_DECREF(m_object); // runs regardless of the `owner` flag given at construction (the flag is not stored)
+    }
+}
+
+PyObject* cv::detail::PyObjectHolder::Impl::get() const
+{
+    return m_object; // raw pointer as stored; the reference count is not touched here
+}
+
+cv::detail::PyObjectHolder::PyObjectHolder(PyObject* object, bool owner)
+        : m_impl(new cv::detail::PyObjectHolder::Impl{object, owner}) // all reference handling is delegated to the Impl constructor
+{
+}
+
+PyObject* cv::detail::PyObjectHolder::get() const
+{
+    return m_impl->get(); // forwards to the shared Impl; the reference count is not touched here
+}
+
+template<>
+PyObject* pyopencv_from(const cv::detail::PyObjectHolder& v)
+{
+    PyObject* o = v.get(); // read the wrapped pointer straight from the holder
+    Py_INCREF(o);          // the caller receives its own reference; the holder keeps the one it has
+    return o;
+}
+
+template<>
+PyObject* pyopencv_from(const cv::GArg& value)
+{
+    GAPI_Assert(value.kind != cv::detail::ArgKind::GOBJREF); // arguments of kind GOBJREF are rejected up front
+#define HANDLE_CASE(T, O) case cv::detail::OpaqueKind::CV_##T:  \
+    {                                                           \
+        return pyopencv_from(value.get<O>());                   \
+    }
+
+#define UNSUPPORTED(T) case cv::detail::OpaqueKind::CV_##T: break
+    switch (value.opaque_kind) // dispatch on the opaque kind recorded in the argument
+    {
+        HANDLE_CASE(BOOL,    bool);
+        HANDLE_CASE(INT,     int);
+        HANDLE_CASE(DOUBLE,  double);
+        HANDLE_CASE(FLOAT,   float);
+        HANDLE_CASE(STRING,  std::string);
+        HANDLE_CASE(POINT,   cv::Point);
+        HANDLE_CASE(POINT2F, cv::Point2f);
+        HANDLE_CASE(SIZE,    cv::Size);
+        HANDLE_CASE(RECT,    cv::Rect);
+        HANDLE_CASE(SCALAR,  cv::Scalar);
+        HANDLE_CASE(MAT,     cv::Mat);
+        HANDLE_CASE(UNKNOWN, cv::detail::PyObjectHolder); // CV_UNKNOWN arguments are read back as a PyObjectHolder
+        UNSUPPORTED(UINT64); // UNSUPPORTED kinds leave the switch and reach the throw below
+        UNSUPPORTED(DRAW_PRIM);
+#undef HANDLE_CASE
+#undef UNSUPPORTED
+    }
+    util::throw_error(std::logic_error("Unsupported kernel input type")); // reached only when no case above returned
+}
+
+template<>
+bool pyopencv_to(PyObject* obj, cv::GArg& value, const ArgInfo& info)
+{
+    value = cv::GArg(cv::detail::PyObjectHolder(obj)); // holder is built with the default owner == true, so it takes its own reference to obj
+    return true; // always reports success; `info` is not used
+}
+
 template <>
 bool pyopencv_to(PyObject* obj, std::vector<GCompileArg>& value, const ArgInfo& info)
 {
@@ -81,7 +195,7 @@ PyObject* pyopencv_from(const cv::detail::OpaqueRef& o)
         case cv::detail::OpaqueKind::CV_POINT2F   : return pyopencv_from(o.rref<cv::Point2f>());
         case cv::detail::OpaqueKind::CV_SIZE      : return pyopencv_from(o.rref<cv::Size>());
         case cv::detail::OpaqueKind::CV_RECT      : return pyopencv_from(o.rref<cv::Rect>());
-        case cv::detail::OpaqueKind::CV_UNKNOWN   : break;
+        case cv::detail::OpaqueKind::CV_UNKNOWN   : return pyopencv_from(o.rref<cv::GArg>());
         case cv::detail::OpaqueKind::CV_UINT64    : break;
         case cv::detail::OpaqueKind::CV_SCALAR    : break;
         case cv::detail::OpaqueKind::CV_MAT       : break;
@@ -108,7 +222,7 @@ PyObject* pyopencv_from(const cv::detail::VectorRef& v)
         case cv::detail::OpaqueKind::CV_RECT      : return pyopencv_from_generic_vec(v.rref<cv::Rect>());
         case cv::detail::OpaqueKind::CV_SCALAR    : return pyopencv_from_generic_vec(v.rref<cv::Scalar>());
         case cv::detail::OpaqueKind::CV_MAT       : return pyopencv_from_generic_vec(v.rref<cv::Mat>());
-        case cv::detail::OpaqueKind::CV_UNKNOWN   : break;
+        case cv::detail::OpaqueKind::CV_UNKNOWN   : return pyopencv_from_generic_vec(v.rref<cv::GArg>());
         case cv::detail::OpaqueKind::CV_UINT64    : break;
         case cv::detail::OpaqueKind::CV_DRAW_PRIM : break;
     }
@@ -270,7 +384,7 @@ static cv::detail::OpaqueRef extract_opaque_ref(PyObject* from, cv::detail::Opaq
         HANDLE_CASE(POINT2F, cv::Point2f);
         HANDLE_CASE(SIZE,    cv::Size);
         HANDLE_CASE(RECT,    cv::Rect);
-        UNSUPPORTED(UNKNOWN);
+        HANDLE_CASE(UNKNOWN, cv::GArg);
         UNSUPPORTED(UINT64);
         UNSUPPORTED(SCALAR);
         UNSUPPORTED(MAT);
@@ -303,7 +417,7 @@ static cv::detail::VectorRef extract_vector_ref(PyObject* from, cv::detail::Opaq
         HANDLE_CASE(RECT,    cv::Rect);
         HANDLE_CASE(SCALAR,  cv::Scalar);
         HANDLE_CASE(MAT,     cv::Mat);
-        UNSUPPORTED(UNKNOWN);
+        HANDLE_CASE(UNKNOWN, cv::GArg);
         UNSUPPORTED(UINT64);
         UNSUPPORTED(DRAW_PRIM);
 #undef HANDLE_CASE
@@ -415,38 +529,7 @@ static cv::GMetaArgs extract_meta_args(const cv::GTypesInfo& info, PyObject* py_
     return metas;
 }
 
-inline PyObject* extract_opaque_value(const cv::GArg& value)
-{
-    GAPI_Assert(value.kind != cv::detail::ArgKind::GOBJREF);
-#define HANDLE_CASE(T, O) case cv::detail::OpaqueKind::CV_##T:  \
-    {                                                           \
-        return pyopencv_from(value.get<O>());                   \
-    }
-
-#define UNSUPPORTED(T) case cv::detail::OpaqueKind::CV_##T: break
-    switch (value.opaque_kind)
-    {
-        HANDLE_CASE(BOOL,    bool);
-        HANDLE_CASE(INT,     int);
-        HANDLE_CASE(DOUBLE,  double);
-        HANDLE_CASE(FLOAT,   float);
-        HANDLE_CASE(STRING,  std::string);
-        HANDLE_CASE(POINT,   cv::Point);
-        HANDLE_CASE(POINT2F, cv::Point2f);
-        HANDLE_CASE(SIZE,    cv::Size);
-        HANDLE_CASE(RECT,    cv::Rect);
-        HANDLE_CASE(SCALAR,  cv::Scalar);
-        HANDLE_CASE(MAT,     cv::Mat);
-        UNSUPPORTED(UNKNOWN);
-        UNSUPPORTED(UINT64);
-        UNSUPPORTED(DRAW_PRIM);
-#undef HANDLE_CASE
-#undef UNSUPPORTED
-    }
-    util::throw_error(std::logic_error("Unsupported kernel input type"));
-}
-
-static cv::GRunArgs run_py_kernel(PyObject* kernel,
+static cv::GRunArgs run_py_kernel(cv::detail::PyObjectHolder kernel,
                                   const cv::gapi::python::GPythonContext &ctx)
 {
     const auto& ins      = ctx.ins;
@@ -460,33 +543,32 @@ static cv::GRunArgs run_py_kernel(PyObject* kernel,
     try
     {
         int in_idx = 0;
-        PyObject* args = PyTuple_New(ins.size());
+        // NB: Don't increment the reference counter (false),
+        // because PyTuple_New already returns an owned reference.
+        // If an exception is thrown, the holder decrements the counter.
+        cv::detail::PyObjectHolder args(PyTuple_New(ins.size()), false);
         for (size_t i = 0; i < ins.size(); ++i)
         {
-            // NB: If meta is monostate then object isn't associated with G-TYPE, so in case it
-            // kind matches with supported types do conversion from c++ to python, if not (CV_UNKNOWN)
-            // obtain PyObject* and pass as-is.
+            // NB: If meta is monostate then object isn't associated with G-TYPE.
             if (cv::util::holds_alternative<cv::util::monostate>(in_metas[i]))
             {
-                PyTuple_SetItem(args, i,
-                        ins[i].opaque_kind != cv::detail::OpaqueKind::CV_UNKNOWN ? extract_opaque_value(ins[i])
-                                                                                 : ins[i].get<PyObject*>());
+                PyTuple_SetItem(args.get(), i, pyopencv_from(ins[i]));
                 continue;
             }
 
             switch (in_metas[i].index())
             {
                 case cv::GMetaArg::index_of<cv::GMatDesc>():
-                    PyTuple_SetItem(args, i, pyopencv_from(ins[i].get<cv::Mat>()));
+                    PyTuple_SetItem(args.get(), i, pyopencv_from(ins[i].get<cv::Mat>()));
                     break;
                 case cv::GMetaArg::index_of<cv::GScalarDesc>():
-                    PyTuple_SetItem(args, i, pyopencv_from(ins[i].get<cv::Scalar>()));
+                    PyTuple_SetItem(args.get(), i, pyopencv_from(ins[i].get<cv::Scalar>()));
                     break;
                 case cv::GMetaArg::index_of<cv::GOpaqueDesc>():
-                    PyTuple_SetItem(args, i, pyopencv_from(ins[i].get<cv::detail::OpaqueRef>()));
+                    PyTuple_SetItem(args.get(), i, pyopencv_from(ins[i].get<cv::detail::OpaqueRef>()));
                     break;
                 case cv::GMetaArg::index_of<cv::GArrayDesc>():
-                    PyTuple_SetItem(args, i, pyopencv_from(ins[i].get<cv::detail::VectorRef>()));
+                    PyTuple_SetItem(args.get(), i, pyopencv_from(ins[i].get<cv::detail::VectorRef>()));
                     break;
                 case cv::GMetaArg::index_of<cv::GFrameDesc>():
                     util::throw_error(std::logic_error("GFrame isn't supported for custom operation"));
@@ -494,11 +576,21 @@ static cv::GRunArgs run_py_kernel(PyObject* kernel,
             }
             ++in_idx;
         }
+        // NB: Don't increment the reference counter (false).
+        // If PyObject_CallObject returns NULL, the destructor does nothing.
+        cv::detail::PyObjectHolder result(
+                PyObject_CallObject(kernel.get(), args.get()), false);
 
-        PyObject* result = PyObject_CallObject(kernel, args);
+        if (PyErr_Occurred()) {
+            PyErr_PrintEx(0);
+            PyErr_Clear();
+            throw std::logic_error("Python kernel failed with error!");
+        }
+        // NB: In fact this situation is impossible, because errors were handled above.
+        GAPI_Assert(result.get() && "Python kernel returned NULL!");
 
-        outs = out_info.size() == 1 ? cv::GRunArgs{extract_run_arg(out_info[0], result)}
-                                    : extract_run_args(out_info, result);
+        outs = out_info.size() == 1 ? cv::GRunArgs{extract_run_arg(out_info[0], result.get())}
+                                    : extract_run_args(out_info, result.get());
     }
     catch (...)
     {
@@ -510,12 +602,6 @@ static cv::GRunArgs run_py_kernel(PyObject* kernel,
     return outs;
 }
 
-// FIXME: Now it's impossible to obtain meta function from operation,
-// because kernel connects to operation only by id (string).
-static cv::GMetaArgs empty_meta(const cv::GMetaArgs &, const cv::GArgs &) {
-    return {};
-}
-
 static GMetaArg get_meta_arg(PyObject* obj)
 {
     if (PyObject_TypeCheck(obj,
@@ -558,33 +644,38 @@ static cv::GMetaArgs get_meta_args(PyObject* tuple)
     return metas;
 }
 
-static GMetaArgs python_meta(PyObject* outMeta, const cv::GMetaArgs &meta, const cv::GArgs &gargs) {
+static GMetaArgs run_py_meta(cv::detail::PyObjectHolder out_meta,
+                            const cv::GMetaArgs         &meta,
+                            const cv::GArgs             &gargs) {
     PyGILState_STATE gstate;
     gstate = PyGILState_Ensure();
 
     cv::GMetaArgs out_metas;
     try
     {
-        PyObject* args = PyTuple_New(meta.size());
+        // NB: Don't increment the reference counter (false),
+        // because PyTuple_New already returns an owned reference.
+        // If an exception is thrown, the holder decrements the counter.
+        cv::detail::PyObjectHolder args(PyTuple_New(meta.size()), false);
         size_t idx = 0;
         for (auto&& m : meta)
         {
             switch (m.index())
             {
                 case cv::GMetaArg::index_of<cv::GMatDesc>():
-                    PyTuple_SetItem(args, idx, pyopencv_from(cv::util::get<cv::GMatDesc>(m)));
+                    PyTuple_SetItem(args.get(), idx, pyopencv_from(cv::util::get<cv::GMatDesc>(m)));
                     break;
                 case cv::GMetaArg::index_of<cv::GScalarDesc>():
-                    PyTuple_SetItem(args, idx, pyopencv_from(cv::util::get<cv::GScalarDesc>(m)));
+                    PyTuple_SetItem(args.get(), idx, pyopencv_from(cv::util::get<cv::GScalarDesc>(m)));
                     break;
                 case cv::GMetaArg::index_of<cv::GArrayDesc>():
-                    PyTuple_SetItem(args, idx, pyopencv_from(cv::util::get<cv::GArrayDesc>(m)));
+                    PyTuple_SetItem(args.get(), idx, pyopencv_from(cv::util::get<cv::GArrayDesc>(m)));
                     break;
                 case cv::GMetaArg::index_of<cv::GOpaqueDesc>():
-                    PyTuple_SetItem(args, idx, pyopencv_from(cv::util::get<cv::GOpaqueDesc>(m)));
+                    PyTuple_SetItem(args.get(), idx, pyopencv_from(cv::util::get<cv::GOpaqueDesc>(m)));
                     break;
                 case cv::GMetaArg::index_of<cv::util::monostate>():
-                    PyTuple_SetItem(args, idx, gargs[idx].get<PyObject*>());
+                    PyTuple_SetItem(args.get(), idx, pyopencv_from(gargs[idx]));
                     break;
                 case cv::GMetaArg::index_of<cv::GFrameDesc>():
                     util::throw_error(std::logic_error("GFrame isn't supported for custom operation"));
@@ -592,9 +683,21 @@ static GMetaArgs python_meta(PyObject* outMeta, const cv::GMetaArgs &meta, const
             }
             ++idx;
         }
-        PyObject* result = PyObject_CallObject(outMeta, args);
-        out_metas = PyTuple_Check(result) ? get_meta_args(result)
-                                          : cv::GMetaArgs{get_meta_arg(result)};
+        // NB: Don't increment the reference counter (false).
+        // If PyObject_CallObject returns NULL, the destructor does nothing.
+        cv::detail::PyObjectHolder result(
+                PyObject_CallObject(out_meta.get(), args.get()), false);
+
+        if (PyErr_Occurred()) {
+            PyErr_PrintEx(0);
+            PyErr_Clear();
+            throw std::logic_error("Python outMeta failed with error!");
+        }
+        // NB: In fact this situation is impossible, because errors were handled above.
+        GAPI_Assert(result.get() && "Python outMeta returned NULL!");
+
+        out_metas = PyTuple_Check(result.get()) ? get_meta_args(result.get())
+                                                : cv::GMetaArgs{get_meta_arg(result.get())};
     }
     catch (...)
     {
@@ -611,28 +714,65 @@ static PyObject* pyopencv_cv_gapi_kernels(PyObject* , PyObject* py_args, PyObjec
     using namespace cv;
     gapi::GKernelPackage pkg;
     Py_ssize_t size = PyTuple_Size(py_args);
+
     for (int i = 0; i < size; ++i)
     {
-        PyObject* pair   = PyTuple_GetItem(py_args, i);
-        PyObject* kernel = PyTuple_GetItem(pair, 0);
+        PyObject* user_kernel = PyTuple_GetItem(py_args, i);
 
-        std::string id;
-        if (!pyopencv_to(PyTuple_GetItem(pair, 1), id, ArgInfo("id", false)))
-        {
-            PyErr_SetString(PyExc_TypeError, "Failed to obtain: kernel id must be a string");
+        PyObject* id_obj = PyObject_GetAttrString(user_kernel, "id");
+        if (!id_obj) {
+            PyErr_SetString(PyExc_TypeError,
+                    "Python kernel should contain id, please use cv.gapi.kernel to define kernel");
             return NULL;
         }
-        Py_INCREF(kernel);
+
+        PyObject* out_meta = PyObject_GetAttrString(user_kernel, "outMeta");
+        if (!out_meta) {
+            PyErr_SetString(PyExc_TypeError,
+                    "Python kernel should contain outMeta, please use cv.gapi.kernel to define kernel");
+            return NULL;
+        }
+
+        PyObject* run  = PyObject_GetAttrString(user_kernel, "run");
+        if (!run) {
+            PyErr_SetString(PyExc_TypeError,
+                    "Python kernel should contain run, please use cv.gapi.kernel to define kernel");
+            return NULL;
+        }
+
+        std::string id;
+        if (!pyopencv_to(id_obj, id, ArgInfo("id", false)))
+        {
+            PyErr_SetString(PyExc_TypeError, "Failed to obtain string");
+            return NULL;
+        }
+
+        using namespace std::placeholders;
         gapi::python::GPythonFunctor f(id.c_str(),
-                                       empty_meta,
-                                       std::bind(run_py_kernel,
-                                                 kernel,
-                                                 std::placeholders::_1));
+                std::bind(run_py_meta  , cv::detail::PyObjectHolder{out_meta}, _1, _2),
+                std::bind(run_py_kernel, cv::detail::PyObjectHolder{run}    , _1));
         pkg.include(f);
     }
     return pyopencv_from(pkg);
 }
 
+// Builds a GNetPackage from the PyParams objects in py_args; returns NULL (TypeError) on a bad item.
+static PyObject* pyopencv_cv_gapi_networks(PyObject*, PyObject* py_args, PyObject*)
+{
+    using namespace cv;
+    gapi::GNetPackage pkg;
+    for (Py_ssize_t i = 0, size = PyTuple_Size(py_args); i < size; ++i)
+    {
+        gapi_ie_PyParams params;
+        if (!pyopencv_to(PyTuple_GetItem(py_args, i), params, ArgInfo("PyParams", false))) {
+            PyErr_SetString(PyExc_TypeError, "Failed to obtain network parameters: expected PyParams");
+            return NULL; // NB: a bad item is reported, not silently dropped from the package.
+        }
+        pkg += gapi::networks(params);
+    }
+    return pyopencv_from(pkg);
+}
+
 static PyObject* pyopencv_cv_gapi_op(PyObject* , PyObject* py_args, PyObject*)
 {
     using namespace cv;
@@ -644,7 +784,6 @@ static PyObject* pyopencv_cv_gapi_op(PyObject* , PyObject* py_args, PyObject*)
         return NULL;
     }
     PyObject* outMeta = PyTuple_GetItem(py_args, 1);
-    Py_INCREF(outMeta);
 
     cv::GArgs args;
     for (int i = 2; i < size; i++)
@@ -684,13 +823,12 @@ static PyObject* pyopencv_cv_gapi_op(PyObject* , PyObject* py_args, PyObject*)
         }
         else
         {
-            Py_INCREF(item);
-            args.emplace_back(cv::GArg(item));
+            args.emplace_back(cv::GArg(cv::detail::PyObjectHolder{item}));
         }
     }
 
-    cv::GKernel::M outMetaWrapper = std::bind(python_meta,
-                                              outMeta,
+    cv::GKernel::M outMetaWrapper = std::bind(run_py_meta,
+                                              cv::detail::PyObjectHolder{outMeta},
                                               std::placeholders::_1,
                                               std::placeholders::_2);
     return pyopencv_from(cv::gapi::wip::op(id, outMetaWrapper, std::move(args)));
@@ -698,7 +836,7 @@ static PyObject* pyopencv_cv_gapi_op(PyObject* , PyObject* py_args, PyObject*)
 
 static PyObject* pyopencv_cv_gin(PyObject*, PyObject* py_args, PyObject*)
 {
-    Py_INCREF(py_args);
+    cv::detail::PyObjectHolder holder{py_args};
     auto callback = cv::detail::ExtractArgsCallback{[=](const cv::GTypesInfo& info)
         {
             PyGILState_STATE gstate;
@@ -707,7 +845,7 @@ static PyObject* pyopencv_cv_gin(PyObject*, PyObject* py_args, PyObject*)
             cv::GRunArgs args;
             try
             {
-                args = extract_run_args(info, py_args);
+                args = extract_run_args(info, holder.get());
             }
             catch (...)
             {
@@ -792,10 +930,11 @@ struct PyOpenCV_Converter<cv::GOpaque<T>>
 };
 
 
-// extend cv.gapi.wip. methods
-#define PYOPENCV_EXTRA_METHODS_GAPI_WIP \
+// extend cv.gapi methods
+#define PYOPENCV_EXTRA_METHODS_GAPI \
   {"kernels", CV_PY_FN_WITH_KW(pyopencv_cv_gapi_kernels), "kernels(...) -> GKernelPackage"}, \
-  {"op", CV_PY_FN_WITH_KW_(pyopencv_cv_gapi_op, 0), "kernels(...) -> retval\n"}, \
+  {"networks", CV_PY_FN_WITH_KW(pyopencv_cv_gapi_networks), "networks(...) -> GNetPackage"}, \
+  {"__op", CV_PY_FN_WITH_KW(pyopencv_cv_gapi_op), "__op(...) -> retval\n"},
 
 
 #endif  // HAVE_OPENCV_GAPI
diff --git a/modules/gapi/misc/python/python_bridge.hpp b/modules/gapi/misc/python/python_bridge.hpp
index 51f0ca8ab0..0d1c6d51c5 100644
--- a/modules/gapi/misc/python/python_bridge.hpp
+++ b/modules/gapi/misc/python/python_bridge.hpp
@@ -25,29 +25,31 @@
     }
 
 #define GARRAY_TYPE_LIST_G(G, G2) \
-WRAP_ARGS(bool        ,  cv::gapi::ArgType::CV_BOOL,    G) \
-WRAP_ARGS(int         ,  cv::gapi::ArgType::CV_INT,     G) \
-WRAP_ARGS(double      ,  cv::gapi::ArgType::CV_DOUBLE,  G) \
-WRAP_ARGS(float       ,  cv::gapi::ArgType::CV_FLOAT,   G) \
-WRAP_ARGS(std::string ,  cv::gapi::ArgType::CV_STRING,  G) \
-WRAP_ARGS(cv::Point   ,  cv::gapi::ArgType::CV_POINT,   G) \
-WRAP_ARGS(cv::Point2f ,  cv::gapi::ArgType::CV_POINT2F, G) \
-WRAP_ARGS(cv::Size    ,  cv::gapi::ArgType::CV_SIZE,    G) \
-WRAP_ARGS(cv::Rect    ,  cv::gapi::ArgType::CV_RECT,    G) \
-WRAP_ARGS(cv::Scalar  ,  cv::gapi::ArgType::CV_SCALAR,  G) \
-WRAP_ARGS(cv::Mat     ,  cv::gapi::ArgType::CV_MAT,     G) \
-WRAP_ARGS(cv::GMat    ,  cv::gapi::ArgType::CV_GMAT,    G2)
+WRAP_ARGS(bool        , cv::gapi::ArgType::CV_BOOL,    G)  \
+WRAP_ARGS(int         , cv::gapi::ArgType::CV_INT,     G)  \
+WRAP_ARGS(double      , cv::gapi::ArgType::CV_DOUBLE,  G)  \
+WRAP_ARGS(float       , cv::gapi::ArgType::CV_FLOAT,   G)  \
+WRAP_ARGS(std::string , cv::gapi::ArgType::CV_STRING,  G)  \
+WRAP_ARGS(cv::Point   , cv::gapi::ArgType::CV_POINT,   G)  \
+WRAP_ARGS(cv::Point2f , cv::gapi::ArgType::CV_POINT2F, G)  \
+WRAP_ARGS(cv::Size    , cv::gapi::ArgType::CV_SIZE,    G)  \
+WRAP_ARGS(cv::Rect    , cv::gapi::ArgType::CV_RECT,    G)  \
+WRAP_ARGS(cv::Scalar  , cv::gapi::ArgType::CV_SCALAR,  G)  \
+WRAP_ARGS(cv::Mat     , cv::gapi::ArgType::CV_MAT,     G)  \
+WRAP_ARGS(cv::GArg    , cv::gapi::ArgType::CV_ANY,     G)  \
+WRAP_ARGS(cv::GMat    , cv::gapi::ArgType::CV_GMAT,    G2) \
 
 #define GOPAQUE_TYPE_LIST_G(G, G2) \
-WRAP_ARGS(bool        ,  cv::gapi::ArgType::CV_BOOL,    G)  \
-WRAP_ARGS(int         ,  cv::gapi::ArgType::CV_INT,     G)  \
-WRAP_ARGS(double      ,  cv::gapi::ArgType::CV_DOUBLE,  G)  \
-WRAP_ARGS(float       ,  cv::gapi::ArgType::CV_FLOAT,   G)  \
-WRAP_ARGS(std::string ,  cv::gapi::ArgType::CV_STRING,  G)  \
-WRAP_ARGS(cv::Point   ,  cv::gapi::ArgType::CV_POINT,   G)  \
-WRAP_ARGS(cv::Point2f ,  cv::gapi::ArgType::CV_POINT2F, G)  \
-WRAP_ARGS(cv::Size    ,  cv::gapi::ArgType::CV_SIZE,    G)  \
-WRAP_ARGS(cv::Rect    ,  cv::gapi::ArgType::CV_RECT,    G2) \
+WRAP_ARGS(bool        , cv::gapi::ArgType::CV_BOOL,    G)  \
+WRAP_ARGS(int         , cv::gapi::ArgType::CV_INT,     G)  \
+WRAP_ARGS(double      , cv::gapi::ArgType::CV_DOUBLE,  G)  \
+WRAP_ARGS(float       , cv::gapi::ArgType::CV_FLOAT,   G)  \
+WRAP_ARGS(std::string , cv::gapi::ArgType::CV_STRING,  G)  \
+WRAP_ARGS(cv::Point   , cv::gapi::ArgType::CV_POINT,   G)  \
+WRAP_ARGS(cv::Point2f , cv::gapi::ArgType::CV_POINT2F, G)  \
+WRAP_ARGS(cv::Size    , cv::gapi::ArgType::CV_SIZE,    G)  \
+WRAP_ARGS(cv::GArg    , cv::gapi::ArgType::CV_ANY,     G)  \
+WRAP_ARGS(cv::Rect    , cv::gapi::ArgType::CV_RECT,    G2) \
 
 namespace cv {
 namespace gapi {
@@ -66,6 +68,7 @@ enum ArgType {
     CV_SCALAR,
     CV_MAT,
     CV_GMAT,
+    CV_ANY,
 };
 
 GAPI_EXPORTS_W inline cv::GInferOutputs infer(const String& name, const cv::GInferInputs& inputs)
diff --git a/modules/gapi/misc/python/shadow_gapi.hpp b/modules/gapi/misc/python/shadow_gapi.hpp
index bb82002069..40dab41581 100644
--- a/modules/gapi/misc/python/shadow_gapi.hpp
+++ b/modules/gapi/misc/python/shadow_gapi.hpp
@@ -58,7 +58,6 @@ namespace cv
 
    namespace gapi
    {
-       GAPI_EXPORTS_W gapi::GNetPackage networks(const cv::gapi::ie::PyParams& params);
        namespace wip
        {
            class GAPI_EXPORTS_W IStreamSource { };
diff --git a/modules/gapi/misc/python/test/test_gapi_sample_pipelines.py b/modules/gapi/misc/python/test/test_gapi_sample_pipelines.py
index b4440e48c5..2f921901db 100644
--- a/modules/gapi/misc/python/test/test_gapi_sample_pipelines.py
+++ b/modules/gapi/misc/python/test/test_gapi_sample_pipelines.py
@@ -3,523 +3,678 @@
 import numpy as np
 import cv2 as cv
 import os
+import sys
+import unittest
 
 from tests_common import NewOpenCVTests
 
 
-# Plaidml is an optional backend
-pkgs = [
-         ('ocl'    , cv.gapi.core.ocl.kernels()),
-         ('cpu'    , cv.gapi.core.cpu.kernels()),
-         ('fluid'  , cv.gapi.core.fluid.kernels())
-         # ('plaidml', cv.gapi.core.plaidml.kernels())
-     ]
+try:
 
-# Test output GMat.
-def custom_add(img1, img2, dtype):
-    return cv.add(img1, img2)
+    if sys.version_info[:2] < (3, 0):
+        raise unittest.SkipTest('Python 2.x is not supported')
 
-# Test output GScalar.
-def custom_mean(img):
-    return cv.mean(img)
-
-# Test output tuple of GMat's.
-def custom_split3(img):
-    # NB: cv.split return list but g-api requires tuple in multiple output case
-    return tuple(cv.split(img))
-
-# Test output GOpaque.
-def custom_size(img):
-    # NB: Take only H, W, because the operation should return cv::Size which is 2D.
-    return img.shape[:2]
-
-# Test output GArray.
-def custom_goodFeaturesToTrack(img, max_corners, quality_lvl,
-                               min_distance, mask, block_sz,
-                               use_harris_detector, k):
-    features = cv.goodFeaturesToTrack(img, max_corners, quality_lvl,
-                                      min_distance, mask=mask,
-                                      blockSize=block_sz,
-                                      useHarrisDetector=use_harris_detector, k=k)
-    # NB: The operation output is cv::GArray<cv::Pointf>, so it should be mapped
-    # to python paramaters like this: [(1.2, 3.4), (5.2, 3.2)], because the cv::Point2f
-    # according to opencv rules mapped to the tuple and cv::GArray<> mapped to the list.
-    # OpenCV returns np.array with shape (n_features, 1, 2), so let's to convert it to list
-    # tuples with size - n_features.
-    features = list(map(tuple, features.reshape(features.shape[0], -1)))
-    return features
-
-# Test input scalar.
-def custom_addC(img, sc, dtype):
-    # NB: dtype is just ignored in this implementation.
-    # More over from G-API kernel got scalar as tuples with 4 elements
-    # where the last element is equal to zero, just cut him for broadcasting.
-    return img + np.array(sc, dtype=np.uint8)[:-1]
+    # Plaidml is an optional backend
+    pkgs = [
+             ('ocl'    , cv.gapi.core.ocl.kernels()),
+             ('cpu'    , cv.gapi.core.cpu.kernels()),
+             ('fluid'  , cv.gapi.core.fluid.kernels())
+             # ('plaidml', cv.gapi.core.plaidml.kernels())
+           ]
 
 
-# Test input opaque.
-def custom_sizeR(rect):
-    # NB: rect - is tuple (x, y, h, w)
-    return (rect[2], rect[3])
+    @cv.gapi.op('custom.add', in_types=[cv.GMat, cv.GMat, int], out_types=[cv.GMat])
+    class GAdd:
+        """Calculates sum of two matrices."""
 
-# Test input array.
-def custom_boundingRect(array):
-    # NB: OpenCV - numpy array (n_points x 2).
-    #     G-API  - array of tuples (n_points).
-    return cv.boundingRect(np.array(array))
-
-# Test input mat
-def add(g_in1, g_in2, dtype):
-    def custom_add_meta(img_desc1, img_desc2, dtype):
-        return img_desc1
-
-    return cv.gapi.wip.op('custom.add', custom_add_meta, g_in1, g_in2, dtype).getGMat()
+        @staticmethod
+        def outMeta(desc1, desc2, depth):
+            return desc1
 
 
-# Test multiple output mat
-def split3(g_in):
-    def custom_split3_meta(img_desc):
-        out_desc = img_desc.withType(img_desc.depth, 1)
-        return out_desc, out_desc, out_desc
+    @cv.gapi.kernel(GAdd)
+    class GAddImpl:
+        """Implementation for GAdd operation."""
 
-    op = cv.gapi.wip.op('custom.split3', custom_split3_meta, g_in)
-
-    ch1 = op.getGMat()
-    ch2 = op.getGMat()
-    ch3 = op.getGMat()
-
-    return ch1, ch2, ch3
-
-# Test output scalar
-def mean(g_in):
-    def custom_mean_meta(img_desc):
-        return cv.empty_scalar_desc()
-
-    op = cv.gapi.wip.op('custom.mean', custom_mean_meta, g_in)
-    return op.getGScalar()
+        @staticmethod
+        def run(img1, img2, dtype):
+            return cv.add(img1, img2)
 
 
-# Test input scalar
-def addC(g_in, g_sc, dtype):
-    def custom_addC_meta(img_desc, sc_desc, dtype):
-        return img_desc
+    @cv.gapi.op('custom.split3', in_types=[cv.GMat], out_types=[cv.GMat, cv.GMat, cv.GMat])
+    class GSplit3:
+        """Divides a 3-channel matrix into 3 single-channel matrices."""
 
-    op = cv.gapi.wip.op('custom.addC', custom_addC_meta, g_in, g_sc, dtype)
-    return op.getGMat()
+        @staticmethod
+        def outMeta(desc):
+            out_desc = desc.withType(desc.depth, 1)
+            return out_desc, out_desc, out_desc
 
 
-# Test output opaque.
-def size(g_in):
-    def custom_size_meta(img_desc):
-        return cv.empty_gopaque_desc()
+    @cv.gapi.kernel(GSplit3)
+    class GSplit3Impl:
+        """Implementation for GSplit3 operation."""
 
-    op = cv.gapi.wip.op('custom.size', custom_size_meta, g_in)
-    return op.getGOpaque(cv.gapi.CV_SIZE)
+        @staticmethod
+        def run(img):
+            # NB: cv.split returns a list, but G-API requires a tuple in the multiple-output case.
+            return tuple(cv.split(img))
 
 
-# Test input opaque.
-def sizeR(g_rect):
-    def custom_sizeR_meta(opaque_desc):
-        return cv.empty_gopaque_desc()
+    @cv.gapi.op('custom.mean', in_types=[cv.GMat], out_types=[cv.GScalar])
+    class GMean:
+        """Calculates the mean value M of matrix elements."""
 
-    op = cv.gapi.wip.op('custom.sizeR', custom_sizeR_meta, g_rect)
-    return op.getGOpaque(cv.gapi.CV_SIZE)
+        @staticmethod
+        def outMeta(desc):
+            return cv.empty_scalar_desc()
 
 
-# Test input array.
-def boundingRect(g_array):
-    def custom_boundingRect_meta(array_desc):
-        return cv.empty_gopaque_desc()
+    @cv.gapi.kernel(GMean)
+    class GMeanImpl:
+        """Implementation for GMean operation."""
 
-    op = cv.gapi.wip.op('custom.boundingRect', custom_boundingRect_meta, g_array)
-    return op.getGOpaque(cv.gapi.CV_RECT)
+        @staticmethod
+        def run(img):
+            # NB: cv.mean returns a tuple; it is returned as-is for the GScalar output.
+            return cv.mean(img)
 
 
-# Test output GArray.
-def goodFeaturesToTrack(g_in, max_corners, quality_lvl,
-                        min_distance, mask, block_sz,
-                        use_harris_detector, k):
-    def custom_goodFeaturesToTrack_meta(img_desc, max_corners, quality_lvl,
-                                        min_distance, mask, block_sz, use_harris_detector, k):
-        return cv.empty_array_desc()
+    @cv.gapi.op('custom.addC', in_types=[cv.GMat, cv.GScalar, int], out_types=[cv.GMat])
+    class GAddC:
+        """Adds a given scalar value to each element of given matrix."""
 
-    op = cv.gapi.wip.op('custom.goodFeaturesToTrack', custom_goodFeaturesToTrack_meta, g_in,
-            max_corners, quality_lvl, min_distance, mask, block_sz, use_harris_detector, k)
-    return op.getGArray(cv.gapi.CV_POINT2F)
+        @staticmethod
+        def outMeta(mat_desc, scalar_desc, dtype):
+            return mat_desc
 
 
-class gapi_sample_pipelines(NewOpenCVTests):
+    @cv.gapi.kernel(GAddC)
+    class GAddCImpl:
+        """Implementation for GAddC operation."""
 
-    # NB: This test check multiple outputs for operation
-    def test_mean_over_r(self):
-        img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        in_mat = cv.imread(img_path)
+        @staticmethod
+        def run(img, sc, dtype):
+            # NB: dtype is just ignored in this implementation.
+            # Moreover, the kernel receives the scalar from G-API as a 4-element tuple
+            # whose last element is zero, so drop it to make broadcasting work.
+            return img + np.array(sc, dtype=np.uint8)[:-1]
 
-        # # OpenCV
-        _, _, r_ch = cv.split(in_mat)
-        expected = cv.mean(r_ch)
 
-        # G-API
-        g_in = cv.GMat()
-        b, g, r = cv.gapi.split3(g_in)
-        g_out = cv.gapi.mean(r)
-        comp = cv.GComputation(g_in, g_out)
+    @cv.gapi.op('custom.size', in_types=[cv.GMat], out_types=[cv.GOpaque.Size])
+    class GSize:
+        """Gets dimensions from input matrix."""
 
-        for pkg_name, pkg in pkgs:
+        @staticmethod
+        def outMeta(mat_desc):
+            return cv.empty_gopaque_desc()
+
+
+    @cv.gapi.kernel(GSize)
+    class GSizeImpl:
+        """Implementation for GSize operation."""
+
+        @staticmethod
+        def run(img):
+            # NB: Take only H, W, because the operation should return cv::Size which is 2D.
+            return img.shape[:2]
+
+
+    @cv.gapi.op('custom.sizeR', in_types=[cv.GOpaque.Rect], out_types=[cv.GOpaque.Size])
+    class GSizeR:
+        """Gets dimensions from rectangle."""
+
+        @staticmethod
+        def outMeta(opaq_desc):
+            return cv.empty_gopaque_desc()
+
+
+    @cv.gapi.kernel(GSizeR)
+    class GSizeRImpl:
+        """Implementation for GSizeR operation."""
+
+        @staticmethod
+        def run(rect):
+            # NB: rect is a tuple (x, y, w, h)
+            return (rect[2], rect[3])
+
+
+    @cv.gapi.op('custom.boundingRect', in_types=[cv.GArray.Point], out_types=[cv.GOpaque.Rect])
+    class GBoundingRect:
+        """Calculates minimal up-right bounding rectangle for the specified
+           point set or non-zero pixels of gray-scale image."""
+
+        @staticmethod
+        def outMeta(arr_desc):
+            return cv.empty_gopaque_desc()
+
+
+    @cv.gapi.kernel(GBoundingRect)
+    class GBoundingRectImpl:
+        """Implementation for GBoundingRect operation."""
+
+        @staticmethod
+        def run(array):
+            # NB: OpenCV - numpy array (n_points x 2).
+            #     G-API  - array of tuples (n_points).
+            return cv.boundingRect(np.array(array))
+
+
+    @cv.gapi.op('custom.goodFeaturesToTrack',
+                in_types=[cv.GMat, int, float, float, int, bool, float],
+                out_types=[cv.GArray.Point2f])
+    class GGoodFeatures:
+        """Finds the most prominent corners in the image
+           or in the specified image region."""
+
+        @staticmethod
+        def outMeta(desc, max_corners, quality_lvl,
+                    min_distance, block_sz,
+                    use_harris_detector, k):
+            return cv.empty_array_desc()
+
+
+    @cv.gapi.kernel(GGoodFeatures)
+    class GGoodFeaturesImpl:
+        """Implementation for GGoodFeatures operation."""
+
+        @staticmethod
+        def run(img, max_corners, quality_lvl,
+                min_distance, block_sz,
+                use_harris_detector, k):
+            features = cv.goodFeaturesToTrack(img, max_corners, quality_lvl,
+                                              min_distance, mask=None,
+                                              blockSize=block_sz,
+                                              useHarrisDetector=use_harris_detector, k=k)
+            # NB: The operation output is cv::GArray<cv::Point2f>, so it should be mapped
+            # to python parameters like this: [(1.2, 3.4), (5.2, 3.2)], because cv::Point2f
+            # is mapped to a tuple and cv::GArray<> to a list according to OpenCV rules.
+            # OpenCV returns np.array with shape (n_features, 1, 2), so convert it to a list
+            # of tuples with size == n_features.
+            features = list(map(tuple, features.reshape(features.shape[0], -1)))
+            return features
+
+
+    # To validate invalid cases
+    def create_op(in_types, out_types):
+        @cv.gapi.op('custom.op', in_types=in_types, out_types=out_types)
+        class Op:
+            """Custom operation for testing."""
+
+            @staticmethod
+            def outMeta(desc):
+                raise NotImplementedError("outMeta isn't imlemented")
+        return Op
+
+
+    class gapi_sample_pipelines(NewOpenCVTests):
+
+        def test_custom_op_add(self):
+            sz = (3, 3)
+            in_mat1 = np.full(sz, 45, dtype=np.uint8)
+            in_mat2 = np.full(sz, 50, dtype=np.uint8)
+
+            # OpenCV
+            expected = cv.add(in_mat1, in_mat2)
+
+            # G-API
+            g_in1  = cv.GMat()
+            g_in2  = cv.GMat()
+            g_out = GAdd.on(g_in1, g_in2, cv.CV_8UC1)
+
+            comp = cv.GComputation(cv.GIn(g_in1, g_in2), cv.GOut(g_out))
+
+            pkg = cv.gapi.kernels(GAddImpl)
+            actual = comp.apply(cv.gin(in_mat1, in_mat2), args=cv.compile_args(pkg))
+
+            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
+
+
+        def test_custom_op_split3(self):
+            sz = (4, 4)
+            in_ch1 = np.full(sz, 1, dtype=np.uint8)
+            in_ch2 = np.full(sz, 2, dtype=np.uint8)
+            in_ch3 = np.full(sz, 3, dtype=np.uint8)
+            # H x W x C
+            in_mat = np.stack((in_ch1, in_ch2, in_ch3), axis=2)
+
+            # G-API
+            g_in  = cv.GMat()
+            g_ch1, g_ch2, g_ch3 = GSplit3.on(g_in)
+
+            comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_ch1, g_ch2, g_ch3))
+
+            pkg = cv.gapi.kernels(GSplit3Impl)
+            ch1, ch2, ch3 = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
+
+            self.assertEqual(0.0, cv.norm(in_ch1, ch1, cv.NORM_INF))
+            self.assertEqual(0.0, cv.norm(in_ch2, ch2, cv.NORM_INF))
+            self.assertEqual(0.0, cv.norm(in_ch3, ch3, cv.NORM_INF))
+
+
+        def test_custom_op_mean(self):
+            img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            in_mat = cv.imread(img_path)
+
+            # OpenCV
+            expected = cv.mean(in_mat)
+
+            # G-API
+            g_in  = cv.GMat()
+            g_out = GMean.on(g_in)
+
+            comp = cv.GComputation(g_in, g_out)
+
+            pkg    = cv.gapi.kernels(GMeanImpl)
             actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
+
             # Comparison
-            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF),
-                             'Failed on ' + pkg_name + ' backend')
+            self.assertEqual(expected, actual)
 
 
-    def test_custom_mean(self):
-        img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        in_mat = cv.imread(img_path)
+        def test_custom_op_addC(self):
+            sz = (3, 3, 3)
+            in_mat = np.full(sz, 45, dtype=np.uint8)
+            sc = (50, 10, 20)
 
-        # OpenCV
-        expected = cv.mean(in_mat)
+            # Numpy reference, make array from sc to keep uint8 dtype.
+            expected = in_mat + np.array(sc, dtype=np.uint8)
 
-        # G-API
-        g_in = cv.GMat()
-        g_out = cv.gapi.mean(g_in)
+            # G-API
+            g_in  = cv.GMat()
+            g_sc  = cv.GScalar()
+            g_out = GAddC.on(g_in, g_sc, cv.CV_8UC1)
+            comp  = cv.GComputation(cv.GIn(g_in, g_sc), cv.GOut(g_out))
 
-        comp = cv.GComputation(g_in, g_out)
+            pkg = cv.gapi.kernels(GAddCImpl)
+            actual = comp.apply(cv.gin(in_mat, sc), args=cv.compile_args(pkg))
 
-        pkg    = cv.gapi.wip.kernels((custom_mean, 'org.opencv.core.math.mean'))
-        actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
+            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
-        # Comparison
-        self.assertEqual(expected, actual)
 
+        def test_custom_op_size(self):
+            sz = (100, 150, 3)
+            in_mat = np.full(sz, 45, dtype=np.uint8)
 
-    def test_custom_add(self):
-        sz = (3, 3)
-        in_mat1 = np.full(sz, 45, dtype=np.uint8)
-        in_mat2 = np.full(sz, 50 , dtype=np.uint8)
+            # OpenCV
+            expected = (100, 150)
 
-        # OpenCV
-        expected = cv.add(in_mat1, in_mat2)
+            # G-API
+            g_in = cv.GMat()
+            g_sz = GSize.on(g_in)
+            comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_sz))
 
-        # G-API
-        g_in1 = cv.GMat()
-        g_in2 = cv.GMat()
-        g_out = cv.gapi.add(g_in1, g_in2)
-        comp = cv.GComputation(cv.GIn(g_in1, g_in2), cv.GOut(g_out))
+            pkg = cv.gapi.kernels(GSizeImpl)
+            actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
 
-        pkg = cv.gapi.wip.kernels((custom_add, 'org.opencv.core.math.add'))
-        actual = comp.apply(cv.gin(in_mat1, in_mat2), args=cv.compile_args(pkg))
+            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
-        self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
+        def test_custom_op_sizeR(self):
+            # x, y, w, h
+            roi = (10, 15, 100, 150)
 
-    def test_custom_size(self):
-        sz = (100, 150, 3)
-        in_mat = np.full(sz, 45, dtype=np.uint8)
+            expected = (100, 150)
 
-        # OpenCV
-        expected = (100, 150)
+            # G-API
+            g_r  = cv.GOpaque.Rect()
+            g_sz = GSizeR.on(g_r)
+            comp = cv.GComputation(cv.GIn(g_r), cv.GOut(g_sz))
 
-        # G-API
-        g_in = cv.GMat()
-        g_sz = cv.gapi.streaming.size(g_in)
-        comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_sz))
+            pkg = cv.gapi.kernels(GSizeRImpl)
+            actual = comp.apply(cv.gin(roi), args=cv.compile_args(pkg))
 
-        pkg = cv.gapi.wip.kernels((custom_size, 'org.opencv.streaming.size'))
-        actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
+            # cv.norm works with tuples ?
+            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
-        self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
+        def test_custom_op_boundingRect(self):
+            points = [(0,0), (0,1), (1,0), (1,1)]
 
-    def test_custom_goodFeaturesToTrack(self):
-        # G-API
-        img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        in_mat = cv.cvtColor(cv.imread(img_path), cv.COLOR_RGB2GRAY)
+            # OpenCV
+            expected = cv.boundingRect(np.array(points))
 
-        # NB: goodFeaturesToTrack configuration
-        max_corners         = 50
-        quality_lvl         = 0.01
-        min_distance        = 10
-        block_sz            = 3
-        use_harris_detector = True
-        k                   = 0.04
-        mask                = None
+            # G-API
+            g_pts = cv.GArray.Point()
+            g_br  = GBoundingRect.on(g_pts)
+            comp  = cv.GComputation(cv.GIn(g_pts), cv.GOut(g_br))
 
-        # OpenCV
-        expected = cv.goodFeaturesToTrack(in_mat, max_corners, quality_lvl,
-                                          min_distance, mask=mask,
-                                          blockSize=block_sz, useHarrisDetector=use_harris_detector, k=k)
+            pkg = cv.gapi.kernels(GBoundingRectImpl)
+            actual = comp.apply(cv.gin(points), args=cv.compile_args(pkg))
 
-        # G-API
-        g_in = cv.GMat()
-        g_out = cv.gapi.goodFeaturesToTrack(g_in, max_corners, quality_lvl,
-                                            min_distance, mask, block_sz, use_harris_detector, k)
+            # cv.norm works with tuples ?
+            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
-        comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_out))
-        pkg = cv.gapi.wip.kernels((custom_goodFeaturesToTrack, 'org.opencv.imgproc.feature.goodFeaturesToTrack'))
-        actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
 
-        # NB: OpenCV & G-API have different output types.
-        # OpenCV - numpy array with shape (num_points, 1, 2)
-        # G-API  - list of tuples with size - num_points
-        # Comparison
-        self.assertEqual(0.0, cv.norm(expected.flatten(),
-                                      np.array(actual, dtype=np.float32).flatten(), cv.NORM_INF))
+        def test_custom_op_goodFeaturesToTrack(self):
+            # G-API
+            img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            in_mat = cv.cvtColor(cv.imread(img_path), cv.COLOR_RGB2GRAY)
 
+            # NB: goodFeaturesToTrack configuration
+            max_corners         = 50
+            quality_lvl         = 0.01
+            min_distance        = 10.0
+            block_sz            = 3
+            use_harris_detector = True
+            k                   = 0.04
 
-    def test_custom_addC(self):
-        sz = (3, 3, 3)
-        in_mat = np.full(sz, 45, dtype=np.uint8)
-        sc = (50, 10, 20)
+            # OpenCV
+            expected = cv.goodFeaturesToTrack(in_mat, max_corners, quality_lvl,
+                                              min_distance, mask=None,
+                                              blockSize=block_sz, useHarrisDetector=use_harris_detector, k=k)
 
-        # Numpy reference, make array from sc to keep uint8 dtype.
-        expected = in_mat + np.array(sc, dtype=np.uint8)
+            # G-API
+            g_in = cv.GMat()
+            g_out = GGoodFeatures.on(g_in, max_corners, quality_lvl,
+                                     min_distance, block_sz, use_harris_detector, k)
 
-        # G-API
-        g_in = cv.GMat()
-        g_sc = cv.GScalar()
-        g_out = cv.gapi.addC(g_in, g_sc)
-        comp = cv.GComputation(cv.GIn(g_in, g_sc), cv.GOut(g_out))
+            comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_out))
+            pkg = cv.gapi.kernels(GGoodFeaturesImpl)
+            actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
 
-        pkg = cv.gapi.wip.kernels((custom_addC, 'org.opencv.core.math.addC'))
-        actual = comp.apply(cv.gin(in_mat, sc), args=cv.compile_args(pkg))
+            # NB: OpenCV & G-API have different output types.
+            # OpenCV - numpy array with shape (num_points, 1, 2)
+            # G-API  - list of tuples with size - num_points
+            # Comparison
+            self.assertEqual(0.0, cv.norm(expected.flatten(),
+                                          np.array(actual, dtype=np.float32).flatten(), cv.NORM_INF))
 
-        self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
+        def test_invalid_op(self):
+            # NB: Empty input types list
+            with self.assertRaises(Exception): create_op(in_types=[], out_types=[cv.GMat])
+            # NB: Empty output types list
+            with self.assertRaises(Exception): create_op(in_types=[cv.GMat], out_types=[])
 
-    def test_custom_sizeR(self):
-        # x, y, h, w
-        roi = (10, 15, 100, 150)
+            # Invalid output types
+            with self.assertRaises(Exception): create_op(in_types=[cv.GMat], out_types=[int])
+            with self.assertRaises(Exception): create_op(in_types=[cv.GMat], out_types=[cv.GMat, int])
+            with self.assertRaises(Exception): create_op(in_types=[cv.GMat], out_types=[str, cv.GScalar])
 
-        expected = (100, 150)
 
-        # G-API
-        g_r  = cv.GOpaqueT(cv.gapi.CV_RECT)
-        g_sz = cv.gapi.streaming.size(g_r)
-        comp = cv.GComputation(cv.GIn(g_r), cv.GOut(g_sz))
+        def test_invalid_op_input(self):
+            # NB: Check GMat/GScalar
+            with self.assertRaises(Exception): create_op([cv.GMat]   , [cv.GScalar]).on(cv.GScalar())
+            with self.assertRaises(Exception): create_op([cv.GScalar], [cv.GScalar]).on(cv.GMat())
 
-        pkg = cv.gapi.wip.kernels((custom_sizeR, 'org.opencv.streaming.sizeR'))
-        actual = comp.apply(cv.gin(roi), args=cv.compile_args(pkg))
+            # NB: Check GOpaque
+            op = create_op([cv.GOpaque.Rect], [cv.GMat])
+            with self.assertRaises(Exception): op.on(cv.GOpaque.Bool())
+            with self.assertRaises(Exception): op.on(cv.GOpaque.Int())
+            with self.assertRaises(Exception): op.on(cv.GOpaque.Double())
+            with self.assertRaises(Exception): op.on(cv.GOpaque.Float())
+            with self.assertRaises(Exception): op.on(cv.GOpaque.String())
+            with self.assertRaises(Exception): op.on(cv.GOpaque.Point())
+            with self.assertRaises(Exception): op.on(cv.GOpaque.Point2f())
+            with self.assertRaises(Exception): op.on(cv.GOpaque.Size())
 
-        # cv.norm works with tuples ?
-        self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
+            # NB: Check GArray
+            op = create_op([cv.GArray.Rect], [cv.GMat])
+            with self.assertRaises(Exception): op.on(cv.GArray.Bool())
+            with self.assertRaises(Exception): op.on(cv.GArray.Int())
+            with self.assertRaises(Exception): op.on(cv.GArray.Double())
+            with self.assertRaises(Exception): op.on(cv.GArray.Float())
+            with self.assertRaises(Exception): op.on(cv.GArray.String())
+            with self.assertRaises(Exception): op.on(cv.GArray.Point())
+            with self.assertRaises(Exception): op.on(cv.GArray.Point2f())
+            with self.assertRaises(Exception): op.on(cv.GArray.Size())
 
+            # Check other possible invalid options
+            with self.assertRaises(Exception): op.on(cv.GMat())
+            with self.assertRaises(Exception): op.on(cv.GScalar())
 
-    def test_custom_boundingRect(self):
-        points = [(0,0), (0,1), (1,0), (1,1)]
+            with self.assertRaises(Exception): op.on(1)
+            with self.assertRaises(Exception): op.on('foo')
+            with self.assertRaises(Exception): op.on(False)
 
-        # OpenCV
-        expected = cv.boundingRect(np.array(points))
+            with self.assertRaises(Exception): create_op([cv.GMat, int], [cv.GMat]).on(cv.GMat(), 'foo')
+            with self.assertRaises(Exception): create_op([cv.GMat, int], [cv.GMat]).on(cv.GMat())
 
-        # G-API
-        g_pts = cv.GArrayT(cv.gapi.CV_POINT)
-        g_br  = cv.gapi.boundingRect(g_pts)
-        comp = cv.GComputation(cv.GIn(g_pts), cv.GOut(g_br))
 
-        pkg = cv.gapi.wip.kernels((custom_boundingRect, 'org.opencv.imgproc.shape.boundingRectVector32S'))
-        actual = comp.apply(cv.gin(points), args=cv.compile_args(pkg))
+        def test_stateful_kernel(self):
+            @cv.gapi.op('custom.sum', in_types=[cv.GArray.Int], out_types=[cv.GOpaque.Int])
+            class GSum:
+                @staticmethod
+                def outMeta(arr_desc):
+                    return cv.empty_gopaque_desc()
 
-        # cv.norm works with tuples ?
-        self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
+            @cv.gapi.kernel(GSum)
+            class GSumImpl:
+                last_result = 0
 
-    def test_multiple_custom_kernels(self):
-        sz = (3, 3, 3)
-        in_mat1 = np.full(sz, 45, dtype=np.uint8)
-        in_mat2 = np.full(sz, 50 , dtype=np.uint8)
+                @staticmethod
+                def run(arr):
+                    GSumImpl.last_result = sum(arr)
+                    return GSumImpl.last_result
 
-        # OpenCV
-        expected = cv.mean(cv.split(cv.add(in_mat1, in_mat2))[1])
 
-        # G-API
-        g_in1 = cv.GMat()
-        g_in2 = cv.GMat()
-        g_sum = cv.gapi.add(g_in1, g_in2)
-        g_b, g_r, g_g = cv.gapi.split3(g_sum)
-        g_mean = cv.gapi.mean(g_b)
+            g_in  = cv.GArray.Int()
+            comp  = cv.GComputation(cv.GIn(g_in), cv.GOut(GSum.on(g_in)))
 
-        comp = cv.GComputation(cv.GIn(g_in1, g_in2), cv.GOut(g_mean))
+            s = comp.apply(cv.gin([1, 2, 3, 4]), args=cv.compile_args(cv.gapi.kernels(GSumImpl)))
+            self.assertEqual(10, s)
 
+            s = comp.apply(cv.gin([1, 2, 8, 7]), args=cv.compile_args(cv.gapi.kernels(GSumImpl)))
+            self.assertEqual(18, s)
 
-        pkg = cv.gapi.wip.kernels((custom_add   , 'org.opencv.core.math.add'),
-                         (custom_mean  , 'org.opencv.core.math.mean'),
-                         (custom_split3, 'org.opencv.core.transform.split3'))
+            self.assertEqual(18, GSumImpl.last_result)
 
-        actual = comp.apply(cv.gin(in_mat1, in_mat2), args=cv.compile_args(pkg))
 
-        self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
+        def test_opaq_with_custom_type(self):
+            @cv.gapi.op('custom.op', in_types=[cv.GOpaque.Any, cv.GOpaque.String], out_types=[cv.GOpaque.Any])
+            class GLookUp:
+                @staticmethod
+                def outMeta(opaq_desc0, opaq_desc1):
+                    return cv.empty_gopaque_desc()
 
+            @cv.gapi.kernel(GLookUp)
+            class GLookUpImpl:
+                @staticmethod
+                def run(table, key):
+                    return table[key]
 
-    def test_custom_op_add(self):
-        sz = (3, 3)
-        in_mat1 = np.full(sz, 45, dtype=np.uint8)
-        in_mat2 = np.full(sz, 50, dtype=np.uint8)
 
-        # OpenCV
-        expected = cv.add(in_mat1, in_mat2)
+            g_table = cv.GOpaque.Any()
+            g_key   = cv.GOpaque.String()
+            g_out   = GLookUp.on(g_table, g_key)
 
-        # G-API
-        g_in1  = cv.GMat()
-        g_in2  = cv.GMat()
-        g_out = add(g_in1, g_in2, cv.CV_8UC1)
+            comp = cv.GComputation(cv.GIn(g_table, g_key), cv.GOut(g_out))
 
-        comp = cv.GComputation(cv.GIn(g_in1, g_in2), cv.GOut(g_out))
+            table = {
+                        'int':   42,
+                        'str':   'hello, world!',
+                        'tuple': (42, 42)
+                    }
 
-        pkg = cv.gapi.wip.kernels((custom_add, 'custom.add'))
-        actual = comp.apply(cv.gin(in_mat1, in_mat2), args=cv.compile_args(pkg))
+            out = comp.apply(cv.gin(table, 'int'), args=cv.compile_args(cv.gapi.kernels(GLookUpImpl)))
+            self.assertEqual(42, out)
 
-        self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
+            out = comp.apply(cv.gin(table, 'str'), args=cv.compile_args(cv.gapi.kernels(GLookUpImpl)))
+            self.assertEqual('hello, world!', out)
 
+            out = comp.apply(cv.gin(table, 'tuple'), args=cv.compile_args(cv.gapi.kernels(GLookUpImpl)))
+            self.assertEqual((42, 42), out)
 
-    def test_custom_op_split3(self):
-        sz = (4, 4)
-        in_ch1 = np.full(sz, 1, dtype=np.uint8)
-        in_ch2 = np.full(sz, 2, dtype=np.uint8)
-        in_ch3 = np.full(sz, 3, dtype=np.uint8)
-        # H x W x C
-        in_mat = np.stack((in_ch1, in_ch2, in_ch3), axis=2)
 
-        # G-API
-        g_in  = cv.GMat()
-        g_ch1, g_ch2, g_ch3 = split3(g_in)
+        def test_array_with_custom_type(self):
+            @cv.gapi.op('custom.op', in_types=[cv.GArray.Any, cv.GArray.Any], out_types=[cv.GArray.Any])
+            class GConcat:
+                @staticmethod
+                def outMeta(arr_desc0, arr_desc1):
+                    return cv.empty_array_desc()
 
-        comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_ch1, g_ch2, g_ch3))
+            @cv.gapi.kernel(GConcat)
+            class GConcatImpl:
+                @staticmethod
+                def run(arr0, arr1):
+                    return arr0 + arr1
 
-        pkg = cv.gapi.wip.kernels((custom_split3, 'custom.split3'))
-        ch1, ch2, ch3 = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
+            g_arr0 = cv.GArray.Any()
+            g_arr1 = cv.GArray.Any()
+            g_out  = GConcat.on(g_arr0, g_arr1)
 
-        self.assertEqual(0.0, cv.norm(in_ch1, ch1, cv.NORM_INF))
-        self.assertEqual(0.0, cv.norm(in_ch2, ch2, cv.NORM_INF))
-        self.assertEqual(0.0, cv.norm(in_ch3, ch3, cv.NORM_INF))
+            comp = cv.GComputation(cv.GIn(g_arr0, g_arr1), cv.GOut(g_out))
 
+            arr0 = [(2, 2), 2.0]
+            arr1 = [3,    'str']
 
-    def test_custom_op_mean(self):
-        img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        in_mat = cv.imread(img_path)
+            out = comp.apply(cv.gin(arr0, arr1),
+                             args=cv.compile_args(cv.gapi.kernels(GConcatImpl)))
 
-        # OpenCV
-        expected = cv.mean(in_mat)
+            self.assertEqual(arr0 + arr1, out)
 
-        # G-API
-        g_in  = cv.GMat()
-        g_out = mean(g_in)
 
-        comp = cv.GComputation(g_in, g_out)
+        def test_raise_in_kernel(self):
+            @cv.gapi.op('custom.op', in_types=[cv.GMat, cv.GMat], out_types=[cv.GMat])
+            class GAdd:
+                @staticmethod
+                def outMeta(desc0, desc1):
+                    return desc0
 
-        pkg    = cv.gapi.wip.kernels((custom_mean, 'custom.mean'))
-        actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
+            @cv.gapi.kernel(GAdd)
+            class GAddImpl:
+                @staticmethod
+                def run(img0, img1):
+                    raise Exception('Error')
+                    return img0 + img1
 
-        # Comparison
-        self.assertEqual(expected, actual)
+            g_in0 = cv.GMat()
+            g_in1 = cv.GMat()
+            g_out = GAdd.on(g_in0, g_in1)
 
+            comp = cv.GComputation(cv.GIn(g_in0, g_in1), cv.GOut(g_out))
 
-    def test_custom_op_addC(self):
-        sz = (3, 3, 3)
-        in_mat = np.full(sz, 45, dtype=np.uint8)
-        sc = (50, 10, 20)
+            img0 = np.array([1, 2, 3])
+            img1 = np.array([1, 2, 3])
 
-        # Numpy reference, make array from sc to keep uint8 dtype.
-        expected = in_mat + np.array(sc, dtype=np.uint8)
+            with self.assertRaises(Exception): comp.apply(cv.gin(img0, img1),
+                                                          args=cv.compile_args(
+                                                              cv.gapi.kernels(GAddImpl)))
 
-        # G-API
-        g_in  = cv.GMat()
-        g_sc  = cv.GScalar()
-        g_out = addC(g_in, g_sc, cv.CV_8UC1)
-        comp  = cv.GComputation(cv.GIn(g_in, g_sc), cv.GOut(g_out))
 
-        pkg = cv.gapi.wip.kernels((custom_addC, 'custom.addC'))
-        actual = comp.apply(cv.gin(in_mat, sc), args=cv.compile_args(pkg))
+        def test_raise_in_outMeta(self):
+            @cv.gapi.op('custom.op', in_types=[cv.GMat, cv.GMat], out_types=[cv.GMat])
+            class GAdd:
+                @staticmethod
+                def outMeta(desc0, desc1):
+                    raise NotImplementedError("outMeta isn't implemented")
+
+            @cv.gapi.kernel(GAdd)
+            class GAddImpl:
+                @staticmethod
+                def run(img0, img1):
+                    return img0 + img1
+
+            g_in0 = cv.GMat()
+            g_in1 = cv.GMat()
+            g_out = GAdd.on(g_in0, g_in1)
 
-        self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
+            comp = cv.GComputation(cv.GIn(g_in0, g_in1), cv.GOut(g_out))
 
+            img0 = np.array([1, 2, 3])
+            img1 = np.array([1, 2, 3])
 
-    def test_custom_op_size(self):
-        sz = (100, 150, 3)
-        in_mat = np.full(sz, 45, dtype=np.uint8)
+            with self.assertRaises(Exception): comp.apply(cv.gin(img0, img1),
+                                                          args=cv.compile_args(
+                                                              cv.gapi.kernels(GAddImpl)))
 
-        # Open_cV
-        expected = (100, 150)
 
-        # G-API
-        g_in = cv.GMat()
-        g_sz = size(g_in)
-        comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_sz))
+        def test_invalid_outMeta(self):
+            @cv.gapi.op('custom.op', in_types=[cv.GMat, cv.GMat], out_types=[cv.GMat])
+            class GAdd:
+                @staticmethod
+                def outMeta(desc0, desc1):
+                    # Invalid outMeta
+                    return cv.empty_gopaque_desc()
 
-        pkg = cv.gapi.wip.kernels((custom_size, 'custom.size'))
-        actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
+            @cv.gapi.kernel(GAdd)
+            class GAddImpl:
+                @staticmethod
+                def run(img0, img1):
+                    return img0 + img1
 
-        self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
+            g_in0 = cv.GMat()
+            g_in1 = cv.GMat()
+            g_out = GAdd.on(g_in0, g_in1)
+
+            comp = cv.GComputation(cv.GIn(g_in0, g_in1), cv.GOut(g_out))
 
+            img0 = np.array([1, 2, 3])
+            img1 = np.array([1, 2, 3])
 
-    def test_custom_op_sizeR(self):
-        # x, y, h, w
-        roi = (10, 15, 100, 150)
+            # FIXME: Causes "Bad variant access".
+            # Need to provide a more descriptive error message.
+            with self.assertRaises(Exception): comp.apply(cv.gin(img0, img1),
+                                                          args=cv.compile_args(
+                                                              cv.gapi.kernels(GAddImpl)))
 
-        expected = (100, 150)
+        def test_pipeline_with_custom_kernels(self):
+            @cv.gapi.op('custom.resize', in_types=[cv.GMat, tuple], out_types=[cv.GMat])
+            class GResize:
+                @staticmethod
+                def outMeta(desc, size):
+                    return desc.withSize(size)
 
-        # G-API
-        g_r  = cv.GOpaqueT(cv.gapi.CV_RECT)
-        g_sz = sizeR(g_r)
-        comp = cv.GComputation(cv.GIn(g_r), cv.GOut(g_sz))
+            @cv.gapi.kernel(GResize)
+            class GResizeImpl:
+                @staticmethod
+                def run(img, size):
+                    return cv.resize(img, size)
 
-        pkg = cv.gapi.wip.kernels((custom_sizeR, 'custom.sizeR'))
-        actual = comp.apply(cv.gin(roi), args=cv.compile_args(pkg))
+            @cv.gapi.op('custom.transpose', in_types=[cv.GMat, tuple], out_types=[cv.GMat])
+            class GTranspose:
+                @staticmethod
+                def outMeta(desc, order):
+                    return desc
+
+            @cv.gapi.kernel(GTranspose)
+            class GTransposeImpl:
+                @staticmethod
+                def run(img, order):
+                    return np.transpose(img, order)
+
+            img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            img      = cv.imread(img_path)
+            size     = (32, 32)
+            order    = (1, 0, 2)
 
-        # cv.norm works with tuples ?
-        self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
+            # Dummy pipeline just to validate this case:
+            # gapi -> custom -> custom -> gapi
 
+            # OpenCV
+            expected = cv.cvtColor(img, cv.COLOR_BGR2RGB)
+            expected = cv.resize(expected, size)
+            expected = np.transpose(expected, order)
+            expected = cv.mean(expected)
 
-    def test_custom_op_boundingRect(self):
-        points = [(0,0), (0,1), (1,0), (1,1)]
+            # G-API
+            g_bgr        = cv.GMat()
+            g_rgb        = cv.gapi.BGR2RGB(g_bgr)
+            g_resized    = GResize.on(g_rgb, size)
+            g_transposed = GTranspose.on(g_resized, order)
+            g_mean       = cv.gapi.mean(g_transposed)
+
+            comp = cv.GComputation(cv.GIn(g_bgr), cv.GOut(g_mean))
+            actual = comp.apply(cv.gin(img), args=cv.compile_args(
+                cv.gapi.kernels(GResizeImpl, GTransposeImpl)))
+
+            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
+
+
+except unittest.SkipTest as e:
 
-        # OpenCV
-        expected = cv.boundingRect(np.array(points))
+    message = str(e)
 
-        # G-API
-        g_pts = cv.GArrayT(cv.gapi.CV_POINT)
-        g_br  = boundingRect(g_pts)
-        comp = cv.GComputation(cv.GIn(g_pts), cv.GOut(g_br))
+    class TestSkip(unittest.TestCase):
+        def setUp(self):
+            self.skipTest('Skip tests: ' + message)
 
-        pkg = cv.gapi.wip.kernels((custom_boundingRect, 'custom.boundingRect'))
-        actual = comp.apply(cv.gin(points), args=cv.compile_args(pkg))
-
-        # cv.norm works with tuples ?
-        self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
-
-
-    def test_custom_op_goodFeaturesToTrack(self):
-        # G-API
-        img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        in_mat = cv.cvtColor(cv.imread(img_path), cv.COLOR_RGB2GRAY)
-
-        # NB: goodFeaturesToTrack configuration
-        max_corners         = 50
-        quality_lvl         = 0.01
-        min_distance        = 10
-        block_sz            = 3
-        use_harris_detector = True
-        k                   = 0.04
-        mask                = None
-
-        # OpenCV
-        expected = cv.goodFeaturesToTrack(in_mat, max_corners, quality_lvl,
-                                          min_distance, mask=mask,
-                                          blockSize=block_sz, useHarrisDetector=use_harris_detector, k=k)
-
-        # G-API
-        g_in = cv.GMat()
-        g_out = goodFeaturesToTrack(g_in, max_corners, quality_lvl,
-                                    min_distance, mask, block_sz, use_harris_detector, k)
-
-        comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_out))
-        pkg = cv.gapi.wip.kernels((custom_goodFeaturesToTrack, 'custom.goodFeaturesToTrack'))
-        actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
-
-        # NB: OpenCV & G-API have different output types.
-        # OpenCV - numpy array with shape (num_points, 1, 2)
-        # G-API  - list of tuples with size - num_points
-        # Comparison
-        self.assertEqual(0.0, cv.norm(expected.flatten(),
-                                      np.array(actual, dtype=np.float32).flatten(), cv.NORM_INF))
+        def test_skip(self):  # placeholder test; setUp() reports the skip before this body runs
+            pass
+
+    pass
 
 
 if __name__ == '__main__':
diff --git a/modules/gapi/perf/common/gapi_core_perf_tests.hpp b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
index be88e5a721..3fd543baed 100644
--- a/modules/gapi/perf/common/gapi_core_perf_tests.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
@@ -79,6 +79,7 @@ namespace opencv_test
                                                          cv::GCompileArgs>> {};
     class KMeans3DPerfTest : public TestPerfParams<tuple<int, int, cv::KmeansFlags,
                                                          cv::GCompileArgs>> {};
+    class TransposePerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {};
     class ResizePerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, cv::Size, cv::GCompileArgs>> {};
     class ResizeFxFyPerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, double, double, cv::GCompileArgs>> {};
     class ParseSSDBLPerfTest : public TestPerfParams<tuple<cv::Size, float, int, cv::GCompileArgs>>, public ParserSSDTest {};
diff --git a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
index 7fe0ec4c26..6dfc0b2e2f 100644
--- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
@@ -2036,6 +2036,42 @@ PERF_TEST_P_(KMeans3DPerfTest, TestPerformance)
 
 //------------------------------------------------------------------------------
 
+PERF_TEST_P_(TransposePerfTest, TestPerformance)
+{
+    compare_f cmpF;
+    cv::Size sz_in;
+    MatType type = -1;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, sz_in, type, compile_args) = GetParam();
+
+    initMatrixRandU(type, sz_in, type, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::transpose(in_mat1, out_mat_ocv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::transpose(in);
+    cv::GComputation c(cv::GIn(in), cv::GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(cv::gin(in_mat1), cv::gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(cv::gin(in_mat1), cv::gout(out_mat_gapi));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
 PERF_TEST_P_(ResizePerfTest, TestPerformance)
 {
     compare_f cmpF = get<0>(GetParam());
diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
index 8385169050..871d41792b 100644
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
@@ -311,6 +311,14 @@ INSTANTIATE_TEST_CASE_P(KMeans3DPerfTestCPU, KMeans3DPerfTest,
                                        cv::KMEANS_PP_CENTERS     | cv::KMEANS_USE_INITIAL_LABELS),
                                 Values(cv::compile_args(CORE_CPU))));
 
+INSTANTIATE_TEST_CASE_P(TransposePerfTestCPU, TransposePerfTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(szSmall128, szVGA, sz720p, sz1080p),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1,
+                                       CV_8UC2, CV_16UC2, CV_16SC2, CV_32FC2,
+                                       CV_8UC3, CV_16UC3, CV_16SC3, CV_32FC3),
+                                Values(cv::compile_args(CORE_CPU))));
+
 INSTANTIATE_TEST_CASE_P(ResizePerfTestCPU, ResizePerfTest,
     Combine(Values(AbsExact().to_compare_f()),
         Values(CV_8UC1, CV_16UC1, CV_16SC1),
diff --git a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
index 955799634c..4a38fdb27b 100644
--- a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
+++ b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
@@ -276,6 +276,14 @@ INSTANTIATE_TEST_CASE_P(ConvertToPerfTestGPU, ConvertToPerfTest,
                                 Values(0.0),
                                 Values(cv::compile_args(CORE_GPU))));
 
+INSTANTIATE_TEST_CASE_P(TransposePerfTestGPU, TransposePerfTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(szSmall128, szVGA, sz720p, sz1080p),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1,
+                                       CV_8UC2, CV_16UC2, CV_16SC2, CV_32FC2,
+                                       CV_8UC3, CV_16UC3, CV_16SC3, CV_32FC3),
+                                Values(cv::compile_args(CORE_GPU))));
+
 INSTANTIATE_TEST_CASE_P(ResizePerfTestGPU, ResizePerfTest,
                         Combine(Values(AbsSimilarPoints(2, 0.05).to_compare_f()),
                                 Values(CV_8UC1, CV_16UC1, CV_16SC1),
diff --git a/modules/gapi/samples/face_detection_mtcnn.cpp b/modules/gapi/samples/face_detection_mtcnn.cpp
new file mode 100644
index 0000000000..6e88a9ec56
--- /dev/null
+++ b/modules/gapi/samples/face_detection_mtcnn.cpp
@@ -0,0 +1,740 @@
+#include <algorithm>
+#include <cctype>
+#include <cmath>
+#include <iostream>
+#include <limits>
+#include <numeric>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <opencv2/gapi.hpp>
+#include <opencv2/gapi/core.hpp>
+#include <opencv2/gapi/imgproc.hpp>
+#include <opencv2/gapi/cpu/gcpukernel.hpp>
+#include <opencv2/gapi/infer.hpp>
+#include <opencv2/gapi/infer/ie.hpp>
+#include <opencv2/gapi/streaming/cap.hpp>
+#include <opencv2/gapi/gopaque.hpp>
+#include <opencv2/highgui.hpp>
+
+// Sample description and the cv::CommandLineParser option table: { name | default | help }.
+const std::string about = "This is an OpenCV-based version of OMZ MTCNN Face Detection example";
+const std::string keys =
+"{ h help           |                           | Print this help message }"
+"{ input            |                           | Path to the input video file }"
+"{ mtcnnpm          | mtcnn-p.xml               | Path to OpenVINO MTCNN P (Proposal) detection model (.xml)}"
+"{ mtcnnpd          | CPU                       | Target device for the MTCNN P (e.g. CPU, GPU, VPU, ...) }"
+"{ mtcnnrm          | mtcnn-r.xml               | Path to OpenVINO MTCNN R (Refinement) detection model (.xml)}"
+"{ mtcnnrd          | CPU                       | Target device for the MTCNN R (e.g. CPU, GPU, VPU, ...) }"
+"{ mtcnnom          | mtcnn-o.xml               | Path to OpenVINO MTCNN O (Output) detection model (.xml)}"
+"{ mtcnnod          | CPU                       | Target device for the MTCNN O (e.g. CPU, GPU, VPU, ...) }"
+"{ thrp             | 0.6                       | MTCNN P confidence threshold}"
+"{ thrr             | 0.7                       | MTCNN R confidence threshold}"
+"{ thro             | 0.7                       | MTCNN O confidence threshold}"
+"{ half_scale       | false                     | MTCNN P use half scale pyramid}"
+"{ queue_capacity   | 1                         | Streaming executor queue capacity. Calculated automatically if 0}"
+;
+
+namespace {
+// Returns model_path with its ".xml" suffix (any letter case, asserted) replaced by ".bin".
+std::string weights_path(const std::string& model_path) {
+    const auto ext_len = 4u;
+    const auto path_len = model_path.size();
+    CV_Assert(path_len > ext_len);
+    const auto stem_len = path_len - ext_len;
+    CV_Assert(cv::toLowerCase(model_path.substr(stem_len)) == ".xml");
+    return model_path.substr(0u, stem_len) + ".bin";
+}
+//////////////////////////////////////////////////////////////////////
+} // anonymous namespace
+
+namespace custom {
+namespace {
+
+// Sizes of the per-face data held by the custom structures and operations below
+#define NUM_REGRESSIONS 4 // length of Face::regression
+#define NUM_PTS 5 // Face::ptsCoords holds 2 * NUM_PTS floats
+
+// Axis-aligned box stored as the corner (x1, y1) and the opposite corner (x2, y2).
+struct BBox {
+    int x1;
+    int y1;
+    int x2;
+    int y2;
+
+    // Rectangle anchored at (x1, y1) with size (x2 - x1) x (y2 - y1).
+    cv::Rect getRect() const { return cv::Rect(x1, y1, x2 - x1, y2 - y1); }
+
+    // Box whose sides both equal the longer side of this one; the origin moves by half the size difference.
+    BBox getSquare() const {
+        const float width = static_cast<float>(x2 - x1);
+        const float height = static_cast<float>(y2 - y1);
+        const float side = std::max(width, height);
+        BBox squared;
+        squared.x1 = static_cast<int>(static_cast<float>(x1) + (width - side) * 0.5f);
+        squared.y1 = static_cast<int>(static_cast<float>(y1) + (height - side) * 0.5f);
+        squared.x2 = static_cast<int>(static_cast<float>(squared.x1) + side);
+        squared.y2 = static_cast<int>(static_cast<float>(squared.y1) + side);
+        return squared;
+    }
+};
+
+// One face candidate: box, score, box regression values and landmark coordinates.
+// Members are zero-initialized so that partially filled candidates copy safely.
+struct Face {
+    BBox bbox{};
+    float score = 0.0f;
+    std::array<float, NUM_REGRESSIONS> regression{};
+    std::array<float, 2 * NUM_PTS> ptsCoords{};
+
+    // Moves every box corner in place by a regression value scaled with the box
+    // size: regression[1] / [3] shift x1 / x2 by the width, regression[0] / [2]
+    // shift y1 / y2 by the height. `addOne` adds 1 to both sizes beforehand.
+    static void applyRegression(std::vector<Face>& faces, bool addOne = false) {
+        const float extra = static_cast<float>(addOne);
+        for (auto& face : faces) {
+            BBox& box = face.bbox;
+            const float bboxWidth = box.x2 - box.x1 + extra;
+            const float bboxHeight = box.y2 - box.y1 + extra;
+            box.x1 = static_cast<int>(static_cast<float>(box.x1) + (face.regression[1] * bboxWidth));
+            box.y1 = static_cast<int>(static_cast<float>(box.y1) + (face.regression[0] * bboxHeight));
+            box.x2 = static_cast<int>(static_cast<float>(box.x2) + (face.regression[3] * bboxWidth));
+            box.y2 = static_cast<int>(static_cast<float>(box.y2) + (face.regression[2] * bboxHeight));
+        }
+    }
+
+    // Replaces every box in place by its BBox::getSquare() counterpart.
+    static void bboxes2Squares(std::vector<Face>& faces) {
+        for (auto& face : faces) {
+            face.bbox = face.bbox.getSquare();
+        }
+    }
+
+    // Greedy non-maximum suppression. Sorts `faces` in place by descending
+    // score, then repeatedly keeps the best remaining face and drops the faces
+    // whose overlap with it exceeds `threshold`. Overlap is intersection over
+    // union, or intersection over the smaller area when `useMin` is set.
+    static std::vector<Face> runNMS(std::vector<Face>& faces, const float threshold,
+                                    const bool useMin = false) {
+        std::vector<Face> facesNMS;
+        if (faces.empty()) {
+            return facesNMS;
+        }
+        std::sort(faces.begin(), faces.end(), [](const Face& f1, const Face& f2) {
+            return f1.score > f2.score;
+        });
+
+        std::vector<int> indices(faces.size());
+        std::iota(indices.begin(), indices.end(), 0);
+        std::vector<int> candidates;
+        while (!indices.empty()) {
+            // Take over the surviving indices without copying them.
+            candidates.swap(indices);
+            indices.clear();
+            const BBox& best = faces[candidates[0]].bbox;
+            facesNMS.push_back(faces[candidates[0]]);
+            const float area1 = static_cast<float>(best.x2 - best.x1 + 1) * static_cast<float>(best.y2 - best.y1 + 1);
+            for (size_t i = 1; i < candidates.size(); ++i) {
+                const BBox& other = faces[candidates[i]].bbox;
+                const float interX1 = static_cast<float>(std::max(best.x1, other.x1));
+                const float interY1 = static_cast<float>(std::max(best.y1, other.y1));
+                const float interX2 = static_cast<float>(std::min(best.x2, other.x2));
+                const float interY2 = static_cast<float>(std::min(best.y2, other.y2));
+                const float interArea = std::max(0.0f, (interX2 - interX1 + 1)) * std::max(0.0f, (interY2 - interY1 + 1));
+                const float area2 = static_cast<float>(other.x2 - other.x1 + 1) * static_cast<float>(other.y2 - other.y1 + 1);
+                const float overlap = useMin ? interArea / std::min(area1, area2)
+                                             : interArea / (area1 + area2 - interArea);
+                if (overlap <= threshold) {
+                    indices.push_back(candidates[i]);
+                }
+            }
+        }
+        return facesNMS;
+    }
+};
+
+// Side, in pixels, of the square window that each kept score cell is turned into.
+const float P_NET_WINDOW_SIZE = 12.0f;
+
+// Builds face candidates from a score map and a regression map. The first
+// w * h score values are skipped; each following value that reaches
+// `threshold` yields a P_NET_WINDOW_SIZE box whose coordinates are divided by
+// `scaleFactor`, together with its four regression values.
+std::vector<Face> buildFaces(const cv::Mat& scores,
+                             const cv::Mat& regressions,
+                             const float scaleFactor,
+                             const float threshold) {
+    const auto w = scores.size[3];
+    const auto h = scores.size[2];
+    const auto size = w * h;
+
+    const float* const scores_data = scores.ptr<float>() + size;
+    const float* const reg_data = regressions.ptr<float>();
+
+    // Distance between neighbouring cells in input pixels; 0 for a single cell.
+    const auto out_side = std::max(h, w);
+    const auto in_side = 2 * out_side + 11;
+    const float stride = (out_side != 1)
+        ? static_cast<float>(in_side - P_NET_WINDOW_SIZE) / static_cast<float>(out_side - 1)
+        : 0.0f;
+
+    std::vector<Face> boxes;
+    for (int i = 0; i < size; i++) {
+        // Written as a negated >= so that a NaN score is skipped as well.
+        if (!(scores_data[i] >= threshold)) {
+            continue;
+        }
+
+        const float y = static_cast<float>(i / w);
+        const float x = static_cast<float>(i - w * y);
+
+        Face faceInfo;
+        faceInfo.bbox.x1 = std::max(0, static_cast<int>((x * stride) / scaleFactor));
+        faceInfo.bbox.y1 = std::max(0, static_cast<int>((y * stride) / scaleFactor));
+        faceInfo.bbox.x2 = static_cast<int>((x * stride + P_NET_WINDOW_SIZE - 1.0f) / scaleFactor);
+        faceInfo.bbox.y2 = static_cast<int>((y * stride + P_NET_WINDOW_SIZE - 1.0f) / scaleFactor);
+        for (int r = 0; r < NUM_REGRESSIONS; ++r) {
+            faceInfo.regression[r] = reg_data[i + r * size];
+        }
+        faceInfo.score = scores_data[i];
+        boxes.push_back(faceInfo);
+    }
+
+    return boxes;
+}
+
+// Define graph-level type aliases and the typed networks for this sample
+using GMat2 = std::tuple<cv::GMat, cv::GMat>; // result type of a network with two outputs
+using GMat3 = std::tuple<cv::GMat, cv::GMat, cv::GMat>; // result type of a network with three outputs
+using GMats = cv::GArray<cv::GMat>;
+using GRects = cv::GArray<cv::Rect>;
+using GSize = cv::GOpaque<cv::Size>;
+
+G_API_NET(MTCNNRefinement, // one cv::GMat in, two cv::GMat out
+          <GMat2(cv::GMat)>,
+          "sample.custom.mtcnn_refinement");
+
+G_API_NET(MTCNNOutput, // one cv::GMat in, three cv::GMat out
+          <GMat3(cv::GMat)>,
+          "sample.custom.mtcnn_output");
+
+using GFaces = cv::GArray<Face>; // graph-side array of Face records
+// Operation: (scores, regressions, scale factor, threshold) -> array of face
+// candidates. The output array carries no meta, hence empty_array_desc().
+G_API_OP(BuildFaces, <GFaces(cv::GMat, cv::GMat, float, float)>,
+         "sample.custom.mtcnn.build_faces") {
+    static cv::GArrayDesc outMeta(const cv::GMatDesc&,
+                                  const cv::GMatDesc&,
+                                  const float, const float) {
+        return cv::empty_array_desc();
+    }
+};
+
+// Operation: (faces, overlap threshold, useMin flag) -> faces kept by
+// non-maximum suppression.
+G_API_OP(RunNMS, <GFaces(GFaces, float, bool)>,
+         "sample.custom.mtcnn.run_nms") {
+    static cv::GArrayDesc outMeta(const cv::GArrayDesc&, const float, const bool) {
+        return cv::empty_array_desc();
+    }
+};
+
+// Operation: (faces gathered so far, faces of one more pyramid level) -> both
+// lists joined in that order.
+G_API_OP(AccumulatePyramidOutputs, <GFaces(GFaces, GFaces)>,
+         "sample.custom.mtcnn.accumulate_pyramid_outputs") {
+    static cv::GArrayDesc outMeta(const cv::GArrayDesc&, const cv::GArrayDesc&) {
+        return cv::empty_array_desc();
+    }
+};
+
+// Operation: (faces, addOne flag) -> faces with the regression applied to their boxes.
+G_API_OP(ApplyRegression, <GFaces(GFaces, bool)>,
+         "sample.custom.mtcnn.apply_regression") {
+    static cv::GArrayDesc outMeta(const cv::GArrayDesc&, const bool) {
+        return cv::empty_array_desc();
+    }
+};
+
+// Operation: faces -> faces whose boxes were turned into squares.
+G_API_OP(BBoxesToSquares, <GFaces(GFaces)>,
+         "sample.custom.mtcnn.bboxes_to_squares") {
+    static cv::GArrayDesc outMeta(const cv::GArrayDesc&) {
+        return cv::empty_array_desc();
+    }
+};
+
+// Operation: (faces, image size) -> one clipped ROI rectangle per face.
+G_API_OP(R_O_NetPreProcGetROIs, <GRects(GFaces, GSize)>,
+         "sample.custom.mtcnn.bboxes_r_o_net_preproc_get_rois") {
+    static cv::GArrayDesc outMeta(const cv::GArrayDesc&, const cv::GOpaqueDesc&) {
+        return cv::empty_array_desc();
+    }
+};
+
+
+// Operation: (faces, per-face scores, per-face regressions, threshold) ->
+// faces whose score reaches the threshold, updated with score and regression.
+G_API_OP(RNetPostProc, <GFaces(GFaces, GMats, GMats, float)>,
+         "sample.custom.mtcnn.rnet_postproc") {
+    static cv::GArrayDesc outMeta(const cv::GArrayDesc&,
+                                  const cv::GArrayDesc&,
+                                  const cv::GArrayDesc&, const float) {
+        return cv::empty_array_desc();
+    }
+};
+
+// Operation: (faces, per-face scores, regressions, landmarks, threshold) ->
+// faces whose score reaches the threshold, updated with score, regression
+// and landmark coordinates.
+G_API_OP(ONetPostProc, <GFaces(GFaces, GMats, GMats, GMats, float)>,
+         "sample.custom.mtcnn.onet_postproc") {
+    static cv::GArrayDesc outMeta(const cv::GArrayDesc&, const cv::GArrayDesc&,
+                                  const cv::GArrayDesc&, const cv::GArrayDesc&,
+                                  const float) {
+        return cv::empty_array_desc();
+    }
+};
+
+// Operation: faces -> faces with the x and y coordinates exchanged.
+G_API_OP(SwapFaces, <GFaces(GFaces)>,
+         "sample.custom.mtcnn.swap_faces") {
+    static cv::GArrayDesc outMeta(const cv::GArrayDesc&) {
+        return cv::empty_array_desc();
+    }
+};
+
+//Custom kernels implementation
+// CPU kernel for BuildFaces: forwards its inputs to buildFaces().
+GAPI_OCV_KERNEL(OCVBuildFaces, BuildFaces) {
+    static void run(const cv::Mat& in_scores,
+                    const cv::Mat& in_regresssions,
+                    const float scaleFactor, const float threshold,
+                    std::vector<Face>& out_faces) {
+        out_faces = buildFaces(in_scores, in_regresssions, scaleFactor, threshold);
+    }
+}; // GAPI_OCV_KERNEL(OCVBuildFaces)
+
+// CPU kernel for RunNMS; Face::runNMS sorts its argument in place, hence the private copy.
+GAPI_OCV_KERNEL(OCVRunNMS, RunNMS) {
+    static void run(const std::vector<Face>& in_faces,
+                    const float threshold, const bool useMin,
+                    std::vector<Face>& out_faces) {
+        std::vector<Face> sortable(in_faces);
+        out_faces = Face::runNMS(sortable, threshold, useMin);
+    }
+}; // GAPI_OCV_KERNEL(OCVRunNMS)
+
+// CPU kernel for AccumulatePyramidOutputs: out_faces = total_faces followed by in_faces.
+GAPI_OCV_KERNEL(OCVAccumulatePyramidOutputs, AccumulatePyramidOutputs) {
+    static void run(const std::vector<Face>& total_faces,
+                    const std::vector<Face>& in_faces, std::vector<Face>& out_faces) {
+        out_faces = total_faces;
+        out_faces.insert(out_faces.end(), in_faces.begin(), in_faces.end());
+    }
+}; // GAPI_OCV_KERNEL(OCVAccumulatePyramidOutputs)
+
+// CPU kernel for ApplyRegression: copies the input faces into the output and
+// runs Face::applyRegression on that copy.
+GAPI_OCV_KERNEL(OCVApplyRegression, ApplyRegression) {
+    static void run(const std::vector<Face>& in_faces,
+                    const bool addOne,
+                    std::vector<Face>& out_faces) {
+        out_faces = in_faces;
+        Face::applyRegression(out_faces, addOne);
+    }
+}; // GAPI_OCV_KERNEL(OCVApplyRegression)
+
+// CPU kernel for BBoxesToSquares: copies the input faces into the output and
+// squares the boxes of that copy with Face::bboxes2Squares.
+GAPI_OCV_KERNEL(OCVBBoxesToSquares, BBoxesToSquares) {
+    static void run(const std::vector<Face>& in_faces,
+                    std::vector<Face>& out_faces) {
+        out_faces = in_faces;
+        Face::bboxes2Squares(out_faces);
+    }
+}; // GAPI_OCV_KERNEL(OCVBBoxesToSquares)
+
+// CPU kernel for R_O_NetPreProcGetROIs: emits one ROI per face, clipped 4 pixels short of the image limit.
+GAPI_OCV_KERNEL(OCVR_O_NetPreProcGetROIs, R_O_NetPreProcGetROIs) {
+    static void run(const std::vector<Face>& in_faces,
+                    const cv::Size& in_image_size, std::vector<cv::Rect>& outs) {
+        outs.clear();
+        for (const auto& face : in_faces) {
+            const cv::Rect roi = face.bbox.getRect();
+            // Limits use the transposed size: height bounds x, width bounds y.
+            const cv::Rect limit(roi.x, roi.y, in_image_size.height - roi.x - 4, in_image_size.width - roi.y - 4);
+            outs.push_back(roi & limit);
+        }
+    }
+}; // GAPI_OCV_KERNEL(OCVR_O_NetPreProcGetROIs)
+
+
+// CPU kernel for RNetPostProc: keeps face k when the second value of
+// in_scores[k] reaches the threshold, storing that value as the score and the
+// first NUM_REGRESSIONS values of in_regresssions[k] as the regression.
+GAPI_OCV_KERNEL(OCVRNetPostProc, RNetPostProc) {
+    static void run(const std::vector<Face>& in_faces,
+                    const std::vector<cv::Mat>& in_scores,
+                    const std::vector<cv::Mat>& in_regresssions,
+                    const float threshold,
+                    std::vector<Face>& out_faces) {
+        out_faces.clear();
+        for (size_t idx = 0; idx < in_faces.size(); ++idx) {
+            const float confidence = in_scores[idx].ptr<float>()[1];
+            if (!(confidence >= threshold)) continue;
+            out_faces.push_back(in_faces[idx]);
+            out_faces.back().score = confidence;
+            std::copy_n(in_regresssions[idx].ptr<float>(), NUM_REGRESSIONS, out_faces.back().regression.begin());
+        }
+    }
+}; // GAPI_OCV_KERNEL(OCVRNetPostProc)
+
+// CPU kernel for ONetPostProc: keeps face k when the second value of
+// in_scores[k] reaches the threshold. A kept face receives that score, the
+// first NUM_REGRESSIONS regression values and NUM_PTS landmark points, which
+// are scaled by the box size plus one and offset by the box origin minus one.
+GAPI_OCV_KERNEL(OCVONetPostProc, ONetPostProc) {
+    static void run(const std::vector<Face>& in_faces,
+                    const std::vector<cv::Mat>& in_scores,
+                    const std::vector<cv::Mat>& in_regresssions,
+                    const std::vector<cv::Mat>& in_landmarks,
+                    const float threshold,
+                    std::vector<Face>& out_faces) {
+        out_faces.clear();
+        for (size_t idx = 0; idx < in_faces.size(); ++idx) {
+            const float confidence = in_scores[idx].ptr<float>()[1];
+            if (!(confidence >= threshold)) {
+                continue;
+            }
+            Face kept = in_faces[idx];
+            kept.score = confidence;
+            std::copy_n(in_regresssions[idx].ptr<float>(), NUM_REGRESSIONS, kept.regression.begin());
+
+            const float* const landmark_data = in_landmarks[idx].ptr<float>();
+            const float w = kept.bbox.x2 - kept.bbox.x1 + 1.0f;
+            const float h = kept.bbox.y2 - kept.bbox.y1 + 1.0f;
+            for (size_t p = 0; p < NUM_PTS; ++p) {
+                kept.ptsCoords[2 * p] = kept.bbox.x1 + landmark_data[NUM_PTS + p] * w - 1;
+                kept.ptsCoords[2 * p + 1] = kept.bbox.y1 + landmark_data[p] * h - 1;
+            }
+            out_faces.push_back(kept);
+        }
+    }
+}; // GAPI_OCV_KERNEL(OCVONetPostProc)
+
+// CPU kernel for SwapFaces: outputs a copy of the input in which every face
+// has x1 <-> y1 and x2 <-> y2 exchanged, as well as the two coordinates of
+// each of its NUM_PTS points.
+GAPI_OCV_KERNEL(OCVSwapFaces, SwapFaces) {
+    static void run(const std::vector<Face>& in_faces,
+                    std::vector<Face>& out_faces) {
+        out_faces = in_faces;
+        for (auto& face : out_faces) {
+            std::swap(face.bbox.x1, face.bbox.y1);
+            std::swap(face.bbox.x2, face.bbox.y2);
+            // Exchange the two coordinates of every point as well.
+            for (size_t p = 0; p < NUM_PTS; ++p) {
+                std::swap(face.ptsCoords[2 * p], face.ptsCoords[2 * p + 1]);
+            }
+        }
+    }
+}; // GAPI_OCV_KERNEL(OCVSwapFaces)
+
+} // anonymous namespace
+} // namespace custom
+
+namespace vis {
+namespace {
+// Draws the green, 2-pixel-thick outline of `rc` onto `m`.
+void bbox(const cv::Mat& m, const cv::Rect& rc) {
+    cv::rectangle(m, rc, cv::Scalar{ 0,255,0 }, 2, cv::LINE_8, 0);
+}
+
+// One detection to draw: its rectangle and its landmark points.
+using rectPoints = std::pair<cv::Rect, std::vector<cv::Point>>;
+
+// Returns a copy of `img` with every rectangle outlined and every point drawn as
+// a circle of radius 3. `data` is taken by reference so it is not copied per call.
+static cv::Mat drawRectsAndPoints(const cv::Mat& img, const std::vector<rectPoints>& data) {
+    cv::Mat outImg = img.clone();
+    for (const auto& el : data) {
+        vis::bbox(outImg, el.first);
+        for (const auto& pt : el.second) {
+            cv::circle(outImg, pt, 3, cv::Scalar(0, 255, 255), 1);
+        }
+    }
+    return outImg;
+}
+} // anonymous namespace
+} // namespace vis
+
+
+//Infer helper function
+namespace {
+// Runs generic inference `id` on `in` (input "data"); returns the "conv4-2" output, then the "prob1" output.
+static inline std::tuple<cv::GMat, cv::GMat> run_mtcnn_p(cv::GMat &in, const std::string &id) {
+    cv::GInferInputs net_inputs;
+    net_inputs["data"] = in;
+    auto net_outputs = cv::gapi::infer<cv::gapi::Generic>(id, net_inputs);
+    const cv::GMat regressions = net_outputs.at("conv4-2");
+    return std::make_tuple(regressions, net_outputs.at("prob1"));
+}
+
+static inline std::string get_pnet_level_name(const cv::Size &in_size) { // e.g. "MTCNNProposal_640x480" for a 640x480 size
+    return "MTCNNProposal_" + std::to_string(in_size.width) + "x" + std::to_string(in_size.height);
+}
+
+// Builds the PNet image pyramid for a frame of `input_size`. Fills `out_scales`
+// and `out_sizes` with one entry per level (level k is scaled by
+// pr_scale * 0.709^k) and returns the number of levels; levels are added while
+// the scaled shorter side is at least 12 pixels.
+int calculate_scales(const cv::Size &input_size, std::vector<double> &out_scales, std::vector<cv::Size> &out_sizes ) {
+    const double h = static_cast<double>(input_size.height);
+    const double w = static_cast<double>(input_size.width);
+    double minl = std::min(h, w);
+
+    // pr_scale shrinks the frame so that its shorter side becomes 1000 when that
+    // side exceeds 1000; smaller frames keep their size (pr_scale stays 1.0).
+    double pr_scale = 1.0;
+    if (minl > 1000)
+    {
+        pr_scale = 1000.0 / minl;
+        minl *= pr_scale;
+    }
+
+    //multi - scale
+    out_scales.clear();
+    out_sizes.clear();
+    const double factor = 0.709;
+    int factor_count = 0;
+    while (minl >= 12)
+    {
+        const double current_scale = pr_scale * std::pow(factor, factor_count);
+        out_scales.push_back(current_scale);
+        out_sizes.push_back(cv::Size(static_cast<int>(w * current_scale),
+                                     static_cast<int>(h * current_scale)));
+        minl *= factor;
+        factor_count += 1;
+    }
+
+    return factor_count;
+}
+
+// Half-scale variant of the pyramid: level k is scaled by 0.5^(k + 1). Levels
+// are added while the shorter side, halved once per level, is at least 24.
+int calculate_half_scales(const cv::Size &input_size, std::vector<double>& out_scales, std::vector<cv::Size>& out_sizes) {
+    const double factor = 0.5;
+    const double full_width = static_cast<double>(input_size.width);
+    const double full_height = static_cast<double>(input_size.height);
+
+    //multi - scale
+    out_scales.clear();
+    out_sizes.clear();
+    int factor_count = 0;
+    double current_scale = 0.5;
+    // minl follows the shorter side as it is halved from level to level.
+    for (double minl = std::min(full_height, full_width); minl >= 12.0*2.0; minl *= factor)
+    {
+        out_scales.push_back(current_scale);
+        out_sizes.push_back(cv::Size(static_cast<int>(full_width * current_scale),
+                                     static_cast<int>(full_height * current_scale)));
+        current_scale *= 0.5;
+        factor_count += 1;
+    }
+    return factor_count;
+}
+
+const int MAX_PYRAMID_LEVELS = 13; // capacity of the per-level graph arrays declared in main()
+//////////////////////////////////////////////////////////////////////
+} // anonymous namespace
+
+// Parses the options, builds the three-stage MTCNN graph (PNet pyramid, RNet,
+// ONet), compiles it for streaming and shows the detections for every frame.
+int main(int argc, char* argv[]) {
+    cv::CommandLineParser cmd(argc, argv, keys);
+    cmd.about(about);
+    if (cmd.has("help")) {
+        cmd.printMessage();
+        return 0;
+    }
+    const auto input_file_name = cmd.get<std::string>("input");
+    const auto model_path_p = cmd.get<std::string>("mtcnnpm");
+    const auto target_dev_p = cmd.get<std::string>("mtcnnpd");
+    const auto conf_thresh_p = cmd.get<float>("thrp");
+    const auto model_path_r = cmd.get<std::string>("mtcnnrm");
+    const auto target_dev_r = cmd.get<std::string>("mtcnnrd");
+    const auto conf_thresh_r = cmd.get<float>("thrr");
+    const auto model_path_o = cmd.get<std::string>("mtcnnom");
+    const auto target_dev_o = cmd.get<std::string>("mtcnnod");
+    const auto conf_thresh_o = cmd.get<float>("thro");
+    const auto use_half_scale = cmd.get<bool>("half_scale");
+    const auto streaming_queue_capacity = cmd.get<unsigned int>("queue_capacity");
+
+    std::vector<cv::Size> level_size;
+    std::vector<double> scales;
+    //MTCNN input size
+    cv::VideoCapture cap(input_file_name);
+    CV_Assert(cap.isOpened());
+    auto in_rsz = cv::Size{ static_cast<int>(cap.get(cv::CAP_PROP_FRAME_WIDTH)),
+                            static_cast<int>(cap.get(cv::CAP_PROP_FRAME_HEIGHT)) };
+    //Calculate scales, number of pyramid levels and sizes for PNet pyramid
+    auto pyramid_levels = use_half_scale ? calculate_half_scales(in_rsz, scales, level_size) :
+                                           calculate_scales(in_rsz, scales, level_size);
+    CV_Assert(pyramid_levels > 0 && pyramid_levels <= MAX_PYRAMID_LEVELS); // level 0 and level pyramid_levels - 1 are indexed below
+
+    //Proposal part of MTCNN graph
+    //Preprocessing BGR2RGB + transpose (NCWH is expected instead of NCHW)
+    cv::GMat in_original;
+    cv::GMat in_originalRGB = cv::gapi::BGR2RGB(in_original);
+    cv::GOpaque<cv::Size> in_sz = cv::gapi::streaming::size(in_original);
+    cv::GMat in_resized[MAX_PYRAMID_LEVELS];
+    cv::GMat in_transposed[MAX_PYRAMID_LEVELS];
+    cv::GMat regressions[MAX_PYRAMID_LEVELS];
+    cv::GMat scores[MAX_PYRAMID_LEVELS];
+    cv::GArray<custom::Face> nms_p_faces[MAX_PYRAMID_LEVELS];
+    cv::GArray<custom::Face> total_faces[MAX_PYRAMID_LEVELS];
+    cv::GArray<custom::Face> faces_init(std::vector<custom::Face>{});
+
+    //The very first PNet pyramid layer to init total_faces[0]
+    in_resized[0] = cv::gapi::resize(in_originalRGB, level_size[0]);
+    in_transposed[0] = cv::gapi::transpose(in_resized[0]);
+    std::tie(regressions[0], scores[0]) = run_mtcnn_p(in_transposed[0], get_pnet_level_name(level_size[0]));
+    cv::GArray<custom::Face> faces0 = custom::BuildFaces::on(scores[0], regressions[0], static_cast<float>(scales[0]), conf_thresh_p);
+    cv::GArray<custom::Face> final_p_faces_for_bb2squares = custom::ApplyRegression::on(faces0, true);
+    cv::GArray<custom::Face> final_faces_pnet0 = custom::BBoxesToSquares::on(final_p_faces_for_bb2squares);
+    nms_p_faces[0] = custom::RunNMS::on(final_faces_pnet0, 0.5f, false);
+    total_faces[0] = custom::AccumulatePyramidOutputs::on(faces_init, nms_p_faces[0]);
+    //The remaining PNet pyramid layers accumulate the results of all layers in total_faces[pyramid_levels - 1]
+    for (int i = 1; i < pyramid_levels; ++i)
+    {
+        in_resized[i] = cv::gapi::resize(in_originalRGB, level_size[i]);
+        in_transposed[i] = cv::gapi::transpose(in_resized[i]);
+        std::tie(regressions[i], scores[i]) = run_mtcnn_p(in_transposed[i], get_pnet_level_name(level_size[i]));
+        cv::GArray<custom::Face> faces = custom::BuildFaces::on(scores[i], regressions[i], static_cast<float>(scales[i]), conf_thresh_p);
+        cv::GArray<custom::Face> final_p_faces_for_bb2squares_i = custom::ApplyRegression::on(faces, true);
+        cv::GArray<custom::Face> final_faces_pnet_i = custom::BBoxesToSquares::on(final_p_faces_for_bb2squares_i);
+        nms_p_faces[i] = custom::RunNMS::on(final_faces_pnet_i, 0.5f, false);
+        total_faces[i] = custom::AccumulatePyramidOutputs::on(total_faces[i - 1], nms_p_faces[i]);
+    }
+
+    //Proposal post-processing
+    cv::GArray<custom::Face> final_faces_pnet = custom::RunNMS::on(total_faces[pyramid_levels - 1], 0.7f, true);
+
+    //Refinement part of MTCNN graph
+    cv::GArray<cv::Rect> faces_roi_pnet = custom::R_O_NetPreProcGetROIs::on(final_faces_pnet, in_sz);
+    cv::GArray<cv::GMat> regressionsRNet, scoresRNet;
+    cv::GMat in_originalRGB_transposed = cv::gapi::transpose(in_originalRGB);
+    std::tie(regressionsRNet, scoresRNet) = cv::gapi::infer<custom::MTCNNRefinement>(faces_roi_pnet, in_originalRGB_transposed);
+
+    //Refinement post-processing
+    cv::GArray<custom::Face> rnet_post_proc_faces = custom::RNetPostProc::on(final_faces_pnet, scoresRNet, regressionsRNet, conf_thresh_r);
+    cv::GArray<custom::Face> nms07_r_faces_total = custom::RunNMS::on(rnet_post_proc_faces, 0.7f, false);
+    cv::GArray<custom::Face> final_r_faces_for_bb2squares = custom::ApplyRegression::on(nms07_r_faces_total, true);
+    cv::GArray<custom::Face> final_faces_rnet = custom::BBoxesToSquares::on(final_r_faces_for_bb2squares);
+
+    //Output part of MTCNN graph
+    cv::GArray<cv::Rect> faces_roi_rnet = custom::R_O_NetPreProcGetROIs::on(final_faces_rnet, in_sz);
+    cv::GArray<cv::GMat> regressionsONet, scoresONet, landmarksONet;
+    std::tie(regressionsONet, landmarksONet, scoresONet) = cv::gapi::infer<custom::MTCNNOutput>(faces_roi_rnet, in_originalRGB_transposed);
+
+    //Output post-processing
+    cv::GArray<custom::Face> onet_post_proc_faces = custom::ONetPostProc::on(final_faces_rnet, scoresONet, regressionsONet, landmarksONet, conf_thresh_o);
+    cv::GArray<custom::Face> final_o_faces_for_nms07 = custom::ApplyRegression::on(onet_post_proc_faces, true);
+    cv::GArray<custom::Face> nms07_o_faces_total = custom::RunNMS::on(final_o_faces_for_nms07, 0.7f, true);
+    cv::GArray<custom::Face> final_faces_onet = custom::SwapFaces::on(nms07_o_faces_total);
+
+    cv::GComputation graph_mtcnn(cv::GIn(in_original), cv::GOut(cv::gapi::copy(in_original), final_faces_onet));
+
+    // MTCNN Refinement detection network
+    auto mtcnnr_net = cv::gapi::ie::Params<custom::MTCNNRefinement>{
+        model_path_r,                // path to topology IR
+        weights_path(model_path_r),  // path to weights
+        target_dev_r,                // device specifier
+    }.cfgOutputLayers({ "conv5-2", "prob1" }).cfgInputLayers({ "data" });
+
+    // MTCNN Output detection network
+    auto mtcnno_net = cv::gapi::ie::Params<custom::MTCNNOutput>{
+        model_path_o,                // path to topology IR
+        weights_path(model_path_o),  // path to weights
+        target_dev_o,                // device specifier
+    }.cfgOutputLayers({ "conv6-2", "conv6-3", "prob1" }).cfgInputLayers({ "data" });
+
+    auto networks_mtcnn = cv::gapi::networks(mtcnnr_net, mtcnno_net);
+
+    // MTCNN Proposal detection network
+    for (int i = 0; i < pyramid_levels; ++i)
+    {
+        std::string net_id = get_pnet_level_name(level_size[i]);
+        std::vector<size_t> reshape_dims = { 1, 3, (size_t)level_size[i].width, (size_t)level_size[i].height };
+        cv::gapi::ie::Params<cv::gapi::Generic> mtcnnp_net{
+                    net_id,                      // tag
+                    model_path_p,                // path to topology IR
+                    weights_path(model_path_p),  // path to weights
+                    target_dev_p,                // device specifier
+        };
+        mtcnnp_net.cfgInputReshape({ {"data", reshape_dims} });
+        networks_mtcnn += cv::gapi::networks(mtcnnp_net);
+    }
+
+    auto kernels_mtcnn = cv::gapi::kernels< custom::OCVBuildFaces
+                                          , custom::OCVRunNMS
+                                          , custom::OCVAccumulatePyramidOutputs
+                                          , custom::OCVApplyRegression
+                                          , custom::OCVBBoxesToSquares
+                                          , custom::OCVR_O_NetPreProcGetROIs
+                                          , custom::OCVRNetPostProc
+                                          , custom::OCVONetPostProc
+                                          , custom::OCVSwapFaces
+    >();
+    auto mtcnn_args = cv::compile_args(networks_mtcnn, kernels_mtcnn);
+    if (streaming_queue_capacity != 0)
+        mtcnn_args += cv::compile_args(cv::gapi::streaming::queue_capacity{ streaming_queue_capacity });
+    auto pipeline_mtcnn = graph_mtcnn.compileStreaming(std::move(mtcnn_args));
+
+    std::cout << "Reading " << input_file_name << std::endl;
+    // Input stream
+    auto in_src = cv::gapi::wip::make_src<cv::gapi::wip::GCaptureSource>(input_file_name);
+
+    // Set the pipeline source & start the pipeline
+    pipeline_mtcnn.setSource(cv::gin(in_src));
+    pipeline_mtcnn.start();
+
+    // Declare the output data & run the processing loop
+    cv::TickMeter tm;
+    cv::Mat image;
+    std::vector<custom::Face> out_faces;
+
+    tm.start();
+    int frames = 0;
+    while (pipeline_mtcnn.pull(cv::gout(image, out_faces))) {
+        frames++;
+        std::cout << "Final Faces Size " << out_faces.size() << std::endl;
+        std::vector<vis::rectPoints> data;
+        // show the image with faces in it
+        for (const auto& out_face : out_faces) {
+            std::vector<cv::Point> pts;
+            for (size_t p = 0; p < NUM_PTS; ++p) {
+                pts.push_back(
+                    cv::Point(static_cast<int>(out_face.ptsCoords[2 * p]), static_cast<int>(out_face.ptsCoords[2 * p + 1])));
+            }
+            auto rect = out_face.bbox.getRect();
+            auto d = std::make_pair(rect, pts);
+            data.push_back(d);
+        }
+        // Visualize results on the frame
+        auto resultImg = vis::drawRectsAndPoints(image, data);
+        tm.stop();
+        const auto fps_str = std::to_string(frames / tm.getTimeSec()) + " FPS";
+        cv::putText(resultImg, fps_str, { 0,32 }, cv::FONT_HERSHEY_SIMPLEX, 1.0, { 0,255,0 }, 2);
+        cv::imshow("Out", resultImg);
+        cv::waitKey(1);
+        out_faces.clear();
+        tm.start();
+    }
+    tm.stop();
+    std::cout << "Processed " << frames << " frames"
+        << " (" << frames / tm.getTimeSec() << " FPS)" << std::endl;
+    return 0;
+}
diff --git a/modules/gapi/src/api/kernels_core.cpp b/modules/gapi/src/api/kernels_core.cpp
index 3196b5db2e..4485e36f27 100644
--- a/modules/gapi/src/api/kernels_core.cpp
+++ b/modules/gapi/src/api/kernels_core.cpp
@@ -417,6 +417,12 @@ std::tuple<GOpaque<double>,GArray<int>,GArray<Point3f>> kmeans(const GArray<Poin
     return core::GKMeans3D::on(data, K, bestLabels, criteria, attempts, flags);
 }
 
+
+GMat transpose(const GMat& src) // G-API transpose entry point: thin wrapper over core::GTranspose
+{
+    return core::GTranspose::on(src); // hands src to GTranspose::on() and returns the GMat it yields
+}
+
 GOpaque<Size> streaming::size(const GMat& src)
 {
     return streaming::GSize::on(src);
diff --git a/modules/gapi/src/backends/common/gmetabackend.cpp b/modules/gapi/src/backends/common/gmetabackend.cpp
index 5364152b65..c535569b0c 100644
--- a/modules/gapi/src/backends/common/gmetabackend.cpp
+++ b/modules/gapi/src/backends/common/gmetabackend.cpp
@@ -71,7 +71,7 @@ void GraphMetaExecutable::run(std::vector<InObj>  &&input_objs,
     cv::util::get<cv::detail::OpaqueRef>(out_arg) = it->second;
 }
 
-class GraphMetaBackendImpl final: public cv::gapi::GBackend::Priv {
+class GGraphMetaBackendImpl final: public cv::gapi::GBackend::Priv {
     virtual void unpackKernel(ade::Graph            &,
                               const ade::NodeHandle &,
                               const cv::GKernelImpl &) override {
@@ -88,7 +88,7 @@ class GraphMetaBackendImpl final: public cv::gapi::GBackend::Priv {
 };
 
 cv::gapi::GBackend graph_meta_backend() {
-    static cv::gapi::GBackend this_backend(std::make_shared<GraphMetaBackendImpl>());
+    static cv::gapi::GBackend this_backend(std::make_shared<GGraphMetaBackendImpl>());
     return this_backend;
 }
 
diff --git a/modules/gapi/src/backends/cpu/gcpubackend.cpp b/modules/gapi/src/backends/cpu/gcpubackend.cpp
index cf4b087f92..dfcaf3d478 100644
--- a/modules/gapi/src/backends/cpu/gcpubackend.cpp
+++ b/modules/gapi/src/backends/cpu/gcpubackend.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2021 Intel Corporation
 
 
 #include "precomp.hpp"
@@ -26,6 +26,8 @@
 
 #include "api/gbackend_priv.hpp" // FIXME: Make it part of Backend SDK!
 
+#include "utils/itt.hpp"
+
 // FIXME: Is there a way to take a typed graph (our GModel),
 // and create a new typed graph _ATOP_ of that (by extending with a couple of
 // new types?).
@@ -251,8 +253,13 @@ void cv::gimpl::GCPUExecutable::run(std::vector<InObj>  &&input_objs,
             context.m_state = m_nodesToStates.at(op_info.nh);
         }
 
-        // Now trigger the executable unit
-        k.m_runF(context);
+        {
+            GAPI_ITT_DYNAMIC_LOCAL_HANDLE(op_hndl, op.k.name.c_str());
+            GAPI_ITT_AUTO_TRACE_GUARD(op_hndl);
+
+            // Now trigger the executable unit
+            k.m_runF(context);
+        }
 
         //As Kernels are forbidden to allocate memory for (Mat) outputs,
         //this code seems redundant, at least for Mats
diff --git a/modules/gapi/src/backends/cpu/gcpucore.cpp b/modules/gapi/src/backends/cpu/gcpucore.cpp
index b0bce410f7..168a2f9833 100644
--- a/modules/gapi/src/backends/cpu/gcpucore.cpp
+++ b/modules/gapi/src/backends/cpu/gcpucore.cpp
@@ -634,6 +634,15 @@ GAPI_OCV_KERNEL(GCPUKMeans3D, cv::gapi::core::GKMeans3D)
     }
 };
 
+GAPI_OCV_KERNEL(GCPUTranspose, cv::gapi::core::GTranspose) // OpenCV (cv::Mat) kernel for core::GTranspose
+{
+    static void run(const cv::Mat& in, cv::Mat& out)
+    {
+        cv::transpose(in, out); // delegates directly to cv::transpose(), result goes to `out`
+    }
+};
+
+
 GAPI_OCV_KERNEL(GCPUParseSSDBL, cv::gapi::nn::parsers::GParseSSDBL)
 {
     static void run(const cv::Mat&  in_ssd_result,
@@ -774,6 +783,7 @@ cv::gapi::GKernelPackage cv::gapi::core::cpu::kernels()
          , GCPUKMeansNDNoInit
          , GCPUKMeans2D
          , GCPUKMeans3D
+         , GCPUTranspose
          , GCPUParseSSDBL
          , GOCVParseSSD
          , GCPUParseYolo
diff --git a/modules/gapi/src/backends/ie/giebackend.cpp b/modules/gapi/src/backends/ie/giebackend.cpp
index 13daf5d6df..46b6bdbb97 100644
--- a/modules/gapi/src/backends/ie/giebackend.cpp
+++ b/modules/gapi/src/backends/ie/giebackend.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2021 Intel Corporation
 
 #include "precomp.hpp"
 
@@ -60,6 +60,8 @@ template<typename T> using QueueClass = tbb::concurrent_bounded_queue<T>;
 template<typename T> using QueueClass = cv::gapi::own::concurrent_bounded_queue<T>;
 #endif // TBB
 
+#include "utils/itt.hpp"
+
 namespace IE = InferenceEngine;
 
 namespace {
@@ -116,6 +118,7 @@ inline int toCV(IE::Precision prec) {
     case IE::Precision::FP32: return CV_32F;
     case IE::Precision::I32:  return CV_32S;
     case IE::Precision::I64:  return CV_32S;
+    case IE::Precision::FP16: return CV_16F;
     default:     GAPI_Assert(false && "IE. Unsupported data type");
     }
     return -1;
@@ -194,6 +197,7 @@ inline void copyFromIE(const IE::Blob::Ptr &blob, MatType &mat) {
         HANDLE(U8, uint8_t);
         HANDLE(FP32, float);
         HANDLE(I32, int);
+        HANDLE(FP16, cv::float16_t);
 #undef HANDLE
         case IE::Precision::I64: {
             GAPI_LOG_WARNING(NULL, "INT64 isn't supported for cv::Mat. Conversion to INT32 is used.");
@@ -530,10 +534,11 @@ public:
     explicit RequestPool(std::vector<InferenceEngine::InferRequest>&& requests);
 
     void execute(Task&& t);
-    void waitAndShutdown();
+    void waitAll();
 
 private:
     void callback(Task task, InferenceEngine::InferRequest& request, size_t id);
+    void setup();
 
     QueueClass<size_t>                         m_idle_ids;
     std::vector<InferenceEngine::InferRequest> m_requests;
@@ -542,11 +547,15 @@ private:
 // RequestPool implementation //////////////////////////////////////////////
 cv::gimpl::ie::RequestPool::RequestPool(std::vector<InferenceEngine::InferRequest>&& requests)
     : m_requests(std::move(requests)) {
-        for (size_t i = 0; i < m_requests.size(); ++i) {
-            m_idle_ids.push(i);
-        }
+        setup();
     }
 
+void cv::gimpl::ie::RequestPool::setup() {
+    // Queue the index of every request held in m_requests into m_idle_ids.
+    const size_t num_requests = m_requests.size();
+    for (size_t id = 0u; id != num_requests; ++id) { m_idle_ids.push(id); }
+}
+
 void cv::gimpl::ie::RequestPool::execute(cv::gimpl::ie::RequestPool::Task&& t) {
     size_t id = 0u;
     m_idle_ids.pop(id);
@@ -566,12 +575,13 @@ void cv::gimpl::ie::RequestPool::callback(cv::gimpl::ie::RequestPool::Task task,
 }
 
 // NB: Not thread-safe.
-void cv::gimpl::ie::RequestPool::waitAndShutdown() {
+void cv::gimpl::ie::RequestPool::waitAll() {
     // NB: It will be blocked if at least one request is busy.
     for (size_t i = 0; i < m_requests.size(); ++i) {
         size_t id = 0u;
         m_idle_ids.pop(id);
     }
+    setup();
 }
 
 // GCPUExcecutable implementation //////////////////////////////////////////////
@@ -632,7 +642,7 @@ void cv::gimpl::ie::GIEExecutable::run(cv::gimpl::GIslandExecutable::IInput  &in
     if (cv::util::holds_alternative<cv::gimpl::EndOfStream>(in_msg))
     {
         // (3) Wait until all passed task are done.
-        m_reqPool->waitAndShutdown();
+        m_reqPool->waitAll();
         out.post(cv::gimpl::EndOfStream{});
         return;
     }
@@ -671,7 +681,7 @@ void cv::gimpl::ie::GIEExecutable::run(cv::gimpl::GIslandExecutable::IInput  &in
     // (5) In non-streaming mode need to wait until the all tasks are done
     // FIXME: Is there more graceful way to handle this case ?
     if (!m_gm.metadata().contains<Streaming>()) {
-        m_reqPool->waitAndShutdown();
+        m_reqPool->waitAll();
     }
 }
 
@@ -751,6 +761,9 @@ static void configureInputInfo(const IE::InputInfo::Ptr& ii, const cv::GMetaArg
 // to post outputs blobs (cv::GMat's).
 static void PostOutputs(InferenceEngine::InferRequest   &request,
                         std::shared_ptr<IECallContext>   ctx) {
+    GAPI_ITT_STATIC_LOCAL_HANDLE(ie_cb_post_outputs_hndl, "IE_async_callback_PostOutputs");
+    GAPI_ITT_AUTO_TRACE_GUARD(ie_cb_post_outputs_hndl);
+
     for (auto i : ade::util::iota(ctx->uu.params.num_out))
     {
         auto& out_mat = ctx->outMatR(i);
@@ -1279,6 +1292,17 @@ namespace {
                                     , cv::gimpl::ie::InferList2
                                     >();
         }
+
+        virtual bool controlsMerge() const override {
+            return true; // NOTE(review): presumably makes island fusion consult allowsMerge() - confirm in GBackend::Priv
+        }
+
+        virtual bool allowsMerge(const cv::gimpl::GIslandModel::Graph &,
+                                 const ade::NodeHandle &,
+                                 const ade::NodeHandle &,
+                                 const ade::NodeHandle &) const override {
+            return false; // unconditional refusal: every argument is ignored
+        }
     };
 }
 
diff --git a/modules/gapi/src/backends/ocl/goclcore.cpp b/modules/gapi/src/backends/ocl/goclcore.cpp
index afe211dc7e..d74d521953 100644
--- a/modules/gapi/src/backends/ocl/goclcore.cpp
+++ b/modules/gapi/src/backends/ocl/goclcore.cpp
@@ -522,6 +522,15 @@ GAPI_OCL_KERNEL(GOCLConvertTo, cv::gapi::core::GConvertTo)
     }
 };
 
+
+GAPI_OCL_KERNEL(GOCLTranspose, cv::gapi::core::GTranspose) // OpenCL (cv::UMat) kernel for core::GTranspose
+{
+    static void run(const cv::UMat& in,  cv::UMat& out)
+    {
+        cv::transpose(in, out); // delegates directly to cv::transpose(), result goes to `out`
+    }
+};
+
 cv::gapi::GKernelPackage cv::gapi::core::ocl::kernels()
 {
     static auto pkg = cv::gapi::kernels
@@ -586,6 +595,7 @@ cv::gapi::GKernelPackage cv::gapi::core::ocl::kernels()
          , GOCLConcatVert
          , GOCLLUT
          , GOCLConvertTo
+         , GOCLTranspose
          >();
     return pkg;
 }
diff --git a/modules/gapi/src/compiler/gislandmodel.cpp b/modules/gapi/src/compiler/gislandmodel.cpp
index fb2457a191..1a8e0939e2 100644
--- a/modules/gapi/src/compiler/gislandmodel.cpp
+++ b/modules/gapi/src/compiler/gislandmodel.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2021 Intel Corporation
 
 
 #include "precomp.hpp"
@@ -10,6 +10,8 @@
 #include <sstream>
 #include <unordered_set>
 #include <unordered_map>
+#include <typeinfo> // typeid
+#include <cctype> // std::isdigit
 
 #include <ade/util/checked_cast.hpp>
 #include <ade/util/zip_range.hpp> // zip_range, indexed
@@ -335,6 +337,53 @@ ade::NodeHandle GIslandModel::producerOf(const ConstGraph &g, ade::NodeHandle &d
     return ade::NodeHandle();
 }
 
+// Returns the name used to label an island in traces: "<island name>_<backend name>".
+//
+// The backend name is derived via RTTI from the backend implementation class name.
+// NOTE: Most of the existing backend implementation classes are named following the
+//       "*G[Name]BackendImpl*" pattern. We try to match the (possibly mangled) class
+//       name against this pattern and retrieve just the [Name] part.
+//       If matching isn't successful, the full class name as reported by
+//       typeid().name() is used.
+//
+//       The matching algorithm is:
+//           1) Find the first "BackendImpl" substring; if there is none, go to step 4.
+//           2) Starting from the second character of the string, look for a 'G' located
+//              before "BackendImpl"; if there is none, go to step 4.
+//           3) If the character right before the found 'G' is a digit, it is taken to be
+//              part of the length prefix of a new word in the mangled name (previous
+//              words may be namespaces): the match is found, [Name] is the text between
+//              that 'G' and "BackendImpl". Otherwise continue step 2 from the next character.
+//           4) Matching is not successful, use the full class name.
+std::string GIslandModel::traceIslandName(const ade::NodeHandle& island_nh, const Graph& g) {
+    auto island_ptr = g.metadata(island_nh).get<FusedIsland>().object;
+    const std::string island_name = island_ptr->name();
+
+    auto& backend_impl = island_ptr->backend().priv();
+    const std::string type_name = typeid(backend_impl).name();
+
+    // Fallback: the whole (implementation-defined) type name.
+    std::string backend_name = type_name;
+
+    const auto to_pos = type_name.find("BackendImpl");
+    if (to_pos != std::string::npos) {
+        // The search starts at index 1, so type_name[from_pos - 1] below is always valid.
+        for (auto from_pos = type_name.find('G', 1u);
+             from_pos != std::string::npos && from_pos < to_pos;
+             from_pos = type_name.find('G', from_pos + 1u)) {
+            // NB: passing a negative char to std::isdigit() is undefined behavior,
+            //     so convert the character to unsigned char first.
+            const auto prev_ch = static_cast<unsigned char>(type_name[from_pos - 1u]);
+            if (std::isdigit(prev_ch)) {
+                backend_name = type_name.substr(from_pos + 1u, to_pos - from_pos - 1u);
+                break;
+            }
+        }
+    }
+
+    return island_name + "_" + backend_name;
+}
+
 void GIslandExecutable::run(GIslandExecutable::IInput &in, GIslandExecutable::IOutput &out)
 {
     // Default implementation: just reuse the existing old-fashioned run
diff --git a/modules/gapi/src/compiler/gislandmodel.hpp b/modules/gapi/src/compiler/gislandmodel.hpp
index e8eb73692b..2cdd10346c 100644
--- a/modules/gapi/src/compiler/gislandmodel.hpp
+++ b/modules/gapi/src/compiler/gislandmodel.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018-2019 Intel Corporation
+// Copyright (C) 2018-2021 Intel Corporation
 
 
 #ifndef OPENCV_GAPI_GISLANDMODEL_HPP
@@ -290,7 +290,11 @@ namespace GIslandModel
     //     from the original model (! don't mix with DataSlot)
     // FIXME: GAPI_EXPORTS because of tests only!
     ade::NodeHandle GAPI_EXPORTS producerOf(const ConstGraph &g, ade::NodeHandle &data_nh);
-
+    // traceIslandName - returns a pretty island name for the passed island node.
+    //     The function uses RTTI to assemble the name.
+    //     If the name of the backend implementation class doesn't fit the *G[Name]BackendImpl* pattern,
+    //     the raw mangled name of the class will be used.
+    std::string traceIslandName(const ade::NodeHandle& island_nh, const Graph& g);
 } // namespace GIslandModel
 
 }} // namespace cv::gimpl
diff --git a/modules/gapi/src/executor/gapi_itt.hpp b/modules/gapi/src/executor/gapi_itt.hpp
deleted file mode 100644
index 2ab3237e7f..0000000000
--- a/modules/gapi/src/executor/gapi_itt.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-//
-// Copyright (C) 2020 Intel Corporation
-
-#ifndef OPENCV_GAPI_GAPI_ITT_HPP
-#define OPENCV_GAPI_GAPI_ITT_HPP
-
-//for ITT_NAMED_TRACE_GUARD
-#include <type_traits>
-#include <memory>
-
-// FIXME: It seems that this macro is not propagated here by the OpenCV cmake (as this is not core module).
-// (Consider using OpenCV's trace.hpp )
-#ifdef OPENCV_WITH_ITT
-#include <ittnotify.h>
-#endif
-
-#include <opencv2/gapi/util/compiler_hints.hpp>
-namespace cv {
-namespace util {
-    template< class T >
-    using remove_reference_t = typename std::remove_reference<T>::type;
-
-    // Home brew ScopeGuard
-    // D will be called automatically with p as argument when ScopeGuard goes out of scope.
-    // call release() on the ScopeGuard object to revoke guard action
-    template<typename T, typename D>
-    auto make_ptr_guard(T* p, D&& d) -> std::unique_ptr<T, util::remove_reference_t<D>> {
-        return {p, std::forward<D>(d)};
-    }
-}  // namespace util
-
-// FIXME: make it more reusable (and move to other place and other namespace)
-namespace gimpl { namespace parallel {
-    #ifdef OPENCV_WITH_ITT
-    extern const __itt_domain* gapi_itt_domain;
-
-    namespace {
-        auto make_itt_guard = [](__itt_string_handle* h) {
-           __itt_task_begin(gapi_itt_domain, __itt_null, __itt_null, (h));
-           return util::make_ptr_guard(reinterpret_cast<int*>(1), [](int* ) { __itt_task_end(gapi_itt_domain); });
-        };
-    }  // namespace
-
-    #define GAPI_ITT_NAMED_TRACE_GUARD(name, h)  auto name =  cv::gimpl::parallel::make_itt_guard(h); cv::util::suppress_unused_warning(name)
-    #else
-    struct dumb_guard {void reset(){}};
-    #define GAPI_ITT_NAMED_TRACE_GUARD(name, h)  cv::gimpl::parallel::dumb_guard name; cv::util::suppress_unused_warning(name)
-    #endif
-
-    #define GAPI_ITT_AUTO_TRACE_GUARD_IMPL_(LINE, h)        GAPI_ITT_NAMED_TRACE_GUARD(itt_trace_guard_##LINE, h)
-    #define GAPI_ITT_AUTO_TRACE_GUARD_IMPL(LINE, h)         GAPI_ITT_AUTO_TRACE_GUARD_IMPL_(LINE, h)
-    #define GAPI_ITT_AUTO_TRACE_GUARD(h)                    GAPI_ITT_AUTO_TRACE_GUARD_IMPL(__LINE__, h)
-}} //gimpl::parallel
-}  //namespace cv
-
-#endif /* OPENCV_GAPI_GAPI_ITT_HPP */
diff --git a/modules/gapi/src/executor/gstreamingexecutor.cpp b/modules/gapi/src/executor/gstreamingexecutor.cpp
index 2a06873fee..74c96bdf3e 100644
--- a/modules/gapi/src/executor/gstreamingexecutor.cpp
+++ b/modules/gapi/src/executor/gstreamingexecutor.cpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2019-2020 Intel Corporation
+// Copyright (C) 2019-2021 Intel Corporation
 
 #include "precomp.hpp"
 
@@ -16,6 +16,8 @@
 #include <opencv2/gapi/core.hpp> // GCopy -- FIXME - to be removed!
 #endif // GAPI_STANDALONE
 
+#include "utils/itt.hpp"
+
 #include "api/gproto_priv.hpp" // ptr(GRunArgP)
 #include "compiler/passes/passes.hpp"
 #include "backends/common/gbackend.hpp" // createMat
@@ -492,9 +494,15 @@ void emitterActorThread(std::shared_ptr<cv::gimpl::GIslandEmitter> emitter,
         return;
     }
 
+    GAPI_ITT_STATIC_LOCAL_HANDLE(emitter_hndl, "emitter");
+    GAPI_ITT_STATIC_LOCAL_HANDLE(emitter_pull_hndl, "emitter_pull");
+    GAPI_ITT_STATIC_LOCAL_HANDLE(emitter_push_hndl, "emitter_push");
+
     // Now start emitting the data from the source to the pipeline.
     while (true)
     {
+        GAPI_ITT_AUTO_TRACE_GUARD(emitter_hndl);
+
         Cmd cancel;
         if (in_queue.try_pop(cancel))
         {
@@ -507,8 +515,15 @@ void emitterActorThread(std::shared_ptr<cv::gimpl::GIslandEmitter> emitter,
 
         // Try to obtain next data chunk from the source
         cv::GRunArg data;
-        if (emitter->pull(data))
+
+        const bool result = [&](){
+            GAPI_ITT_AUTO_TRACE_GUARD(emitter_pull_hndl);
+            return emitter->pull(data);
+        }();
+
+        if (result)
         {
+            GAPI_ITT_AUTO_TRACE_GUARD(emitter_push_hndl);
             // // On success, broadcast it to our readers
             for (auto &&oq : out_queues)
             {
@@ -539,7 +554,11 @@ void syncActorThread(std::vector<Q*> in_queues,
     std::vector<bool> pop_nexts(in_queues.size());
     std::vector<Cmd> cmds(in_queues.size());
 
+    GAPI_ITT_STATIC_LOCAL_HANDLE(sync_hndl, "sync_actor");
+    GAPI_ITT_STATIC_LOCAL_HANDLE(sync_pull_1_queue_hndl, "sync_actor_pull_from_1_queue");
+    GAPI_ITT_STATIC_LOCAL_HANDLE(sync_push_hndl, "sync_actor_push");
     while (true) {
+        GAPI_ITT_AUTO_TRACE_GUARD(sync_hndl);
         // pop_nexts indicates which queue still contains earlier timestamps and
         // needs to be popped at least one more time.
         // For each iteration (frame) we need to pull from each input queue at least once,
@@ -562,7 +581,10 @@ void syncActorThread(std::vector<Q*> in_queues,
                 auto& q   = std::get<1>(val);
                 auto& cmd = std::get<2>(val);
 
-                q->pop(cmd);
+                {
+                    GAPI_ITT_AUTO_TRACE_GUARD(sync_pull_1_queue_hndl);
+                    q->pop(cmd);
+                }
                 if (cv::util::holds_alternative<Stop>(cmd)) {
                     // We got a stop command from one of the input queues.
                     // Rewind all input queues till Stop command,
@@ -603,9 +625,12 @@ void syncActorThread(std::vector<Q*> in_queues,
         } while (ade::util::any_of(pop_nexts, [](bool v){ return v; }));
 
         // Finally we got all our inputs synchronized, push them further down the graph
-        for (auto &&it : ade::util::zip(out_queues, cmds)) {
-            for (auto &&q : std::get<0>(it)) {
-                q->push(std::get<1>(it));
+        {
+            GAPI_ITT_AUTO_TRACE_GUARD(sync_push_hndl);
+            for (auto &&it : ade::util::zip(out_queues, cmds)) {
+                for (auto &&q : std::get<0>(it)) {
+                    q->push(std::get<1>(it));
+                }
             }
         }
     }
@@ -619,7 +644,11 @@ class StreamingInput final: public cv::gimpl::GIslandExecutable::IInput
 
     virtual cv::gimpl::StreamMsg get() override
     {
+        GAPI_ITT_STATIC_LOCAL_HANDLE(inputs_get_hndl, "StreamingInput::get");
+        GAPI_ITT_AUTO_TRACE_GUARD(inputs_get_hndl);
+
         cv::GRunArgs isl_input_args;
+
         if (!qr.getInputVector(in_queues, in_constants, isl_input_args))
         {
             // Stop case
@@ -680,6 +709,9 @@ class StreamingOutput final: public cv::gimpl::GIslandExecutable::IOutput
     // Prepare this object for posting
     virtual cv::GRunArgP get(int idx) override
     {
+        GAPI_ITT_STATIC_LOCAL_HANDLE(outputs_get_hndl, "StreamingOutput::get (alloc)");
+        GAPI_ITT_AUTO_TRACE_GUARD(outputs_get_hndl);
+
         std::lock_guard<std::mutex> lock{m_mutex};
 
         using MatType = cv::Mat;
@@ -756,8 +788,12 @@ class StreamingOutput final: public cv::gimpl::GIslandExecutable::IOutput
         m_postIdx[cv::gimpl::proto::ptr(ret_val)] = std::make_pair(idx, iter);
         return ret_val;
     }
+
     virtual void post(cv::GRunArgP&& argp) override
     {
+        GAPI_ITT_STATIC_LOCAL_HANDLE(outputs_post_hndl, "StreamingOutput::post");
+        GAPI_ITT_AUTO_TRACE_GUARD(outputs_post_hndl);
+
         std::lock_guard<std::mutex> lock{m_mutex};
 
         // Mark the output ready for posting. If it is the first in the line,
@@ -795,6 +831,7 @@ class StreamingOutput final: public cv::gimpl::GIslandExecutable::IOutput
             post_iter = m_postings[out_idx].erase(post_iter);
         }
     }
+
     virtual void post(cv::gimpl::EndOfStream&&) override
     {
         std::lock_guard<std::mutex> lock{m_mutex};
@@ -859,23 +896,27 @@ public:
 //   executable for processing.
 // - Pushes processing results down to consumers - to the subsequent queues.
 //   Note: Every data object consumer has its own queue.
-void islandActorThread(std::vector<cv::gimpl::RcDesc> in_rcs,                // FIXME: this is...
-                       std::vector<cv::gimpl::RcDesc> out_rcs,               // FIXME: ...basically just...
-                       cv::GMetaArgs out_metas,                              // ...
-                       std::shared_ptr<cv::gimpl::GIslandExecutable> island, // FIXME: ...a copy of OpDesc{}.
+void islandActorThread(std::vector<cv::gimpl::RcDesc> in_rcs,                     // FIXME: this is...
+                       std::vector<cv::gimpl::RcDesc> out_rcs,                    // FIXME: ...basically just...
+                       cv::GMetaArgs out_metas,                                   // ...
+                       std::shared_ptr<cv::gimpl::GIslandExecutable> island_exec, // FIXME: ...a copy of OpDesc{}.
                        std::vector<Q*> in_queues,
                        cv::GRunArgs in_constants,
-                       std::vector< std::vector<Q*> > out_queues)
+                       std::vector< std::vector<Q*> > out_queues,
+                       const std::string& island_meta_info)
 {
     GAPI_Assert(in_queues.size() == in_rcs.size());
     GAPI_Assert(out_queues.size() == out_rcs.size());
     GAPI_Assert(out_queues.size() == out_metas.size());
     QueueReader qr;
     StreamingInput input(qr, in_queues, in_constants, in_rcs);
-    StreamingOutput output(out_metas, out_queues, out_rcs, island);
+    StreamingOutput output(out_metas, out_queues, out_rcs, island_exec);
+
+    GAPI_ITT_DYNAMIC_LOCAL_HANDLE(island_hndl, island_meta_info.c_str());
     while (!output.done())
     {
-        island->run(input, output);
+        GAPI_ITT_AUTO_TRACE_GUARD(island_hndl);
+        island_exec->run(input, output);
     }
 }
 
@@ -904,11 +945,21 @@ void collectorThread(std::vector<Q*>   in_queues,
         flags[idx] = true;
     }
 
+    GAPI_ITT_STATIC_LOCAL_HANDLE(collector_hndl, "collector");
+    GAPI_ITT_STATIC_LOCAL_HANDLE(collector_get_results_hndl, "collector_get_results");
+    GAPI_ITT_STATIC_LOCAL_HANDLE(collector_push_hndl, "collector_push");
+
     QueueReader qr;
     while (true)
     {
+        GAPI_ITT_AUTO_TRACE_GUARD(collector_hndl);
         cv::GRunArgs this_result(out_size);
-        const bool ok = qr.getResultsVector(in_queues, in_mapping, out_size, this_result);
+
+        const bool ok = [&](){
+            GAPI_ITT_AUTO_TRACE_GUARD(collector_get_results_hndl);
+            return qr.getResultsVector(in_queues, in_mapping, out_size, this_result);
+        }();
+
         if (!ok)
         {
             if (handle_stop)
@@ -918,7 +969,11 @@ void collectorThread(std::vector<Q*>   in_queues,
             // Terminate the thread anyway
             return;
         }
-        out_queue.push(Cmd{Result{std::move(this_result), flags}});
+
+        {
+            GAPI_ITT_AUTO_TRACE_GUARD(collector_push_hndl);
+            out_queue.push(Cmd{Result{std::move(this_result), flags}});
+        }
     }
 }
 
@@ -1071,14 +1126,17 @@ cv::gimpl::GStreamingExecutor::GStreamingExecutor(std::unique_ptr<ade::Graph> &&
     m_sink_queues   .resize(proto.out_nhs.size(), nullptr);
     m_sink_sync     .resize(proto.out_nhs.size(), -1);
 
-    // Very rough estimation to limit internal queue sizes.
+    // Very rough estimation to limit internal queue sizes if not specified by the user.
     // Pipeline depth is equal to number of its (pipeline) steps.
-    const auto queue_capacity = 3*std::count_if
-        (m_gim.nodes().begin(),
-         m_gim.nodes().end(),
-         [&](ade::NodeHandle nh) {
-            return m_gim.metadata(nh).get<NodeKind>().k == NodeKind::ISLAND;
-         });
+    auto has_queue_capacity = cv::gapi::getCompileArg<cv::gapi::streaming::queue_capacity>(m_comp_args);
+    const auto queue_capacity = has_queue_capacity ? has_queue_capacity->capacity :
+            3*std::count_if
+            (m_gim.nodes().begin(),
+            m_gim.nodes().end(),
+            [&](ade::NodeHandle nh) {
+                return m_gim.metadata(nh).get<NodeKind>().k == NodeKind::ISLAND;
+            });
+    GAPI_Assert(queue_capacity != 0u);
 
     auto sync_policy = cv::gimpl::getCompileArg<cv::gapi::streaming::sync_policy>(m_comp_args)
                        .value_or(cv::gapi::streaming::sync_policy::dont_sync);
@@ -1379,11 +1437,8 @@ void cv::gimpl::GStreamingExecutor::setSource(GRunArgs &&ins)
 
     m_sync->registerVideoEmitters(std::move(video_emitters));
 
-    // FIXME: The below code assumes our graph may have only one
-    // real video source (and so, only one stream which may really end)
-    // all other inputs are "constant" generators.
     // Craft here a completion callback to notify Const emitters that
-    // a video source is over
+    // any of the video sources is over
     GAPI_Assert(m_const_emitter_queues.size() == m_const_vals.size());
     auto real_video_completion_cb = [this]()
     {
@@ -1431,7 +1486,7 @@ void cv::gimpl::GStreamingExecutor::setSource(GRunArgs &&ins)
     for (auto &&op : m_ops)
     {
         // Prepare island thread parameters
-        auto island = m_gim.metadata(op.nh).get<IslandExec>().object;
+        auto island_exec = m_gim.metadata(op.nh).get<IslandExec>().object;
 
         // Collect actor's input queues
         auto in_queues = input_queues(*m_island_graph, op.nh);
@@ -1443,6 +1498,13 @@ void cv::gimpl::GStreamingExecutor::setSource(GRunArgs &&ins)
             out_queues.push_back(reader_queues(*m_island_graph, out_eh));
         }
 
+        // Create just empty island meta information
+        std::string island_meta_info { };
+#if defined(OPENCV_WITH_ITT)
+        // If ITT tracing is enabled, fill the meta information with the island's trace name
+        island_meta_info = GIslandModel::traceIslandName(op.nh, m_gim);
+#endif // OPENCV_WITH_ITT
+
         // If Island Executable is recompiled, all its stuff including internal kernel states
         // are recreated and re-initialized automatically.
         // But if not, we should notify Island Executable about new started stream to let it update
@@ -1456,10 +1518,11 @@ void cv::gimpl::GStreamingExecutor::setSource(GRunArgs &&ins)
                                op.in_objects,
                                op.out_objects,
                                op.out_metas,
-                               island,
+                               island_exec,
                                in_queues,
                                op.in_constants,
-                               out_queues);
+                               out_queues,
+                               island_meta_info);
     }
 
     // Finally, start collector thread(s).
@@ -1536,6 +1599,9 @@ void cv::gimpl::GStreamingExecutor::wait_shutdown()
 
 bool cv::gimpl::GStreamingExecutor::pull(cv::GRunArgsP &&outs)
 {
+    GAPI_ITT_STATIC_LOCAL_HANDLE(pull_hndl, "GStreamingExecutor::pull");
+    GAPI_ITT_AUTO_TRACE_GUARD(pull_hndl);
+
     // This pull() can only be called when there's no desynchronized
     // parts in the graph.
     GAPI_Assert(!m_desync &&
diff --git a/modules/gapi/src/executor/gtbbexecutor.cpp b/modules/gapi/src/executor/gtbbexecutor.cpp
index 4966ba114b..cc6ccf9ef4 100644
--- a/modules/gapi/src/executor/gtbbexecutor.cpp
+++ b/modules/gapi/src/executor/gtbbexecutor.cpp
@@ -2,14 +2,14 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2020 Intel Corporation
+// Copyright (C) 2020-2021 Intel Corporation
 
 #include "gtbbexecutor.hpp"
 
 #if defined(HAVE_TBB) && (TBB_INTERFACE_VERSION < 12000)
 // TODO: TBB task API has been deprecated and removed in 12000
 
-#include "gapi_itt.hpp"
+#include "utils/itt.hpp"
 
 #include <opencv2/gapi/own/assert.hpp>
 #include <opencv2/gapi/util/copy_through_move.hpp>
@@ -30,10 +30,6 @@
 #define LOG_DEBUG(tag, ...)   GAPI_LOG_DEBUG(tag, __VA_ARGS__)
 
 
-#ifdef OPENCV_WITH_ITT
-const __itt_domain* cv::gimpl::parallel::gapi_itt_domain = __itt_domain_create("GAPI Context");
-#endif
-
 namespace cv { namespace gimpl { namespace parallel {
 
 namespace detail {
@@ -82,18 +78,9 @@ void spawn_no_assert(tbb::task* root, body_t const& body) {
    tbb::task::spawn(* allocate_task(root, body));
 }
 
-#ifdef OPENCV_WITH_ITT
-namespace {
-    static __itt_string_handle* ittTbbAddReadyBlocksToQueue   = __itt_string_handle_create("add ready blocks to queue");
-    static __itt_string_handle* ittTbbSpawnReadyBlocks        = __itt_string_handle_create("spawn ready blocks");
-    static __itt_string_handle* ittTbbEnqueueSpawnReadyBlocks = __itt_string_handle_create("enqueueing a spawn of ready blocks");
-    static __itt_string_handle* ittTbbUnlockMasterThread      = __itt_string_handle_create("Unlocking master thread");
-}
-#endif // OPENCV_WITH_ITT
-
-
 template<typename body_t>
 void batch_spawn(size_t count, tbb::task* root, body_t const& body, bool do_assert_graph_is_running = true) {
+   GAPI_ITT_STATIC_LOCAL_HANDLE(ittTbbSpawnReadyBlocks, "spawn ready blocks");
    GAPI_ITT_AUTO_TRACE_GUARD(ittTbbSpawnReadyBlocks);
    if (do_assert_graph_is_running) {
        assert_graph_is_running(root);
@@ -143,6 +130,7 @@ void inline wake_master(async_tasks_t& async_tasks, wake_tbb_master wake_master)
 
     if ((active_async_tasks == 0) || (wake_master == wake_tbb_master::YES)) {
         // Was the last async task or asked to wake TBB master up(e.g. there are new TBB tasks to execute)
+        GAPI_ITT_STATIC_LOCAL_HANDLE(ittTbbUnlockMasterThread, "Unlocking master thread");
         GAPI_ITT_AUTO_TRACE_GUARD(ittTbbUnlockMasterThread);
         // While decrement of async_tasks_t::count is atomic, it might occur after the waiting
         // thread has read its value but _before_ it actually starts waiting on the condition variable.
@@ -228,6 +216,7 @@ inline tile_node*  pop(prio_items_queue_t& q) {
 namespace graph {
     // Returns : number of items actually pushed into the q
     std::size_t inline push_ready_dependants(prio_items_queue_t& q, tile_node* node) {
+        GAPI_ITT_STATIC_LOCAL_HANDLE(ittTbbAddReadyBlocksToQueue, "add ready blocks to queue");
         GAPI_ITT_AUTO_TRACE_GUARD(ittTbbAddReadyBlocksToQueue);
         std::size_t ready_items = 0;
         // enable dependent tasks
@@ -330,6 +319,7 @@ namespace graph {
                     if (ready_items > 0) {
                         auto master_was_active = is_tbb_work_present::NO;
                         {
+                            GAPI_ITT_STATIC_LOCAL_HANDLE(ittTbbEnqueueSpawnReadyBlocks, "enqueueing a spawn of ready blocks");
                             GAPI_ITT_AUTO_TRACE_GUARD(ittTbbEnqueueSpawnReadyBlocks);
                             // Force master thread (one that does wait_for_all()) to (actively) wait for enqueued tasks
                             // and unlock it right after all dependent tasks are spawned.
diff --git a/modules/gapi/src/utils/itt.cpp b/modules/gapi/src/utils/itt.cpp
new file mode 100644
index 0000000000..e92defa19c
--- /dev/null
+++ b/modules/gapi/src/utils/itt.cpp
@@ -0,0 +1,17 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+// TODO: Consider using OpenCV's trace.hpp
+#if defined(OPENCV_WITH_ITT)
+#include <ittnotify.h>
+#include <opencv2/gapi/own/exports.hpp>
+
+namespace cv {
+namespace gimpl {
+    GAPI_EXPORTS __itt_domain* gapi_itt_domain = __itt_domain_create("GAPI Context");
+} // namespace gimpl
+}  // namespace cv
+#endif // OPENCV_WITH_ITT
diff --git a/modules/gapi/src/utils/itt.hpp b/modules/gapi/src/utils/itt.hpp
new file mode 100644
index 0000000000..0b49af7226
--- /dev/null
+++ b/modules/gapi/src/utils/itt.hpp
@@ -0,0 +1,78 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_ITT_HPP
+#define OPENCV_GAPI_ITT_HPP
+
+// for GAPI_ITT_NAMED_TRACE_GUARD
+#include <type_traits>
+#include <memory>
+
+#include <opencv2/gapi/util/compiler_hints.hpp>
+
+// NOTE: OPENCV_WITH_ITT is only defined if ITT dependency is built by OpenCV infrastructure.
+//       There will not be such define in G-API standalone mode.
+// TODO: Consider using OpenCV's trace.hpp
+#if defined(OPENCV_WITH_ITT)
+#include <ittnotify.h>
+
+namespace cv {
+namespace util {
+    template< class T >
+    using remove_reference_t = typename std::remove_reference<T>::type;
+
+    // Home brew ScopeGuard
+    // D will be called automatically with p as argument when ScopeGuard goes out of scope.
+    // call release() on the ScopeGuard object to revoke guard action
+    template<typename T, typename D>
+    auto make_ptr_guard(T* p, D&& d) -> std::unique_ptr<T, util::remove_reference_t<D>> {
+        return {p, std::forward<D>(d)};
+    }
+}  // namespace util
+
+namespace gimpl {
+    extern __itt_domain* gapi_itt_domain;  // defined in itt.cpp
+    namespace {
+        auto make_itt_guard = [](__itt_string_handle* h) {  // begins an ITT task named by h; the returned guard ends it on destruction
+           __itt_task_begin(gapi_itt_domain, __itt_null, __itt_null, (h));
+           return util::make_ptr_guard(reinterpret_cast<int*>(1),  // dummy non-null pointer, never dereferenced: unique_ptr only runs its deleter for non-null
+                                       [](int* ){ __itt_task_end(gapi_itt_domain); });
+        };
+    }  // namespace
+} // namespace gimpl
+} // namespace cv
+
+#define GAPI_ITT_NAMED_TRACE_GUARD(name, h)      auto name = cv::gimpl::make_itt_guard(h); \
+                                                 cv::util::suppress_unused_warning(name)
+#define GAPI_ITT_STATIC_LOCAL_HANDLE_IMPL(n, h)  static __itt_string_handle* n = \
+                                                 __itt_string_handle_create(h)
+#define GAPI_ITT_DYNAMIC_LOCAL_HANDLE_IMPL(n, h) __itt_string_handle* n = \
+                                                 __itt_string_handle_create(h)
+#else // OPENCV_WITH_ITT
+
+namespace cv {
+namespace gimpl {
+struct dumb_guard { void reset() { } };
+} // namespace gimpl
+} // namespace cv
+
+#define GAPI_ITT_NAMED_TRACE_GUARD(name, h)      cv::gimpl::dumb_guard name; \
+                                                 cv::util::suppress_unused_warning(name); \
+                                                 cv::util::suppress_unused_warning(h)
+#define GAPI_ITT_STATIC_LOCAL_HANDLE_IMPL(n, h)  static auto n = h
+#define GAPI_ITT_DYNAMIC_LOCAL_HANDLE_IMPL(n, h) auto n = h
+
+#endif // OPENCV_WITH_ITT
+
+#define GAPI_ITT_AUTO_TRACE_GUARD_IMPL_(LINE, h) GAPI_ITT_NAMED_TRACE_GUARD( \
+                                                    itt_trace_guard_##LINE, h)
+#define GAPI_ITT_AUTO_TRACE_GUARD_IMPL(LINE, h)  GAPI_ITT_AUTO_TRACE_GUARD_IMPL_(LINE, h)
+#define GAPI_ITT_AUTO_TRACE_GUARD(h)             GAPI_ITT_AUTO_TRACE_GUARD_IMPL(__LINE__, h)
+
+#define GAPI_ITT_STATIC_LOCAL_HANDLE(n, h)       GAPI_ITT_STATIC_LOCAL_HANDLE_IMPL(n, h)
+#define GAPI_ITT_DYNAMIC_LOCAL_HANDLE(n, h)      GAPI_ITT_DYNAMIC_LOCAL_HANDLE_IMPL(n, h)
+
+#endif // OPENCV_GAPI_ITT_HPP
diff --git a/modules/gapi/test/common/gapi_core_tests.hpp b/modules/gapi/test/common/gapi_core_tests.hpp
index e87828200e..0d8015eac0 100644
--- a/modules/gapi/test/common/gapi_core_tests.hpp
+++ b/modules/gapi/test/common/gapi_core_tests.hpp
@@ -154,6 +154,8 @@ GAPI_TEST_FIXTURE(WarpAffineTest, initMatrixRandU,
 GAPI_TEST_FIXTURE(KMeansNDTest, initMatrixRandU, FIXTURE_API(CompareMats, int, cv::KmeansFlags), 3, cmpF, K, flags)
 GAPI_TEST_FIXTURE(KMeans2DTest, initNothing,     FIXTURE_API(int, cv::KmeansFlags), 2, K, flags)
 GAPI_TEST_FIXTURE(KMeans3DTest, initNothing,     FIXTURE_API(int, cv::KmeansFlags), 2, K, flags)
+GAPI_TEST_FIXTURE(TransposeTest, initMatrixRandU, FIXTURE_API(CompareMats), 1, cmpF)
+
 
 GAPI_TEST_EXT_BASE_FIXTURE(ParseSSDBLTest, ParserSSDTest, initNothing,
     FIXTURE_API(float, int), 2, confidence_threshold, filter_label)
diff --git a/modules/gapi/test/common/gapi_core_tests_inl.hpp b/modules/gapi/test/common/gapi_core_tests_inl.hpp
index d4760e804e..d9287a176c 100644
--- a/modules/gapi/test/common/gapi_core_tests_inl.hpp
+++ b/modules/gapi/test/common/gapi_core_tests_inl.hpp
@@ -1403,6 +1403,23 @@ TEST_P(KMeans3DTest, AccuracyTest)
     kmeansTestBody(in_vector, sz, type, K, flags, getCompileArgs());
 }
 
+TEST_P(TransposeTest, Test)
+{
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::transpose(in);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, getCompileArgs());
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::transpose(in_mat1, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_ocv, out_mat_gapi));
+    }
+}
 // PLEASE DO NOT PUT NEW ACCURACY TESTS BELOW THIS POINT! //////////////////////
 
 TEST_P(BackendOutputAllocationTest, EmptyOutput)
diff --git a/modules/gapi/test/common/gapi_tests_common.hpp b/modules/gapi/test/common/gapi_tests_common.hpp
index 777574964a..463e9949e9 100644
--- a/modules/gapi/test/common/gapi_tests_common.hpp
+++ b/modules/gapi/test/common/gapi_tests_common.hpp
@@ -542,6 +542,7 @@ struct TestWithParamsSpecific : public TestWithParamsBase<ParamsSpecific<Specifi
  * @param ...       list of names of user-defined parameters. if there are no parameters, the list
  *                  must be empty.
  */
+ //TODO: Consider removing `Number` and using `std::tuple_size<decltype(std::make_tuple(__VA_ARGS__))>::value` instead
 #define GAPI_TEST_FIXTURE(Fixture, InitF, API, Number, ...) \
     struct Fixture : public TestWithParams API { \
         static_assert(Number == AllParams::specific_params_size, \
diff --git a/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp b/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp
index 5a06671ae3..424cf1b0ad 100644
--- a/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp
+++ b/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp
@@ -551,6 +551,16 @@ INSTANTIATE_TEST_CASE_P(KMeans3DInitTestCPU, KMeans3DTest,
                                 Values(cv::KMEANS_RANDOM_CENTERS | cv::KMEANS_USE_INITIAL_LABELS,
                                        cv::KMEANS_PP_CENTERS     | cv::KMEANS_USE_INITIAL_LABELS)));
 
+INSTANTIATE_TEST_CASE_P(TransposeTestCPU, TransposeTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1,
+                                       CV_8UC2, CV_16UC2, CV_16SC2, CV_32FC2,
+                                       CV_8UC3, CV_16UC3, CV_16SC3, CV_32FC3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1),
+                                Values(CORE_CPU),
+                                Values(AbsExact().to_compare_obj())));
 // PLEASE DO NOT PUT NEW ACCURACY TESTS BELOW THIS POINT! //////////////////////
 
 INSTANTIATE_TEST_CASE_P(BackendOutputAllocationTestCPU, BackendOutputAllocationTest,
diff --git a/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp b/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp
index 4e5efe6d3f..2cc859c539 100644
--- a/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp
+++ b/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp
@@ -402,6 +402,16 @@ INSTANTIATE_TEST_CASE_P(ConcatVertTestGPU, ConcatVertTest,
                                 Values(-1),
                                 Values(CORE_GPU)));
 
+INSTANTIATE_TEST_CASE_P(TransposeTestGPU, TransposeTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1,
+                                       CV_8UC2, CV_16UC2, CV_16SC2, CV_32FC2,
+                                       CV_8UC3, CV_16UC3, CV_16SC3, CV_32FC3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1),
+                                Values(CORE_GPU),
+                                Values(AbsExact().to_compare_obj())));
 // PLEASE DO NOT PUT NEW ACCURACY TESTS BELOW THIS POINT! //////////////////////
 
 INSTANTIATE_TEST_CASE_P(BackendOutputAllocationTestGPU, BackendOutputAllocationTest,
diff --git a/modules/gapi/test/infer/gapi_infer_ie_test.cpp b/modules/gapi/test/infer/gapi_infer_ie_test.cpp
index 366b7b18f2..4ea33f7713 100644
--- a/modules/gapi/test/infer/gapi_infer_ie_test.cpp
+++ b/modules/gapi/test/infer/gapi_infer_ie_test.cpp
@@ -2005,6 +2005,29 @@ TEST_F(InferWithReshapeNV12, TestInferListYUV)
     // Validate
     validate();
 }
+
+TEST_F(ROIList, CallInferMultipleTimes)
+{
+    cv::GArray<cv::Rect> rr;
+    cv::GMat in;
+    cv::GArray<cv::GMat> age, gender;
+    std::tie(age, gender) = cv::gapi::infer<AgeGender>(rr, in);
+    cv::GComputation comp(cv::GIn(in, rr), cv::GOut(age, gender));
+
+    auto pp = cv::gapi::ie::Params<AgeGender> {
+        params.model_path, params.weights_path, params.device_id
+    }.cfgOutputLayers({ "age_conv3", "prob" });
+
+    auto cc = comp.compile(cv::descr_of(cv::gin(m_in_mat, m_roi_list)),
+                           cv::compile_args(cv::gapi::networks(pp)));
+
+    for (int i = 0; i < 10; ++i) {
+        cc(cv::gin(m_in_mat, m_roi_list), cv::gout(m_out_gapi_ages, m_out_gapi_genders));
+    }
+
+    validate();
+}
+
 } // namespace opencv_test
 
 #endif //  HAVE_INF_ENGINE
diff --git a/modules/gapi/test/streaming/gapi_streaming_tests.cpp b/modules/gapi/test/streaming/gapi_streaming_tests.cpp
index 064511880e..f3179a7081 100644
--- a/modules/gapi/test/streaming/gapi_streaming_tests.cpp
+++ b/modules/gapi/test/streaming/gapi_streaming_tests.cpp
@@ -67,13 +67,24 @@ std::ostream& operator<< (std::ostream &os, const KernelPackage &e)
     return os;
 }
 
-struct GAPI_Streaming: public ::testing::TestWithParam<KernelPackage> {
-    GAPI_Streaming() { initTestDataPath(); }
+struct GAPI_Streaming: public ::testing::TestWithParam<std::tuple<KernelPackage,
+                                                                  cv::optional<size_t>>> {
+    GAPI_Streaming() {
+        initTestDataPath();
+        KernelPackage pkg_kind;
+        std::tie(pkg_kind, cap) = GetParam();
+        pkg = getKernelPackage(pkg_kind);
+    }
 
-    cv::gapi::GKernelPackage getKernelPackage()
+    const cv::optional<size_t>& getQueueCapacity()
+    {
+        return cap;
+    }
+
+    cv::gapi::GKernelPackage getKernelPackage(KernelPackage pkg_kind)
     {
         using namespace cv::gapi;
-        switch (GetParam())
+        switch (pkg_kind)
         {
         case KernelPackage::OCV:
             return cv::gapi::combine(core::cpu::kernels(),
@@ -104,6 +115,18 @@ struct GAPI_Streaming: public ::testing::TestWithParam<KernelPackage> {
         }
         throw std::logic_error("Unknown package");
     }
+
+    cv::GCompileArgs getCompileArgs() {
+        using namespace cv::gapi;
+        auto args = cv::compile_args(use_only{pkg});
+        if (cap) {
+            args += cv::compile_args(streaming::queue_capacity{cap.value()});
+        }
+        return args;
+    }
+
+    cv::gapi::GKernelPackage pkg;
+    cv::optional<size_t>     cap;
 };
 
 G_API_OP(Delay, <cv::GMat(cv::GMat, int)>, "org.opencv.test.delay") {
@@ -260,8 +283,7 @@ TEST_P(GAPI_Streaming, SmokeTest_ConstInput_GMat)
     }
 
     // Compilation & testing
-    auto ccomp = c.compileStreaming(cv::descr_of(in_mat),
-                                    cv::compile_args(cv::gapi::use_only{getKernelPackage()}));
+    auto ccomp = c.compileStreaming(cv::descr_of(in_mat), getCompileArgs());
     EXPECT_TRUE(ccomp);
     EXPECT_FALSE(ccomp.running());
 
@@ -306,7 +328,7 @@ TEST_P(GAPI_Streaming, SmokeTest_VideoInput_GMat)
 
     // Compilation & testing
     auto ccomp = c.compileStreaming(cv::GMatDesc{CV_8U,3,cv::Size{768,576}},
-                                    cv::compile_args(cv::gapi::use_only{getKernelPackage()}));
+                                    getCompileArgs());
     EXPECT_TRUE(ccomp);
     EXPECT_FALSE(ccomp.running());
 
@@ -356,7 +378,7 @@ TEST_P(GAPI_Streaming, Regression_CompileTimeScalar)
     cv::GComputation c(cv::GIn(in), cv::GOut(tmp, tmp + 1));
 
     auto ccomp = c.compileStreaming(cv::GMatDesc{CV_8U,3,cv::Size{768,512}},
-                                    cv::compile_args(cv::gapi::use_only{getKernelPackage()}));
+                                    getCompileArgs());
 
     cv::Mat in_mat = cv::imread(findDataFile("cv/edgefilter/kodim23.png"));
     cv::Mat out_mat1, out_mat2;
@@ -379,7 +401,7 @@ TEST_P(GAPI_Streaming, SmokeTest_StartRestart)
     cv::GComputation c(cv::GIn(in), cv::GOut(cv::gapi::copy(in), out));
 
     auto ccomp = c.compileStreaming(cv::GMatDesc{CV_8U,3,cv::Size{768,576}},
-                                    cv::compile_args(cv::gapi::use_only{getKernelPackage()}));
+                                    getCompileArgs());
     EXPECT_TRUE(ccomp);
     EXPECT_FALSE(ccomp.running());
 
@@ -424,8 +446,7 @@ TEST_P(GAPI_Streaming, SmokeTest_VideoConstSource_NoHang)
     auto refc = cv::GComputation([](){
         cv::GMat in;
         return cv::GComputation(in, cv::gapi::copy(in));
-    }).compileStreaming(cv::GMatDesc{CV_8U,3,cv::Size{768,576}},
-                        cv::compile_args(cv::gapi::use_only{getKernelPackage()}));
+    }).compileStreaming(cv::GMatDesc{CV_8U,3,cv::Size{768,576}}, getCompileArgs());
 
     auto path = findDataFile("cv/video/768x576.avi");
     try {
@@ -447,7 +468,7 @@ TEST_P(GAPI_Streaming, SmokeTest_VideoConstSource_NoHang)
     auto testc = cv::GComputation(cv::GIn(in, in2), cv::GOut(out))
         .compileStreaming(cv::GMatDesc{CV_8U,3,cv::Size{256,256}},
                           cv::GMatDesc{CV_8U,3,cv::Size{768,576}},
-                          cv::compile_args(cv::gapi::use_only{getKernelPackage()}));
+                          getCompileArgs());
 
     cv::Mat in_const = cv::Mat::eye(cv::Size(256,256), CV_8UC3);
     testc.setSource(cv::gin(in_const,
@@ -468,7 +489,7 @@ TEST_P(GAPI_Streaming, SmokeTest_AutoMeta)
     cv::GMat out = blr - in;
 
     auto testc = cv::GComputation(cv::GIn(in, in2), cv::GOut(out))
-        .compileStreaming(cv::compile_args(cv::gapi::use_only{getKernelPackage()}));
+        .compileStreaming(getCompileArgs());
 
     cv::Mat in_const = cv::Mat::eye(cv::Size(256,256), CV_8UC3);
     cv::Mat tmp;
@@ -510,7 +531,7 @@ TEST_P(GAPI_Streaming, SmokeTest_AutoMeta_2xConstMat)
     cv::GMat out = blr - in;
 
     auto testc = cv::GComputation(cv::GIn(in, in2), cv::GOut(out))
-        .compileStreaming(cv::compile_args(cv::gapi::use_only{getKernelPackage()}));
+        .compileStreaming(getCompileArgs());
 
     cv::Mat in_const = cv::Mat::eye(cv::Size(256,256), CV_8UC3);
     cv::Mat tmp;
@@ -541,7 +562,7 @@ TEST_P(GAPI_Streaming, SmokeTest_AutoMeta_VideoScalar)
     cv::GMat out_m = in_m * in_s;
 
     auto testc = cv::GComputation(cv::GIn(in_m, in_s), cv::GOut(out_m))
-        .compileStreaming(cv::compile_args(cv::gapi::use_only{getKernelPackage()}));
+        .compileStreaming(getCompileArgs());
 
     cv::Mat tmp;
     // Test with one video source and scalar
@@ -572,11 +593,13 @@ TEST_P(GAPI_Streaming, SmokeTest_AutoMeta_VideoScalar)
 }
 
 INSTANTIATE_TEST_CASE_P(TestStreaming, GAPI_Streaming,
-                        Values(  KernelPackage::OCV
-                             //, KernelPackage::OCL // FIXME: Fails bit-exactness check, maybe relax it?
-                               , KernelPackage::OCV_FLUID
-                             //, KernelPackage::OCL // FIXME: Fails bit-exactness check, maybe relax it?
-                               ));
+                        Combine(Values(  KernelPackage::OCV
+                                    //, KernelPackage::OCL // FIXME: Fails bit-exactness check, maybe relax it?
+                                      , KernelPackage::OCV_FLUID
+                                    //, KernelPackage::OCL // FIXME: Fails bit-exactness check, maybe relax it?
+                                ),
+                                Values(cv::optional<size_t>{}, 1u, 4u))
+                        );
 
 namespace TypesTest
 {
@@ -653,8 +676,15 @@ TEST_P(GAPI_Streaming, SmokeTest_AutoMeta_VideoArray)
     cv::GMat out_m = TypesTest::AddV::on(in_m, in_v) - in_m;
 
     // Run pipeline
+    auto args = cv::compile_args(cv::gapi::kernels<TypesTest::OCVAddV>());
+    auto capacity = getQueueCapacity();
+    if (capacity)
+    {
+        args += cv::compile_args(
+                    cv::gapi::streaming::queue_capacity{capacity.value()});
+    }
     auto testc = cv::GComputation(cv::GIn(in_m, in_v), cv::GOut(out_m))
-                    .compileStreaming(cv::compile_args(cv::gapi::kernels<TypesTest::OCVAddV>()));
+                    .compileStreaming(std::move(args));
 
     cv::Mat tmp;
     // Test with one video source and vector
diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt
index 7a546616a4..9b68b4672e 100644
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@@ -1,10 +1,46 @@
 set(the_description "High-level GUI")
+
+set(ENABLE_PLUGINS_DEFAULT ON)
+if(EMSCRIPTEN OR IOS OR WINRT)
+  set(ENABLE_PLUGINS_DEFAULT OFF)
+endif()
+set(HIGHGUI_PLUGIN_LIST "" CACHE STRING "List of GUI backends to be compiled as plugins (gtk, gtk2/gtk3, qt, win32 or special value 'all')")
+set(HIGHGUI_ENABLE_PLUGINS "${ENABLE_PLUGINS_DEFAULT}" CACHE BOOL "Allow building and using of GUI plugins")
+mark_as_advanced(HIGHGUI_PLUGIN_LIST HIGHGUI_ENABLE_PLUGINS)
+
+string(REPLACE "," ";" HIGHGUI_PLUGIN_LIST "${HIGHGUI_PLUGIN_LIST}")  # support comma-separated list (,) too
+if(NOT HIGHGUI_ENABLE_PLUGINS)
+  if(HIGHGUI_PLUGIN_LIST)
+    message(WARNING "HighGUI: plugins are disabled through HIGHGUI_ENABLE_PLUGINS, so HIGHGUI_PLUGIN_LIST='${HIGHGUI_PLUGIN_LIST}' is ignored")
+    set(HIGHGUI_PLUGIN_LIST "")
+  endif()
+else()
+  # Make virtual plugins target
+  if(NOT TARGET opencv_highgui_plugins)
+    add_custom_target(opencv_highgui_plugins ALL)
+  endif()
+endif()
+
 if(ANDROID)
   ocv_add_module(highgui opencv_imgproc opencv_imgcodecs OPTIONAL opencv_videoio WRAP python)
 else()
   ocv_add_module(highgui opencv_imgproc opencv_imgcodecs OPTIONAL opencv_videoio WRAP python java)
 endif()
 
+include(${CMAKE_CURRENT_LIST_DIR}/cmake/plugin.cmake)
+
+set(tgts "PRIVATE")
+
+set(highgui_hdrs
+    ${CMAKE_CURRENT_LIST_DIR}/src/precomp.hpp
+    )
+
+set(highgui_srcs
+    ${CMAKE_CURRENT_LIST_DIR}/src/backend.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/src/window.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/src/roiSelector.cpp
+    )
+
 # ----------------------------------------------------------------------------
 #  CMake file for highgui. See root CMakeLists.txt
 #   Some parts taken from version of Hartmut Seichter, HIT Lab NZ.
@@ -24,15 +60,6 @@ if(HAVE_WEBP)
   add_definitions(-DHAVE_WEBP)
 endif()
 
-set(highgui_hdrs
-    ${CMAKE_CURRENT_LIST_DIR}/src/precomp.hpp
-    )
-
-set(highgui_srcs
-    ${CMAKE_CURRENT_LIST_DIR}/src/window.cpp
-    ${CMAKE_CURRENT_LIST_DIR}/src/roiSelector.cpp
-    )
-
 file(GLOB highgui_ext_hdrs
      "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/*.hpp"
      "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/*.hpp"
@@ -42,6 +69,8 @@ file(GLOB highgui_ext_hdrs
 list(REMOVE_ITEM highgui_ext_hdrs "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/highgui_winrt.hpp")
 
 if(HAVE_QT5)
+  add_definitions(-DHAVE_QT)
+
   # "Automoc" doesn't work properly with opencv_world build, use QT5_WRAP_CPP() directly
   #set(CMAKE_AUTOMOC ON)
 
@@ -62,13 +91,16 @@ if(HAVE_QT5)
   endforeach()
 
   if(HAVE_QT_OPENGL)
+    add_definitions(-DHAVE_QT_OPENGL)
     add_definitions(${Qt5OpenGL_DEFINITIONS})
     include_directories(${Qt5OpenGL_INCLUDE_DIRS})
     list(APPEND HIGHGUI_LIBRARIES ${Qt5OpenGL_LIBRARIES})
   endif()
 
 elseif(HAVE_QT)
-  if (HAVE_QT_OPENGL)
+  add_definitions(-DHAVE_QT)
+  if(HAVE_QT_OPENGL)
+    add_definitions(-DHAVE_QT_OPENGL)
     set(QT_USE_QTOPENGL TRUE)
   endif()
   include(${QT_USE_FILE})
@@ -121,13 +153,64 @@ elseif(HAVE_WIN32UI)
   if(OpenCV_ARCH STREQUAL "ARM64")
     list(APPEND HIGHGUI_LIBRARIES "comdlg32" "advapi32")
   endif()
-elseif(HAVE_GTK OR HAVE_GTK3)
-  list(APPEND highgui_srcs ${CMAKE_CURRENT_LIST_DIR}/src/window_gtk.cpp)
 elseif(HAVE_COCOA)
+  add_definitions(-DHAVE_COCOA)
   list(APPEND highgui_srcs ${CMAKE_CURRENT_LIST_DIR}/src/window_cocoa.mm)
   list(APPEND HIGHGUI_LIBRARIES "-framework Cocoa")
 endif()
 
+if(TARGET ocv.3rdparty.gtk3 OR TARGET ocv.3rdparty.gtk2)
+  if(TARGET ocv.3rdparty.gtk3 AND NOT WITH_GTK_2_X)
+    set(__gtk_dependency "ocv.3rdparty.gtk3")
+  else()
+    set(__gtk_dependency "ocv.3rdparty.gtk2")
+  endif()
+  if(
+    NOT HIGHGUI_PLUGIN_LIST STREQUAL "all"
+    AND NOT "gtk" IN_LIST HIGHGUI_PLUGIN_LIST
+    AND NOT "gtk2" IN_LIST HIGHGUI_PLUGIN_LIST
+    AND NOT "gtk3" IN_LIST HIGHGUI_PLUGIN_LIST
+  )
+    list(APPEND highgui_srcs ${CMAKE_CURRENT_LIST_DIR}/src/window_gtk.cpp)
+    list(APPEND tgts ${__gtk_dependency})
+    if(TARGET ocv.3rdparty.gthread)
+      list(APPEND tgts ocv.3rdparty.gthread)
+    endif()
+    if(TARGET ocv.3rdparty.gtkglext
+        AND NOT OPENCV_GTK_DISABLE_GTKGLEXT
+    )
+      list(APPEND tgts ocv.3rdparty.gtkglext)
+    endif()
+  elseif("gtk" IN_LIST HIGHGUI_PLUGIN_LIST)
+    ocv_create_builtin_highgui_plugin(opencv_highgui_gtk ${__gtk_dependency} "window_gtk.cpp")
+    if(TARGET ocv.3rdparty.gthread)
+      ocv_target_link_libraries(opencv_highgui_gtk ocv.3rdparty.gthread)
+    endif()
+    if(TARGET ocv.3rdparty.gtkglext)
+      ocv_target_link_libraries(opencv_highgui_gtk ocv.3rdparty.gtkglext)
+    endif()
+  else()
+    if(TARGET ocv.3rdparty.gtk3 AND ("gtk3" IN_LIST HIGHGUI_PLUGIN_LIST OR HIGHGUI_PLUGIN_LIST STREQUAL "all"))
+      ocv_create_builtin_highgui_plugin(opencv_highgui_gtk3 ocv.3rdparty.gtk3 "window_gtk.cpp")
+      if(TARGET ocv.3rdparty.gthread)
+        ocv_target_link_libraries(opencv_highgui_gtk3 ocv.3rdparty.gthread)
+      endif()
+      if(TARGET ocv.3rdparty.gtkglext)
+        ocv_target_link_libraries(opencv_highgui_gtk3 ocv.3rdparty.gtkglext)
+      endif()
+    endif()
+    if(TARGET ocv.3rdparty.gtk2 AND ("gtk2" IN_LIST HIGHGUI_PLUGIN_LIST OR HIGHGUI_PLUGIN_LIST STREQUAL "all"))
+      ocv_create_builtin_highgui_plugin(opencv_highgui_gtk2 ocv.3rdparty.gtk2 "window_gtk.cpp")
+      if(TARGET ocv.3rdparty.gthread)
+        ocv_target_link_libraries(opencv_highgui_gtk2 ocv.3rdparty.gthread)
+      endif()
+      if(TARGET ocv.3rdparty.gtkglext)
+        ocv_target_link_libraries(opencv_highgui_gtk2 ocv.3rdparty.gtkglext)
+      endif()
+    endif()
+  endif()
+endif()
+
 if(TRUE)
   # these variables are set by 'ocv_append_build_options(HIGHGUI ...)'
   foreach(P ${HIGHGUI_INCLUDE_DIRS})
@@ -139,6 +222,21 @@ if(TRUE)
   endforeach()
 endif()
 
+if(tgts STREQUAL "PRIVATE")
+  set(tgts "")
+endif()
+
+# install used dependencies only
+if(NOT BUILD_SHARED_LIBS
+    AND NOT (CMAKE_VERSION VERSION_LESS "3.13.0")  # upgrade CMake: https://gitlab.kitware.com/cmake/cmake/-/merge_requests/2152
+)
+  foreach(tgt ${tgts})  # plain item list; a lowercase 'in' is not the IN keyword and would be iterated as an item
+    if(tgt MATCHES "^ocv\\.3rdparty\\.")  # '\\.' reaches the regex as '\.', i.e. a literal dot (skips the PRIVATE keyword entry)
+      install(TARGETS ${tgt} EXPORT OpenCVModules)
+    endif()
+  endforeach()
+endif()
+
 source_group("Src" FILES ${highgui_srcs} ${highgui_hdrs})
 source_group("Include" FILES ${highgui_ext_hdrs})
 ocv_set_module_sources(HEADERS ${highgui_ext_hdrs} SOURCES ${highgui_srcs} ${highgui_hdrs})
@@ -162,5 +260,14 @@ if(NOT BUILD_opencv_world)
   ocv_highgui_configure_target()
 endif()
 
-ocv_add_accuracy_tests()
-ocv_add_perf_tests()
+ocv_add_accuracy_tests(${tgts})
+#ocv_add_perf_tests(${tgts})
+
+if(HIGHGUI_ENABLE_PLUGINS)
+  ocv_target_compile_definitions(${the_module} PRIVATE ENABLE_PLUGINS)
+  if(TARGET opencv_test_highgui)
+    ocv_target_compile_definitions(opencv_test_highgui PRIVATE ENABLE_PLUGINS)
+  endif()
+endif()
+
+ocv_target_link_libraries(${the_module} LINK_PRIVATE ${tgts})
diff --git a/modules/highgui/cmake/detect_gtk.cmake b/modules/highgui/cmake/detect_gtk.cmake
new file mode 100644
index 0000000000..1f91986020
--- /dev/null
+++ b/modules/highgui/cmake/detect_gtk.cmake
@@ -0,0 +1,47 @@
+# --- GTK ---
+ocv_clear_vars(HAVE_GTK HAVE_GTK3 HAVE_GTHREAD HAVE_GTKGLEXT)
+if(WITH_GTK)
+  if(NOT WITH_GTK_2_X)
+    ocv_check_modules(GTK3 gtk+-3.0)
+    if(HAVE_GTK3)
+      ocv_add_external_target(gtk3 "${GTK3_INCLUDE_DIRS}" "${GTK3_LIBRARIES}" "HAVE_GTK3;HAVE_GTK")
+      set(HAVE_GTK TRUE)
+      set(HAVE_GTK3 ${HAVE_GTK3} PARENT_SCOPE)
+      set(GTK3_VERSION "${GTK3_VERSION}" PARENT_SCOPE) # informational
+    endif()
+  endif()
+  if((PROJECT_NAME STREQUAL "OpenCV" AND HIGHGUI_ENABLE_PLUGINS) OR NOT HAVE_GTK3)
+    ocv_check_modules(GTK2 gtk+-2.0)
+    if(HAVE_GTK2)
+      set(MIN_VER_GTK "2.18.0")
+      if(GTK2_VERSION VERSION_LESS MIN_VER_GTK)
+        message(FATAL_ERROR "GTK support requires a minimum version of ${MIN_VER_GTK} (${GTK2_VERSION} found)")
+      else()
+        ocv_add_external_target(gtk2 "${GTK2_INCLUDE_DIRS}" "${GTK2_LIBRARIES}" "HAVE_GTK2;HAVE_GTK")
+        set(HAVE_GTK TRUE)
+        set(HAVE_GTK2 ${HAVE_GTK2} PARENT_SCOPE)
+        set(GTK2_VERSION "${GTK2_VERSION}" PARENT_SCOPE) # informational
+      endif()
+    endif()
+  endif()
+  ocv_check_modules(GTHREAD gthread-2.0)
+  if(HAVE_GTK AND NOT HAVE_GTHREAD)
+    message(FATAL_ERROR "gthread not found. This library is required when building with GTK support")
+  else()
+    ocv_add_external_target(gthread "${GTHREAD_INCLUDE_DIRS}" "${GTHREAD_LIBRARIES}" "HAVE_GTHREAD")
+    set(HAVE_GTHREAD "${HAVE_GTHREAD}" PARENT_SCOPE) # informational
+    set(GTHREAD_VERSION "${GTHREAD_VERSION}" PARENT_SCOPE) # informational
+  endif()
+  if(WITH_OPENGL AND NOT HAVE_GTK3)
+    ocv_check_modules(GTKGLEXT gtkglext-1.0)
+    if(HAVE_GTKGLEXT)
+      ocv_add_external_target(gtkglext "${GTKGLEXT_INCLUDE_DIRS}" "${GTKGLEXT_LIBRARIES}" "HAVE_GTKGLEXT")
+      set(HAVE_GTKGLEXT "${HAVE_GTKGLEXT}" PARENT_SCOPE) # informational
+      set(GTKGLEXT_VERSION "${GTKGLEXT_VERSION}" PARENT_SCOPE) # informational
+    endif()
+  endif()
+elseif(HAVE_GTK)
+  ocv_add_external_target(gtk "${GTK_INCLUDE_DIRS}" "${GTK_LIBRARIES}" "${GTK_DEFINES};HAVE_GTK")
+endif()
+
+set(HAVE_GTK ${HAVE_GTK} PARENT_SCOPE)
diff --git a/modules/highgui/cmake/init.cmake b/modules/highgui/cmake/init.cmake
new file mode 100644
index 0000000000..1a115f22ed
--- /dev/null
+++ b/modules/highgui/cmake/init.cmake
@@ -0,0 +1,25 @@
+include(FindPkgConfig)
+
+# FIXIT: stop using PARENT_SCOPE in dependencies
+if(PROJECT_NAME STREQUAL "OpenCV")
+  macro(add_backend backend_id cond_var)
+    if(${cond_var})
+      include("${CMAKE_CURRENT_LIST_DIR}/detect_${backend_id}.cmake")
+    endif()
+  endmacro()
+else()
+  function(add_backend backend_id cond_var)
+    if(${cond_var})
+      include("${CMAKE_CURRENT_LIST_DIR}/detect_${backend_id}.cmake")
+    endif()
+  endfunction()
+endif()
+
+add_backend("gtk" WITH_GTK)
+
+# TODO win32
+# TODO cocoa
+# TODO qt
+# TODO opengl
+
+# FIXIT: move content of cmake/OpenCVFindLibsGUI.cmake here (need to resolve CMake scope issues)
diff --git a/modules/highgui/cmake/plugin.cmake b/modules/highgui/cmake/plugin.cmake
new file mode 100644
index 0000000000..6e0ddd2dc5
--- /dev/null
+++ b/modules/highgui/cmake/plugin.cmake
@@ -0,0 +1,61 @@
+function(ocv_create_builtin_highgui_plugin name target)
+
+  ocv_debug_message("ocv_create_builtin_highgui_plugin(${ARGV})")
+
+  if(NOT TARGET ${target})
+    message(FATAL_ERROR "${target} does not exist!")
+  endif()
+  if(NOT OpenCV_SOURCE_DIR)
+    message(FATAL_ERROR "OpenCV_SOURCE_DIR must be set to build the plugin!")
+  endif()
+
+  message(STATUS "HighGUI: add builtin plugin '${name}'")
+
+  foreach(src ${ARGN})
+    list(APPEND sources "${CMAKE_CURRENT_LIST_DIR}/src/${src}")
+  endforeach()
+
+  add_library(${name} MODULE ${sources})
+  target_include_directories(${name} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}")
+  target_compile_definitions(${name} PRIVATE BUILD_PLUGIN)
+  target_link_libraries(${name} PRIVATE ${target})
+
+  foreach(mod opencv_highgui
+      opencv_core
+      opencv_imgproc
+      opencv_imgcodecs
+      opencv_videoio  # TODO remove this dependency
+  )
+    ocv_target_link_libraries(${name} LINK_PRIVATE ${mod})
+    ocv_target_include_directories(${name} "${OPENCV_MODULE_${mod}_LOCATION}/include")
+  endforeach()
+
+  if(WIN32)
+    set(OPENCV_PLUGIN_VERSION "${OPENCV_DLLVERSION}" CACHE STRING "")
+    if(CMAKE_CXX_SIZEOF_DATA_PTR EQUAL 8)
+      set(OPENCV_PLUGIN_ARCH "_64" CACHE STRING "")
+    else()
+      set(OPENCV_PLUGIN_ARCH "" CACHE STRING "")
+    endif()
+  else()
+    set(OPENCV_PLUGIN_VERSION "" CACHE STRING "")
+    set(OPENCV_PLUGIN_ARCH "" CACHE STRING "")
+  endif()
+
+  set_target_properties(${name} PROPERTIES
+    CXX_STANDARD 11
+    CXX_VISIBILITY_PRESET hidden
+    DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
+    OUTPUT_NAME "${name}${OPENCV_PLUGIN_VERSION}${OPENCV_PLUGIN_ARCH}"
+  )
+
+  if(WIN32)
+    set_target_properties(${name} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
+    install(TARGETS ${name} OPTIONAL LIBRARY DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT plugins)
+  else()
+    install(TARGETS ${name} OPTIONAL LIBRARY DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT plugins)
+  endif()
+
+  add_dependencies(opencv_highgui_plugins ${name})
+
+endfunction()
diff --git a/modules/highgui/misc/plugins/build_plugins.sh b/modules/highgui/misc/plugins/build_plugins.sh
new file mode 100755
index 0000000000..a27f4a0eca
--- /dev/null
+++ b/modules/highgui/misc/plugins/build_plugins.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+set -e
+
+if [ -z "$1" ] ; then  # quoted so that an empty or space-containing argument is tested as a single word
+    echo "$0 <destination directory>"
+    exit 1
+fi
+# Resolve absolute paths: DIR is this script's directory, OCV is four levels above it, DST is the destination.
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+OCV="$( cd "${DIR}/../../../.." >/dev/null 2>&1 && pwd )"
+mkdir -p "${1}"  # Docker creates non-existent mounts with 'root' owner, so make sure the directory exists under the current user to avoid a "Permission denied" problem
+DST="$( cd "$1" >/dev/null 2>&1 && pwd )"
+CFG=$2  # optional second argument; exported to the container and forwarded to the plugin build script
+
+do_build()
+{  # usage: do_build <image tag> <build context dir> <Dockerfile name> [extra 'docker build' arguments...]
+TAG=$1
+D=$2
+F=$3
+shift 3
+docker build \
+    --build-arg http_proxy \
+    --build-arg https_proxy \
+    "$@" \
+    -t "$TAG" \
+    -f "${D}/${F}" \
+    "${D}"
+}
+
+do_run()
+{  # usage: do_run <image tag> <command> [args...]; runs the command in a removable container as the current user
+TAG=$1
+shift 1
+docker run \
+    -it \
+    --rm \
+    -v "${OCV}":/opencv:ro \
+    -v "${DST}":/dst \
+    -e CFG="$CFG" \
+    --user "$(id -u):$(id -g)" \
+    "$TAG" \
+    "$@"
+}
+
+build_gtk2_ubuntu()
+{  # usage: build_gtk2_ubuntu <ubuntu version>; builds the GTK2 builder image for that release, then runs the plugin build script inside it
+VER=$1
+TAG=opencv_highgui_ubuntu_gtk2_builder:${VER}
+do_build $TAG "${DIR}/plugin_gtk" Dockerfile-ubuntu-gtk2 --build-arg VER=${VER}
+do_run $TAG /opencv/modules/highgui/misc/plugins/plugin_gtk/build.sh /dst gtk2_ubuntu${VER} ${CFG}
+
+}
+
+build_gtk3_ubuntu()
+{  # usage: build_gtk3_ubuntu <ubuntu version>; builds the GTK3 builder image for that release, then runs the plugin build script inside it
+VER=$1
+TAG=opencv_highgui_ubuntu_gtk3_builder:${VER}
+do_build $TAG "${DIR}/plugin_gtk" Dockerfile-ubuntu-gtk3 --build-arg VER=${VER}
+do_run $TAG /opencv/modules/highgui/misc/plugins/plugin_gtk/build.sh /dst gtk3_ubuntu${VER} ${CFG}
+}
+
+echo "OpenCV: ${OCV}"
+echo "Destination: ${DST}"
+# Build matrix: GTK2 plugins on Ubuntu 16.04 and 18.04, GTK3 plugins on Ubuntu 18.04 and 20.04.
+build_gtk2_ubuntu 16.04
+build_gtk2_ubuntu 18.04
+build_gtk3_ubuntu 18.04
+build_gtk3_ubuntu 20.04
diff --git a/modules/highgui/misc/plugins/plugin_gtk/CMakeLists.txt b/modules/highgui/misc/plugins/plugin_gtk/CMakeLists.txt
new file mode 100644
index 0000000000..22462900f1
--- /dev/null
+++ b/modules/highgui/misc/plugins/plugin_gtk/CMakeLists.txt
@@ -0,0 +1,48 @@
+cmake_minimum_required(VERSION 3.5)
+project(opencv_highgui_gtk)
+# Locate the OpenCV source tree (five directories up) and load the standalone-plugin helpers.
+get_filename_component(OpenCV_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../../.." ABSOLUTE)
+include("${OpenCV_SOURCE_DIR}/cmake/OpenCVPluginStandalone.cmake")
+
+# scan dependencies
+set(WITH_GTK ON)
+include("${OpenCV_SOURCE_DIR}/modules/highgui/cmake/init.cmake")
+
+if(NOT HAVE_GTK)
+  message(FATAL_ERROR "GTK: NO")
+endif()
+
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-deprecated-declarations)
+# Use the first GTK dependency target that exists: gtk3, then gtk2, then the generic one.
+set(OPENCV_PLUGIN_DEPS core imgproc imgcodecs)
+if(TARGET ocv.3rdparty.gtk3)
+  set(__deps ocv.3rdparty.gtk3)
+elseif(TARGET ocv.3rdparty.gtk2)
+  set(__deps ocv.3rdparty.gtk2)
+elseif(TARGET ocv.3rdparty.gtk)
+  set(__deps ocv.3rdparty.gtk)
+else()
+  message(FATAL_ERROR "Missing dependency target for GTK libraries")
+endif()
+ocv_create_plugin(highgui "opencv_highgui_gtk" "${__deps}" "GTK" "src/window_gtk.cpp")
+# Report which toolkit versions were detected (the GTK2 branch must test HAVE_GTK2, not HAVE_GTK3 again).
+if(HAVE_GTK3)
+  message(STATUS "GTK3+: ver ${GTK3_VERSION}")
+elseif(HAVE_GTK2)
+  message(STATUS "GTK2+: ver ${GTK2_VERSION}")
+elseif(DEFINED GTK_VERSION)
+  message(STATUS "GTK+: ver ${GTK_VERSION}")
+else()
+  message(STATUS "GTK+: YES")
+endif()
+
+if(HAVE_GTHREAD)
+  message(STATUS "GThread : YES (ver ${GTHREAD_VERSION})")
+else()
+  message(STATUS "GThread : NO")
+endif()
+if(HAVE_GTKGLEXT)
+  message(STATUS "GtkGlExt: YES (ver ${GTKGLEXT_VERSION})")
+else()
+  message(STATUS "GtkGlExt: NO")
+endif()
diff --git a/modules/highgui/misc/plugins/plugin_gtk/Dockerfile-ubuntu-gtk2 b/modules/highgui/misc/plugins/plugin_gtk/Dockerfile-ubuntu-gtk2
new file mode 100644
index 0000000000..81836cb384
--- /dev/null
+++ b/modules/highgui/misc/plugins/plugin_gtk/Dockerfile-ubuntu-gtk2
@@ -0,0 +1,21 @@
+ARG VER
+FROM ubuntu:$VER
+
+RUN \
+  apt-get update && \
+  DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    pkg-config \
+    cmake \
+    g++ \
+    ninja-build \
+  && \
+  rm -rf /var/lib/apt/lists/*
+
+RUN \
+  apt-get update && \
+  DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    libgtk2.0-dev \
+  && \
+  rm -rf /var/lib/apt/lists/*
+
+WORKDIR /tmp
diff --git a/modules/highgui/misc/plugins/plugin_gtk/Dockerfile-ubuntu-gtk3 b/modules/highgui/misc/plugins/plugin_gtk/Dockerfile-ubuntu-gtk3
new file mode 100644
index 0000000000..2c6625ae14
--- /dev/null
+++ b/modules/highgui/misc/plugins/plugin_gtk/Dockerfile-ubuntu-gtk3
@@ -0,0 +1,21 @@
+ARG VER
+FROM ubuntu:$VER
+
+RUN \
+  apt-get update && \
+  DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    pkg-config \
+    cmake \
+    g++ \
+    ninja-build \
+  && \
+  rm -rf /var/lib/apt/lists/*
+
+RUN \
+  apt-get update && \
+  DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    libgtk-3-dev \
+  && \
+  rm -rf /var/lib/apt/lists/*
+
+WORKDIR /tmp
diff --git a/modules/highgui/misc/plugins/plugin_gtk/build.sh b/modules/highgui/misc/plugins/plugin_gtk/build.sh
new file mode 100755
index 0000000000..58048698db
--- /dev/null
+++ b/modules/highgui/misc/plugins/plugin_gtk/build.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+set -e
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+# Arguments: $1 = plugin destination, $2 = plugin name suffix, $3 = CMake build type; all quoted so spaces or empty values stay one word.
+cmake -GNinja \
+    -DOPENCV_PLUGIN_NAME="opencv_highgui_$2" \
+    -DOPENCV_PLUGIN_DESTINATION="$1" \
+    -DCMAKE_BUILD_TYPE="$3" \
+    "$DIR"
+
+ninja -v
diff --git a/modules/highgui/src/backend.cpp b/modules/highgui/src/backend.cpp
new file mode 100644
index 0000000000..4c0de0584e
--- /dev/null
+++ b/modules/highgui/src/backend.cpp
@@ -0,0 +1,181 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#include "precomp.hpp"
+#include "backend.hpp"
+
+#include <opencv2/core/utils/configuration.private.hpp>
+#include <opencv2/core/utils/logger.defines.hpp>
+#ifdef NDEBUG
+#define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
+#else
+#define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_VERBOSE + 1
+#endif
+#include <opencv2/core/utils/logger.hpp>
+
+
+#include "registry.hpp"
+#include "registry.impl.hpp"
+
+#include "plugin_api.hpp"
+#include "plugin_wrapper.impl.hpp"
+
+
+namespace cv { namespace highgui_backend {
+
+UIBackend::~UIBackend()
+{
+    // nothing
+}
+
+UIWindowBase::~UIWindowBase()
+{
+    // nothing
+}
+
+UIWindow::~UIWindow()
+{
+    // nothing
+}
+
+UITrackbar::~UITrackbar()
+{
+    // nothing
+}
+
+// Requested UI backend name in upper case; seeded once from the OPENCV_UI_BACKEND configuration parameter and handed out by mutable reference.
+static std::string& getUIBackendName()
+{
+    static std::string s_requestedName = toUpperCase(cv::utils::getConfigurationParameterString("OPENCV_UI_BACKEND", ""));
+    return s_requestedName;
+}
+
+static bool g_initializedUIBackend = false;
+
+static
+std::shared_ptr<UIBackend> createUIBackend()
+{  // Creates the first usable backend from getBackendsInfo(); if a name was requested only entries with that name are tried. Returns an empty pointer when nothing could be created.
+    const std::string& name = getUIBackendName();
+    bool isKnown = false;  // becomes true once the requested name matches a registry entry
+    const auto& backends = getBackendsInfo();
+    if (!name.empty())
+    {
+        CV_LOG_INFO(NULL, "UI: requested backend name: " << name);
+    }
+    for (size_t i = 0; i < backends.size(); i++)
+    {
+        const auto& info = backends[i];
+        if (!name.empty())
+        {
+            if (name != info.name)
+            {
+                continue;
+            }
+            isKnown = true;
+        }
+        try
+        {
+            CV_LOG_DEBUG(NULL, "UI: trying backend: " << info.name << " (priority=" << info.priority << ")");
+            if (!info.backendFactory)
+            {
+                CV_LOG_DEBUG(NULL, "UI: factory is not available (plugins require filesystem support): " << info.name);
+                continue;
+            }
+            std::shared_ptr<UIBackend> backend = info.backendFactory->create();
+            if (!backend)
+            {
+                CV_LOG_VERBOSE(NULL, 0, "UI: not available: " << info.name);
+                continue;
+            }
+            CV_LOG_INFO(NULL, "UI: using backend: " << info.name << " (priority=" << info.priority << ")");
+            g_initializedUIBackend = true;
+            getUIBackendName() = info.name;  // record the backend that was actually selected
+            return backend;
+        }
+        catch (const std::exception& e)  // a failing backend is logged and the next candidate is tried
+        {
+            CV_LOG_WARNING(NULL, "UI: can't initialize " << info.name << " backend: " << e.what());
+        }
+        catch (...)
+        {
+            CV_LOG_WARNING(NULL, "UI: can't initialize " << info.name << " backend: Unknown C++ exception");
+        }
+    }
+    if (name.empty())
+    {
+        CV_LOG_DEBUG(NULL, "UI: fallback on builtin code");
+    }
+    else
+    {
+        if (!isKnown)
+            CV_LOG_INFO(NULL, "UI: unknown backend: " << name);
+    }
+    g_initializedUIBackend = true;  // initialization counts as done even though no backend was created
+    return std::shared_ptr<UIBackend>();
+}
+
+static inline
+std::shared_ptr<UIBackend> createDefaultUIBackend()
+{
+    CV_LOG_DEBUG(NULL, "UI: Initializing backend...");
+    return createUIBackend();
+}
+
+std::shared_ptr<UIBackend>& getCurrentUIBackend()
+{  // Returns a mutable reference to the current backend pointer; it is filled by createDefaultUIBackend() on first use and may be empty.
+    static std::shared_ptr<UIBackend> g_currentUIBackend = createDefaultUIBackend();
+    return g_currentUIBackend;
+}
+
+void setUIBackend(const std::shared_ptr<UIBackend>& api)
+{
+    getCurrentUIBackend() = api;
+}
+
+bool setUIBackend(const std::string& backendName)
+{
+    CV_TRACE_FUNCTION();
+    // Switches to the backend with the given name (compared in upper case); returns false only when a non-empty name yields no backend.
+    std::string backendName_u = toUpperCase(backendName);
+    if (g_initializedUIBackend)
+    {
+        // ... already initialized
+        if (getUIBackendName() == backendName_u)
+        {
+            CV_LOG_INFO(NULL, "UI: backend is already activated: " << (backendName.empty() ? "builtin(legacy)" : backendName));
+            return true;  // NOTE(review): also reached after an earlier failed switch to this name (the name is stored before creation below) — confirm that reporting success is intended
+        }
+        else
+        {
+            // ... re-create new
+            CV_LOG_DEBUG(NULL, "UI: replacing backend...");
+            getUIBackendName() = backendName_u;
+            getCurrentUIBackend() = createUIBackend();
+        }
+    }
+    else
+    {
+        // ... no backend exists, just specify the name (initialization is triggered by getCurrentUIBackend() call)
+        getUIBackendName() = backendName_u;
+    }
+    std::shared_ptr<UIBackend> api = getCurrentUIBackend();
+    if (!api)
+    {
+        if (!backendName.empty())
+        {
+            CV_LOG_WARNING(NULL, "UI: backend is not available: " << backendName << " (using builtin legacy code)");
+            return false;
+        }
+        else
+        {
+            CV_LOG_WARNING(NULL, "UI: switched to builtin code (legacy)");
+        }
+    }
+    if (!backendName_u.empty())
+    {
+        CV_Assert(backendName_u == getUIBackendName());  // data race?
+    }
+    return true;
+}
+
+}}  // namespace cv::highgui_backend
diff --git a/modules/highgui/src/backend.hpp b/modules/highgui/src/backend.hpp
new file mode 100644
index 0000000000..14c88b2387
--- /dev/null
+++ b/modules/highgui/src/backend.hpp
@@ -0,0 +1,131 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+#ifndef OPENCV_HIGHGUI_BACKEND_HPP
+#define OPENCV_HIGHGUI_BACKEND_HPP
+
+#include <memory>
+#include <map>
+
+namespace cv { namespace highgui_backend {
+
+class CV_EXPORTS UIWindowBase
+{
+public:
+    typedef std::shared_ptr<UIWindowBase> Ptr;
+    typedef std::weak_ptr<UIWindowBase> WeakPtr;
+
+    virtual ~UIWindowBase();
+
+    virtual const std::string& getID() const = 0;  // internal name, used for logging
+
+    virtual bool isActive() const = 0;
+
+    virtual void destroy() = 0;
+};  // UIWindowBase
+
+class UITrackbar;
+
+class CV_EXPORTS UIWindow : public UIWindowBase
+{
+public:
+    virtual ~UIWindow();
+
+    virtual void imshow(InputArray image) = 0;
+
+    virtual double getProperty(int prop) const = 0;
+    virtual bool setProperty(int prop, double value) = 0;
+
+    virtual void resize(int width, int height) = 0;
+    virtual void move(int x, int y) = 0;
+
+    virtual Rect getImageRect() const = 0;
+
+    virtual void setTitle(const std::string& title) = 0;
+
+    virtual void setMouseCallback(MouseCallback onMouse, void* userdata /*= 0*/) = 0;
+
+    //TODO: handle both keys and mouse events (both with mouse coordinates)
+    //virtual void setInputCallback(InputCallback onInputEvent, void* userdata /*= 0*/) = 0;
+
+    virtual std::shared_ptr<UITrackbar> createTrackbar(
+        const std::string& name,
+        int count,
+        TrackbarCallback onChange /*= 0*/,
+        void* userdata /*= 0*/
+    ) = 0;
+
+    virtual std::shared_ptr<UITrackbar> findTrackbar(const std::string& name) = 0;
+
+#if 0  // QT only
+    virtual void displayOverlay(const std::string& text, int delayms = 0) = 0;
+    virtual void displayStatusBar(const std::string& text, int delayms /*= 0*/) = 0;
+    virtual int createButton(
+        const std::string& bar_name, ButtonCallback on_change,
+        void* userdata = 0, int type /*= QT_PUSH_BUTTON*/,
+        bool initial_button_state /*= false*/
+    ) = 0;
+    // addText, QtFont stuff
+#endif
+
+#if 0  // OpenGL
+    virtual void imshow(const ogl::Texture2D& tex) = 0;
+    virtual void setOpenGlDrawCallback(OpenGlDrawCallback onOpenGlDraw, void* userdata = 0) = 0;
+    virtual void setOpenGlContext() = 0;
+    virtual void updateWindow() = 0;
+#endif
+
+};  // UIWindow
+
+
+class CV_EXPORTS UITrackbar : public UIWindowBase
+{
+public:
+    virtual ~UITrackbar();
+
+    virtual int getPos() const = 0;
+    virtual void setPos(int pos) = 0;
+
+    virtual cv::Range getRange() const = 0;
+    virtual void setRange(const cv::Range& range) = 0;
+};  // UITrackbar
+
+
+class CV_EXPORTS UIBackend
+{
+public:
+    virtual ~UIBackend();
+
+    virtual void destroyAllWindows() = 0;
+
+    // namedWindow
+    virtual std::shared_ptr<UIWindow> createWindow(
+        const std::string& winname,
+        int flags
+    ) = 0;
+
+    virtual int waitKeyEx(int delay /*= 0*/) = 0;
+    virtual int pollKey() = 0;
+};
+
+std::shared_ptr<UIBackend>& getCurrentUIBackend();
+void setUIBackend(const std::shared_ptr<UIBackend>& api);
+bool setUIBackend(const std::string& backendName);
+
+#ifndef BUILD_PLUGIN
+
+#ifdef HAVE_GTK
+std::shared_ptr<UIBackend> createUIBackendGTK();
+#endif
+
+#if 0  // TODO: defined HAVE_QT
+std::shared_ptr<UIBackend> createUIBackendQT();
+#endif
+
+#endif  // BUILD_PLUGIN
+
+}  // namespace highgui_backend
+
+}  // namespace cv
+
+#endif // OPENCV_HIGHGUI_BACKEND_HPP
diff --git a/modules/highgui/src/factory.hpp b/modules/highgui/src/factory.hpp
new file mode 100644
index 0000000000..c40358bb20
--- /dev/null
+++ b/modules/highgui/src/factory.hpp
@@ -0,0 +1,48 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UI_FACTORY_HPP
+#define OPENCV_UI_FACTORY_HPP
+
+#include "backend.hpp"
+
+namespace cv { namespace highgui_backend {
+
+class IUIBackendFactory
+{
+public:
+    virtual ~IUIBackendFactory() {}
+    virtual std::shared_ptr<cv::highgui_backend::UIBackend> create() const = 0;
+};
+
+
+class StaticBackendFactory CV_FINAL: public IUIBackendFactory
+{
+protected:
+    std::function<std::shared_ptr<cv::highgui_backend::UIBackend>(void)> create_fn_;
+
+public:
+    StaticBackendFactory(std::function<std::shared_ptr<cv::highgui_backend::UIBackend>(void)>&& create_fn)
+        : create_fn_()
+    {
+        create_fn_.swap(create_fn);  // take the callable out of the rvalue argument; initializing from the named parameter would copy it
+    }
+
+    ~StaticBackendFactory() CV_OVERRIDE {}
+
+    std::shared_ptr<cv::highgui_backend::UIBackend> create() const CV_OVERRIDE
+    {
+        return create_fn_();
+    }
+};
+
+//
+// PluginUIBackendFactory is implemented in plugin_wrapper
+//
+
+std::shared_ptr<IUIBackendFactory> createPluginUIBackendFactory(const std::string& baseName);
+
+}}  // namespace
+
+#endif  // OPENCV_UI_FACTORY_HPP
diff --git a/modules/highgui/src/plugin_api.hpp b/modules/highgui/src/plugin_api.hpp
new file mode 100644
index 0000000000..fb57b7593e
--- /dev/null
+++ b/modules/highgui/src/plugin_api.hpp
@@ -0,0 +1,72 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef UI_PLUGIN_API_HPP
+#define UI_PLUGIN_API_HPP
+
+#include <opencv2/core/cvdef.h>
+#include <opencv2/core/llapi/llapi.h>
+
+#include "backend.hpp"
+
+#if !defined(BUILD_PLUGIN)
+
+/// increased for backward-compatible changes, e.g. add new function
+/// Caller API <= Plugin API -> plugin is fully compatible
+/// Caller API > Plugin API -> plugin is not fully compatible, caller should use extra checks to use plugins with older API
+#define API_VERSION 0 // preview
+
+/// increased for incompatible changes, e.g. remove function argument
+/// Caller ABI == Plugin ABI -> plugin is compatible
+/// Caller ABI > Plugin ABI -> plugin is not compatible, caller should use shim code to use old ABI plugins (caller may know how lower ABI works, so it is possible)
+/// Caller ABI < Plugin ABI -> plugin can't be used (plugin should provide interface with lower ABI to handle that)
+#define ABI_VERSION 0 // preview
+
+#else // !defined(BUILD_PLUGIN)
+
+#if !defined(ABI_VERSION) || !defined(API_VERSION)
+#error "Plugin must define ABI_VERSION and API_VERSION before including plugin_api.hpp"
+#endif
+
+#endif // !defined(BUILD_PLUGIN)
+
+typedef cv::highgui_backend::UIBackend* CvPluginUIBackend;
+
+struct OpenCV_UI_Plugin_API_v0_0_api_entries
+{
+    /** @brief Get backend API instance
+
+    @param[out] handle pointer on backend API handle
+
+    @note API-CALL 1, API-Version == 0
+     */
+    CvResult (CV_API_CALL *getInstance)(CV_OUT CvPluginUIBackend* handle) CV_NOEXCEPT;
+}; // OpenCV_UI_Plugin_API_v0_0_api_entries
+
+typedef struct OpenCV_UI_Plugin_API_v0
+{
+    OpenCV_API_Header api_header;
+    struct OpenCV_UI_Plugin_API_v0_0_api_entries v0;
+} OpenCV_UI_Plugin_API_v0;
+
+#if ABI_VERSION == 0 && API_VERSION == 0
+typedef OpenCV_UI_Plugin_API_v0 OpenCV_UI_Plugin_API;
+#else
+#error "Not supported configuration: check ABI_VERSION/API_VERSION"
+#endif
+
+#ifdef BUILD_PLUGIN
+extern "C" {
+
+CV_PLUGIN_EXPORTS
+const OpenCV_UI_Plugin_API* CV_API_CALL opencv_ui_plugin_init_v0
+        (int requested_abi_version, int requested_api_version, void* reserved /*NULL*/) CV_NOEXCEPT;
+
+}  // extern "C"
+#else  // BUILD_PLUGIN
+typedef const OpenCV_UI_Plugin_API* (CV_API_CALL *FN_opencv_ui_plugin_init_t)
+        (int requested_abi_version, int requested_api_version, void* reserved /*NULL*/);
+#endif  // BUILD_PLUGIN
+
+#endif // UI_PLUGIN_API_HPP
diff --git a/modules/highgui/src/plugin_wrapper.impl.hpp b/modules/highgui/src/plugin_wrapper.impl.hpp
new file mode 100644
index 0000000000..3fa2cfa11a
--- /dev/null
+++ b/modules/highgui/src/plugin_wrapper.impl.hpp
@@ -0,0 +1,284 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+//
+// Not a standalone header, part of backend.cpp
+//
+
+//==================================================================================================
+// Dynamic backend implementation
+
+#include "opencv2/core/utils/plugin_loader.private.hpp"
+
+namespace cv { namespace impl {
+
+using namespace cv::highgui_backend;
+
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT && defined(ENABLE_PLUGINS)
+
+using namespace cv::plugin::impl;  // plugin_loader.hpp
+
+class PluginUIBackend CV_FINAL: public std::enable_shared_from_this<PluginUIBackend>
+{
+protected:
+    void initPluginAPI()
+    {
+        const char* init_name = "opencv_ui_plugin_init_v0";
+        FN_opencv_ui_plugin_init_t fn_init = reinterpret_cast<FN_opencv_ui_plugin_init_t>(lib_->getSymbol(init_name));
+        if (fn_init)
+        {
+            CV_LOG_DEBUG(NULL, "Found entry: '" << init_name << "'");
+            for (int supported_api_version = API_VERSION; supported_api_version >= 0; supported_api_version--)
+            {
+                plugin_api_ = fn_init(ABI_VERSION, supported_api_version, NULL);
+                if (plugin_api_)
+                    break;
+            }
+            if (!plugin_api_)
+            {
+                CV_LOG_INFO(NULL, "UI: plugin is incompatible (can't be initialized): " << lib_->getName());
+                return;
+            }
+            // NB: force strict minor version check (ABI is not preserved for now)
+            if (!checkCompatibility(plugin_api_->api_header, ABI_VERSION, API_VERSION, true))
+            {
+                plugin_api_ = NULL;
+                return;
+            }
+            CV_LOG_INFO(NULL, "UI: plugin is ready to use '" << plugin_api_->api_header.api_description << "'");
+        }
+        else
+        {
+            CV_LOG_INFO(NULL, "UI: plugin is incompatible, missing init function: '" << init_name << "', file: " << lib_->getName());
+        }
+    }
+
+
+    bool checkCompatibility(const OpenCV_API_Header& api_header, unsigned int abi_version, unsigned int api_version, bool checkMinorOpenCVVersion)
+    {  // Returns true when the plugin header matches this build: same OpenCV major (and, if requested, minor) version and same ABI; an API level difference is only logged.
+        if (api_header.opencv_version_major != CV_VERSION_MAJOR)
+        {
+            CV_LOG_ERROR(NULL, "UI: wrong OpenCV major version used by plugin '" << api_header.api_description << "': " <<
+                cv::format("%d.%d, OpenCV version is '" CV_VERSION "'", api_header.opencv_version_major, api_header.opencv_version_minor));
+            return false;
+        }
+        if (!checkMinorOpenCVVersion)
+        {
+            // no checks for OpenCV minor version
+        }
+        else if (api_header.opencv_version_minor != CV_VERSION_MINOR)
+        {
+            CV_LOG_ERROR(NULL, "UI: wrong OpenCV minor version used by plugin '" << api_header.api_description << "': " <<
+                cv::format("%d.%d, OpenCV version is '" CV_VERSION "'", api_header.opencv_version_major, api_header.opencv_version_minor));
+            return false;
+        }
+        CV_LOG_DEBUG(NULL, "UI: initialized '" << api_header.api_description << "': built with "
+            << cv::format("OpenCV %d.%d (ABI/API = %d/%d)",
+                 api_header.opencv_version_major, api_header.opencv_version_minor,
+                 api_header.min_api_version, api_header.api_version)
+            << ", current OpenCV version is '" CV_VERSION "' (ABI/API = " << abi_version << "/" << api_version << ")"
+        );
+        if (api_header.min_api_version != abi_version)  // future: range can be here
+        {
+            // actually this should never happen due to checks in plugin's init() function
+            CV_LOG_ERROR(NULL, "UI: plugin is not supported due to incompatible ABI = " << api_header.min_api_version);
+            return false;
+        }
+        if (api_header.api_version != api_version)
+        {
+            CV_LOG_INFO(NULL, "UI: NOTE: plugin is supported, but there is API version mismatch: "
+                << cv::format("plugin API level (%d) != OpenCV API level (%d)", api_header.api_version, api_version));
+            if (api_header.api_version < api_version)
+            {
+                CV_LOG_INFO(NULL, "UI: NOTE: some functionality may be unavailable due to lack of support by plugin implementation");
+            }
+        }
+        return true;
+    }
+
+public:
+    std::shared_ptr<cv::plugin::impl::DynamicLib> lib_;
+    const OpenCV_UI_Plugin_API* plugin_api_;
+
+    PluginUIBackend(const std::shared_ptr<cv::plugin::impl::DynamicLib>& lib)
+        : lib_(lib)
+        , plugin_api_(NULL)
+    {
+        initPluginAPI();
+    }
+
+    std::shared_ptr<cv::highgui_backend::UIBackend> create() const
+    {
+        CV_Assert(plugin_api_);
+
+        CvPluginUIBackend instancePtr = NULL;
+
+        if (plugin_api_->v0.getInstance)
+        {
+            if (CV_ERROR_OK == plugin_api_->v0.getInstance(&instancePtr))
+            {
+                CV_Assert(instancePtr);
+                // TODO C++20 "aliasing constructor"
+                return std::shared_ptr<cv::highgui_backend::UIBackend>(instancePtr, [](cv::highgui_backend::UIBackend*){});  // empty deleter
+            }
+        }
+        return std::shared_ptr<cv::highgui_backend::UIBackend>();
+    }
+};
+
+
+class PluginUIBackendFactory CV_FINAL: public IUIBackendFactory
+{
+public:
+    std::string baseName_;
+    std::shared_ptr<PluginUIBackend> backend;
+    bool initialized;
+public:
+    PluginUIBackendFactory(const std::string& baseName)
+        : baseName_(baseName)
+        , initialized(false)
+    {
+        // nothing, plugins are loaded on demand
+    }
+
+    std::shared_ptr<cv::highgui_backend::UIBackend> create() const CV_OVERRIDE
+    {  // Loads the plugin on first use, then asks it for a backend instance; returns an empty pointer when no plugin was loaded.
+        if (!initialized)  // NOTE(review): read without holding the mutex that initBackend() takes — confirm callers serialize access
+        {
+            const_cast<PluginUIBackendFactory*>(this)->initBackend();  // lazy loading mutates members, hence the cast inside this const method
+        }
+        if (backend)
+            return backend->create();
+        return std::shared_ptr<cv::highgui_backend::UIBackend>();
+    }
+protected:
+    void initBackend()
+    {
+        AutoLock lock(getInitializationMutex());
+        try
+        {
+            if (!initialized)
+                loadPlugin();
+        }
+        catch (...)
+        {
+            CV_LOG_INFO(NULL, "UI: exception during plugin loading: " << baseName_ << ". SKIP");
+        }
+        initialized = true;
+    }
+    void loadPlugin();
+};
+
+static
+std::vector<FileSystemPath_t> getPluginCandidates(const std::string& baseName)
+{  // Builds the list of candidate plugin library paths for backend `baseName`: explicit file names on Windows, glob matches of the plugin pattern in each search directory elsewhere.
+    using namespace cv::utils;
+    using namespace cv::utils::fs;
+    const std::string baseName_l = toLowerCase(baseName);
+    const std::string baseName_u = toUpperCase(baseName);
+    // Search directories: OPENCV_CORE_PLUGIN_PATH if set, otherwise the parent of the location reported by getBinLocation().
+    std::vector<FileSystemPath_t> paths;
+    // TODO OPENCV_PLUGIN_PATH
+    const std::vector<std::string> paths_ = getConfigurationParameterPaths("OPENCV_CORE_PLUGIN_PATH", std::vector<std::string>());
+    if (paths_.size() != 0)
+    {
+        for (size_t i = 0; i < paths_.size(); i++)
+        {
+            paths.push_back(toFileSystemPath(paths_[i]));
+        }
+    }
+    else
+    {
+        FileSystemPath_t binaryLocation;
+        if (getBinLocation(binaryLocation))
+        {
+            binaryLocation = getParent(binaryLocation);
+#ifndef CV_UI_PLUGIN_SUBDIRECTORY
+            paths.push_back(binaryLocation);
+#else
+            paths.push_back(binaryLocation + toFileSystemPath("/") + toFileSystemPath(CV_UI_PLUGIN_SUBDIRECTORY_STR));
+#endif
+        }
+    }
+    const std::string default_expr = libraryPrefix() + "opencv_highgui_" + baseName_l + "*" + librarySuffix();
+    const std::string plugin_expr = getConfigurationParameterString((std::string("OPENCV_UI_PLUGIN_") + baseName_u).c_str(), default_expr.c_str());
+    std::vector<FileSystemPath_t> results;
+#ifdef _WIN32
+    FileSystemPath_t moduleName = toFileSystemPath(libraryPrefix() + "opencv_highgui_" + baseName_l + librarySuffix());
+    if (plugin_expr != default_expr)
+    {
+        moduleName = toFileSystemPath(plugin_expr);
+        results.push_back(moduleName);
+    }
+    for (const FileSystemPath_t& path : paths)
+    {
+        results.push_back(path + L"\\" + moduleName);
+    }
+    results.push_back(moduleName);
+#else
+    CV_LOG_DEBUG(NULL, "UI: " << baseName << " plugin's glob is '" << plugin_expr << "', " << paths.size() << " location(s)");
+    for (const std::string& path : paths)
+    {
+        if (path.empty())
+            continue;
+        std::vector<std::string> candidates;
+        cv::glob(utils::fs::join(path, plugin_expr), candidates);
+        CV_LOG_DEBUG(NULL, "    - " << path << ": " << candidates.size());
+        copy(candidates.begin(), candidates.end(), back_inserter(results));
+    }
+#endif
+    CV_LOG_DEBUG(NULL, "Found " << results.size() << " plugin(s) for " << baseName);
+    return results;
+}
+
+void PluginUIBackendFactory::loadPlugin()
+{  // Tries each candidate library in order and keeps the first one that loads and exposes a compatible plugin API; leaves `backend` untouched if none qualifies.
+    for (const FileSystemPath_t& plugin : getPluginCandidates(baseName_))
+    {
+        auto lib = std::make_shared<cv::plugin::impl::DynamicLib>(plugin);
+        if (!lib->isLoaded())
+        {
+            continue;
+        }
+        try
+        {
+            auto pluginBackend = std::make_shared<PluginUIBackend>(lib);
+            if (!pluginBackend)  // defensive only: std::make_shared either returns a non-null pointer or throws
+            {
+                continue;
+            }
+            if (pluginBackend->plugin_api_ == NULL)
+            {
+                CV_LOG_ERROR(NULL, "UI: no compatible plugin API for backend: " << baseName_ << " in " << toPrintablePath(plugin));
+                continue;
+            }
+            // NB: we are going to use UI backend, so prevent automatic library unloading
+            lib->disableAutomaticLibraryUnloading();
+            backend = pluginBackend;
+            return;
+        }
+        catch (...)  // a failing candidate is logged and the next one is tried
+        {
+            CV_LOG_WARNING(NULL, "UI: exception during plugin initialization: " << toPrintablePath(plugin) << ". SKIP");
+        }
+    }
+}
+
+#endif  // OPENCV_HAVE_FILESYSTEM_SUPPORT && defined(ENABLE_PLUGINS)
+
+}  // namespace
+
+namespace highgui_backend {
+
+std::shared_ptr<IUIBackendFactory> createPluginUIBackendFactory(const std::string& baseName)
+{
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT && defined(ENABLE_PLUGINS)
+    return std::make_shared<impl::PluginUIBackendFactory>(baseName);
+#else
+    CV_UNUSED(baseName);
+    return std::shared_ptr<IUIBackendFactory>();
+#endif
+}
+
+}}  // namespace
diff --git a/modules/highgui/src/precomp.hpp b/modules/highgui/src/precomp.hpp
index a5b176c9dd..275cc556ae 100644
--- a/modules/highgui/src/precomp.hpp
+++ b/modules/highgui/src/precomp.hpp
@@ -42,10 +42,16 @@
 #ifndef __HIGHGUI_H_
 #define __HIGHGUI_H_
 
+#if defined(__OPENCV_BUILD) && defined(BUILD_PLUGIN)
+#undef __OPENCV_BUILD  // allow public API only
+#endif
+
 #include "opencv2/highgui.hpp"
 
 #include "opencv2/core/utility.hpp"
+#if defined(__OPENCV_BUILD)
 #include "opencv2/core/private.hpp"
+#endif
 
 #include "opencv2/imgproc.hpp"
 #include "opencv2/imgproc/imgproc_c.h"
@@ -169,4 +175,11 @@ inline void convertToShow(const cv::Mat &src, const CvMat* arr, bool toRGB = tru
 }
 
 
+namespace cv {
+
+CV_EXPORTS Mutex& getWindowMutex();
+static inline Mutex& getInitializationMutex() { return getWindowMutex(); }
+
+}  // namespace
+
 #endif /* __HIGHGUI_H_ */
diff --git a/modules/highgui/src/registry.hpp b/modules/highgui/src/registry.hpp
new file mode 100644
index 0000000000..77c1234f05
--- /dev/null
+++ b/modules/highgui/src/registry.hpp
@@ -0,0 +1,25 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_HIGHGUI_REGISTRY_HPP
+#define OPENCV_HIGHGUI_REGISTRY_HPP
+
+#include "factory.hpp"
+
+namespace cv { namespace highgui_backend {
+
+struct BackendInfo
+{
+    int priority;     // 1000-<index*10> - default builtin priority
+                      // 0 - disabled (OPENCV_UI_PRIORITY_<name> = 0)
+                      // >10000 - prioritized list (OPENCV_UI_PRIORITY_LIST)
+    std::string name;
+    std::shared_ptr<IUIBackendFactory> backendFactory;
+};
+
+const std::vector<BackendInfo>& getBackendsInfo();
+
+}} // namespace
+
+#endif // OPENCV_HIGHGUI_REGISTRY_HPP
diff --git a/modules/highgui/src/registry.impl.hpp b/modules/highgui/src/registry.impl.hpp
new file mode 100644
index 0000000000..a2e4dbea47
--- /dev/null
+++ b/modules/highgui/src/registry.impl.hpp
@@ -0,0 +1,183 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+//
+// Not a standalone header, part of backend.cpp
+//
+
+#include "opencv2/core/utils/filesystem.private.hpp"  // OPENCV_HAVE_FILESYSTEM_SUPPORT
+
+namespace cv { namespace highgui_backend {
+
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT && defined(ENABLE_PLUGINS)
+#define DECLARE_DYNAMIC_BACKEND(name) \
+BackendInfo { \
+    1000, name, createPluginUIBackendFactory(name) \
+},
+#else
+#define DECLARE_DYNAMIC_BACKEND(name) /* nothing */
+#endif
+
+#define DECLARE_STATIC_BACKEND(name, createBackendAPI) \
+BackendInfo { \
+    1000, name, std::make_shared<cv::highgui_backend::StaticBackendFactory>([=] () -> std::shared_ptr<cv::highgui_backend::UIBackend> { return createBackendAPI(); }) \
+},
+
+// Returns the compile-time list of known UI backends; list order defines the default priority order.
+static std::vector<BackendInfo>& getBuiltinBackendsInfo()
+{
+    static std::vector<BackendInfo> g_backends
+    {
+#ifdef HAVE_GTK
+        DECLARE_STATIC_BACKEND("GTK", createUIBackendGTK)
+#if defined(HAVE_GTK3)
+        DECLARE_STATIC_BACKEND("GTK3", createUIBackendGTK)
+#elif defined(HAVE_GTK2)
+        DECLARE_STATIC_BACKEND("GTK2", createUIBackendGTK)
+#else
+#warning "HAVE_GTK definition issue. Register new GTK backend"
+#endif
+#elif defined(ENABLE_PLUGINS)
+        DECLARE_DYNAMIC_BACKEND("GTK")
+        DECLARE_DYNAMIC_BACKEND("GTK3")
+        DECLARE_DYNAMIC_BACKEND("GTK2")
+#endif
+
+#if 0  // TODO
+#ifdef HAVE_QT
+        DECLARE_STATIC_BACKEND("QT", createUIBackendQT)
+#elif defined(ENABLE_PLUGINS)
+        DECLARE_DYNAMIC_BACKEND("QT")
+#endif
+#endif
+    };
+    return g_backends;
+}
+
+// Ordering predicate for std::sort(): entries with the higher priority come first.
+static bool sortByPriority(const BackendInfo &lhs, const BackendInfo &rhs)
+{
+    return rhs.priority < lhs.priority;
+}
+
+/** @brief Manages list of enabled backends
+ */
+class UIBackendRegistry
+{
+protected:
+    std::vector<BackendInfo> enabledBackends;
+    // Builds the list: builtin defaults, then OPENCV_UI_PRIORITY_LIST, then OPENCV_UI_PRIORITY_<name> overrides, then sort.
+    UIBackendRegistry()
+    {
+        enabledBackends = getBuiltinBackendsInfo();
+        int N = (int)enabledBackends.size();
+        for (int i = 0; i < N; i++)
+        {
+            BackendInfo& info = enabledBackends[i];
+            info.priority = 1000 - i * 10;
+        }
+        CV_LOG_DEBUG(NULL, "UI: Builtin backends(" << N << "): " << dumpBackends());
+        if (readPrioritySettings())
+        {
+            CV_LOG_INFO(NULL, "UI: Updated backends priorities: " << dumpBackends());
+            N = (int)enabledBackends.size();  // the priority list may have appended plugin backends
+        }
+        int enabled = 0;
+        for (int i = 0; i < N; i++)
+        {
+            BackendInfo& info = enabledBackends[enabled];  // in-place compaction: next slot to keep
+            if (enabled != i)
+                info = enabledBackends[i];
+            size_t param_priority = utils::getConfigurationParameterSizeT(cv::format("OPENCV_UI_PRIORITY_%s", info.name.c_str()).c_str(), (size_t)info.priority);
+            CV_Assert(param_priority == (size_t)(int)param_priority); // overflow check
+            if (param_priority > 0)
+            {
+                info.priority = (int)param_priority;
+                enabled++;
+            }
+            else
+            {
+                CV_LOG_INFO(NULL, "UI: Disable backend: " << info.name);
+            }
+        }
+        enabledBackends.resize(enabled);
+        CV_LOG_DEBUG(NULL, "UI: Available backends(" << enabled << "): " << dumpBackends());
+        std::sort(enabledBackends.begin(), enabledBackends.end(), sortByPriority);
+        CV_LOG_INFO(NULL, "UI: Enabled backends(" << enabled << ", sorted by priority): " << (enabledBackends.empty() ? std::string("N/A") : dumpBackends()));
+    }
+
+    // Splits 'input' at every 'token' character; empty pieces are kept in the result.
+    static std::vector<std::string> tokenize_string(const std::string& input, char token)
+    {
+        std::vector<std::string> result;
+        std::string::size_type prev_pos = 0, pos = 0;
+        while((pos = input.find(token, pos)) != std::string::npos)
+        {
+            result.push_back(input.substr(prev_pos, pos-prev_pos));
+            prev_pos = ++pos;
+        }
+        result.push_back(input.substr(prev_pos));
+        return result;
+    }
+    // Applies the comma-separated OPENCV_UI_PRIORITY_LIST; returns true when enabledBackends was modified.
+    bool readPrioritySettings()
+    {
+        bool hasChanges = false;
+        cv::String prioritized_backends = utils::getConfigurationParameterString("OPENCV_UI_PRIORITY_LIST", NULL);
+        if (prioritized_backends.empty())
+            return hasChanges;
+        CV_LOG_INFO(NULL, "UI: Configured priority list (OPENCV_UI_PRIORITY_LIST): " << prioritized_backends);
+        const std::vector<std::string> names = tokenize_string(prioritized_backends, ',');
+        for (size_t i = 0; i < names.size(); i++)
+        {
+            const std::string& name = names[i];
+            if (name.empty())
+                continue;  // empty token (e.g. trailing or doubled comma): never register a nameless plugin backend
+            int priority = (int)(100000 + (names.size() - i) * 1000);  // earlier list entries rank higher
+            auto it = std::find_if(enabledBackends.begin(), enabledBackends.end(),
+                                   [&name](const BackendInfo& info) { return info.name == name; });
+            if (it != enabledBackends.end())
+            {
+                it->priority = priority;
+                CV_LOG_DEBUG(NULL, "UI: New backend priority: '" << name << "' => " << it->priority);
+            }
+            else
+            {
+                CV_LOG_INFO(NULL, "UI: Adding backend (plugin): '" << name << "'");
+                enabledBackends.push_back(BackendInfo{priority, name, createPluginUIBackendFactory(name)});
+            }
+            hasChanges = true;  // both branches above modify enabledBackends
+        }
+        return hasChanges;
+    }
+public:
+    // Formats the current list as "name(priority); name(priority); ..." for log messages.
+    std::string dumpBackends() const
+    {
+        std::ostringstream os;
+        for (size_t i = 0; i < enabledBackends.size(); i++)
+        {
+            if (i > 0) os << "; ";
+            const BackendInfo& info = enabledBackends[i];
+            os << info.name << '(' << info.priority << ')';
+        }
+        return os.str();
+    }
+
+    static UIBackendRegistry& getInstance()
+    {
+        static UIBackendRegistry g_instance;
+        return g_instance;
+    }
+
+    inline const std::vector<BackendInfo>& getEnabledBackends() const { return enabledBackends; }
+};
+
+
+const std::vector<BackendInfo>& getBackendsInfo()
+{
+    return UIBackendRegistry::getInstance().getEnabledBackends();  // registry singleton is built on first use
+}
+
+}} // namespace
diff --git a/modules/highgui/src/window.cpp b/modules/highgui/src/window.cpp
index d2cf1e1e48..7a4532e1b8 100644
--- a/modules/highgui/src/window.cpp
+++ b/modules/highgui/src/window.cpp
@@ -40,20 +40,165 @@
 //M*/
 
 #include "precomp.hpp"
-#include <map>
+#include "backend.hpp"
+
 #include "opencv2/core/opengl.hpp"
 #include "opencv2/core/utils/logger.hpp"
 
 // in later times, use this file as a dispatcher to implementations like cvcap.cpp
 
+
+using namespace cv;
+using namespace cv::highgui_backend;
+
+namespace cv {
+
+Mutex& getWindowMutex()
+{
+    static Mutex& windowMutex = *new Mutex();  // allocated once and never deleted
+    return windowMutex;
+}
+
+namespace impl {
+
+using WindowsMap_t = std::map<std::string, highgui_backend::UIWindowBase::Ptr>;  // name/ID -> UI object
+static WindowsMap_t& getWindowsMap()
+{
+    static WindowsMap_t registeredUIObjects;  // constructed on the first call
+    return registeredUIObjects;
+}
+
+// Returns the active UIWindow registered under 'name', or an empty pointer.
+// An entry whose UI object reports !isActive() is erased from the map here.
+// An empty pointer is also returned when the entry is not a UIWindow
+// (the dynamic cast fails).
+static std::shared_ptr<UIWindow> findWindow_(const std::string& name)
+{
+    cv::AutoLock lock(cv::getWindowMutex());
+    auto& windowsMap = getWindowsMap();
+    const auto entry = windowsMap.find(name);
+    if (entry == windowsMap.end() || !entry->second)
+        return std::shared_ptr<UIWindow>();  // unknown name or empty slot
+
+    const auto& uiObject = entry->second;
+    if (!uiObject->isActive())
+    {
+        windowsMap.erase(entry);  // 'uiObject' must not be used after this point
+        return std::shared_ptr<UIWindow>();
+    }
+
+    return std::dynamic_pointer_cast<UIWindow>(uiObject);
+}
+
+static void cleanupTrackbarCallbacksWithData_();  // forward declaration
+
+// Removes map entries that are empty or whose UI object is no longer active.
+// Runs under the window mutex.
+static void cleanupClosedWindows_()
+{
+    cv::AutoLock lock(cv::getWindowMutex());
+    auto& windowsMap = getWindowsMap();
+    auto it = windowsMap.begin();
+    while (it != windowsMap.end())
+    {
+        const auto& uiObject = it->second;
+        const bool stillAlive = uiObject && uiObject->isActive();
+        if (stillAlive)
+            ++it;
+        else
+            it = windowsMap.erase(it);  // erase() hands back the next valid iterator
+    }
+
+    // Also drop holders of deprecated trackbar callbacks whose trackbar is gone.
+    cleanupTrackbarCallbacksWithData_();
+}
+
+// Just to support deprecated API, to be removed
+struct TrackbarCallbackWithData
+{
+    std::weak_ptr<UITrackbar> trackbar_;  // assigned by the creator once the trackbar exists
+    int* data_;                           // optional user variable, updated on every change
+    TrackbarCallback callback_;           // optional user callback
+    void* userdata_;                      // forwarded to callback_ unchanged
+
+    TrackbarCallbackWithData(int* data, TrackbarCallback callback, void* userdata)
+        : data_(data)
+        , callback_(callback), userdata_(userdata)
+    {
+        // trackbar_ is initialized separately
+    }
+
+    ~TrackbarCallbackWithData()
+    {
+        CV_LOG_DEBUG(NULL, "UI/Trackbar: Cleanup deprecated TrackbarCallbackWithData");
+    }
+
+    void onChange(int pos)
+    {
+        if (data_ != NULL)
+            *data_ = pos;
+        if (callback_ != NULL)
+            callback_(pos, userdata_);
+    }
+
+    static void onChangeCallback(int pos, void* userdata)
+    {
+        TrackbarCallbackWithData* thiz = static_cast<TrackbarCallbackWithData*>(userdata);
+        CV_Assert(thiz);
+        thiz->onChange(pos);
+    }
+};
+
+using TrackbarCallbacksWithData_t = std::vector< std::shared_ptr<TrackbarCallbackWithData> >;
+static TrackbarCallbacksWithData_t& getTrackbarCallbacksWithData()
+{
+    static TrackbarCallbacksWithData_t callbackHolders;  // constructed on the first call
+    return callbackHolders;
+}
+
+// Releases callback holders that are empty or whose trackbar_ weak pointer has expired.
+// Runs under the window mutex.
+static void cleanupTrackbarCallbacksWithData_()
+{
+    cv::AutoLock lock(cv::getWindowMutex());
+    auto& holders = getTrackbarCallbacksWithData();
+    auto it = holders.begin();
+    while (it != holders.end())
+    {
+        const auto& holder = *it;
+        const bool inUse = holder && !holder->trackbar_.expired();
+        if (inUse)
+            ++it;
+        else
+            it = holders.erase(it);  // erase() hands back the next valid iterator
+    }
+}
+
+
+}}  // namespace cv::impl
+
+using namespace cv::impl;
+
 CV_IMPL void cvSetWindowProperty(const char* name, int prop_id, double prop_value)
 {
+    CV_TRACE_FUNCTION();
+    CV_Assert(name);
+
+    {
+        auto window = findWindow_(name);
+        if (window)
+        {
+            /*bool res = */window->setProperty(prop_id, prop_value);
+            return;
+        }
+    }
+
     switch(prop_id)
     {
     //change between fullscreen or not.
     case CV_WND_PROP_FULLSCREEN:
 
-        if (!name || (prop_value!=CV_WINDOW_NORMAL && prop_value!=CV_WINDOW_FULLSCREEN))//bad argument
+        if (prop_value != CV_WINDOW_NORMAL && prop_value != CV_WINDOW_FULLSCREEN)  // bad argument
             break;
 
         #if defined (HAVE_QT)
@@ -109,8 +254,19 @@ CV_IMPL void cvSetWindowProperty(const char* name, int prop_id, double prop_valu
 /* return -1 if error */
 CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)
 {
-    if (!name)
-        return -1;
+    CV_TRACE_FUNCTION();
+    CV_Assert(name);
+
+    {
+        auto window = findWindow_(name);
+        if (window)
+        {
+            double v = window->getProperty(prop_id);
+            if (cvIsNaN(v))
+                return -1;
+            return v;
+        }
+    }
 
     switch(prop_id)
     {
@@ -209,9 +365,18 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)
 
 cv::Rect cvGetWindowImageRect(const char* name)
 {
+    CV_TRACE_FUNCTION();
     if (!name)
         return cv::Rect(-1, -1, -1, -1);
 
+    {
+        auto window = findWindow_(name);
+        if (window)
+        {
+            return window->getImageRect();
+        }
+    }
+
     #if defined (HAVE_QT)
         return cvGetWindowRect_QT(name);
     #elif defined(HAVE_WIN32UI)
@@ -234,24 +399,90 @@ cv::Rect cv::getWindowImageRect(const String& winname)
 void cv::namedWindow( const String& winname, int flags )
 {
     CV_TRACE_FUNCTION();
+    CV_Assert(!winname.empty());
+
+    {  // new UI backend path; the legacy call below is used only when no backend is available
+        cv::AutoLock lock(cv::getWindowMutex());
+        cleanupClosedWindows_();
+        auto& windowsMap = getWindowsMap();
+        auto i = windowsMap.find(winname);
+        if (i != windowsMap.end())
+        {
+            auto ui_base = i->second;
+            if (ui_base)
+            {
+                auto window = std::dynamic_pointer_cast<UIWindow>(ui_base);
+                if (!window)  // the name is already taken by a UI object that is not a window
+                {
+                    CV_LOG_ERROR(NULL, "OpenCV/UI: Can't create window: '" << winname << "'");
+                }
+                return;  // an entry with this name already exists: nothing is created
+            }
+        }
+        auto backend = getCurrentUIBackend();
+        if (backend)
+        {
+            auto window = backend->createWindow(winname, flags);
+            if (!window)
+            {
+                CV_LOG_ERROR(NULL, "OpenCV/UI: Can't create window: '" << winname << "'");
+                return;
+            }
+            windowsMap.emplace(winname, window);
+            return;
+        }
+    }
+
     cvNamedWindow( winname.c_str(), flags );
 }
 
 void cv::destroyWindow( const String& winname )
 {
     CV_TRACE_FUNCTION();
+
+    {  // prefer a window registered through the UI backend API
+        auto window = findWindow_(winname);
+        if (window)
+        {
+            window->destroy();
+            cleanupClosedWindows_();  // drops map entries that are no longer active
+            return;
+        }
+    }
+
     cvDestroyWindow( winname.c_str() );
 }
 
 void cv::destroyAllWindows()
 {
     CV_TRACE_FUNCTION();
+
+    {  // when a UI backend is available it handles the request and the legacy call is skipped
+        cv::AutoLock lock(cv::getWindowMutex());
+        auto backend = getCurrentUIBackend();
+        if (backend)
+        {
+            backend->destroyAllWindows();
+            cleanupClosedWindows_();  // drops map entries that are no longer active
+            return;
+        }
+    }
+
     cvDestroyAllWindows();
 }
 
 void cv::resizeWindow( const String& winname, int width, int height )
 {
     CV_TRACE_FUNCTION();
+
+    {  // prefer a window registered through the UI backend API
+        auto window = findWindow_(winname);
+        if (window)
+        {
+            return window->resize(width, height);
+        }
+    }
+
     cvResizeWindow( winname.c_str(), width, height );
 }
 
@@ -264,6 +495,15 @@ void cv::resizeWindow(const String& winname, const cv::Size& size)
 void cv::moveWindow( const String& winname, int x, int y )
 {
     CV_TRACE_FUNCTION();
+
+    {  // prefer a window registered through the UI backend API
+        auto window = findWindow_(winname);
+        if (window)
+        {
+            return window->move(x, y);
+        }
+    }
+
     cvMoveWindow( winname.c_str(), x, y );
 }
 
@@ -282,6 +522,16 @@ double cv::getWindowProperty(const String& winname, int prop_id)
 int cv::waitKeyEx(int delay)
 {
     CV_TRACE_FUNCTION();
+
+    {
+        cv::AutoLock lock(cv::getWindowMutex());  // NOTE(review): the lock is held for the whole backend->waitKeyEx(delay) call
+        auto backend = getCurrentUIBackend();
+        if (backend)
+        {
+            return backend->waitKeyEx(delay);
+        }
+    }
+
     return cvWaitKey(delay);
 }
 
@@ -308,6 +558,16 @@ int cv::waitKey(int delay)
 int cv::pollKey()
 {
     CV_TRACE_FUNCTION();
+
+    {  // a UI backend, when available, provides its own pollKey()
+        cv::AutoLock lock(cv::getWindowMutex());
+        auto backend = getCurrentUIBackend();
+        if (backend)
+        {
+            return backend->pollKey();
+        }
+    }
+
     // fallback. please implement a proper polling function
     return cvWaitKey(1);
 }
@@ -318,6 +578,44 @@ int cv::createTrackbar(const String& trackbarName, const String& winName,
                    void* userdata)
 {
     CV_TRACE_FUNCTION();
+
+    CV_LOG_IF_WARNING(NULL, value, "UI/Trackbar(" << trackbarName << "@" << winName << "): Using 'value' pointer is unsafe and deprecated. Use NULL as value pointer. "
+            "To fetch trackbar value setup callback.");
+
+    {
+        cv::AutoLock lock(cv::getWindowMutex());
+        auto window = findWindow_(winName);
+        if (window)
+        {
+            if (value)
+            {
+                auto cb = std::make_shared<TrackbarCallbackWithData>(value, callback, userdata);
+                auto trackbar = window->createTrackbar(trackbarName, count, TrackbarCallbackWithData::onChangeCallback, cb.get());
+                if (!trackbar)
+                {
+                    CV_LOG_ERROR(NULL, "OpenCV/UI: Can't create trackbar: '" << trackbarName << "'@'" << winName << "'");
+                    return 0;
+                }
+                cb->trackbar_ = trackbar;
+                getTrackbarCallbacksWithData().emplace_back(cb);
+                getWindowsMap().emplace(trackbar->getID(), trackbar);
+                trackbar->setPos(*value);
+                return 1;
+            }
+            else
+            {
+                auto trackbar = window->createTrackbar(trackbarName, count, callback, userdata);
+                if (!trackbar)
+                {
+                    CV_LOG_ERROR(NULL, "OpenCV/UI: Can't create trackbar: '" << trackbarName << "'@'" << winName << "'");
+                    return 0;
+                }
+                getWindowsMap().emplace(trackbar->getID(), trackbar);
+                return 1;
+            }
+        }
+    }
+
     return cvCreateTrackbar2(trackbarName.c_str(), winName.c_str(),
                              value, count, callback, userdata);
 }
@@ -325,30 +623,92 @@ int cv::createTrackbar(const String& trackbarName, const String& winName,
 void cv::setTrackbarPos( const String& trackbarName, const String& winName, int value )
 {
     CV_TRACE_FUNCTION();
+
+    {  // prefer a window registered through the UI backend API
+        cv::AutoLock lock(cv::getWindowMutex());
+        auto window = findWindow_(winName);
+        if (window)
+        {
+            auto trackbar = window->findTrackbar(trackbarName);
+            CV_Assert(trackbar);  // raises an error when findTrackbar() returned an empty result
+            return trackbar->setPos(value);
+        }
+    }
+
     cvSetTrackbarPos(trackbarName.c_str(), winName.c_str(), value );
 }
 
 void cv::setTrackbarMax(const String& trackbarName, const String& winName, int maxval)
 {
     CV_TRACE_FUNCTION();
+
+    {  // prefer a window registered through the UI backend API
+        cv::AutoLock lock(cv::getWindowMutex());
+        auto window = findWindow_(winName);
+        if (window)
+        {
+            auto trackbar = window->findTrackbar(trackbarName);
+            CV_Assert(trackbar);  // raises an error when findTrackbar() returned an empty result
+            Range old_range = trackbar->getRange();
+            Range range(std::min(old_range.start, maxval), maxval);  // lower the start too if it would exceed the new maximum
+            return trackbar->setRange(range);
+        }
+    }
+
     cvSetTrackbarMax(trackbarName.c_str(), winName.c_str(), maxval);
 }
 
 void cv::setTrackbarMin(const String& trackbarName, const String& winName, int minval)
 {
     CV_TRACE_FUNCTION();
+
+    {  // prefer a window registered through the UI backend API
+        cv::AutoLock lock(cv::getWindowMutex());
+        auto window = findWindow_(winName);
+        if (window)
+        {
+            auto trackbar = window->findTrackbar(trackbarName);
+            CV_Assert(trackbar);  // raises an error when findTrackbar() returned an empty result
+            Range old_range = trackbar->getRange();
+            Range range(minval, std::max(minval, old_range.end));  // raise the end too if it would fall below the new minimum
+            return trackbar->setRange(range);
+        }
+    }
+
     cvSetTrackbarMin(trackbarName.c_str(), winName.c_str(), minval);
 }
 
 int cv::getTrackbarPos( const String& trackbarName, const String& winName )
 {
     CV_TRACE_FUNCTION();
+
+    {  // prefer a window registered through the UI backend API
+        cv::AutoLock lock(cv::getWindowMutex());
+        auto window = findWindow_(winName);
+        if (window)
+        {
+            auto trackbar = window->findTrackbar(trackbarName);
+            CV_Assert(trackbar);  // raises an error when findTrackbar() returned an empty result
+            return trackbar->getPos();
+        }
+    }
+
     return cvGetTrackbarPos(trackbarName.c_str(), winName.c_str());
 }
 
 void cv::setMouseCallback( const String& windowName, MouseCallback onMouse, void* param)
 {
     CV_TRACE_FUNCTION();
+
+    {  // prefer a window registered through the UI backend API
+        cv::AutoLock lock(cv::getWindowMutex());
+        auto window = findWindow_(windowName);
+        if (window)
+        {
+            return window->setMouseCallback(onMouse, param);
+        }
+    }
+
     cvSetMouseCallback(windowName.c_str(), onMouse, param);
 }
 
@@ -403,6 +763,39 @@ namespace
 void cv::imshow( const String& winname, InputArray _img )
 {
     CV_TRACE_FUNCTION();
+
+    {  // new UI backend path: show in an existing backend window, or create one on demand
+        cv::AutoLock lock(cv::getWindowMutex());
+        cleanupClosedWindows_();
+        auto& windowsMap = getWindowsMap();
+        auto i = windowsMap.find(winname);
+        if (i != windowsMap.end())
+        {
+            auto ui_base = i->second;
+            if (ui_base)
+            {
+                auto window = std::dynamic_pointer_cast<UIWindow>(ui_base);
+                if (window)
+                    return window->imshow(_img);
+                // The name belongs to a UI object that is not a window: report it instead of dereferencing a null pointer.
+                CV_LOG_ERROR(NULL, "OpenCV/UI: invalid window name: '" << winname << "'");
+                return;
+            }
+        }
+        auto backend = getCurrentUIBackend();
+        if (backend)
+        {
+            auto window = backend->createWindow(winname, WINDOW_NORMAL);
+            if (!window)
+            {
+                CV_LOG_ERROR(NULL, "OpenCV/UI: Can't create window: '" << winname << "'");
+                return;
+            }
+            windowsMap.emplace(winname, window);
+            return window->imshow(_img);
+        }
+    }
+
     const Size size = _img.size();
 #ifndef HAVE_OPENGL
     CV_Assert(size.width>0 && size.height>0);
diff --git a/modules/highgui/src/window_QT.cpp b/modules/highgui/src/window_QT.cpp
index 1600bd917f..60d7d69a59 100644
--- a/modules/highgui/src/window_QT.cpp
+++ b/modules/highgui/src/window_QT.cpp
@@ -1219,9 +1219,6 @@ void GuiReceiver::addSlider2(QString bar_name, QString window_name, void* value,
     if (t) //trackbar exists
         return;
 
-    if (!value)
-        CV_Error(CV_StsNullPtr, "NULL value pointer" );
-
     if (count <= 0) //count is the max value of the slider, so must be bigger than 0
         CV_Error(CV_StsNullPtr, "Max value of the slider must be bigger than 0" );
 
@@ -1342,7 +1339,8 @@ void CvTrackbar::create(CvWindow* arg, QString name, int* value, int _count)
     slider->setMinimum(0);
     slider->setMaximum(_count);
     slider->setPageStep(5);
-    slider->setValue(*value);
+    if (dataSlider)
+        slider->setValue(*dataSlider);
     slider->setTickPosition(QSlider::TicksBelow);
 
 
@@ -1409,7 +1407,8 @@ void CvTrackbar::update(int myvalue)
 {
     setLabel(myvalue);
 
-    *dataSlider = myvalue;
+    if (dataSlider)
+        *dataSlider = myvalue;
     if (callback)
     {
         callback(myvalue);
diff --git a/modules/highgui/src/window_QT.h b/modules/highgui/src/window_QT.h
index dbeacf2edf..398f3869f8 100644
--- a/modules/highgui/src/window_QT.h
+++ b/modules/highgui/src/window_QT.h
@@ -256,7 +256,7 @@ private:
     QPointer<QPushButton > label;
     CvTrackbarCallback callback;
     CvTrackbarCallback2 callback2;//look like it is use by python binding
-    int* dataSlider;
+    int* dataSlider;  // deprecated
     void* userdata;
 };
 
diff --git a/modules/highgui/src/window_gtk.cpp b/modules/highgui/src/window_gtk.cpp
index 073b340443..f2d02d66b0 100644
--- a/modules/highgui/src/window_gtk.cpp
+++ b/modules/highgui/src/window_gtk.cpp
@@ -40,8 +40,7 @@
 //M*/
 
 #include "precomp.hpp"
-
-#ifndef _WIN32
+#include "backend.hpp"
 
 #if defined (HAVE_GTK)
 
@@ -104,9 +103,6 @@ struct _CvImageWidgetClass
 /** Allocate new image viewer widget */
 GtkWidget*     cvImageWidgetNew      (int flags);
 
-/** Set the image to display in the widget */
-void           cvImageWidgetSetImage(CvImageWidget * widget, const CvArr *arr);
-
 // standard GTK object macros
 #define CV_IMAGE_WIDGET(obj)          G_TYPE_CHECK_INSTANCE_CAST (obj, cvImageWidget_get_type (), CvImageWidget)
 #define CV_IMAGE_WIDGET_CLASS(klass)  GTK_CHECK_CLASS_CAST (klass, cvImageWidget_get_type (), CvImageWidgetClass)
@@ -122,7 +118,10 @@ static GtkWidgetClass * parent_class = NULL;
 // flag to help size initial window
 #define CV_WINDOW_NO_IMAGE 2
 
-void cvImageWidgetSetImage(CvImageWidget * widget, const CvArr *arr){
+/** Set the image to display in the widget */
+static
+void cvImageWidgetSetImage(CvImageWidget * widget, const CvArr *arr)
+{
     CvMat * mat, stub;
     int origin=0;
 
@@ -156,6 +155,7 @@ cvImageWidgetNew (int flags)
   CvImageWidget *image_widget;
 
   image_widget = CV_IMAGE_WIDGET( gtk_widget_new (cvImageWidget_get_type (), NULL) );
+  CV_Assert(image_widget && "GTK widget creation is failed. Ensure that there is no GTK2/GTK3 libraries conflict");
   image_widget->original_image = 0;
   image_widget->scaled_image = 0;
   image_widget->flags = flags | CV_WINDOW_NO_IMAGE;
@@ -522,12 +522,13 @@ struct CvUIBase {
 
 struct CvTrackbar : CvUIBase
 {
-    CvTrackbar(const char* trackbar_name) :
+    CvTrackbar(const std::string& trackbar_name) :
         CvUIBase(CV_TRACKBAR_MAGIC_VAL),
         widget(NULL), name(trackbar_name),
         parent(NULL), data(NULL),
         pos(0), maxval(0), minval(0),
-        notify(NULL), notify2(NULL), userdata(NULL)
+        notify(NULL), notify2(NULL),  // deprecated
+        onChangeCallback(NULL), userdata(NULL)
     {
         // nothing
     }
@@ -538,20 +539,21 @@ struct CvTrackbar : CvUIBase
 
     GtkWidget* widget;
     std::string name;
-    CvWindow* parent;
+    CvWindow* parent;  // TODO weak_ptr
     int* data;
     int pos;
     int maxval;
     int minval;
-    CvTrackbarCallback notify;
-    CvTrackbarCallback2 notify2;
+    CvTrackbarCallback notify;  // deprecated
+    CvTrackbarCallback2 notify2;  // deprecated
+    TrackbarCallback onChangeCallback;
     void* userdata;
 };
 
 
 struct CvWindow : CvUIBase
 {
-    CvWindow(const char* window_name) :
+    CvWindow(const std::string& window_name) :
         CvUIBase(CV_WINDOW_MAGIC_VAL),
         widget(NULL), frame(NULL), paned(NULL), name(window_name),
         last_key(0), flags(0), status(0),
@@ -560,9 +562,10 @@ struct CvWindow : CvUIBase
         ,useGl(false), glDrawCallback(NULL), glDrawData(NULL)
 #endif
     {
-        // nothing
+        CV_LOG_INFO(NULL, "OpenCV/UI: creating GTK window: " << window_name);
     }
     ~CvWindow();
+    void destroy();
 
     GtkWidget* widget;
     GtkWidget* frame;
@@ -576,7 +579,7 @@ struct CvWindow : CvUIBase
     CvMouseCallback on_mouse;
     void* on_mouse_param;
 
-    std::vector< Ptr<CvTrackbar> > trackbars;
+    std::vector< std::shared_ptr<CvTrackbar> > trackbars;
 
 #ifdef HAVE_OPENGL
     bool useGl;
@@ -600,14 +603,14 @@ GCond*				   cond_have_key = NULL;
 GThread*			   window_thread = NULL;
 #endif
 
-static cv::Mutex& getWindowMutex()
-{
-    static cv::Mutex* g_window_mutex = new cv::Mutex();
-    return *g_window_mutex;
-}
-
 static int             last_key = -1;
-static std::vector< Ptr<CvWindow> > g_windows;
+
+// List of CvWindow objects that icvFindWindowByName() and icvWindowByWidget() search.
+static std::vector< std::shared_ptr<CvWindow> >& getGTKWindows()
+{
+    static std::vector< std::shared_ptr<CvWindow> > gtkWindows;
+    return gtkWindows;
+}
 
 CV_IMPL int cvInitSystem( int argc, char** argv )
 {
@@ -700,19 +703,32 @@ gpointer icvWindowThreadLoop(gpointer /*data*/)
 
 #define CV_LOCK_MUTEX() cv::AutoLock lock(getWindowMutex())
 
-static CvWindow* icvFindWindowByName( const char* name )
+static
+std::shared_ptr<CvWindow> icvFindWindowByName(const std::string& name)  // linear search by window name
 {
+    auto& g_windows = getGTKWindows();
     for(size_t i = 0; i < g_windows.size(); ++i)
     {
-        CvWindow* window = g_windows[i].get();
+        auto window = g_windows[i];
+        if (!window)
+            continue;  // skip empty slots
         if (window->name == name)
             return window;
     }
-    return NULL;
+    return std::shared_ptr<CvWindow>();  // empty pointer: no window with this name
 }
 
+// C-string convenience overload; a NULL name is rejected by the assertion.
+static inline std::shared_ptr<CvWindow> icvFindWindowByName(const char* name)
+{
+    CV_Assert(name);
+    return icvFindWindowByName(std::string(name));
+}
+
+
 static CvWindow* icvWindowByWidget( GtkWidget* widget )
 {
+    auto& g_windows = getGTKWindows();
     for (size_t i = 0; i < g_windows.size(); ++i)
     {
         CvWindow* window = g_windows[i].get();
@@ -722,20 +738,29 @@ static CvWindow* icvWindowByWidget( GtkWidget* widget )
     return NULL;
 }
 
+static Rect getImageRect_(const std::shared_ptr<CvWindow>& window);
+
 CvRect cvGetWindowRect_GTK(const char* name)
 {
     CV_Assert(name && "NULL name string");
 
     CV_LOCK_MUTEX();
-    CvWindow* window = icvFindWindowByName(name);
+    const auto window = icvFindWindowByName(name);
     if (!window)
         CV_Error( CV_StsNullPtr, "NULL window" );
 
+    return cvRect(getImageRect_(window));  // geometry is computed by the shared helper getImageRect_()
+}
+
+static Rect getImageRect_(const std::shared_ptr<CvWindow>& window)
+{
+    CV_Assert(window);
+
     gint wx, wy;
 #ifdef HAVE_OPENGL
     if (window->useGl) {
         gtk_widget_translate_coordinates(window->widget, gtk_widget_get_toplevel(window->widget), 0, 0, &wx, &wy);
-        return cvRect(wx, wy, window->widget->allocation.width, window->widget->allocation.height);
+        return Rect(wx, wy, window->widget->allocation.width, window->widget->allocation.height);
     }
 #endif
 
@@ -743,23 +768,23 @@ CvRect cvGetWindowRect_GTK(const char* name)
     gtk_widget_translate_coordinates(&image_widget->widget, gtk_widget_get_toplevel(&image_widget->widget), 0, 0, &wx, &wy);
     if (image_widget->scaled_image) {
 #if defined (GTK_VERSION3)
-      return cvRect(wx, wy, MIN(image_widget->scaled_image->cols, gtk_widget_get_allocated_width(window->widget)),
+      return Rect(wx, wy, MIN(image_widget->scaled_image->cols, gtk_widget_get_allocated_width(window->widget)),
           MIN(image_widget->scaled_image->rows, gtk_widget_get_allocated_height(window->widget)));
 #else
-      return cvRect(wx, wy, MIN(image_widget->scaled_image->cols, window->widget->allocation.width),
+      return Rect(wx, wy, MIN(image_widget->scaled_image->cols, window->widget->allocation.width),
           MIN(image_widget->scaled_image->rows, window->widget->allocation.height));
 #endif //GTK_VERSION3
     } else if (image_widget->original_image) {
 #if defined (GTK_VERSION3)
-      return cvRect(wx, wy, MIN(image_widget->original_image->cols, gtk_widget_get_allocated_width(window->widget)),
+      return Rect(wx, wy, MIN(image_widget->original_image->cols, gtk_widget_get_allocated_width(window->widget)),
           MIN(image_widget->original_image->rows, gtk_widget_get_allocated_height(window->widget)));
 #else
-      return cvRect(wx, wy, MIN(image_widget->original_image->cols, window->widget->allocation.width),
+      return Rect(wx, wy, MIN(image_widget->original_image->cols, window->widget->allocation.width),
           MIN(image_widget->original_image->rows, window->widget->allocation.height));
 #endif //GTK_VERSION3
     }
 
-    return cvRect(-1, -1, -1, -1);
+    return Rect(-1, -1, -1, -1);
 }
 
 double cvGetModeWindow_GTK(const char* name)//YV
@@ -767,7 +792,7 @@ double cvGetModeWindow_GTK(const char* name)//YV
     CV_Assert(name && "NULL name string");
 
     CV_LOCK_MUTEX();
-    CvWindow* window = icvFindWindowByName(name);
+    const auto window = icvFindWindowByName(name);
     if (!window)
         CV_Error( CV_StsNullPtr, "NULL window" );
 
@@ -775,42 +800,52 @@ double cvGetModeWindow_GTK(const char* name)//YV
     return result;
 }
 
-
+static bool setModeWindow_(const std::shared_ptr<CvWindow>& window, int mode);
 void cvSetModeWindow_GTK( const char* name, double prop_value)//Yannick Verdie
 {
     CV_Assert(name && "NULL name string");
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(name);
-    if( !window )
+    const auto window = icvFindWindowByName(name);
+    if (!window)
         CV_Error( CV_StsNullPtr, "NULL window" );
 
-    if(window->flags & CV_WINDOW_AUTOSIZE)//if the flag CV_WINDOW_AUTOSIZE is set
-        return;
+    setModeWindow_(window, (int)prop_value);  // the bool result of the helper is not used here
+}
+
+static bool setModeWindow_(const std::shared_ptr<CvWindow>& window, int mode)  // returns true when window->status equals 'mode' on exit
+{
+    if (window->flags & CV_WINDOW_AUTOSIZE) // windows with the CV_WINDOW_AUTOSIZE flag are left untouched
+        return false;
 
     //so easy to do fullscreen here, Linux rocks !
 
-    if (window->status==CV_WINDOW_FULLSCREEN && prop_value==CV_WINDOW_NORMAL)
+    if (window->status == mode)
+        return true;  // already in the requested mode
+
+    if (window->status==CV_WINDOW_FULLSCREEN && mode==CV_WINDOW_NORMAL)
     {
         gtk_window_unfullscreen(GTK_WINDOW(window->frame));
         window->status=CV_WINDOW_NORMAL;
-        return;
+        return true;
     }
 
-    if (window->status==CV_WINDOW_NORMAL && prop_value==CV_WINDOW_FULLSCREEN)
+    if (window->status==CV_WINDOW_NORMAL && mode==CV_WINDOW_FULLSCREEN)
     {
         gtk_window_fullscreen(GTK_WINDOW(window->frame));
         window->status=CV_WINDOW_FULLSCREEN;
-        return;
+        return true;
     }
+
+    return false;  // none of the transitions above matched
 }
 
 void cv::setWindowTitle(const String& winname, const String& title)
 {
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(winname.c_str());
+    auto window = icvFindWindowByName(winname.c_str());
 
     if (!window)
     {
@@ -828,7 +863,7 @@ double cvGetPropWindowAutoSize_GTK(const char* name)
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(name);
+    const auto window = icvFindWindowByName(name);
     if (!window)
         return -1; // keep silence here
 
@@ -836,16 +871,22 @@ double cvGetPropWindowAutoSize_GTK(const char* name)
     return result;
 }
 
+static double getRatioWindow_(const std::shared_ptr<CvWindow>& window);
 double cvGetRatioWindow_GTK(const char* name)
 {
     CV_Assert(name && "NULL name string");
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(name);
+    const auto window = icvFindWindowByName(name);
     if (!window)
         return -1; // keep silence here
 
+    return getRatioWindow_(window);
+}
+
+static double getRatioWindow_(const std::shared_ptr<CvWindow>& window)
+{
 #if defined (GTK_VERSION3)
     double result = static_cast<double>(
         gtk_widget_get_allocated_width(window->widget)) / gtk_widget_get_allocated_height(window->widget);
@@ -862,7 +903,7 @@ double cvGetOpenGlProp_GTK(const char* name)
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(name);
+    const auto window = icvFindWindowByName(name);
     if (!window)
         return -1; // keep silence here
 
@@ -1048,6 +1089,7 @@ static gboolean cvImageWidget_expose(GtkWidget* widget, GdkEventExpose* event, g
 }
 #endif //GTK_VERSION3
 
+static std::shared_ptr<CvWindow> namedWindow_(const std::string& name, int flags);
 CV_IMPL int cvNamedWindow( const char* name, int flags )
 {
     cvInitSystem(name ? 1 : 0,(char**)&name);
@@ -1060,8 +1102,16 @@ CV_IMPL int cvNamedWindow( const char* name, int flags )
     {
         return 1;
     }
+    auto window = namedWindow_(name, flags);
+    return window ? 1 : 0;
+}
 
-    Ptr<CvWindow> window = makePtr<CvWindow>(name);
+static std::shared_ptr<CvWindow> namedWindow_(const std::string& name, int flags)
+{
+    cvInitSystem(0, NULL);
+
+    auto window_ptr = std::make_shared<CvWindow>(name);
+    CvWindow* window = window_ptr.get();
     window->flags = flags;
     window->status = CV_WINDOW_NORMAL;//YV
 
@@ -1116,9 +1166,12 @@ CV_IMPL int cvNamedWindow( const char* name, int flags )
 #endif //GTK_VERSION3_4
 
     gtk_widget_show( window->frame );
-    gtk_window_set_title( GTK_WINDOW(window->frame), name );
+    gtk_window_set_title(GTK_WINDOW(window->frame), name.c_str());
 
-    g_windows.push_back(window);
+    {
+        AutoLock lock(getWindowMutex());
+        getGTKWindows().push_back(window_ptr);
+    }
 
     bool b_nautosize = ((flags & CV_WINDOW_AUTOSIZE) == 0);
     gtk_window_set_resizable( GTK_WINDOW(window->frame), b_nautosize );
@@ -1137,7 +1190,7 @@ CV_IMPL int cvNamedWindow( const char* name, int flags )
         cvSetOpenGlContext(name);
 #endif
 
-    return 1;
+    return window_ptr;
 }
 
 
@@ -1203,13 +1256,21 @@ CV_IMPL void cvSetOpenGlDrawCallback(const char* name, CvOpenGlDrawCallback call
 
 CvWindow::~CvWindow()
 {
+    if (frame)
+        destroy();
+}
+
+inline void CvWindow::destroy()
+{
+    CV_LOG_INFO(NULL, "OpenCV/UI: destroying GTK window: " << name);
     gtk_widget_destroy(frame);
+    frame = nullptr;
 }
 
 static void checkLastWindow()
 {
     // if last window...
-    if (g_windows.empty())
+    if (getGTKWindows().empty())
     {
 #ifdef HAVE_GTHREAD
         if( thread_started )
@@ -1236,11 +1297,13 @@ static void checkLastWindow()
     }
 }
 
-static void icvDeleteWindow( CvWindow* window )
+static
+void icvDeleteWindow_( CvWindow* window )
 {
+    AutoLock lock(getWindowMutex());
+    auto& g_windows = getGTKWindows();
     bool found = false;
-    for (std::vector< Ptr<CvWindow> >::iterator i = g_windows.begin();
-         i != g_windows.end(); ++i)
+    for (auto i = g_windows.begin(); i != g_windows.end(); ++i)
     {
         if (i->get() == window)
         {
@@ -1249,8 +1312,7 @@ static void icvDeleteWindow( CvWindow* window )
             break;
         }
     }
-    CV_Assert(found && "Can't destroy non-registered window");
-
+    CV_LOG_IF_WARNING(NULL, !found, "OpenCV/GTK: Can't destroy non-registered window");
     checkLastWindow();
 }
 
@@ -1259,10 +1321,10 @@ CV_IMPL void cvDestroyWindow( const char* name )
     CV_Assert(name && "NULL name string");
 
     CV_LOCK_MUTEX();
+    auto& g_windows = getGTKWindows();
 
     bool found = false;
-    for (std::vector< Ptr<CvWindow> >::iterator i = g_windows.begin();
-         i != g_windows.end(); ++i)
+    for (auto i = g_windows.begin(); i != g_windows.end(); ++i)
     {
         if (i->get()->name == name)
         {
@@ -1271,7 +1333,7 @@ CV_IMPL void cvDestroyWindow( const char* name )
             break;
         }
     }
-    CV_Assert(found && "Can't destroy non-registered window");
+    CV_LOG_IF_ERROR(NULL, !found, "OpenCV/GTK: Can't destroy non-registered window: '" << name << "'");
 
     checkLastWindow();
 }
@@ -1282,7 +1344,7 @@ cvDestroyAllWindows( void )
 {
     CV_LOCK_MUTEX();
 
-    g_windows.clear();
+    getGTKWindows().clear();
     checkLastWindow();
 }
 
@@ -1305,7 +1367,7 @@ cvShowImage( const char* name, const CvArr* arr )
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(name);
+    auto window = icvFindWindowByName(name);
     if(!window)
     {
         cvNamedWindow(name, 1);
@@ -1328,16 +1390,24 @@ cvShowImage( const char* name, const CvArr* arr )
     }
 }
 
+static void resizeWindow_(const std::shared_ptr<CvWindow>& window, int width, int height);
 CV_IMPL void cvResizeWindow(const char* name, int width, int height )
 {
     CV_Assert(name && "NULL name string");
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(name);
+    auto window = icvFindWindowByName(name);
     if(!window)
         return;
 
+    return resizeWindow_(window, width, height);
+}
+
+static
+void resizeWindow_(const std::shared_ptr<CvWindow>& window, int width, int height)
+{
+    CV_Assert(window);
     CvImageWidget* image_widget = CV_IMAGE_WIDGET( window->widget );
     //if(image_widget->flags & CV_WINDOW_AUTOSIZE)
         //EXIT;
@@ -1357,26 +1427,30 @@ CV_IMPL void cvMoveWindow( const char* name, int x, int y )
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(name);
+    const auto window = icvFindWindowByName(name);
     if(!window)
         return;
 
     gtk_window_move( GTK_WINDOW(window->frame), x, y );
 }
 
-
-static CvTrackbar*
-icvFindTrackbarByName( const CvWindow* window, const char* name )
+static
+std::shared_ptr<CvTrackbar> icvFindTrackbarByName(const std::shared_ptr<CvWindow>& window, const std::string& name)
 {
-    for (size_t i = 0; i < window->trackbars.size(); ++i)
+    CV_Assert(window);
+    auto& trackbars = window->trackbars;
+    for(size_t i = 0; i < trackbars.size(); ++i)
     {
-        CvTrackbar* trackbar = window->trackbars[i].get();
+        auto trackbar = trackbars[i];
+        if (!trackbar)
+            continue;
         if (trackbar->name == name)
             return trackbar;
     }
-    return NULL;
+    return std::shared_ptr<CvTrackbar>();
 }
 
+
 static int
 icvCreateTrackbar( const char* trackbar_name, const char* window_name,
                    int* val, int count, CvTrackbarCallback on_notify,
@@ -1390,16 +1464,16 @@ icvCreateTrackbar( const char* trackbar_name, const char* window_name,
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(window_name);
+    const auto window = icvFindWindowByName(window_name);
     if(!window)
         return 0;
 
-    CvTrackbar* trackbar = icvFindTrackbarByName(window, trackbar_name);
-    if (!trackbar)
+    auto trackbar_ = icvFindTrackbarByName(window, trackbar_name);
+    if (!trackbar_)
     {
-        Ptr<CvTrackbar> trackbar_ = makePtr<CvTrackbar>(trackbar_name);
-        trackbar = trackbar_.get();
-        trackbar->parent = window;
+        trackbar_ = std::make_shared<CvTrackbar>(trackbar_name);
+        CvTrackbar* trackbar = trackbar_.get();
+        trackbar->parent = window.get();
         window->trackbars.push_back(trackbar_);
 
         GtkWidget* hscale_box = gtk_hbox_new( FALSE, 10 );
@@ -1418,6 +1492,8 @@ icvCreateTrackbar( const char* trackbar_name, const char* window_name,
         gtk_widget_show( hscale_box );
     }
 
+    CvTrackbar* trackbar = trackbar_.get(); CV_DbgAssert(trackbar);
+
     if( val )
     {
         int value = *val;
@@ -1444,7 +1520,6 @@ icvCreateTrackbar( const char* trackbar_name, const char* window_name,
     return 1;
 }
 
-
 CV_IMPL int
 cvCreateTrackbar( const char* trackbar_name, const char* window_name,
                   int* val, int count, CvTrackbarCallback on_notify )
@@ -1453,7 +1528,6 @@ cvCreateTrackbar( const char* trackbar_name, const char* window_name,
                              on_notify, 0, 0);
 }
 
-
 CV_IMPL int
 cvCreateTrackbar2( const char* trackbar_name, const char* window_name,
                    int* val, int count, CvTrackbarCallback2 on_notify2,
@@ -1463,6 +1537,52 @@ cvCreateTrackbar2( const char* trackbar_name, const char* window_name,
                              0, on_notify2, userdata);
 }
 
+static  // Builds a new GTK slider row (label + hscale) named `name`, registers it in `window`, returns its CvTrackbar.
+std::shared_ptr<CvTrackbar> createTrackbar_(
+    const std::shared_ptr<CvWindow>& window, const std::string& name,  // window must be non-null, name must be non-empty (asserted below)
+    int count,  // slider maximum; count <= 0 raises Error::StsOutOfRange
+    TrackbarCallback onChange, void* userdata  // stored on the trackbar; icvOnTrackbar calls onChange(pos, userdata) when it is set
+)
+{
+    CV_Assert(window);
+    CV_Assert(!name.empty());
+
+    if (count <= 0)
+        CV_Error(Error::StsOutOfRange, "Bad trackbar maximal value");
+
+    auto trackbar_ = std::make_shared<CvTrackbar>(name);  // NOTE(review): unlike icvCreateTrackbar, no lookup of an existing trackbar with this name
+    CvTrackbar* trackbar = trackbar_.get();
+    trackbar->parent = window.get();
+    window->trackbars.push_back(trackbar_);  // the window's list holds the owning reference
+
+    GtkWidget* hscale_box = gtk_hbox_new( FALSE, 10 );
+    GtkWidget* hscale_label = gtk_label_new(name.c_str());
+    GtkWidget* hscale = gtk_hscale_new_with_range( 0, count, 1 );  // slider over [0, count] with step 1
+    gtk_scale_set_digits( GTK_SCALE(hscale), 0 );
+    //gtk_scale_set_value_pos( hscale, GTK_POS_TOP );
+    gtk_scale_set_draw_value( GTK_SCALE(hscale), TRUE );
+
+    trackbar->widget = hscale;
+    gtk_box_pack_start( GTK_BOX(hscale_box), hscale_label, FALSE, FALSE, 5 );
+    gtk_widget_show( hscale_label );
+    gtk_box_pack_start( GTK_BOX(hscale_box), hscale, TRUE, TRUE, 5 );
+    gtk_widget_show( hscale );
+    gtk_box_pack_start( GTK_BOX(window->paned), hscale_box, FALSE, FALSE, 5 );  // the row is added to the window's paned container
+    gtk_widget_show( hscale_box );
+
+    trackbar->maxval = count;  // cached upper bound; setTrackbarPos_ clamps against minval/maxval
+    trackbar->onChangeCallback = onChange;
+    trackbar->userdata = userdata;
+    g_signal_connect(trackbar->widget, "value-changed",
+                     G_CALLBACK(icvOnTrackbar), trackbar);  // raw pointer as signal user data
+
+    // queue a widget resize to trigger a window resize to
+    // compensate for the addition of trackbars
+    gtk_widget_queue_resize(GTK_WIDGET(window->widget));
+
+    return trackbar_;
+}
+
 
 CV_IMPL void
 cvSetMouseCallback( const char* window_name, CvMouseCallback on_mouse, void* param )
@@ -1471,7 +1591,7 @@ cvSetMouseCallback( const char* window_name, CvMouseCallback on_mouse, void* par
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(window_name);
+    const auto window = icvFindWindowByName(window_name);
     if (!window)
         return;
 
@@ -1487,18 +1607,18 @@ CV_IMPL int cvGetTrackbarPos( const char* trackbar_name, const char* window_name
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(window_name);
+    const auto window = icvFindWindowByName(window_name);
     if (!window)
         return -1;
 
-    CvTrackbar* trackbar = icvFindTrackbarByName(window,trackbar_name);
+    const auto trackbar = icvFindTrackbarByName(window,trackbar_name);
     if (!trackbar)
         return -1;
 
     return trackbar->pos;
 }
 
-
+static void setTrackbarPos_(const std::shared_ptr<CvTrackbar>& trackbar, int pos);
 CV_IMPL void cvSetTrackbarPos( const char* trackbar_name, const char* window_name, int pos )
 {
     CV_Assert(window_name && "NULL window name");
@@ -1506,24 +1626,27 @@ CV_IMPL void cvSetTrackbarPos( const char* trackbar_name, const char* window_nam
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(window_name);
+    const auto window = icvFindWindowByName(window_name);
     if(!window)
         return;
 
-    CvTrackbar* trackbar = icvFindTrackbarByName(window,trackbar_name);
-    if( trackbar )
-    {
-        if( pos < trackbar->minval )
-            pos = trackbar->minval;
-
-        if( pos > trackbar->maxval )
-            pos = trackbar->maxval;
-    }
-    else
+    const auto trackbar = icvFindTrackbarByName(window, trackbar_name);
+    if (!trackbar)
     {
         CV_Error( CV_StsNullPtr, "No trackbar found" );
     }
 
+    return setTrackbarPos_(trackbar, pos);
+}
+
+static void setTrackbarPos_(const std::shared_ptr<CvTrackbar>& trackbar, int pos)
+{
+    CV_Assert(trackbar);
+    CV_CheckLE(trackbar->minval, trackbar->maxval, "");
+
+    pos = std::max(pos, trackbar->minval);
+    pos = std::min(pos, trackbar->maxval);
+
     gtk_range_set_value( GTK_RANGE(trackbar->widget), pos );
 }
 
@@ -1535,11 +1658,11 @@ CV_IMPL void cvSetTrackbarMax(const char* trackbar_name, const char* window_name
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(window_name);
+    const auto window = icvFindWindowByName(window_name);
     if(!window)
         return;
 
-    CvTrackbar* trackbar = icvFindTrackbarByName(window,trackbar_name);
+    const auto trackbar = icvFindTrackbarByName(window,trackbar_name);
     if(!trackbar)
         return;
 
@@ -1556,11 +1679,11 @@ CV_IMPL void cvSetTrackbarMin(const char* trackbar_name, const char* window_name
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(window_name);
+    const auto window = icvFindWindowByName(window_name);
     if(!window)
         return;
 
-    CvTrackbar* trackbar = icvFindTrackbarByName(window,trackbar_name);
+    const auto trackbar = icvFindTrackbarByName(window,trackbar_name);
     if(!trackbar)
         return;
 
@@ -1576,7 +1699,7 @@ CV_IMPL void* cvGetWindowHandle( const char* window_name )
 
     CV_LOCK_MUTEX();
 
-    CvWindow* window = icvFindWindowByName(window_name);
+    const auto window = icvFindWindowByName(window_name);
     if(!window)
         return NULL;
 
@@ -1747,6 +1870,10 @@ static void icvOnTrackbar( GtkWidget* widget, gpointer user_data )
         trackbar->widget == widget )
     {
         trackbar->pos = pos;
+        if (trackbar->onChangeCallback)
+            trackbar->onChangeCallback(pos, trackbar->userdata);
+
+        // deprecated
         if( trackbar->data )
             *trackbar->data = pos;
         if( trackbar->notify2 )
@@ -1762,7 +1889,14 @@ static gboolean icvOnClose( GtkWidget* widget, GdkEvent* /*event*/, gpointer use
     if( window->signature == CV_WINDOW_MAGIC_VAL &&
         window->frame == widget )
     {
-        icvDeleteWindow(window);
+        try
+        {
+            icvDeleteWindow_(window);
+        }
+        catch (...)
+        {
+            CV_LOG_WARNING(NULL, "OpenCV/GTK: unexpected C++ exception in icvDeleteWindow_");
+        }
     }
     return TRUE;
 }
@@ -1787,8 +1921,8 @@ static gboolean icvOnMouse( GtkWidget *widget, GdkEvent *event, gpointer user_da
         GdkEventMotion* event_motion = (GdkEventMotion*)event;
 
         cv_event = CV_EVENT_MOUSEMOVE;
-        pt32f.x = cvRound(event_motion->x);
-        pt32f.y = cvRound(event_motion->y);
+        pt32f.x = cvFloor(event_motion->x);
+        pt32f.y = cvFloor(event_motion->y);
         state = event_motion->state;
     }
     else if( event->type == GDK_BUTTON_PRESS ||
@@ -1796,8 +1930,8 @@ static gboolean icvOnMouse( GtkWidget *widget, GdkEvent *event, gpointer user_da
              event->type == GDK_2BUTTON_PRESS )
     {
         GdkEventButton* event_button = (GdkEventButton*)event;
-        pt32f.x = cvRound(event_button->x);
-        pt32f.y = cvRound(event_button->y);
+        pt32f.x = cvFloor(event_button->x);
+        pt32f.y = cvFloor(event_button->y);
 
 
         if( event_button->type == GDK_BUTTON_PRESS )
@@ -1874,8 +2008,8 @@ static gboolean icvOnMouse( GtkWidget *widget, GdkEvent *event, gpointer user_da
             pt = cvPointFrom32f( pt32f );
         }
 
-//        if((unsigned)pt.x < (unsigned)(image_widget->original_image->width) &&
-//           (unsigned)pt.y < (unsigned)(image_widget->original_image->height) )
+        if((unsigned)pt.x < (unsigned)(image_widget->original_image->width) &&
+           (unsigned)pt.y < (unsigned)(image_widget->original_image->height) )
         {
             flags |= BIT_MAP(state, GDK_SHIFT_MASK,   CV_EVENT_FLAG_SHIFTKEY) |
                 BIT_MAP(state, GDK_CONTROL_MASK, CV_EVENT_FLAG_CTRLKEY)  |
@@ -1916,7 +2050,7 @@ CV_IMPL int cvWaitKey( int delay )
             expired = !g_cond_timed_wait(cond_have_key, last_key_mutex, &timer);
         }
         else{
-            if (g_windows.empty())
+            if (getGTKWindows().empty())
             {
                 CV_LOG_WARNING(NULL, "cv::waitKey() is called without timeout and missing active windows. Ignoring");
             }
@@ -1928,7 +2062,8 @@ CV_IMPL int cvWaitKey( int delay )
         }
         my_last_key = last_key;
         g_mutex_unlock(last_key_mutex);
-        if(expired || g_windows.empty()){
+        if (expired || getGTKWindows().empty())
+        {
             return -1;
         }
         return my_last_key;
@@ -1941,7 +2076,7 @@ CV_IMPL int cvWaitKey( int delay )
         if( delay > 0 )
             timer = g_timeout_add( delay, icvAlarm, &expired );
         last_key = -1;
-        while( gtk_main_iteration_do(TRUE) && last_key < 0 && !expired && (delay > 0 || !g_windows.empty()))
+        while( gtk_main_iteration_do(TRUE) && last_key < 0 && !expired && (delay > 0 || !getGTKWindows().empty()))
             ;
 
         if( delay > 0 && !expired )
@@ -1950,8 +2085,335 @@ CV_IMPL int cvWaitKey( int delay )
     return last_key;
 }
 
+namespace cv { namespace impl {
+
+using namespace cv::highgui_backend;
+
+class GTKTrackbar;
+
+class GTKWindow  // UIWindow implementation that forwards to a legacy CvWindow
+        : public UIWindow
+        , public std::enable_shared_from_this<GTKWindow>
+{
+protected:
+    const std::string name_;  // window name, also returned by getID()
+    std::weak_ptr<CvWindow> window_;  // non-owning handle to the underlying CvWindow
+    std::map<std::string, std::shared_ptr<GTKTrackbar> > trackbars_;  // trackbar wrappers by name, guarded by getWindowMutex()
+public:
+    GTKWindow(const std::string& name, const std::shared_ptr<CvWindow>& window)
+        : name_(name)
+        , window_(window)
+    {
+        // nothing
+    }
+
+    ~GTKWindow() CV_OVERRIDE
+    {
+        if (!window_.expired())
+            destroy();
+        CV_LOG_DEBUG(NULL, "OpenCV/UI/GTK: GTKWindow(" << name_ << ") is disposed");
+    }
+
+    const std::string& getID() const CV_OVERRIDE { return name_; }
+
+    bool isActive() const CV_OVERRIDE { return !window_.expired(); }  // true while the CvWindow object still exists
+
+    void destroy() CV_OVERRIDE  // destroys the GTK widget of the CvWindow (if still alive) and drops the handle
+    {
+        cv::AutoLock lock(getWindowMutex());
+        if (!window_.expired())
+        {
+            auto window = window_.lock();
+            if (window)
+                window->destroy();
+            window_.reset();
+        }
+    }
+
+    void imshow(InputArray image) CV_OVERRIDE  // shows `image` in the window's image widget; asserts the window is alive
+    {
+        auto window = window_.lock();
+        CV_Assert(window);
+        CvImageWidget* image_widget = CV_IMAGE_WIDGET(window->widget);
+        CV_Assert(image_widget);
+        Mat img = image.getMat();
+        CvMat c_img = cvMat(img);  // TODO Drop C-API
+        cvImageWidgetSetImage(image_widget, &c_img);
+    }
+
+    double getProperty(int prop) const CV_OVERRIDE  // returns the property value, or NaN for a property not handled here
+    {
+        auto window = window_.lock();
+        CV_Assert(window);
+        // see cvGetWindowProperty
+        switch (prop)
+        {
+        case CV_WND_PROP_FULLSCREEN:
+            return (double)window->status;
+
+        case CV_WND_PROP_AUTOSIZE:
+            return (window->flags & CV_WINDOW_AUTOSIZE) ? 1.0 : 0.0;
+
+        case CV_WND_PROP_ASPECTRATIO:
+            return getRatioWindow_(window);
+
+#ifdef HAVE_OPENGL
+        case CV_WND_PROP_OPENGL:
+            return window->useGl ? 1.0 : 0.0;
+#endif
+
+        default:
+            break;
+        }
+        return std::numeric_limits<double>::quiet_NaN();
+    }
+
+    bool setProperty(int prop, double value) CV_OVERRIDE  // only CV_WND_PROP_FULLSCREEN is handled; everything else returns false
+    {
+        auto window = window_.lock();
+        CV_Assert(window);
+        // see cvSetWindowProperty
+        switch (prop)
+        {
+        case CV_WND_PROP_FULLSCREEN:
+            if (value != CV_WINDOW_NORMAL && value != CV_WINDOW_FULLSCREEN)  // bad arg
+                break;
+            setModeWindow_(window, value);  // NOTE(review): double is implicitly narrowed to int and the bool result is ignored
+            return true;
+
+        default:
+            break;
+        }
+        return false;
+    }
+
+    void resize(int width, int height) CV_OVERRIDE
+    {
+        auto window = window_.lock();
+        CV_Assert(window);
+        resizeWindow_(window, width, height);
+    }
+
+    void move(int x, int y) CV_OVERRIDE
+    {
+        auto window = window_.lock();
+        CV_Assert(window);
+        gtk_window_move(GTK_WINDOW(window->frame), x, y);
+    }
+
+    Rect getImageRect() const CV_OVERRIDE
+    {
+        auto window = window_.lock();
+        CV_Assert(window);
+        return getImageRect_(window);
+    }
+
+    void setTitle(const std::string& title) CV_OVERRIDE
+    {
+        auto window = window_.lock();
+        CV_Assert(window);
+        gtk_window_set_title(GTK_WINDOW(window->frame), title.c_str());
+    }
+
+    void setMouseCallback(MouseCallback onMouse, void* userdata /*= 0*/) CV_OVERRIDE  // replaces the stored callback and its parameter
+    {
+        auto window = window_.lock();
+        CV_Assert(window);
+        window->on_mouse = onMouse;
+        window->on_mouse_param = userdata;
+    }
+
+    std::shared_ptr<UITrackbar> createTrackbar(  // creates the GTK slider and returns a GTKTrackbar wrapper registered under `name`
+        const std::string& name,
+        int count,
+        TrackbarCallback onChange /*= 0*/,
+        void* userdata /*= 0*/
+    ) CV_OVERRIDE
+    {
+        auto window = window_.lock();
+        CV_Assert(window);
+        CV_LOG_INFO(NULL, "OpenCV/UI: Creating GTK trackbar at '" << name_ << "': '" << name << "'");
+        auto trackbar = createTrackbar_(window, name, count, onChange, userdata);
+        auto ui_trackbar = std::make_shared<GTKTrackbar>(name, trackbar, shared_from_this());  // requires *this to be owned by a shared_ptr
+        {
+            cv::AutoLock lock(getWindowMutex());
+            trackbars_.emplace(name, ui_trackbar);  // NOTE(review): emplace keeps an already registered entry with the same name
+        }
+        return std::static_pointer_cast<UITrackbar>(ui_trackbar);
+    }
+
+    std::shared_ptr<UITrackbar> findTrackbar(const std::string& name) CV_OVERRIDE  // returns an empty pointer when `name` is unknown
+    {
+        cv::AutoLock lock(getWindowMutex());
+        auto i = trackbars_.find(name);
+        if (i != trackbars_.end())
+        {
+            return std::static_pointer_cast<UITrackbar>(i->second);
+        }
+        return std::shared_ptr<UITrackbar>();
+    }
+};  // GTKWindow
+
+
+class GTKTrackbar : public UITrackbar  // UITrackbar implementation that forwards to a legacy CvTrackbar
+{
+protected:
+    /*const*/ std::string name_;  // "<trackbar>@window" identifier, built in the constructor
+    std::weak_ptr<CvTrackbar> trackbar_;  // non-owning handle; the owning reference lives in CvWindow::trackbars
+    std::weak_ptr<GTKWindow> parent_;
+public:
+    GTKTrackbar(const std::string& name, const std::shared_ptr<CvTrackbar>& trackbar, const std::shared_ptr<GTKWindow>& parent)
+        : trackbar_(trackbar)
+        , parent_(parent)
+    {
+        name_ = std::string("<") + name + ">@" + parent->getID();  // parent is dereferenced here, so it must be non-null
+    }
+
+    ~GTKTrackbar() CV_OVERRIDE
+    {
+        if (!trackbar_.expired())
+            destroy();
+        CV_LOG_DEBUG(NULL, "OpenCV/UI/GTK: GTKTrackbar(" << name_ << ") is disposed");
+    }
+
+    const std::string& getID() const CV_OVERRIDE { return name_; }
+
+    bool isActive() const CV_OVERRIDE { return !trackbar_.expired(); }  // true while the CvTrackbar object still exists
+
+    void destroy() CV_OVERRIDE
+    {
+        // nothing (destroyed with parent window, dedicated trackbar removal is not supported)
+    }
+
+    int getPos() const CV_OVERRIDE  // last position recorded on the CvTrackbar; asserts the trackbar is alive
+    {
+        auto trackbar = trackbar_.lock();
+        CV_Assert(trackbar);
+        return trackbar->pos;
+    }
+    void setPos(int pos) CV_OVERRIDE  // pos is clamped to [minval, maxval] by setTrackbarPos_
+    {
+        auto trackbar = trackbar_.lock();
+        CV_Assert(trackbar);
+        return setTrackbarPos_(trackbar, pos);
+    }
+
+    cv::Range getRange() const CV_OVERRIDE  // returns the cached [minval, maxval] bounds
+    {
+        auto trackbar = trackbar_.lock();
+        CV_Assert(trackbar);
+        return cv::Range(trackbar->minval, trackbar->maxval);
+    }
+
+    void setRange(const cv::Range& range) CV_OVERRIDE  // requires range.start <= range.end (CV_CheckLE raises otherwise)
+    {
+        auto trackbar = trackbar_.lock();
+        CV_Assert(trackbar);
+        CV_CheckLE(range.start, range.end, "Invalid trackbar range");
+        trackbar->minval = range.start; trackbar->maxval = range.end;  // keep cached bounds in sync: getRange() and setPos() clamping read them
+        gtk_range_set_range(GTK_RANGE(trackbar->widget), range.start, range.end);
+    }
+};  // GTKTrackbar
+
+
+class GTKBackendUI : public UIBackend  // UIBackend implementation that delegates to the legacy cv* GTK functions in this file
+{
+public:
+    ~GTKBackendUI() CV_OVERRIDE
+    {
+        destroyAllWindows();
+    }
+
+    void destroyAllWindows() CV_OVERRIDE
+    {
+        cvDestroyAllWindows();
+    }
+
+    // namedWindow: creates the CvWindow via namedWindow_ and wraps it in a GTKWindow
+    virtual std::shared_ptr<UIWindow> createWindow(
+        const std::string& winname,
+        int flags
+    ) CV_OVERRIDE
+    {
+        CV_LOG_INFO(NULL, "OpenCV/UI: Creating GTK window: " << winname << " (" << flags << ")");
+        auto window = namedWindow_(winname, flags);
+        auto ui_window = std::make_shared<GTKWindow>(winname, window);  // the wrapper keeps only a weak reference to `window`
+        return ui_window;
+    }
+
+    int waitKeyEx(int delay) CV_OVERRIDE
+    {
+        return cvWaitKey(delay);
+    }
+    int pollKey() CV_OVERRIDE
+    {
+        return cvWaitKey(1);  // TODO: approximates polling with a 1 ms wait rather than a true non-blocking check
+    }
+};  // GTKBackendUI
+
+static  // Returns the backend object shared by the built-in factory and the plugin entry point.
+std::shared_ptr<GTKBackendUI>& getInstance()
+{
+    static std::shared_ptr<GTKBackendUI> g_instance = std::make_shared<GTKBackendUI>();  // function-local static: constructed on the first call
+    return g_instance;
+}
+
+} // namespace impl
+
+#ifndef BUILD_PLUGIN
+namespace highgui_backend {
+
+std::shared_ptr<UIBackend> createUIBackendGTK()  // factory for non-plugin builds (compiled only when BUILD_PLUGIN is not defined)
+{
+    return impl::getInstance();  // every call hands out the same backend object
+}
+
+}  // namespace highgui_backend
+#endif
+
+}  // namespace
+
+#ifdef BUILD_PLUGIN
+
+#define ABI_VERSION 0
+#define API_VERSION 0
+#include "plugin_api.hpp"
+
+static
+CvResult cv_getInstance(CV_OUT CvPluginUIBackend* handle) CV_NOEXCEPT
+{
+    CvResult status = CV_ERROR_FAIL;  // pessimistic default: covers both a null handle and a thrown exception
+    try
+    {
+        if (handle)
+        {
+            *handle = cv::impl::getInstance().get();
+            status = CV_ERROR_OK;
+        }
+    }
+    catch (...) {}  // swallowed on purpose: the function is CV_NOEXCEPT and reports failure through the return code
+    return status;
+}
+
+static const OpenCV_UI_Plugin_API plugin_api =  // table handed to the loader by opencv_ui_plugin_init_v0
+{
+    {  // first group: table size, ABI/API versions, OpenCV version macros at build time, description string
+        sizeof(OpenCV_UI_Plugin_API), ABI_VERSION, API_VERSION,
+        CV_VERSION_MAJOR, CV_VERSION_MINOR, CV_VERSION_REVISION, CV_VERSION_STATUS,
+        "GTK" CVAUX_STR(GTK_MAJOR_VERSION) " OpenCV UI plugin"
+    },
+    {  // second group: entry points
+        /*  1*/cv_getInstance
+    }
+};
+
+const OpenCV_UI_Plugin_API* CV_API_CALL opencv_ui_plugin_init_v0(int requested_abi_version, int requested_api_version, void* /*reserved=NULL*/) CV_NOEXCEPT
+{
+    // Serve the API table only for an exact ABI match and a requested API version not newer than API_VERSION.
+    const bool compatible = (requested_abi_version == ABI_VERSION) && (requested_api_version <= API_VERSION);
+    return compatible ? &plugin_api : NULL;
+}
+
+#endif  // BUILD_PLUGIN
 
 #endif  // HAVE_GTK
-#endif  // _WIN32
-
-/* End of file. */
diff --git a/modules/highgui/src/window_winrt_bridge.cpp b/modules/highgui/src/window_winrt_bridge.cpp
index 13edbe5b83..6057f2d5b4 100644
--- a/modules/highgui/src/window_winrt_bridge.cpp
+++ b/modules/highgui/src/window_winrt_bridge.cpp
@@ -271,7 +271,7 @@ void CvWindow::createSlider(cv::String name, int* val, int count, CvTrackbarCall
         // Image control is loaded. See callback implementation in CvWindow ctor.
         slider->Width = sliderDefaultWidth;
     }
-    slider->Value = *val;
+    slider->Value = val ? *val : 0;
     slider->Maximum = count;
     slider->Visibility = Windows::UI::Xaml::Visibility::Visible;
     slider->Margin = Windows::UI::Xaml::ThicknessHelper::FromLengths(10, 10, 10, 0);
diff --git a/modules/highgui/test/test_gui.cpp b/modules/highgui/test/test_gui.cpp
index c973771e98..6bf634b500 100644
--- a/modules/highgui/test/test_gui.cpp
+++ b/modules/highgui/test/test_gui.cpp
@@ -47,13 +47,18 @@ namespace opencv_test { namespace {
 inline void verify_size(const std::string &nm, const cv::Mat &img)
 {
     EXPECT_NO_THROW(imshow(nm, img));
-    EXPECT_EQ(-1, waitKey(500));
+    EXPECT_EQ(-1, waitKey(200));
     Rect rc;
     EXPECT_NO_THROW(rc = getWindowImageRect(nm));
     EXPECT_EQ(rc.size(), img.size());
 }
 
-#if !defined HAVE_GTK && !defined HAVE_QT && !defined HAVE_WIN32UI && !defined HAVE_COCOA
+#if (!defined(ENABLE_PLUGINS) \
+        && !defined HAVE_GTK \
+        && !defined HAVE_QT \
+        && !defined HAVE_WIN32UI \
+        && !defined HAVE_COCOA \
+    )
 TEST(Highgui_GUI, DISABLED_regression)
 #else
 TEST(Highgui_GUI, regression)
@@ -126,11 +131,15 @@ static void Foo(int, void* counter)
     }
 }
 
-#if !defined HAVE_GTK && !defined HAVE_QT && !defined HAVE_WIN32UI
-// && !defined HAVE_COCOA - TODO: fails on Mac?
-TEST(Highgui_GUI, DISABLED_trackbar)
+#if (!defined(ENABLE_PLUGINS) \
+        && !defined HAVE_GTK \
+        && !defined HAVE_QT \
+        && !defined HAVE_WIN32UI \
+    ) \
+    || defined(__APPLE__)  // test fails on Mac (cocoa)
+TEST(Highgui_GUI, DISABLED_trackbar_unsafe)
 #else
-TEST(Highgui_GUI, trackbar)
+TEST(Highgui_GUI, trackbar_unsafe)
 #endif
 {
     int value = 50;
@@ -142,9 +151,52 @@ TEST(Highgui_GUI, trackbar)
     ASSERT_NO_THROW(namedWindow(window_name));
     EXPECT_EQ((int)1, createTrackbar(trackbar_name, window_name, &value, 100, Foo, &callback_count));
     EXPECT_EQ(value, getTrackbarPos(trackbar_name, window_name));
-    EXPECT_EQ(0, callback_count);
+    EXPECT_GE(callback_count, 0);
+    EXPECT_LE(callback_count, 1);
+    int callback_count_base = callback_count;
     EXPECT_NO_THROW(setTrackbarPos(trackbar_name, window_name, 90));
-    EXPECT_EQ(1, callback_count);
+    EXPECT_EQ(callback_count_base + 1, callback_count);
+    EXPECT_EQ(90, value);
+    EXPECT_EQ(90, getTrackbarPos(trackbar_name, window_name));
+    EXPECT_NO_THROW(destroyAllWindows());
+}
+
+static
+void testTrackbarCallback(int pos, void* param)
+{
+    CV_Assert(param);
+    int* const state = static_cast<int*>(param);  // the test passes int[2]: {last position, call counter}
+    state[0] = pos;
+    state[1] += 1;
+}
+
+#if (!defined(ENABLE_PLUGINS) \
+        && !defined HAVE_GTK \
+        && !defined HAVE_QT \
+        && !defined HAVE_WIN32UI \
+    ) \
+    || defined(__APPLE__)  // test fails on Mac (cocoa)
+TEST(Highgui_GUI, DISABLED_trackbar)
+#else
+TEST(Highgui_GUI, trackbar)
+#endif
+{
+    int status[2] = {-1, 0};  // pos, counter
+    const std::string window_name("trackbar_test_window");
+    const std::string trackbar_name("trackbar");
+
+    EXPECT_NO_THROW(destroyAllWindows());
+    ASSERT_NO_THROW(namedWindow(window_name));
+    EXPECT_EQ((int)1, createTrackbar(trackbar_name, window_name, NULL, 100, testTrackbarCallback, status));
+    EXPECT_EQ(0, getTrackbarPos(trackbar_name, window_name));
+    int callback_count = status[1];
+    EXPECT_GE(callback_count, 0);
+    EXPECT_LE(callback_count, 1);
+    int callback_count_base = callback_count;
+    EXPECT_NO_THROW(setTrackbarPos(trackbar_name, window_name, 90));
+    callback_count = status[1];
+    EXPECT_EQ(callback_count_base + 1, callback_count);
+    int value = status[0];
     EXPECT_EQ(90, value);
     EXPECT_EQ(90, getTrackbarPos(trackbar_name, window_name));
     EXPECT_NO_THROW(destroyAllWindows());
diff --git a/modules/imgcodecs/include/opencv2/imgcodecs.hpp b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
index 6a389fd471..d7ff9a178d 100644
--- a/modules/imgcodecs/include/opencv2/imgcodecs.hpp
+++ b/modules/imgcodecs/include/opencv2/imgcodecs.hpp
@@ -120,8 +120,8 @@ enum ImwriteEXRCompressionFlags {
        IMWRITE_EXR_COMPRESSION_PXR24 = 5, //!< lossy 24-bit float compression
        IMWRITE_EXR_COMPRESSION_B44   = 6, //!< lossy 4-by-4 pixel block compression, fixed compression rate
        IMWRITE_EXR_COMPRESSION_B44A  = 7, //!< lossy 4-by-4 pixel block compression, flat fields are compressed more
-       IMWRITE_EXR_COMPRESSION_DWAA  = 8, //!< lossy DCT based compression, in blocks of 32 scanlines. More efficient for partial buffer access.
-       IMWRITE_EXR_COMPRESSION_DWAB  = 9, //!< lossy DCT based compression, in blocks of 256 scanlines. More efficient space wise and faster to decode full frames than DWAA_COMPRESSION.
+       IMWRITE_EXR_COMPRESSION_DWAA  = 8, //!< lossy DCT based compression, in blocks of 32 scanlines. More efficient for partial buffer access. Supported since OpenEXR 2.2.0.
+       IMWRITE_EXR_COMPRESSION_DWAB  = 9, //!< lossy DCT based compression, in blocks of 256 scanlines. More efficient space wise and faster to decode full frames than DWAA_COMPRESSION. Supported since OpenEXR 2.2.0.
      };
 
 //! Imwrite PNG specific flags used to tune the compression algorithm.
@@ -215,6 +215,26 @@ The function imreadmulti loads a multi-page image from the specified file into a
 */
 CV_EXPORTS_W bool imreadmulti(const String& filename, CV_OUT std::vector<Mat>& mats, int flags = IMREAD_ANYCOLOR);
 
+/** @brief Loads a range of images of a multi-page image from a file.
+
+The function imreadmulti loads a specified range from a multi-page image from the specified file into a vector of Mat objects.
+@param filename Name of file to be loaded.
+@param start Zero-based index of the first image to load.
+@param count Number of images to load.
+@param flags Flag that can take values of cv::ImreadModes, default with cv::IMREAD_ANYCOLOR.
+@param mats A vector of Mat objects holding each page, if more than one.
+@sa cv::imread
+*/
+CV_EXPORTS_W bool imreadmulti(const String& filename, CV_OUT std::vector<Mat>& mats, int start, int count, int flags = IMREAD_ANYCOLOR);
+
+/** @brief Returns the number of images inside the given file.
+
+The function imcount returns the number of pages in a multi-page image, 1 for a single-page image, or 0 if no decoder is found or the file header cannot be read.
+@param filename Name of file to be loaded.
+@param flags Flag that can take values of cv::ImreadModes, default with cv::IMREAD_ANYCOLOR.
+*/
+CV_EXPORTS_W size_t imcount(const String& filename, int flags = IMREAD_ANYCOLOR);
+
 /** @brief Saves an image to a specified file.
 
 The function imwrite saves the image to the specified file. The image format is chosen based on the
@@ -231,6 +251,8 @@ can be saved using this function, with these exceptions:
 should have alpha set to 0, fully opaque pixels should have alpha set to 255/65535 (see the code sample below).
 - Multiple images (vector of Mat) can be saved in TIFF format (see the code sample below).
 
+If the image format is not supported, the image will be converted to 8-bit unsigned (CV_8U) and saved that way.
+
 If the format, depth or channel order is different, use
 Mat::convertTo and cv::cvtColor to convert it before saving. Or, use the universal FileStorage I/O
 functions to save the image to XML or YAML format.
diff --git a/modules/imgcodecs/src/grfmt_exr.cpp b/modules/imgcodecs/src/grfmt_exr.cpp
index 9242871f8d..f3368587f3 100644
--- a/modules/imgcodecs/src/grfmt_exr.cpp
+++ b/modules/imgcodecs/src/grfmt_exr.cpp
@@ -56,6 +56,7 @@
 #include <iostream>
 #include <stdexcept>
 
+#include <ImfFrameBuffer.h>
 #include <ImfHeader.h>
 #include <ImfInputFile.h>
 #include <ImfOutputFile.h>
@@ -63,6 +64,7 @@
 #include <ImfStandardAttributes.h>
 #include <half.h>
 #include "grfmt_exr.hpp"
+#include "OpenEXRConfig.h"
 
 #if defined _WIN32
 
@@ -156,6 +158,10 @@ bool  ExrDecoder::readHeader()
     else
     {
         m_green = channels.findChannel( "Y" );
+        if( !m_green )
+        {
+            m_green = channels.findChannel( "Z" ); // Distance of the front of a sample from the viewer
+        }
         if( m_green )
         {
             m_ischroma = true;
@@ -648,12 +654,14 @@ bool  ExrEncoder::write( const Mat& img, const std::vector<int>& params )
             case IMWRITE_EXR_COMPRESSION_B44A:
                 header.compression() = B44A_COMPRESSION;
                 break;
+#if ((OPENEXR_VERSION_MAJOR * 1000 + OPENEXR_VERSION_MINOR) >= (2 * 1000 + 2)) // available since version 2.2.0
             case IMWRITE_EXR_COMPRESSION_DWAA:
                 header.compression() = DWAA_COMPRESSION;
                 break;
             case IMWRITE_EXR_COMPRESSION_DWAB:
                 header.compression() = DWAB_COMPRESSION;
                 break;
+#endif
             default:
                 CV_Error(Error::StsBadArg, "IMWRITE_EXR_COMPRESSION is invalid or not supported");
             }
diff --git a/modules/imgcodecs/src/loadsave.cpp b/modules/imgcodecs/src/loadsave.cpp
index 350042cd7d..28d8ff285b 100644
--- a/modules/imgcodecs/src/loadsave.cpp
+++ b/modules/imgcodecs/src/loadsave.cpp
@@ -495,25 +495,19 @@ imread_( const String& filename, int flags, Mat& mat )
 }
 
 
-/**
-* Read an image into memory and return the information
-*
-* @param[in] filename File to load
-* @param[in] flags Flags
-* @param[in] mats Reference to C++ vector<Mat> object to hold the images
-*
-*/
 static bool
-imreadmulti_(const String& filename, int flags, std::vector<Mat>& mats)
+imreadmulti_(const String& filename, int flags, std::vector<Mat>& mats, int start, int count)
 {
     /// Search for the relevant decoder to handle the imagery
     ImageDecoder decoder;
 
+    CV_CheckGE(start, 0, "Start index cannot be < 0");
+
 #ifdef HAVE_GDAL
-    if (flags != IMREAD_UNCHANGED && (flags & IMREAD_LOAD_GDAL) == IMREAD_LOAD_GDAL){
+    if (flags != IMREAD_UNCHANGED && (flags & IMREAD_LOAD_GDAL) == IMREAD_LOAD_GDAL) {
         decoder = GdalDecoder().newDecoder();
     }
-    else{
+    else {
 #endif
         decoder = findDecoder(filename);
 #ifdef HAVE_GDAL
@@ -521,10 +515,14 @@ imreadmulti_(const String& filename, int flags, std::vector<Mat>& mats)
 #endif
 
     /// if no decoder was found, return nothing.
-    if (!decoder){
+    if (!decoder) {
         return 0;
     }
 
+    if (count < 0) {
+        count = std::numeric_limits<int>::max();
+    }
+
     /// set the filename in the driver
     decoder->setSource(filename);
 
@@ -532,7 +530,7 @@ imreadmulti_(const String& filename, int flags, std::vector<Mat>& mats)
     try
     {
         // read the header to make sure it succeeds
-        if( !decoder->readHeader() )
+        if (!decoder->readHeader())
             return 0;
     }
     catch (const cv::Exception& e)
@@ -546,11 +544,22 @@ imreadmulti_(const String& filename, int flags, std::vector<Mat>& mats)
         return 0;
     }
 
-    for (;;)
+    int current = start;
+
+    while (current > 0)
+    {
+        if (!decoder->nextPage())
+        {
+            return false;
+        }
+        --current;
+    }
+
+    while (current < count)
     {
         // grab the decoded type
         int type = decoder->type();
-        if( (flags & IMREAD_LOAD_GDAL) != IMREAD_LOAD_GDAL && flags != IMREAD_UNCHANGED )
+        if ((flags & IMREAD_LOAD_GDAL) != IMREAD_LOAD_GDAL && flags != IMREAD_UNCHANGED)
         {
             if ((flags & IMREAD_ANYDEPTH) == 0)
                 type = CV_MAKETYPE(CV_8U, CV_MAT_CN(type));
@@ -585,7 +594,7 @@ imreadmulti_(const String& filename, int flags, std::vector<Mat>& mats)
             break;
 
         // optionally rotate the data if EXIF' orientation flag says so
-        if( (flags & IMREAD_IGNORE_ORIENTATION) == 0 && flags != IMREAD_UNCHANGED )
+        if ((flags & IMREAD_IGNORE_ORIENTATION) == 0 && flags != IMREAD_UNCHANGED)
         {
             ApplyExifOrientation(decoder->getExifTag(ORIENTATION), mat);
         }
@@ -595,6 +604,7 @@ imreadmulti_(const String& filename, int flags, std::vector<Mat>& mats)
         {
             break;
         }
+        ++current;
     }
 
     return !mats.empty();
@@ -636,9 +646,81 @@ bool imreadmulti(const String& filename, std::vector<Mat>& mats, int flags)
 {
     CV_TRACE_FUNCTION();
 
-    return imreadmulti_(filename, flags, mats);
+    return imreadmulti_(filename, flags, mats, 0, -1);
 }
 
+
+bool imreadmulti(const String& filename, std::vector<Mat>& mats, int start, int count, int flags)
+{
+    CV_TRACE_FUNCTION();
+
+    return imreadmulti_(filename, flags, mats, start, count);
+}
+
+static // Counts the images stored in 'filename'; yields 0 when no decoder is found or the header cannot be read.
+size_t imcount_(const String& filename, int flags)
+{
+    /// Search for the relevant decoder to handle the imagery
+    ImageDecoder decoder;
+
+#ifdef HAVE_GDAL
+    if (flags != IMREAD_UNCHANGED && (flags & IMREAD_LOAD_GDAL) == IMREAD_LOAD_GDAL) {
+        decoder = GdalDecoder().newDecoder();
+    }
+    else {
+#else
+        CV_UNUSED(flags); // flags is only consulted to select the GDAL decoder
+#endif
+        decoder = findDecoder(filename);
+#ifdef HAVE_GDAL
+    }
+#endif
+
+    /// if no decoder was found, report zero images.
+    if (!decoder) {
+        return 0;
+    }
+
+    /// set the filename in the driver
+    decoder->setSource(filename);
+
+    // read the header to make sure it succeeds
+    try
+    {
+        // a false result and an exception are treated alike: the function returns 0
+        if (!decoder->readHeader())
+            return 0;
+    }
+    catch (const cv::Exception& e)
+    {
+        std::cerr << "imcount_('" << filename << "'): can't read header: " << e.what() << std::endl << std::flush;
+        return 0;
+    }
+    catch (...)
+    {
+        std::cerr << "imcount_('" << filename << "'): can't read header: unknown exception" << std::endl << std::flush;
+        return 0;
+    }
+
+    size_t result = 1; // readHeader() succeeded, so at least one image is counted
+
+    // every successful nextPage() call accounts for one additional image
+    while (decoder->nextPage())
+    {
+        ++result;
+    }
+
+    return result;
+}
+
+size_t imcount(const String& filename, int flags)
+{
+    CV_TRACE_FUNCTION();
+
+    return imcount_(filename, flags);
+}
+
+
 static bool imwrite_( const String& filename, const std::vector<Mat>& img_vec,
                       const std::vector<int>& params, bool flipv )
 {
diff --git a/modules/imgcodecs/test/test_tiff.cpp b/modules/imgcodecs/test/test_tiff.cpp
index 2c6fb6249b..dec38014aa 100644
--- a/modules/imgcodecs/test/test_tiff.cpp
+++ b/modules/imgcodecs/test/test_tiff.cpp
@@ -358,6 +358,94 @@ TEST(Imgcodecs_Tiff, decode_black_and_write_image_pr17275_default)
     EXPECT_EQ(CV_8UC3, img.type()) << cv::typeToString(img.type());
 }
 
+TEST(Imgcodecs_Tiff, count_multipage)
+{
+    const string root = cvtest::TS::ptr()->get_data_path();
+    {
+        const string filename = root + "readwrite/multipage.tif";
+        ASSERT_EQ((size_t)6, imcount(filename));
+    }
+    {
+        const string filename = root + "readwrite/test32FC3_raw.tiff";
+        ASSERT_EQ((size_t)1, imcount(filename));
+    }
+}
+
+TEST(Imgcodecs_Tiff, read_multipage_indexed)
+{
+    const string root = cvtest::TS::ptr()->get_data_path();
+    const string filename = root + "readwrite/multipage.tif";
+    const string page_files[] = {
+        "readwrite/multipage_p1.tif",
+        "readwrite/multipage_p2.tif",
+        "readwrite/multipage_p3.tif",
+        "readwrite/multipage_p4.tif",
+        "readwrite/multipage_p5.tif",
+        "readwrite/multipage_p6.tif"
+    };
+    const int page_count = sizeof(page_files) / sizeof(page_files[0]);
+    vector<Mat> single_pages;
+    for (int i = 0; i < page_count; i++)
+    {
+        // imread and imreadmulti have different default values for the flag
+        const Mat page = imread(root + page_files[i], IMREAD_ANYCOLOR);
+        single_pages.push_back(page);
+    }
+    ASSERT_EQ((size_t)page_count, single_pages.size());
+
+    {
+        SCOPED_TRACE("Edge Cases");
+        vector<Mat> multi_pages;
+        bool res = imreadmulti(filename, multi_pages, 0, 0);
+        // Asking for 0 images leaves the vector empty, and imreadmulti reports an empty result as false.
+        ASSERT_TRUE(res == false);
+        ASSERT_EQ((size_t)0, multi_pages.size());
+        res = imreadmulti(filename, multi_pages, 0, 123123); // count far beyond the page total: all 6 pages are expected back
+        ASSERT_TRUE(res == true);
+        ASSERT_EQ((size_t)6, multi_pages.size());
+    }
+
+    {
+        SCOPED_TRACE("Read all with indices");
+        vector<Mat> multi_pages;
+        bool res = imreadmulti(filename, multi_pages, 0, 6);
+        ASSERT_TRUE(res == true);
+        ASSERT_EQ((size_t)page_count, multi_pages.size());
+        for (int i = 0; i < page_count; i++)
+        {
+            EXPECT_PRED_FORMAT2(cvtest::MatComparator(0, 0), multi_pages[i], single_pages[i]);
+        }
+    }
+
+    {
+        SCOPED_TRACE("Read one by one");
+        vector<Mat> multi_pages;
+        for (int i = 0; i < page_count; i++)
+        {
+            bool res = imreadmulti(filename, multi_pages, i, 1);
+            ASSERT_TRUE(res == true);
+            ASSERT_EQ((size_t)1, multi_pages.size());
+            EXPECT_PRED_FORMAT2(cvtest::MatComparator(0, 0), multi_pages[0], single_pages[i]);
+            multi_pages.clear();
+        }
+    }
+
+    {
+        SCOPED_TRACE("Read multiple at a time");
+        vector<Mat> multi_pages;
+        for (int i = 0; i < page_count/2; i++)
+        {
+            bool res = imreadmulti(filename, multi_pages, i*2, 2);
+            ASSERT_TRUE(res == true);
+            ASSERT_EQ((size_t)2, multi_pages.size());
+            EXPECT_PRED_FORMAT2(cvtest::MatComparator(0, 0), multi_pages[0], single_pages[i * 2]) << i;
+            EXPECT_PRED_FORMAT2(cvtest::MatComparator(0, 0), multi_pages[1], single_pages[i * 2 + 1]);
+            multi_pages.clear();
+        }
+    }
+}
+
+
 #endif
 
 }} // namespace
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index b82afce292..89c97085a0 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -1212,7 +1212,7 @@ protected:
     struct CV_EXPORTS Vertex
     {
         Vertex();
-        Vertex(Point2f pt, bool _isvirtual, int _firstEdge=0);
+        Vertex(Point2f pt, bool isvirtual, int firstEdge=0);
         bool isvirtual() const;
         bool isfree() const;
 
@@ -1443,7 +1443,7 @@ The unnormalized square box filter can be useful in computing local image statis
 variance and standard deviation around the neighborhood of a pixel.
 
 @param src input image
-@param dst output image of the same size and type as _src
+@param dst output image of the same size and type as src
 @param ddepth the output image depth (-1 to use src.depth())
 @param ksize kernel size
 @param anchor kernel anchor point. The default value of Point(-1, -1) denotes that the anchor is at the kernel
@@ -2032,8 +2032,8 @@ CV_EXPORTS_W void HoughLinesP( InputArray image, OutputArray lines,
 
 The function finds lines in a set of points using a modification of the Hough transform.
 @include snippets/imgproc_HoughLinesPointSet.cpp
-@param _point Input vector of points. Each vector must be encoded as a Point vector \f$(x,y)\f$. Type must be CV_32FC2 or CV_32SC2.
-@param _lines Output vector of found lines. Each vector is encoded as a vector<Vec3d> \f$(votes, rho, theta)\f$.
+@param point Input vector of points. Each vector must be encoded as a Point vector \f$(x,y)\f$. Type must be CV_32FC2 or CV_32SC2.
+@param lines Output vector of found lines. Each vector is encoded as a vector<Vec3d> \f$(votes, rho, theta)\f$.
 The larger the value of 'votes', the higher the reliability of the Hough line.
 @param lines_max Max count of hough lines.
 @param threshold Accumulator threshold parameter. Only those lines are returned that get enough
@@ -2045,7 +2045,7 @@ votes ( \f$>\texttt{threshold}\f$ )
 @param max_theta Maximum angle value of the accumulator in radians.
 @param theta_step Angle resolution of the accumulator in radians.
  */
-CV_EXPORTS_W void HoughLinesPointSet( InputArray _point, OutputArray _lines, int lines_max, int threshold,
+CV_EXPORTS_W void HoughLinesPointSet( InputArray point, OutputArray lines, int lines_max, int threshold,
                                       double min_rho, double max_rho, double rho_step,
                                       double min_theta, double max_theta, double theta_step );
 
@@ -2852,6 +2852,22 @@ An example is shown below:
  */
 CV_EXPORTS_W void createHanningWindow(OutputArray dst, Size winSize, int type);
 
+/** @brief Performs the per-element division of the first Fourier spectrum by the second Fourier spectrum.
+
+The function cv::divSpectrums performs the per-element division of the first array by the second array.
+The arrays are CCS-packed or complex matrices that are results of a real or complex Fourier transform.
+
+@param a first input array.
+@param b second input array of the same size and type as the first input array a.
+@param c output array of the same size and type as the first input array a.
+@param flags operation flags; currently, the only supported flag is cv::DFT_ROWS, which indicates that
+each row of a and b is an independent 1D Fourier spectrum. If you do not want to use this flag, then simply add a `0` as value.
+@param conjB optional flag that conjugates the second input array before the division (true)
+or not (false).
+*/
+CV_EXPORTS_W void divSpectrums(InputArray a, InputArray b, OutputArray c,
+                               int flags, bool conjB = false);
+
 //! @} imgproc_motion
 
 //! @addtogroup imgproc_misc
@@ -4034,9 +4050,9 @@ Examples of how intersectConvexConvex works
 
 /** @brief Finds intersection of two convex polygons
 
-@param _p1 First polygon
-@param _p2 Second polygon
-@param _p12 Output polygon describing the intersecting area
+@param p1 First polygon
+@param p2 Second polygon
+@param p12 Output polygon describing the intersecting area
 @param handleNested When true, an intersection is found if one of the polygons is fully enclosed in the other.
 When false, no intersection is found. If the polygons share a side or the vertex of one polygon lies on an edge
 of the other, they are not considered nested and an intersection will be found regardless of the value of handleNested.
@@ -4045,8 +4061,8 @@ of the other, they are not considered nested and an intersection will be found r
 
 @note intersectConvexConvex doesn't confirm that both polygons are convex and will return invalid results if they aren't.
  */
-CV_EXPORTS_W float intersectConvexConvex( InputArray _p1, InputArray _p2,
-                                          OutputArray _p12, bool handleNested = true );
+CV_EXPORTS_W float intersectConvexConvex( InputArray p1, InputArray p2,
+                                          OutputArray p12, bool handleNested = true );
 
 /** @example samples/cpp/fitellipse.cpp
 An example using the fitEllipse technique
diff --git a/modules/imgproc/src/color_lab.cpp b/modules/imgproc/src/color_lab.cpp
index 337d601f69..a181880862 100644
--- a/modules/imgproc/src/color_lab.cpp
+++ b/modules/imgproc/src/color_lab.cpp
@@ -1536,6 +1536,8 @@ static inline void trilinearPackedInterpolate(const v_uint16& inX, const v_uint1
 #endif // CV_SIMD
 
 
+
+
 struct RGB2Lab_b
 {
     typedef uchar channel_type;
@@ -1571,6 +1573,69 @@ struct RGB2Lab_b
         }
     }
 
+#if CV_NEON
+    template <int n> // n selects lanes 4*n .. 4*n+3 of the 8-bit inputs; one call converts those four pixels
+    inline void rgb2lab_batch(const ushort* tab,
+                              const v_uint8 vRi, const v_uint8 vGi, const v_uint8 vBi,
+                              v_int32& vL, v_int32& va, v_int32& vb) const
+    {
+        // Define some scalar constants which we will make use of later
+        const int Lscale = (116*255+50)/100;
+        const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
+        const int xyzDescaleShift = (1 << (lab_shift - 1));
+        const int labDescaleShift = (1 << (lab_shift2 - 1));
+        const int abShift = 128*(1 << lab_shift2);
+
+        const int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+                  C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+                  C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+
+        // scalar equivalent: int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]];
+        v_int32 vR(tab[v_extract_n<4*n+0>(vRi)], tab[v_extract_n<4*n+1>(vRi)],
+                   tab[v_extract_n<4*n+2>(vRi)], tab[v_extract_n<4*n+3>(vRi)]);
+        v_int32 vG(tab[v_extract_n<4*n+0>(vGi)], tab[v_extract_n<4*n+1>(vGi)],
+                   tab[v_extract_n<4*n+2>(vGi)], tab[v_extract_n<4*n+3>(vGi)]);
+        v_int32 vB(tab[v_extract_n<4*n+0>(vBi)], tab[v_extract_n<4*n+1>(vBi)],
+                   tab[v_extract_n<4*n+2>(vBi)], tab[v_extract_n<4*n+3>(vBi)]);
+
+        /* int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];*/
+        v_int32 vfX = v_fma(vR, v_setall_s32(C0), v_setall_s32(xyzDescaleShift));
+        vfX = v_fma(vG, v_setall_s32(C1), vfX);
+        vfX = v_fma(vB, v_setall_s32(C2), vfX);
+        vfX = v_shr<lab_shift>(vfX);
+        vfX = v_int32(LabCbrtTab_b[v_extract_n<0>(vfX)], LabCbrtTab_b[v_extract_n<1>(vfX)],
+                      LabCbrtTab_b[v_extract_n<2>(vfX)], LabCbrtTab_b[v_extract_n<3>(vfX)]);
+
+        /* int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)]; */
+        v_int32 vfY = v_fma(vR, v_setall_s32(C3), v_setall_s32(xyzDescaleShift));
+        vfY = v_fma(vG, v_setall_s32(C4), vfY);
+        vfY = v_fma(vB, v_setall_s32(C5), vfY);
+        vfY = v_shr<lab_shift>(vfY);
+        vfY = v_int32(LabCbrtTab_b[v_extract_n<0>(vfY)], LabCbrtTab_b[v_extract_n<1>(vfY)],
+                      LabCbrtTab_b[v_extract_n<2>(vfY)], LabCbrtTab_b[v_extract_n<3>(vfY)]);
+
+        /* int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];*/
+        v_int32 vfZ = v_fma(vR, v_setall_s32(C6), v_setall_s32(xyzDescaleShift));
+        vfZ = v_fma(vG, v_setall_s32(C7), vfZ);
+        vfZ = v_fma(vB, v_setall_s32(C8), vfZ);
+        vfZ = v_shr<lab_shift>(vfZ);
+        vfZ = v_int32(LabCbrtTab_b[v_extract_n<0>(vfZ)], LabCbrtTab_b[v_extract_n<1>(vfZ)],
+                      LabCbrtTab_b[v_extract_n<2>(vfZ)], LabCbrtTab_b[v_extract_n<3>(vfZ)]);
+
+        /* int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );*/
+        vL = v_fma(vfY, v_setall_s32(Lscale), v_setall_s32(Lshift+labDescaleShift));
+        vL = v_shr<lab_shift2>(vL);
+
+        /* int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );*/
+        va = v_fma(vfX - vfY, v_setall_s32(500), v_setall_s32(abShift+labDescaleShift));
+        va = v_shr<lab_shift2>(va);
+
+        /* int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );*/
+        vb = v_fma(vfY - vfZ, v_setall_s32(200), v_setall_s32(abShift+labDescaleShift));
+        vb = v_shr<lab_shift2>(vb);
+    } // vL, va, vb now hold the 32-bit L, a, b values of the four selected pixels
+#endif // CV_NEON
+
     void operator()(const uchar* src, uchar* dst, int n) const
     {
         CV_INSTRUMENT_REGION();
@@ -1585,6 +1650,45 @@ struct RGB2Lab_b
 
         i = 0;
 
+#if CV_NEON
+        // On each loop, we load nlanes of RGB/A v_uint8s and store nlanes of
+        // Lab v_uint8s
+        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes,
+                src += scn*v_uint8::nlanes, dst += 3*v_uint8::nlanes )
+        {
+            // Load 4 batches of 4 src
+            v_uint8 vRi, vGi, vBi;
+            if(scn == 4)
+            {
+                v_uint8 vAi;
+                v_load_deinterleave(src, vRi, vGi, vBi, vAi);
+            }
+            else // scn == 3
+            {
+                v_load_deinterleave(src, vRi, vGi, vBi);
+            }
+
+            // Do 4 batches of 4 RGB2Labs
+            v_int32 vL0, va0, vb0;
+            rgb2lab_batch<0>(tab, vRi, vGi, vBi, vL0, va0, vb0);
+            v_int32 vL1, va1, vb1;
+            rgb2lab_batch<1>(tab, vRi, vGi, vBi, vL1, va1, vb1);
+            v_int32 vL2, va2, vb2;
+            rgb2lab_batch<2>(tab, vRi, vGi, vBi, vL2, va2, vb2);
+            v_int32 vL3, va3, vb3;
+            rgb2lab_batch<3>(tab, vRi, vGi, vBi, vL3, va3, vb3);
+
+            // Saturate, combine and store all batches
+            // dst[0] = saturate_cast<uchar>(L);
+            // dst[1] = saturate_cast<uchar>(a);
+            // dst[2] = saturate_cast<uchar>(b);
+            v_store_interleave(dst,
+                v_pack(v_pack_u(vL0, vL1), v_pack_u(vL2, vL3)),
+                v_pack(v_pack_u(va0, va1), v_pack_u(va2, va3)),
+                v_pack(v_pack_u(vb0, vb1), v_pack_u(vb2, vb3)));
+        }
+#endif // CV_NEON
+
 #if CV_SIMD
         const int vsize = v_uint8::nlanes;
         const int xyzDescaleShift = 1 << (lab_shift - 1);
diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp
index e06caf5764..677c10d8bb 100644
--- a/modules/imgproc/src/contours.cpp
+++ b/modules/imgproc/src/contours.cpp
@@ -625,7 +625,8 @@ icvFetchContour( schar                  *ptr,
 
 /*
    trace contour until certain point is met.
-   returns 1 if met, 0 else.
+   returns 1 if met and this is the last contour
+   encountered by a raster scan reaching the point, 0 else.
 */
 static int
 icvTraceContour( schar *ptr, int step, schar *stop_ptr, int is_hole )
@@ -668,14 +669,39 @@ icvTraceContour( schar *ptr, int step, schar *stop_ptr, int is_hole )
                     break;
             }
 
-            if( i3 == stop_ptr || (i4 == i0 && i3 == i1) )
+            if (i3 == stop_ptr) {
+                if (!(*i3 & 0x80)) {
+                    /* it's the only contour */
+                    return 1;
+                }
+
+                /* check if this is the last contour */
+                /* encountered during a raster scan  */
+                schar *i5;
+                int t = s;
+                while (true)
+                {
+                    t = (t - 1) & 7;
+                    i5 = i3 + deltas[t];
+                    if (*i5 != 0)
+                        break;
+                    if (t == 0)
+                        return 1;
+                }
+            }
+
+            if( (i4 == i0 && i3 == i1) )
                 break;
 
             i3 = i4;
             s = (s + 4) & 7;
         }                       /* end of border following loop */
     }
-    return i3 == stop_ptr;
+    else {
+        return i3 == stop_ptr;
+    }
+
+    return 0;
 }
 
 
diff --git a/modules/imgproc/src/phasecorr.cpp b/modules/imgproc/src/phasecorr.cpp
index 4808f971ef..9db436673c 100644
--- a/modules/imgproc/src/phasecorr.cpp
+++ b/modules/imgproc/src/phasecorr.cpp
@@ -82,9 +82,9 @@ static void magSpectrums( InputArray _src, OutputArray _dst)
             {
                 if( k == 1 )
                     dataSrc += cols - 1, dataDst += cols - 1;
-                dataDst[0] = dataSrc[0]*dataSrc[0];
+                dataDst[0] = (float)std::abs(dataSrc[0]);
                 if( rows % 2 == 0 )
-                    dataDst[(rows-1)*stepDst] = dataSrc[(rows-1)*stepSrc]*dataSrc[(rows-1)*stepSrc];
+                    dataDst[(rows-1)*stepDst] = (float)std::abs(dataSrc[(rows-1)*stepSrc]);
 
                 for( j = 1; j <= rows - 2; j += 2 )
                 {
@@ -101,9 +101,9 @@ static void magSpectrums( InputArray _src, OutputArray _dst)
         {
             if( is_1d && cn == 1 )
             {
-                dataDst[0] = dataSrc[0]*dataSrc[0];
+                dataDst[0] = (float)std::abs(dataSrc[0]);
                 if( cols % 2 == 0 )
-                    dataDst[j1] = dataSrc[j1]*dataSrc[j1];
+                    dataDst[j1] = (float)std::abs(dataSrc[j1]);
             }
 
             for( j = j0; j < j1; j += 2 )
@@ -126,9 +126,9 @@ static void magSpectrums( InputArray _src, OutputArray _dst)
             {
                 if( k == 1 )
                     dataSrc += cols - 1, dataDst += cols - 1;
-                dataDst[0] = dataSrc[0]*dataSrc[0];
+                dataDst[0] = std::abs(dataSrc[0]);
                 if( rows % 2 == 0 )
-                    dataDst[(rows-1)*stepDst] = dataSrc[(rows-1)*stepSrc]*dataSrc[(rows-1)*stepSrc];
+                    dataDst[(rows-1)*stepDst] = std::abs(dataSrc[(rows-1)*stepSrc]);
 
                 for( j = 1; j <= rows - 2; j += 2 )
                 {
@@ -145,9 +145,9 @@ static void magSpectrums( InputArray _src, OutputArray _dst)
         {
             if( is_1d && cn == 1 )
             {
-                dataDst[0] = dataSrc[0]*dataSrc[0];
+                dataDst[0] = std::abs(dataSrc[0]);
                 if( cols % 2 == 0 )
-                    dataDst[j1] = dataSrc[j1]*dataSrc[j1];
+                    dataDst[j1] = std::abs(dataSrc[j1]);
             }
 
             for( j = j0; j < j1; j += 2 )
@@ -158,7 +158,7 @@ static void magSpectrums( InputArray _src, OutputArray _dst)
     }
 }
 
-static void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst, int flags, bool conjB)
+void divSpectrums( InputArray _srcA, InputArray _srcB, OutputArray _dst, int flags, bool conjB)
 {
     Mat srcA = _srcA.getMat(), srcB = _srcB.getMat();
     int depth = srcA.depth(), cn = srcA.channels(), type = srcA.type();
diff --git a/modules/imgproc/src/smooth.simd.hpp b/modules/imgproc/src/smooth.simd.hpp
index 6c41b45e9f..3a39765c71 100644
--- a/modules/imgproc/src/smooth.simd.hpp
+++ b/modules/imgproc/src/smooth.simd.hpp
@@ -1236,8 +1236,12 @@ void hlineSmoothONa_yzy_a<uint16_t, ufixedpoint32>(const uint16_t* src, int cn,
         v_mul_expand(vx_load(src + pre_shift * cn), vx_setall_u16((uint16_t) *((uint32_t*)(m + pre_shift))), v_res0, v_res1);
         for (int j = 0; j < pre_shift; j ++)
         {
+            v_uint16 v_weight = vx_setall_u16((uint16_t) *((uint32_t*)(m + j)));
             v_uint32 v_add0, v_add1;
-            v_mul_expand(vx_load(src + j * cn) + vx_load(src + (n - 1 - j)*cn), vx_setall_u16((uint16_t) *((uint32_t*)(m + j))), v_add0, v_add1);
+            v_mul_expand(vx_load(src + j * cn), v_weight, v_add0, v_add1);
+            v_res0 += v_add0;
+            v_res1 += v_add1;
+            v_mul_expand(vx_load(src + (n - 1 - j)*cn), v_weight, v_add0, v_add1);
             v_res0 += v_add0;
             v_res1 += v_add1;
         }
diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp
index 7e34832fe5..e869f8d4a4 100644
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -1101,6 +1101,7 @@ static bool ipp_sqrDistance(const Mat& src, const Mat& tpl, Mat& dst)
     buffer.allocate( bufSize );
 
     status = CV_INSTRUMENT_FUN_IPP(ippiSqrDistanceNorm, src.ptr(), (int)src.step, srcRoiSize, tpl.ptr(), (int)tpl.step, tplRoiSize, dst.ptr<Ipp32f>(), (int)dst.step, funCfg, buffer);
+    dst = cv::max(dst, 0); // handle edge case from rounding in variance computation which can result in negative values
     return status >= 0;
 }
 
diff --git a/modules/imgproc/test/test_contours.cpp b/modules/imgproc/test/test_contours.cpp
index 5043e8ff64..88e6cfd761 100644
--- a/modules/imgproc/test/test_contours.cpp
+++ b/modules/imgproc/test/test_contours.cpp
@@ -485,6 +485,55 @@ TEST(Imgproc_FindContours, border)
     ASSERT_EQ(0, cvtest::norm(img, img_draw_contours, NORM_INF));
 }
 
+TEST(Imgproc_FindContours, regression_4363_shared_nbd)
+{
+    // Create specific test image
+    Mat1b img(12, 69, (const uchar&)0);
+
+    img(1, 1) = 1;
+
+    // Vertical rectangle with hole sharing the same NBD
+    for (int r = 1; r <= 10; ++r) {
+        for (int c = 3; c <= 5; ++c) {
+            img(r, c) = 1;
+        }
+    }
+    img(9, 4) = 0;
+
+    // 124 small CCs
+    for (int r = 1; r <= 7; r += 2) {
+        for (int c = 7; c <= 67; c += 2) {
+            img(r, c) = 1;
+        }
+    }
+
+    // Last CC
+    img(9, 7) = 1;
+
+    vector< vector<Point> > contours;
+    vector<Vec4i> hierarchy;
+    findContours(img, contours, hierarchy, RETR_TREE, CHAIN_APPROX_NONE);
+
+    bool found = false;
+    size_t index = 0;
+    for (vector< vector<Point> >::const_iterator i = contours.begin(); i != contours.end(); ++i)
+    {
+        const vector<Point>& c = *i;
+        if (!c.empty() && c[0] == Point(7, 9))
+        {
+            found = true;
+            index = (size_t)(i - contours.begin());
+            break;
+        }
+    }
+    EXPECT_TRUE(found) << "Desired result: point (7,9) is a contour - Actual result: point (7,9) is not a contour";
+
+    if (found)
+    {
+        EXPECT_LT(hierarchy[index][3], 0) << "Desired result: (7,9) has no parent - Actual result: parent of (7,9) is another contour. index = " << index;
+    }
+}
+
 TEST(Imgproc_PointPolygonTest, regression_10222)
 {
     vector<Point> contour;
diff --git a/modules/imgproc/test/test_houghcircles.cpp b/modules/imgproc/test/test_houghcircles.cpp
index de92800976..094963211c 100644
--- a/modules/imgproc/test/test_houghcircles.cpp
+++ b/modules/imgproc/test/test_houghcircles.cpp
@@ -56,7 +56,7 @@ using namespace std;
 
 static string getTestCaseName(const string& picture_name, double minDist, double edgeThreshold, double accumThreshold, int minRadius, int maxRadius)
 {
-    string results_name = format("circles_%s_%.0f_%.0f_%.0f_%d_%d",
+    string results_name = cv::format("circles_%s_%.0f_%.0f_%.0f_%d_%d",
         picture_name.c_str(), minDist, edgeThreshold, accumThreshold, minRadius, maxRadius);
     string temp(results_name);
     size_t pos = temp.find_first_of("\\/.");
diff --git a/modules/imgproc/test/test_pc.cpp b/modules/imgproc/test/test_pc.cpp
index 22c4bb5d76..edfe0701e5 100644
--- a/modules/imgproc/test/test_pc.cpp
+++ b/modules/imgproc/test/test_pc.cpp
@@ -120,4 +120,278 @@ TEST(Imgproc_PhaseCorrelatorTest, accuracy_1d_odd_fft) {
     ASSERT_NEAR(phaseShift.x, (double)xShift, 1.);
 }
 
+////////////////////// DivSpectrums ////////////////////////
+class CV_DivSpectrumsTest : public cvtest::ArrayTest  // array-based test fixture for cv::divSpectrums
+{
+public:
+    CV_DivSpectrumsTest();  // registers the input, output, reference-output and temporary slots
+protected:
+    void run_func();  // calls cv::divSpectrums on the two inputs
+    void get_test_array_types_and_sizes( int, vector<vector<Size> >& sizes, vector<vector<int> >& types );  // draws random flags and the element type
+    void prepare_to_validation( int test_case_idx );  // computes the reference quotient with div_complex
+    int flags;  // subset of CV_DXT_MUL_CONJ | CV_DXT_ROWS drawn per test case and passed to cv::divSpectrums
+};
+
+
+CV_DivSpectrumsTest::CV_DivSpectrumsTest() : flags(0)
+{
+    // Register one slot per test matrix: two inputs, one output, one reference and three temporaries.
+    test_array[INPUT].push_back(NULL);  // first input DFT as a CCS-packed array or complex matrix.
+    test_array[INPUT].push_back(NULL);  // second input DFT as a CCS-packed array or complex matrix.
+    test_array[OUTPUT].push_back(NULL);  // output DFT as a complex matrix.
+    test_array[REF_OUTPUT].push_back(NULL);  // reference output DFT as a complex matrix.
+    test_array[TEMP].push_back(NULL);  // first input DFT converted to a complex matrix.
+    test_array[TEMP].push_back(NULL);  // second input DFT converted to a complex matrix.
+    test_array[TEMP].push_back(NULL);  // output DFT as a CCS-packed array.
+}
+
+/// Chooses the random flags and element type for one test case.  Every array gets the same
+/// type; when the inputs are single-channel (CCS-packed), the output, reference output and the
+/// two converted-input temporaries are promoted to the matching two-channel complex type.
+void CV_DivSpectrumsTest::get_test_array_types_and_sizes( int test_case_idx, vector<vector<Size> >& sizes, vector<vector<int> >& types )
+{
+    cvtest::ArrayTest::get_test_array_types_and_sizes(test_case_idx, sizes, types);
+    RNG& rng = ts->get_rng();
+
+    // Get the flag of the input.
+    const int rand_int_flags = cvtest::randInt(rng);
+    flags = rand_int_flags & (CV_DXT_MUL_CONJ | CV_DXT_ROWS);
+
+    // Get input type.  Select on the two low bits (well defined for negative values too) so
+    // that each of the four supported types can actually be chosen.
+    const int rand_int_type = cvtest::randInt(rng);
+    int type;
+
+    switch( rand_int_type & 3 )
+    {
+    case 0:
+        type = CV_32FC1;
+        break;
+    case 1:
+        type = CV_32FC2;
+        break;
+    case 2:
+        type = CV_64FC1;
+        break;
+    default:
+        type = CV_64FC2;
+        break;
+    }
+
+    for( size_t i = 0; i < types.size(); i++ )
+        for( size_t j = 0; j < types[i].size(); j++ )
+            types[i][j] = type;
+
+    // Inputs are CCS-packed arrays.  Prepare outputs and temporary inputs as complex matrices.
+    if( type == CV_32FC1 || type == CV_64FC1 )
+    {
+        // Adding 8 moves a type from one channel to two (CV_32FC1 -> CV_32FC2, CV_64FC1 -> CV_64FC2).
+        types[OUTPUT][0] += 8;
+        types[REF_OUTPUT][0] += 8;
+        types[TEMP][0] += 8;
+        types[TEMP][1] += 8;
+    }
+}
+
+/// Helper function to convert a ccs array of depth_t into a complex matrix.  dst is addressed as
+/// one vector of dst.cols + dst.rows - 1 elements; src1 supplies the mirrored upper half.
+template<typename depth_t>
+static void convert_from_ccs_helper( const Mat& src0, const Mat& src1, Mat& dst )
+{
+    const int cn = src0.channels();
+    int srcstep = cn;
+    int dststep = 1;
+    // For a non-continuous dst, consecutive outputs are dst.step bytes apart (in elements below).
+    if( !dst.isContinuous() )
+        dststep = (int)(dst.step/dst.elemSize());
+    // Likewise for src0, but measured in scalars of depth_t rather than in whole elements.
+    if( !src0.isContinuous() )
+        srcstep = (int)(src0.step/src0.elemSize1());
+
+    Complex<depth_t> *dst_data = dst.ptr<Complex<depth_t> >();
+    const depth_t* src0_data = src0.ptr<depth_t>();
+    const depth_t* src1_data = src1.ptr<depth_t>();
+    dst_data->re = src0_data[0];  // element 0 is written with a zero imaginary part
+    dst_data->im = 0;
+    const int n = dst.cols + dst.rows - 1;
+    const int n2 = (n+1) >> 1;
+
+    if( (n & 1) == 0 )  // even length: the middle element n2 also gets a zero imaginary part
+    {
+        dst_data[n2*dststep].re = src0_data[(cn == 1 ? n-1 : n2)*srcstep];
+        dst_data[n2*dststep].im = 0;
+    }
+    // delta0 and delta1 are the source offsets of the real and imaginary scalars of element 1.
+    int delta0 = srcstep;
+    int delta1 = delta0 + (cn == 1 ? srcstep : 1);
+
+    if( cn == 1 )
+        srcstep *= 2;  // for cn == 1 each output element consumes two source scalars
+
+    for( int i = 1; i < n2; i++, delta0 += srcstep, delta1 += srcstep )
+    {
+        depth_t t0 = src0_data[delta0];
+        depth_t t1 = src0_data[delta1];
+        dst_data[i*dststep].re = t0;
+        dst_data[i*dststep].im = t1;
+
+        t0 = src1_data[delta0];
+        t1 = -src1_data[delta1];  // the mirrored element n-i gets the negated imaginary part
+
+        dst_data[(n-i)*dststep].re = t0;
+        dst_data[(n-i)*dststep].im = t1;
+    }
+}
+
+/// Helper function to convert a ccs array into a complex matrix, written into the already sized dst.
+static void convert_from_ccs( const Mat& src0, const Mat& src1, Mat& dst, const int flags )
+{
+    if( dst.rows > 1 && (dst.cols > 1 || (flags & DFT_ROWS)) )  // several rows: recurse per row
+    {
+        const int count = dst.rows;
+        const int len = dst.cols;
+        const bool is2d = (flags & DFT_ROWS) == 0;
+        for( int i = 0; i < count; i++ )
+        {
+            const int j = !is2d || i == 0 ? i : count - i;  // 2-D: row i of src0 pairs with row count - i of src1
+            const Mat& src0row = src0.row(i);
+            const Mat& src1row = src1.row(j);
+            Mat dstrow = dst.row(i);
+            convert_from_ccs( src0row, src1row, dstrow, 0 );  // a single row takes the helper branch
+        }
+        // 2-D only: rewrite column 0 (and column len/2 for even len) from the source columns.
+        if( is2d )
+        {
+            const Mat& src0row = src0.col(0);
+            Mat dstrow = dst.col(0);
+            convert_from_ccs( src0row, src0row, dstrow, 0 );
+
+            if( (len & 1) == 0 )
+            {
+                const Mat& src0row_even = src0.col(src0.cols - 1);  // last source column
+                Mat dstrow_even = dst.col(len/2);
+                convert_from_ccs( src0row_even, src0row_even, dstrow_even, 0 );
+            }
+        }
+    }
+    else
+    {
+        if( dst.depth() == CV_32F )  // single vector: dispatch on the scalar depth of dst
+        {
+            convert_from_ccs_helper<float>( src0, src1, dst );
+        }
+        else
+        {
+            convert_from_ccs_helper<double>( src0, src1, dst );
+        }
+    }
+}
+
+/// Helper function to compute complex number (nu_re + nu_im * i) / (de_re + de_im * i).
+/// When conj_de is true the denominator is conjugated first, i.e. de_im is negated.  DBL_EPSILON
+/// is added to the squared magnitude of the denominator, so a zero denominator never divides by 0.
+static std::pair<double, double> divide_complex_numbers( const double nu_re, const double nu_im,
+                                                         const double de_re, const double de_im,
+                                                         const bool conj_de )
+{
+    // Imaginary part of the effective (possibly conjugated) denominator.
+    const double im = conj_de ? -de_im : de_im;
+    // Multiply numerator and denominator by the conjugate of the effective denominator.
+    const double result_de = de_re * de_re + im * im + DBL_EPSILON;
+    const double result_re = nu_re * de_re + nu_im * im;
+    const double result_im = nu_re * (-im) + nu_im * de_re;
+    return std::pair<double, double>(result_re / result_de, result_im / result_de);
+}
+
+/// Element-wise reference division of the spectrum in src1 by the spectrum in src2, both stored as
+/// interleaved (re, im) scalars of type depth_t.  dst is created with src1's size and type; the
+/// divisor is conjugated when CV_DXT_MUL_CONJ is set in flags.
+template <typename depth_t>
+static void div_complex_helper( const Mat& src1, const Mat& src2, Mat& dst, int flags )
+{
+    CV_Assert( src1.size == src2.size && src1.type() == src2.type() );
+    dst.create( src1.rows, src1.cols, src1.type() );
+    const bool conjugate_divisor = (flags & CV_DXT_MUL_CONJ) != 0;
+    const int scalars_per_row = src1.cols * src1.channels();
+
+    for( int row = 0; row < dst.rows; ++row )
+    {
+        const depth_t* const numerator = src1.ptr<depth_t>(row);
+        const depth_t* const divisor = src2.ptr<depth_t>(row);
+        depth_t* const quotient = dst.ptr<depth_t>(row);
+        // Walk the row one (re, im) pair at a time.
+        for( int re = 0, im = 1; re < scalars_per_row; re += 2, im += 2 )
+        {
+            const std::pair<double, double> q = divide_complex_numbers(
+                    numerator[re], numerator[im], divisor[re], divisor[im], conjugate_divisor );
+            quotient[re] = (depth_t)q.first;
+            quotient[im] = (depth_t)q.second;
+        }
+    }
+}
+
+/// Reference spectrum division: writes src1 / src2 into dst.  Both inputs must be complex
+/// matrices of type CV_32FC2 or CV_64FC2; flags is forwarded to the depth-specific helper.
+static void div_complex( const Mat& src1, const Mat& src2, Mat& dst, const int flags )
+{
+    const int type = src1.type();
+    CV_Assert( type == CV_32FC2 || type == CV_64FC2 );
+
+    // Dispatch on the scalar depth of the inputs.
+    if ( type == CV_32FC2 )
+    {
+        div_complex_helper<float>( src1, src2, dst, flags );
+        return;
+    }
+    div_complex_helper<double>( src1, src2, dst, flags );
+}
+
+void CV_DivSpectrumsTest::prepare_to_validation( int /* test_case_idx */ )
+{
+    // Builds the reference quotient in REF_OUTPUT[0] as a complex matrix.
+    Mat& numerator = test_mat[INPUT][0];
+    Mat& divisor = test_mat[INPUT][1];
+    Mat& expected = test_mat[REF_OUTPUT][0];
+
+    if( numerator.channels() != 1 )
+    {
+        // The inputs are already complex matrices and can be divided as they are.
+        div_complex( numerator, divisor, expected, flags );
+        return;
+    }
+
+    // The inputs are CCS-packed arrays: expand both into the complex temporaries first, then
+    // divide the expanded spectra.
+    Mat& numerator_complex = test_mat[TEMP][0];
+    Mat& divisor_complex = test_mat[TEMP][1];
+    convert_from_ccs( numerator, numerator, numerator_complex, flags );
+    convert_from_ccs( divisor, divisor, divisor_complex, flags );
+    div_complex( numerator_complex, divisor_complex, expected, flags );
+}
+
+void CV_DivSpectrumsTest::run_func()
+{
+    // Runs cv::divSpectrums on the two inputs and leaves the result in OUTPUT[0] as a complex
+    // matrix.
+
+    const Mat& numerator = test_mat[INPUT][0];
+    const Mat& divisor = test_mat[INPUT][1];
+    const bool conjugate_divisor = (flags & CV_DXT_MUL_CONJ) != 0;
+    const bool is_ccs_packed = numerator.channels() == 1;
+
+    // CCS-packed inputs produce a CCS-packed result, which is written to a temporary; complex
+    // inputs produce a complex result, which goes straight to the output slot.
+    Mat& quotient = is_ccs_packed ? test_mat[TEMP][2] : test_mat[OUTPUT][0];
+    cv::divSpectrums( numerator, divisor, quotient, flags, conjugate_divisor );
+
+    if( !is_ccs_packed )
+        return;
+
+    // Expand the packed result into the complex output matrix.
+    Mat& quotient_complex = test_mat[OUTPUT][0];
+    convert_from_ccs( quotient, quotient, quotient_complex, flags );
+}
+
+TEST(Imgproc_DivSpectrums, accuracy) { CV_DivSpectrumsTest test; test.safe_run(); }  // runs the CV_DivSpectrumsTest fixture through safe_run()
+
 }} // namespace
diff --git a/modules/imgproc/test/test_smooth_bitexact.cpp b/modules/imgproc/test/test_smooth_bitexact.cpp
index f446deb8d8..246f1df798 100644
--- a/modules/imgproc/test/test_smooth_bitexact.cpp
+++ b/modules/imgproc/test/test_smooth_bitexact.cpp
@@ -220,6 +220,15 @@ TEST(GaussianBlur_Bitexact, regression_15015)
     ASSERT_EQ(0.0, cvtest::norm(dst, src, NORM_INF));
 }
 
+TEST(GaussianBlur_Bitexact, overflow_20121)  // a constant 65535 CV_16U image must keep its minimum at 65535
+{
+    const Mat saturated(100, 100, CV_16UC1, Scalar(65535));
+    Mat blurred;
+    GaussianBlur(saturated, blurred, cv::Size(9, 9), 0.0);
+    double lowest = 0.0;
+    minMaxLoc(blurred, &lowest);
+    ASSERT_EQ(cvRound(lowest), 65535);
+}
 
 static void checkGaussianBlur_8Uvs32F(const Mat& src8u, const Mat& src32f, int N, double sigma)
 {
diff --git a/modules/imgproc/test/test_templmatch.cpp b/modules/imgproc/test/test_templmatch.cpp
index d38aa24a37..d5c16058a7 100644
--- a/modules/imgproc/test/test_templmatch.cpp
+++ b/modules/imgproc/test/test_templmatch.cpp
@@ -334,4 +334,95 @@ void CV_TemplMatchTest::prepare_to_validation( int /*test_case_idx*/ )
 
 TEST(Imgproc_MatchTemplate, accuracy) { CV_TemplMatchTest test; test.safe_run(); }
 
-}} // namespace
+}
+
+TEST(Imgproc_MatchTemplate, bug_9597) {
+        const uint8_t img[] = {
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 246, 246, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 246, 246, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 246, 246, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 246, 246, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 246,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 246, 246, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 246, 246, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 246, 246, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 246, 246, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 247, 247, 247, 247, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 247, 247, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245};
+        const uint8_t tmpl[] = {
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245,
+                245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245 };
+        cv::Mat cvimg(cv::Size(61, 82), CV_8UC1, (void*)img, cv::Mat::AUTO_STEP);
+        cv::Mat cvtmpl(cv::Size(17, 17), CV_8UC1, (void*)tmpl, cv::Mat::AUTO_STEP);
+        cv::Mat result;
+        cv::matchTemplate(cvimg, cvtmpl, result, CV_TM_SQDIFF);
+        double minValue;
+        cv::minMaxLoc(result, &minValue, NULL, NULL, NULL);
+        ASSERT_GE(minValue, 0);
+}
+} // namespace
diff --git a/modules/java/test/pure_test/build.xml b/modules/java/test/pure_test/build.xml
index 15419f5d67..e596c82e9d 100644
--- a/modules/java/test/pure_test/build.xml
+++ b/modules/java/test/pure_test/build.xml
@@ -6,6 +6,7 @@
 
   <property name="opencv.test.package" value="*"/>
   <property name="opencv.test.class" value="*"/>
+  <property name="opencv.test.exclude" value=""/>
 
   <path id="master-classpath">
     <fileset dir="lib">
@@ -53,7 +54,7 @@
       <formatter type="xml"/>
 
       <batchtest fork="yes" todir="${test.dir}">
-        <zipfileset src="build/jar/opencv-test.jar" includes="**/${opencv.test.package}/${opencv.test.class}.class" excludes="**/OpenCVTest*">
+        <zipfileset src="build/jar/opencv-test.jar" includes="**/${opencv.test.package}/${opencv.test.class}.class" excludes="**/OpenCVTest*, ${opencv.test.exclude}">
           <exclude name="**/*$*.class"/>
         </zipfileset>
       </batchtest>
diff --git a/modules/ml/src/boost.cpp b/modules/ml/src/boost.cpp
index 4b94410eeb..be9c9a7b46 100644
--- a/modules/ml/src/boost.cpp
+++ b/modules/ml/src/boost.cpp
@@ -490,6 +490,7 @@ public:
 
     float predict( InputArray samples, OutputArray results, int flags ) const CV_OVERRIDE
     {
+        CV_CheckEQ(samples.cols(), getVarCount(), "");  // checked before delegating: samples.cols() must equal getVarCount()
         return impl.predict(samples, results, flags);
     }
 
diff --git a/modules/ml/src/rtrees.cpp b/modules/ml/src/rtrees.cpp
index 216b42bab6..2cad961f99 100644
--- a/modules/ml/src/rtrees.cpp
+++ b/modules/ml/src/rtrees.cpp
@@ -480,6 +480,7 @@ public:
     float predict( InputArray samples, OutputArray results, int flags ) const CV_OVERRIDE
     {
         CV_TRACE_FUNCTION();
+        CV_CheckEQ(samples.cols(), getVarCount(), "");  // checked before delegating: samples.cols() must equal getVarCount()
         return impl.predict(samples, results, flags);
     }
 
diff --git a/modules/ml/src/tree.cpp b/modules/ml/src/tree.cpp
index 87181b156c..5dae889013 100644
--- a/modules/ml/src/tree.cpp
+++ b/modules/ml/src/tree.cpp
@@ -43,6 +43,8 @@
 #include "precomp.hpp"
 #include <ctype.h>
 
+#include <opencv2/core/utils/logger.hpp>
+
 namespace cv {
 namespace ml {
 
@@ -1694,11 +1696,14 @@ void DTreesImpl::write( FileStorage& fs ) const
 void DTreesImpl::readParams( const FileNode& fn )
 {
     _isClassifier = (int)fn["is_classifier"] != 0;
-    /*int var_all = (int)fn["var_all"];
-    int var_count = (int)fn["var_count"];
-    int cat_var_count = (int)fn["cat_var_count"];
+    int varAll = (int)fn["var_all"];
+    int varCount = (int)fn["var_count"];
+    /*int cat_var_count = (int)fn["cat_var_count"];
     int ord_var_count = (int)fn["ord_var_count"];*/
 
+    if (varAll <= 0)
+        CV_Error(Error::StsParseError, "The field \"var_all\" of DTree classifier is missing or non-positive");
+
     FileNode tparams_node = fn["training_params"];
 
     TreeParams params0 = TreeParams();
@@ -1723,11 +1728,38 @@ void DTreesImpl::readParams( const FileNode& fn )
     readVectorOrMat(fn["var_idx"], varIdx);
     fn["var_type"] >> varType;
 
-    int format = 0;
-    fn["format"] >> format;
-    bool isLegacy = format < 3;
+    bool isLegacy = false;
+    if (fn["format"].empty())  // Export bug until OpenCV 3.2: https://github.com/opencv/opencv/pull/6314
+    {
+        if (!fn["cat_ofs"].empty())
+            isLegacy = false;  // 2.4 doesn't store "cat_ofs"
+        else if (!fn["missing_subst"].empty())
+            isLegacy = false;  // 2.4 doesn't store "missing_subst"
+        else if (!fn["class_labels"].empty())
+            isLegacy = false;  // 2.4 doesn't store "class_labels"
+        else if ((int)varType.size() != varAll)
+            isLegacy = true;  // 3.0+: https://github.com/opencv/opencv/blame/3.0.0/modules/ml/src/tree.cpp#L1576
+        else if (/*(int)varType.size() == varAll &&*/ varCount == varAll)
+            isLegacy = true;
+        else
+        {
+            // 3.0+:
+            // - https://github.com/opencv/opencv/blame/3.0.0/modules/ml/src/tree.cpp#L1552-L1553
+            // - https://github.com/opencv/opencv/blame/3.0.0/modules/ml/src/precomp.hpp#L296
+            isLegacy = !(varCount + 1 == varAll);
+        }
+        CV_LOG_INFO(NULL, "ML/DTrees: possible missing 'format' field due to bug of OpenCV export implementation. "
+                "Details: https://github.com/opencv/opencv/issues/5412. Consider re-exporting of saved ML model. "
+                "isLegacy = " << isLegacy);
+    }
+    else
+    {
+        int format = 0;
+        fn["format"] >> format;
+        CV_CheckGT(format, 0, "");
+        isLegacy = format < 3;
+    }
 
-    int varAll = (int)fn["var_all"];
     if (isLegacy && (int)varType.size() <= varAll)
     {
         std::vector<uchar> extendedTypes(varAll + 1, 0);
diff --git a/modules/ml/test/test_rtrees.cpp b/modules/ml/test/test_rtrees.cpp
index 1ec9b8d042..5a4fb34e74 100644
--- a/modules/ml/test/test_rtrees.cpp
+++ b/modules/ml/test/test_rtrees.cpp
@@ -95,6 +95,25 @@ TEST(ML_RTrees, 11142_sample_weights_classification)
     EXPECT_GE(error_with_weights, error_without_weights);
 }
 
+TEST(ML_RTrees, bug_12974_throw_exception_when_predict_different_feature_count)
+{
+    const int numFeatures = 5;  // number of columns the model is trained with
+    // create a 5 feature dataset and train the model
+    cv::Ptr<RTrees> model = RTrees::create();
+    Mat samples(10, numFeatures, CV_32F);
+    randu(samples, 0, 10);
+    Mat labels = (Mat_<int>(10,1) << 0,0,0,0,0,1,1,1,1,1);
+    cv::Ptr<TrainData> trainData = TrainData::create(samples, cv::ml::ROW_SAMPLE, labels);
+    model->train(trainData);
+    // try to predict on data which have fewer features - this should throw an exception
+    for(int i = 1; i < numFeatures; ++i) {  // every width from 1 up to numFeatures - 1, including the off-by-one case
+        Mat test(1, i, CV_32FC1);
+        ASSERT_THROW(model->predict(test), Exception);
+    }
+    // try to predict on data which have more features - this should also throw an exception
+    Mat test(1, numFeatures + 1, CV_32FC1);
+    ASSERT_THROW(model->predict(test), Exception);
+}
 
 
 }} // namespace
diff --git a/modules/objdetect/misc/java/test/QRCodeDetectorTest.java b/modules/objdetect/misc/java/test/QRCodeDetectorTest.java
index cd8be409aa..369e7b8cc3 100644
--- a/modules/objdetect/misc/java/test/QRCodeDetectorTest.java
+++ b/modules/objdetect/misc/java/test/QRCodeDetectorTest.java
@@ -5,7 +5,10 @@ import org.opencv.core.Mat;
 import org.opencv.objdetect.QRCodeDetector;
 import org.opencv.imgcodecs.Imgcodecs;
 import org.opencv.test.OpenCVTestCase;
+import java.util.Arrays;
 import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
 
 public class QRCodeDetectorTest extends OpenCVTestCase {
 
@@ -39,11 +42,7 @@ public class QRCodeDetectorTest extends OpenCVTestCase {
         boolean result = detector.detectAndDecodeMulti(img, output);
         assertTrue(result);
         assertEquals(output.size(), 6);
-        assertEquals(output.get(0), "SKIP");
-        assertEquals(output.get(1), "EXTRA");
-        assertEquals(output.get(2), "TWO STEPS FORWARD");
-        assertEquals(output.get(3), "STEP BACK");
-        assertEquals(output.get(4), "QUESTION");
-        assertEquals(output.get(5), "STEP FORWARD");
+        List < String > expectedResults = Arrays.asList("SKIP", "EXTRA", "TWO STEPS FORWARD", "STEP BACK", "QUESTION", "STEP FORWARD");
+        assertEquals(new HashSet<String>(output), new HashSet<String>(expectedResults));
     }
 }
diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp
index 5ae9126983..00a182245a 100644
--- a/modules/objdetect/src/hog.cpp
+++ b/modules/objdetect/src/hog.cpp
@@ -120,6 +120,12 @@ void HOGDescriptor::setSVMDetector(InputArray _svmDetector)
     _svmDetector.getMat().convertTo(svmDetector, CV_32F);
     CV_Assert(checkDetectorSize());
 
+    if (_svmDetector.empty())
+    {
+        oclSvmDetector = UMat();
+        return;
+    }
+
     Mat detector_reordered(1, (int)svmDetector.size(), CV_32FC1);
 
     size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
diff --git a/modules/objdetect/test/opencl/test_hogdetector.cpp b/modules/objdetect/test/opencl/test_hogdetector.cpp
index 009274096f..cffe2e3fb5 100644
--- a/modules/objdetect/test/opencl/test_hogdetector.cpp
+++ b/modules/objdetect/test/opencl/test_hogdetector.cpp
@@ -93,6 +93,25 @@ OCL_TEST_P(HOG, GetDescriptors)
     EXPECT_MAT_SIMILAR(cpu_desc, gpu_desc, 1e-1);
 }
 
+OCL_TEST_P(HOG, SVMDetector)
+{
+    HOGDescriptor hog_first, hog_second;  // no EXPECT/ASSERT below: the test only runs setSVMDetector()/copyTo() sequences
+
+    // empty -> empty: setSVMDetector() has not been called on either descriptor yet
+    hog_first.copyTo(hog_second);
+
+    // first -> both: only hog_first has been given a detector before this copy
+    hog_first.setSVMDetector(hog_first.getDefaultPeopleDetector());
+    hog_first.copyTo(hog_second);
+
+    // both -> both: the same copy again, after the previous copyTo() into hog_second
+    hog_first.copyTo(hog_second);
+
+    // second -> empty: hog_first is given cv::noArray() as its detector before this copy
+    hog_first.setSVMDetector(cv::noArray());
+    hog_first.copyTo(hog_second);
+}
+
 OCL_TEST_P(HOG, Detect)
 {
     HOGDescriptor hog;
diff --git a/modules/photo/src/contrast_preserve.hpp b/modules/photo/src/contrast_preserve.hpp
index 1afd4bc3e3..5681779fc9 100644
--- a/modules/photo/src/contrast_preserve.hpp
+++ b/modules/photo/src/contrast_preserve.hpp
@@ -285,9 +285,9 @@ void Decolor::grad_system(const Mat &im, vector < vector < double > > &polyGrad,
                     add_vector(comb,idx,r,g,b);
                     for(int i = 0;i<h;i++)
                         for(int j=0;j<w;j++)
-                            curIm.at<float>(i,j)=
+                            curIm.at<float>(i,j)=static_cast<float>(
                                 pow(rgb_channel[2].at<float>(i,j),r)*pow(rgb_channel[1].at<float>(i,j),g)*
-                                pow(rgb_channel[0].at<float>(i,j),b);
+                                pow(rgb_channel[0].at<float>(i,j),b));
                     vector <double> curGrad;
                     gradvector(curIm,curGrad);
                     add_to_vector_poly(polyGrad,curGrad,idx1);
diff --git a/modules/photo/src/merge.cpp b/modules/photo/src/merge.cpp
index fbeb4639b4..e6a00fedb8 100644
--- a/modules/photo/src/merge.cpp
+++ b/modules/photo/src/merge.cpp
@@ -344,7 +344,7 @@ public:
             result += times.at<float>((int)i) * w.mul(im);
             wsum += times.at<float>((int)i) * times.at<float>((int)i) * w;
         }
-        result = result.mul(1 / wsum);
+        result = result.mul(1 / (wsum + Scalar::all(DBL_EPSILON)));
     }
 
     void process(InputArrayOfArrays src, OutputArray dst, InputArray times) CV_OVERRIDE
diff --git a/modules/photo/test/test_hdr.cpp b/modules/photo/test/test_hdr.cpp
index 198b83470c..10050abbcb 100644
--- a/modules/photo/test/test_hdr.cpp
+++ b/modules/photo/test/test_hdr.cpp
@@ -249,4 +249,21 @@ TEST(Photo_CalibrateRobertson, regression)
     checkEqual(expected, response, 1e-1f, "CalibrateRobertson");
 }
 
+TEST(Photo_CalibrateRobertson, bug_18180)
+{
+    vector<Mat> images;
+    const string test_path = cvtest::TS::ptr()->get_data_path() + "hdr/exposures/bug_18180/";
+    for(int i = 1; i <= 4; ++i)
+        images.push_back(imread(test_path + std::to_string(i) + ".jpg"));
+    for(const Mat& image : images) ASSERT_FALSE(image.empty()) << "Can't read test images from " << test_path;
+    vector<float> times {15.0f, 2.5f, 0.25f, 0.33f};
+    Mat response;
+    Ptr<CalibrateRobertson> calibrate = createCalibrateRobertson(2, 0.01f);
+    calibrate->process(images, response, times);
+    Mat response_no_nans = response.clone();
+    patchNaNs(response_no_nans);
+    // since there should be no NaNs, original response vs. response with NaNs patched should be identical
+    EXPECT_EQ(0.0, cv::norm(response, response_no_nans, NORM_L2));
+}
+
 }} // namespace
diff --git a/modules/python/common.cmake b/modules/python/common.cmake
index 6a438fd1a2..1a6cc97429 100644
--- a/modules/python/common.cmake
+++ b/modules/python/common.cmake
@@ -218,6 +218,28 @@ if(NOT OPENCV_SKIP_PYTHON_LOADER)
   endif()
   configure_file("${PYTHON_SOURCE_DIR}/package/template/config-x.y.py.in" "${__python_loader_install_tmp_path}/cv2/${__target_config}" @ONLY)
   install(FILES "${__python_loader_install_tmp_path}/cv2/${__target_config}" DESTINATION "${OPENCV_PYTHON_INSTALL_PATH}/cv2/" COMPONENT python)
+
+  # handle Python extra code
+  foreach(m ${OPENCV_MODULES_BUILD})
+    if (";${OPENCV_MODULE_${m}_WRAPPERS};" MATCHES ";python;" AND HAVE_${m}
+        AND EXISTS "${OPENCV_MODULE_${m}_LOCATION}/misc/python/package"
+    )
+      set(__base "${OPENCV_MODULE_${m}_LOCATION}/misc/python/package")
+      # collect the module's extra .py files as paths relative to ${__base}
+      file(GLOB_RECURSE extra_py_files RELATIVE "${__base}" "${__base}/**/*.py")
+      if(extra_py_files)
+        list(SORT extra_py_files)
+        foreach(f ${extra_py_files})
+          # install(FILES ... DESTINATION) takes a directory, not a file path: use the parent directory of ${f}
+          get_filename_component(__extra_py_dir "${f}" DIRECTORY)
+          configure_file("${__base}/${f}" "${__loader_path}/cv2/_extra_py_code/${f}" COPYONLY)
+          install(FILES "${__base}/${f}" DESTINATION "${OPENCV_PYTHON_INSTALL_PATH}/cv2/_extra_py_code/${__extra_py_dir}" COMPONENT python)
+        endforeach()
+      else()
+        message(WARNING "Module ${m} has no .py files in misc/python/package")
+      endif()
+    endif()
+  endforeach(m)
 endif()  # NOT OPENCV_SKIP_PYTHON_LOADER
 
 unset(PYTHON_SRC_DIR)
diff --git a/modules/python/package/cv2/__init__.py b/modules/python/package/cv2/__init__.py
index 940ac65732..27db65eef6 100644
--- a/modules/python/package/cv2/__init__.py
+++ b/modules/python/package/cv2/__init__.py
@@ -4,6 +4,8 @@ OpenCV Python binary extension loader
 import os
 import sys
 
+__all__ = []
+
 try:
     import numpy
     import numpy.core.multiarray
@@ -13,6 +15,14 @@ except ImportError:
     print('    pip install numpy')
     raise
 
+
+py_code_loader = None
+if sys.version_info[:2] >= (3, 0):
+    try:
+        from . import _extra_py_code as py_code_loader
+    except:
+        pass
+
 # TODO
 # is_x64 = sys.maxsize > 2**32
 
@@ -34,7 +44,7 @@ def bootstrap():
     import platform
     if DEBUG: print('OpenCV loader: os.name="{}"  platform.system()="{}"'.format(os.name, str(platform.system())))
 
-    LOADER_DIR=os.path.dirname(os.path.abspath(__file__))
+    LOADER_DIR = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
 
     PYTHON_EXTENSIONS_PATHS = []
     BINARIES_PATHS = []
@@ -97,6 +107,11 @@ def bootstrap():
     except:
         pass
 
+    if DEBUG: print('OpenCV loader: binary extension... OK')
+
+    if py_code_loader:
+        py_code_loader.init('cv2')
+
     if DEBUG: print('OpenCV loader: DONE')
 
 bootstrap()
diff --git a/modules/python/package/cv2/_extra_py_code/__init__.py b/modules/python/package/cv2/_extra_py_code/__init__.py
new file mode 100644
index 0000000000..be84566825
--- /dev/null
+++ b/modules/python/package/cv2/_extra_py_code/__init__.py
@@ -0,0 +1,53 @@
+import sys
+import importlib
+
+__all__ = ['init']
+
+
+DEBUG = False
+if hasattr(sys, 'OpenCV_LOADER_DEBUG'):
+    DEBUG = True
+
+
+def _load_py_code(base, name):
+    """Copy the public, non-module symbols of the extension `__name__ + name` onto sys.modules[base + name]."""
+    ext_name = __name__ + name
+    try:
+        ext = importlib.import_module(ext_name)
+    except ImportError:
+        return  # extension doesn't exist?
+
+    if DEBUG: print('OpenCV loader: added python code extension for: ' + name)
+
+    if hasattr(ext, '__all__'):
+        exported = dict((key, getattr(ext, key)) for key in ext.__all__)
+    else:
+        exported = ext.__dict__
+
+    for key, value in exported.items():
+        if key.startswith('_') or isinstance(value, type(sys)):  # skip internals and modules
+            continue
+        if DEBUG: print('    symbol: {} = {}'.format(key, value))
+        setattr(sys.modules[base + name], key, value)
+
+    del sys.modules[ext_name]
+
+
+# TODO: listdir
+def init(base):
+    """Load the '.cv2' extension, then one extension per module in sys.modules whose name starts with `base`."""
+    _load_py_code(base, '.cv2')  # special case
+    skipped_prefixes = (
+        '._',  # skip internals
+        '.load_config_',  # skip helper files
+    )
+
+    # Take the list of names up front: _load_py_code() imports into and deletes from sys.modules.
+    loaded_names = [mod_name for mod_name in sys.modules.keys() if mod_name.startswith(base)]
+    for mod_name in loaded_names:
+        suffix = mod_name[len(base):]  # strip prefix
+        if not suffix or suffix.startswith(skipped_prefixes):
+            continue
+        _load_py_code(base, suffix)
+
+    del sys.modules[__name__]
diff --git a/modules/python/python_loader.cmake b/modules/python/python_loader.cmake
index 31cd33505a..677111b061 100644
--- a/modules/python/python_loader.cmake
+++ b/modules/python/python_loader.cmake
@@ -25,10 +25,12 @@ endif()
 set(PYTHON_LOADER_FILES
     "setup.py" "cv2/__init__.py"
     "cv2/load_config_py2.py" "cv2/load_config_py3.py"
+    "cv2/_extra_py_code/__init__.py"
 )
 foreach(fname ${PYTHON_LOADER_FILES})
   get_filename_component(__dir "${fname}" DIRECTORY)
-  file(COPY "${PYTHON_SOURCE_DIR}/package/${fname}" DESTINATION "${__loader_path}/${__dir}")
+  # avoid using of file(COPY) to rerun CMake on changes
+  configure_file("${PYTHON_SOURCE_DIR}/package/${fname}" "${__loader_path}/${fname}" COPYONLY)
   if(fname STREQUAL "setup.py")
     if(OPENCV_PYTHON_SETUP_PY_INSTALL_PATH)
       install(FILES "${PYTHON_SOURCE_DIR}/package/${fname}" DESTINATION "${OPENCV_PYTHON_SETUP_PY_INSTALL_PATH}" COMPONENT python)
diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py
index 08de11d861..c6ed806017 100755
--- a/modules/python/src2/gen2.py
+++ b/modules/python/src2/gen2.py
@@ -211,6 +211,7 @@ simple_argtype_mapping = {
     "double": ArgTypeInfo("double", FormatStrings.double, "0", True),
     "c_string": ArgTypeInfo("char*", FormatStrings.string, '(char*)""'),
     "string": ArgTypeInfo("std::string", FormatStrings.object, None, True),
+    "Stream": ArgTypeInfo("Stream", FormatStrings.object, 'Stream::Null()', True),
 }
 
 
diff --git a/modules/python/test/test_cuda.py b/modules/python/test/test_cuda.py
index 4bcd4108f1..4b3fc7d278 100644
--- a/modules/python/test/test_cuda.py
+++ b/modules/python/test/test_cuda.py
@@ -26,6 +26,15 @@ class cuda_test(NewOpenCVTests):
 
         self.assertTrue(np.allclose(cuMat.download(), npMat))
 
+    def test_cuda_upload_download_stream(self):  # upload/download round trip with one cuda_Stream passed to both calls
+        stream = cv.cuda_Stream()
+        npMat = (np.random.random((128, 128, 3)) * 255).astype(np.uint8)
+        cuMat = cv.cuda_GpuMat(128,128, cv.CV_8UC3)
+        cuMat.upload(npMat, stream)
+        npMat2 = cuMat.download(stream=stream)
+        stream.waitForCompletion()  # runs before npMat2 is compared; presumably the stream calls are asynchronous - TODO confirm
+        self.assertTrue(np.allclose(npMat2, npMat))
+
     def test_cuda_interop(self):
         npMat = (np.random.random((128, 128, 3)) * 255).astype(np.uint8)
         cuMat = cv.cuda_GpuMat()
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index 5e96822e81..4791584366 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -496,7 +496,7 @@ void BestOf2NearestRangeMatcher::operator ()(const std::vector<ImageFeatures> &f
 
     std::vector<std::pair<int,int> > near_pairs;
     for (int i = 0; i < num_images - 1; ++i)
-        for (int j = i + 1; j < std::min(num_images, i + range_width_); ++j)
+        for (int j = i + 1; j < std::min(num_images, i + 1 + range_width_); ++j)
             if (features[i].keypoints.size() > 0 && features[j].keypoints.size() > 0 && mask_(i, j))
                 near_pairs.push_back(std::make_pair(i, j));
 
diff --git a/modules/ts/misc/run.py b/modules/ts/misc/run.py
index 2d5de0708c..c2e4d6532b 100755
--- a/modules/ts/misc/run.py
+++ b/modules/ts/misc/run.py
@@ -51,6 +51,7 @@ if __name__ == "__main__":
     parser.add_argument("--android_propagate_opencv_env", action="store_true", default=False, help="Android: propagate OPENCV* environment variables")
     parser.add_argument("--serial", metavar="serial number", default="", help="Android: directs command to the USB device or emulator with the given serial number")
     parser.add_argument("--package", metavar="package", default="", help="Java: run JUnit tests for specified module or Android package")
+    parser.add_argument("--java_test_exclude", metavar="java_test_exclude", default="", help="Java: Filter out specific JUnit tests")
 
     parser.add_argument("--trace", action="store_true", default=False, help="Trace: enable OpenCV tracing")
     parser.add_argument("--trace_dump", metavar="trace_dump", default=-1, help="Trace: dump highlight calls (specify max entries count, 0 - dump all)")
diff --git a/modules/ts/misc/run_suite.py b/modules/ts/misc/run_suite.py
index 0420d9a968..2f382238cd 100644
--- a/modules/ts/misc/run_suite.py
+++ b/modules/ts/misc/run_suite.py
@@ -115,6 +115,8 @@ class TestSuite(object):
             cmd = [self.cache.ant_executable, "-Dopencv.build.type=%s" % self.cache.build_type]
             if self.options.package:
                 cmd += ["-Dopencv.test.package=%s" % self.options.package]
+            if self.options.java_test_exclude:
+                cmd += ["-Dopencv.test.exclude=%s" % self.options.java_test_exclude]
             cmd += ["buildAndTest"]
             ret = execute(cmd, cwd=self.cache.java_test_dir)
             return None, ret
diff --git a/modules/ts/misc/table_formatter.py b/modules/ts/misc/table_formatter.py
index 412936950f..96bafab72d 100755
--- a/modules/ts/misc/table_formatter.py
+++ b/modules/ts/misc/table_formatter.py
@@ -1,7 +1,11 @@
 #!/usr/bin/env python
 
 from __future__ import print_function
-import sys, re, os.path, cgi, stat, math
+import sys, re, os.path, stat, math
+try:
+    from html import escape
+except ImportError:
+    from cgi import escape  # Python 2.7
 from optparse import OptionParser
 from color import getColorizer, dummyColorizer
 
@@ -23,7 +27,7 @@ class tblRow(object):
         self.props = props
 
 def htmlEncode(str):
-    return '<br/>'.join([cgi.escape(s) for s in str])
+    return '<br/>'.join([escape(s) for s in str])
 
 class table(object):
     def_align = "left"
diff --git a/modules/video/include/opencv2/video/tracking.hpp b/modules/video/include/opencv2/video/tracking.hpp
index af35aaa4e7..7ec6bc55cd 100644
--- a/modules/video/include/opencv2/video/tracking.hpp
+++ b/modules/video/include/opencv2/video/tracking.hpp
@@ -818,6 +818,36 @@ public:
     //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
 };
 
+class CV_EXPORTS_W TrackerDaSiamRPN : public Tracker
+{
+protected:
+    TrackerDaSiamRPN();  // use ::create()
+public:
+    virtual ~TrackerDaSiamRPN() CV_OVERRIDE;
+
+    struct CV_EXPORTS_W_SIMPLE Params
+    {
+        CV_WRAP Params();
+        CV_PROP_RW std::string model;  //!< network file passed to dnn::readNet(); default "dasiamrpn_model.onnx"
+        CV_PROP_RW std::string kernel_cls1;  //!< network file passed to dnn::readNet(); default "dasiamrpn_kernel_cls1.onnx"
+        CV_PROP_RW std::string kernel_r1;  //!< network file passed to dnn::readNet(); default "dasiamrpn_kernel_r1.onnx"
+        CV_PROP_RW int backend;  //!< passed to setPreferableBackend() of each network; default DNN_BACKEND_DEFAULT (-1 without dnn)
+        CV_PROP_RW int target;  //!< passed to setPreferableTarget() of each network; default DNN_TARGET_CPU (-1 without dnn)
+    };
+
+    /** @brief Constructor (static factory; the class constructor itself is protected)
+    @param parameters DaSiamRPN parameters TrackerDaSiamRPN::Params
+    */
+    static CV_WRAP
+    Ptr<TrackerDaSiamRPN> create(const TrackerDaSiamRPN::Params& parameters = TrackerDaSiamRPN::Params());
+
+    /** @brief Return tracking score
+    */
+    CV_WRAP virtual float getTrackingScore() = 0;
+
+    //void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
+    //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
+};
 
 
 //! @} video_track
diff --git a/modules/video/misc/python/pyopencv_video.hpp b/modules/video/misc/python/pyopencv_video.hpp
index 761905c8bf..ea8977911f 100644
--- a/modules/video/misc/python/pyopencv_video.hpp
+++ b/modules/video/misc/python/pyopencv_video.hpp
@@ -1,4 +1,5 @@
 #ifdef HAVE_OPENCV_VIDEO
 typedef TrackerMIL::Params TrackerMIL_Params;
 typedef TrackerGOTURN::Params TrackerGOTURN_Params;
+typedef TrackerDaSiamRPN::Params TrackerDaSiamRPN_Params;
 #endif
diff --git a/modules/video/src/ecc.cpp b/modules/video/src/ecc.cpp
index b7148c22ef..eede0e7071 100644
--- a/modules/video/src/ecc.cpp
+++ b/modules/video/src/ecc.cpp
@@ -323,17 +323,34 @@ double cv::computeECC(InputArray templateImage, InputArray inputImage, InputArra
     Scalar meanTemplate, sdTemplate;
 
     int active_pixels = inputMask.empty() ? templateImage.size().area() : countNonZero(inputMask);
-
+    int type = templateImage.type();
     meanStdDev(templateImage, meanTemplate, sdTemplate, inputMask);
     Mat templateImage_zeromean = Mat::zeros(templateImage.size(), templateImage.type());
-    subtract(templateImage, meanTemplate, templateImage_zeromean, inputMask);
+    Mat templateMat = templateImage.getMat();
+    Mat inputMat = inputImage.getMat();
+
+    /*
+     * For unsigned ints, when the mean is computed and subtracted, any values less than the mean
+     * will be set to 0 (since there are no negatives values). This impacts the norm and dot product, which
+     * ultimately results in an incorrect ECC. To circumvent this problem, if unsigned ints are provided,
+     * we convert them to a signed ints with larger resolution for the subtraction step.
+     */
+    if(type == CV_8U || type == CV_16U) {
+        int newType = type == CV_8U ? CV_16S : CV_32S;
+        Mat templateMatConverted, inputMatConverted;
+        templateMat.convertTo(templateMatConverted, newType);
+        cv::swap(templateMat, templateMatConverted);
+        inputMat.convertTo(inputMatConverted, newType);
+        cv::swap(inputMat, inputMatConverted);
+    }
+    subtract(templateMat, meanTemplate, templateImage_zeromean, inputMask);
     double templateImagenorm = std::sqrt(active_pixels*sdTemplate.val[0]*sdTemplate.val[0]);
 
     Scalar meanInput, sdInput;
 
     Mat inputImage_zeromean = Mat::zeros(inputImage.size(), inputImage.type());
     meanStdDev(inputImage, meanInput, sdInput, inputMask);
-    subtract(inputImage, meanInput, inputImage_zeromean, inputMask);
+    subtract(inputMat, meanInput, inputImage_zeromean, inputMask);
     double inputImagenorm = std::sqrt(active_pixels*sdInput.val[0]*sdInput.val[0]);
 
     return templateImage_zeromean.dot(inputImage_zeromean)/(templateImagenorm*inputImagenorm);
diff --git a/modules/video/src/tracking/tracker_dasiamrpn.cpp b/modules/video/src/tracking/tracker_dasiamrpn.cpp
new file mode 100644
index 0000000000..4d1004b419
--- /dev/null
+++ b/modules/video/src/tracking/tracker_dasiamrpn.cpp
@@ -0,0 +1,440 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+
+#ifdef HAVE_OPENCV_DNN
+#include "opencv2/dnn.hpp"
+#endif
+
+namespace cv {
+
+TrackerDaSiamRPN::TrackerDaSiamRPN()
+{
+    // nothing
+}
+
+TrackerDaSiamRPN::~TrackerDaSiamRPN()
+{
+    // nothing
+}
+
+TrackerDaSiamRPN::Params::Params()
+{
+    model = "dasiamrpn_model.onnx";
+    kernel_cls1 = "dasiamrpn_kernel_cls1.onnx";
+    kernel_r1 = "dasiamrpn_kernel_r1.onnx";
+#ifdef HAVE_OPENCV_DNN
+    backend = dnn::DNN_BACKEND_DEFAULT;
+    target = dnn::DNN_TARGET_CPU;
+#else
+    backend = -1;  // invalid value
+    target = -1;  // invalid value
+#endif
+}
+
+#ifdef HAVE_OPENCV_DNN
+
+template <typename T> static
+T sizeCal(const T& w, const T& h)
+{
+    T pad = (w + h) * T(0.5);
+    T sz2 = (w + pad) * (h + pad);
+    return sqrt(sz2);
+}
+
+template <>
+Mat sizeCal(const Mat& w, const Mat& h)
+{
+    Mat pad = (w + h) * 0.5;
+    Mat sz2 = (w + pad).mul((h + pad));
+
+    cv::sqrt(sz2, sz2);
+    return sz2;
+}
+
+class TrackerDaSiamRPNImpl : public TrackerDaSiamRPN  // DNN-backed implementation returned by TrackerDaSiamRPN::create()
+{
+public:
+    TrackerDaSiamRPNImpl(const TrackerDaSiamRPN::Params& parameters)
+        : params(parameters)
+    {
+        // Load the three networks named in params; each one is checked for emptiness below.
+        siamRPN = dnn::readNet(params.model);
+        siamKernelCL1 = dnn::readNet(params.kernel_cls1);
+        siamKernelR1 = dnn::readNet(params.kernel_r1);
+
+        CV_Assert(!siamRPN.empty());
+        CV_Assert(!siamKernelCL1.empty());
+        CV_Assert(!siamKernelR1.empty());
+        // Apply the same preferable backend/target from params to all three networks.
+        siamRPN.setPreferableBackend(params.backend);
+        siamRPN.setPreferableTarget(params.target);
+        siamKernelR1.setPreferableBackend(params.backend);
+        siamKernelR1.setPreferableTarget(params.target);
+        siamKernelCL1.setPreferableBackend(params.backend);
+        siamKernelCL1.setPreferableTarget(params.target);
+    }
+
+    void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;  // stores the frame and target box, then calls trackerInit()
+    bool update(InputArray image, Rect& boundingBox) CV_OVERRIDE;  // runs trackerEval() and writes the new box; always returns true
+    float getTrackingScore() CV_OVERRIDE;  // returns trackState.tracking_score
+
+    TrackerDaSiamRPN::Params params;
+
+protected:
+    dnn::Net siamRPN, siamKernelR1, siamKernelCL1;
+    Rect boundingBox_;
+    Mat image_;  // private copy (clone) of the most recent frame passed to init()/update()
+    struct trackerConfig
+    {
+        float windowInfluence = 0.43f;  // blend weight of the Hanning window in the penalized score
+        float lr = 0.4f;  // base factor of the width/height update rate in trackerEval()
+        int scale = 8;
+        bool swapRB = false;
+        int totalStride = 8;
+        float penaltyK = 0.055f;
+        int exemplarSize = 127;  // blob side used for the template crop in trackerInit()
+        int instanceSize = 271;  // blob side used for the search crop in trackerEval()
+        float contextAmount = 0.5f;
+        std::vector<float> ratios = { 0.33f, 0.5f, 1.0f, 2.0f, 3.0f };
+        int anchorNum = int(ratios.size());
+        Mat anchors;
+        Mat windows;
+        Scalar avgChans;  // per-channel mean of the init frame, used as border fill in getSubwindow()
+        Size imgSize = { 0, 0 };
+        Rect2f targetBox = { 0, 0, 0, 0 };  // NOTE: x/y hold the box CENTER (see init()), not the top-left corner
+        int scoreSize = (instanceSize - exemplarSize) / totalStride + 1;
+        float tracking_score = 0.f;  // score of the best candidate from the last trackerEval(); 0 until update() has run
+
+        void update_scoreSize()
+        {
+            scoreSize = int((instanceSize - exemplarSize) / totalStride + 1);
+        }
+    };
+    trackerConfig trackState;
+
+    void softmax(const Mat& src, Mat& dst);
+    void elementMax(Mat& src);
+    Mat generateHanningWindow();
+    Mat generateAnchors();
+    Mat getSubwindow(Mat& img, const Rect2f& targetBox, float originalSize, Scalar avgChans);
+    void trackerInit(Mat img);
+    void trackerEval(Mat img);
+};
+
+void TrackerDaSiamRPNImpl::init(InputArray image, const Rect& boundingBox)
+{
+    image_ = image.getMat().clone();
+
+    trackState.update_scoreSize();
+    trackState.targetBox = Rect2f(
+        float(boundingBox.x) + float(boundingBox.width) * 0.5f,  // FIXIT don't use center in Rect structures, it is confusing
+        float(boundingBox.y) + float(boundingBox.height) * 0.5f,
+        float(boundingBox.width),
+        float(boundingBox.height)
+    );
+    trackerInit(image_);
+}
+
+void TrackerDaSiamRPNImpl::trackerInit(Mat img)
+{
+    Rect2f targetBox = trackState.targetBox;
+    Mat anchors = generateAnchors();
+    trackState.anchors = anchors;
+
+    Mat windows = generateHanningWindow();
+
+    trackState.windows = windows;
+    trackState.imgSize = img.size();
+
+    trackState.avgChans = mean(img);
+    float wc = targetBox.width + trackState.contextAmount * (targetBox.width + targetBox.height);
+    float hc = targetBox.height + trackState.contextAmount * (targetBox.width + targetBox.height);
+    float sz = (float)cvRound(sqrt(wc * hc));
+
+    Mat zCrop = getSubwindow(img, targetBox, sz, trackState.avgChans);
+    Mat blob;
+
+    dnn::blobFromImage(zCrop, blob, 1.0, Size(trackState.exemplarSize, trackState.exemplarSize), Scalar(), trackState.swapRB, false, CV_32F);
+    siamRPN.setInput(blob);
+    Mat out1;
+    siamRPN.forward(out1, "63");
+
+    siamKernelCL1.setInput(out1);
+    siamKernelR1.setInput(out1);
+
+    Mat cls1 = siamKernelCL1.forward();
+    Mat r1 = siamKernelR1.forward();
+    std::vector<int> r1_shape = { 20, 256, 4, 4 }, cls1_shape = { 10, 256, 4, 4 };
+
+    siamRPN.setParam(siamRPN.getLayerId("65"), 0, r1.reshape(0, r1_shape));
+    siamRPN.setParam(siamRPN.getLayerId("68"), 0, cls1.reshape(0, cls1_shape));
+}
+
+bool TrackerDaSiamRPNImpl::update(InputArray image, Rect& boundingBox)
+{
+    image_ = image.getMat().clone();
+    trackerEval(image_);
+    boundingBox = {
+        int(trackState.targetBox.x - int(trackState.targetBox.width / 2)),
+        int(trackState.targetBox.y - int(trackState.targetBox.height / 2)),
+        int(trackState.targetBox.width),
+        int(trackState.targetBox.height)
+    };
+    return true;
+}
+
+void TrackerDaSiamRPNImpl::trackerEval(Mat img)
+{
+    Rect2f targetBox = trackState.targetBox;
+    // One tracking step: crop a search window around the last target, run siamRPN, pick the best anchor, update trackState.
+    float wc = targetBox.width + trackState.contextAmount * (targetBox.width + targetBox.height);
+    float hc = targetBox.height + trackState.contextAmount * (targetBox.width + targetBox.height);
+
+    float sz = sqrt(wc * hc);
+    float scaleZ = trackState.exemplarSize / sz;
+
+    float searchSize = float((trackState.instanceSize - trackState.exemplarSize) / 2);
+    float pad = searchSize / scaleZ;
+    float sx = sz + 2 * pad;
+
+    Mat xCrop = getSubwindow(img, targetBox, (float)cvRound(sx), trackState.avgChans);
+
+    Mat blob;
+    std::vector<Mat> outs;
+    std::vector<String> outNames;
+    Mat delta, score;
+    Mat sc, rc, penalty, pscore;
+
+    dnn::blobFromImage(xCrop, blob, 1.0, Size(trackState.instanceSize, trackState.instanceSize), Scalar(), trackState.swapRB, false, CV_32F);
+
+    siamRPN.setInput(blob);
+
+    outNames = siamRPN.getUnconnectedOutLayersNames();
+    siamRPN.forward(outs, outNames);
+    CV_Assert(outs.size() >= 2);  // both outputs are indexed below
+    delta = outs[0];
+    score = outs[1];
+
+    score = score.reshape(0, { 2, trackState.anchorNum, trackState.scoreSize, trackState.scoreSize });
+    delta = delta.reshape(0, { 4, trackState.anchorNum, trackState.scoreSize, trackState.scoreSize });
+
+    softmax(score, score);
+
+    targetBox.width *= scaleZ;
+    targetBox.height *= scaleZ;
+
+    score = score.row(1);
+    score = score.reshape(0, { trackState.anchorNum, trackState.scoreSize, trackState.scoreSize });  // same shape as penalty/windows
+
+    // Post processing
+    delta.row(0) = delta.row(0).mul(trackState.anchors.row(2)) + trackState.anchors.row(0);
+    delta.row(1) = delta.row(1).mul(trackState.anchors.row(3)) + trackState.anchors.row(1);
+    exp(delta.row(2), delta.row(2));
+    delta.row(2) = delta.row(2).mul(trackState.anchors.row(2));
+    exp(delta.row(3), delta.row(3));
+    delta.row(3) = delta.row(3).mul(trackState.anchors.row(3));
+
+    sc = sizeCal(delta.row(2), delta.row(3)) / sizeCal(targetBox.width, targetBox.height);
+    elementMax(sc);
+
+    rc = delta.row(2).mul(1 / delta.row(3));
+    rc = (targetBox.width / targetBox.height) / rc;
+    elementMax(rc);
+
+    // Calculating the penalty
+    exp(((rc.mul(sc) - 1.) * trackState.penaltyK * (-1.0)), penalty);
+    penalty = penalty.reshape(0, { trackState.anchorNum, trackState.scoreSize, trackState.scoreSize });
+
+    pscore = penalty.mul(score);
+    pscore = pscore * (1.0 - trackState.windowInfluence) + trackState.windows * trackState.windowInfluence;
+
+    int bestID[] = { 0 };
+    // Find the index of best score.
+    minMaxIdx(pscore.reshape(0, { trackState.anchorNum * trackState.scoreSize * trackState.scoreSize, 1 }), 0, 0, 0, bestID);
+    delta = delta.reshape(0, { 4, trackState.anchorNum * trackState.scoreSize * trackState.scoreSize });
+    penalty = penalty.reshape(0, { trackState.anchorNum * trackState.scoreSize * trackState.scoreSize, 1 });
+    score = score.reshape(0, { trackState.anchorNum * trackState.scoreSize * trackState.scoreSize, 1 });
+
+    int index[] = { 0, bestID[0] };  // { row of delta (0..3), best candidate }
+    Rect2f resBox = { 0, 0, 0, 0 };
+
+    resBox.x = delta.at<float>(index) / scaleZ;
+    index[0] = 1;
+    resBox.y = delta.at<float>(index) / scaleZ;
+    index[0] = 2;
+    resBox.width = delta.at<float>(index) / scaleZ;
+    index[0] = 3;
+    resBox.height = delta.at<float>(index) / scaleZ;
+
+    float lr = penalty.at<float>(bestID) * score.at<float>(bestID) * trackState.lr;
+
+    resBox.x = resBox.x + targetBox.x;
+    resBox.y = resBox.y + targetBox.y;
+    targetBox.width /= scaleZ;  // undo the scaling applied before the penalty computation
+    targetBox.height /= scaleZ;
+
+    resBox.width = targetBox.width * (1 - lr) + resBox.width * lr;
+    resBox.height = targetBox.height * (1 - lr) + resBox.height * lr;
+
+    resBox.x = float(fmax(0., fmin(float(trackState.imgSize.width), resBox.x)));
+    resBox.y = float(fmax(0., fmin(float(trackState.imgSize.height), resBox.y)));
+    resBox.width = float(fmax(10., fmin(float(trackState.imgSize.width), resBox.width)));
+    resBox.height = float(fmax(10., fmin(float(trackState.imgSize.height), resBox.height)));
+
+    trackState.targetBox = resBox;
+    trackState.tracking_score = score.at<float>(bestID);
+}
+
+float TrackerDaSiamRPNImpl::getTrackingScore()
+{
+    return trackState.tracking_score;
+}
+
+void TrackerDaSiamRPNImpl::softmax(const Mat& src, Mat& dst)
+{
+    Mat maxVal;
+    cv::max(src.row(1), src.row(0), maxVal);
+
+    src.row(1) -= maxVal;
+    src.row(0) -= maxVal;
+
+    exp(src, dst);
+
+    Mat sumVal = dst.row(0) + dst.row(1);
+    dst.row(0) = dst.row(0) / sumVal;
+    dst.row(1) = dst.row(1) / sumVal;
+}
+
+void TrackerDaSiamRPNImpl::elementMax(Mat& src)
+{
+    int* p = src.size.p;
+    int index[] = { 0, 0, 0, 0 };
+    for (int n = 0; n < *p; n++)
+    {
+        for (int k = 0; k < *(p + 1); k++)
+        {
+            for (int i = 0; i < *(p + 2); i++)
+            {
+                for (int j = 0; j < *(p + 3); j++)
+                {
+                    index[0] = n, index[1] = k, index[2] = i, index[3] = j;
+                    float& v = src.at<float>(index);
+                    v = fmax(v, 1.0f / v);
+                }
+            }
+        }
+    }
+}
+
+Mat TrackerDaSiamRPNImpl::generateHanningWindow()
+{
+    Mat baseWindows, HanningWindows;
+
+    createHanningWindow(baseWindows, Size(trackState.scoreSize, trackState.scoreSize), CV_32F);
+    baseWindows = baseWindows.reshape(0, { 1, trackState.scoreSize, trackState.scoreSize });
+    HanningWindows = baseWindows.clone();
+    for (int i = 1; i < trackState.anchorNum; i++)
+    {
+        HanningWindows.push_back(baseWindows);
+    }
+
+    return HanningWindows;
+}
+
+Mat TrackerDaSiamRPNImpl::generateAnchors()
+{
+    int totalStride = trackState.totalStride, scales = trackState.scale, scoreSize = trackState.scoreSize;
+    std::vector<float> ratios = trackState.ratios;
+    std::vector<Rect2f> baseAnchors;
+    int anchorNum = int(ratios.size());
+    int size = totalStride * totalStride;
+
+    float ori = -(float(scoreSize / 2)) * float(totalStride);
+
+    for (auto i = 0; i < anchorNum; i++)
+    {
+        int ws = int(sqrt(size / ratios[i]));
+        int hs = int(ws * ratios[i]);
+
+        float wws = float(ws) * scales;
+        float hhs = float(hs) * scales;
+        Rect2f anchor = { 0, 0, wws, hhs };
+        baseAnchors.push_back(anchor);
+    }
+
+    int anchorIndex[] = { 0, 0, 0, 0 };
+    const int sizes[] = { 4, (int)ratios.size(), scoreSize, scoreSize };
+    Mat anchors(4, sizes, CV_32F);
+
+    for (auto i = 0; i < scoreSize; i++)
+    {
+        for (auto j = 0; j < scoreSize; j++)
+        {
+            for (auto k = 0; k < anchorNum; k++)
+            {
+                anchorIndex[0] = 1, anchorIndex[1] = k, anchorIndex[2] = i, anchorIndex[3] = j;
+                anchors.at<float>(anchorIndex) = ori + totalStride * i;
+
+                anchorIndex[0] = 0;
+                anchors.at<float>(anchorIndex) = ori + totalStride * j;
+
+                anchorIndex[0] = 2;
+                anchors.at<float>(anchorIndex) = baseAnchors[k].width;
+
+                anchorIndex[0] = 3;
+                anchors.at<float>(anchorIndex) = baseAnchors[k].height;
+            }
+        }
+    }
+
+    return anchors;
+}
+
+Mat TrackerDaSiamRPNImpl::getSubwindow(Mat& img, const Rect2f& targetBox, float originalSize, Scalar avgChans)
+{
+    Mat zCrop, dst;
+    Size imgSize = img.size();
+    float c = (originalSize + 1) / 2;
+    float xMin = (float)cvRound(targetBox.x - c);
+    float xMax = xMin + originalSize - 1;
+    float yMin = (float)cvRound(targetBox.y - c);
+    float yMax = yMin + originalSize - 1;
+
+    int leftPad = (int)(fmax(0., -xMin));
+    int topPad = (int)(fmax(0., -yMin));
+    int rightPad = (int)(fmax(0., xMax - imgSize.width + 1));
+    int bottomPad = (int)(fmax(0., yMax - imgSize.height + 1));
+
+    xMin = xMin + leftPad;
+    xMax = xMax + leftPad;
+    yMax = yMax + topPad;
+    yMin = yMin + topPad;
+
+    if (topPad == 0 && bottomPad == 0 && leftPad == 0 && rightPad == 0)
+    {
+        img(Rect(int(xMin), int(yMin), int(xMax - xMin + 1), int(yMax - yMin + 1))).copyTo(zCrop);
+    }
+    else
+    {
+        copyMakeBorder(img, dst, topPad, bottomPad, leftPad, rightPad, BORDER_CONSTANT, avgChans);
+        dst(Rect(int(xMin), int(yMin), int(xMax - xMin + 1), int(yMax - yMin + 1))).copyTo(zCrop);
+    }
+
+    return zCrop;
+}
+Ptr<TrackerDaSiamRPN> TrackerDaSiamRPN::create(const TrackerDaSiamRPN::Params& parameters)
+{
+    return makePtr<TrackerDaSiamRPNImpl>(parameters);
+}
+
+#else  // OPENCV_HAVE_DNN
+Ptr<TrackerDaSiamRPN> TrackerDaSiamRPN::create(const TrackerDaSiamRPN::Params& parameters)
+{
+    (void)(parameters);  // unused: this fallback is compiled only when HAVE_OPENCV_DNN is not defined
+    CV_Error(cv::Error::StsNotImplemented, "to use DaSiamRPN, the tracking module needs to be built with opencv_dnn !");
+}
+#endif  // OPENCV_HAVE_DNN
+}
diff --git a/modules/video/test/test_ecc.cpp b/modules/video/test/test_ecc.cpp
index 84c5b851f5..21a5ae3915 100644
--- a/modules/video/test/test_ecc.cpp
+++ b/modules/video/test/test_ecc.cpp
@@ -501,6 +501,18 @@ TEST(Video_ECC_Test_Compute, accuracy)
     EXPECT_NEAR(ecc, -0.5f, 1e-5f);
 }
 
+TEST(Video_ECC_Test_Compute, bug_14657)
+{
+    /*
+     * Simple test case - a 2 x 2 matrix with 10, 10, 10, 6. When the mean (36 / 4 = 9) is subtracted,
+     * it results in 1, 1, 1, 0 for the unsigned int case - compare to  1, 1, 1, -3 in the signed case.
+     * For this reason, when the same matrix was provided as the input and the template, we didn't get 1 as expected.
+     */
+    Mat img = (Mat_<uint8_t>(2, 2) << 10, 10, 10, 6);
+    EXPECT_NEAR(computeECC(img, img), 1.0f, 1e-5f);
+}
+
+
 TEST(Video_ECC_Translation, accuracy) { CV_ECC_Test_Translation test; test.safe_run();}
 TEST(Video_ECC_Euclidean, accuracy) { CV_ECC_Test_Euclidean test; test.safe_run(); }
 TEST(Video_ECC_Affine, accuracy) { CV_ECC_Test_Affine test; test.safe_run(); }
diff --git a/modules/video/test/test_trackers.cpp b/modules/video/test/test_trackers.cpp
index 7fd0470181..2d0e184408 100644
--- a/modules/video/test/test_trackers.cpp
+++ b/modules/video/test/test_trackers.cpp
@@ -94,4 +94,36 @@ TEST(GOTURN, memory_usage)
     }
 }
 
+TEST(DaSiamRPN, memory_usage)
+{
+    cv::Rect roi(145, 70, 85, 85);  // initial target box passed to init(); overwritten by every update()
+
+    std::string model = cvtest::findDataFile("dnn/onnx/models/dasiamrpn_model.onnx", false);
+    std::string kernel_r1 = cvtest::findDataFile("dnn/onnx/models/dasiamrpn_kernel_r1.onnx", false);
+    std::string kernel_cls1 = cvtest::findDataFile("dnn/onnx/models/dasiamrpn_kernel_cls1.onnx", false);
+    cv::TrackerDaSiamRPN::Params params;
+    params.model = model;
+    params.kernel_r1 = kernel_r1;
+    params.kernel_cls1 = kernel_cls1;
+    cv::Ptr<Tracker> tracker = TrackerDaSiamRPN::create(params);
+
+    string inputVideo = cvtest::findDataFile("tracking/david/data/david.webm");
+    cv::VideoCapture video(inputVideo);
+    ASSERT_TRUE(video.isOpened()) << inputVideo;
+
+    cv::Mat frame;
+    video >> frame;
+    ASSERT_FALSE(frame.empty()) << inputVideo;
+    tracker->init(frame, roi);
+    for (int nframes = 0; nframes < 15; ++nframes)
+    {
+        std::cout << "Frame: " << nframes << std::endl;
+        video >> frame;
+        ASSERT_FALSE(frame.empty()) << "frame " << nframes << " of " << inputVideo;  // fail clearly instead of tracking an empty Mat
+        bool res = tracker->update(frame, roi);
+        ASSERT_TRUE(res);
+        std::cout << "Predicted ROI: " << roi << std::endl;
+    }
+}
+
 }}  // namespace opencv_test::
diff --git a/modules/videoio/CMakeLists.txt b/modules/videoio/CMakeLists.txt
index 3a79631b86..534fcf0e37 100644
--- a/modules/videoio/CMakeLists.txt
+++ b/modules/videoio/CMakeLists.txt
@@ -1,12 +1,18 @@
 set(VIDEOIO_PLUGIN_LIST "" CACHE STRING "List of videoio backends to be compiled as plugins (ffmpeg, gstreamer, mfx, msmf or special value 'all')")
-set(VIDEOIO_ENABLE_PLUGINS "ON" CACHE BOOL "Allow building videoio plugin support")
+set(VIDEOIO_ENABLE_PLUGINS "ON" CACHE BOOL "Allow building and using of videoio plugins")
 mark_as_advanced(VIDEOIO_PLUGIN_LIST VIDEOIO_ENABLE_PLUGINS)
 
 string(REPLACE "," ";" VIDEOIO_PLUGIN_LIST "${VIDEOIO_PLUGIN_LIST}")  # support comma-separated list (,) too
-
-# Make virtual opencv_videoio_plugins target
-if(NOT TARGET opencv_videoio_plugins)
-  add_custom_target(opencv_videoio_plugins ALL)
+if(NOT VIDEOIO_ENABLE_PLUGINS)
+  if(VIDEOIO_PLUGIN_LIST)
+    message(WARNING "VideoIO: plugins are disabled through VIDEOIO_ENABLE_PLUGINS, so VIDEOIO_PLUGIN_LIST='${VIDEOIO_PLUGIN_LIST}' is ignored")
+    set(VIDEOIO_PLUGIN_LIST "")
+  endif()
+else()
+  # Make virtual opencv_videoio_plugins target
+  if(NOT TARGET opencv_videoio_plugins)
+    add_custom_target(opencv_videoio_plugins ALL)
+  endif()
 endif()
 
 ocv_add_module(videoio opencv_imgproc opencv_imgcodecs WRAP java objc python)
@@ -69,7 +75,7 @@ endif()
 
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/plugin.cmake)
 
-set(tgts)
+set(tgts "PRIVATE")
 
 if(TARGET ocv.3rdparty.mediasdk)
   if("mfx" IN_LIST VIDEOIO_PLUGIN_LIST OR VIDEOIO_PLUGIN_LIST STREQUAL "all")
@@ -151,10 +157,26 @@ if(TARGET ocv.3rdparty.ffmpeg)
     list(APPEND tgts ocv.3rdparty.ffmpeg)
   elseif("ffmpeg" IN_LIST VIDEOIO_PLUGIN_LIST OR VIDEOIO_PLUGIN_LIST STREQUAL "all")
     ocv_create_builtin_videoio_plugin("opencv_videoio_ffmpeg" ocv.3rdparty.ffmpeg "cap_ffmpeg.cpp")
+    if(TARGET ocv.3rdparty.ffmpeg.plugin_deps)
+      ocv_target_link_libraries(opencv_videoio_ffmpeg ocv.3rdparty.ffmpeg.plugin_deps)
+    endif()
+    if(TARGET ocv.3rdparty.mediasdk
+        AND NOT OPENCV_FFMPEG_DISABLE_MEDIASDK
+    )
+      ocv_target_link_libraries(opencv_videoio_ffmpeg ocv.3rdparty.mediasdk)
+    endif()
   else()
     list(APPEND videoio_hdrs ${CMAKE_CURRENT_LIST_DIR}/src/cap_ffmpeg_impl.hpp)
     list(APPEND videoio_srcs ${CMAKE_CURRENT_LIST_DIR}/src/cap_ffmpeg.cpp)
     list(APPEND tgts ocv.3rdparty.ffmpeg)
+    if(TARGET ocv.3rdparty.ffmpeg.builtin_deps)
+      list(APPEND tgts ocv.3rdparty.ffmpeg.builtin_deps)
+    endif()
+    if(TARGET ocv.3rdparty.mediasdk
+        AND NOT OPENCV_FFMPEG_DISABLE_MEDIASDK
+    )
+      list(APPEND tgts ocv.3rdparty.mediasdk)
+    endif()
   endif()
 endif()
 
@@ -207,6 +229,21 @@ if(TARGET ocv.3rdparty.android_native_camera)
   list(APPEND tgts ocv.3rdparty.android_native_camera)
 endif()
 
+if(tgts STREQUAL "PRIVATE")
+  set(tgts "")
+endif()
+
+# install used dependencies only (static builds: linked ocv.3rdparty.* targets must be part of the export set)
+if(NOT BUILD_SHARED_LIBS
+    AND NOT (CMAKE_VERSION VERSION_LESS "3.13.0")  # upgrade CMake: https://gitlab.kitware.com/cmake/cmake/-/merge_requests/2152
+)
+  foreach(tgt IN LISTS tgts)  # 'IN LISTS' keyword form: a lowercase 'in' would be iterated as a plain item
+    if(tgt MATCHES "^ocv\\.3rdparty\\.")  # '\\.' reaches the regex as '\.', i.e. a literal dot
+      install(TARGETS ${tgt} EXPORT OpenCVModules)
+    endif()
+  endforeach()
+endif()
+
 ocv_set_module_sources(HEADERS ${videoio_ext_hdrs} ${videoio_hdrs} SOURCES ${videoio_srcs})
 ocv_module_include_directories()
 ocv_create_module()
diff --git a/modules/videoio/cmake/detect_ffmpeg.cmake b/modules/videoio/cmake/detect_ffmpeg.cmake
index 1e5323d22a..58de4b9515 100644
--- a/modules/videoio/cmake/detect_ffmpeg.cmake
+++ b/modules/videoio/cmake/detect_ffmpeg.cmake
@@ -99,6 +99,38 @@ if(HAVE_FFMPEG_WRAPPER)
   ocv_add_external_target(ffmpeg "" "" "HAVE_FFMPEG_WRAPPER")
 elseif(HAVE_FFMPEG)
   ocv_add_external_target(ffmpeg "${FFMPEG_INCLUDE_DIRS}" "${FFMPEG_LIBRARIES}" "HAVE_FFMPEG")
+  set(__builtin_defines "")
+  set(__builtin_include_dirs "")
+  set(__builtin_libs "")
+  set(__plugin_defines "")
+  set(__plugin_include_dirs "")
+  set(__plugin_libs "")
+  if(HAVE_OPENCL)
+    set(__opencl_dirs "")
+    if(OPENCL_INCLUDE_DIRS)
+      set(__opencl_dirs "${OPENCL_INCLUDE_DIRS}")
+    elseif(OPENCL_INCLUDE_DIR)
+      set(__opencl_dirs "${OPENCL_INCLUDE_DIR}")
+    else()
+      set(__opencl_dirs "${OpenCV_SOURCE_DIR}/3rdparty/include/opencl/1.2")
+    endif()
+    # extra dependencies for builtin code (OpenCL dir is required for extensions like cl_d3d11.h)
+    # builtin code: HAVE_OPENCL is already defined through cvconfig.h
+    list(APPEND __builtin_include_dirs "${__opencl_dirs}")
+
+    # extra dependencies for plugin code
+    list(APPEND __plugin_defines "HAVE_OPENCL")
+    list(APPEND __plugin_include_dirs "${__opencl_dirs}")
+  endif()
+
+  # TODO: libva, d3d11
+
+  if(__builtin_include_dirs OR __builtin_defines OR __builtin_libs)
+    ocv_add_external_target(ffmpeg.builtin_deps "${__builtin_include_dirs}" "${__builtin_libs}" "${__builtin_defines}")
+  endif()
+  if(VIDEOIO_ENABLE_PLUGINS AND (__plugin_include_dirs OR __plugin_defines OR __plugin_libs))
+    ocv_add_external_target(ffmpeg.plugin_deps "${__plugin_include_dirs}" "${__plugin_libs}" "${__plugin_defines}")
+  endif()
 endif()
 
 set(HAVE_FFMPEG ${HAVE_FFMPEG} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_msdk.cmake b/modules/videoio/cmake/detect_msdk.cmake
index 41258a7fff..d035c3f5cc 100644
--- a/modules/videoio/cmake/detect_msdk.cmake
+++ b/modules/videoio/cmake/detect_msdk.cmake
@@ -1,3 +1,16 @@
+set(MFX_DEFS "")
+
+if(NOT HAVE_MFX)
+  find_package(VPL QUIET)
+  if(VPL_FOUND)
+    set(MFX_INCLUDE_DIRS "")
+    set(MFX_LIBRARIES "${VPL_IMPORTED_TARGETS}")
+    set(HAVE_MFX TRUE)
+    list(APPEND MFX_DEFS "HAVE_ONEVPL")
+  endif()
+endif()
+
+
 if(NOT HAVE_MFX)
   set(paths "${MFX_HOME}" ENV "MFX_HOME" ENV "INTELMEDIASDKROOT")
   if(MSVC)
@@ -24,9 +37,14 @@ if(NOT HAVE_MFX)
     set(HAVE_MFX TRUE)
     set(MFX_INCLUDE_DIRS "${MFX_INCLUDE}")
     set(MFX_LIBRARIES "${MFX_LIBRARY}")
+    list(APPEND MFX_DEFS "HAVE_MFX_PLUGIN")
   endif()
 endif()
 
+if(NOT HAVE_MFX AND PKG_CONFIG_FOUND)
+  ocv_check_modules(MFX mfx)
+endif()
+
 if(HAVE_MFX AND UNIX)
   foreach(mode NO_DEFAULT_PATH "")
     find_path(MFX_va_INCLUDE va/va.h PATHS ${paths} PATH_SUFFIXES "include" ${mode})
@@ -49,7 +67,8 @@ if(HAVE_MFX AND UNIX)
 endif()
 
 if(HAVE_MFX)
-  ocv_add_external_target(mediasdk "${MFX_INCLUDE_DIRS}" "${MFX_LIBRARIES}" "HAVE_MFX")
+  list(APPEND MFX_DEFS "HAVE_MFX")
+  ocv_add_external_target(mediasdk "${MFX_INCLUDE_DIRS}" "${MFX_LIBRARIES}" "${MFX_DEFS}")
 endif()
 
 set(HAVE_MFX ${HAVE_MFX} PARENT_SCOPE)
diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp
index b7de247a1c..348448bda7 100644
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@@ -184,7 +184,8 @@ enum VideoCaptureProperties {
        CAP_PROP_ORIENTATION_META=48, //!< (read-only) Frame rotation defined by stream meta (applicable for FFmpeg back-end only)
        CAP_PROP_ORIENTATION_AUTO=49, //!< if true - rotates output frames of CvCapture considering video file's metadata  (applicable for FFmpeg back-end only) (https://github.com/opencv/opencv/issues/15499)
        CAP_PROP_HW_ACCELERATION=50, //!< (**open-only**) Hardware acceleration type (see #VideoAccelerationType). Setting supported only via `params` parameter in cv::VideoCapture constructor / .open() method. Default value is backend-specific.
-       CAP_PROP_HW_DEVICE      =51, //!< (**open-only**) Hardware device index (select GPU if multiple available)
+       CAP_PROP_HW_DEVICE      =51, //!< (**open-only**) Hardware device index (select GPU if multiple available). Device enumeration is acceleration type specific.
+       CAP_PROP_HW_ACCELERATION_USE_OPENCL=52, //!< (**open-only**) If non-zero, create new OpenCL context and bind it to current thread. The OpenCL context created with Video Acceleration context attached it (if not attached yet) for optimized GPU data copy between HW accelerated decoder and cv::UMat.
 #ifndef CV_DOXYGEN
        CV__CAP_PROP_LATEST
 #endif
@@ -201,7 +202,8 @@ enum VideoWriterProperties {
                                    //!< will work with grayscale frames.
   VIDEOWRITER_PROP_DEPTH = 5,      //!< Defaults to CV_8U.
   VIDEOWRITER_PROP_HW_ACCELERATION = 6, //!< (**open-only**) Hardware acceleration type (see #VideoAccelerationType). Setting supported only via `params` parameter in VideoWriter constructor / .open() method. Default value is backend-specific.
-  VIDEOWRITER_PROP_HW_DEVICE       = 7, //!< (**open-only**) Hardware device index (select GPU if multiple available)
+  VIDEOWRITER_PROP_HW_DEVICE       = 7, //!< (**open-only**) Hardware device index (select GPU if multiple available). Device enumeration is acceleration type specific.
+  VIDEOWRITER_PROP_HW_ACCELERATION_USE_OPENCL= 8, //!< (**open-only**) If non-zero, create new OpenCL context and bind it to current thread. The OpenCL context created with Video Acceleration context attached it (if not attached yet) for optimized GPU data copy between cv::UMat and HW accelerated encoder.
 #ifndef CV_DOXYGEN
   CV__VIDEOWRITER_PROP_LATEST
 #endif
diff --git a/modules/videoio/include/opencv2/videoio/registry.hpp b/modules/videoio/include/opencv2/videoio/registry.hpp
index 89fb5a836c..cf72247b3f 100644
--- a/modules/videoio/include/opencv2/videoio/registry.hpp
+++ b/modules/videoio/include/opencv2/videoio/registry.hpp
@@ -39,7 +39,32 @@ CV_EXPORTS_W std::vector<VideoCaptureAPIs> getStreamBackends();
 CV_EXPORTS_W std::vector<VideoCaptureAPIs> getWriterBackends();
 
 /** @brief Returns true if backend is available */
-CV_EXPORTS bool hasBackend(VideoCaptureAPIs api);
+CV_EXPORTS_W bool hasBackend(VideoCaptureAPIs api);
+
+/** @brief Returns true if backend is built in (false if backend is used as plugin) */
+CV_EXPORTS_W bool isBackendBuiltIn(VideoCaptureAPIs api);
+
+/** @brief Returns description and ABI/API version of videoio plugin's camera interface */
+CV_EXPORTS_W std::string getCameraBackendPluginVersion(
+    VideoCaptureAPIs api,
+    CV_OUT int& version_ABI,
+    CV_OUT int& version_API
+);
+
+/** @brief Returns description and ABI/API version of videoio plugin's stream capture interface */
+CV_EXPORTS_W std::string getStreamBackendPluginVersion(
+    VideoCaptureAPIs api,
+    CV_OUT int& version_ABI,
+    CV_OUT int& version_API
+);
+
+/** @brief Returns description and ABI/API version of videoio plugin's writer interface */
+CV_EXPORTS_W std::string getWriterBackendPluginVersion(
+    VideoCaptureAPIs api,
+    CV_OUT int& version_ABI,
+    CV_OUT int& version_API
+);
+
 
 //! @}
 }} // namespace
diff --git a/modules/videoio/src/backend.hpp b/modules/videoio/src/backend.hpp
index ecf0e0d1d3..2a95ec05aa 100644
--- a/modules/videoio/src/backend.hpp
+++ b/modules/videoio/src/backend.hpp
@@ -27,6 +27,7 @@ class IBackendFactory
 public:
     virtual ~IBackendFactory() {}
     virtual Ptr<IBackend> getBackend() const = 0;
+    virtual bool isBuiltIn() const = 0;
 };
 
 //=============================================================================
@@ -48,6 +49,17 @@ Ptr<IBackendFactory> createPluginBackendFactory(VideoCaptureAPIs id, const char*
 
 void applyParametersFallback(const Ptr<IVideoCapture>& cap, const VideoCaptureParameters& params);
 
+std::string getCapturePluginVersion(
+    const Ptr<IBackendFactory>& backend_factory,
+    CV_OUT int& version_ABI,
+    CV_OUT int& version_API
+);
+std::string getWriterPluginVersion(
+    const Ptr<IBackendFactory>& backend_factory,
+    CV_OUT int& version_ABI,
+    CV_OUT int& version_API
+);
+
 } // namespace cv::
 
 #endif // BACKEND_HPP_DEFINED
diff --git a/modules/videoio/src/backend_plugin.cpp b/modules/videoio/src/backend_plugin.cpp
index d3b331949a..dc41a90ce8 100644
--- a/modules/videoio/src/backend_plugin.cpp
+++ b/modules/videoio/src/backend_plugin.cpp
@@ -214,6 +214,24 @@ public:
     Ptr<IVideoCapture> createCapture(const std::string &filename, const VideoCaptureParameters& params) const CV_OVERRIDE;
     Ptr<IVideoWriter> createWriter(const std::string& filename, int fourcc, double fps,
                                    const cv::Size& sz, const VideoWriterParameters& params) const CV_OVERRIDE;
+
+    std::string getCapturePluginVersion(CV_OUT int& version_ABI, CV_OUT int& version_API)
+    {
+        CV_Assert(capture_api_ || plugin_api_);
+        const OpenCV_API_Header& api_header = capture_api_ ? capture_api_->api_header : plugin_api_->api_header;
+        version_ABI = api_header.min_api_version;
+        version_API = api_header.api_version;
+        return api_header.api_description;
+    }
+
+    std::string getWriterPluginVersion(CV_OUT int& version_ABI, CV_OUT int& version_API)
+    {
+        CV_Assert(writer_api_ || plugin_api_);
+        const OpenCV_API_Header& api_header = writer_api_ ? writer_api_->api_header : plugin_api_->api_header;
+        version_ABI = api_header.min_api_version;
+        version_API = api_header.api_version;
+        return api_header.api_description;
+    }
 };
 
 class PluginBackendFactory : public IBackendFactory
@@ -233,14 +251,41 @@ public:
 
     Ptr<IBackend> getBackend() const CV_OVERRIDE
     {
-        if (!initialized)
-        {
-            const_cast<PluginBackendFactory*>(this)->initBackend();
-        }
+        initBackend();
         return backend.staticCast<IBackend>();
     }
+
+    bool isBuiltIn() const CV_OVERRIDE { return false; }
+
+    std::string getCapturePluginVersion(
+            CV_OUT int& version_ABI,
+            CV_OUT int& version_API) const
+    {
+        initBackend();
+        if (!backend)
+            CV_Error_(Error::StsNotImplemented, ("Backend '%s' is not available", baseName_));
+        return backend->getCapturePluginVersion(version_ABI, version_API);
+    }
+
+    std::string getWriterPluginVersion(
+            CV_OUT int& version_ABI,
+            CV_OUT int& version_API) const
+    {
+        initBackend();
+        if (!backend)
+            CV_Error_(Error::StsNotImplemented, ("Backend '%s' is not available", baseName_));
+        return backend->getWriterPluginVersion(version_ABI, version_API);
+    }
+
 protected:
-    void initBackend()
+    inline void initBackend() const
+    {
+        if (!initialized)
+        {
+            const_cast<PluginBackendFactory*>(this)->initBackend_();
+        }
+    }
+    void initBackend_()
     {
         AutoLock lock(getInitializationMutex());
         try {
@@ -688,8 +733,51 @@ Ptr<IBackendFactory> createPluginBackendFactory(VideoCaptureAPIs id, const char*
 #if OPENCV_HAVE_FILESYSTEM_SUPPORT && defined(ENABLE_PLUGINS)
     return makePtr<impl::PluginBackendFactory>(id, baseName); //.staticCast<IBackendFactory>();
 #else
+    CV_UNUSED(id);
+    CV_UNUSED(baseName);
     return Ptr<IBackendFactory>();
 #endif
 }
 
+
+// Queries capture plugin ABI/API versions through 'backend_factory' (asserted to be a non-empty PluginBackendFactory).
+std::string getCapturePluginVersion(
+    const Ptr<IBackendFactory>& backend_factory,
+    CV_OUT int& version_ABI,
+    CV_OUT int& version_API
+)
+{
+#if !(OPENCV_HAVE_FILESYSTEM_SUPPORT && defined(ENABLE_PLUGINS))
+    CV_UNUSED(backend_factory);
+    CV_UNUSED(version_ABI);
+    CV_UNUSED(version_API);
+    CV_Error(Error::StsBadFunc, "Plugins are not available in this build");
+#else
+    CV_Assert(backend_factory);
+    impl::PluginBackendFactory* plugin_backend_factory = dynamic_cast<impl::PluginBackendFactory*>(backend_factory.get());
+    CV_Assert(plugin_backend_factory);
+    return plugin_backend_factory->getCapturePluginVersion(version_ABI, version_API);
+#endif
+}
+
+// Queries writer plugin ABI/API versions through 'backend_factory' (asserted to be a non-empty PluginBackendFactory).
+std::string getWriterPluginVersion(
+    const Ptr<IBackendFactory>& backend_factory,
+    CV_OUT int& version_ABI,
+    CV_OUT int& version_API
+)
+{
+#if !(OPENCV_HAVE_FILESYSTEM_SUPPORT && defined(ENABLE_PLUGINS))
+    CV_UNUSED(backend_factory);
+    CV_UNUSED(version_ABI);
+    CV_UNUSED(version_API);
+    CV_Error(Error::StsBadFunc, "Plugins are not available in this build");
+#else
+    CV_Assert(backend_factory);
+    impl::PluginBackendFactory* plugin_backend_factory = dynamic_cast<impl::PluginBackendFactory*>(backend_factory.get());
+    CV_Assert(plugin_backend_factory);
+    return plugin_backend_factory->getWriterPluginVersion(version_ABI, version_API);
+#endif
+}
+
 }  // namespace
diff --git a/modules/videoio/src/backend_static.cpp b/modules/videoio/src/backend_static.cpp
index 2e0088f558..3001906acf 100644
--- a/modules/videoio/src/backend_static.cpp
+++ b/modules/videoio/src/backend_static.cpp
@@ -99,6 +99,8 @@ public:
     {
         return backend.staticCast<IBackend>();
     }
+
+    bool isBuiltIn() const CV_OVERRIDE { return true; }  // static backend factory: always reported as built-in
 };
 
 
@@ -165,6 +167,8 @@ public:
     {
         return backend.staticCast<IBackend>();
     }
+
+    bool isBuiltIn() const CV_OVERRIDE { return true; }  // static backend factory: always reported as built-in
 };
 
 
diff --git a/modules/videoio/src/cap_ffmpeg.cpp b/modules/videoio/src/cap_ffmpeg.cpp
index bd3600e2fd..474907bb48 100644
--- a/modules/videoio/src/cap_ffmpeg.cpp
+++ b/modules/videoio/src/cap_ffmpeg.cpp
@@ -92,8 +92,17 @@ public:
         unsigned char* data = 0;
         int step=0, width=0, height=0, cn=0;
 
-        if (!ffmpegCapture ||
-           !icvRetrieveFrame_FFMPEG_p(ffmpegCapture, &data, &step, &width, &height, &cn))
+        if (!ffmpegCapture)
+            return false;
+
+        // if UMat, try GPU to GPU copy using OpenCL extensions
+        if (frame.isUMat()) {
+            if (ffmpegCapture->retrieveHWFrame(frame)) {
+                return true;
+            }
+        }
+
+        if (!icvRetrieveFrame_FFMPEG_p(ffmpegCapture, &data, &step, &width, &height, &cn))
             return false;
 
         cv::Mat tmp(height, width, CV_MAKETYPE(CV_8U, cn), data, step);
@@ -176,6 +185,13 @@ public:
             return;
         CV_Assert(image.depth() == CV_8U);
 
+        // if UMat, try GPU to GPU copy using OpenCL extensions
+        if (image.isUMat()) {
+            if (ffmpegWriter->writeHWFrame(image)) {
+                return;
+            }
+        }
+
         icvWriteFrame_FFMPEG_p(ffmpegWriter, (const uchar*)image.getMat().ptr(), (int)image.step(), image.cols(), image.rows(), image.channels(), 0);
     }
     virtual bool open( const cv::String& filename, int fourcc, double fps, cv::Size frameSize, const VideoWriterParameters& params )
diff --git a/modules/videoio/src/cap_ffmpeg_hw.hpp b/modules/videoio/src/cap_ffmpeg_hw.hpp
index 6e4f71fd3d..2a7b249739 100644
--- a/modules/videoio/src/cap_ffmpeg_hw.hpp
+++ b/modules/videoio/src/cap_ffmpeg_hw.hpp
@@ -5,7 +5,10 @@
 // Copyright (C) 2020-2021 Intel Corporation
 
 #include "opencv2/videoio.hpp"
-#if defined(__OPENCV_BUILD) || defined(OPENCV_HAVE_CVCONFIG_H)  // TODO Properly detect and add D3D11 / LIBVA dependencies for standalone plugins
+#ifdef HAVE_OPENCL
+#include "opencv2/core/ocl.hpp"
+#endif
+#if defined(__OPENCV_BUILD) && !defined(BUILD_PLUGIN)  // TODO Properly detect and add D3D11 / LIBVA dependencies for standalone plugins
 #include "cvconfig.h"
 #endif
 #include <sstream>
@@ -14,16 +17,31 @@
 #define D3D11_NO_HELPERS
 #include <d3d11.h>
 #include <codecvt>
+#include "opencv2/core/directx.hpp"
+#ifdef HAVE_OPENCL
+#include <CL/cl_d3d11.h>
 #endif
+#endif // HAVE_D3D11
 
 #ifdef HAVE_VA
 #include <va/va_backend.h>
+#ifdef HAVE_VA_INTEL
+#include "opencv2/core/va_intel.hpp"
+#ifndef CL_TARGET_OPENCL_VERSION
+#define CL_TARGET_OPENCL_VERSION 120
 #endif
+#ifdef HAVE_VA_INTEL_OLD_HEADER
+#include <CL/va_ext.h>
+#else
+#include <CL/cl_va_api_media_sharing_intel.h>
+#endif
+#endif
+#endif // HAVE_VA
 
+// FFMPEG "C" headers
 extern "C" {
 #include <libavcodec/avcodec.h>
 #include <libavutil/avutil.h>
-
 #include <libavutil/hwcontext.h>
 #ifdef HAVE_D3D11
 #include <libavutil/hwcontext_d3d11va.h>
@@ -31,8 +49,23 @@ extern "C" {
 #ifdef HAVE_VA
 #include <libavutil/hwcontext_vaapi.h>
 #endif
+#ifdef HAVE_MFX // dependency only on MFX header files, no linkage dependency
+#include <libavutil/hwcontext_qsv.h>
+#endif
 }
 
+#define HW_DEFAULT_POOL_SIZE    32
+#define HW_DEFAULT_SW_FORMAT    AV_PIX_FMT_NV12
+
+using namespace cv;
+
+static AVCodec *hw_find_codec(AVCodecID id, AVHWDeviceType hw_type, int (*check_category)(const AVCodec *),
+                              const char *disabled_codecs, AVPixelFormat *hw_pix_fmt);
+static AVBufferRef* hw_create_device(AVHWDeviceType hw_type, int hw_device, const std::string& device_subname, bool use_opencl);
+static AVBufferRef* hw_create_frames(struct AVCodecContext* ctx, AVBufferRef *hw_device_ctx, int width, int height, AVPixelFormat hw_format);
+static AVPixelFormat hw_get_format_callback(struct AVCodecContext *ctx, const enum AVPixelFormat * fmt);
+static VideoAccelerationType hw_type_to_va_type(AVHWDeviceType hw_type);
+
 static
 const char* getVideoAccelerationName(VideoAccelerationType va_type)
 {
@@ -70,7 +103,7 @@ std::string getDecoderConfiguration(VideoAccelerationType va_type, AVDictionary
     case VIDEO_ACCELERATION_ANY: return "d3d11va";
     case VIDEO_ACCELERATION_D3D11: return "d3d11va";
     case VIDEO_ACCELERATION_VAAPI: return "";
-    case VIDEO_ACCELERATION_MFX: return "";
+    case VIDEO_ACCELERATION_MFX: return ""; // "qsv" fails if non-Intel D3D11 device
     }
     return "";
 #else
@@ -80,7 +113,7 @@ std::string getDecoderConfiguration(VideoAccelerationType va_type, AVDictionary
     case VIDEO_ACCELERATION_ANY: return "vaapi.iHD";
     case VIDEO_ACCELERATION_D3D11: return "";
     case VIDEO_ACCELERATION_VAAPI: return "vaapi.iHD";
-    case VIDEO_ACCELERATION_MFX: return "";
+    case VIDEO_ACCELERATION_MFX: return "qsv.iHD";
     }
     return "";
 #endif
@@ -125,7 +158,6 @@ std::string getEncoderConfiguration(VideoAccelerationType va_type, AVDictionary
 #endif
 }
 
-
 static
 std::string getDecoderDisabledCodecs(AVDictionary *dict)
 {
@@ -170,19 +202,6 @@ std::string getEncoderDisabledCodecs(AVDictionary *dict)
 #endif
 }
 
-
-#define HW_DEFAULT_POOL_SIZE    32
-#define HW_DEFAULT_SW_FORMAT    AV_PIX_FMT_NV12
-
-using namespace cv;
-
-static AVCodec *hw_find_codec(AVCodecID id, AVHWDeviceType hw_type, int (*check_category)(const AVCodec *),
-                              const char *disabled_codecs, AVPixelFormat *hw_pix_fmt);
-static AVBufferRef* hw_create_device(AVHWDeviceType hw_type, int hw_device, const std::string& device_subname);
-static AVBufferRef* hw_create_frames(struct AVCodecContext* ctx, AVBufferRef *hw_device_ctx, int width, int height, AVPixelFormat hw_format);
-static AVPixelFormat hw_get_format_callback(struct AVCodecContext *ctx, const enum AVPixelFormat * fmt);
-static VideoAccelerationType hw_type_to_va_type(AVHWDeviceType hw_type);
-
 static
 bool hw_check_device(AVBufferRef* ctx, AVHWDeviceType hw_type, const std::string& device_subname) {
     if (!ctx)
@@ -259,75 +278,343 @@ bool hw_check_device(AVBufferRef* ctx, AVHWDeviceType hw_type, const std::string
 }
 
 static
-AVBufferRef* hw_create_device(AVHWDeviceType hw_type, int hw_device, const std::string& device_subname) {
-    if (AV_HWDEVICE_TYPE_NONE == hw_type)
-        return NULL;
-
-    AVHWDeviceType child_type = hw_type;
-    if (hw_type == AV_HWDEVICE_TYPE_QSV) {
-#ifdef _WIN32
-        child_type = AV_HWDEVICE_TYPE_DXVA2;
-#else
-        child_type = AV_HWDEVICE_TYPE_VAAPI;
-#endif
-    }
-
-    AVBufferRef* hw_device_ctx = NULL;
-    char device[128] = "";
-    char* pdevice = NULL;
-    if (hw_device >= 0 && hw_device < 100000) {
-        if (child_type == AV_HWDEVICE_TYPE_VAAPI) {
-            snprintf(device, sizeof(device), "/dev/dri/renderD%d", 128 + hw_device);
-        } else {
-            snprintf(device, sizeof(device), "%d", hw_device);
-        }
-        pdevice = device;
-    }
-    const char *hw_child_name = av_hwdevice_get_type_name(child_type);
-    const char *device_name = pdevice ? pdevice : "'default'";
-    int err = av_hwdevice_ctx_create(&hw_device_ctx, child_type, pdevice, NULL, 0);
-    if (hw_device_ctx && err >= 0)
+AVBufferRef* hw_create_derived_context(AVHWDeviceType hw_type, AVBufferRef* hw_device_ctx) { // derives a 'hw_type' device context from 'hw_device_ctx'; returns NULL on failure
+    AVBufferRef* derived_ctx = NULL;
+    const char* hw_name = av_hwdevice_get_type_name(hw_type); // only used in the log messages below
+    int err = av_hwdevice_ctx_create_derived(&derived_ctx, hw_type, hw_device_ctx, 0);
+    if (!derived_ctx || err < 0)
     {
-        CV_LOG_DEBUG(NULL, "FFMPEG: Created video acceleration context (av_hwdevice_ctx_create) for " << hw_child_name << " on device " << device_name);
-        if (!hw_check_device(hw_device_ctx, hw_type, device_subname)) {
-            av_buffer_unref(&hw_device_ctx);
-            return NULL;
-        }
-        if (hw_type != child_type) {
-            AVBufferRef *derived_ctx = NULL;
-            const char *hw_name = av_hwdevice_get_type_name(hw_type);
-            err = av_hwdevice_ctx_create_derived(&derived_ctx, hw_type, hw_device_ctx, 0);
-            if (!derived_ctx || err < 0)
-            {
-                if (derived_ctx)
-                    av_buffer_unref(&derived_ctx);
-                CV_LOG_INFO(NULL, "FFMPEG: Failed to create derived video acceleration (av_hwdevice_ctx_create_derived) for " << hw_name << ". Error=" << err);
-            }
-            else
-            {
-                CV_LOG_DEBUG(NULL, "FFMPEG: Created derived video acceleration context (av_hwdevice_ctx_create_derived) for " << hw_name);
-            }
-            av_buffer_unref(&hw_device_ctx);
-            return derived_ctx;
-        } else {
-            return hw_device_ctx;
-        }
+        if (derived_ctx) // release a context that was produced despite the error code
+            av_buffer_unref(&derived_ctx);
+        CV_LOG_INFO(NULL, "FFMPEG: Failed to create derived video acceleration (av_hwdevice_ctx_create_derived) for " << hw_name << ". Error=" << err);
+        return NULL;
     }
     else
     {
-        const char *hw_name = hw_child_name;
-        CV_LOG_INFO(NULL, "FFMPEG: Failed to create " << hw_name << " video acceleration (av_hwdevice_ctx_create) on device " << device_name);
+        // Keep a reference to the source (child) context in the 'user_opaque' field of the derived (parent) context.
+        struct FreeChildContext {
+            static void free(struct AVHWDeviceContext* ctx) { // installed below as the 'free' callback of the derived context
+                AVBufferRef* child_ctx = (AVBufferRef*)ctx->user_opaque;
+                if (child_ctx)
+                    av_buffer_unref(&child_ctx); // drops the reference taken with av_buffer_ref() below
+            }
+        };
+        AVHWDeviceContext* ctx = (AVHWDeviceContext*)derived_ctx->data;
+        ctx->user_opaque = av_buffer_ref(hw_device_ctx); // new reference; the caller's own reference is not consumed
+        ctx->free = FreeChildContext::free;
+        CV_LOG_INFO(NULL, "FFMPEG: Created derived video acceleration context (av_hwdevice_ctx_create_derived) for " << hw_name);
+        return derived_ctx;
+    }
+}
+
+#ifdef HAVE_OPENCL // GPU buffer interop with cv::UMat
+
+// Keeps a reference to an FFmpeg hardware device context so that it can be attached
+// to an OpenCL context as user data; the reference is released when this object is destroyed.
+class OpenCL_FFMPEG_Context : public ocl::Context::UserContext {
+public:
+    // Takes its own reference to 'ctx' (av_buffer_ref); the caller keeps its reference.
+    explicit OpenCL_FFMPEG_Context(AVBufferRef* ctx) : ctx_(av_buffer_ref(ctx)) {}
+    virtual ~OpenCL_FFMPEG_Context() { av_buffer_unref(&ctx_); }
+    // Non-copyable: a copy would release the same AVBufferRef reference twice.
+    OpenCL_FFMPEG_Context(const OpenCL_FFMPEG_Context&) = delete;
+    OpenCL_FFMPEG_Context& operator=(const OpenCL_FFMPEG_Context&) = delete;
+    // Borrowed pointer, owned by this object; NULL if av_buffer_ref() failed in the constructor.
+    AVBufferRef* GetAVHWDevice() { return ctx_; }
+private:
+    AVBufferRef* ctx_;
+};
+
+#ifdef HAVE_MFX
+// Returns the index of the frame's mfxFrameSurface1 within its QSV frames context, or -1 if the frame is not QSV / not found.
+static int hw_find_qsv_surface_index(AVFrame* hw_frame)
+{
+    if (hw_frame->format != AV_PIX_FMT_QSV)
+        return -1;
+    const mfxFrameSurface1* const target = (mfxFrameSurface1*)hw_frame->data[3]; // As defined by AV_PIX_FMT_QSV
+    const AVHWFramesContext* const pool_ctx = (AVHWFramesContext*)hw_frame->hw_frames_ctx->data;
+    const AVQSVFramesContext* const qsv_pool = (AVQSVFramesContext*)pool_ctx->hwctx;
+    int found = -1;
+    for (int idx = 0; idx < qsv_pool->nb_surfaces && found < 0; ++idx) { // stop at the first match
+        if (&qsv_pool->surfaces[idx] == target)
+            found = idx;
+    }
+    return found;
+}
+#endif
+
+#ifdef HAVE_VA
+// Returns the VADisplay of a VAAPI device context, or of the VAAPI child stored in a QSV context; NULL otherwise.
+static VADisplay hw_get_va_display(AVHWDeviceContext* hw_device_ctx)
+{
+    if (hw_device_ctx->type == AV_HWDEVICE_TYPE_QSV) { // we stored pointer to child context in 'user_opaque' field
+        AVBufferRef* ctx = (AVBufferRef*)hw_device_ctx->user_opaque;
+        hw_device_ctx = ctx ? (AVHWDeviceContext*)ctx->data : NULL; // guard: 'user_opaque' may hold no child context
+    }
+    if (hw_device_ctx && hw_device_ctx->type == AV_HWDEVICE_TYPE_VAAPI) {
+        return ((AVVAAPIDeviceContext*)hw_device_ctx->hwctx)->display;
+    }
+    return NULL;
+}
+#endif // HAVE_VA
+
+#ifdef HAVE_VA_INTEL
+static // Returns the VA surface id behind 'hw_frame' (VAAPI frame, or QSV frame with a VAAPI child frames context); VA_INVALID_SURFACE otherwise
+VASurfaceID hw_get_va_surface(AVFrame* hw_frame) {
+    if (AV_PIX_FMT_VAAPI == hw_frame->format) {
+        return (VASurfaceID)(size_t)hw_frame->data[3]; // As defined by AV_PIX_FMT_VAAPI
+    }
+#ifdef HAVE_MFX
+    else if (AV_PIX_FMT_QSV == hw_frame->format) {
+        int frame_idx = hw_find_qsv_surface_index(hw_frame);
+        if (frame_idx >= 0) { // frame index is same in parent (QSV) and child (VAAPI) frame context
+            AVHWFramesContext *frames_ctx = (AVHWFramesContext *) hw_frame->hw_frames_ctx->data;
+            AVHWFramesContext *child_ctx = (AVHWFramesContext *) frames_ctx->user_opaque; // child frames context, stored there by hw_create_frames()
+            if (child_ctx && AV_HWDEVICE_TYPE_VAAPI == child_ctx->device_ctx->type) {
+                AVVAAPIFramesContext *vaapi_ctx = (AVVAAPIFramesContext *) child_ctx->hwctx;
+                CV_Assert(frame_idx < vaapi_ctx->nb_surfaces);
+                return vaapi_ctx->surface_ids[frame_idx];
+            }
+        }
+    }
+#endif // HAVE_MFX
+    return VA_INVALID_SURFACE;
+}
+#endif // HAVE_VA_INTEL
+
+#ifdef HAVE_D3D11
+// Returns the D3D11VA device context of 'hw_device_ctx', or of the D3D11VA child stored in a QSV context; NULL otherwise.
+static AVD3D11VADeviceContext* hw_get_d3d11_device_ctx(AVHWDeviceContext* hw_device_ctx) {
+    if (AV_HWDEVICE_TYPE_QSV == hw_device_ctx->type) { // we stored pointer to child context in 'user_opaque' field
+        AVBufferRef* ctx = (AVBufferRef*)hw_device_ctx->user_opaque;
+        hw_device_ctx = ctx ? (AVHWDeviceContext*)ctx->data : NULL; // guard: 'user_opaque' may hold no child context
+    }
+    if (hw_device_ctx && AV_HWDEVICE_TYPE_D3D11VA == hw_device_ctx->type) {
+        return (AVD3D11VADeviceContext*)hw_device_ctx->hwctx;
+    }
+    return NULL;
+}
+
+static ID3D11Texture2D* hw_get_d3d11_texture(AVFrame* hw_frame, int* subresource) { // returns the D3D11 texture behind 'hw_frame' (NULL if none); '*subresource' receives the index within it and is left untouched for other pixel formats
+    ID3D11Texture2D* texture = NULL;
+    if (AV_PIX_FMT_D3D11 == hw_frame->format) {
+        texture = (ID3D11Texture2D*)hw_frame->data[0]; // As defined by AV_PIX_FMT_D3D11
+        *subresource = (int)(intptr_t)hw_frame->data[1]; // As defined by AV_PIX_FMT_D3D11 (index carried in a pointer-sized slot)
+    }
+#ifdef HAVE_MFX
+    else if (AV_PIX_FMT_QSV == hw_frame->format) {
+        AVHWFramesContext *frames_ctx = (AVHWFramesContext *) hw_frame->hw_frames_ctx->data;
+        AVHWFramesContext *child_ctx = (AVHWFramesContext *) frames_ctx->user_opaque; // child frames context, stored there by hw_create_frames()
+        if (child_ctx && AV_HWDEVICE_TYPE_D3D11VA == child_ctx->device_ctx->type) {
+            texture = ((AVD3D11VAFramesContext*)child_ctx->hwctx)->texture;
+        }
+        *subresource = hw_find_qsv_surface_index(hw_frame); // QSV surface index is used as the D3D11 subresource index
+        CV_Assert(*subresource >= 0);
+    }
+#endif
+    return texture;
+}
+
+// In D3D11 case we allocate additional texture as single texture (not texture array) because
+// OpenCL interop with D3D11 doesn't support/work with NV12 sub-texture of texture array.
+static ID3D11Texture2D* hw_get_d3d11_single_texture(AVFrame* hw_frame, AVD3D11VADeviceContext* d3d11_device_ctx, ID3D11Texture2D* texture) {
+    AVHWFramesContext* frames_ctx = (AVHWFramesContext*)hw_frame->hw_frames_ctx->data;
+    if (AV_HWDEVICE_TYPE_QSV == frames_ctx->device_ctx->type) {
+        frames_ctx = (AVHWFramesContext*)frames_ctx->user_opaque; // we stored pointer to child context in 'user_opaque' field
+    }
+    if (!frames_ctx || AV_HWDEVICE_TYPE_D3D11VA != frames_ctx->device_ctx->type) {
+        return NULL;
+    }
+    ID3D11Texture2D* singleTexture = (ID3D11Texture2D*)frames_ctx->user_opaque; // texture cached by an earlier call, if any
+    if (!singleTexture && d3d11_device_ctx && texture) {
+        D3D11_TEXTURE2D_DESC desc = {};
+        texture->GetDesc(&desc); // start from the description of the source texture
+        desc.ArraySize = 1; // single texture instead of a texture array
+        desc.BindFlags |= D3D11_BIND_SHADER_RESOURCE;
+        desc.MiscFlags |= D3D11_RESOURCE_MISC_SHARED;
+        if (SUCCEEDED(d3d11_device_ctx->device->CreateTexture2D(&desc, NULL, &singleTexture))) {
+            frames_ctx->user_opaque = singleTexture; // cache it in the D3D11 frames context for later calls
+        }
+    }
+    return singleTexture;
+}
+#endif // HAVE_D3D11
+
+static // Reports which media API (VAAPI or D3D11VA) the current OpenCL context shares a display/device with 'ctx'; AV_HWDEVICE_TYPE_NONE if none
+AVHWDeviceType hw_check_opencl_context(AVHWDeviceContext* ctx) {
+    ocl::OpenCLExecutionContext& ocl_context = ocl::OpenCLExecutionContext::getCurrentRef();
+    if (!ctx || ocl_context.empty())
+        return AV_HWDEVICE_TYPE_NONE;
+#ifdef HAVE_VA_INTEL
+    VADisplay vadisplay_ocl = ocl_context.getContext().getOpenCLContextProperty(CL_CONTEXT_VA_API_DISPLAY_INTEL);
+    VADisplay vadisplay_ctx = hw_get_va_display(ctx);
+    if (vadisplay_ocl && vadisplay_ocl == vadisplay_ctx) // same VADisplay on both sides
+        return AV_HWDEVICE_TYPE_VAAPI;
+#endif
+#ifdef HAVE_D3D11
+    ID3D11Device* d3d11device_ocl = (ID3D11Device*)ocl_context.getContext().getOpenCLContextProperty(CL_CONTEXT_D3D11_DEVICE_KHR);
+    AVD3D11VADeviceContext* d3d11_device_ctx = hw_get_d3d11_device_ctx(ctx);
+    if (d3d11_device_ctx && d3d11device_ocl && d3d11_device_ctx->device == d3d11device_ocl) // same ID3D11Device on both sides
+        return AV_HWDEVICE_TYPE_D3D11VA;
+#endif
+    return AV_HWDEVICE_TYPE_NONE;
+}
+
+static // Initializes an OpenCL context from the VA display / D3D11 device behind 'ctx' and, if they match afterwards, attaches 'ctx' to it
+void hw_init_opencl(AVBufferRef* ctx) {
+    if (!ctx)
+        return;
+    AVHWDeviceContext* hw_device_ctx = (AVHWDeviceContext*)ctx->data;
+    if (!hw_device_ctx)
+        return;
+#ifdef HAVE_VA_INTEL
+    VADisplay va_display = hw_get_va_display(hw_device_ctx);
+    if (va_display) {
+        va_intel::ocl::initializeContextFromVA(va_display);
+    }
+#endif
+#ifdef HAVE_D3D11
+    AVD3D11VADeviceContext* d3d11_device_ctx = hw_get_d3d11_device_ctx(hw_device_ctx);
+    if (d3d11_device_ctx) {
+        directx::ocl::initializeContextFromD3D11Device(d3d11_device_ctx->device);
+    }
+#endif
+    if (hw_check_opencl_context(hw_device_ctx) != AV_HWDEVICE_TYPE_NONE) { // only when the current OpenCL context is bound to this device
+        // Attach AVHWDeviceContext to OpenCL context
+        ocl::Context &ocl_context = ocl::OpenCLExecutionContext::getCurrent().getContext();
+        ocl_context.setUserContext(std::make_shared<OpenCL_FFMPEG_Context>(ctx)); // the wrapper takes its own reference to 'ctx'
+    }
 }
 
 static
-AVBufferRef* hw_create_frames(struct AVCodecContext* ctx, AVBufferRef *hw_device_ctx, int width, int height, AVPixelFormat hw_format)
-{
-    AVBufferRef *hw_frames_ref = nullptr;
+AVBufferRef* hw_create_context_from_opencl(ocl::OpenCLExecutionContext& ocl_context, AVHWDeviceType hw_type) { // returns a new reference to (or a context derived from) the FFmpeg device attached to 'ocl_context'; NULL if none
+    if (ocl_context.empty())
+        return NULL;
+    auto ocl_ffmpeg_context = ocl_context.getContext().getUserContext<OpenCL_FFMPEG_Context>();
+    AVBufferRef* ctx = ocl_ffmpeg_context ? ocl_ffmpeg_context->GetAVHWDevice() : NULL;
+    if (!ctx) // nothing attached, or the attached wrapper holds no device reference
+        return NULL;
+    if (hw_type != ((AVHWDeviceContext*)ctx->data)->type) {
+        ctx = hw_create_derived_context(hw_type, ctx); // attached device has another type: derive the requested one
+    }
+    else {
+        ctx = av_buffer_ref(ctx); // same type: hand out an additional reference
+    }
     if (ctx)
+        CV_LOG_INFO(NULL, "FFMPEG: Using " << av_hwdevice_get_type_name(hw_type) << " video acceleration context attached to OpenCL context");
+    return ctx;
+}
+
+#endif // HAVE_OPENCL
+
+static // Returns an FFmpeg device context of type 'hw_type': reused from the current OpenCL context if one is attached, otherwise newly created; NULL on failure
+AVBufferRef* hw_create_device(AVHWDeviceType hw_type, int hw_device, const std::string& device_subname, bool use_opencl) {
+    AVBufferRef* hw_device_ctx = NULL;
+    if (AV_HWDEVICE_TYPE_NONE == hw_type)
+        return NULL;
+
+#ifdef HAVE_OPENCL
+    // Check if OpenCL context has AVHWDeviceContext attached to it
+    ocl::OpenCLExecutionContext& ocl_context = ocl::OpenCLExecutionContext::getCurrentRef();
+    try {
+        hw_device_ctx = hw_create_context_from_opencl(ocl_context, hw_type);
+        if (hw_device_ctx) {
+            if (hw_device >= 0) // an explicit device index was requested but cannot be honoured here
+                CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: ignoring property HW_DEVICE as device context already created and attached to OpenCL context");
+            return hw_device_ctx;
+        }
+    }
+    catch (...) { // best effort: fall through to creating a new device context below
+        CV_LOG_INFO(NULL, "FFMPEG: Exception creating Video Acceleration context using current OpenCL context");
+    }
+#endif
+
+    // Create new media context. In QSV case, first create 'child' context.
+    std::vector<AVHWDeviceType> child_types = { hw_type };
+    if (hw_type == AV_HWDEVICE_TYPE_QSV) {
+#ifdef _WIN32
+        child_types = { AV_HWDEVICE_TYPE_D3D11VA, AV_HWDEVICE_TYPE_DXVA2 }; // tried in this order
+#else
+        child_types = { AV_HWDEVICE_TYPE_VAAPI };
+#endif
+    }
+    for (AVHWDeviceType child_type : child_types) {
+        char device[128] = "";
+        char* pdevice = NULL; // stays NULL (default device) unless a valid index was requested
+        if (hw_device >= 0 && hw_device < 100000) {
+            if (child_type == AV_HWDEVICE_TYPE_VAAPI) {
+                snprintf(device, sizeof(device), "/dev/dri/renderD%d", 128 + hw_device); // VAAPI: device given as a DRM render node path
+            }
+            else {
+                snprintf(device, sizeof(device), "%d", hw_device); // other types: device given as a plain index string
+            }
+            pdevice = device;
+        }
+        const char* hw_child_name = av_hwdevice_get_type_name(child_type);
+        const char* device_name = pdevice ? pdevice : "'default'";
+        int err = av_hwdevice_ctx_create(&hw_device_ctx, child_type, pdevice, NULL, 0);
+        if (hw_device_ctx && err >= 0)
+        {
+            if (!hw_check_device(hw_device_ctx, hw_type, device_subname)) {
+                av_buffer_unref(&hw_device_ctx);
+                continue; // device rejected by hw_check_device(): try the next child type
+            }
+            CV_LOG_INFO(NULL, "FFMPEG: Created video acceleration context (av_hwdevice_ctx_create) for " << hw_child_name << " on device " << device_name);
+#ifdef HAVE_OPENCL
+            // if OpenCL context not created yet or property HW_ACCELERATION_USE_OPENCL set, create OpenCL context with binding to video acceleration context
+            if (ocl::haveOpenCL()) {
+                if (ocl_context.empty() || use_opencl) {
+                    try {
+                        hw_init_opencl(hw_device_ctx);
+                        ocl_context = ocl::OpenCLExecutionContext::getCurrentRef(); // NOTE(review): 'ocl_context' already refers to getCurrentRef(), so this looks like a self-assignment - confirm
+                        if (!ocl_context.empty()) {
+                            CV_LOG_INFO(NULL, "FFMPEG: Created OpenCL context with " << hw_child_name <<
+                                " video acceleration on OpenCL device: " << ocl_context.getDevice().name());
+                        }
+                    } catch (...) { // OpenCL binding is optional: keep the device context and continue without it
+                        CV_LOG_INFO(NULL, "FFMPEG: Exception creating OpenCL context with " << hw_child_name << " video acceleration");
+                    }
+                }
+                else {
+                    CV_LOG_INFO(NULL, "FFMPEG: Can't bind " << hw_child_name << " video acceleration context to already created OpenCL context");
+                }
+            }
+#else
+            CV_UNUSED(use_opencl);
+#endif
+            if (hw_type != child_type) {
+                AVBufferRef* derived_ctx = hw_create_derived_context(hw_type, hw_device_ctx);
+                av_buffer_unref(&hw_device_ctx); // drop our reference; on success the derived context holds its own (see hw_create_derived_context)
+                return derived_ctx;
+            } else {
+                return hw_device_ctx;
+            }
+        }
+        else
+        {
+            const char* hw_name = hw_child_name;
+            CV_LOG_INFO(NULL, "FFMPEG: Failed to create " << hw_name << " video acceleration (av_hwdevice_ctx_create) on device " << device_name);
+        }
+    }
+    return NULL; // no child type produced a usable device context
+}
+
+static
+AVBufferRef* hw_create_frames(struct AVCodecContext* codec_ctx, AVBufferRef *hw_device_ctx, int width, int height, AVPixelFormat hw_format)
+{
+    AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)hw_device_ctx->data;
+    AVBufferRef* child_ctx = hw_device_ctx;
+    // In QSV case we first allocate child D3D11/VAAPI frames (except DXVA2 as no OpenCL interop), then derive to parent QSV frames
+    if (AV_HWDEVICE_TYPE_QSV == device_ctx->type) {
+        AVBufferRef *ctx = (AVBufferRef *) device_ctx->user_opaque; // child context stored during creation of derived context
+        if (ctx && AV_HWDEVICE_TYPE_DXVA2 != ((AVHWDeviceContext *) ctx->data)->type) {
+            child_ctx = ctx;
+        }
+    }
+    AVBufferRef *hw_frames_ref = nullptr;
+    if (codec_ctx)
     {
-        int res = avcodec_get_hw_frames_parameters(ctx, hw_device_ctx, hw_format, &hw_frames_ref);
+        int res = avcodec_get_hw_frames_parameters(codec_ctx, child_ctx, hw_format, &hw_frames_ref);
         if (res < 0)
         {
             CV_LOG_DEBUG(NULL, "FFMPEG: avcodec_get_hw_frames_parameters() call failed: " << res)
@@ -335,7 +622,7 @@ AVBufferRef* hw_create_frames(struct AVCodecContext* ctx, AVBufferRef *hw_device
     }
     if (!hw_frames_ref)
     {
-        hw_frames_ref = av_hwframe_ctx_alloc(hw_device_ctx);
+        hw_frames_ref = av_hwframe_ctx_alloc(child_ctx);
     }
     if (!hw_frames_ref)
     {
@@ -345,12 +632,41 @@ AVBufferRef* hw_create_frames(struct AVCodecContext* ctx, AVBufferRef *hw_device
     AVHWFramesContext *frames_ctx = (AVHWFramesContext *)(hw_frames_ref->data);
     frames_ctx->width = width;
     frames_ctx->height = height;
-    if (frames_ctx->format == AV_PIX_FMT_NONE)
-        frames_ctx->format = hw_format;
+    if (frames_ctx->format == AV_PIX_FMT_NONE) {
+        if (child_ctx == hw_device_ctx) {
+            frames_ctx->format = hw_format;
+        }
+        else {
+            AVHWFramesConstraints* constraints = av_hwdevice_get_hwframe_constraints(child_ctx, NULL);
+            if (constraints) {
+                frames_ctx->format = constraints->valid_hw_formats[0];
+                av_hwframe_constraints_free(&constraints);
+            }
+        }
+    }
     if (frames_ctx->sw_format == AV_PIX_FMT_NONE)
         frames_ctx->sw_format = HW_DEFAULT_SW_FORMAT;
     if (frames_ctx->initial_pool_size == 0)
         frames_ctx->initial_pool_size = HW_DEFAULT_POOL_SIZE;
+
+#ifdef HAVE_D3D11
+    if (frames_ctx->device_ctx && AV_HWDEVICE_TYPE_D3D11VA == frames_ctx->device_ctx->type) {
+        // BindFlags
+        AVD3D11VAFramesContext* frames_hwctx = (AVD3D11VAFramesContext*)frames_ctx->hwctx;
+        frames_hwctx->BindFlags |= D3D11_BIND_DECODER | D3D11_BIND_VIDEO_ENCODER;
+        // See function hw_get_d3d11_single_texture(), it allocates additional ID3D11Texture2D texture and
+        // attaches it as 'user_opaque' field. We have to set free() callback before av_hwframe_ctx_init() call.
+        struct D3D11SingleTexture {
+            static void free(struct AVHWFramesContext* ctx) {
+                ID3D11Texture2D* singleTexture = (ID3D11Texture2D*)ctx->user_opaque;
+                if (ctx->user_opaque)
+                    singleTexture->Release();
+            }
+        };
+        frames_ctx->free = D3D11SingleTexture::free;
+    }
+#endif
+
     int res = av_hwframe_ctx_init(hw_frames_ref);
     if (res < 0)
     {
@@ -358,7 +674,25 @@ AVBufferRef* hw_create_frames(struct AVCodecContext* ctx, AVBufferRef *hw_device
         av_buffer_unref(&hw_frames_ref);
         return NULL;
     }
-    return hw_frames_ref;
+
+    if (child_ctx != hw_device_ctx) {
+        AVBufferRef* derived_frame_ctx = NULL;
+        int flags = AV_HWFRAME_MAP_READ | AV_HWFRAME_MAP_WRITE;
+        res = av_hwframe_ctx_create_derived(&derived_frame_ctx, hw_format, hw_device_ctx, hw_frames_ref, flags);
+        av_buffer_unref(&hw_frames_ref);
+        if (res < 0)
+        {
+            CV_LOG_INFO(NULL, "FFMPEG: Failed to create derived HW frame context (av_hwframe_ctx_create_derived): " << res);
+            return NULL;
+        }
+        else {
+            ((AVHWFramesContext*)derived_frame_ctx->data)->user_opaque = frames_ctx;
+            return derived_frame_ctx;
+        }
+    }
+    else {
+        return hw_frames_ref;
+    }
 }
 
 static
@@ -455,6 +789,110 @@ AVPixelFormat hw_get_format_callback(struct AVCodecContext *ctx, const enum AVPi
     return fmt[0];
 }
 
+// GPU color conversion NV12->BGRA via OpenCL extensions
+static bool
+hw_copy_frame_to_umat(AVBufferRef* ctx, AVFrame* hw_frame, cv::OutputArray output) { // returns true only if the frame was converted into 'output'; false tells the caller nothing was written
+    CV_UNUSED(hw_frame); // parameters are unused in builds without OpenCL / VA / D3D11 support
+    CV_UNUSED(output);
+    if (!ctx)
+        return false;
+
+#ifdef HAVE_OPENCL
+    try {
+        // check that current OpenCL context initialized with binding to same VAAPI/D3D11 context
+        AVHWDeviceContext *hw_device_ctx = (AVHWDeviceContext *) ctx->data;
+        AVHWDeviceType child_type = hw_check_opencl_context(hw_device_ctx);
+        if (child_type == AV_HWDEVICE_TYPE_NONE)
+            return false;
+
+#ifdef HAVE_VA_INTEL
+        if (child_type == AV_HWDEVICE_TYPE_VAAPI) {
+            VADisplay va_display = hw_get_va_display(hw_device_ctx);
+            VASurfaceID va_surface = hw_get_va_surface(hw_frame);
+            if (va_display && va_surface != VA_INVALID_SURFACE) {
+                va_intel::convertFromVASurface(va_display, va_surface, {hw_frame->width, hw_frame->height}, output);
+                return true;
+            }
+        }
+#endif
+
+#ifdef HAVE_D3D11
+        if (child_type == AV_HWDEVICE_TYPE_D3D11VA) {
+            AVD3D11VADeviceContext* d3d11_device_ctx = hw_get_d3d11_device_ctx(hw_device_ctx);
+            int subresource = 0;
+            ID3D11Texture2D* texture = hw_get_d3d11_texture(hw_frame, &subresource);
+            ID3D11Texture2D* singleTexture = hw_get_d3d11_single_texture(hw_frame, d3d11_device_ctx, texture);
+            if (texture && singleTexture) {
+                // Copy D3D11 sub-texture to D3D11 single texture
+                d3d11_device_ctx->device_context->CopySubresourceRegion(singleTexture, 0, 0, 0, 0, texture, subresource, NULL);
+                // Copy D3D11 single texture to cv::UMat
+                directx::convertFromD3D11Texture2D(singleTexture, output);
+                return true;
+            }
+        }
+#endif
+    }
+    catch (...) // any failure in the interop path is reported as "not copied"
+    {
+        return false;
+    }
+#endif // HAVE_OPENCL
+
+    return false;
+}
+
+// GPU color conversion BGRA->NV12 via OpenCL extensions
+static bool
+hw_copy_umat_to_frame(AVBufferRef* ctx, cv::InputArray input, AVFrame* hw_frame) { // returns true only if 'input' was converted into 'hw_frame'; false tells the caller nothing was written
+    CV_UNUSED(input); // parameters are unused in builds without OpenCL / VA / D3D11 support
+    CV_UNUSED(hw_frame);
+    if (!ctx)
+        return false;
+
+#ifdef HAVE_OPENCL
+    try {
+        // check that current OpenCL context initialized with binding to same VAAPI/D3D11 context
+        AVHWDeviceContext *hw_device_ctx = (AVHWDeviceContext *) ctx->data;
+        AVHWDeviceType child_type = hw_check_opencl_context(hw_device_ctx);
+        if (child_type == AV_HWDEVICE_TYPE_NONE)
+            return false;
+
+#ifdef HAVE_VA_INTEL
+        if (child_type == AV_HWDEVICE_TYPE_VAAPI) {
+            VADisplay va_display = hw_get_va_display(hw_device_ctx);
+            VASurfaceID va_surface = hw_get_va_surface(hw_frame);
+            if (va_display != NULL && va_surface != VA_INVALID_SURFACE) {
+                va_intel::convertToVASurface(va_display, input, va_surface, {hw_frame->width, hw_frame->height});
+                return true;
+            }
+        }
+#endif
+
+#ifdef HAVE_D3D11
+        if (child_type == AV_HWDEVICE_TYPE_D3D11VA) {
+            AVD3D11VADeviceContext* d3d11_device_ctx = hw_get_d3d11_device_ctx(hw_device_ctx);
+            int subresource = 0;
+            ID3D11Texture2D* texture = hw_get_d3d11_texture(hw_frame, &subresource);
+            ID3D11Texture2D* singleTexture = hw_get_d3d11_single_texture(hw_frame, d3d11_device_ctx, texture);
+            if (texture && singleTexture) {
+                // Copy cv::UMat to D3D11 single texture
+                directx::convertToD3D11Texture2D(input, singleTexture);
+                // Copy D3D11 single texture to D3D11 sub-texture
+                d3d11_device_ctx->device_context->CopySubresourceRegion(texture, subresource, 0, 0, 0, singleTexture, 0, NULL);
+                return true;
+            }
+        }
+#endif
+    }
+    catch (...) // any failure in the interop path is reported as "not copied"
+    {
+        return false;
+    }
+#endif // HAVE_OPENCL
+
+    return false;
+}
+
 static
 VideoAccelerationType hw_type_to_va_type(AVHWDeviceType hw_type) {
     struct HWTypeFFMPEG {
diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp
index 84e4e722f7..1e73cb8fc8 100644
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -476,6 +476,7 @@ struct CvCapture_FFMPEG
     bool setProperty(int, double);
     bool grabFrame();
     bool retrieveFrame(int, unsigned char** data, int* step, int* width, int* height, int* cn);
+    bool retrieveHWFrame(cv::OutputArray output);
     void rotateFrame(cv::Mat &mat) const;
 
     void init();
@@ -537,6 +538,7 @@ struct CvCapture_FFMPEG
 #endif
     VideoAccelerationType va_type;
     int hw_device;
+    int use_opencl;
 };
 
 void CvCapture_FFMPEG::init()
@@ -574,6 +576,7 @@ void CvCapture_FFMPEG::init()
     bsfc = NULL;
     va_type = cv::VIDEO_ACCELERATION_NONE;  // TODO OpenCV 5.0: change to _ANY?
     hw_device = -1;
+    use_opencl = 0;
 }
 
 
@@ -922,6 +925,9 @@ bool CvCapture_FFMPEG::open(const char* _filename, const VideoCaptureParameters&
                 return false;
             }
         }
+        if (params.has(CAP_PROP_HW_ACCELERATION_USE_OPENCL)) {
+            use_opencl = params.get<int>(CAP_PROP_HW_ACCELERATION_USE_OPENCL);
+        }
         if (params.warnUnusedParameters())
         {
             CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: unsupported parameters in .open(), see logger INFO channel for details. Bailout");
@@ -1051,7 +1057,7 @@ bool CvCapture_FFMPEG::open(const char* _filename, const VideoCaptureParameters&
                     if (codec) {
                         if (hw_pix_fmt != AV_PIX_FMT_NONE)
                             enc->get_format = hw_get_format_callback; // set callback to select HW pixel format, not SW format
-                        enc->hw_device_ctx = hw_create_device(hw_type, hw_device, accel_iter.device_subname());
+                        enc->hw_device_ctx = hw_create_device(hw_type, hw_device, accel_iter.device_subname(), use_opencl != 0);
                         if (!enc->hw_device_ctx)
                         {
                             CV_LOG_DEBUG(NULL, "FFMPEG: ... can't create H/W device: '" << accel_iter.hw_type_device_string() << "'");
@@ -1476,6 +1482,22 @@ bool CvCapture_FFMPEG::retrieveFrame(int, unsigned char** data, int* step, int*
     return true;
 }
 
+// Copies the decoded HW frame into 'output'; false if there is no stream/HW frame, the copy fails, or HW codecs are compiled out.
+bool CvCapture_FFMPEG::retrieveHWFrame(cv::OutputArray output)
+{
+#if USE_AV_HW_CODECS
+    // check that the stream is open (video_st is dereferenced below) and that we have HW frame in GPU memory
+    if (!video_st || !picture || !picture->hw_frames_ctx)
+        return false;
+
+    // GPU color conversion NV12->BGRA, from GPU media buffer to GPU OpenCL buffer
+    return hw_copy_frame_to_umat(video_st->codec->hw_device_ctx, picture, output);
+#else
+    CV_UNUSED(output);
+    return false;
+#endif
+}
+
 double CvCapture_FFMPEG::getProperty( int property_id ) const
 {
     if( !video_st ) return 0;
@@ -1549,6 +1571,8 @@ double CvCapture_FFMPEG::getProperty( int property_id ) const
         return static_cast<double>(va_type);
     case CAP_PROP_HW_DEVICE:
         return static_cast<double>(hw_device);
+    case CAP_PROP_HW_ACCELERATION_USE_OPENCL:
+        return static_cast<double>(use_opencl);
 #endif  // USE_AV_HW_CODECS
     default:
         break;
@@ -1752,6 +1776,7 @@ struct CvVideoWriter_FFMPEG
                double fps, int width, int height, const VideoWriterParameters& params );
     void close();
     bool writeFrame( const unsigned char* data, int step, int width, int height, int cn, int origin );
+    bool writeHWFrame(cv::InputArray input);
     double getProperty(int propId) const;
 
     void init();
@@ -1774,6 +1799,7 @@ struct CvVideoWriter_FFMPEG
     struct SwsContext *img_convert_ctx;
     VideoAccelerationType va_type;
     int               hw_device;
+    int               use_opencl;
 };
 
 static const char * icvFFMPEGErrStr(int err)
@@ -1836,6 +1862,7 @@ void CvVideoWriter_FFMPEG::init()
     frame_idx = 0;
     va_type = VIDEO_ACCELERATION_NONE;
     hw_device = -1;
+    use_opencl = 0;
     ok = false;
 }
 
@@ -2210,6 +2237,41 @@ bool CvVideoWriter_FFMPEG::writeFrame( const unsigned char* data, int step, int
     return ret;
 }
 
+// Encodes 'input' through a frame taken from the HW frame pool; returns false if the frame could not be submitted that way.
+bool CvVideoWriter_FFMPEG::writeHWFrame(cv::InputArray input) {
+#if USE_AV_HW_CODECS
+    // No open stream (video_st is dereferenced below) or no HW frames context: the HW path is unavailable
+    if (!video_st || !video_st->codec->hw_frames_ctx)
+        return false;
+
+    // Get hardware frame from frame pool
+    AVFrame* hw_frame = av_frame_alloc();
+    if (!hw_frame)
+        return false;
+    if (av_hwframe_get_buffer(video_st->codec->hw_frames_ctx, hw_frame, 0) < 0) {
+        av_frame_free(&hw_frame);
+        return false;
+    }
+
+    // GPU to GPU copy
+    if (!hw_copy_umat_to_frame(video_st->codec->hw_device_ctx, input, hw_frame)) {
+        av_frame_free(&hw_frame);
+        return false;
+    }
+
+    // Encode. NOTE(review): the result of icv_av_write_frame_FFMPEG is not checked, so true is returned regardless - confirm intended
+    hw_frame->pts = frame_idx;
+    icv_av_write_frame_FFMPEG( oc, video_st, outbuf, outbuf_size, hw_frame, frame_idx);
+    frame_idx++;
+
+    av_frame_free(&hw_frame);
+    return true;
+#else
+    CV_UNUSED(input);
+    return false;
+#endif
+}
+
 double CvVideoWriter_FFMPEG::getProperty(int propId) const
 {
     CV_UNUSED(propId);
@@ -2222,6 +2284,10 @@ double CvVideoWriter_FFMPEG::getProperty(int propId) const
     {
         return static_cast<double>(hw_device);
     }
+    else if (propId == VIDEOWRITER_PROP_HW_ACCELERATION_USE_OPENCL)
+    {
+        return static_cast<double>(use_opencl);
+    }
 #endif
     return 0;
 }
@@ -2375,6 +2441,9 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
             return false;
         }
     }
+    if (params.has(VIDEOWRITER_PROP_HW_ACCELERATION_USE_OPENCL)) {
+        use_opencl = params.get<int>(VIDEOWRITER_PROP_HW_ACCELERATION_USE_OPENCL);
+    }
 
     if (params.warnUnusedParameters())
     {
@@ -2638,7 +2707,7 @@ bool CvVideoWriter_FFMPEG::open( const char * filename, int fourcc,
             if (!codec)
                 continue;
 
-            hw_device_ctx = hw_create_device(hw_type, hw_device, accel_iter.device_subname());
+            hw_device_ctx = hw_create_device(hw_type, hw_device, accel_iter.device_subname(), use_opencl != 0);
             if (!hw_device_ctx)
                 continue;
         }
diff --git a/modules/videoio/src/cap_mfx_common.hpp b/modules/videoio/src/cap_mfx_common.hpp
index dca96b6ef9..2830592163 100644
--- a/modules/videoio/src/cap_mfx_common.hpp
+++ b/modules/videoio/src/cap_mfx_common.hpp
@@ -12,12 +12,22 @@
 #include <fstream>
 #include <sstream>
 
-#include <mfxcommon.h>
-#include <mfxstructures.h>
-#include <mfxvideo++.h>
-#include <mfxvp8.h>
-#include <mfxjpeg.h>
-#include <mfxplugin++.h>
+#ifdef HAVE_ONEVPL
+#  include <vpl/mfxcommon.h>
+#  include <vpl/mfxstructures.h>
+#  include <vpl/mfxvideo++.h>
+#  include <vpl/mfxvp8.h>
+#  include <vpl/mfxjpeg.h>
+#else
+#  include <mfxcommon.h>
+#  include <mfxstructures.h>
+#  include <mfxvideo++.h>
+#  include <mfxvp8.h>
+#  include <mfxjpeg.h>
+#  ifdef HAVE_MFX_PLUGIN
+#    include <mfxplugin++.h>
+#  endif
+#endif
 
 //                 //
 //  Debug helpers  //
@@ -93,8 +103,6 @@ inline std::string mfxStatusToString(mfxStatus s) {
     case MFX_ERR_UNDEFINED_BEHAVIOR: return "MFX_ERR_UNDEFINED_BEHAVIOR";
     case MFX_ERR_DEVICE_FAILED: return "MFX_ERR_DEVICE_FAILED";
     case MFX_ERR_MORE_BITSTREAM: return "MFX_ERR_MORE_BITSTREAM";
-    case MFX_ERR_INCOMPATIBLE_AUDIO_PARAM: return "MFX_ERR_INCOMPATIBLE_AUDIO_PARAM";
-    case MFX_ERR_INVALID_AUDIO_PARAM: return "MFX_ERR_INVALID_AUDIO_PARAM";
     case MFX_ERR_GPU_HANG: return "MFX_ERR_GPU_HANG";
     case MFX_ERR_REALLOC_SURFACE: return "MFX_ERR_REALLOC_SURFACE";
     case MFX_WRN_IN_EXECUTION: return "MFX_WRN_IN_EXECUTION";
@@ -105,8 +113,7 @@ inline std::string mfxStatusToString(mfxStatus s) {
     case MFX_WRN_VALUE_NOT_CHANGED: return "MFX_WRN_VALUE_NOT_CHANGED";
     case MFX_WRN_OUT_OF_RANGE: return "MFX_WRN_OUT_OF_RANGE";
     case MFX_WRN_FILTER_SKIPPED: return "MFX_WRN_FILTER_SKIPPED";
-    case MFX_WRN_INCOMPATIBLE_AUDIO_PARAM: return "MFX_WRN_INCOMPATIBLE_AUDIO_PARAM";
-    default: return "<Invalid mfxStatus>";
+    default: return "<Invalid or unknown mfxStatus>";
     }
 }
 
@@ -174,33 +181,45 @@ class Plugin
 public:
     static Plugin * loadEncoderPlugin(MFXVideoSession &session, mfxU32 codecId)
     {
+#ifdef HAVE_MFX_PLUGIN
         static const mfxPluginUID hevc_enc_uid = { 0x6f, 0xad, 0xc7, 0x91, 0xa0, 0xc2, 0xeb, 0x47, 0x9a, 0xb6, 0xdc, 0xd5, 0xea, 0x9d, 0xa3, 0x47 };
         if (codecId == MFX_CODEC_HEVC)
             return new Plugin(session, hevc_enc_uid);
+#else
+        CV_UNUSED(session); CV_UNUSED(codecId);
+#endif
         return 0;
     }
     static Plugin * loadDecoderPlugin(MFXVideoSession &session, mfxU32 codecId)
     {
+#ifdef HAVE_MFX_PLUGIN
         static const mfxPluginUID hevc_dec_uid = { 0x33, 0xa6, 0x1c, 0x0b, 0x4c, 0x27, 0x45, 0x4c, 0xa8, 0xd8, 0x5d, 0xde, 0x75, 0x7c, 0x6f, 0x8e };
         if (codecId == MFX_CODEC_HEVC)
             return new Plugin(session, hevc_dec_uid);
+#else
+        CV_UNUSED(session); CV_UNUSED(codecId);
+#endif
         return 0;
     }
     ~Plugin()
     {
+#ifdef HAVE_MFX_PLUGIN
         if (isGood())
             MFXVideoUSER_UnLoad(session, &uid);
+#endif
     }
     bool isGood() const { return res >= MFX_ERR_NONE; }
 private:
-    MFXVideoSession &session;
-    mfxPluginUID uid;
     mfxStatus res;
 private:
+#ifdef HAVE_MFX_PLUGIN
+    MFXVideoSession &session;
+    mfxPluginUID uid;
     Plugin(MFXVideoSession &_session, mfxPluginUID _uid) : session(_session), uid(_uid)
     {
         res = MFXVideoUSER_Load(session, &uid, 1);
     }
+#endif
     Plugin(const Plugin &);
     Plugin &operator=(const Plugin &);
 };
diff --git a/modules/videoio/src/videoio_registry.cpp b/modules/videoio/src/videoio_registry.cpp
index 59d96d162c..61e3ac0724 100644
--- a/modules/videoio/src/videoio_registry.cpp
+++ b/modules/videoio/src/videoio_registry.cpp
@@ -84,7 +84,7 @@ static const struct VideoBackendInfo builtin_backends[] =
 
 #ifdef HAVE_MSMF
     DECLARE_STATIC_BACKEND(CAP_MSMF, "MSMF", MODE_CAPTURE_ALL | MODE_WRITER, cvCreateCapture_MSMF, cvCreateCapture_MSMF, cvCreateVideoWriter_MSMF),
-#elif defined(ENABLE_PLUGINS)
+#elif defined(ENABLE_PLUGINS) && defined(_WIN32)
     DECLARE_DYNAMIC_BACKEND(CAP_MSMF, "MSMF", MODE_CAPTURE_ALL | MODE_WRITER),
 #endif
 
@@ -403,6 +403,81 @@ bool hasBackend(VideoCaptureAPIs api)
     return false;
 }
 
+// Returns backendFactory->isBuiltIn() of the enabled backend whose id equals 'api'; false if no enabled backend matches
+bool isBackendBuiltIn(VideoCaptureAPIs api)
+{
+    const std::vector<VideoBackendInfo> enabled = VideoBackendRegistry::getInstance().getEnabledBackends();
+    for (const VideoBackendInfo& candidate : enabled)
+    {
+        if (candidate.id != api)
+            continue;
+        // a matching entry without a factory is treated as an internal error
+        CV_Assert(!candidate.backendFactory.empty());
+        return candidate.backendFactory->isBuiltIn();
+    }
+    return false;
+}
+
+/** @brief Returns description and ABI/API version of videoio plugin's camera interface */
+std::string getCameraBackendPluginVersion(VideoCaptureAPIs api,
+    CV_OUT int& version_ABI,
+    CV_OUT int& version_API
+)
+{
+    const std::vector<VideoBackendInfo> candidates = VideoBackendRegistry::getInstance().getAvailableBackends_CaptureByIndex();
+    for (const VideoBackendInfo& entry : candidates)
+    {
+        if (entry.id != api)
+            continue;
+        CV_Assert(!entry.backendFactory.empty());
+        // only plugin backends are accepted here; a built-in backend trips the assertion
+        CV_Assert(!entry.backendFactory->isBuiltIn());
+        return getCapturePluginVersion(entry.backendFactory, version_ABI, version_API);
+    }
+    CV_Error(Error::StsError, "Unknown or wrong backend ID");
+}
+
+/** @brief Returns description and ABI/API version of videoio plugin's stream capture interface */
+std::string getStreamBackendPluginVersion(VideoCaptureAPIs api,
+    CV_OUT int& version_ABI,
+    CV_OUT int& version_API
+)
+{
+    const std::vector<VideoBackendInfo> candidates = VideoBackendRegistry::getInstance().getAvailableBackends_CaptureByFilename();
+    for (const VideoBackendInfo& entry : candidates)
+    {
+        if (entry.id != api)
+            continue;
+        CV_Assert(!entry.backendFactory.empty());
+        // only plugin backends are accepted here; a built-in backend trips the assertion
+        CV_Assert(!entry.backendFactory->isBuiltIn());
+        return getCapturePluginVersion(entry.backendFactory, version_ABI, version_API);
+    }
+    CV_Error(Error::StsError, "Unknown or wrong backend ID");
+}
+
+
+/** @brief Returns description and ABI/API version of videoio plugin's writer interface */
+std::string getWriterBackendPluginVersion(VideoCaptureAPIs api,
+    CV_OUT int& version_ABI,
+    CV_OUT int& version_API
+)
+{
+    const std::vector<VideoBackendInfo> candidates = VideoBackendRegistry::getInstance().getAvailableBackends_Writer();
+    for (const VideoBackendInfo& entry : candidates)
+    {
+        if (entry.id != api)
+            continue;
+        CV_Assert(!entry.backendFactory.empty());
+        // only plugin backends are accepted here; a built-in backend trips the assertion
+        CV_Assert(!entry.backendFactory->isBuiltIn());
+        return getWriterPluginVersion(entry.backendFactory, version_ABI, version_API);
+    }
+    // none of the available writer backends has the requested id
+    CV_Error(Error::StsError, "Unknown or wrong backend ID");
+}
+
+
 } // namespace registry
 
 } // namespace
diff --git a/modules/videoio/test/test_plugins.cpp b/modules/videoio/test/test_plugins.cpp
new file mode 100644
index 0000000000..3bae600be1
--- /dev/null
+++ b/modules/videoio/test/test_plugins.cpp
@@ -0,0 +1,105 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+
+namespace opencv_test { namespace {
+
+enum VideoBackendMode  // selects which plugin-version query dumpBackendInfo() issues
+{
+    MODE_CAMERA,  // videoio_registry::getCameraBackendPluginVersion()
+    MODE_STREAM,  // videoio_registry::getStreamBackendPluginVersion()
+    MODE_WRITER,  // videoio_registry::getWriterBackendPluginVersion()
+};
+
+// Prints one summary line for 'backend': BUILTIN, PLUGIN (description + ABI/API versions) or an error marker.
+static void dumpBackendInfo(VideoCaptureAPIs backend, enum VideoBackendMode mode)
+{
+    // a failed name lookup is recorded as a test failure, then the dump goes on with an empty name
+    std::string name;
+    try
+    {
+        name = videoio_registry::getBackendName(backend);
+    }
+    catch (const std::exception& e)
+    {
+        ADD_FAILURE() << "Can't query name of backend=" << backend << ": " << e.what();
+    }
+    catch (...)
+    {
+        ADD_FAILURE() << "Can't query name of backend=" << backend << ": unknown C++ exception";
+    }
+    bool builtin = true;
+    try
+    {
+        builtin = videoio_registry::isBackendBuiltIn(backend);
+    }
+    catch (const std::exception& e)
+    {
+        ADD_FAILURE() << "Failed isBackendBuiltIn(backend=" << backend << "): " << e.what();
+        cout << name << " - UNKNOWN TYPE" << endl;
+        return;
+    }
+    if (builtin)
+    {
+        cout << name << " - BUILTIN" << endl;
+        return;
+    }
+
+    // plugin backend: issue the registry query that matches the requested mode
+    int version_ABI = 0, version_API = 0;
+    std::string description = "NO_DESCRIPTION";
+    try
+    {
+        switch (mode)
+        {
+        case MODE_CAMERA: description = videoio_registry::getCameraBackendPluginVersion(backend, version_ABI, version_API); break;
+        case MODE_STREAM: description = videoio_registry::getStreamBackendPluginVersion(backend, version_ABI, version_API); break;
+        case MODE_WRITER: description = videoio_registry::getWriterBackendPluginVersion(backend, version_ABI, version_API); break;
+        default: CV_Error(Error::StsInternal, "");
+        }
+        cout << name << " - PLUGIN (" << description << ") ABI=" << version_ABI << " API=" << version_API << endl;
+        return;
+    }
+    catch (const cv::Exception& e)
+    {
+        if (e.code == Error::StsNotImplemented)
+        {
+            cout << name << " - PLUGIN - NOT LOADED" << endl;
+            return;
+        }
+        ADD_FAILURE() << "Failed getBackendPluginDescription(backend=" << backend << "): " << e.what();
+    }
+    catch (const std::exception& e)
+    {
+        ADD_FAILURE() << "Failed getBackendPluginDescription(backend=" << backend << "): " << e.what();
+    }
+    cout << name << " - PLUGIN (ERROR on quering information)" << endl;
+}
+
+// Enumerates the camera, stream and writer backends reported by the registry and prints
+// one summary line per backend. Failures are raised inside dumpBackendInfo() only;
+// this body itself contains no assertions.
+TEST(VideoIO_Plugins, query)
+{
+    // camera backends
+    const std::vector<cv::VideoCaptureAPIs> cameras = cv::videoio_registry::getCameraBackends();
+    cout << "== Camera APIs (" << cameras.size() << "):" << endl;
+    for (size_t i = 0; i < cameras.size(); ++i)
+        dumpBackendInfo(cameras[i], MODE_CAMERA);
+
+    // stream capture backends
+    const std::vector<cv::VideoCaptureAPIs> streams = cv::videoio_registry::getStreamBackends();
+    cout << "== Stream capture APIs (" << streams.size() << "):" << endl;
+    for (size_t i = 0; i < streams.size(); ++i)
+        dumpBackendInfo(streams[i], MODE_STREAM);
+
+    // writer backends
+    const std::vector<cv::VideoCaptureAPIs> writers = cv::videoio_registry::getWriterBackends();
+    cout << "== Writer APIs (" << writers.size() << "):" << endl;
+    for (size_t i = 0; i < writers.size(); ++i)
+        dumpBackendInfo(writers[i], MODE_WRITER);
+}
+
+}}
diff --git a/modules/videoio/test/test_video_io.cpp b/modules/videoio/test/test_video_io.cpp
index f9a0be5345..8bc98c73fb 100644
--- a/modules/videoio/test/test_video_io.cpp
+++ b/modules/videoio/test/test_video_io.cpp
@@ -679,7 +679,6 @@ TEST_P(videocapture_acceleration, read)
     VideoCaptureAPIs backend = get<1>(param);
     VideoAccelerationType va_type = get<2>(param);
     bool use_umat = get<3>(param);
-    int device_idx = -1;
     const int frameNum = 15;
 
     std::string filepath = cvtest::findDataFile("video/" + filename);
@@ -699,13 +698,24 @@ TEST_P(videocapture_acceleration, read)
 
 
     // HW reader
-    VideoCapture hw_reader(filepath, backend, {
-            CAP_PROP_HW_ACCELERATION, static_cast<int>(va_type),
-            CAP_PROP_HW_DEVICE, device_idx
-    });
+    std::vector<int> params = { CAP_PROP_HW_ACCELERATION, static_cast<int>(va_type) };
+    if (use_umat)
+    {
+        if (backend != CAP_FFMPEG)
+            throw SkipTestException(cv::String("UMat/OpenCL mapping is not supported by current backend: ") + backend_name);
+        if (!cv::videoio_registry::isBackendBuiltIn(backend))
+            throw SkipTestException(cv::String("UMat/OpenCL mapping is not supported through plugins yet: ") + backend_name);
+        params.push_back(CAP_PROP_HW_ACCELERATION_USE_OPENCL);
+        params.push_back(1);
+    }
+    VideoCapture hw_reader(filepath, backend, params);
     if (!hw_reader.isOpened())
     {
-        if (va_type == VIDEO_ACCELERATION_ANY || va_type == VIDEO_ACCELERATION_NONE)
+        if (use_umat)
+        {
+            throw SkipTestException(backend_name + " VideoCapture on " + filename + " not supported with HW acceleration + OpenCL/Umat mapping, skipping");
+        }
+        else if (va_type == VIDEO_ACCELERATION_ANY || va_type == VIDEO_ACCELERATION_NONE)
         {
             // ANY HW acceleration should have fallback to SW codecs
             VideoCapture sw_reader(filepath, backend, {
@@ -737,13 +747,25 @@ TEST_P(videocapture_acceleration, read)
         if (use_umat)
         {
             UMat umat;
-            EXPECT_TRUE(hw_reader.read(umat));
+            bool read_umat_result = hw_reader.read(umat);
+            if (!read_umat_result && i == 0)
+            {
+                if (filename == "sample_322x242_15frames.yuv420p.libvpx-vp9.mp4")
+                    throw SkipTestException("Unable to read the first frame with VP9 codec (media stack misconfiguration / bug)");
+            }
+            EXPECT_TRUE(read_umat_result);
             ASSERT_FALSE(umat.empty());
             umat.copyTo(frame);
         }
         else
         {
-            EXPECT_TRUE(hw_reader.read(frame));
+            bool read_result = hw_reader.read(frame);
+            if (!read_result && i == 0)
+            {
+                if (filename == "sample_322x242_15frames.yuv420p.libvpx-vp9.mp4")
+                    throw SkipTestException("Unable to read the first frame with VP9 codec (media stack misconfiguration / bug)");
+            }
+            EXPECT_TRUE(read_result);
         }
         ASSERT_FALSE(frame.empty());
 
@@ -799,7 +821,7 @@ static const VideoAccelerationType hw_types[] = {
 
 static bool hw_use_umat[] = {
         false,
-        //true
+        true
 };
 
 INSTANTIATE_TEST_CASE_P(videoio, videocapture_acceleration, testing::Combine(
@@ -823,7 +845,6 @@ TEST_P(videowriter_acceleration, write)
     std::string extension = get<0>(param).ext;
     double psnr_threshold = get<0>(param).PSNR;
     VideoAccelerationType va_type = get<1>(param);
-    int device_idx = -1;
     bool use_umat = get<2>(param);
     std::string backend_name = cv::videoio_registry::getBackendName(backend);
     if (!videoio_registry::hasBackend(backend))
@@ -838,20 +859,31 @@ TEST_P(videowriter_acceleration, write)
     // Write video
     VideoAccelerationType actual_va;
     {
+        std::vector<int> params = { VIDEOWRITER_PROP_HW_ACCELERATION, static_cast<int>(va_type) };
+        if (use_umat) {
+            if (backend != CAP_FFMPEG)
+                throw SkipTestException(cv::String("UMat/OpenCL mapping is not supported by current backend: ") + backend_name);
+            if (!cv::videoio_registry::isBackendBuiltIn(backend))
+                throw SkipTestException(cv::String("UMat/OpenCL mapping is not supported through plugins yet: ") + backend_name);
+            params.push_back(VIDEOWRITER_PROP_HW_ACCELERATION_USE_OPENCL);
+            params.push_back(1);
+        }
         VideoWriter hw_writer(
             filename,
             backend,
             VideoWriter::fourcc(codecid[0], codecid[1], codecid[2], codecid[3]),
             fps,
             sz,
-            {
-                VIDEOWRITER_PROP_HW_ACCELERATION, static_cast<int>(va_type),
-                VIDEOWRITER_PROP_HW_DEVICE, device_idx
-            }
+            params
         );
 
-        if (!hw_writer.isOpened()) {
-            if (va_type == VIDEO_ACCELERATION_ANY || va_type == VIDEO_ACCELERATION_NONE)
+        if (!hw_writer.isOpened())
+        {
+            if (use_umat)
+            {
+                throw SkipTestException(backend_name + " VideoWriter on " + filename + " not supported with HW acceleration + OpenCL/Umat mapping, skipping");
+            }
+            else if (va_type == VIDEO_ACCELERATION_ANY || va_type == VIDEO_ACCELERATION_NONE)
             {
                 // ANY HW acceleration should have fallback to SW codecs
                 {
diff --git a/modules/world/CMakeLists.txt b/modules/world/CMakeLists.txt
index 20edbd5733..2f4b1a4eb1 100644
--- a/modules/world/CMakeLists.txt
+++ b/modules/world/CMakeLists.txt
@@ -59,6 +59,10 @@ ocv_module_include_directories()
 #message(STATUS "${OPENCV_MODULE_${the_module}_SOURCES}")
 ocv_create_module(${link_deps})
 
+if(";${OPENCV_MODULES_BUILD};" MATCHES ";opencv_viz;" AND OPENCV_MODULE_opencv_viz_IS_PART_OF_WORLD AND VTK_VERSION VERSION_GREATER_EQUAL "8.90.0")
+  vtk_module_autoinit(TARGETS opencv_world MODULES ${VTK_LIBRARIES})
+endif()
+
 ocv_target_compile_definitions(${the_module} PRIVATE OPENCV_MODULE_IS_PART_OF_WORLD=1)
 
 if(BUILD_opencv_imgcodecs AND OPENCV_MODULE_opencv_imgcodecs_IS_PART_OF_WORLD)
diff --git a/platforms/ios/build_framework.py b/platforms/ios/build_framework.py
index b0178e036c..c1dd39e763 100755
--- a/platforms/ios/build_framework.py
+++ b/platforms/ios/build_framework.py
@@ -32,7 +32,7 @@ Adding --dynamic parameter will build {framework_name}.framework as App Store dy
 """
 
 from __future__ import print_function, unicode_literals
-import glob, os, os.path, shutil, string, sys, argparse, traceback, multiprocessing
+import glob, os, os.path, shutil, string, sys, argparse, traceback, multiprocessing, codecs, io
 from subprocess import check_call, check_output, CalledProcessError
 from distutils.dir_util import copy_tree
 
@@ -42,7 +42,7 @@ from cv_build_utils import execute, print_error, get_xcode_major, get_xcode_sett
 IPHONEOS_DEPLOYMENT_TARGET='9.0'  # default, can be changed via command line options or environment variable
 
 class Builder:
-    def __init__(self, opencv, contrib, dynamic, bitcodedisabled, exclude, disable, enablenonfree, targets, debug, debug_info, framework_name, run_tests, build_docs):
+    def __init__(self, opencv, contrib, dynamic, bitcodedisabled, exclude, disable, enablenonfree, targets, debug, debug_info, framework_name, run_tests, build_docs, swiftdisabled):
         self.opencv = os.path.abspath(opencv)
         self.contrib = None
         if contrib:
@@ -63,6 +63,7 @@ class Builder:
         self.framework_name = framework_name
         self.run_tests = run_tests
         self.build_docs = build_docs
+        self.swiftdisabled = swiftdisabled
 
     def checkCMakeVersion(self):
         if get_xcode_version() >= (12, 2):
@@ -153,6 +154,22 @@ class Builder:
                 print("To build docs call:")
                 print(sys.argv[0].replace("build_framework", "build_docs") + " " + dirs[0] + "/modules/objc/framework_build")
             self.copy_samples(outdir)
+            if self.swiftdisabled:
+                swift_sources_dir = os.path.join(outdir, "SwiftSources")
+                if not os.path.exists(swift_sources_dir):
+                    os.makedirs(swift_sources_dir)
+                for root, dirs, files in os.walk(dirs[0]):
+                    for file in files:
+                        if file.endswith(".swift") and file.find("Test") == -1:
+                            with io.open(os.path.join(root, file), encoding="utf-8", errors="ignore") as file_in:
+                                body = file_in.read()
+                            if body.find("import Foundation") != -1:
+                                insert_pos = body.find("import Foundation") + len("import Foundation") + 1
+                                body = body[:insert_pos] + "import " + self.framework_name + "\n" + body[insert_pos:]
+                            else:
+                                body = "import " + self.framework_name + "\n\n" + body
+                            with codecs.open(os.path.join(swift_sources_dir, file), "w", "utf-8") as file_out:
+                                file_out.write(body)
 
     def build(self, outdir):
         try:
@@ -297,8 +314,8 @@ class Builder:
         execute(["cmake", "-DBUILD_TYPE=%s" % self.getConfiguration(), "-P", "cmake_install.cmake"], cwd = builddir)
         if self.build_objc_wrapper:
             cmakecmd = self.makeCMakeCmd(arch, target, builddir + "/modules/objc_bindings_generator/{}/gen".format(self.getObjcTarget(target)), cmakeargs)
-            # cmakecmd.append("-DCMAKE_Swift_FLAGS=" + "-target x86_64-apple-ios13.0-macabi")
-            # cmakecmd.append("-DCMAKE_EXE_LINKER_FLAGS=" + "-target x86_64-apple-ios13.0-macabi")
+            if self.swiftdisabled:
+                cmakecmd.append("-DSWIFT_DISABLED=1")
             cmakecmd.append("-DBUILD_ROOT=%s" % builddir)
             cmakecmd.append("-DCMAKE_INSTALL_NAME_TOOL=install_name_tool")
             cmakecmd.append("--no-warn-unused-cli")
@@ -509,6 +526,7 @@ if __name__ == "__main__":
     parser.add_argument('--legacy_build', default=False, dest='legacy_build', action='store_true', help='Build legacy opencv2 framework (default: False, equivalent to "--framework_name=opencv2 --without=objc")')
     parser.add_argument('--run_tests', default=False, dest='run_tests', action='store_true', help='Run tests')
     parser.add_argument('--build_docs', default=False, dest='build_docs', action='store_true', help='Build docs')
+    parser.add_argument('--disable-swift', default=False, dest='swiftdisabled', action='store_true', help='Disable building of Swift extensions')
 
     args, unknown_args = parser.parse_known_args()
     if unknown_args:
@@ -562,6 +580,6 @@ if __name__ == "__main__":
         if iphonesimulator_archs:
             targets.append((iphonesimulator_archs, "iPhoneSimulator"))
 
-    b = iOSBuilder(args.opencv, args.contrib, args.dynamic, args.bitcodedisabled, args.without, args.disable, args.enablenonfree, targets, args.debug, args.debug_info, args.framework_name, args.run_tests, args.build_docs)
+    b = iOSBuilder(args.opencv, args.contrib, args.dynamic, args.bitcodedisabled, args.without, args.disable, args.enablenonfree, targets, args.debug, args.debug_info, args.framework_name, args.run_tests, args.build_docs, args.swiftdisabled)
 
     b.build(args.out)
diff --git a/platforms/linux/riscv64-071-gcc.toolchain.cmake b/platforms/linux/riscv64-071-gcc.toolchain.cmake
new file mode 100644
index 0000000000..be2c7dcda9
--- /dev/null
+++ b/platforms/linux/riscv64-071-gcc.toolchain.cmake
@@ -0,0 +1,9 @@
+set(CMAKE_SYSTEM_NAME "Linux")  # cross-compilation toolchain: target system is Linux, compilers are riscv64-unknown-linux-gnu-*
+set(CMAKE_C_COMPILER  riscv64-unknown-linux-gnu-gcc)
+set(CMAKE_CXX_COMPILER riscv64-unknown-linux-gnu-g++)
+# Declare the flag variables as (empty) cache entries first ...
+set(CMAKE_CXX_FLAGS ""    CACHE STRING "")
+set(CMAKE_C_FLAGS ""    CACHE STRING "")
+# ... then set them as normal variables: static link, rv64gcvxthead arch, lp64v ABI (presumably RVV 0.7.1, per the "071" define - TODO confirm)
+set(CMAKE_CXX_FLAGS "-static -march=rv64gcvxthead -mabi=lp64v -pthread -D__riscv_vector_071")
+set(CMAKE_C_FLAGS "-static -march=rv64gcvxthead -mabi=lp64v -pthread -D__riscv_vector_071")
diff --git a/platforms/osx/build_framework.py b/platforms/osx/build_framework.py
index 480f3e3957..2dd5015ee5 100755
--- a/platforms/osx/build_framework.py
+++ b/platforms/osx/build_framework.py
@@ -77,6 +77,7 @@ if __name__ == "__main__":
     parser.add_argument('--legacy_build', default=False, dest='legacy_build', action='store_true', help='Build legacy framework (default: False, equivalent to "--framework_name=opencv2 --without=objc")')
     parser.add_argument('--run_tests', default=False, dest='run_tests', action='store_true', help='Run tests')
     parser.add_argument('--build_docs', default=False, dest='build_docs', action='store_true', help='Build docs')
+    parser.add_argument('--disable-swift', default=False, dest='swiftdisabled', action='store_true', help='Disable building of Swift extensions')
 
     args, unknown_args = parser.parse_known_args()
     if unknown_args:
@@ -127,5 +128,5 @@ if __name__ == "__main__":
     if catalyst_archs:
         targets.append((catalyst_archs, "Catalyst")),
 
-    b = OSXBuilder(args.opencv, args.contrib, args.dynamic, True, args.without, args.disable, args.enablenonfree, targets, args.debug, args.debug_info, args.framework_name, args.run_tests, args.build_docs)
+    b = OSXBuilder(args.opencv, args.contrib, args.dynamic, True, args.without, args.disable, args.enablenonfree, targets, args.debug, args.debug_info, args.framework_name, args.run_tests, args.build_docs, args.swiftdisabled)
     b.build(args.out)
diff --git a/samples/cpp/calibration.cpp b/samples/cpp/calibration.cpp
index 8c8be66929..91f10ae749 100644
--- a/samples/cpp/calibration.cpp
+++ b/samples/cpp/calibration.cpp
@@ -539,9 +539,9 @@ int main( int argc, char** argv )
         if( mode == CAPTURING )
         {
             if(undistortImage)
-                msg = format( "%d/%d Undist", (int)imagePoints.size(), nframes );
+                msg = cv::format( "%d/%d Undist", (int)imagePoints.size(), nframes );
             else
-                msg = format( "%d/%d", (int)imagePoints.size(), nframes );
+                msg = cv::format( "%d/%d", (int)imagePoints.size(), nframes );
         }
 
         putText( view, msg, textOrigin, 1, 1,
diff --git a/samples/cpp/digits_svm.cpp b/samples/cpp/digits_svm.cpp
index e401ab2b89..c55b320da5 100644
--- a/samples/cpp/digits_svm.cpp
+++ b/samples/cpp/digits_svm.cpp
@@ -137,7 +137,7 @@ static void evaluate_model(const vector<float>& predictions, const vector<Mat>&
 
     err /= predictions.size();
 
-    cout << format("error: %.2f %%", err * 100) << endl;
+    cout << cv::format("error: %.2f %%", err * 100) << endl;
 
     int confusion[10][10] = {};
 
@@ -151,7 +151,7 @@ static void evaluate_model(const vector<float>& predictions, const vector<Mat>&
     {
         for (int j = 0; j < 10; j++)
         {
-            cout << format("%2d ", confusion[i][j]);
+            cout << cv::format("%2d ", confusion[i][j]);
         }
         cout << endl;
     }
diff --git a/samples/cpp/select3dobj.cpp b/samples/cpp/select3dobj.cpp
index c9128ec7fe..582c7f2789 100644
--- a/samples/cpp/select3dobj.cpp
+++ b/samples/cpp/select3dobj.cpp
@@ -491,7 +491,7 @@ int main(int argc, char** argv)
     setMouseCallback("View", onMouse, 0);
     bool boardFound = false;
 
-    string indexFilename = format("%s_index.yml", outprefix.c_str());
+    string indexFilename = cv::format("%s_index.yml", outprefix.c_str());
 
     vector<string> capturedImgList;
     vector<Rect> roiList;
diff --git a/samples/cpp/squares.cpp b/samples/cpp/squares.cpp
index 042a716f90..2ea824decd 100644
--- a/samples/cpp/squares.cpp
+++ b/samples/cpp/squares.cpp
@@ -123,14 +123,14 @@ static void findSquares( const Mat& image, vector<vector<Point> >& squares )
 
 int main(int argc, char** argv)
 {
-    static const char* names[] = { "pic1.png", "pic2.png", "pic3.png",
+    const char* names[] = { "pic1.png", "pic2.png", "pic3.png",
         "pic4.png", "pic5.png", "pic6.png", 0 };
     help(argv[0]);
 
     if( argc > 1)
     {
      names[0] =  argv[1];
-     names[1] =  "0";
+     names[1] =  0;
     }
 
     for( int i = 0; names[i] != 0; i++ )
diff --git a/samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp b/samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp
index c9bf60bdec..95c27bd5f0 100644
--- a/samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp
+++ b/samples/cpp/tutorial_code/calib3d/camera_calibration/camera_calibration.cpp
@@ -40,7 +40,7 @@ public:
                   << "Write_gridPoints" << writeGrid
                   << "Write_outputFileName"  << outputFileName
 
-                  << "Show_UndistortedImage" << showUndistorsed
+                  << "Show_UndistortedImage" << showUndistorted
 
                   << "Input_FlipAroundHorizontalAxis" << flipVertical
                   << "Input_Delay" << delay
@@ -63,7 +63,7 @@ public:
         node["Calibrate_FixPrincipalPointAtTheCenter"] >> calibFixPrincipalPoint;
         node["Calibrate_UseFisheyeModel"] >> useFisheye;
         node["Input_FlipAroundHorizontalAxis"] >> flipVertical;
-        node["Show_UndistortedImage"] >> showUndistorsed;
+        node["Show_UndistortedImage"] >> showUndistorted;
         node["Input"] >> input;
         node["Input_Delay"] >> delay;
         node["Fix_K1"] >> fixK1;
@@ -211,7 +211,7 @@ public:
     bool calibFixPrincipalPoint; // Fix the principal point at the center
     bool flipVertical;           // Flip the captured images around the horizontal axis
     string outputFileName;       // The name of the file where to write
-    bool showUndistorsed;        // Show undistorted images after calibration
+    bool showUndistorted;        // Show undistorted images after calibration
     string input;                // The input ->
     bool useFisheye;             // use fisheye camera model for calibration
     bool fixK1;                  // fix K1 distortion coefficient
@@ -402,10 +402,10 @@ int main(int argc, char* argv[])
 
         if( mode == CAPTURING )
         {
-            if(s.showUndistorsed)
-                msg = format( "%d/%d Undist", (int)imagePoints.size(), s.nrFrames );
+            if(s.showUndistorted)
+                msg = cv::format( "%d/%d Undist", (int)imagePoints.size(), s.nrFrames );
             else
-                msg = format( "%d/%d", (int)imagePoints.size(), s.nrFrames );
+                msg = cv::format( "%d/%d", (int)imagePoints.size(), s.nrFrames );
         }
 
         putText( view, msg, textOrigin, 1, 1, mode == CALIBRATED ?  GREEN : RED);
@@ -415,7 +415,7 @@ int main(int argc, char* argv[])
         //! [output_text]
         //------------------------- Video capture  output  undistorted ------------------------------
         //! [output_undistorted]
-        if( mode == CALIBRATED && s.showUndistorsed )
+        if( mode == CALIBRATED && s.showUndistorted )
         {
             Mat temp = view.clone();
             if (s.useFisheye)
@@ -438,7 +438,7 @@ int main(int argc, char* argv[])
             break;
 
         if( key == 'u' && mode == CALIBRATED )
-           s.showUndistorsed = !s.showUndistorsed;
+           s.showUndistorted = !s.showUndistorted;
 
         if( s.inputCapture.isOpened() && key == 'g' )
         {
@@ -450,7 +450,7 @@ int main(int argc, char* argv[])
 
     // -----------------------Show the undistorted image for the image list ------------------------
     //! [show_results]
-    if( s.inputType == Settings::IMAGE_LIST && s.showUndistorsed && !cameraMatrix.empty())
+    if( s.inputType == Settings::IMAGE_LIST && s.showUndistorted && !cameraMatrix.empty())
     {
         Mat view, rview, map1, map2;
 
diff --git a/samples/cpp/tutorial_code/calib3d/real_time_pose_estimation/src/main_detection.cpp b/samples/cpp/tutorial_code/calib3d/real_time_pose_estimation/src/main_detection.cpp
index 9d19d166ec..209eb7abfe 100644
--- a/samples/cpp/tutorial_code/calib3d/real_time_pose_estimation/src/main_detection.cpp
+++ b/samples/cpp/tutorial_code/calib3d/real_time_pose_estimation/src/main_detection.cpp
@@ -361,7 +361,7 @@ int main(int argc, char *argv[])
                 frame_vis.copyTo(frameSave);
             }
 
-            string saveFilename = format(string(saveDirectory + "/image_%04d.png").c_str(), frameCount);
+            string saveFilename = cv::format(string(saveDirectory + "/image_%04d.png").c_str(), frameCount);
             imwrite(saveFilename, frameSave);
             frameCount++;
         }
diff --git a/samples/dnn/CMakeLists.txt b/samples/dnn/CMakeLists.txt
index f2cb949d0a..209fbb586c 100644
--- a/samples/dnn/CMakeLists.txt
+++ b/samples/dnn/CMakeLists.txt
@@ -4,6 +4,7 @@ set(OPENCV_DNN_SAMPLES_REQUIRED_DEPS
   opencv_core
   opencv_imgproc
   opencv_dnn
+  opencv_video
   opencv_imgcodecs
   opencv_videoio
   opencv_highgui)
diff --git a/samples/dnn/classification.cpp b/samples/dnn/classification.cpp
index 8440371688..769d6874be 100644
--- a/samples/dnn/classification.cpp
+++ b/samples/dnn/classification.cpp
@@ -22,12 +22,17 @@ std::string keys =
                             "0: automatically (by default), "
                             "1: Halide language (http://halide-lang.org/), "
                             "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-                            "3: OpenCV implementation }"
+                            "3: OpenCV implementation, "
+                            "4: VKCOM, "
+                            "5: CUDA },"
     "{ target           | 0 | Choose one of target computation devices: "
                             "0: CPU target (by default), "
                             "1: OpenCL, "
                             "2: OpenCL fp16 (half-float precision), "
-                            "3: VPU }";
+                            "3: VPU, "
+                            "4: Vulkan, "
+                            "6: CUDA, "
+                            "7: CUDA fp16 (half-float precision) }";
 
 using namespace cv;
 using namespace dnn;
diff --git a/samples/dnn/classification.py b/samples/dnn/classification.py
index 558c8b0bdc..be639e8d74 100644
--- a/samples/dnn/classification.py
+++ b/samples/dnn/classification.py
@@ -7,9 +7,9 @@ from common import *
 
 def get_args_parser(func_args):
     backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE,
-                cv.dnn.DNN_BACKEND_OPENCV)
+                cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
     targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD,
-               cv.dnn.DNN_TARGET_HDDL)
+               cv.dnn.DNN_TARGET_HDDL, cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
 
     parser = argparse.ArgumentParser(add_help=False)
     parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
@@ -32,14 +32,19 @@ def get_args_parser(func_args):
                              "%d: automatically (by default), "
                              "%d: Halide language (http://halide-lang.org/), "
                              "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-                             "%d: OpenCV implementation" % backends)
+                             "%d: OpenCV implementation, "
+                             "%d: VKCOM, "
+                             "%d: CUDA" % backends)
     parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                         help='Choose one of target computation devices: '
                              '%d: CPU target (by default), '
                              '%d: OpenCL, '
                              '%d: OpenCL fp16 (half-float precision), '
                              '%d: NCS2 VPU, '
-                             '%d: HDDL VPU' % targets)
+                             '%d: HDDL VPU, '
+                             '%d: Vulkan, '
+                             '%d: CUDA, '
+                             '%d: CUDA fp16 (half-float precision)' % targets)
 
     args, _ = parser.parse_known_args()
     add_preproc_args(args.zoo, parser, 'classification')
diff --git a/samples/dnn/dasiamrpn_tracker.cpp b/samples/dnn/dasiamrpn_tracker.cpp
index 0008cee255..f6e307c682 100644
--- a/samples/dnn/dasiamrpn_tracker.cpp
+++ b/samples/dnn/dasiamrpn_tracker.cpp
@@ -12,6 +12,7 @@
 #include <opencv2/dnn.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
+#include <opencv2/video.hpp>
 
 using namespace cv;
 using namespace cv::dnn;
@@ -26,67 +27,19 @@ const char *keys =
                             "0: automatically (by default), "
                             "1: Halide language (http://halide-lang.org/), "
                             "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-                            "3: OpenCV implementation }"
+                            "3: OpenCV implementation, "
+                            "4: VKCOM, "
+                            "5: CUDA },"
         "{ target      | 0 | Choose one of target computation devices: "
                             "0: CPU target (by default), "
                             "1: OpenCL, "
                             "2: OpenCL fp16 (half-float precision), "
-                            "3: VPU }"
+                            "3: VPU, "
+                            "4: Vulkan, "
+                            "6: CUDA, "
+                            "7: CUDA fp16 (half-float precision) }"
 ;
 
-// Initial parameters of the model
-struct trackerConfig
-{
-    float windowInfluence = 0.43f;
-    float lr = 0.4f;
-    int scale = 8;
-    bool swapRB = false;
-    int totalStride = 8;
-    float penaltyK = 0.055f;
-    int exemplarSize = 127;
-    int instanceSize = 271;
-    float contextAmount = 0.5f;
-    std::vector<float> ratios = { 0.33f, 0.5f, 1.0f, 2.0f, 3.0f };
-    int anchorNum = int(ratios.size());
-    Mat anchors;
-    Mat windows;
-    Scalar avgChans;
-    Size imgSize = { 0, 0 };
-    Rect2f targetBox = { 0, 0, 0, 0 };
-    int scoreSize = (instanceSize - exemplarSize) / totalStride + 1;
-
-    void update_scoreSize()
-    {
-        scoreSize = int((instanceSize - exemplarSize) / totalStride + 1);
-    }
-};
-
-static void softmax(const Mat& src, Mat& dst);
-static void elementMax(Mat& src);
-static Mat generateHanningWindow(const trackerConfig& trackState);
-static Mat generateAnchors(trackerConfig& trackState);
-static Mat getSubwindow(Mat& img, const Rect2f& targetBox, float originalSize, Scalar avgChans);
-static float trackerEval(Mat img, trackerConfig& trackState, Net& siamRPN);
-static void trackerInit(Mat img, trackerConfig& trackState, Net& siamRPN, Net& siamKernelR1, Net& siamKernelCL1);
-
-template <typename T> static
-T sizeCal(const T& w, const T& h)
-{
-    T pad = (w + h) * T(0.5);
-    T sz2 = (w + pad) * (h + pad);
-    return sqrt(sz2);
-}
-
-template <>
-Mat sizeCal(const Mat& w, const Mat& h)
-{
-    Mat pad = (w + h) * 0.5;
-    Mat sz2 = (w + pad).mul((h + pad));
-
-    cv::sqrt(sz2, sz2);
-    return sz2;
-}
-
 static
 int run(int argc, char** argv)
 {
@@ -106,13 +59,16 @@ int run(int argc, char** argv)
     int backend = parser.get<int>("backend");
     int target = parser.get<int>("target");
 
-    // Read nets.
-    Net siamRPN, siamKernelCL1, siamKernelR1;
+    Ptr<TrackerDaSiamRPN> tracker;
     try
     {
-        siamRPN = readNet(samples::findFile(net));
-        siamKernelCL1 = readNet(samples::findFile(kernel_cls1));
-        siamKernelR1 = readNet(samples::findFile(kernel_r1));
+        TrackerDaSiamRPN::Params params;
+        params.model = samples::findFile(net);
+        params.kernel_cls1 = samples::findFile(kernel_cls1);
+        params.kernel_r1 = samples::findFile(kernel_r1);
+        params.backend = backend;
+        params.target = target;
+        tracker = TrackerDaSiamRPN::create(params);
     }
     catch (const cv::Exception& ee)
     {
@@ -124,14 +80,6 @@ int run(int argc, char** argv)
         return 2;
     }
 
-    // Set model backend.
-    siamRPN.setPreferableBackend(backend);
-    siamRPN.setPreferableTarget(target);
-    siamKernelR1.setPreferableBackend(backend);
-    siamKernelR1.setPreferableTarget(target);
-    siamKernelCL1.setPreferableBackend(backend);
-    siamKernelCL1.setPreferableTarget(target);
-
     const std::string winName = "DaSiamRPN";
     namedWindow(winName, WINDOW_AUTOSIZE);
 
@@ -174,17 +122,7 @@ int run(int argc, char** argv)
     Rect selectRect = selectROI(winName, image_select);
     std::cout << "ROI=" << selectRect << std::endl;
 
-    trackerConfig trackState;
-    trackState.update_scoreSize();
-    trackState.targetBox = Rect2f(
-        float(selectRect.x) + float(selectRect.width) * 0.5f,  // FIXIT don't use center in Rect structures, it is confusing
-        float(selectRect.y) + float(selectRect.height) * 0.5f,
-        float(selectRect.width),
-        float(selectRect.height)
-    );
-
-    // Set tracking template.
-    trackerInit(image, trackState, siamRPN, siamKernelR1, siamKernelCL1);
+    tracker->init(image, selectRect);
 
     TickMeter tickMeter;
 
@@ -197,16 +135,14 @@ int run(int argc, char** argv)
             break;
         }
 
+        Rect rect;
+
         tickMeter.start();
-        float score = trackerEval(image, trackState, siamRPN);
+        bool ok = tracker->update(image, rect);
         tickMeter.stop();
 
-        Rect rect = {
-            int(trackState.targetBox.x - int(trackState.targetBox.width / 2)),
-            int(trackState.targetBox.y - int(trackState.targetBox.height / 2)),
-            int(trackState.targetBox.width),
-            int(trackState.targetBox.height)
-        };
+        float score = tracker->getTrackingScore();
+
         std::cout << "frame " << count <<
             ": predicted score=" << score <<
             "  rect=" << rect <<
@@ -214,12 +150,16 @@ int run(int argc, char** argv)
             std::endl;
 
         Mat render_image = image.clone();
-        rectangle(render_image, rect, Scalar(0, 255, 0), 2);
 
-        std::string timeLabel = format("Inference time: %.2f ms", tickMeter.getTimeMilli());
-        std::string scoreLabel = format("Score: %f", score);
-        putText(render_image, timeLabel, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
-        putText(render_image, scoreLabel, Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
+        if (ok)
+        {
+            rectangle(render_image, rect, Scalar(0, 255, 0), 2);
+
+            std::string timeLabel = format("Inference time: %.2f ms", tickMeter.getTimeMilli());
+            std::string scoreLabel = format("Score: %f", score);
+            putText(render_image, timeLabel, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
+            putText(render_image, scoreLabel, Point(0, 35), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
+        }
 
         imshow(winName, render_image);
 
@@ -234,275 +174,6 @@ int run(int argc, char** argv)
     return 0;
 }
 
-Mat generateHanningWindow(const trackerConfig& trackState)
-{
-    Mat baseWindows, HanningWindows;
-
-    createHanningWindow(baseWindows, Size(trackState.scoreSize, trackState.scoreSize), CV_32F);
-    baseWindows = baseWindows.reshape(0, { 1, trackState.scoreSize, trackState.scoreSize });
-    HanningWindows = baseWindows.clone();
-    for (int i = 1; i < trackState.anchorNum; i++)
-    {
-        HanningWindows.push_back(baseWindows);
-    }
-
-    return HanningWindows;
-}
-
-Mat generateAnchors(trackerConfig& trackState)
-{
-    int totalStride = trackState.totalStride, scales = trackState.scale, scoreSize = trackState.scoreSize;
-    std::vector<float> ratios = trackState.ratios;
-    std::vector<Rect2f> baseAnchors;
-    int anchorNum = int(ratios.size());
-    int size = totalStride * totalStride;
-
-    float ori = -(float(scoreSize / 2)) * float(totalStride);
-
-    for (auto i = 0; i < anchorNum; i++)
-    {
-        int ws = int(sqrt(size / ratios[i]));
-        int hs = int(ws * ratios[i]);
-
-        float wws = float(ws) * scales;
-        float hhs = float(hs) * scales;
-        Rect2f anchor = { 0, 0, wws, hhs };
-        baseAnchors.push_back(anchor);
-    }
-
-    int anchorIndex[] = { 0, 0, 0, 0 };
-    const int sizes[] = { 4, (int)ratios.size(), scoreSize, scoreSize };
-    Mat anchors(4, sizes, CV_32F);
-
-    for (auto i = 0; i < scoreSize; i++)
-    {
-        for (auto j = 0; j < scoreSize; j++)
-        {
-            for (auto k = 0; k < anchorNum; k++)
-            {
-                anchorIndex[0] = 1, anchorIndex[1] = k, anchorIndex[2] = i, anchorIndex[3] = j;
-                anchors.at<float>(anchorIndex) = ori + totalStride * i;
-
-                anchorIndex[0] = 0;
-                anchors.at<float>(anchorIndex) = ori + totalStride * j;
-
-                anchorIndex[0] = 2;
-                anchors.at<float>(anchorIndex) = baseAnchors[k].width;
-
-                anchorIndex[0] = 3;
-                anchors.at<float>(anchorIndex) = baseAnchors[k].height;
-            }
-        }
-    }
-
-    return anchors;
-}
-
-Mat getSubwindow(Mat& img, const Rect2f& targetBox, float originalSize, Scalar avgChans)
-{
-    Mat zCrop, dst;
-    Size imgSize = img.size();
-    float c = (originalSize + 1) / 2;
-    float xMin = (float)cvRound(targetBox.x - c);
-    float xMax = xMin + originalSize - 1;
-    float yMin = (float)cvRound(targetBox.y - c);
-    float yMax = yMin + originalSize - 1;
-
-    int leftPad = (int)(fmax(0., -xMin));
-    int topPad = (int)(fmax(0., -yMin));
-    int rightPad = (int)(fmax(0., xMax - imgSize.width + 1));
-    int bottomPad = (int)(fmax(0., yMax - imgSize.height + 1));
-
-    xMin = xMin + leftPad;
-    xMax = xMax + leftPad;
-    yMax = yMax + topPad;
-    yMin = yMin + topPad;
-
-    if (topPad == 0 && bottomPad == 0 && leftPad == 0 && rightPad == 0)
-    {
-        img(Rect(int(xMin), int(yMin), int(xMax - xMin + 1), int(yMax - yMin + 1))).copyTo(zCrop);
-    }
-    else
-    {
-        copyMakeBorder(img, dst, topPad, bottomPad, leftPad, rightPad, BORDER_CONSTANT, avgChans);
-        dst(Rect(int(xMin), int(yMin), int(xMax - xMin + 1), int(yMax - yMin + 1))).copyTo(zCrop);
-    }
-
-    return zCrop;
-}
-
-void softmax(const Mat& src, Mat& dst)
-{
-    Mat maxVal;
-    cv::max(src.row(1), src.row(0), maxVal);
-
-    src.row(1) -= maxVal;
-    src.row(0) -= maxVal;
-
-    exp(src, dst);
-
-    Mat sumVal = dst.row(0) + dst.row(1);
-    dst.row(0) = dst.row(0) / sumVal;
-    dst.row(1) = dst.row(1) / sumVal;
-}
-
-void elementMax(Mat& src)
-{
-    int* p = src.size.p;
-    int index[] = { 0, 0, 0, 0 };
-    for (int n = 0; n < *p; n++)
-    {
-        for (int k = 0; k < *(p + 1); k++)
-        {
-            for (int i = 0; i < *(p + 2); i++)
-            {
-                for (int j = 0; j < *(p + 3); j++)
-                {
-                    index[0] = n, index[1] = k, index[2] = i, index[3] = j;
-                    float& v = src.at<float>(index);
-                    v = fmax(v, 1.0f / v);
-                }
-            }
-        }
-    }
-}
-
-float trackerEval(Mat img, trackerConfig& trackState, Net& siamRPN)
-{
-    Rect2f targetBox = trackState.targetBox;
-
-    float wc = targetBox.height + trackState.contextAmount * (targetBox.width + targetBox.height);
-    float hc = targetBox.width + trackState.contextAmount * (targetBox.width + targetBox.height);
-
-    float sz = sqrt(wc * hc);
-    float scaleZ = trackState.exemplarSize / sz;
-
-    float searchSize = float((trackState.instanceSize - trackState.exemplarSize) / 2);
-    float pad = searchSize / scaleZ;
-    float sx = sz + 2 * pad;
-
-    Mat xCrop = getSubwindow(img, targetBox, (float)cvRound(sx), trackState.avgChans);
-
-    static Mat blob;
-    std::vector<Mat> outs;
-    std::vector<String> outNames;
-    Mat delta, score;
-    Mat sc, rc, penalty, pscore;
-
-    blobFromImage(xCrop, blob, 1.0, Size(trackState.instanceSize, trackState.instanceSize), Scalar(), trackState.swapRB, false, CV_32F);
-
-    siamRPN.setInput(blob);
-
-    outNames = siamRPN.getUnconnectedOutLayersNames();
-    siamRPN.forward(outs, outNames);
-
-    delta = outs[0];
-    score = outs[1];
-
-    score = score.reshape(0, { 2, trackState.anchorNum, trackState.scoreSize, trackState.scoreSize });
-    delta = delta.reshape(0, { 4, trackState.anchorNum, trackState.scoreSize, trackState.scoreSize });
-
-    softmax(score, score);
-
-    targetBox.width *= scaleZ;
-    targetBox.height *= scaleZ;
-
-    score = score.row(1);
-    score = score.reshape(0, { 5, 19, 19 });
-
-    // Post processing
-    delta.row(0) = delta.row(0).mul(trackState.anchors.row(2)) + trackState.anchors.row(0);
-    delta.row(1) = delta.row(1).mul(trackState.anchors.row(3)) + trackState.anchors.row(1);
-    exp(delta.row(2), delta.row(2));
-    delta.row(2) = delta.row(2).mul(trackState.anchors.row(2));
-    exp(delta.row(3), delta.row(3));
-    delta.row(3) = delta.row(3).mul(trackState.anchors.row(3));
-
-    sc = sizeCal(delta.row(2), delta.row(3)) / sizeCal(targetBox.width, targetBox.height);
-    elementMax(sc);
-
-    rc = delta.row(2).mul(1 / delta.row(3));
-    rc = (targetBox.width / targetBox.height) / rc;
-    elementMax(rc);
-
-    // Calculating the penalty
-    exp(((rc.mul(sc) - 1.) * trackState.penaltyK * (-1.0)), penalty);
-    penalty = penalty.reshape(0, { trackState.anchorNum, trackState.scoreSize, trackState.scoreSize });
-
-    pscore = penalty.mul(score);
-    pscore = pscore * (1.0 - trackState.windowInfluence) + trackState.windows * trackState.windowInfluence;
-
-    int bestID[] = { 0 };
-    // Find the index of best score.
-    minMaxIdx(pscore.reshape(0, { trackState.anchorNum * trackState.scoreSize * trackState.scoreSize, 1 }), 0, 0, 0, bestID);
-    delta = delta.reshape(0, { 4, trackState.anchorNum * trackState.scoreSize * trackState.scoreSize });
-    penalty = penalty.reshape(0, { trackState.anchorNum * trackState.scoreSize * trackState.scoreSize, 1 });
-    score = score.reshape(0, { trackState.anchorNum * trackState.scoreSize * trackState.scoreSize, 1 });
-
-    int index[] = { 0, bestID[0] };
-    Rect2f resBox = { 0, 0, 0, 0 };
-
-    resBox.x = delta.at<float>(index) / scaleZ;
-    index[0] = 1;
-    resBox.y = delta.at<float>(index) / scaleZ;
-    index[0] = 2;
-    resBox.width = delta.at<float>(index) / scaleZ;
-    index[0] = 3;
-    resBox.height = delta.at<float>(index) / scaleZ;
-
-    float lr = penalty.at<float>(bestID) * score.at<float>(bestID) * trackState.lr;
-
-    resBox.x = resBox.x + targetBox.x;
-    resBox.y = resBox.y + targetBox.y;
-    targetBox.width /= scaleZ;
-    targetBox.height /= scaleZ;
-
-    resBox.width = targetBox.width * (1 - lr) + resBox.width * lr;
-    resBox.height = targetBox.height * (1 - lr) + resBox.height * lr;
-
-    resBox.x = float(fmax(0., fmin(float(trackState.imgSize.width), resBox.x)));
-    resBox.y = float(fmax(0., fmin(float(trackState.imgSize.height), resBox.y)));
-    resBox.width = float(fmax(10., fmin(float(trackState.imgSize.width), resBox.width)));
-    resBox.height = float(fmax(10., fmin(float(trackState.imgSize.height), resBox.height)));
-
-    trackState.targetBox = resBox;
-    return score.at<float>(bestID);
-}
-
-void trackerInit(Mat img, trackerConfig& trackState, Net& siamRPN, Net& siamKernelR1, Net& siamKernelCL1)
-{
-    Rect2f targetBox = trackState.targetBox;
-    Mat anchors = generateAnchors(trackState);
-    trackState.anchors = anchors;
-
-    Mat windows = generateHanningWindow(trackState);
-
-    trackState.windows = windows;
-    trackState.imgSize = img.size();
-
-    trackState.avgChans = mean(img);
-    float wc = targetBox.width + trackState.contextAmount * (targetBox.width + targetBox.height);
-    float hc = targetBox.height + trackState.contextAmount * (targetBox.width + targetBox.height);
-    float sz = (float)cvRound(sqrt(wc * hc));
-
-    Mat zCrop = getSubwindow(img, targetBox, sz, trackState.avgChans);
-    static Mat blob;
-
-    blobFromImage(zCrop, blob, 1.0, Size(trackState.exemplarSize, trackState.exemplarSize), Scalar(), trackState.swapRB, false, CV_32F);
-    siamRPN.setInput(blob);
-    Mat out1;
-    siamRPN.forward(out1, "63");
-
-    siamKernelCL1.setInput(out1);
-    siamKernelR1.setInput(out1);
-
-    Mat cls1 = siamKernelCL1.forward();
-    Mat r1 = siamKernelR1.forward();
-    std::vector<int> r1_shape = { 20, 256, 4, 4 }, cls1_shape = { 10, 256, 4, 4 };
-
-    siamRPN.setParam(siamRPN.getLayerId("65"), 0, r1.reshape(0, r1_shape));
-    siamRPN.setParam(siamRPN.getLayerId("68"), 0, cls1.reshape(0, cls1_shape));
-}
 
 int main(int argc, char **argv)
 {
diff --git a/samples/dnn/dasiamrpn_tracker.py b/samples/dnn/dasiamrpn_tracker.py
deleted file mode 100644
index 03e99d6dbf..0000000000
--- a/samples/dnn/dasiamrpn_tracker.py
+++ /dev/null
@@ -1,291 +0,0 @@
-"""
-DaSiamRPN tracker.
-Original paper: https://arxiv.org/abs/1808.06048
-Link to original repo: https://github.com/foolwood/DaSiamRPN
-Links to onnx models:
-network:     https://www.dropbox.com/s/rr1lk9355vzolqv/dasiamrpn_model.onnx?dl=0
-kernel_r1:   https://www.dropbox.com/s/999cqx5zrfi7w4p/dasiamrpn_kernel_r1.onnx?dl=0
-kernel_cls1: https://www.dropbox.com/s/qvmtszx5h339a0w/dasiamrpn_kernel_cls1.onnx?dl=0
-"""
-
-import numpy as np
-import cv2 as cv
-import argparse
-import sys
-
-class DaSiamRPNTracker:
-    # Initialization of used values, initial bounding box, used network
-    def __init__(self, net="dasiamrpn_model.onnx", kernel_r1="dasiamrpn_kernel_r1.onnx", kernel_cls1="dasiamrpn_kernel_cls1.onnx"):
-        self.windowing = "cosine"
-        self.exemplar_size = 127
-        self.instance_size = 271
-        self.total_stride = 8
-        self.score_size = (self.instance_size - self.exemplar_size) // self.total_stride + 1
-        self.context_amount = 0.5
-        self.ratios = [0.33, 0.5, 1, 2, 3]
-        self.scales = [8, ]
-        self.anchor_num = len(self.ratios) * len(self.scales)
-        self.penalty_k = 0.055
-        self.window_influence = 0.42
-        self.lr = 0.295
-        self.score = []
-        if self.windowing == "cosine":
-            self.window = np.outer(np.hanning(self.score_size), np.hanning(self.score_size))
-        elif self.windowing == "uniform":
-            self.window = np.ones((self.score_size, self.score_size))
-        self.window = np.tile(self.window.flatten(), self.anchor_num)
-        # Loading network`s and kernel`s models
-        self.net = cv.dnn.readNet(net)
-        self.kernel_r1 = cv.dnn.readNet(kernel_r1)
-        self.kernel_cls1 = cv.dnn.readNet(kernel_cls1)
-
-    def init(self, im, init_bb):
-        target_pos, target_sz = np.array([init_bb[0], init_bb[1]]), np.array([init_bb[2], init_bb[3]])
-        self.im_h = im.shape[0]
-        self.im_w = im.shape[1]
-        self.target_pos = target_pos
-        self.target_sz = target_sz
-        self.avg_chans = np.mean(im, axis=(0, 1))
-
-        # When we trying to generate ONNX model from the pre-trained .pth model
-        # we are using only one state of the network. In our case used state
-        # with big bounding box, so we were forced to add assertion for
-        # too small bounding boxes - current state of the network can not
-        # work properly with such small bounding boxes
-        if ((self.target_sz[0] * self.target_sz[1]) / float(self.im_h * self.im_w)) < 0.004:
-            raise AssertionError(
-        "Initializing BB is too small-try to restart tracker with larger BB")
-
-        self.anchor = self.__generate_anchor()
-        wc_z = self.target_sz[0] + self.context_amount * sum(self.target_sz)
-        hc_z = self.target_sz[1] + self.context_amount * sum(self.target_sz)
-        s_z = round(np.sqrt(wc_z * hc_z))
-        z_crop = self.__get_subwindow_tracking(im, self.exemplar_size, s_z)
-        z_crop = z_crop.transpose(2, 0, 1).reshape(1, 3, 127, 127).astype(np.float32)
-        self.net.setInput(z_crop)
-        z_f = self.net.forward('63')
-        self.kernel_r1.setInput(z_f)
-        r1 = self.kernel_r1.forward()
-        self.kernel_cls1.setInput(z_f)
-        cls1 = self.kernel_cls1.forward()
-        r1 = r1.reshape(20, 256, 4, 4)
-        cls1 = cls1.reshape(10, 256 , 4, 4)
-        self.net.setParam(self.net.getLayerId('65'), 0, r1)
-        self.net.setParam(self.net.getLayerId('68'), 0, cls1)
-
-    # Сreating anchor for tracking bounding box
-    def __generate_anchor(self):
-        self.anchor = np.zeros((self.anchor_num, 4),  dtype = np.float32)
-        size = self.total_stride * self.total_stride
-        count = 0
-
-        for ratio in self.ratios:
-            ws = int(np.sqrt(size / ratio))
-            hs = int(ws * ratio)
-            for scale in self.scales:
-                wws = ws * scale
-                hhs = hs * scale
-                self.anchor[count] = [0, 0, wws, hhs]
-                count += 1
-
-        score_sz = int(self.score_size)
-        self.anchor = np.tile(self.anchor, score_sz * score_sz).reshape((-1, 4))
-        ori = - (score_sz / 2) * self.total_stride
-        xx, yy = np.meshgrid([ori + self.total_stride * dx for dx in range(score_sz)], [ori + self.total_stride * dy for dy in range(score_sz)])
-        xx, yy = np.tile(xx.flatten(), (self.anchor_num, 1)).flatten(), np.tile(yy.flatten(), (self.anchor_num, 1)).flatten()
-        self.anchor[:, 0], self.anchor[:, 1] = xx.astype(np.float32), yy.astype(np.float32)
-        return self.anchor
-
-    # Function for updating tracker state
-    def update(self, im):
-        wc_z = self.target_sz[1] + self.context_amount * sum(self.target_sz)
-        hc_z = self.target_sz[0] + self.context_amount * sum(self.target_sz)
-        s_z = np.sqrt(wc_z * hc_z)
-        scale_z = self.exemplar_size / s_z
-        d_search = (self.instance_size - self.exemplar_size) / 2
-        pad = d_search / scale_z
-        s_x = round(s_z + 2 * pad)
-
-        # Region preprocessing part
-        x_crop = self.__get_subwindow_tracking(im, self.instance_size, s_x)
-        x_crop = x_crop.transpose(2, 0, 1).reshape(1, 3, 271, 271).astype(np.float32)
-        self.score = self.__tracker_eval(x_crop, scale_z)
-        self.target_pos[0] = max(0, min(self.im_w, self.target_pos[0]))
-        self.target_pos[1] = max(0, min(self.im_h, self.target_pos[1]))
-        self.target_sz[0] = max(10, min(self.im_w, self.target_sz[0]))
-        self.target_sz[1] = max(10, min(self.im_h, self.target_sz[1]))
-
-        cx, cy = self.target_pos
-        w, h = self.target_sz
-        updated_bb = (cx, cy, w, h)
-        return True, updated_bb
-
-    # Function for updating position of the bounding box
-    def __tracker_eval(self, x_crop, scale_z):
-        target_size = self.target_sz * scale_z
-        self.net.setInput(x_crop)
-        outNames = self.net.getUnconnectedOutLayersNames()
-        outNames = ['66', '68']
-        delta, score = self.net.forward(outNames)
-        delta = np.transpose(delta, (1, 2, 3, 0))
-        delta = np.ascontiguousarray(delta, dtype = np.float32)
-        delta = np.reshape(delta, (4, -1))
-        score = np.transpose(score, (1, 2, 3, 0))
-        score = np.ascontiguousarray(score, dtype = np.float32)
-        score = np.reshape(score, (2, -1))
-        score = self.__softmax(score)[1, :]
-        delta[0, :] = delta[0, :] * self.anchor[:, 2] + self.anchor[:, 0]
-        delta[1, :] = delta[1, :] * self.anchor[:, 3] + self.anchor[:, 1]
-        delta[2, :] = np.exp(delta[2, :]) * self.anchor[:, 2]
-        delta[3, :] = np.exp(delta[3, :]) * self.anchor[:, 3]
-
-        def __change(r):
-            return np.maximum(r, 1./r)
-
-        def __sz(w, h):
-            pad = (w + h) * 0.5
-            sz2 = (w + pad) * (h + pad)
-            return np.sqrt(sz2)
-
-        def __sz_wh(wh):
-            pad = (wh[0] + wh[1]) * 0.5
-            sz2 = (wh[0] + pad) * (wh[1] + pad)
-            return np.sqrt(sz2)
-
-        s_c = __change(__sz(delta[2, :], delta[3, :]) / (__sz_wh(target_size)))
-        r_c = __change((target_size[0] / target_size[1]) / (delta[2, :] / delta[3, :]))
-        penalty = np.exp(-(r_c * s_c - 1.) * self.penalty_k)
-        pscore = penalty * score
-        pscore = pscore * (1 - self.window_influence) + self.window * self.window_influence
-        best_pscore_id = np.argmax(pscore)
-        target = delta[:, best_pscore_id] / scale_z
-        target_size /= scale_z
-        lr = penalty[best_pscore_id] * score[best_pscore_id] * self.lr
-        res_x = target[0] + self.target_pos[0]
-        res_y = target[1] + self.target_pos[1]
-        res_w = target_size[0] * (1 - lr) + target[2] * lr
-        res_h = target_size[1] * (1 - lr) + target[3] * lr
-        self.target_pos = np.array([res_x, res_y])
-        self.target_sz = np.array([res_w, res_h])
-        return score[best_pscore_id]
-
-    def __softmax(self, x):
-        x_max = x.max(0)
-        e_x = np.exp(x - x_max)
-        y = e_x / e_x.sum(axis = 0)
-        return y
-
-    # Reshaping cropped image for using in the model
-    def __get_subwindow_tracking(self, im, model_size, original_sz):
-        im_sz = im.shape
-        c = (original_sz + 1) / 2
-        context_xmin = round(self.target_pos[0] - c)
-        context_xmax = context_xmin + original_sz - 1
-        context_ymin = round(self.target_pos[1] - c)
-        context_ymax = context_ymin + original_sz - 1
-        left_pad = int(max(0., -context_xmin))
-        top_pad = int(max(0., -context_ymin))
-        right_pad = int(max(0., context_xmax - im_sz[1] + 1))
-        bot_pad = int(max(0., context_ymax - im_sz[0] + 1))
-        context_xmin += left_pad
-        context_xmax += left_pad
-        context_ymin += top_pad
-        context_ymax += top_pad
-        r, c, k = im.shape
-
-        if any([top_pad, bot_pad, left_pad, right_pad]):
-            te_im = np.zeros((
-                r + top_pad + bot_pad, c + left_pad + right_pad, k), np.uint8)
-            te_im[top_pad:top_pad + r, left_pad:left_pad + c, :] = im
-            if top_pad:
-                te_im[0:top_pad, left_pad:left_pad + c, :] = self.avg_chans
-            if bot_pad:
-                te_im[r + top_pad:, left_pad:left_pad + c, :] = self.avg_chans
-            if left_pad:
-                te_im[:, 0:left_pad, :] = self.avg_chans
-            if right_pad:
-                te_im[:, c + left_pad:, :] = self.avg_chans
-            im_patch_original = te_im[int(context_ymin):int(context_ymax + 1), int(context_xmin):int(context_xmax + 1), :]
-        else:
-            im_patch_original = im[int(context_ymin):int(context_ymax + 1), int(context_xmin):int(context_xmax + 1), :]
-
-        if not np.array_equal(model_size, original_sz):
-            im_patch_original = cv.resize(im_patch_original, (model_size, model_size))
-        return im_patch_original
-
-# Sample for using DaSiamRPN tracker
-def main():
-    parser = argparse.ArgumentParser(description="Run tracker")
-    parser.add_argument("--input", type=str, help="Full path to input (empty for camera)")
-    parser.add_argument("--net", type=str, default="dasiamrpn_model.onnx", help="Full path to onnx model of net")
-    parser.add_argument("--kernel_r1", type=str, default="dasiamrpn_kernel_r1.onnx", help="Full path to onnx model of kernel_r1")
-    parser.add_argument("--kernel_cls1", type=str, default="dasiamrpn_kernel_cls1.onnx", help="Full path to onnx model of kernel_cls1")
-    args = parser.parse_args()
-    point1 = ()
-    point2 = ()
-    mark = True
-    drawing = False
-    cx, cy, w, h = 0.0, 0.0, 0, 0
-    # Fucntion for drawing during videostream
-    def get_bb(event, x, y, flag, param):
-        nonlocal point1, point2, cx, cy, w, h, drawing, mark
-
-        if event == cv.EVENT_LBUTTONDOWN:
-            if not drawing:
-                drawing = True
-                point1 = (x, y)
-            else:
-                drawing = False
-
-        elif event == cv.EVENT_MOUSEMOVE:
-            if drawing:
-                point2 = (x, y)
-
-        elif event == cv.EVENT_LBUTTONUP:
-            cx = point1[0] - (point1[0] - point2[0]) / 2
-            cy = point1[1] - (point1[1] - point2[1]) / 2
-            w = abs(point1[0] - point2[0])
-            h = abs(point1[1] - point2[1])
-            mark = False
-
-    # Creating window for visualization
-    cap = cv.VideoCapture(args.input if args.input else 0)
-    cv.namedWindow("DaSiamRPN")
-    cv.setMouseCallback("DaSiamRPN", get_bb)
-
-    whitespace_key = 32
-    while cv.waitKey(40) != whitespace_key:
-        has_frame, frame = cap.read()
-        if not has_frame:
-            sys.exit(0)
-        cv.imshow("DaSiamRPN", frame)
-
-    while mark:
-        twin = np.copy(frame)
-        if point1 and point2:
-            cv.rectangle(twin, point1, point2, (0, 255, 255), 3)
-        cv.imshow("DaSiamRPN", twin)
-        cv.waitKey(40)
-
-    init_bb = (cx, cy, w, h)
-    tracker = DaSiamRPNTracker(args.net, args.kernel_r1, args.kernel_cls1)
-    tracker.init(frame, init_bb)
-
-    # Tracking loop
-    while cap.isOpened():
-        has_frame, frame = cap.read()
-        if not has_frame:
-            sys.exit(0)
-        _, new_bb = tracker.update(frame)
-        cx, cy, w, h = new_bb
-        cv.rectangle(frame, (int(cx - w // 2), int(cy - h // 2)), (int(cx - w // 2) + int(w), int(cy - h // 2) + int(h)),(0, 255, 255), 3)
-        cv.imshow("DaSiamRPN", frame)
-        key = cv.waitKey(1)
-        if key == ord("q"):
-            break
-
-    cap.release()
-    cv.destroyAllWindows()
-
-if __name__ == "__main__":
-    main()
diff --git a/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/README.md b/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/README.md
new file mode 100644
index 0000000000..7aba491c9d
--- /dev/null
+++ b/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/README.md
@@ -0,0 +1,25 @@
+# Run PaddlePaddle model by OpenCV
+
+This tutorial shows how to run PaddlePaddle model by opencv.
+
+## Environment Setup
+
+```shell
+pip install paddlepaddle-gpu
+pip install paddlehub
+pip install paddle2onnx
+```
+
+## Run PaddlePaddle model demo
+
+Run the example code as below,
+
+```shell
+python paddle_resnet50.py
+```
+
+there are 3 part of this execution
+
+- 1. Export PaddlePaddle ResNet50 model to onnx format;
+- 2. Use `cv2.dnn.readNetFromONNX` load model file;
+- 3. Preprocess image file and do inference.
diff --git a/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/data/cat.jpg b/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/data/cat.jpg
new file mode 100644
index 0000000000..892536417b
Binary files /dev/null and b/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/data/cat.jpg differ
diff --git a/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/data/labels.txt b/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/data/labels.txt
new file mode 100644
index 0000000000..52baabc68e
--- /dev/null
+++ b/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/data/labels.txt
@@ -0,0 +1,1000 @@
+tench
+goldfish
+great white shark
+tiger shark
+hammerhead
+electric ray
+stingray
+cock
+hen
+ostrich
+brambling
+goldfinch
+house finch
+junco
+indigo bunting
+robin
+bulbul
+jay
+magpie
+chickadee
+water ouzel
+kite
+bald eagle
+vulture
+great grey owl
+European fire salamander
+common newt
+eft
+spotted salamander
+axolotl
+bullfrog
+tree frog
+tailed frog
+loggerhead
+leatherback turtle
+mud turtle
+terrapin
+box turtle
+banded gecko
+common iguana
+American chameleon
+whiptail
+agama
+frilled lizard
+alligator lizard
+Gila monster
+green lizard
+African chameleon
+Komodo dragon
+African crocodile
+American alligator
+triceratops
+thunder snake
+ringneck snake
+hognose snake
+green snake
+king snake
+garter snake
+water snake
+vine snake
+night snake
+boa constrictor
+rock python
+Indian cobra
+green mamba
+sea snake
+horned viper
+diamondback
+sidewinder
+trilobite
+harvestman
+scorpion
+black and gold garden spider
+barn spider
+garden spider
+black widow
+tarantula
+wolf spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse
+prairie chicken
+peacock
+quail
+partridge
+African grey
+macaw
+sulphur-crested cockatoo
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser
+goose
+black swan
+tusker
+echidna
+platypus
+wallaby
+koala
+wombat
+jellyfish
+sea anemone
+brain coral
+flatworm
+nematode
+conch
+snail
+slug
+sea slug
+chiton
+chambered nautilus
+Dungeness crab
+rock crab
+fiddler crab
+king crab
+American lobster
+spiny lobster
+crayfish
+hermit crab
+isopod
+white stork
+black stork
+spoonbill
+flamingo
+little blue heron
+American egret
+bittern
+crane
+limpkin
+European gallinule
+American coot
+bustard
+ruddy turnstone
+red-backed sandpiper
+redshank
+dowitcher
+oystercatcher
+pelican
+king penguin
+albatross
+grey whale
+killer whale
+dugong
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog
+Pekinese
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound
+basset
+beagle
+bloodhound
+bluetick
+black-and-tan coonhound
+Walker hound
+English foxhound
+redbone
+borzoi
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound
+Norwegian elkhound
+otterhound
+Saluki
+Scottish deerhound
+Weimaraner
+Staffordshire bullterrier
+American Staffordshire terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier
+Airedale
+cairn
+Australian terrier
+Dandie Dinmont
+Boston bull
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier
+Tibetan terrier
+silky terrier
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla
+English setter
+Irish setter
+Gordon setter
+Brittany spaniel
+clumber
+English springer
+Welsh springer spaniel
+cocker spaniel
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog
+Shetland sheepdog
+collie
+Border collie
+Bouvier des Flandres
+Rottweiler
+German shepherd
+Doberman
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard
+Eskimo dog
+malamute
+Siberian husky
+dalmatian
+affenpinscher
+basenji
+pug
+Leonberg
+Newfoundland
+Great Pyrenees
+Samoyed
+Pomeranian
+chow
+keeshond
+Brabancon griffon
+Pembroke
+Cardigan
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf
+white wolf
+red wolf
+coyote
+dingo
+dhole
+African hunting dog
+hyena
+red fox
+kit fox
+Arctic fox
+grey fox
+tabby
+tiger cat
+Persian cat
+Siamese cat
+Egyptian cat
+cougar
+lynx
+leopard
+snow leopard
+jaguar
+lion
+tiger
+cheetah
+brown bear
+American black bear
+ice bear
+sloth bear
+mongoose
+meerkat
+tiger beetle
+ladybug
+ground beetle
+long-horned beetle
+leaf beetle
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant
+grasshopper
+cricket
+walking stick
+cockroach
+mantis
+cicada
+leafhopper
+lacewing
+dragonfly
+damselfly
+admiral
+ringlet
+monarch
+cabbage butterfly
+sulphur butterfly
+lycaenid
+starfish
+sea urchin
+sea cucumber
+wood rabbit
+hare
+Angora
+hamster
+porcupine
+fox squirrel
+marmot
+beaver
+guinea pig
+sorrel
+zebra
+hog
+wild boar
+warthog
+hippopotamus
+ox
+water buffalo
+bison
+ram
+bighorn
+ibex
+hartebeest
+impala
+gazelle
+Arabian camel
+llama
+weasel
+mink
+polecat
+black-footed ferret
+otter
+skunk
+badger
+armadillo
+three-toed sloth
+orangutan
+gorilla
+chimpanzee
+gibbon
+siamang
+guenon
+patas
+baboon
+macaque
+langur
+colobus
+proboscis monkey
+marmoset
+capuchin
+howler monkey
+titi
+spider monkey
+squirrel monkey
+Madagascar cat
+indri
+Indian elephant
+African elephant
+lesser panda
+giant panda
+barracouta
+eel
+coho
+rock beauty
+anemone fish
+sturgeon
+gar
+lionfish
+puffer
+abacus
+abaya
+academic gown
+accordion
+acoustic guitar
+aircraft carrier
+airliner
+airship
+altar
+ambulance
+amphibian
+analog clock
+apiary
+apron
+ashcan
+assault rifle
+backpack
+bakery
+balance beam
+balloon
+ballpoint
+Band Aid
+banjo
+bannister
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel
+barrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap
+bath towel
+bathtub
+beach wagon
+beacon
+beaker
+bearskin
+beer bottle
+beer glass
+bell cote
+bib
+bicycle-built-for-two
+bikini
+binder
+binoculars
+birdhouse
+boathouse
+bobsled
+bolo tie
+bonnet
+bookcase
+bookshop
+bottlecap
+bow
+bow tie
+brass
+brassiere
+breakwater
+breastplate
+broom
+bucket
+buckle
+bulletproof vest
+bullet train
+butcher shop
+cab
+caldron
+candle
+cannon
+canoe
+can opener
+cardigan
+car mirror
+carousel
+carpenters kit
+carton
+car wheel
+cash machine
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello
+cellular telephone
+chain
+chainlink fence
+chain mail
+chain saw
+chest
+chiffonier
+chime
+china cabinet
+Christmas stocking
+church
+cinema
+cleaver
+cliff dwelling
+cloak
+clog
+cocktail shaker
+coffee mug
+coffeepot
+coil
+combination lock
+computer keyboard
+confectionery
+container ship
+convertible
+corkscrew
+cornet
+cowboy boot
+cowboy hat
+cradle
+crane
+crash helmet
+crate
+crib
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam
+desk
+desktop computer
+dial telephone
+diaper
+digital clock
+digital watch
+dining table
+dishrag
+dishwasher
+disk brake
+dock
+dogsled
+dome
+doormat
+drilling platform
+drum
+drumstick
+dumbbell
+Dutch oven
+electric fan
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa
+file
+fireboat
+fire engine
+fire screen
+flagpole
+flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn
+frying pan
+fur coat
+garbage truck
+gasmask
+gas pump
+goblet
+go-kart
+golf ball
+golfcart
+gondola
+gong
+gown
+grand piano
+greenhouse
+grille
+grocery store
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower
+hand-held computer
+handkerchief
+hard disc
+harmonica
+harp
+harvester
+hatchet
+holster
+home theater
+honeycomb
+hook
+hoopskirt
+horizontal bar
+horse cart
+hourglass
+iPod
+iron
+jack-o-lantern
+jean
+jeep
+jersey
+jigsaw puzzle
+jinrikisha
+joystick
+kimono
+knee pad
+knot
+lab coat
+ladle
+lampshade
+laptop
+lawn mower
+lens cap
+letter opener
+library
+lifeboat
+lighter
+limousine
+liner
+lipstick
+Loafer
+lotion
+loudspeaker
+loupe
+lumbermill
+magnetic compass
+mailbag
+mailbox
+maillot
+maillot
+manhole cover
+maraca
+marimba
+mask
+matchstick
+maypole
+maze
+measuring cup
+medicine chest
+megalith
+microphone
+microwave
+military uniform
+milk can
+minibus
+miniskirt
+minivan
+missile
+mitten
+mixing bowl
+mobile home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter
+mountain bike
+mountain tent
+mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook
+obelisk
+oboe
+ocarina
+odometer
+oil filter
+organ
+oscilloscope
+overskirt
+oxcart
+oxygen mask
+packet
+paddle
+paddlewheel
+padlock
+paintbrush
+pajama
+palace
+panpipe
+paper towel
+parachute
+parallel bars
+park bench
+parking meter
+passenger car
+patio
+pay-phone
+pedestal
+pencil box
+pencil sharpener
+perfume
+Petri dish
+photocopier
+pick
+pickelhaube
+picket fence
+pickup
+pier
+piggy bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate
+pitcher
+plane
+planetarium
+plastic bag
+plate rack
+plow
+plunger
+Polaroid camera
+pole
+police van
+poncho
+pool table
+pop bottle
+pot
+potters wheel
+power drill
+prayer rug
+printer
+prison
+projectile
+projector
+puck
+punching bag
+purse
+quill
+quilt
+racer
+racket
+radiator
+radio
+radio telescope
+rain barrel
+recreational vehicle
+reel
+reflex camera
+refrigerator
+remote control
+restaurant
+revolver
+rifle
+rocking chair
+rotisserie
+rubber eraser
+rugby ball
+rule
+running shoe
+safe
+safety pin
+saltshaker
+sandal
+sarong
+sax
+scabbard
+scale
+school bus
+schooner
+scoreboard
+screen
+screw
+screwdriver
+seat belt
+sewing machine
+shield
+shoe shop
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule
+sliding door
+slot
+snorkel
+snowmobile
+snowplow
+soap dispenser
+soccer ball
+sock
+solar dish
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web
+spindle
+sports car
+spotlight
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch
+stove
+strainer
+streetcar
+stretcher
+studio couch
+stupa
+submarine
+suit
+sundial
+sunglass
+sunglasses
+sunscreen
+suspension bridge
+swab
+sweatshirt
+swimming trunks
+swing
+switch
+syringe
+table lamp
+tank
+tape player
+teapot
+teddy
+television
+tennis ball
+thatch
+theater curtain
+thimble
+thresher
+throne
+tile roof
+toaster
+tobacco shop
+toilet seat
+torch
+totem pole
+tow truck
+toyshop
+tractor
+trailer truck
+tray
+trench coat
+tricycle
+trimaran
+tripod
+triumphal arch
+trolleybus
+trombone
+tub
+turnstile
+typewriter keyboard
+umbrella
+unicycle
+upright
+vacuum
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin
+volleyball
+waffle iron
+wall clock
+wallet
+wardrobe
+warplane
+washbasin
+washer
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool
+worm fence
+wreck
+yawl
+yurt
+web site
+comic book
+crossword puzzle
+street sign
+traffic light
+book jacket
+menu
+plate
+guacamole
+consomme
+hot pot
+trifle
+ice cream
+ice lolly
+French loaf
+bagel
+pretzel
+cheeseburger
+hotdog
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini
+spaghetti squash
+acorn squash
+butternut squash
+cucumber
+artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple
+banana
+jackfruit
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce
+dough
+meat loaf
+pizza
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff
+coral reef
+geyser
+lakeside
+promontory
+sandbar
+seashore
+valley
+volcano
+ballplayer
+groom
+scuba diver
+rapeseed
+daisy
+yellow ladys slipper
+corn
+acorn
+hip
+buckeye
+coral fungus
+agaric
+gyromitra
+stinkhorn
+earthstar
+hen-of-the-woods
+bolete
+ear
+toilet tissue
diff --git a/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/paddle_resnet50.py b/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/paddle_resnet50.py
new file mode 100644
index 0000000000..b95ce917e6
--- /dev/null
+++ b/samples/dnn/dnn_model_runner/dnn_conversion/paddlepaddle/paddle_resnet50.py
@@ -0,0 +1,61 @@
+import paddle
+import paddlehub as hub
+import paddlehub.vision.transforms as T
+import cv2 as cv
+import numpy as np
+
+
+def preprocess(image_path):
+    ''' preprocess input image file to np.ndarray
+
+    Args:
+        image_path(str): Path of input image file
+
+    Returns:
+        ProcessedImage(numpy.ndarray): A numpy.ndarray
+                variable whose shape is (1, 3, 224, 224)
+    '''
+    transforms = T.Compose([
+            T.Resize((256, 256)),
+            T.CenterCrop(224),
+            T.Normalize(mean=[0.485, 0.456, 0.406],
+                        std=[0.229, 0.224, 0.225])],
+            to_rgb=True)
+    return np.expand_dims(transforms(image_path), axis=0)
+
+
+def export_onnx_mobilenetv2(save_path):
+    ''' export PaddlePaddle model to ONNX format
+
+    Args:
+        save_path(str): Path to save exported ONNX model
+
+    Returns:
+        None
+    '''
+    model = hub.Module(name="resnet50_vd_imagenet_ssld")
+    input_spec = paddle.static.InputSpec(
+            [1, 3, 224, 224], "float32", "image")
+    paddle.onnx.export(model, save_path,
+                       input_spec=[input_spec],
+                       opset_version=10)
+
+
+if __name__ == '__main__':
+    save_path = './resnet50'
+    image_file = './data/cat.jpg'
+    labels = open('./data/labels.txt').read().strip().split('\n')
+    model = export_onnx_mobilenetv2(save_path)
+
+    # load the exported ResNet50 ONNX model with cv.dnn
+    net = cv.dnn.readNetFromONNX(save_path + '.onnx')
+    # read and preprocess image file
+    im = preprocess(image_file)
+    # inference
+    net.setInput(im)
+    result = net.forward(['save_infer_model/scale_0.tmp_0'])
+    # post process
+    class_id = np.argmax(result[0])
+    label = labels[class_id]
+    print("Image: {}".format(image_file))
+    print("Predict Category: {}".format(label))
diff --git a/samples/dnn/dnn_model_runner/dnn_conversion/requirements.txt b/samples/dnn/dnn_model_runner/dnn_conversion/requirements.txt
index 65ab56ad66..eb217e27df 100644
--- a/samples/dnn/dnn_model_runner/dnn_conversion/requirements.txt
+++ b/samples/dnn/dnn_model_runner/dnn_conversion/requirements.txt
@@ -7,3 +7,8 @@ torchvision>=0.6.1
 
 tensorflow>=2.1.0
 tensorflow-gpu>=2.1.0
+
+paddlepaddle>=2.0.0
+paddlepaddle-gpu>=2.0.0
+paddlehub>=2.1.0
+paddle2onnx>=0.5.1
diff --git a/samples/dnn/human_parsing.cpp b/samples/dnn/human_parsing.cpp
index bf2cc294c8..0c00c02841 100644
--- a/samples/dnn/human_parsing.cpp
+++ b/samples/dnn/human_parsing.cpp
@@ -78,12 +78,17 @@ int main(int argc, char**argv)
                                          "0: automatically (by default), "
                                          "1: Halide language (http://halide-lang.org/), "
                                          "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-                                         "3: OpenCV implementation }"
+                                         "3: OpenCV implementation, "
+                                         "4: VKCOM, "
+                                         "5: CUDA }"
         "{target  t | 0               | Choose one of target computation devices: "
                                          "0: CPU target (by default), "
                                          "1: OpenCL, "
                                          "2: OpenCL fp16 (half-float precision), "
-                                         "3: VPU }"
+                                         "3: VPU, "
+                                         "4: Vulkan, "
+                                         "6: CUDA, "
+                                         "7: CUDA fp16 (half-float preprocess) }"
     );
     if (argc == 1 || parser.has("help"))
     {
diff --git a/samples/dnn/human_parsing.py b/samples/dnn/human_parsing.py
index 09371fe4a9..237f764b95 100644
--- a/samples/dnn/human_parsing.py
+++ b/samples/dnn/human_parsing.py
@@ -45,8 +45,10 @@ import numpy as np
 import cv2 as cv
 
 
-backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
-targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL)
+backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
+            cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
+targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD,
+           cv.dnn.DNN_TARGET_HDDL, cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
 
 
 def preprocess(image):
@@ -162,14 +164,19 @@ if __name__ == '__main__':
                         help="Choose one of computation backends: "
                              "%d: automatically (by default), "
                              "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-                             "%d: OpenCV implementation" % backends)
+                             "%d: OpenCV implementation, "
+                             "%d: VKCOM, "
+                             "%d: CUDA"% backends)
     parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                         help='Choose one of target computation devices: '
                              '%d: CPU target (by default), '
                              '%d: OpenCL, '
                              '%d: OpenCL fp16 (half-float precision), '
                              '%d: NCS2 VPU, '
-                             '%d: HDDL VPU' % targets)
+                             '%d: HDDL VPU, '
+                             '%d: Vulkan, '
+                             '%d: CUDA, '
+                             '%d: CUDA fp16 (half-float preprocess)' % targets)
     args, _ = parser.parse_known_args()
 
     if not os.path.isfile(args.model):
diff --git a/samples/dnn/object_detection.cpp b/samples/dnn/object_detection.cpp
index 796e729ece..5ff112fe5d 100644
--- a/samples/dnn/object_detection.cpp
+++ b/samples/dnn/object_detection.cpp
@@ -27,12 +27,17 @@ std::string keys =
                          "0: automatically (by default), "
                          "1: Halide language (http://halide-lang.org/), "
                          "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-                         "3: OpenCV implementation }"
+                         "3: OpenCV implementation, "
+                         "4: VKCOM, "
+                         "5: CUDA }"
     "{ target      | 0 | Choose one of target computation devices: "
                          "0: CPU target (by default), "
                          "1: OpenCL, "
                          "2: OpenCL fp16 (half-float precision), "
-                         "3: VPU }"
+                         "3: VPU, "
+                         "4: Vulkan, "
+                         "6: CUDA, "
+                         "7: CUDA fp16 (half-float preprocess) }"
     "{ async       | 0 | Number of asynchronous forwards at the same time. "
                         "Choose 0 for synchronous mode }";
 
diff --git a/samples/dnn/object_detection.py b/samples/dnn/object_detection.py
index ec8bf82866..0ca5586159 100644
--- a/samples/dnn/object_detection.py
+++ b/samples/dnn/object_detection.py
@@ -14,8 +14,10 @@ from tf_text_graph_common import readTextMessage
 from tf_text_graph_ssd import createSSDGraph
 from tf_text_graph_faster_rcnn import createFasterRCNNGraph
 
-backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
-targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL)
+backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
+            cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
+targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL,
+           cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
 
 parser = argparse.ArgumentParser(add_help=False)
 parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
@@ -35,14 +37,19 @@ parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DE
                          "%d: automatically (by default), "
                          "%d: Halide language (http://halide-lang.org/), "
                          "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-                         "%d: OpenCV implementation" % backends)
+                         "%d: OpenCV implementation, "
+                         "%d: VKCOM, "
+                         "%d: CUDA" % backends)
 parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                     help='Choose one of target computation devices: '
                          '%d: CPU target (by default), '
                          '%d: OpenCL, '
                          '%d: OpenCL fp16 (half-float precision), '
                          '%d: NCS2 VPU, '
-                         '%d: HDDL VPU' % targets)
+                         '%d: HDDL VPU, '
+                         '%d: Vulkan, '
+                         '%d: CUDA, '
+                         '%d: CUDA fp16 (half-float preprocess)' % targets)
 parser.add_argument('--async', type=int, default=0,
                     dest='asyncN',
                     help='Number of asynchronous forwards at the same time. '
diff --git a/samples/dnn/openpose.py b/samples/dnn/openpose.py
index b79ccd54b8..191d23edd4 100644
--- a/samples/dnn/openpose.py
+++ b/samples/dnn/openpose.py
@@ -41,8 +41,7 @@ elif args.dataset == 'MPI':
                    ["RElbow", "RWrist"], ["Neck", "LShoulder"], ["LShoulder", "LElbow"],
                    ["LElbow", "LWrist"], ["Neck", "Chest"], ["Chest", "RHip"], ["RHip", "RKnee"],
                    ["RKnee", "RAnkle"], ["Chest", "LHip"], ["LHip", "LKnee"], ["LKnee", "LAnkle"] ]
-else:
-    assert(args.dataset == 'HAND')
+elif args.dataset == 'HAND':
     BODY_PARTS = { "Wrist": 0,
                    "ThumbMetacarpal": 1, "ThumbProximal": 2, "ThumbMiddle": 3, "ThumbDistal": 4,
                    "IndexFingerMetacarpal": 5, "IndexFingerProximal": 6, "IndexFingerMiddle": 7, "IndexFingerDistal": 8,
@@ -61,7 +60,8 @@ else:
                    ["RingFingerProximal", "RingFingerMiddle"], ["RingFingerMiddle", "RingFingerDistal"],
                    ["Wrist", "LittleFingerMetacarpal"], ["LittleFingerMetacarpal", "LittleFingerProximal"],
                    ["LittleFingerProximal", "LittleFingerMiddle"], ["LittleFingerMiddle", "LittleFingerDistal"] ]
-
+else:
+    raise(Exception("you need to specify either 'COCO', 'MPI', or 'HAND' in args.dataset"))
 
 inWidth = args.width
 inHeight = args.height
diff --git a/samples/dnn/person_reid.cpp b/samples/dnn/person_reid.cpp
index 23b766114c..f0c22e96ad 100644
--- a/samples/dnn/person_reid.cpp
+++ b/samples/dnn/person_reid.cpp
@@ -36,13 +36,15 @@ const char* keys =
 "0: automatically (by default), "
 "1: Halide language (http://halide-lang.org/), "
 "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-"3: OpenCV implementation ,"
+"3: OpenCV implementation, "
+"4: VKCOM, "
 "5: CUDA }"
 "{target  t  | 0                | choose one of target computation devices: "
 "0: CPU target (by default), "
 "1: OpenCL, "
 "2: OpenCL fp16 (half-float precision), "
-"6: CUDA ,"
+"4: Vulkan, "
+"6: CUDA, "
 "7: CUDA fp16 (half-float preprocess) }";
 
 namespace cv{
diff --git a/samples/dnn/person_reid.py b/samples/dnn/person_reid.py
index 502f126bd5..08f04faa52 100644
--- a/samples/dnn/person_reid.py
+++ b/samples/dnn/person_reid.py
@@ -21,6 +21,7 @@ import cv2 as cv
 backends = (cv.dnn.DNN_BACKEND_DEFAULT,
     cv.dnn.DNN_BACKEND_INFERENCE_ENGINE,
     cv.dnn.DNN_BACKEND_OPENCV,
+    cv.dnn.DNN_BACKEND_VKCOM,
     cv.dnn.DNN_BACKEND_CUDA)
 
 targets = (cv.dnn.DNN_TARGET_CPU,
@@ -28,6 +29,7 @@ targets = (cv.dnn.DNN_TARGET_CPU,
     cv.dnn.DNN_TARGET_OPENCL_FP16,
     cv.dnn.DNN_TARGET_MYRIAD,
     cv.dnn.DNN_TARGET_HDDL,
+    cv.dnn.DNN_TARGET_VULKAN,
     cv.dnn.DNN_TARGET_CUDA,
     cv.dnn.DNN_TARGET_CUDA_FP16)
 
@@ -212,7 +214,8 @@ if __name__ == '__main__':
                         help="Choose one of computation backends: "
                              "%d: automatically (by default), "
                              "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-                             "%d: OpenCV implementation"
+                             "%d: OpenCV implementation, "
+                             "%d: VKCOM, "
                              "%d: CUDA backend"% backends)
     parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                         help='Choose one of target computation devices: '
@@ -220,9 +223,10 @@ if __name__ == '__main__':
                              '%d: OpenCL, '
                              '%d: OpenCL fp16 (half-float precision), '
                              '%d: NCS2 VPU, '
-                             '%d: HDDL VPU'
-                             '%d: CUDA,'
-                             '%d: CUDA FP16,'
+                             '%d: HDDL VPU, '
+                             '%d: Vulkan, '
+                             '%d: CUDA, '
+                             '%d: CUDA FP16'
                              % targets)
     args, _ = parser.parse_known_args()
 
diff --git a/samples/dnn/segmentation.cpp b/samples/dnn/segmentation.cpp
index d9fbad8974..777badf51e 100644
--- a/samples/dnn/segmentation.cpp
+++ b/samples/dnn/segmentation.cpp
@@ -21,12 +21,17 @@ std::string keys =
                         "0: automatically (by default), "
                         "1: Halide language (http://halide-lang.org/), "
                         "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-                        "3: OpenCV implementation }"
+                        "3: OpenCV implementation, "
+                        "4: VKCOM, "
+                        "5: CUDA }"
     "{ target      | 0 | Choose one of target computation devices: "
                         "0: CPU target (by default), "
                         "1: OpenCL, "
                         "2: OpenCL fp16 (half-float precision), "
-                        "3: VPU }";
+                        "3: VPU, "
+                        "4: Vulkan, "
+                        "6: CUDA, "
+                        "7: CUDA fp16 (half-float preprocess) }";
 
 using namespace cv;
 using namespace dnn;
diff --git a/samples/dnn/segmentation.py b/samples/dnn/segmentation.py
index 8eeb59ba14..09f3f8dd11 100644
--- a/samples/dnn/segmentation.py
+++ b/samples/dnn/segmentation.py
@@ -5,8 +5,10 @@ import sys
 
 from common import *
 
-backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
-targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL)
+backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
+            cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
+targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL,
+           cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
 
 parser = argparse.ArgumentParser(add_help=False)
 parser.add_argument('--zoo', default=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models.yml'),
@@ -22,14 +24,19 @@ parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DE
                          "%d: automatically (by default), "
                          "%d: Halide language (http://halide-lang.org/), "
                          "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-                         "%d: OpenCV implementation" % backends)
+                         "%d: OpenCV implementation, "
+                         "%d: VKCOM, "
+                         "%d: CUDA"% backends)
 parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                     help='Choose one of target computation devices: '
                          '%d: CPU target (by default), '
                          '%d: OpenCL, '
                          '%d: OpenCL fp16 (half-float precision), '
                          '%d: NCS2 VPU, '
-                         '%d: HDDL VPU' % targets)
+                         '%d: HDDL VPU, '
+                         '%d: Vulkan, '
+                         '%d: CUDA, '
+                         '%d: CUDA fp16 (half-float preprocess)'% targets)
 args, _ = parser.parse_known_args()
 add_preproc_args(args.zoo, parser, 'segmentation')
 parser = argparse.ArgumentParser(parents=[parser],
diff --git a/samples/dnn/siamrpnpp.py b/samples/dnn/siamrpnpp.py
index c7c49b1b85..2e15ec6708 100644
--- a/samples/dnn/siamrpnpp.py
+++ b/samples/dnn/siamrpnpp.py
@@ -327,9 +327,11 @@ def main():
     """ Sample SiamRPN Tracker
     """
     # Computation backends supported by layers
-    backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
+    backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
+                cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
     # Target Devices for computation
-    targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD)
+    targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD,
+               cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
 
     parser = argparse.ArgumentParser(description='Use this script to run SiamRPN++ Visual Tracker',
                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -338,17 +340,22 @@ def main():
     parser.add_argument('--search_net', type=str, default='search_net.onnx', help='Path to part of SiamRPN++ ran on search frame.')
     parser.add_argument('--rpn_head', type=str, default='rpn_head.onnx', help='Path to RPN Head ONNX model.')
     parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
-                        help='Select a computation backend: '
-                        "%d: automatically (by default) "
-                        "%d: Halide"
-                        "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit)"
-                        "%d: OpenCV Implementation" % backends)
+                        help="Select a computation backend: "
+                        "%d: automatically (by default), "
+                        "%d: Halide, "
+                        "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
+                        "%d: OpenCV Implementation, "
+                        "%d: VKCOM, "
+                        "%d: CUDA" % backends)
     parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                         help='Select a target device: '
-                        "%d: CPU target (by default)"
-                        "%d: OpenCL"
-                        "%d: OpenCL FP16"
-                        "%d: Myriad" % targets)
+                        '%d: CPU target (by default), '
+                        '%d: OpenCL, '
+                        '%d: OpenCL FP16, '
+                        '%d: Myriad, '
+                        '%d: Vulkan, '
+                        '%d: CUDA, '
+                        '%d: CUDA fp16 (half-float preprocess)' % targets)
     args, _ = parser.parse_known_args()
 
     if args.input_video and not os.path.isfile(args.input_video):
diff --git a/samples/dnn/virtual_try_on.py b/samples/dnn/virtual_try_on.py
index 076cb21d5b..e46f7ece50 100644
--- a/samples/dnn/virtual_try_on.py
+++ b/samples/dnn/virtual_try_on.py
@@ -16,8 +16,10 @@ from numpy import linalg
 from common import findFile
 from human_parsing import parse_human
 
-backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
-targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL)
+backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV,
+            cv.dnn.DNN_BACKEND_VKCOM, cv.dnn.DNN_BACKEND_CUDA)
+targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD, cv.dnn.DNN_TARGET_HDDL,
+           cv.dnn.DNN_TARGET_VULKAN, cv.dnn.DNN_TARGET_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16)
 
 parser = argparse.ArgumentParser(description='Use this script to run virtial try-on using CP-VTON',
                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -33,14 +35,19 @@ parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DE
                             "%d: automatically (by default), "
                             "%d: Halide language (http://halide-lang.org/), "
                             "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
-                            "%d: OpenCV implementation" % backends)
+                            "%d: OpenCV implementation, "
+                            "%d: VKCOM, "
+                            "%d: CUDA" % backends)
 parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                     help='Choose one of target computation devices: '
                             '%d: CPU target (by default), '
                             '%d: OpenCL, '
                             '%d: OpenCL fp16 (half-float precision), '
                             '%d: NCS2 VPU, '
-                            '%d: HDDL VPU' % targets)
+                            '%d: HDDL VPU, '
+                            '%d: Vulkan, '
+                            '%d: CUDA, '
+                            '%d: CUDA fp16 (half-float preprocess)'% targets)
 args, _ = parser.parse_known_args()
 
 
diff --git a/samples/python/stitching_detailed.py b/samples/python/stitching_detailed.py
index 7bf5a9ac0d..a7e316105e 100644
--- a/samples/python/stitching_detailed.py
+++ b/samples/python/stitching_detailed.py
@@ -49,8 +49,6 @@ except AttributeError:
     print("AKAZE not available")
 
 SEAM_FIND_CHOICES = OrderedDict()
-SEAM_FIND_CHOICES['gc_color'] = cv.detail_GraphCutSeamFinder('COST_COLOR')
-SEAM_FIND_CHOICES['gc_colorgrad'] = cv.detail_GraphCutSeamFinder('COST_COLOR_GRAD')
 SEAM_FIND_CHOICES['dp_color'] = cv.detail_DpSeamFinder('COLOR')
 SEAM_FIND_CHOICES['dp_colorgrad'] = cv.detail_DpSeamFinder('COLOR_GRAD')
 SEAM_FIND_CHOICES['voronoi'] = cv.detail.SeamFinder_createDefault(cv.detail.SeamFinder_VORONOI_SEAM)
@@ -79,7 +77,10 @@ WARP_CHOICES = (
     'transverseMercator',
 )
 
-WAVE_CORRECT_CHOICES = ('horiz', 'no', 'vert',)
+WAVE_CORRECT_CHOICES = OrderedDict()
+WAVE_CORRECT_CHOICES['horiz'] = cv.detail.WAVE_CORRECT_HORIZ
+WAVE_CORRECT_CHOICES['no'] = None
+WAVE_CORRECT_CHOICES['vert'] = cv.detail.WAVE_CORRECT_VERT
 
 BLEND_CHOICES = ('multiband', 'feather', 'no',)
 
@@ -147,9 +148,9 @@ parser.add_argument(
     type=str, dest='ba_refine_mask'
 )
 parser.add_argument(
-    '--wave_correct', action='store', default=WAVE_CORRECT_CHOICES[0],
-    help="Perform wave effect correction. The default is '%s'" % WAVE_CORRECT_CHOICES[0],
-    choices=WAVE_CORRECT_CHOICES,
+    '--wave_correct', action='store', default=list(WAVE_CORRECT_CHOICES.keys())[0],
+    help="Perform wave effect correction. The default is '%s'" % list(WAVE_CORRECT_CHOICES.keys())[0],
+    choices=WAVE_CORRECT_CHOICES.keys(),
     type=str, dest='wave_correct'
 )
 parser.add_argument(
@@ -279,11 +280,7 @@ def main():
     compose_megapix = args.compose_megapix
     conf_thresh = args.conf_thresh
     ba_refine_mask = args.ba_refine_mask
-    wave_correct = args.wave_correct
-    if wave_correct == 'no':
-        do_wave_correct = False
-    else:
-        do_wave_correct = True
+    wave_correct = WAVE_CORRECT_CHOICES[args.wave_correct]
     if args.save_graph is None:
         save_graph = False
     else:
@@ -343,7 +340,7 @@ def main():
         with open(args.save_graph, 'w') as fh:
             fh.write(cv.detail.matchesGraphAsString(img_names, p, conf_thresh))
 
-    indices = cv.detail.leaveBiggestComponent(features, p, 0.3)
+    indices = cv.detail.leaveBiggestComponent(features, p, conf_thresh)
     img_subset = []
     img_names_subset = []
     full_img_sizes_subset = []
@@ -393,11 +390,11 @@ def main():
         warped_image_scale = focals[len(focals) // 2]
     else:
         warped_image_scale = (focals[len(focals) // 2] + focals[len(focals) // 2 - 1]) / 2
-    if do_wave_correct:
+    if wave_correct is not None:
         rmats = []
         for cam in cameras:
             rmats.append(np.copy(cam.R))
-        rmats = cv.detail.waveCorrect(rmats, cv.detail.WAVE_CORRECT_HORIZ)
+        rmats = cv.detail.waveCorrect(rmats, wave_correct)
         for idx, cam in enumerate(cameras):
             cam.R = rmats[idx]
     corners = []
@@ -433,7 +430,7 @@ def main():
     compensator.feed(corners=corners, images=images_warped, masks=masks_warped)
 
     seam_finder = SEAM_FIND_CHOICES[args.seam]
-    seam_finder.find(images_warped_f, corners, masks_warped)
+    masks_warped = seam_finder.find(images_warped_f, corners, masks_warped)
     compose_scale = 1
     corners = []
     sizes = []
diff --git a/samples/python/tracker.py b/samples/python/tracker.py
index f67499cd15..753e166ad8 100644
--- a/samples/python/tracker.py
+++ b/samples/python/tracker.py
@@ -3,8 +3,22 @@
 '''
 Tracker demo
 
+To use this demo, download the models from the following links:
+For GOTURN:
+    goturn.prototxt and goturn.caffemodel: https://github.com/opencv/opencv_extra/tree/c4219d5eb3105ed8e634278fad312a1a8d2c182d/testdata/tracking
+For DaSiamRPN:
+    network:     https://www.dropbox.com/s/rr1lk9355vzolqv/dasiamrpn_model.onnx?dl=0
+    kernel_r1:   https://www.dropbox.com/s/999cqx5zrfi7w4p/dasiamrpn_kernel_r1.onnx?dl=0
+    kernel_cls1: https://www.dropbox.com/s/qvmtszx5h339a0w/dasiamrpn_kernel_cls1.onnx?dl=0
+
 USAGE:
-    tracker.py [<video_source>]
+    tracker.py [-h] [--input INPUT] [--tracker_algo TRACKER_ALGO]
+                    [--goturn GOTURN] [--goturn_model GOTURN_MODEL]
+                    [--dasiamrpn_net DASIAMRPN_NET]
+                    [--dasiamrpn_kernel_r1 DASIAMRPN_KERNEL_R1]
+                    [--dasiamrpn_kernel_cls1 DASIAMRPN_KERNEL_CLS1]
+                    [--dasiamrpn_backend DASIAMRPN_BACKEND]
+                    [--dasiamrpn_target DASIAMRPN_TARGET]
 '''
 
 # Python 2/3 compatibility
@@ -14,18 +28,37 @@ import sys
 
 import numpy as np
 import cv2 as cv
+import argparse
 
 from video import create_capture, presets
 
 class App(object):
 
-    def initializeTracker(self, image):
+    def __init__(self, args):
+        self.args = args
+
+    def initializeTracker(self, image, trackerAlgorithm):
         while True:
+            if trackerAlgorithm == 'mil':
+                tracker = cv.TrackerMIL_create()
+            elif trackerAlgorithm == 'goturn':
+                params = cv.TrackerGOTURN_Params()
+                params.modelTxt = self.args.goturn
+                params.modelBin = self.args.goturn_model
+                tracker = cv.TrackerGOTURN_create(params)
+            elif trackerAlgorithm == 'dasiamrpn':
+                params = cv.TrackerDaSiamRPN_Params()
+                params.model = self.args.dasiamrpn_net
+                params.kernel_cls1 = self.args.dasiamrpn_kernel_cls1
+                params.kernel_r1 = self.args.dasiamrpn_kernel_r1
+                tracker = cv.TrackerDaSiamRPN_create(params)
+            else:
+                sys.exit("Tracker {} is not recognized. Please use one of three available: mil, goturn, dasiamrpn.".format(trackerAlgorithm))
+
             print('==> Select object ROI for tracker ...')
             bbox = cv.selectROI('tracking', image)
             print('ROI: {}'.format(bbox))
 
-            tracker = cv.TrackerMIL_create()
             try:
                 tracker.init(image, bbox)
             except Exception as e:
@@ -37,7 +70,8 @@ class App(object):
             return tracker
 
     def run(self):
-        videoPath = sys.argv[1] if len(sys.argv) >= 2 else 'vtest.avi'
+        videoPath = self.args.input
+        trackerAlgorithm = self.args.tracker_algo
         camera = create_capture(videoPath, presets['cube'])
         if not camera.isOpened():
             sys.exit("Can't open video stream: {}".format(videoPath))
@@ -48,7 +82,7 @@ class App(object):
         assert image is not None
 
         cv.namedWindow('tracking')
-        tracker = self.initializeTracker(image)
+        tracker = self.initializeTracker(image, trackerAlgorithm)
 
         print("==> Tracking is started. Press 'SPACE' to re-initialize tracker or 'ESC' for exit...")
 
@@ -76,5 +110,24 @@ class App(object):
 
 if __name__ == '__main__':
     print(__doc__)
-    App().run()
+    parser = argparse.ArgumentParser(description="Run tracker")
+    parser.add_argument("--input", type=str, default="vtest.avi", help="Path to video source")
+    parser.add_argument("--tracker_algo", type=str, default="mil", help="One of three available tracking algorithms: mil, goturn, dasiamrpn")
+    parser.add_argument("--goturn", type=str, default="goturn.prototxt", help="Path to GOTURN architecture")
+    parser.add_argument("--goturn_model", type=str, default="goturn.caffemodel", help="Path to GOTURN model")
+    parser.add_argument("--dasiamrpn_net", type=str, default="dasiamrpn_model.onnx", help="Path to onnx model of DaSiamRPN net")
+    parser.add_argument("--dasiamrpn_kernel_r1", type=str, default="dasiamrpn_kernel_r1.onnx", help="Path to onnx model of DaSiamRPN kernel_r1")
+    parser.add_argument("--dasiamrpn_kernel_cls1", type=str, default="dasiamrpn_kernel_cls1.onnx", help="Path to onnx model of DaSiamRPN kernel_cls1")
+    parser.add_argument("--dasiamrpn_backend", type=int, default=0, help="Choose one of computation backends:\
+                                                                           0: automatically (by default),\
+                                                                           1: Halide language (http://halide-lang.org/),\
+                                                                           2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit),\
+                                                                           3: OpenCV implementation")
+    parser.add_argument("--dasiamrpn_target", type=int, default=0, help="Choose one of target computation devices:\
+                                                                         0: CPU target (by default),\
+                                                                         1: OpenCL,\
+                                                                         2: OpenCL fp16 (half-float precision),\
+                                                                         3: VPU")
+    args = parser.parse_args()
+    App(args).run()
     cv.destroyAllWindows()
diff --git a/samples/tapi/ufacedetect.cpp b/samples/tapi/ufacedetect.cpp
index 0a6d91c3d6..23445cc28d 100644
--- a/samples/tapi/ufacedetect.cpp
+++ b/samples/tapi/ufacedetect.cpp
@@ -206,7 +206,7 @@ void detectAndDraw( UMat& img, Mat& canvas, CascadeClassifier& cascade,
     double alpha = nframes > 50 ? 0.01 : 1./nframes;
     avgfps = avgfps*(1-alpha) + fps*alpha;
 
-    putText(canvas, format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", avgfps), Point(50, 30),
+    putText(canvas, cv::format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", avgfps), Point(50, 30),
             FONT_HERSHEY_SIMPLEX, 0.8, Scalar(0,255,0), 2);
 
     for ( size_t i = 0; i < faces.size(); i++ )
diff --git a/samples/tapi/video_acceleration.cpp b/samples/tapi/video_acceleration.cpp
index 2c997710b3..5169236c29 100644
--- a/samples/tapi/video_acceleration.cpp
+++ b/samples/tapi/video_acceleration.cpp
@@ -151,11 +151,11 @@ int main(int argc, char** argv)
             return 1;
         }
         cout << "VideoWriter backend = " << writer.getBackendName() << endl;
-        actual_accel = static_cast<VideoAccelerationType>(static_cast<int>(writer.get(CAP_PROP_HW_ACCELERATION)));
+        actual_accel = static_cast<VideoAccelerationType>(static_cast<int>(writer.get(VIDEOWRITER_PROP_HW_ACCELERATION)));
         for (size_t i = 0; i < sizeof(acceleration_strings) / sizeof(acceleration_strings[0]); i++) {
             if (actual_accel == acceleration_strings[i].acceleration) {
                 cout << "VideoWriter acceleration = " << acceleration_strings[i].str << endl;
-                cout << "VideoWriter acceleration device = " << (int)writer.get(CAP_PROP_HW_DEVICE) << endl;
+                cout << "VideoWriter acceleration device = " << (int)writer.get(VIDEOWRITER_PROP_HW_DEVICE) << endl;
                 break;
             }
         }